free-programming-books-parser/index.js

186 lines
6.0 KiB
JavaScript
Raw Normal View History

2021-11-05 00:29:01 +00:00
#!/usr/bin/env node
2021-10-07 18:56:44 +00:00
const fs = require('fs');
const path = require('path');
const remark = require('remark');
2021-10-15 22:06:27 +00:00
const languages = require('./languages')
2021-10-07 18:56:44 +00:00
2021-10-22 19:04:23 +00:00
// File names that getFilesFromDir() skips even though they end in `.md`
const excludes = [
    'README.md',
    'CONTRIBUTING.md',
    'CODE_OF_CONDUCT.md',
    'SUMMARY.md',
];
2021-10-07 18:56:44 +00:00
// TODO: also extract the remaining details of an entry (the text after the link,
// additional links) once the output format is settled.
/**
 * Builds the entry for one piece of media from the inline content of a list item.
 *
 * Only the first node is looked at; it is expected to be the link to the resource.
 * The title is the concatenated text of everything inside that link, so link text
 * containing inline formatting (emphasis, inline code, ...) is kept in full instead
 * of being cut off at, or lost because of, the first formatted node.
 *
 * @param {Object[]} listItem - inline nodes of a list item (the children of its
 *   first paragraph), in the AST format defined by remark-parse
 *
 * @return {Object} entry of the form `{ url, title }`; the final format is still TBD
 *
 * @throws {TypeError} when `listItem` is empty or its first node has no children
 *   (for example a plain text node), so that callers can skip malformed items
 */
let parseListItem = function(listItem){
    // Collects the text of a node and of all its descendants, in document order
    const textOf = (node) => {
        if (typeof node.value === 'string') {
            return node.value;
        }
        return Array.isArray(node.children) ? node.children.map(textOf).join('') : '';
    };

    const link = listItem[0];
    // Reading `link.children` throws a TypeError by itself when the item is empty;
    // the explicit check covers first nodes that cannot hold a title.
    if (!Array.isArray(link.children) || link.children.length === 0) {
        throw new TypeError('List item does not start with a node that has children');
    }

    const entry = {};
    entry.url = link.url;
    entry.title = textOf(link);
    // remember to get OTHER STUFF!! remember there may be multiple links!
    return entry;
};
2021-10-22 19:04:23 +00:00
// Adapted from free-programming-books-lint
/**
 * Derives the language code of a list from its file name.
 *
 * The candidate code is the text between the last `-` and the last `.` of
 * `filename`, with its first `_` turned into `-`.
 *
 * @param {string} filename - name (or path) of the markdown file
 * @return {string} the candidate when it is a key of `languages`; an empty string
 *   when it is not a key but is shaped like `xx` or `xx-YY`; `'en-US'` otherwise
 */
function getLangFromFilename (filename) {
    const start = filename.lastIndexOf('-') + 1;
    const end = filename.lastIndexOf('.');
    const candidate = filename.slice(start, end).replace(/_/, '-');

    if (languages.hasOwnProperty(candidate)) {
        return candidate;
    }

    // Shaped like a language code, but not one that `languages` knows about
    const looksLikeLangCode = /^[a-z]{2}$/.test(candidate) || /^[a-z]{2}-[A-Z]{2}$/.test(candidate);
    return looksLikeLangCode ? '' : 'en-US';
}
// Adapted from free-programming-books-lint
/**
 * Lists the markdown files of a directory that should be parsed.
 *
 * @param {string} dir - directory to read; only its direct children are considered
 * @return {string[]} `dir` joined with the name of every `.md` file in it that is
 *   not listed in `excludes`
 */
function getFilesFromDir (dir) {
    const isParsable = (name) => path.extname(name) === '.md' && !excludes.includes(name);

    const found = [];
    for (const name of fs.readdirSync(dir)) {
        if (isParsable(name)) {
            found.push(path.join(dir, name));
        }
    }
    return found;
}
/**
 * Derives the media type (for example `books`) from the path of a directory.
 *
 * The media type is the name of the directory itself, i.e. the last segment of
 * the path; trailing slashes are ignored. Taking the basename works for every
 * spelling of the path (`./books/`, `./books`, `./fpb/books`), whereas slicing
 * at fixed offsets only handled the `./name/` form.
 *
 * @param {string} dir - path of the directory that holds the markdown lists
 * @return {string} name of the last path segment of `dir`
 */
function getMediaFromDirectory(dir){
    return path.basename(dir);
}
/**
 * Parses one markdown document into the list of its sections.
 *
 * Everything before the second h3 heading is skipped (the first h3 is expected
 * to be the Index). After that, each h3 opens a section, each h4 opens a
 * subsection of the most recent section, and each list item becomes an entry
 * (built by `parseListItem`) of the most recent h3 or h4, whichever came last.
 *
 * Parsing is best-effort: a malformed heading or list item is skipped without
 * being reported, and the rest of the document is still parsed. In particular a
 * single bad list item no longer discards the remaining items of its list.
 *
 * @param {string|Buffer} doc - markdown source, handed to `remark.parse` as is
 *
 * @return {Object[]} sections of the form
 *   `{ section, entries: [], subsections: [{ section, entries: [] }] }`;
 *   empty when the document has fewer than two h3 headings
 */
let parseMarkdown = function(doc){
    const tree = remark.parse(doc).children;
    const sections = []; // This will go into root object later
    let currentDepth = 3; // depth of the last section heading seen: 3 (h3) or 4 (h4)

    // find where Index ends: the content starts at the second h3 of the document
    // probably could be done better, review later
    let start = tree.length;
    let h3Seen = 0;
    for (let i = 0; i < tree.length; i++) {
        if (tree[i].type === 'heading' && tree[i].depth === 3) {
            h3Seen++;
            if (h3Seen === 2) {
                start = i;
                break;
            }
        }
    }

    tree.slice(start).forEach((item) => { // Start iterating after Index
        try {
            if (item.type === 'heading') {
                const title = item.children[0].value; // name of the (sub)section
                if (title === 'Index') {
                    return;
                }
                if (item.depth === 3) { // Heading is an h3
                    currentDepth = 3;
                    sections.push({
                        section: title,
                        entries: [],
                        subsections: []
                    });
                } else if (item.depth === 4) { // Heading is an h4
                    currentDepth = 4;
                    // Add to subsection array of most recent h3
                    sections[sections.length - 1].subsections.push({
                        section: title,
                        entries: []
                    });
                }
                return;
            }

            if (item.type !== 'list') {
                return;
            }

            // The destination does not change while a list is processed, so resolve it once:
            // the entries of the most recent h4 when one is open, else those of the most recent h3
            const lastSection = sections[sections.length - 1];
            const target = currentDepth === 4
                ? lastSection.subsections[lastSection.subsections.length - 1].entries
                : lastSection.entries;

            item.children.forEach((listItem) => {
                try {
                    // array containing a remark-link and the nodes that follow it
                    const content = listItem.children[0].children;
                    target.push(parseListItem(content));
                } catch (e) {
                    // SKIPS OVER bad formatting: only this item is dropped, the
                    // remaining items of the list are still parsed.
                    // TODO: collect a message built from listItem.position and
                    // hand it back to the caller instead of dropping it silently.
                }
            });
        } catch (e) {
            // Deliberately best-effort: a heading without text or a list that has
            // no section to go into is skipped.
            // TODO: collect a message built from item.position.start / item.position.end
            // and hand it back to the caller instead of dropping it silently.
        }
    });

    return sections;
};
/**
 * Parses every markdown list of one directory.
 *
 * @param {string} directory - path of the directory holding the markdown files
 * @return {Object} `{ type, index, children }` where `type` comes from
 *   `getMediaFromDirectory`, `index` is an empty placeholder object and `children`
 *   holds one `{ language: { code, name }, index, sections }` object per file
 *   returned by `getFilesFromDir`
 */
function parseDirectory(directory){
    const mediaType = getMediaFromDirectory(directory);

    // one output object per markdown doc
    const dirChildren = getFilesFromDir(path.resolve(directory)).map((filename) => {
        const sections = parseMarkdown(fs.readFileSync(filename)); // parse the markdown document
        const langCode = getLangFromFilename(filename);
        return {
            language: {
                code: langCode,
                name: languages[langCode],
            },
            index: {},
            sections,
        };
    });

    // TODO: parse errors of the individual documents are neither collected nor returned yet
    return {
        type: mediaType,
        index: {},
        children: dirChildren,
    };
}
2021-11-05 19:31:51 +00:00
/**
 * Parses all given directories and writes the combined result as JSON.
 *
 * The output is `{ type: 'root', children }`, with one child per directory as
 * returned by `parseDirectory`, serialized with an indentation of 3 spaces.
 *
 * @param {string[]} directories - paths of the directories to parse
 * @param {string} [outputPath='./parser/fpb.json'] - file the JSON is written to
 *
 * @throws {Error} whatever `fs.writeFileSync` throws when the file cannot be
 *   written (for example when the target directory does not exist)
 */
function parseAll(directories, outputPath = './parser/fpb.json'){
    const rootJson = {
        type: 'root',
        // the output of each directory
        children: directories.map((directory) => parseDirectory(directory))
        // TODO: parse errors of the individual directories are not collected yet
    };

    // writeFileSync is synchronous: it takes no callback and reports failures by
    // throwing, so a callback passed as third argument would never be invoked.
    fs.writeFileSync(outputPath, JSON.stringify(rootJson, null, 3));
}
2021-10-22 19:04:23 +00:00
// Script entry point: parse the books lists and report how long the run took
const timerLabel = 'Parse Time';
console.time(timerLabel);
parseAll(['./fpb/books']);
console.timeEnd(timerLabel);