free-programming-books-parser/index.js

#!/usr/bin/env node

const fs = require("fs");
const path = require("path");
const remark = require("remark");
const { Objects, Strings } = require("./lib/functions");
const languages = require("./languages");
const commandLineArgs = require("command-line-args");

const optionDefinitions = [
  {
    name: "input",
    multiple: true,
    defaultValue: ["./fpb/books", "./fpb/casts", "./fpb/courses", "./fpb/more"],
  },
  { name: "output", defaultValue: "./parser/fpb.json" },
];

const excludes = [
  "README.md",
  "CONTRIBUTING.md",
  "CODE_OF_CONDUCT.md",
  "SUMMARY.md",
];

/**
 * Parses the contents of a heading from remark-parse into a readable format.
 *
 * @param {Array<Object>} children - an array of AST items defined by remark-parse for
 *        the content of headings (H1..H7)
 *
 * @returns {string} an string with the name of the section related with the input heading
 */
function getSectionNameFromHeadingContent(children) {
  // visit nodes in depth
  const walk = (children, depth) =>
    children.reduce((text, node, index) => {
      if (!node || !node.type) return text; // not AST, maybe plain text
      switch (node.type) {
        //
        // meaningfull nodes
        //
        case "emphasis":
        case "strong":
          text += Strings.templater(remarkTokenAST(node), {
            text: walk(node.children, depth + 1),
          });
          break;
        case "inlineCode":
        case "text":
          text += Strings.templater(remarkTokenAST(node), {
            text: node.value,
          });
          break;
        //
        // skipped nodes
        //
        case "heading":
        case "html":
        case "link":
        case "list":
        case "paragraph":
        default:
          break;
      }
      return text;
    }, "");

  return walk(children, 0);
}

/**
 * Parses the contents of a link from remark-parse into a readable format.
 *
 * @param {Array<Object>} children - an array of AST items defined by remark-parse for
 *        the content of a link (A)
 *
 * @returns {string} an string with the text of the related input link
 */
function getLinkTextFromLinkNodes(children) {
  // visit nodes in depth
  const walk = (children, depth) => {
    // not AST, maybe plain text
    if (!Array.isArray(children)) return Objects.toString(children);
    // AST children array nodes
    return children.reduce((text, node, index) => {
      if (!node || !node.type) return text; // not AST, maybe plain text
      switch (node.type) {
        //
        // rebuild meaningfull nodes
        //
        case "image":
          text += Strings.templater(remarkTokenAST(node), {
            text: node.alt || node.title,
            url: node.url,
          });
          break;
        case "inlineCode":
        case "text":
          text += Strings.templater(remarkTokenAST(node), {
            text: node.value,
          });
          break;
        case "emphasis":
        case "strong":
          text += Strings.templater(remarkTokenAST(node), {
            text: walk(node.children, depth + 1),
          });
          break;
        //
        // skipped nodes
        //
        default:
          console.log(
            "getLinkTextFromLinkNodes::skipped",
            depth,
            node.type,
            node
          );
          break;
      }
      return text;
    }, "");
  };

  return walk(children, 0);
}

/**
 * Gets the template related with AST remark-parse node.
 * @param {Object} node - AST node defined by remark-parse
 * @returns {string} - the template string
 */
function remarkTokenAST(node) {
  if (node && node.type) {
    switch (node.type) {
      case "break": // {type: 'break', position: {...}}
        return "<br/>";
      case "emphasis": // {type: 'emphasis', children: [...], position: {...}}
        return Strings.wrap("{{text}}", "_");
      case "heading": // {type: 'heading', depth: 1, children: [...], position: {...}}
        return ["#".repeat(node.depth || 0), "{{text}}"].join("");
      case "image": // {type: 'image', title: '...', url: '...', alt: '...', position: {...}}
        return "![{{text}}]({{url}})";
      case "inlineCode": // {type: 'inlineCode', value: '...', position: {...}}
        return Strings.wrap("{{text}}", "`");
      case "link": // {type: 'link', title: '...', url: '...', children: [...], position: {...}}
        return "[{{text}}]({{url}})";
      case "list": // {type: 'list', ordered: false, start: null, spread: false, children: [...], position: {...}}
      case "listItem": // {type: 'listItem', spread: false, checked: null, children: [...], position: {...}}
        // TODO: generate token for list/listItem
        break;
      case "strong": // {type: 'strong', children: [...], position: {...}}
        return Strings.wrap("{{text}}", "**");
      case "html": // {type: 'html', value: '...', position: {...}}
      case "paragraph": // {type: 'paragraph', children: [...], position: {...}}
      case "text": // {type: 'text', value: '...', position: {...}}
        return Strings.wrap("{{text}}"); // identity
      default:
        break;
    }
  }
  throw new Error("Unrecognized remark node type: " + (node && node.type));
}

/**
 * Parses a list item generated from remark-parse into a readable format.
 *
 * remark-parse parses a markdown file into a long, intricate json.
 * Many fields in this json either give information we do not care
 * about or does not go into enough detail. This function parses the
 * output of remark-parse into a format preferred by this project,
 * indicating authors, notes, and links etc.
 *
 * @param {Object} listItem - a listItem in AST format defined by remark-parse
 *
 * @return {Object} Returns an Object containing details about the piece of media.
 */
function parseListItem(listItem) {
  let entry = {};
  let s = ""; // If we need to build up a string over multiple listItem elements
  let leftParen,
    rightParen = -1; // If we need to parse parenthesized text
  // head of listItem = url, the rest is "other stuff"
  const [link, ...otherStuff] = listItem;
  entry.url = link.url;
  // link.children || link.value => weak way to check if link.type === "link"
  entry.title = getLinkTextFromLinkNodes(link.children || link.value);
  // remember to get OTHER STUFF!! remember there may be multiple links!
  for (let i of otherStuff) {
    if (s === "") {
      // this is almost always, except for when we are parsing a multi-element note
      if (i.type === "text" && i.value.slice(0, 3) === " - ") {
        // author found
        let parenIndex = i.value.indexOf("(");
        if (parenIndex === -1) {
          entry.author = i.value.slice(3).trim();
        } else {
          entry.author = i.value.slice(3, parenIndex).trim(); // go from " - " until the first "("
        }
      }
      if (
        i.type === "emphasis" &&
        i.children[0].value.slice(0, 1) === "(" &&
        i.children[0].value.slice(-1) === ")"
      ) {
        // access notes found (currently assumes exactly one child, so far this is always the case)
        entry.accessNotes = i.children[0].value.slice(1, -1);
      }
      if (i.type === "link") {
        // other links found
        if (entry.otherLinks === undefined) entry.otherLinks = [];
        entry.otherLinks.push({
          title: Strings.stripParens(getLinkTextFromLinkNodes(i.children)),
          url: i.url,
        });
        // entry.otherLinks = [...entry.otherLinks, {title: i.children[0].value, url: i.url}];      // <-- i wish i could get this syntax to work with arrays
      }
      if (i.type === "text" && i.value.indexOf("(") !== -1) {
        // notes found (currently assumes no nested parentheses)
        if (entry.notes === undefined) entry.notes = [];
        leftParen = i.value.indexOf("(");
        while (leftParen != -1) {
          rightParen = i.value.indexOf(")", leftParen);
          if (rightParen === -1) {
            // there must be some *emphasis* found
            s += i.value.slice(leftParen);
            break;
          }
          entry.notes.push(i.value.slice(leftParen + 1, rightParen));
          leftParen = i.value.indexOf("(", rightParen);
        }
      }
    } else {
      // for now we assume that all previous ifs are mutually exclusive with this, may polish later
      if (i.type === "emphasis") {
        // this is the emphasis, add it in boldface and move on
        s += "*" + i.children[0].value + "*";
      } else if (i.type === "link") {
        // something has gone terribly wrong. this book must be viewed and edited manually.
        entry.manualReviewRequired = true;
        break;
      } else {
        // hopefully this is the end of the note
        let rightParen = i.value.indexOf(")");
        if (rightParen === -1) {
          // we have to go AGAIN
          s += i.value;
        } else {
          // finally, we have reached the end of the note
          entry.notes.push(
            Strings.stripParens(s + i.value.slice(0, rightParen + 1))
          );
          s = "";
          // this is a copypaste of another block of code. probably not a good thing tbh.
          leftParen = i.value.indexOf("(");
          while (leftParen != -1) {
            rightParen = i.value.indexOf(")", leftParen);
            if (rightParen === -1) {
              // there must be some *emphasis* found
              s += i.value.slice(leftParen);
              break;
            }
            entry.notes.push(i.value.slice(leftParen + 1, rightParen));
            leftParen = i.value.indexOf("(", rightParen);
          }
        }
      }
    }
  }
  return entry;
}

/**
 * Determines the language a certain file is based on the format
 * from the FreeEbookFoundation GitHub page
 * @param {String} filename A filename in the format kept by all markdown files on the FreeProgrammingBooks Github
 * @returns {String} The language the file is
 */
function getLangFromFilename(filename) {
  const dash = filename.lastIndexOf("-");
  const dot = filename.lastIndexOf(".");
  let lang = filename.slice(dash + 1, dot).replace(/_/, "-");
  let isSubject = false;
  if (!languages.hasOwnProperty(lang)) {
    if (/^[a-z]{2}$/.test(lang) || /^[a-z]{2}-[A-Z]{2}$/.test(lang)) {
      return "";
    }
    // console.log(lang);
    if (lang === "subjects") {
      isSubject = true;
    }
    lang = "en";
  }
  return { lang: lang, isSubject: isSubject };
}

/**
 * Gets all markdown files in a directory,
 * @param {String} dir - A directory path
 * @returns A list of all md files in a directory, excluding those in the excludes array
 */
function getFilesFromDir(dir) {
  return fs
    .readdirSync(dir)
    .filter(
      (file) => path.extname(file) === ".md" && excludes.indexOf(file) === -1
    )
    .map((file) => path.join(dir, file));
}

/**
 * Retrieves the folder name from a string representing a directory and file
 * @param {String} str - A string representing a path directory alike in the format "./directory/file"
 * @returns {String} The extracted directory name
 */
function getMediaTypeFromDirectoryPath(str) {
  str = path.resolve(str); // sanatize and expand (OS independent)
  let type;
  if (fs.lstatSync(str).isDirectory()) {
    // if path is itself a directory, use it name as result
    type = path.basename(str);
  } else {
    // if not... parent/previous slug is always a directory; extract this part
    // path.sep: Windows -> "\", Unix -> "/"
    type = str.split(path.sep).slice(-2, -1).join(path.sep);
  }
  return type;
}

/**
 * Turns a single markdown file into the json structure needed
 * @param {path} doc - a single file path to a markdown file
 * @returns {object} Json object of entries in the md file
 */
function parseMarkdown(doc) {
  let tree = remark.parse(doc).children;
  let sections = []; // This will go into root object later
  let errors = [];
  let currentDepth = 3; // used to determine if the last heading was an h4 or h3

  // find where Index ends
  // probably could be done better, review later
  let i = 0,
    count = 0;
  for (i; i < tree.length; i++) {
    if (tree[i].type == "heading" && tree[i].depth == "3") count++;
    if (count == 2) break;
  }

  tree.slice(i).forEach((item) => {
    // Start iterating after Index
    try {
      if (item.type == "heading") {
        const sectionName = getSectionNameFromHeadingContent(item.children);
        if (sectionName == "Index") return;
        if (item.depth == 3) {
          // Heading is an h3
          currentDepth = 3;
          // create section record
          let newSection = {
            section: sectionName,
            entries: [],
            subsections: [],
          };
          // Push the section to the output array
          sections.push(newSection);
        } else if (item.depth == 4) {
          // Heading is an h4
          currentDepth = 4;
          // create subsection record
          let newSubsection = {
            section: sectionName,
            entries: [],
          };
          // Add to subsection array of most recent h3
          sections[sections.length - 1].subsections.push(newSubsection);
        }
      } else if (item.type == "list") {
        item.children.forEach((listItem) => {
          let content = listItem.children[0].children; // gets array containing a remark-link and a remark-paragraph
          // if(content[0].type !== 'link'){ // SKIPS OVER bad formatting
          //     return;
          // }
          if (currentDepth == 3) {
            let contentJson = parseListItem(content);
            sections[sections.length - 1].entries.push(contentJson); // add the entry to most recent h3
          } else if (currentDepth == 4) {
            let lastSection = sections.length - 1;
            let lastSubSec = sections[lastSection].subsections.length - 1;
            let contentJson = parseListItem(content);
            sections[lastSection].subsections[lastSubSec].entries.push(
              contentJson
            ); // add entry to most recent h4
          }
        });
      }
    } catch (e) {
      // if there was an error while parsing, print the error to an error log
      // looks really ugly, maybe try to refine output later
      let errStart = JSON.stringify(item.position.start.line);
      let errEnd = JSON.stringify(item.position.end.line);
      str = `Error at line ${errStart} - line ${errEnd}.`;
      errors.push(str);
    }
  });
  return { sections: sections, errors: errors };
}

/**
 * Parses a single directory's md files and converts them into usable json
 * @param {String} directory A string pointing to a directory
 * @returns {Object} An object containing two values, dirJson and dirErrors.
 *                   dirJson contains all data that was successfully parsed from
 *                   the markdown files. dirErrors contains all entries that had
 *                   an error occur while parsing.
 */
function parseDirectory(directory) {
  let dirChildren = []; // this will hold the output each markdown doc
  let dirErrors = []; //contains error for a given directory

  let mediaType = getMediaTypeFromDirectoryPath(directory);
  const filenames = getFilesFromDir(path.resolve(directory));
  filenames.forEach((filename) => {
    const doc = fs.readFileSync(filename);
    let { sections, errors } = parseMarkdown(doc); // parse the markdown document
    const { lang, isSubject } = getLangFromFilename(filename);

    // Entries
    let docJson = {
      language: {
        code: lang,
        name: languages[lang],
      },
      index: {},
      sections: sections,
    };
    if (lang === "en") docJson.language.isSubject = isSubject;
    dirChildren.push(docJson);

    // Errors
    if (errors.length !== 0) {
      let docErrors = {
        file: path.basename(filename),
        errors: errors,
      };
      dirErrors.push(docErrors);
    }
  });

  // File entries
  let dirJson = {
    type: mediaType,
    index: {},
    children: dirChildren,
  };

  // Errors

  return { dirJson: dirJson, dirErrors: dirErrors };
}

/**
 * Reads all given directories for markdown files and prints the parsed json in the output directory
 *
 * @param {Array}  directories A list of strings of directories to scan for markdown files
 * @param {String} output A string for the path that the output should be placed in
 */
function parseAll(directories, output) {
  let rootChildren = []; // this will hold the output of each directory
  let rootErrors = [];

  directories.forEach((directory) => {
    let { dirJson, dirErrors } = parseDirectory(directory);
    rootChildren.push(dirJson);
    if (dirErrors.length !== 0) {
      rootErrors.push({
        directory: path.basename(directory),
        files: dirErrors,
      });
    }
  });

  // ALl entries
  let rootJson = {
    type: "root",
    children: rootChildren,
  };

  // Errors
  let allErrors = {
    type: "root",
    directories: rootErrors,
  };
  fs.writeFileSync(output, JSON.stringify(rootJson, null, 3), function (err) {
    if (err) {
      console.log(err);
    }
  });
  // fs.writeFileSync(
  //     "./parser/fpb.log",
  //     JSON.stringify(allErrors, null, 3),
  //     function (err) {
  //         if (err) {
  //             console.log(err);
  //         }
  //     }
  // );
}

let { input, output } = commandLineArgs(optionDefinitions);
parseAll(input, output);