Merge pull request #11 from davorpa/bugfix/8

[BUGFIX-8] Improve extraction of resource titles from Markdown links
2022-09-24 14:55:44 -04:00 · 2022-09-24 14:55:44 -04:00 · c39d7a0866
parent dc53b8c803 f4b24bdefe
commit c39d7a0866
4 changed files with 183 additions and 16 deletions
--- a/index.js
+++ b/index.js
@ -3,6 +3,7 @@
 const fs = require("fs");
 const path = require("path");
 const remark = require("remark");
+const { Objects, Strings } = require("./lib/functions");
 const languages = require("./languages");
 const commandLineArgs = require("command-line-args");

@ -31,24 +32,25 @@ const excludes = [
 * @returns {string} an string with the name of the section related with the input heading
 */
 function getSectionNameFromHeadingContent(children) {
+  // visit nodes in depth
  const walk = (children, depth) =>
    children.reduce((text, node, index) => {
-      if (!node || !node.type) return text;
+      if (!node || !node.type) return text; // not AST, maybe plain text
      switch (node.type) {
        //
        // meaningfull nodes
        //
        case "emphasis":
-          text += "_" + walk(node.children, depth + 1) + "_";
+        case "strong":
+          text += Strings.templater(remarkTokenAST(node), {
+            text: walk(node.children, depth + 1),
+          });
          break;
        case "inlineCode":
-          text += "`" + node.value + "`";
-          break;
-        case "strong":
-          text += "**" + walk(node.children, depth + 1) + "**";
-          break;
        case "text":
-          text += node.value;
+          text += Strings.templater(remarkTokenAST(node), {
+            text: node.value,
+          });
          break;
        //
        // skipped nodes
@ -67,6 +69,100 @@ function getSectionNameFromHeadingContent(children) {
  return walk(children, 0);
 }

+/**
+ * Parses the contents of a link from remark-parse into a readable format.
+ *
+ * @param {Array<Object>} children - an array of AST items defined by remark-parse for
+ *        the content of a link (A)
+ *
+ * @returns {string} an string with the text of the related input link
+ */
+function getLinkTextFromLinkNodes(children) {
+  // visit nodes in depth
+  const walk = (children, depth) => {
+    // not AST, maybe plain text
+    if (!Array.isArray(children)) return Objects.toString(children);
+    // AST children array nodes
+    return children.reduce((text, node, index) => {
+      if (!node || !node.type) return text; // not AST, maybe plain text
+      switch (node.type) {
+        //
+        // rebuild meaningfull nodes
+        //
+        case "image":
+          text += Strings.templater(remarkTokenAST(node), {
+            text: node.alt || node.title,
+            url: node.url,
+          });
+          break;
+        case "inlineCode":
+        case "text":
+          text += Strings.templater(remarkTokenAST(node), {
+            text: node.value,
+          });
+          break;
+        case "emphasis":
+        case "strong":
+          text += Strings.templater(remarkTokenAST(node), {
+            text: walk(node.children, depth + 1),
+          });
+          break;
+        //
+        // skipped nodes
+        //
+        default:
+          console.log(
+            "getLinkTextFromLinkNodes::skipped",
+            depth,
+            node.type,
+            node
+          );
+          break;
+      }
+      return text;
+    }, "");
+  };
+
+  return walk(children, 0);
+}
+
+/**
+ * Gets the template related with AST remark-parse node.
+ * @param {Object} node - AST node defined by remark-parse
+ * @returns {string} - the template string
+ */
+function remarkTokenAST(node) {
+  if (node && node.type) {
+    switch (node.type) {
+      case "break": // {type: 'break', position: {...}}
+        return "<br/>";
+      case "emphasis": // {type: 'emphasis', children: [...], position: {...}}
+        return Strings.wrap("{{text}}", "_");
+      case "heading": // {type: 'heading', depth: 1, children: [...], position: {...}}
+        return ["#".repeat(item.depth || 0), "{{text}}"].join("");
+      case "image": // {type: 'image', title: '...', url: '...', alt: '...', position: {...}}
+        return "![{{text}}]({{url}})";
+      case "inlineCode": // {type: 'inlineCode', value: '...', position: {...}}
+        return Strings.wrap("{{text}}", "`");
+      case "link": // {type: 'link', title: '...', url: '...', children: [...], position: {...}}
+        return "[{{text}}]({{url}})";
+      case "list": // {type: 'list', ordered: false, start: null, spread: false, children: [...], position: {...}}
+      case "listItem": // {type: 'listItem', spread: false, checked: null, children: [...], position: {...}}
+        // TODO: generate token for list/listItem
+        break;
+      case "strong": // {type: 'strong', children: [...], position: {...}}
+        return Strings.wrap("{{text}}", "**");
+      case "html": // {type: 'html', value: '...', position: {...}}
+      case "paragraph": // {type: 'paragraph', children: [...], position: {...}}
+      case "text": // {type: 'text', value: '...', position: {...}}
+        return Strings.wrap("{{text}}"); // identity
+      default:
+        break;
+    }
+  }
+  throw new Error("Unrecognized remark node type: " + (node && node.type));
+}
+
 /**
 * Parses a list item generated from remark-parse into a readable format.
 *
@ -81,17 +177,15 @@ function getSectionNameFromHeadingContent(children) {
 * @return {Object} Returns an Object containing details about the piece of media.
 */
 function parseListItem(listItem) {
-  let stripParens = function (s) {
-    if (s.slice(0, 1) === "(" && s.slice(-1) === ")") return s.slice(1, -1);
-    return s;
-  };
  let entry = {};
  let s = ""; // If we need to build up a string over multiple listItem elements
  let leftParen,
    rightParen = -1; // If we need to parse parenthesized text
-  const [link, ...otherStuff] = listItem; // head of listItem = url, the rest is "other stuff"
+  // head of listItem = url, the rest is "other stuff"
+  const [link, ...otherStuff] = listItem;
  entry.url = link.url;
-  entry.title = link.children[0].value;
+  // link.children || link.value => weak way to check if link.type === "link"
+  entry.title = getLinkTextFromLinkNodes(link.children || link.value);
  // remember to get OTHER STUFF!! remember there may be multiple links!
  for (let i of otherStuff) {
    if (s === "") {
@ -117,7 +211,7 @@ function parseListItem(listItem) {
        // other links found
        if (entry.otherLinks === undefined) entry.otherLinks = [];
        entry.otherLinks.push({
-          title: stripParens(i.children[0].value),
+          title: Strings.stripParens(getLinkTextFromLinkNodes(i.children)),
          url: i.url,
        });
        // entry.otherLinks = [...entry.otherLinks, {title: i.children[0].value, url: i.url}];      // <-- i wish i could get this syntax to work with arrays
@ -154,7 +248,9 @@ function parseListItem(listItem) {
          s += i.value;
        } else {
          // finally, we have reached the end of the note
-          entry.notes.push(stripParens(s + i.value.slice(0, rightParen + 1)));
+          entry.notes.push(
+            Strings.stripParens(s + i.value.slice(0, rightParen + 1))
+          );
          s = "";
          // this is a copypaste of another block of code. probably not a good thing tbh.
          leftParen = i.value.indexOf("(");
--- a/lib/functions/Objects.js
+++ b/lib/functions/Objects.js
@ -0,0 +1,19 @@
+/**
+ * To string
+ * @param {any} o - the object to get it text representation
+ * @returns {string} the `o` as string
+ */
+function toString(o) {
+  // null or undefined
+  if (o === null || o === void 0) return o;
+  // is string
+  if (typeof o === "string") return o;
+  // has a toString function in their prototype
+  if (typeof o.toString === "function") return o.toString();
+  // as string in the latest intent
+  return String(o);
+}
+
+module.exports = {
+  toString,
+};
--- a/lib/functions/Strings.js
+++ b/lib/functions/Strings.js
@ -0,0 +1,45 @@
+/**
+ * Strip wrapped parenthesis from a string.
+ * @param {string} s - the string to process
+ * @returns {string} the stripped string if parens found, the input string if don't
+ */
+function stripParens(s) {
+  // null or undefined
+  if (s === null || s === void 0) return s;
+  // is wrapped by ( and )?, then unwrap
+  if (s.slice(0, 1) === "(" && s.slice(-1) === ")") return s.slice(1, -1);
+  // leave as it is
+  return s;
+}
+
+/**
+ * Replaces a data tokens in a template string.
+ * @param {string} template - the template string
+ * @param {object} context - the data used to replace the tokens with
+ * @returns string replace
+ */
+function templater(template, context = {}) {
+  // replaceAll using a replacer function
+  return template.replace(
+    /{{([^{}]+)}}/g, // {{key}}
+    (matchedText, key) => context[key] || ""
+  );
+}
+
+/**
+ * Wraps a string between other that acts as token.
+ * @param {string} s - the text to wrap
+ * @param {string} token - the text to wrap with between
+ * @returns a string in the form `${token}${s}${token}`
+ */
+function wrap(s, token = "") {
+  // avoid mix concatenate/sum string/numbers using array join hack
+  //return `${token}${s}${token}`;
+  return [token, token].join(s);
+}
+
+module.exports = {
+  stripParens,
+  templater,
+  wrap,
+};
--- a/lib/functions/index.js
+++ b/lib/functions/index.js
@ -0,0 +1,7 @@
+const Objects = require("./Objects");
+const Strings = require("./Strings");
+
+module.exports = {
+  Objects,
+  Strings,
+};