From 6f1c264583bc1509ca7ff179a3fa90cf5e1fb761 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Wed, 16 Oct 2024 12:49:13 +0530 Subject: [PATCH 01/32] Node: Start with toUSX() with xmldom library --- node-usfm-parser/package.json | 7 +++--- node-usfm-parser/src/usfmParser.js | 37 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/node-usfm-parser/package.json b/node-usfm-parser/package.json index eb10cf79..e20a2b0b 100644 --- a/node-usfm-parser/package.json +++ b/node-usfm-parser/package.json @@ -27,13 +27,14 @@ ], "dependencies": { "tree-sitter": "0.21.1", - "tree-sitter-usfm3": "file:../tree-sitter-usfm3" + "tree-sitter-usfm3": "file:../tree-sitter-usfm3", + "xmldom": "^0.6.0", + "xpath": "^0.0.34" }, "devDependencies": { "ajv": "^8.17.1", "glob": "^11.0.0", "mocha": "^10.7.3", - "parcel": "^2.12.0", - "xml2js": "^0.6.2" + "parcel": "^2.12.0" } } diff --git a/node-usfm-parser/src/usfmParser.js b/node-usfm-parser/src/usfmParser.js index 94bf26c7..b537e26e 100644 --- a/node-usfm-parser/src/usfmParser.js +++ b/node-usfm-parser/src/usfmParser.js @@ -1,11 +1,14 @@ const Parser = require('tree-sitter'); +const { DOMImplementation, XMLSerializer } = require('xmldom'); const {USFMGenerator} = require("./usfmGenerator"); const {USJGenerator} = require("./usjGenerator"); const {ListGenerator} = require("./listGenerator"); +const {USXGenerator} = require("./usxGenerator") const { includeMarkersInUsj, excludeMarkersInUsj, Filter } = require("./filters.js"); const USFM3 = require('tree-sitter-usfm3'); const { Query } = Parser; +const usxSerializer = new XMLSerializer(); class USFMParser { @@ -48,6 +51,7 @@ Only one of USFM, USJ or USX is supported in one object.`) this.errors = []; this.warnings = []; this.parseUSFM(); + } initializeParser() { this.parser = new Parser(); @@ -210,6 +214,39 @@ Only one of USFM, USJ or USX is supported in one object.`) } + toUSX(ignoreErrors = false) { + /* Convert the syntax_tree to the XML format (USX) */ + + if (!ignoreErrors && this.errors && this.errors.length > 0) { + const errStr = this.errors.map(err => err.join(":")).join("\n\t"); + throw new Error(`Errors present:\n\t${errStr}\nUse ignoreErrors=true to generate output despite errors`); + } + let xmlContent = null; + + try { + // Initialize the USX generator (assuming the constructor is already implemented in JS) + const usxGenerator = new USXGenerator(USFM3, + this.usfm); + + // Process the syntax tree and convert to USX format + usxGenerator.node2Usx(this.syntaxTree, usxGenerator.xmlRootNode); + + xmlContent = usxSerializer.serializeToString(usxGenerator.xmlRootNode); + + } catch (exe) { + let message = "Unable to do the conversion. "; + if (this.errors && this.errors.length > 0) { + const errStr = this.errors.map(err => err.join(":")).join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errStr}`; + } + throw new Error(message, { cause: exe }); + } + + // Return the generated XML structure (in JSON format) + return xmlContent; + } + + } From b00a1ab6de10de3ecd290d7cf05cf0a91b23836d Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Wed, 16 Oct 2024 12:50:27 +0530 Subject: [PATCH 02/32] Node: Startwith USXGenerator class, constructore and Id node --- node-usfm-parser/src/usxGenerator.js | 126 +++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 node-usfm-parser/src/usxGenerator.js diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js new file mode 100644 index 00000000..35da66dc --- /dev/null +++ b/node-usfm-parser/src/usxGenerator.js @@ -0,0 +1,126 @@ +//Logics for syntax-tree to xml(USX) conversions +const { DOMImplementation, XMLSerializer } = require('xmldom'); +const xpath = require('xpath'); +const Parser = require("tree-sitter"); +const {Query} = Parser; + +const { PARA_STYLE_MARKERS, NOTE_MARKERS, CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, DEFAULT_ATTRIB_MAP, TABLE_CELL_MARKERS, MISC_MARKERS } = require("./utils/markers"); + + +class USXGenerator { + /** + * A binding for all methods used in generating USX from Syntax tree + * @param {object} treeSitterLanguageObj - The Tree-sitter language object + * @param {Buffer} usfmString - The USFM byte data + * @param {Element} [usxRootElement] - The root element of the USX (optional) + */ + constructor(treeSitterLanguageObj, usfmString, usxRootElement = null) { + this.usfmLanguage = treeSitterLanguageObj; + this.usfm = usfmString; + + const domImpl = new DOMImplementation(); + const doc = domImpl.createDocument(null, 'usx', null); + + if (usxRootElement === null) { + this.xmlRootNode = doc.documentElement; + this.xmlRootNode.setAttribute('version', '3.1'); + } else { + this.xmlRootNode = usxRootElement; + } + } + + /** + * Builds the ID node in USX + * @param {SyntaxNode} node - The syntax node + * @param {Element} parentXmlNode - The parent XML node to append the ID to + */ + node2UsxId(node, parentXmlNode) { + const idCaptures = new Query(this.usfmLanguage, + "(id (bookcode) @book-code (description)? @desc)") + .captures(node); + + let code = null; + let desc = null; + + idCaptures.forEach(capture => { + if (capture.name === 'book-code') { + code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } else if (capture.name === 'desc') { + desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } + }); + + const bookXmlNode = parentXmlNode.ownerDocument.createElement('book'); + bookXmlNode.setAttribute('code', code); + bookXmlNode.setAttribute('style', 'id'); + + if (desc && desc.trim() !== '') { + const textNode = parentXmlNode.ownerDocument.createTextNode(desc.trim()); + bookXmlNode.appendChild(textNode); + } + + parentXmlNode.appendChild(bookXmlNode); + } + + + node2Usx(node, parentXmlNode) { + // Handling node types with respective functions + if (node.type === "id") { + this.node2UsxId(node, parentXmlNode); + // } else if (node.type === "chapter") { + // this.node2UsxChapter(node, parentXmlNode); + // } else if (["cl", "cp", "cd", "vp"].includes(node.type)) { + // this.node2UsxGeneric(node, parentXmlNode); + // } else if (["ca", "va"].includes(node.type)) { + // this.node2UsxCaVa(node, parentXmlNode); + // } else if (node.type === "v") { + // this.node2UsxVerse(node, parentXmlNode); + // } else if (node.type === "verseText") { + // node.children.forEach(child => { + // this.node2Usx(child, parentXmlNode); + // }); + // } else if (["paragraph", "pi", "ph"].includes(node.type)) { + // this.node2UsxPara(node, parentXmlNode); + // } else if (this.NOTE_MARKERS.includes(node.type)) { + // this.node2UsxNotes(node, parentXmlNode); + // } else if ( + // this.CHAR_STYLE_MARKERS.concat(this.NESTED_CHAR_STYLE_MARKERS, ["xt_standalone", "ref"]).includes(node.type) + // ) { + // this.node2UsxChar(node, parentXmlNode); + // } else if (node.type.endsWith("Attribute")) { + // this.node2UsxAttrib(node, parentXmlNode); + // } else if (node.type === "text") { + // let textVal = this.usfm.slice(node.startByte, node.endByte).toString('utf-8').trim(); + // let siblings = xpath.select('./*', parentXmlNode); + + // if (siblings.length > 0) { + // siblings[siblings.length - 1].appendData(textVal); + // } else { + // parentXmlNode.appendChild(parentXmlNode.ownerDocument.createTextNode(textVal)); + // } + // } else if (["table", "tr"].concat(this.TABLE_CELL_MARKERS).includes(node.type)) { + // this.node2UsxTable(node, parentXmlNode); + // } else if (node.type === "milestone" || node.type === "zNameSpace") { + // this.node2UsxMilestone(node, parentXmlNode); + // } else if (["esb", "cat", "fig"].includes(node.type)) { + // this.node2UsxSpecial(node, parentXmlNode); + // } else if ( + // this.PARA_STYLE_MARKERS.includes(node.type) || + // this.PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim()) + // ) { + // this.node2UsxGeneric(node, parentXmlNode); + } else if (["", "|"].includes(node.type.trim())) { + // Skip whitespace nodes + } else if (node.children.length > 0) { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } + // else { + // throw new Error(`Encountered unknown element: ${node}`); + // } + } +} + + +exports.USXGenerator = USXGenerator; From 4f4e452e2c9f1c92d4baa48768cabddec1cd5d66 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Wed, 16 Oct 2024 13:41:02 +0530 Subject: [PATCH 03/32] Node: Implement chapter, verse, text etc methods in USX Generation --- node-usfm-parser/src/usxGenerator.js | 277 +++++++++++++++++++++++++-- 1 file changed, 256 insertions(+), 21 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 35da66dc..3f7acbc7 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -62,23 +62,257 @@ class USXGenerator { parentXmlNode.appendChild(bookXmlNode); } + node2UsxC(node, parentXmlNode) { + // Build c, the chapter milestone node in usj + const chapCap = new Query(this.usfmLanguage, + `(c (chapterNumber) @chap-num + (ca (chapterNumber) @alt-num)? + (cp (text) @pub-num)?)`, + ) + .captures(node); + const chapNum = this.usfm.slice( + chapCap[0].node.startIndex, + chapCap[0].node.endIndex, + ); + const bookNode = xpath.select1("book", parentXmlNode); + const bookCode = bookNode.getAttribute("code"); + const chapRef = `${bookCode} ${chapNum}`; + + // Create the 'chapter' element + const chapXmlNode = parentXmlNode.ownerDocument.createElement('chapter'); + chapXmlNode.setAttribute("number", chapNum); + chapXmlNode.setAttribute("style", "c"); + chapXmlNode.setAttribute("sid", chapRef); + + chapCap.forEach((cap) => { + if (cap.name === "alt-num") { + const altNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('altnumber', altNum); + } + if (cap.name === "pub-num") { + const punNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('pubnumber', pubNum); + } + }); + + parentXmlNode.appendChild(chapXmlNode); + + node.children.forEach((child) => { + if (["cl", "cd"].includes(child.type)) { + this.node2Usx(child, parentXmlNode); + } + }); + } + + + + node2UsxChapter(node, parentXmlNode) { + // Build chapter node in USJ + node.children.forEach((child) => { + if (child.type === "c") { + this.node2UsxC(child, parentXmlNode); + } else { + this.node2Usx(child, parentXmlNode); + } + }); + } + + findPrevUncle(parentXmlNode) { + // Get the grandparent node + const grandParent = parentXmlNode.parentNode; + let uncleIndex = grandParent.childNodes.length - 2; // Start from the previous sibling + + while (uncleIndex >= 0) { + const uncle = grandParent.childNodes[uncleIndex]; + + // Skip 'sidebar' and 'ms' elements + if (uncle.tagName === "sidebar" || uncle.tagName === "ms") { + uncleIndex--; + } + // Skip elements with 'ca' or 'cp' in the style attribute + else if (uncle.getAttribute('style') === 'ca' || uncle.getAttribute('style') === 'cp') { + uncleIndex--; + } + // Return the found uncle element + else { + return uncle; + } + } + return null; // No suitable uncle found + } + + node2UsxVerse(node, parentXmlNode) { + // Find all previous 'verse' elements + const prevVerses = xpath.select("//verse", this.xmlRootNode); + + // Check if there are previous verses and if the last one has a 'sid' attribute + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + let vEndXmlNode; + if (parentXmlNode.textContent.trim() !== "") { + // If there is verse text in the current parent + vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vEndXmlNode); + } else { + // If no text, find the previous uncle and attach the end verse + const prevUncle = this.findPrevUncle(parentXmlNode); + if (prevUncle.tagName === "para") { + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + prevUncle.appendChild(vEndXmlNode); + } else if (prevUncle.tagName === "table") { + const rows = prevUncle.getElementsByTagName('tr'); + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + throw new Error(`prev_uncle is ${String(prevUncle)}`); + } + } + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + } + + // Query to capture verse-related elements + const verseNumCap = new Query(this.usfmLanguage, + ` + (v + (verseNumber) @vnum + (va (verseNumber) @alt)? + (vp (text) @vp)? + )`, + ) + .captures(node); + + const verseNum = this.usfm.substring( + verseNumCap[0].node.startIndex, + verseNumCap[0].node.endIndex, + ); + const vXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vXmlNode); + + // Loop through the captured elements and set the attributes + verseNumCap.forEach(capture => { + if (capture.name === 'alt') { + const altNum = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + vXmlNode.setAttribute('altnumber', altNum); + } else if (capture.name === 'vp') { + const vpText = this.usfm.slice(capture.node.startIndex, capture.node.endIndex).trim(); + vXmlNode.setAttribute('pubnumber', vpText); + } + }); + + // Get the last chapter's 'sid' attribute to form the verse reference + const chapterSid = xpath.select("//chapter", this.xmlRootNode).pop().getAttribute('sid'); + const ref = `${chapterSid}:${verseNum}`; + + // Set attributes on the newly created 'verse' element + vXmlNode.setAttribute('number', verseNum.trim()); + vXmlNode.setAttribute('style', 'v'); + vXmlNode.setAttribute('sid', ref.trim()); + } + + node2UsxCaVa(node, parentXmlNode) { + // Build elements for independent ca and va away from c and v + const style = node.type; + + // Create a new 'char' element under the parent XML node + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + charXmlNode.setAttribute('style', style); + + // Query to capture chapterNumber or verseNumber + const altNumMatch = new Query(this.usfmLanguage, + `([ + (chapterNumber) + (verseNumber) + ] @alt-num)`, + ) + .captures(node); + + // Extract the alternate number from the captured range + const altNum = this.usfm + .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex) + .trim(); + + // Set the attributes on the 'char' element + charXmlNode.setAttribute('altnumber', altNum); + charXmlNode.setAttribute('closed', 'true'); + + // Append the 'char' element to the parent XML node + parentXmlNode.appendChild(charXmlNode); + } + + node2UsxGeneric(node, parentXmlNode) { + const tagNode = node.children[0]; + let style = this.usfm.slice(tagNode.startIndex, tagNode.startIndex); + + // Strip leading backslashes from the style or use node type + if (style.startsWith('\\')) { + style = style.replace('\\', '').trim(); + } else { + style = node.type; + } + + let childrenRangeStart = 1; + if ( + node.children.length > 1 && + node.children[1].type.startsWith("numbered") + ) { + const numNode = node.children[1]; + const num = this.usfm.substring(numNode.startIndex, numNode.endIndex); + style += num; + childrenRangeStart = 2; + } + + // Create a 'para' element and set its style attribute + const paraXmlNode = parentXmlNode.ownerDocument.createElement('para'); + paraXmlNode.setAttribute('style', style); + + // Loop through the child nodes and recursively process them + for (let i = childrenRangeStart; i < node.children.length; i++) { + const child = node.children[i]; + if ( + CHAR_STYLE_MARKERS.includes(child.type) || + NESTED_CHAR_STYLE_MARKERS.includes(child.type) || + [ + "text", + "footnote", + "crossref", + "verseText", + "v", + "b", + "milestone", + "zNameSpace", + ].includes(child.type) + ) { + // If the child is of one of the allowed types, nest it inside the para node + this.node2Usx(child, paraXmlNode); + } else { + // Otherwise, append the child to the parent XML node + this.node2Usx(child, parentXmlNode); + } + } + + // Append the created para node to the parent XML node + parentXmlNode.appendChild(paraXmlNode); + } node2Usx(node, parentXmlNode) { // Handling node types with respective functions if (node.type === "id") { this.node2UsxId(node, parentXmlNode); - // } else if (node.type === "chapter") { - // this.node2UsxChapter(node, parentXmlNode); - // } else if (["cl", "cp", "cd", "vp"].includes(node.type)) { - // this.node2UsxGeneric(node, parentXmlNode); - // } else if (["ca", "va"].includes(node.type)) { - // this.node2UsxCaVa(node, parentXmlNode); - // } else if (node.type === "v") { - // this.node2UsxVerse(node, parentXmlNode); - // } else if (node.type === "verseText") { - // node.children.forEach(child => { - // this.node2Usx(child, parentXmlNode); - // }); + } else if (node.type === "chapter") { + this.node2UsxChapter(node, parentXmlNode); + } else if (["cl", "cp", "cd", "vp"].includes(node.type)) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["ca", "va"].includes(node.type)) { + this.node2UsxCaVa(node, parentXmlNode); + } else if (node.type === "v") { + this.node2UsxVerse(node, parentXmlNode); + } else if (node.type === "verseText") { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); // } else if (["paragraph", "pi", "ph"].includes(node.type)) { // this.node2UsxPara(node, parentXmlNode); // } else if (this.NOTE_MARKERS.includes(node.type)) { @@ -89,15 +323,16 @@ class USXGenerator { // this.node2UsxChar(node, parentXmlNode); // } else if (node.type.endsWith("Attribute")) { // this.node2UsxAttrib(node, parentXmlNode); - // } else if (node.type === "text") { - // let textVal = this.usfm.slice(node.startByte, node.endByte).toString('utf-8').trim(); - // let siblings = xpath.select('./*', parentXmlNode); - - // if (siblings.length > 0) { - // siblings[siblings.length - 1].appendData(textVal); - // } else { - // parentXmlNode.appendChild(parentXmlNode.ownerDocument.createTextNode(textVal)); - // } + } else if (node.type === "text") { + let textVal = this.usfm.slice(node.startIndex, node.endIndex).trim(); + const textNode = parentXmlNode.ownerDocument.createTextNode(textVal); + let siblings = xpath.select('./*', parentXmlNode); + + if (siblings.length > 0) { + siblings[siblings.length - 1].appendChild(textNode); + } else { + parentXmlNode.appendChild(textNode); + } // } else if (["table", "tr"].concat(this.TABLE_CELL_MARKERS).includes(node.type)) { // this.node2UsxTable(node, parentXmlNode); // } else if (node.type === "milestone" || node.type === "zNameSpace") { From 9ef34c56459cd02effa045524f1782eb591c5036 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Wed, 16 Oct 2024 13:52:12 +0530 Subject: [PATCH 04/32] Node: Implement content paragraph node in USX Generation --- node-usfm-parser/src/usxGenerator.js | 42 ++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 3f7acbc7..3147e701 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -242,6 +242,44 @@ class USXGenerator { parentXmlNode.appendChild(charXmlNode); } + node2UsxPara(node, parentXmlNode) { + // Build paragraph nodes in USX + if (node.children[0].type.endsWith('Block')) { + for (const child of node.children[0].children) { + this.node2UsxPara(child, parentXmlNode); + } + } else if (node.type === 'paragraph') { + const paraTagCap = new Query(this.usfmLanguage, + "(paragraph (_) @para-marker)").captures(node)[0]; + const paraMarker = paraTagCap.node.type; + + if (!paraMarker.endsWith("Block")) { + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + + for (const child of paraTagCap.node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + parentXmlNode.appendChild(paraXmlNode); + } + } else if (['pi', 'ph'].includes(node.type)) { + const paraMarker = this.usfm.slice(node.children[0].startByte, node.children[0].endByte) + .toString('utf-8') + .replace("\\", "") + .trim(); + + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + + for (const child of node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + parentXmlNode.appendChild(paraXmlNode); + } + } + node2UsxGeneric(node, parentXmlNode) { const tagNode = node.children[0]; let style = this.usfm.slice(tagNode.startIndex, tagNode.startIndex); @@ -313,8 +351,8 @@ class USXGenerator { node.children.forEach(child => { this.node2Usx(child, parentXmlNode); }); - // } else if (["paragraph", "pi", "ph"].includes(node.type)) { - // this.node2UsxPara(node, parentXmlNode); + } else if (["paragraph", "pi", "ph"].includes(node.type)) { + this.node2UsxPara(node, parentXmlNode); // } else if (this.NOTE_MARKERS.includes(node.type)) { // this.node2UsxNotes(node, parentXmlNode); // } else if ( From f7881eb503ac8bb65bf3ebec72caffe8516ee88d Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 09:44:46 +0530 Subject: [PATCH 05/32] Node: Implement Notes conversion to USX --- node-usfm-parser/src/usxGenerator.js | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 3147e701..13c5ffb2 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -280,6 +280,28 @@ class USXGenerator { } } + + node2UsxNotes(node, parentXmlNode) { + // Build USJ nodes for footnotes and cross-references + const tagNode = node.children[0]; + const callerNode = node.children[1]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const noteXmlNode = parentXmlNode.ownerDocument.createElement('note'); + noteXmlNode.setAttribute('style', style); + const caller = this.usfm + .substring(callerNode.startIndex, callerNode.endIndex) + .trim(); + noteXmlNode.setAttribute('caller', caller); + for (let i = 2; i < node.children.length - 1; i++) { + this.node2Usx(node.children[i], noteXmlNode); + } + + parentXmlNode.appendChild(noteXmlNode); + } + node2UsxGeneric(node, parentXmlNode) { const tagNode = node.children[0]; let style = this.usfm.slice(tagNode.startIndex, tagNode.startIndex); @@ -353,8 +375,8 @@ class USXGenerator { }); } else if (["paragraph", "pi", "ph"].includes(node.type)) { this.node2UsxPara(node, parentXmlNode); - // } else if (this.NOTE_MARKERS.includes(node.type)) { - // this.node2UsxNotes(node, parentXmlNode); + } else if (NOTE_MARKERS.includes(node.type)) { + this.node2UsxNotes(node, parentXmlNode); // } else if ( // this.CHAR_STYLE_MARKERS.concat(this.NESTED_CHAR_STYLE_MARKERS, ["xt_standalone", "ref"]).includes(node.type) // ) { From 6c0e185a22a324c1a3ef85913fbeb224e5b9c374 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 09:58:29 +0530 Subject: [PATCH 06/32] Node: Implement char nodes and attributes conversion to USX --- node-usfm-parser/src/usxGenerator.js | 70 +++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 13c5ffb2..aaa4e18d 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -302,6 +302,64 @@ class USXGenerator { parentXmlNode.appendChild(noteXmlNode); } + node2UsxChar(node, parentXmlNode) { + // Build USJ nodes for character markups, both regular and nested + const tagNode = node.children[0]; + let childrenRange = node.children.length; + if (node.children[node.children.length - 1].type.startsWith("\\")) { + childrenRange -= 1; // Exclude the last node if it starts with '\', treating it as a closing node + } + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .replace("+", "") + .trim(); + charXmlNode.setAttribute('style', style); + + // Assume a flag for closed markup, toggle this if your conditions and data structure require + // charJsonObj.closed = node.children[node.children.length - 1].type.startsWith('\\'); + + for (let i = 1; i < childrenRange; i++) { + this.node2Usx(node.children[i], charXmlNode); + } + + parentXmlNode.appendChild(charXmlNode); + } + + node2UsxAttrib(node, parentXmlNode) { + // Add attribute values to USJ elements + const attribNameNode = node.children[0]; + let attribName = this.usfm + .slice(attribNameNode.startIndex, attribNameNode.endIndex) + .trim(); + + // Handling special cases for attribute names + if (attribName === "|") { + attribName = DEFAULT_ATTRIB_MAP[node.parent.type]; + } + if (attribName === "src") { + // for \fig + attribName = "file"; + } + + const attribValCap = new Query(this.usfmLanguage, + "((attributeValue) @attrib-val)") + .captures(node); + + let attribValue = ""; + if (attribValCap.length > 0) { + attribValue = this.usfm + .substring( + attribValCap[0].node.startIndex, + attribValCap[0].node.endIndex, + ) + .trim(); + } + + parentXmlNode.setAttribute(attribName, attribValue); + } + node2UsxGeneric(node, parentXmlNode) { const tagNode = node.children[0]; let style = this.usfm.slice(tagNode.startIndex, tagNode.startIndex); @@ -377,12 +435,12 @@ class USXGenerator { this.node2UsxPara(node, parentXmlNode); } else if (NOTE_MARKERS.includes(node.type)) { this.node2UsxNotes(node, parentXmlNode); - // } else if ( - // this.CHAR_STYLE_MARKERS.concat(this.NESTED_CHAR_STYLE_MARKERS, ["xt_standalone", "ref"]).includes(node.type) - // ) { - // this.node2UsxChar(node, parentXmlNode); - // } else if (node.type.endsWith("Attribute")) { - // this.node2UsxAttrib(node, parentXmlNode); + } else if ( + CHAR_STYLE_MARKERS.concat(NESTED_CHAR_STYLE_MARKERS, ["xt_standalone", "ref"]).includes(node.type) + ) { + this.node2UsxChar(node, parentXmlNode); + } else if (node.type.endsWith("Attribute")) { + this.node2UsxAttrib(node, parentXmlNode); } else if (node.type === "text") { let textVal = this.usfm.slice(node.startIndex, node.endIndex).trim(); const textNode = parentXmlNode.ownerDocument.createTextNode(textVal); From 03f16ff2a22a47d31ae35432011c9f45b17e95a3 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 10:15:44 +0530 Subject: [PATCH 07/32] Node: Implement esb, cat, ref etc and generic parastyle markers in USX generation --- node-usfm-parser/src/usxGenerator.js | 48 ++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index aaa4e18d..231699e8 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -360,6 +360,40 @@ class USXGenerator { parentXmlNode.setAttribute(attribName, attribValue); } + node2UsxSpecial(node, parentXmlNode) { + // Build nodes for esb, cat, fig, optbreak in USJ + + if (node.type === "esb") { + const sidebarXmlNode = parentXmlNode.ownerDocument.createElement('sidebar'); + sidebarXmlNode.setAttribute('marker', "esb"); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, sidebarXmlNode); + }); + parentXmlNode.appendChild(sidebarXmlNode); + } else if (node.type === "cat") { + const catCap = new Query(this.usfmLanguage, + "((category) @category)") + .captures(node)[0]; + const category = this.usfm + .substring(catCap.node.startIndex, catCap.node.endIndex) + .trim(); + parentXmlNode.setAttribute("category", category); + } else if (node.type === "fig") { + const figXmlNode = parentXmlNode.ownerDocument.createElement('figure'); + figXmlNode.setAttribute("marker", "fig"); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, figXmlNode); + }); + parentXmlNode.appendChild(figXmlNode); + } else if (node.type === "ref") { + const refXmlNode = parentXmlNode.ownerDocument.createElement('ref'); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, refJsonObj); + }); + parentXmlNode.appendChild(refXmlNode); + } + } + node2UsxGeneric(node, parentXmlNode) { const tagNode = node.children[0]; let style = this.usfm.slice(tagNode.startIndex, tagNode.startIndex); @@ -455,13 +489,13 @@ class USXGenerator { // this.node2UsxTable(node, parentXmlNode); // } else if (node.type === "milestone" || node.type === "zNameSpace") { // this.node2UsxMilestone(node, parentXmlNode); - // } else if (["esb", "cat", "fig"].includes(node.type)) { - // this.node2UsxSpecial(node, parentXmlNode); - // } else if ( - // this.PARA_STYLE_MARKERS.includes(node.type) || - // this.PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim()) - // ) { - // this.node2UsxGeneric(node, parentXmlNode); + } else if (["esb", "cat", "fig"].includes(node.type)) { + this.node2UsxSpecial(node, parentXmlNode); + } else if ( + PARA_STYLE_MARKERS.includes(node.type) || + PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim()) + ) { + this.node2UsxGeneric(node, parentXmlNode); } else if (["", "|"].includes(node.type.trim())) { // Skip whitespace nodes } else if (node.children.length > 0) { From c9986d294d83f7873ec9585782425510414a959d Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 11:00:41 +0530 Subject: [PATCH 08/32] Node: Implement milestone and table nodes in USX generation --- node-usfm-parser/src/usxGenerator.js | 75 ++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 231699e8..709ffd9e 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -163,7 +163,7 @@ class USXGenerator { vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); prevUncle.appendChild(vEndXmlNode); } else if (prevUncle.tagName === "table") { - const rows = prevUncle.getElementsByTagName('tr'); + const rows = prevUncle.getElementsByTagName('table:row'); vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); rows[rows.length - 1].appendChild(vEndXmlNode); } else { @@ -256,12 +256,12 @@ class USXGenerator { if (!paraMarker.endsWith("Block")) { const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); for (const child of paraTagCap.node.children.slice(1)) { this.node2Usx(child, paraXmlNode); } - parentXmlNode.appendChild(paraXmlNode); } } else if (['pi', 'ph'].includes(node.type)) { const paraMarker = this.usfm.slice(node.children[0].startByte, node.children[0].endByte) @@ -271,12 +271,12 @@ class USXGenerator { const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); for (const child of node.children.slice(1)) { this.node2Usx(child, paraXmlNode); } - parentXmlNode.appendChild(paraXmlNode); } } @@ -295,11 +295,11 @@ class USXGenerator { .substring(callerNode.startIndex, callerNode.endIndex) .trim(); noteXmlNode.setAttribute('caller', caller); + parentXmlNode.appendChild(noteXmlNode); for (let i = 2; i < node.children.length - 1; i++) { this.node2Usx(node.children[i], noteXmlNode); } - parentXmlNode.appendChild(noteXmlNode); } node2UsxChar(node, parentXmlNode) { @@ -316,15 +316,12 @@ class USXGenerator { .replace("+", "") .trim(); charXmlNode.setAttribute('style', style); - - // Assume a flag for closed markup, toggle this if your conditions and data structure require - // charJsonObj.closed = node.children[node.children.length - 1].type.startsWith('\\'); + parentXmlNode.appendChild(charXmlNode); for (let i = 1; i < childrenRange; i++) { this.node2Usx(node.children[i], charXmlNode); } - parentXmlNode.appendChild(charXmlNode); } node2UsxAttrib(node, parentXmlNode) { @@ -360,6 +357,64 @@ class USXGenerator { parentXmlNode.setAttribute(attribName, attribValue); } + node2UsxTable(node, parentXmlNode) { + // Handle table related components and convert to USJ + if (node.type === "table") { + const tableXmlNode = parentXmlNode.ownerDocument.createElement('table'); + parentXmlNode.appendChild(tableXmlNode); + node.children.forEach((child) => { + this.node2Usx(child, tableXmlNode); + }); + } else if (node.type === "tr") { + const rowXmlNode = parentXmlNode.ownerDocument.createElement('table:row'); + rowXmlNode.setAttribute("marker", "tr"); + parentXmlNode.appendChild(rowXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, rowXmlNode); + }); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + const tagNode = node.children[0]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const cellXmlNode = parentXmlNode.ownerDocument.createElement("table:cell"); + cellXmlNode.setAttribute("marker", style); + cellXmlNode.setAttribute("align", style.includes("r") ? "end" : "start"); + parentXmlNode.appendChild(cellXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, cellXmlNode); + }); + } + } + + node2UsxMilestone(node, parentXmlNode) { + // Create ms node in USJ + + const msNameCap = new Query(this.usfmLanguage, + `( + [(milestoneTag) + (milestoneStartTag) + (milestoneEndTag) + (zSpaceTag) + ] @ms-name)`, + ) + .captures(node)[0]; + + const style = this.usfm + .slice(msNameCap.node.startIndex, msNameCap.node.endIndex) + .replace("\\", "") + .trim(); + const msXmlNode = parentXmlNode.ownerDocument.createElement("ms"); + msXmlNode.setAttribute("marker", style); + parentXmlNode.appendChild(msXmlNode); + node.children.forEach((child) => { + if (child.type.endsWith("Attribute")) { + this.node2Usx(child, msXmlNode); + } + }); + } + node2UsxSpecial(node, parentXmlNode) { // Build nodes for esb, cat, fig, optbreak in USJ @@ -485,8 +540,8 @@ class USXGenerator { } else { parentXmlNode.appendChild(textNode); } - // } else if (["table", "tr"].concat(this.TABLE_CELL_MARKERS).includes(node.type)) { - // this.node2UsxTable(node, parentXmlNode); + } else if (["table", "tr"].concat(TABLE_CELL_MARKERS).includes(node.type)) { + this.node2UsxTable(node, parentXmlNode); // } else if (node.type === "milestone" || node.type === "zNameSpace") { // this.node2UsxMilestone(node, parentXmlNode); } else if (["esb", "cat", "fig"].includes(node.type)) { From dd0505b11996d0d31d5739f2256335dd484d1495 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 11:03:58 +0530 Subject: [PATCH 09/32] Node: Make verse nodes empty and not carrying the text in USX generation --- node-usfm-parser/src/usxGenerator.js | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 709ffd9e..717df3fe 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -532,18 +532,13 @@ class USXGenerator { this.node2UsxAttrib(node, parentXmlNode); } else if (node.type === "text") { let textVal = this.usfm.slice(node.startIndex, node.endIndex).trim(); + textVal = textVal.replace("~", " ") const textNode = parentXmlNode.ownerDocument.createTextNode(textVal); - let siblings = xpath.select('./*', parentXmlNode); - - if (siblings.length > 0) { - siblings[siblings.length - 1].appendChild(textNode); - } else { - parentXmlNode.appendChild(textNode); - } + parentXmlNode.appendChild(textNode); } else if (["table", "tr"].concat(TABLE_CELL_MARKERS).includes(node.type)) { this.node2UsxTable(node, parentXmlNode); - // } else if (node.type === "milestone" || node.type === "zNameSpace") { - // this.node2UsxMilestone(node, parentXmlNode); + } else if (node.type === "milestone" || node.type === "zNameSpace") { + this.node2UsxMilestone(node, parentXmlNode); } else if (["esb", "cat", "fig"].includes(node.type)) { this.node2UsxSpecial(node, parentXmlNode); } else if ( From da2b383f00ee086c6ddfeae4180815969561a9cc Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 12:19:44 +0530 Subject: [PATCH 10/32] Node: Return xlmdom element instead of string after USX generation --- node-usfm-parser/src/usxGenerator.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 717df3fe..e9836feb 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -366,8 +366,8 @@ class USXGenerator { this.node2Usx(child, tableXmlNode); }); } else if (node.type === "tr") { - const rowXmlNode = parentXmlNode.ownerDocument.createElement('table:row'); - rowXmlNode.setAttribute("marker", "tr"); + const rowXmlNode = parentXmlNode.ownerDocument.createElement('row'); + rowXmlNode.setAttribute("style", "tr"); parentXmlNode.appendChild(rowXmlNode); node.children.slice(1).forEach((child) => { this.node2Usx(child, rowXmlNode); @@ -378,8 +378,8 @@ class USXGenerator { .substring(tagNode.startIndex, tagNode.endIndex) .replace("\\", "") .trim(); - const cellXmlNode = parentXmlNode.ownerDocument.createElement("table:cell"); - cellXmlNode.setAttribute("marker", style); + const cellXmlNode = parentXmlNode.ownerDocument.createElement("cell"); + cellXmlNode.setAttribute("style", style); cellXmlNode.setAttribute("align", style.includes("r") ? "end" : "start"); parentXmlNode.appendChild(cellXmlNode); node.children.slice(1).forEach((child) => { @@ -406,7 +406,7 @@ class USXGenerator { .replace("\\", "") .trim(); const msXmlNode = parentXmlNode.ownerDocument.createElement("ms"); - msXmlNode.setAttribute("marker", style); + msXmlNode.setAttribute("style", style); parentXmlNode.appendChild(msXmlNode); node.children.forEach((child) => { if (child.type.endsWith("Attribute")) { From f167e95ef98e65f95c69a170ae75aab08afe0bb8 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 12:25:48 +0530 Subject: [PATCH 11/32] Node: Fix issue of not adding node to xml tree before processing children --- node-usfm-parser/src/usfmParser.js | 4 ++-- node-usfm-parser/src/usxGenerator.js | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/node-usfm-parser/src/usfmParser.js b/node-usfm-parser/src/usfmParser.js index b537e26e..008d00a5 100644 --- a/node-usfm-parser/src/usfmParser.js +++ b/node-usfm-parser/src/usfmParser.js @@ -231,8 +231,8 @@ Only one of USFM, USJ or USX is supported in one object.`) // Process the syntax tree and convert to USX format usxGenerator.node2Usx(this.syntaxTree, usxGenerator.xmlRootNode); - xmlContent = usxSerializer.serializeToString(usxGenerator.xmlRootNode); - + // xmlContent = usxSerializer.serializeToString(usxGenerator.xmlRootNode); + xmlContent = usxGenerator.xmlRootNode; } catch (exe) { let message = "Unable to do the conversion. "; if (this.errors && this.errors.length > 0) { diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index e9836feb..3cbb3771 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -421,10 +421,10 @@ class USXGenerator { if (node.type === "esb") { const sidebarXmlNode = parentXmlNode.ownerDocument.createElement('sidebar'); sidebarXmlNode.setAttribute('marker', "esb"); + parentXmlNode.appendChild(sidebarXmlNode); node.children.slice(1, -1).forEach((child) => { this.node2Usx(child, sidebarXmlNode); }); - parentXmlNode.appendChild(sidebarXmlNode); } else if (node.type === "cat") { const catCap = new Query(this.usfmLanguage, "((category) @category)") @@ -436,16 +436,16 @@ class USXGenerator { } else if (node.type === "fig") { const figXmlNode = parentXmlNode.ownerDocument.createElement('figure'); figXmlNode.setAttribute("marker", "fig"); + parentXmlNode.appendChild(figXmlNode); node.children.slice(1, -1).forEach((child) => { this.node2Usx(child, figXmlNode); }); - parentXmlNode.appendChild(figXmlNode); } else if (node.type === "ref") { const refXmlNode = parentXmlNode.ownerDocument.createElement('ref'); + parentXmlNode.appendChild(refXmlNode); node.children.slice(1, -1).forEach((child) => { this.node2Usx(child, refJsonObj); }); - parentXmlNode.appendChild(refXmlNode); } } @@ -474,6 +474,7 @@ class USXGenerator { // Create a 'para' element and set its style attribute const paraXmlNode = parentXmlNode.ownerDocument.createElement('para'); paraXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(paraXmlNode); // Loop through the child nodes and recursively process them for (let i = childrenRangeStart; i < node.children.length; i++) { @@ -501,7 +502,6 @@ class USXGenerator { } // Append the created para node to the parent XML node - parentXmlNode.appendChild(paraXmlNode); } node2Usx(node, parentXmlNode) { From d4e4d4dd92845f4e37f8a8cbf7a4baef8d33f4ba Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 13:10:59 +0530 Subject: [PATCH 12/32] Node: Add verse end node at chapter end --- node-usfm-parser/src/usxGenerator.js | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 3cbb3771..33b91029 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -119,6 +119,23 @@ class USXGenerator { this.node2Usx(child, parentXmlNode); } }); + + const prevVerses = xpath.select("//verse", this.xmlRootNode); + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + const vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + const sibblingCount = parentXmlNode.childNodes.length; + const lastSibbling = parentXmlNode.childNodes[sibblingCount-1]; + if (lastSibbling.tagName === "para") { + lastSibbling.appendChild(vEndXmlNode); + } else if (prevUncle.tagName === "table") { + const rows = lastSibbling.getElementsByTagName('row'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + parentXmlNode.appendChild(vEndXmlNode); + } + } + } findPrevUncle(parentXmlNode) { @@ -163,7 +180,7 @@ class USXGenerator { vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); prevUncle.appendChild(vEndXmlNode); } else if (prevUncle.tagName === "table") { - const rows = prevUncle.getElementsByTagName('table:row'); + const rows = prevUncle.getElementsByTagName('row'); vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); rows[rows.length - 1].appendChild(vEndXmlNode); } else { From b6e90f9c0b0f4e0dcd6d1bc89dff1b9091d03428 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 13:54:56 +0530 Subject: [PATCH 13/32] Node: More minor fixes in USX generation --- node-usfm-parser/src/usxGenerator.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 33b91029..5d6018d0 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -92,7 +92,7 @@ class USXGenerator { chapXmlNode.setAttribute('altnumber', altNum); } if (cap.name === "pub-num") { - const punNum = this.usfm + const pubNum = this.usfm .substring(cap.node.startIndex, cap.node.endIndex) .trim(); chapXmlNode.setAttribute('pubnumber', pubNum); @@ -128,7 +128,7 @@ class USXGenerator { const lastSibbling = parentXmlNode.childNodes[sibblingCount-1]; if (lastSibbling.tagName === "para") { lastSibbling.appendChild(vEndXmlNode); - } else if (prevUncle.tagName === "table") { + } else if (lastSibbling.tagName === "table") { const rows = lastSibbling.getElementsByTagName('row'); rows[rows.length - 1].appendChild(vEndXmlNode); } else { From 7a3757a4db1fec13b883f9756146de704acc0862 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 14:18:38 +0530 Subject: [PATCH 14/32] Node: Use @xmldom/xlmdom instead of xmldom and xml2js --- node-usfm-parser/package.json | 2 +- node-usfm-parser/src/usfmParser.js | 2 -- node-usfm-parser/src/usxGenerator.js | 2 +- node-usfm-parser/test/config.js | 12 ++++-------- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/node-usfm-parser/package.json b/node-usfm-parser/package.json index e20a2b0b..2121d6ea 100644 --- a/node-usfm-parser/package.json +++ b/node-usfm-parser/package.json @@ -26,9 +26,9 @@ "Samuel JD (https://github.com/samueljd)" ], "dependencies": { + "@xmldom/xmldom": "^0.9.4", "tree-sitter": "0.21.1", "tree-sitter-usfm3": "file:../tree-sitter-usfm3", - "xmldom": "^0.6.0", "xpath": "^0.0.34" }, "devDependencies": { diff --git a/node-usfm-parser/src/usfmParser.js b/node-usfm-parser/src/usfmParser.js index 008d00a5..106e5c56 100644 --- a/node-usfm-parser/src/usfmParser.js +++ b/node-usfm-parser/src/usfmParser.js @@ -1,5 +1,4 @@ const Parser = require('tree-sitter'); -const { DOMImplementation, XMLSerializer } = require('xmldom'); const {USFMGenerator} = require("./usfmGenerator"); const {USJGenerator} = require("./usjGenerator"); @@ -8,7 +7,6 @@ const {USXGenerator} = require("./usxGenerator") const { includeMarkersInUsj, excludeMarkersInUsj, Filter } = require("./filters.js"); const USFM3 = require('tree-sitter-usfm3'); const { Query } = Parser; -const usxSerializer = new XMLSerializer(); class USFMParser { diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 5d6018d0..34404337 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -1,5 +1,5 @@ //Logics for syntax-tree to xml(USX) conversions -const { DOMImplementation, XMLSerializer } = require('xmldom'); +const { DOMImplementation, XMLSerializer } = require('@xmldom/xmldom'); const xpath = require('xpath'); const Parser = require("tree-sitter"); const {Query} = Parser; diff --git a/node-usfm-parser/test/config.js b/node-usfm-parser/test/config.js index c2dd098d..a0b645ef 100644 --- a/node-usfm-parser/test/config.js +++ b/node-usfm-parser/test/config.js @@ -1,6 +1,6 @@ const {glob} = require('glob'); const fs = require('node:fs'); -const xml2js = require('xml2js'); +const { DOMParser } = require('@xmldom/xmldom') const {USFMParser} = require("../src/index"); let allUsfmFiles = []; @@ -132,13 +132,9 @@ const checkValidUsfm = function (inputUsfmPath) { let metaFilePath = inputUsfmPath.replace("origin.usfm", "metadata.xml") let metadata = fs.readFileSync(metaFilePath, 'utf8') - xml2js.parseString(metadata, (err, result) => { - if (err) { - console.error('Error parsing XML:', err); - return; - } - value = result['test-metadata']['validated'][0]; - }); + const doc = new DOMParser().parseFromString(metadata, 'text/xml'); + + value = doc.getElementsByTagName("validated")[0].textContent; if (value === "fail"){ return false From 061dfc49707cf9d37fb64dcd91f4c2bc1485ea6a Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 14:45:12 +0530 Subject: [PATCH 15/32] Node: Add tests for errorless usfm-usx conversion checks --- node-usfm-parser/test/test_usx_conversion.js | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 node-usfm-parser/test/test_usx_conversion.js diff --git a/node-usfm-parser/test/test_usx_conversion.js b/node-usfm-parser/test/test_usx_conversion.js new file mode 100644 index 00000000..768e635a --- /dev/null +++ b/node-usfm-parser/test/test_usx_conversion.js @@ -0,0 +1,26 @@ +const assert = require('assert'); +const fs = require('node:fs'); +const { DOMImplementation, XMLSerializer } = require('@xmldom/xmldom'); +const {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSJs, findAllMarkers} = require('./config'); +const {USFMParser, Filter} = require("../src/index"); + +describe("Check successful USFM-USX conversion for positive samples", () => { + const domImpl = new DOMImplementation(); + const sampleDoc = domImpl.createDocument(null, 'usx', null); + allUsfmFiles.forEach(function(value) { + + if (isValidUsfm[value]) { + it(`Convert ${value} to USX`, (inputUsfmPath=value) => { + //Tests if input parses without errors + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + // assert(usx instanceof DOMImplementation.Document); + assert(usx.tagName === "usx"); + assert(usx.getAttribute("version") === "3.1"); + assert(usx.childNodes[0].tagName === "book"); + assert(usx.childNodes[0].getAttribute("style") === "id"); + }); + } + }); +}); From 3141ad1be72ea6516f9e22e4fc054e6221b4acb9 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 15:11:54 +0530 Subject: [PATCH 16/32] Node: Switch back to xmldom for speed --- node-usfm-parser/package.json | 2 +- node-usfm-parser/src/usxGenerator.js | 3 ++- node-usfm-parser/test/config.js | 24 +++++++++++++++++++++++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/node-usfm-parser/package.json b/node-usfm-parser/package.json index 2121d6ea..e20a2b0b 100644 --- a/node-usfm-parser/package.json +++ b/node-usfm-parser/package.json @@ -26,9 +26,9 @@ "Samuel JD (https://github.com/samueljd)" ], "dependencies": { - "@xmldom/xmldom": "^0.9.4", "tree-sitter": "0.21.1", "tree-sitter-usfm3": "file:../tree-sitter-usfm3", + "xmldom": "^0.6.0", "xpath": "^0.0.34" }, "devDependencies": { diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 34404337..75b9b253 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -1,5 +1,6 @@ //Logics for syntax-tree to xml(USX) conversions -const { DOMImplementation, XMLSerializer } = require('@xmldom/xmldom'); +// const { DOMImplementation, XMLSerializer } = require('@xmldom/xmldom'); +const { DOMImplementation, XMLSerializer } = require('xmldom'); const xpath = require('xpath'); const Parser = require("tree-sitter"); const {Query} = Parser; diff --git a/node-usfm-parser/test/config.js b/node-usfm-parser/test/config.js index a0b645ef..a5f6f1aa 100644 --- a/node-usfm-parser/test/config.js +++ b/node-usfm-parser/test/config.js @@ -1,6 +1,6 @@ const {glob} = require('glob'); const fs = require('node:fs'); -const { DOMParser } = require('@xmldom/xmldom') +const { DOMParser } = require('xmldom') const {USFMParser} = require("../src/index"); let allUsfmFiles = []; @@ -105,6 +105,28 @@ let excludeUSJs = [ ] +let excludeUSXs = [ + `${TEST_DIR}/specExamples/extended/contentCatogories2/origin.xml`, + // \ef not treated as inline content of paragraph + `${TEST_DIR}/specExamples/extended/sectionIntroductions/origin.xml`, + // verse number="+"!!! + `${TEST_DIR}/specExamples/character/origin.xml`, + // lit element treated as a body paragraph enclosing a verse! + `${TEST_DIR}/usfmjsTests/esb/origin.xml`, + // last verse text given outside of paragraph. + `${TEST_DIR}/special-cases/nbsp/origin.xml`, + // ~ not being replaced by nbsp in usfm-grammar + `${TEST_DIR}/special-cases/empty-attributes/origin.xml`, + // attributes treated as text content of marker + `${TEST_DIR}/biblica/CategoriesOnNotes/origin.xml`, + `${TEST_DIR}/biblica/CrossRefWithPipe/origin.xml`, + // ref node has type ref. Is it char or ref? + `${TEST_DIR}/usfmjsTests/usfmBodyTestD/origin.xml`, + // \v and other contents contained inside \lit. New docs doesnt have \lit + `${TEST_DIR}/usfmjsTests/usfm-body-testF/origin.xml`, + // does the ms go inside \s5 or after it? +] + const initialiseParser = function (inputUsfmPath){ `Open and parse the given file` try { From 90ee92ec7e472815a217c7144131e9e89d8c8573 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 16:14:41 +0530 Subject: [PATCH 17/32] Node: exlcude usfm(version) node in USX --- node-usfm-parser/src/usxGenerator.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 75b9b253..ff9387ce 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -1,5 +1,4 @@ //Logics for syntax-tree to xml(USX) conversions -// const { DOMImplementation, XMLSerializer } = require('@xmldom/xmldom'); const { DOMImplementation, XMLSerializer } = require('xmldom'); const xpath = require('xpath'); const Parser = require("tree-sitter"); @@ -478,6 +477,10 @@ class USXGenerator { style = node.type; } + if (style === "usfm") { + return + } + let childrenRangeStart = 1; if ( node.children.length > 1 && From 8143a77ec1d0ef0e73e3dbfa633e4260e0dea9f9 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 16:47:09 +0530 Subject: [PATCH 18/32] Node: Fix issue with numbered markers --- node-usfm-parser/src/usxGenerator.js | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index ff9387ce..1a1fd192 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -468,13 +468,13 @@ class USXGenerator { node2UsxGeneric(node, parentXmlNode) { const tagNode = node.children[0]; - let style = this.usfm.slice(tagNode.startIndex, tagNode.startIndex); + let style = this.usfm.slice(tagNode.startIndex, tagNode.endIndex).trim(); // Strip leading backslashes from the style or use node type if (style.startsWith('\\')) { - style = style.replace('\\', '').trim(); - } else { - style = node.type; + style = style.replace('\\', ''); + // } else { + // style = node.type; } if (style === "usfm") { @@ -482,15 +482,6 @@ class USXGenerator { } let childrenRangeStart = 1; - if ( - node.children.length > 1 && - node.children[1].type.startsWith("numbered") - ) { - const numNode = node.children[1]; - const num = this.usfm.substring(numNode.startIndex, numNode.endIndex); - style += num; - childrenRangeStart = 2; - } // Create a 'para' element and set its style attribute const paraXmlNode = parentXmlNode.ownerDocument.createElement('para'); From a4568fa33771989a3c3545d0d480634363f9097e Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 16:50:47 +0530 Subject: [PATCH 19/32] Node: Fix marker usage instead od style --- node-usfm-parser/src/usxGenerator.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 1a1fd192..c06f3287 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -437,7 +437,7 @@ class USXGenerator { if (node.type === "esb") { const sidebarXmlNode = parentXmlNode.ownerDocument.createElement('sidebar'); - sidebarXmlNode.setAttribute('marker', "esb"); + sidebarXmlNode.setAttribute('style', "esb"); parentXmlNode.appendChild(sidebarXmlNode); node.children.slice(1, -1).forEach((child) => { this.node2Usx(child, sidebarXmlNode); @@ -452,7 +452,7 @@ class USXGenerator { parentXmlNode.setAttribute("category", category); } else if (node.type === "fig") { const figXmlNode = parentXmlNode.ownerDocument.createElement('figure'); - figXmlNode.setAttribute("marker", "fig"); + figXmlNode.setAttribute("style", "fig"); parentXmlNode.appendChild(figXmlNode); node.children.slice(1, -1).forEach((child) => { this.node2Usx(child, figXmlNode); From 4557449c33fdae50c72aa997a9799bbe610e5361 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 16:59:22 +0530 Subject: [PATCH 20/32] Node: Fix issues of pi style value --- node-usfm-parser/src/usxGenerator.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index c06f3287..822a94f8 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -281,11 +281,9 @@ class USXGenerator { } } else if (['pi', 'ph'].includes(node.type)) { - const paraMarker = this.usfm.slice(node.children[0].startByte, node.children[0].endByte) - .toString('utf-8') + const paraMarker = this.usfm.slice(node.children[0].startIndex, node.children[0].endIndex) .replace("\\", "") .trim(); - const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); paraXmlNode.setAttribute("style", paraMarker); parentXmlNode.appendChild(paraXmlNode); From afbf86e28a1ac53188e4a16e9791227151b21814 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 17:22:45 +0530 Subject: [PATCH 21/32] Node: Fix the similar issue with numbered marker fiun in USJ generation --- node-usfm-parser/src/usjGenerator.js | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/node-usfm-parser/src/usjGenerator.js b/node-usfm-parser/src/usjGenerator.js index 13531a3f..edf9f9bf 100644 --- a/node-usfm-parser/src/usjGenerator.js +++ b/node-usfm-parser/src/usjGenerator.js @@ -418,21 +418,21 @@ class USJGenerator { let style = this.usfm.substring(tagNode.startIndex, tagNode.endIndex); if (style.startsWith("\\")) { style = style.replace("\\", "").trim(); - } else { - style = node.type; + // } else { + // style = node.type; } // console.log(node.children.length, node.children[0].type, node.children[1].type) let childrenRangeStart = 1; - if ( - node.children.length > 1 && - node.children[1].type.startsWith("numbered") - ) { - const numNode = node.children[1]; - const num = this.usfm.substring(numNode.startIndex, numNode.endIndex); - style += num; - childrenRangeStart = 2; - } + // if ( + // node.children.length > 1 && + // node.children[1].type.startsWith("numbered") + // ) { + // const numNode = node.children[1]; + // const num = this.usfm.substring(numNode.startIndex, numNode.endIndex); + // style += num; + // childrenRangeStart = 2; + // } const paraJsonObj = { type: "para", marker: style, content: [] }; parentJsonObj.content.push(paraJsonObj); From 0ebc4a6ffbc987d9dc52983458c099923b8793b0 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 17:23:22 +0530 Subject: [PATCH 22/32] Node: Keep ref marker not as char in USX generation --- node-usfm-parser/src/usxGenerator.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index 822a94f8..d8a6a202 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -535,7 +535,7 @@ class USXGenerator { } else if (NOTE_MARKERS.includes(node.type)) { this.node2UsxNotes(node, parentXmlNode); } else if ( - CHAR_STYLE_MARKERS.concat(NESTED_CHAR_STYLE_MARKERS, ["xt_standalone", "ref"]).includes(node.type) + CHAR_STYLE_MARKERS.concat(NESTED_CHAR_STYLE_MARKERS, ["xt_standalone"]).includes(node.type) ) { this.node2UsxChar(node, parentXmlNode); } else if (node.type.endsWith("Attribute")) { @@ -549,7 +549,7 @@ class USXGenerator { this.node2UsxTable(node, parentXmlNode); } else if (node.type === "milestone" || node.type === "zNameSpace") { this.node2UsxMilestone(node, parentXmlNode); - } else if (["esb", "cat", "fig"].includes(node.type)) { + } else if (["esb", "cat", "fig", "ref"].includes(node.type)) { this.node2UsxSpecial(node, parentXmlNode); } else if ( PARA_STYLE_MARKERS.includes(node.type) || From 04bfc89787af10f91ca1522e0114a0ff121aa9d4 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 17:24:02 +0530 Subject: [PATCH 23/32] Node: tests for checking all markers in generated USX --- node-usfm-parser/test/config.js | 1 + node-usfm-parser/test/test_usx_conversion.js | 94 +++++++++++++++++++- 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/node-usfm-parser/test/config.js b/node-usfm-parser/test/config.js index a5f6f1aa..815f3f3d 100644 --- a/node-usfm-parser/test/config.js +++ b/node-usfm-parser/test/config.js @@ -220,5 +220,6 @@ module.exports = { initialiseParser: initialiseParser, isValidUsfm: isValidUsfm, excludeUSJs: excludeUSJs, + excludeUSXs: excludeUSXs, findAllMarkers: findAllMarkers }; diff --git a/node-usfm-parser/test/test_usx_conversion.js b/node-usfm-parser/test/test_usx_conversion.js index 768e635a..e85b9596 100644 --- a/node-usfm-parser/test/test_usx_conversion.js +++ b/node-usfm-parser/test/test_usx_conversion.js @@ -1,7 +1,7 @@ const assert = require('assert'); const fs = require('node:fs'); -const { DOMImplementation, XMLSerializer } = require('@xmldom/xmldom'); -const {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSJs, findAllMarkers} = require('./config'); +const { DOMImplementation, XMLSerializer, DOMParser } = require('xmldom'); +const {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSXs, findAllMarkers} = require('./config'); const {USFMParser, Filter} = require("../src/index"); describe("Check successful USFM-USX conversion for positive samples", () => { @@ -24,3 +24,93 @@ describe("Check successful USFM-USX conversion for positive samples", () => { } }); }); + + + +describe("Ensure all markers are in USX", () => { + // Tests if all markers in USFM are present in output also + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Check for markers of ${value} in USX`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + + const inputMarkers = [... new Set(findAllMarkers(testParser.usfm, keepId=true))] + const allUSXNodes = getNodes(usx); + + assert.deepStrictEqual(inputMarkers, allUSXNodes, `Markers in input and generated USJ differ`) + }); + } + }); + +}); + + +// describe("Compare generated USX with testsuite sample", () => { + +// allUsfmFiles.forEach(function(value) { +// const usxPath = value.replace(".usfm", ".xml"); +// if (isValidUsfm[value] && ! excludeUSXs.includes(usxPath)) { +// it(`Compare generated USX to ${usxPath}`, (inputUsfmPath=value) => { +// const testParser = initialiseParser(inputUsfmPath) +// const generatedUSX = testParser.toUSX(); +// const filePath = usxPath; +// let fileData = null; +// try { +// fileData = fs.readFileSync(filePath, "utf8"); +// } catch(err) { +// if (err.code === "ENOENT") { +// return +// } +// } +// const testsuiteUSX = new DOMParser().parseFromString( +// fileData, 'text/xml').getElementsByTagName("usx")[0]; + +// assert.deepEqual(generatedUSX, testsuiteUSX); +// }); +// } +// }); +// }); + +function getNodes(element, keepNumber=true) { + // Recursive function to find all keys in the dict output + let types = []; + if (element.nodeType === element.TEXT_NODE) { + return types; // Return empty array if element is a string + } else { + if (element.getAttribute('style')) { + types.push(element.getAttribute('style')); + } + if (element.tagName === "ref") { + types.push("ref"); + } + if (element.getAttribute('altnumber')) { + if (element.tagName === 'chapter') { + types.push('ca'); + } else { + types.push('va'); + } + } + if (element.getAttribute('pubnumber')) { + if (element.tagName === 'chapter') { + types.push('cp'); + } else { + types.push('vp'); + } + } + if (element.getAttribute('category')) { + types.push('cat'); + } + if (element.childNodes.length > 0) { + Array.from(element.childNodes).forEach(child => { + types = types.concat(getNodes(child)); // Recursively get types from content + }); + } + } + let uniqueTypes = [...new Set(types)]; + if (! keepNumber) { + uniqueTypes = uniqueTypes.map(item => item.replace(/\d+$/, '')); + } + return uniqueTypes; +} From 95a5e92a6c61d86d403225d19a013fd3665289a9 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 17:58:19 +0530 Subject: [PATCH 24/32] Node: Minor fix in ref handling --- node-usfm-parser/src/usxGenerator.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js index d8a6a202..37c61c1d 100644 --- a/node-usfm-parser/src/usxGenerator.js +++ b/node-usfm-parser/src/usxGenerator.js @@ -459,7 +459,7 @@ class USXGenerator { const refXmlNode = parentXmlNode.ownerDocument.createElement('ref'); parentXmlNode.appendChild(refXmlNode); node.children.slice(1, -1).forEach((child) => { - this.node2Usx(child, refJsonObj); + this.node2Usx(child, refXmlNode); }); } } From f1d16fa20877b98c48f0080577be3cafdeda4e6a Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 17:59:06 +0530 Subject: [PATCH 25/32] Node: Allow tests to run in parallel --- node-usfm-parser/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-usfm-parser/package.json b/node-usfm-parser/package.json index e20a2b0b..1711652a 100644 --- a/node-usfm-parser/package.json +++ b/node-usfm-parser/package.json @@ -6,7 +6,7 @@ "module": "./dist/es/index.mjs", "scripts": { "build": "parcel build ./src/index.js", - "test": "mocha --timeout 40000" + "test": "mocha --timeout 40000 --parallel" }, "repository": { "type": "git", From 3ef16d2957beab284fa6f805aeed0107e9d71640 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 19:44:55 +0530 Subject: [PATCH 26/32] Web: Replicate toUSX() implementation as in Node --- web-usfm-parser/package.json | 4 + web-usfm-parser/src/usfmParser.js | 45 ++- web-usfm-parser/src/usxGenerator.js | 576 ++++++++++++++++++++++++++++ 3 files changed, 619 insertions(+), 6 deletions(-) create mode 100644 web-usfm-parser/src/usxGenerator.js diff --git a/web-usfm-parser/package.json b/web-usfm-parser/package.json index 20b86a15..19e81860 100644 --- a/web-usfm-parser/package.json +++ b/web-usfm-parser/package.json @@ -46,5 +46,9 @@ "process": "^0.11.10", "web-tree-sitter": "^0.22.6", "xml2js": "^0.6.2" + }, + "dependencies": { + "xmldom": "^0.6.0", + "xpath": "^0.0.34" } } diff --git a/web-usfm-parser/src/usfmParser.js b/web-usfm-parser/src/usfmParser.js index 9e8ffd51..60988e77 100644 --- a/web-usfm-parser/src/usfmParser.js +++ b/web-usfm-parser/src/usfmParser.js @@ -3,6 +3,7 @@ import Parser from './web-tree-sitter/tree-sitter.js'; import USFMGenerator from "./usfmGenerator.js"; import USJGenerator from "./usjGenerator.js"; import ListGenerator from "./listGenerator.js" +import USXGenerator from "./usxGenerator.js"; import { Filter } from "./filters.js"; @@ -202,9 +203,9 @@ Only one of USFM, USJ or USX is supported in one object.`) /* Uses the toJSON function and converts JSON to CSV To be re-implemented to work with the flat JSON schema */ - if (!ignoreErrors && this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - throw new Error(`Errors present:\n\t${errStr}\nUse ignoreErrors=true to generate output despite errors`); + if (!ignoreErrors && this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`); } try { @@ -216,13 +217,45 @@ Only one of USFM, USJ or USX is supported in one object.`) } catch (exe) { let message = "Unable to do the conversion. "; - if (this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - message += `Could be due to an error in the USFM\n\t${errStr}`; + if (this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errorString}`; + } + throw new Error(message, { cause: exe }); + } + + } + + toUSX(ignoreErrors = false) { + /* Convert the syntax_tree to the XML format (USX) */ + + if (!ignoreErrors && this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`); + } + let xmlContent = null; + + try { + // Initialize the USX generator (assuming the constructor is already implemented in JS) + const usxGenerator = new USXGenerator(USFMParser.language, + this.usfm); + + // Process the syntax tree and convert to USX format + usxGenerator.node2Usx(this.syntaxTree, usxGenerator.xmlRootNode); + + // xmlContent = usxSerializer.serializeToString(usxGenerator.xmlRootNode); + xmlContent = usxGenerator.xmlRootNode; + } catch (exe) { + let message = "Unable to do the conversion. "; + if (this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errorString}`; } throw new Error(message, { cause: exe }); } + // Return the generated XML structure (in JSON format) + return xmlContent; } } diff --git a/web-usfm-parser/src/usxGenerator.js b/web-usfm-parser/src/usxGenerator.js new file mode 100644 index 00000000..00ade8a1 --- /dev/null +++ b/web-usfm-parser/src/usxGenerator.js @@ -0,0 +1,576 @@ +//Logics for syntax-tree to xml(USX) conversions +import { DOMImplementation, XMLSerializer } from 'xmldom'; +import xpath from 'xpath'; + +import { PARA_STYLE_MARKERS, NOTE_MARKERS, CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, DEFAULT_ATTRIB_MAP, TABLE_CELL_MARKERS, MISC_MARKERS } from "./utils/markers.js"; + + +class USXGenerator { + /** + * A binding for all methods used in generating USX from Syntax tree + * @param {object} treeSitterLanguageObj - The Tree-sitter language object + * @param {Buffer} usfmString - The USFM byte data + * @param {Element} [usxRootElement] - The root element of the USX (optional) + */ + constructor(treeSitterLanguageObj, usfmString, usxRootElement = null) { + this.usfmLanguage = treeSitterLanguageObj; + this.usfm = usfmString; + + const domImpl = new DOMImplementation(); + const doc = domImpl.createDocument(null, 'usx', null); + + if (usxRootElement === null) { + this.xmlRootNode = doc.documentElement; + this.xmlRootNode.setAttribute('version', '3.1'); + } else { + this.xmlRootNode = usxRootElement; + } + } + + /** + * Builds the ID node in USX + * @param {SyntaxNode} node - The syntax node + * @param {Element} parentXmlNode - The parent XML node to append the ID to + */ + node2UsxId(node, parentXmlNode) { + const idCaptures = this.usfmLanguage + .query("(id (bookcode) @book-code (description)? @desc)") + .captures(node); + + let code = null; + let desc = null; + + idCaptures.forEach(capture => { + if (capture.name === 'book-code') { + code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } else if (capture.name === 'desc') { + desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } + }); + + const bookXmlNode = parentXmlNode.ownerDocument.createElement('book'); + bookXmlNode.setAttribute('code', code); + bookXmlNode.setAttribute('style', 'id'); + + if (desc && desc.trim() !== '') { + const textNode = parentXmlNode.ownerDocument.createTextNode(desc.trim()); + bookXmlNode.appendChild(textNode); + } + + parentXmlNode.appendChild(bookXmlNode); + } + + node2UsxC(node, parentXmlNode) { + // Build c, the chapter milestone node in usj + const chapCap = this.usfmLanguage + .query( + `(c (chapterNumber) @chap-num + (ca (chapterNumber) @alt-num)? + (cp (text) @pub-num)?)`, + ) + .captures(node); + const chapNum = this.usfm.slice( + chapCap[0].node.startIndex, + chapCap[0].node.endIndex, + ); + const bookNode = xpath.select1("book", parentXmlNode); + const bookCode = bookNode.getAttribute("code"); + const chapRef = `${bookCode} ${chapNum}`; + + // Create the 'chapter' element + const chapXmlNode = parentXmlNode.ownerDocument.createElement('chapter'); + chapXmlNode.setAttribute("number", chapNum); + chapXmlNode.setAttribute("style", "c"); + chapXmlNode.setAttribute("sid", chapRef); + + chapCap.forEach((cap) => { + if (cap.name === "alt-num") { + const altNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('altnumber', altNum); + } + if (cap.name === "pub-num") { + const pubNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('pubnumber', pubNum); + } + }); + + parentXmlNode.appendChild(chapXmlNode); + + node.children.forEach((child) => { + if (["cl", "cd"].includes(child.type)) { + this.node2Usx(child, parentXmlNode); + } + }); + } + + + + node2UsxChapter(node, parentXmlNode) { + // Build chapter node in USJ + node.children.forEach((child) => { + if (child.type === "c") { + this.node2UsxC(child, parentXmlNode); + } else { + this.node2Usx(child, parentXmlNode); + } + }); + + const prevVerses = xpath.select("//verse", this.xmlRootNode); + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + const vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + const sibblingCount = parentXmlNode.childNodes.length; + const lastSibbling = parentXmlNode.childNodes[sibblingCount-1]; + if (lastSibbling.tagName === "para") { + lastSibbling.appendChild(vEndXmlNode); + } else if (lastSibbling.tagName === "table") { + const rows = lastSibbling.getElementsByTagName('row'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + parentXmlNode.appendChild(vEndXmlNode); + } + } + + } + + findPrevUncle(parentXmlNode) { + // Get the grandparent node + const grandParent = parentXmlNode.parentNode; + let uncleIndex = grandParent.childNodes.length - 2; // Start from the previous sibling + + while (uncleIndex >= 0) { + const uncle = grandParent.childNodes[uncleIndex]; + + // Skip 'sidebar' and 'ms' elements + if (uncle.tagName === "sidebar" || uncle.tagName === "ms") { + uncleIndex--; + } + // Skip elements with 'ca' or 'cp' in the style attribute + else if (uncle.getAttribute('style') === 'ca' || uncle.getAttribute('style') === 'cp') { + uncleIndex--; + } + // Return the found uncle element + else { + return uncle; + } + } + return null; // No suitable uncle found + } + + node2UsxVerse(node, parentXmlNode) { + // Find all previous 'verse' elements + const prevVerses = xpath.select("//verse", this.xmlRootNode); + + // Check if there are previous verses and if the last one has a 'sid' attribute + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + let vEndXmlNode; + if (parentXmlNode.textContent.trim() !== "") { + // If there is verse text in the current parent + vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vEndXmlNode); + } else { + // If no text, find the previous uncle and attach the end verse + const prevUncle = this.findPrevUncle(parentXmlNode); + if (prevUncle.tagName === "para") { + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + prevUncle.appendChild(vEndXmlNode); + } else if (prevUncle.tagName === "table") { + const rows = prevUncle.getElementsByTagName('row'); + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + throw new Error(`prev_uncle is ${String(prevUncle)}`); + } + } + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + } + + // Query to capture verse-related elements + const verseNumCap = this.usfmLanguage + .query( + ` + (v + (verseNumber) @vnum + (va (verseNumber) @alt)? + (vp (text) @vp)? + )`, + ) + .captures(node); + + const verseNum = this.usfm.substring( + verseNumCap[0].node.startIndex, + verseNumCap[0].node.endIndex, + ); + const vXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vXmlNode); + + // Loop through the captured elements and set the attributes + verseNumCap.forEach(capture => { + if (capture.name === 'alt') { + const altNum = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + vXmlNode.setAttribute('altnumber', altNum); + } else if (capture.name === 'vp') { + const vpText = this.usfm.slice(capture.node.startIndex, capture.node.endIndex).trim(); + vXmlNode.setAttribute('pubnumber', vpText); + } + }); + + // Get the last chapter's 'sid' attribute to form the verse reference + const chapterSid = xpath.select("//chapter", this.xmlRootNode).pop().getAttribute('sid'); + const ref = `${chapterSid}:${verseNum}`; + + // Set attributes on the newly created 'verse' element + vXmlNode.setAttribute('number', verseNum.trim()); + vXmlNode.setAttribute('style', 'v'); + vXmlNode.setAttribute('sid', ref.trim()); + } + + node2UsxCaVa(node, parentXmlNode) { + // Build elements for independent ca and va away from c and v + const style = node.type; + + // Create a new 'char' element under the parent XML node + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + charXmlNode.setAttribute('style', style); + + // Query to capture chapterNumber or verseNumber + const altNumMatch = this.usfmLanguage + .query( + `([ + (chapterNumber) + (verseNumber) + ] @alt-num)`, + ) + .captures(node); + + // Extract the alternate number from the captured range + const altNum = this.usfm + .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex) + .trim(); + + // Set the attributes on the 'char' element + charXmlNode.setAttribute('altnumber', altNum); + charXmlNode.setAttribute('closed', 'true'); + + // Append the 'char' element to the parent XML node + parentXmlNode.appendChild(charXmlNode); + } + + node2UsxPara(node, parentXmlNode) { + // Build paragraph nodes in USX + if (node.children[0].type.endsWith('Block')) { + for (const child of node.children[0].children) { + this.node2UsxPara(child, parentXmlNode); + } + } else if (node.type === 'paragraph') { + const paraTagCap = this.usfmLanguage + .query("(paragraph (_) @para-marker)") + .captures(node)[0]; + const paraMarker = paraTagCap.node.type; + + if (!paraMarker.endsWith("Block")) { + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of paraTagCap.node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } else if (['pi', 'ph'].includes(node.type)) { + const paraMarker = this.usfm.slice(node.children[0].startIndex, node.children[0].endIndex) + .replace("\\", "") + .trim(); + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } + + + node2UsxNotes(node, parentXmlNode) { + // Build USJ nodes for footnotes and cross-references + const tagNode = node.children[0]; + const callerNode = node.children[1]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const noteXmlNode = parentXmlNode.ownerDocument.createElement('note'); + noteXmlNode.setAttribute('style', style); + const caller = this.usfm + .substring(callerNode.startIndex, callerNode.endIndex) + .trim(); + noteXmlNode.setAttribute('caller', caller); + parentXmlNode.appendChild(noteXmlNode); + for (let i = 2; i < node.children.length - 1; i++) { + this.node2Usx(node.children[i], noteXmlNode); + } + + } + + node2UsxChar(node, parentXmlNode) { + // Build USJ nodes for character markups, both regular and nested + const tagNode = node.children[0]; + let childrenRange = node.children.length; + if (node.children[node.children.length - 1].type.startsWith("\\")) { + childrenRange -= 1; // Exclude the last node if it starts with '\', treating it as a closing node + } + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .replace("+", "") + .trim(); + charXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(charXmlNode); + + for (let i = 1; i < childrenRange; i++) { + this.node2Usx(node.children[i], charXmlNode); + } + + } + + node2UsxAttrib(node, parentXmlNode) { + // Add attribute values to USJ elements + const attribNameNode = node.children[0]; + let attribName = this.usfm + .slice(attribNameNode.startIndex, attribNameNode.endIndex) + .trim(); + + // Handling special cases for attribute names + if (attribName === "|") { + attribName = DEFAULT_ATTRIB_MAP[node.parent.type]; + } + if (attribName === "src") { + // for \fig + attribName = "file"; + } + + const attribValCap = this.usfmLanguage + .query("((attributeValue) @attrib-val)") + .captures(node); + + let attribValue = ""; + if (attribValCap.length > 0) { + attribValue = this.usfm + .substring( + attribValCap[0].node.startIndex, + attribValCap[0].node.endIndex, + ) + .trim(); + } + + parentXmlNode.setAttribute(attribName, attribValue); + } + + node2UsxTable(node, parentXmlNode) { + // Handle table related components and convert to USJ + if (node.type === "table") { + const tableXmlNode = parentXmlNode.ownerDocument.createElement('table'); + parentXmlNode.appendChild(tableXmlNode); + node.children.forEach((child) => { + this.node2Usx(child, tableXmlNode); + }); + } else if (node.type === "tr") { + const rowXmlNode = parentXmlNode.ownerDocument.createElement('row'); + rowXmlNode.setAttribute("style", "tr"); + parentXmlNode.appendChild(rowXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, rowXmlNode); + }); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + const tagNode = node.children[0]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const cellXmlNode = parentXmlNode.ownerDocument.createElement("cell"); + cellXmlNode.setAttribute("style", style); + cellXmlNode.setAttribute("align", style.includes("r") ? "end" : "start"); + parentXmlNode.appendChild(cellXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, cellXmlNode); + }); + } + } + + node2UsxMilestone(node, parentXmlNode) { + // Create ms node in USJ + + const msNameCap = this.usfmLanguage + .query( + `( + [(milestoneTag) + (milestoneStartTag) + (milestoneEndTag) + (zSpaceTag) + ] @ms-name)`, + ) + .captures(node)[0]; + + const style = this.usfm + .slice(msNameCap.node.startIndex, msNameCap.node.endIndex) + .replace("\\", "") + .trim(); + const msXmlNode = parentXmlNode.ownerDocument.createElement("ms"); + msXmlNode.setAttribute("style", style); + parentXmlNode.appendChild(msXmlNode); + node.children.forEach((child) => { + if (child.type.endsWith("Attribute")) { + this.node2Usx(child, msXmlNode); + } + }); + } + + node2UsxSpecial(node, parentXmlNode) { + // Build nodes for esb, cat, fig, optbreak in USJ + + if (node.type === "esb") { + const sidebarXmlNode = parentXmlNode.ownerDocument.createElement('sidebar'); + sidebarXmlNode.setAttribute('style', "esb"); + parentXmlNode.appendChild(sidebarXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, sidebarXmlNode); + }); + } else if (node.type === "cat") { + const catCap = this.usfmLanguage + .query("((category) @category)") + .captures(node)[0]; + const category = this.usfm + .substring(catCap.node.startIndex, catCap.node.endIndex) + .trim(); + parentXmlNode.setAttribute("category", category); + } else if (node.type === "fig") { + const figXmlNode = parentXmlNode.ownerDocument.createElement('figure'); + figXmlNode.setAttribute("style", "fig"); + parentXmlNode.appendChild(figXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, figXmlNode); + }); + } else if (node.type === "ref") { + const refXmlNode = parentXmlNode.ownerDocument.createElement('ref'); + parentXmlNode.appendChild(refXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, refXmlNode); + }); + } + } + + node2UsxGeneric(node, parentXmlNode) { + const tagNode = node.children[0]; + let style = this.usfm.slice(tagNode.startIndex, tagNode.endIndex).trim(); + + // Strip leading backslashes from the style or use node type + if (style.startsWith('\\')) { + style = style.replace('\\', ''); + // } else { + // style = node.type; + } + + if (style === "usfm") { + return + } + + let childrenRangeStart = 1; + + // Create a 'para' element and set its style attribute + const paraXmlNode = parentXmlNode.ownerDocument.createElement('para'); + paraXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(paraXmlNode); + + // Loop through the child nodes and recursively process them + for (let i = childrenRangeStart; i < node.children.length; i++) { + const child = node.children[i]; + if ( + CHAR_STYLE_MARKERS.includes(child.type) || + NESTED_CHAR_STYLE_MARKERS.includes(child.type) || + [ + "text", + "footnote", + "crossref", + "verseText", + "v", + "b", + "milestone", + "zNameSpace", + ].includes(child.type) + ) { + // If the child is of one of the allowed types, nest it inside the para node + this.node2Usx(child, paraXmlNode); + } else { + // Otherwise, append the child to the parent XML node + this.node2Usx(child, parentXmlNode); + } + } + + // Append the created para node to the parent XML node + } + + node2Usx(node, parentXmlNode) { + // Handling node types with respective functions + if (node.type === "id") { + this.node2UsxId(node, parentXmlNode); + } else if (node.type === "chapter") { + this.node2UsxChapter(node, parentXmlNode); + } else if (["cl", "cp", "cd", "vp"].includes(node.type)) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["ca", "va"].includes(node.type)) { + this.node2UsxCaVa(node, parentXmlNode); + } else if (node.type === "v") { + this.node2UsxVerse(node, parentXmlNode); + } else if (node.type === "verseText") { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } else if (["paragraph", "pi", "ph"].includes(node.type)) { + this.node2UsxPara(node, parentXmlNode); + } else if (NOTE_MARKERS.includes(node.type)) { + this.node2UsxNotes(node, parentXmlNode); + } else if ( + CHAR_STYLE_MARKERS.concat(NESTED_CHAR_STYLE_MARKERS, ["xt_standalone"]).includes(node.type) + ) { + this.node2UsxChar(node, parentXmlNode); + } else if (node.type.endsWith("Attribute")) { + this.node2UsxAttrib(node, parentXmlNode); + } else if (node.type === "text") { + let textVal = this.usfm.slice(node.startIndex, node.endIndex).trim(); + textVal = textVal.replace("~", " ") + const textNode = parentXmlNode.ownerDocument.createTextNode(textVal); + parentXmlNode.appendChild(textNode); + } else if (["table", "tr"].concat(TABLE_CELL_MARKERS).includes(node.type)) { + this.node2UsxTable(node, parentXmlNode); + } else if (node.type === "milestone" || node.type === "zNameSpace") { + this.node2UsxMilestone(node, parentXmlNode); + } else if (["esb", "cat", "fig", "ref"].includes(node.type)) { + this.node2UsxSpecial(node, parentXmlNode); + } else if ( + PARA_STYLE_MARKERS.includes(node.type) || + PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim()) + ) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["", "|"].includes(node.type.trim())) { + // Skip whitespace nodes + } else if (node.children.length > 0) { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } + // else { + // throw new Error(`Encountered unknown element: ${node}`); + // } + } +} + + +export default USXGenerator; From 533b7026b976085e6610e2125441da66460d96b1 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 19:45:55 +0530 Subject: [PATCH 27/32] Web: Add tests for toUSX() connersion as in Node --- web-usfm-parser/test/config.js | 23 ++++ web-usfm-parser/test/test_usx_conversion.js | 117 ++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 web-usfm-parser/test/test_usx_conversion.js diff --git a/web-usfm-parser/test/config.js b/web-usfm-parser/test/config.js index 869d8adf..cb375cbf 100644 --- a/web-usfm-parser/test/config.js +++ b/web-usfm-parser/test/config.js @@ -104,6 +104,28 @@ let excludeUSJs = [ `${TEST_DIR}/specExamples/character/origin.json`,// lit element treated as a body paragraph enclosing a verse! Issue from USX ] + +let excludeUSXs = [ + `${TEST_DIR}/specExamples/extended/contentCatogories2/origin.xml`, + // \ef not treated as inline content of paragraph + `${TEST_DIR}/specExamples/extended/sectionIntroductions/origin.xml`, + // verse number="+"!!! + `${TEST_DIR}/specExamples/character/origin.xml`, + // lit element treated as a body paragraph enclosing a verse! + `${TEST_DIR}/usfmjsTests/esb/origin.xml`, + // last verse text given outside of paragraph. + `${TEST_DIR}/special-cases/nbsp/origin.xml`, + // ~ not being replaced by nbsp in usfm-grammar + `${TEST_DIR}/special-cases/empty-attributes/origin.xml`, + // attributes treated as text content of marker + `${TEST_DIR}/biblica/CategoriesOnNotes/origin.xml`, + `${TEST_DIR}/biblica/CrossRefWithPipe/origin.xml`, + // ref node has type ref. Is it char or ref? + `${TEST_DIR}/usfmjsTests/usfmBodyTestD/origin.xml`, + // \v and other contents contained inside \lit. New docs doesnt have \lit + `${TEST_DIR}/usfmjsTests/usfm-body-testF/origin.xml`, + // does the ms go inside \s5 or after it? +] await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm"); @@ -204,5 +226,6 @@ export{ initialiseParser, isValidUsfm, excludeUSJs, + excludeUSXs, findAllMarkers }; diff --git a/web-usfm-parser/test/test_usx_conversion.js b/web-usfm-parser/test/test_usx_conversion.js new file mode 100644 index 00000000..f1815559 --- /dev/null +++ b/web-usfm-parser/test/test_usx_conversion.js @@ -0,0 +1,117 @@ + +import assert from 'assert'; +import fs from "node:fs"; +import { DOMImplementation, XMLSerializer, DOMParser } from 'xmldom'; +import {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSXs, findAllMarkers} from './config.js' +import {USFMParser, Filter} from '../src/index.js'; + +describe("Check successful USFM-USX conversion for positive samples", () => { + const domImpl = new DOMImplementation(); + const sampleDoc = domImpl.createDocument(null, 'usx', null); + allUsfmFiles.forEach(function(value) { + + if (isValidUsfm[value]) { + it(`Convert ${value} to USX`, async (inputUsfmPath=value) => { + //Tests if input parses without errors + const testParser = await initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + // assert(usx instanceof DOMImplementation.Document); + assert(usx.tagName === "usx"); + assert(usx.getAttribute("version") === "3.1"); + assert(usx.childNodes[0].tagName === "book"); + assert(usx.childNodes[0].getAttribute("style") === "id"); + }); + } + }); +}); + + + +describe("Ensure all markers are in USX", () => { + // Tests if all markers in USFM are present in output also + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Check for markers of ${value} in USX`, async (inputUsfmPath=value) => { + const testParser = await initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + + const inputMarkers = [... new Set(findAllMarkers(testParser.usfm, true))] + const allUSXNodes = getNodes(usx); + + assert.deepStrictEqual(inputMarkers, allUSXNodes, `Markers in input and generated USJ differ`) + }); + } + }); + +}); + + +// describe("Compare generated USX with testsuite sample", () => { + +// allUsfmFiles.forEach(function(value) { +// const usxPath = value.replace(".usfm", ".xml"); +// if (isValidUsfm[value] && ! excludeUSXs.includes(usxPath)) { +// it(`Compare generated USX to ${usxPath}`, async (inputUsfmPath=value) => { +// const testParser = await initialiseParser(inputUsfmPath) +// const generatedUSX = testParser.toUSX(); +// const filePath = usxPath; +// let fileData = null; +// try { +// fileData = fs.readFileSync(filePath, "utf8"); +// } catch(err) { +// if (err.code === "ENOENT") { +// return +// } +// } +// const testsuiteUSX = new DOMParser().parseFromString( +// fileData, 'text/xml').getElementsByTagName("usx")[0]; + +// assert.deepEqual(generatedUSX, testsuiteUSX); +// }); +// } +// }); +// }); + +function getNodes(element, keepNumber=true) { + // Recursive function to find all keys in the dict output + let types = []; + if (element.nodeType === element.TEXT_NODE) { + return types; // Return empty array if element is a string + } else { + if (element.getAttribute('style')) { + types.push(element.getAttribute('style')); + } + if (element.tagName === "ref") { + types.push("ref"); + } + if (element.getAttribute('altnumber')) { + if (element.tagName === 'chapter') { + types.push('ca'); + } else { + types.push('va'); + } + } + if (element.getAttribute('pubnumber')) { + if (element.tagName === 'chapter') { + types.push('cp'); + } else { + types.push('vp'); + } + } + if (element.getAttribute('category')) { + types.push('cat'); + } + if (element.childNodes.length > 0) { + Array.from(element.childNodes).forEach(child => { + types = types.concat(getNodes(child)); // Recursively get types from content + }); + } + } + let uniqueTypes = [...new Set(types)]; + if (! keepNumber) { + uniqueTypes = uniqueTypes.map(item => item.replace(/\d+$/, '')); + } + return uniqueTypes; +} From 83daaa1f6d21f2b0aa0756449f3880c5cbfa11ef Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Fri, 18 Oct 2024 19:51:14 +0530 Subject: [PATCH 28/32] Node: Fix issue is error handling --- node-usfm-parser/src/usfmParser.js | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/node-usfm-parser/src/usfmParser.js b/node-usfm-parser/src/usfmParser.js index 106e5c56..de72c2b0 100644 --- a/node-usfm-parser/src/usfmParser.js +++ b/node-usfm-parser/src/usfmParser.js @@ -189,9 +189,9 @@ Only one of USFM, USJ or USX is supported in one object.`) /* Uses the toJSON function and converts JSON to CSV To be re-implemented to work with the flat JSON schema */ - if (!ignoreErrors && this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - throw new Error(`Errors present:\n\t${errStr}\nUse ignoreErrors=true to generate output despite errors`); + if (!ignoreErrors && this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`); } try { @@ -203,9 +203,9 @@ Only one of USFM, USJ or USX is supported in one object.`) } catch (exe) { let message = "Unable to do the conversion. "; - if (this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - message += `Could be due to an error in the USFM\n\t${errStr}`; + if (this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errorString}`; } throw new Error(message, { cause: exe }); } @@ -215,9 +215,9 @@ Only one of USFM, USJ or USX is supported in one object.`) toUSX(ignoreErrors = false) { /* Convert the syntax_tree to the XML format (USX) */ - if (!ignoreErrors && this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - throw new Error(`Errors present:\n\t${errStr}\nUse ignoreErrors=true to generate output despite errors`); + if (!ignoreErrors && this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`); } let xmlContent = null; @@ -233,9 +233,9 @@ Only one of USFM, USJ or USX is supported in one object.`) xmlContent = usxGenerator.xmlRootNode; } catch (exe) { let message = "Unable to do the conversion. "; - if (this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - message += `Could be due to an error in the USFM\n\t${errStr}`; + if (this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errorString}`; } throw new Error(message, { cause: exe }); } From 9735ede1f3cdf8a0577d00e9517d69b1fbfbe17f Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Sat, 19 Oct 2024 13:17:07 +0530 Subject: [PATCH 29/32] Node: Implement USFM Generation from USX and fromUsx initialization of parser --- node-usfm-parser/src/usfmGenerator.js | 123 ++++++++++++++++++++++++++ node-usfm-parser/src/usfmParser.js | 30 ++++++- node-usfm-parser/src/utils/types.js | 14 +++ 3 files changed, 166 insertions(+), 1 deletion(-) diff --git a/node-usfm-parser/src/usfmGenerator.js b/node-usfm-parser/src/usfmGenerator.js index 3a866595..8c8ee550 100644 --- a/node-usfm-parser/src/usfmGenerator.js +++ b/node-usfm-parser/src/usfmGenerator.js @@ -1,4 +1,7 @@ const { NO_USFM_USJ_TYPES, CLOSING_USJ_TYPES, NON_ATTRIB_USJ_KEYS, NO_NEWLINE_USJ_TYPES } = require("./utils/types"); +const { NON_ATTRIB_USX_KEYS, NO_NEWLINE_USX_TYPES } = require("./utils/types"); +const { DOMParser } = require('xmldom'); + class USFMGenerator { constructor() { this.usfmString = ""; @@ -73,6 +76,126 @@ class USFMGenerator { } return this.usfmString; } + + usxToUsfm(xmlObj, nested=false) { + // Check if xmlObj is a string + // if (typeof xmlObj === 'string') { + // // this.usfmString += xmlObj; + // return; + // } + + const objType = xmlObj.tagName; + let marker = null; + let usfmAttributes = []; + + if (['verse', 'chapter'].includes(objType) && xmlObj.hasAttribute('eid')) { + return; + } + + if (!NO_NEWLINE_USX_TYPES.includes(objType)) { + this.usfmString += '\n'; + } + + if (objType === 'optbreak') { + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += '// '; + } + + if (xmlObj.hasAttribute('style')) { + marker = xmlObj.getAttribute('style'); + if (nested && objType === 'char' && !['xt', 'fv', 'ref'].includes(marker)) { + marker = `+${marker}`; + } + this.usfmString += `\\${marker} `; + } else if (objType === 'ref') { + marker = 'ref' + this.usfmString += `\\${marker} `; + } + + if (xmlObj.hasAttribute('code')) { + this.usfmString += xmlObj.getAttribute('code'); + } + + if (xmlObj.hasAttribute('number')) { + this.usfmString += `${xmlObj.getAttribute('number')} `; + } + + if (xmlObj.hasAttribute('caller')) { + this.usfmString += `${xmlObj.getAttribute('caller')} `; + } + + if (xmlObj.hasAttribute('altnumber')) { + if (objType === 'verse') { + this.usfmString += `\\va ${xmlObj.getAttribute('altnumber')}\\va*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\ca ${xmlObj.getAttribute('altnumber')}\\ca*`; + } + } + + if (xmlObj.hasAttribute('pubnumber')) { + if (objType === 'verse') { + this.usfmString += `\\vp ${xmlObj.getAttribute('pubnumber')}\\vp*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\cp ${xmlObj.getAttribute('pubnumber')}`; + } + } + + if (xmlObj.hasAttribute('category')) { + this.usfmString += `\n\\cat ${xmlObj.getAttribute('category')} \\cat*`; + } + + const children = Array.from(xmlObj.childNodes); + for (const child of children) { + if (child.nodeType === 1) { // Check if child is an element node + if (objType === 'char') { + this.usxToUsfm(child, true); + } else { + this.usxToUsfm(child, false); + } + } + if (child.nodeType === 3 && child.nodeValue.trim()) { // Check if child is a text node with content + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += child.nodeValue.trim(); + } + } + + const attributes = Array.from(xmlObj.attributes); + for (const attrNode of attributes) { + let key = attrNode.name; + let val = attrNode.value.replace(/"/g, ''); + if (key === 'file' && objType === 'figure') { + usfmAttributes.push(`src="${val}"`); + } else if (!NON_ATTRIB_USX_KEYS.includes(key)) { + usfmAttributes.push(`${key}="${val}"`); + } + if (['sid', 'eid'].includes(key) && objType === 'ms') { + usfmAttributes.push(`${key}="${val}"`); + } + } + + if (usfmAttributes.length > 0) { + this.usfmString += '|'; + this.usfmString += usfmAttributes.join(' '); + } + + if ((xmlObj.hasAttribute('closed') && xmlObj.getAttribute('closed') === 'true') + || CLOSING_USJ_TYPES.includes(objType) + || usfmAttributes.length > 0) { + if (objType === 'ms') { + this.usfmString += '\\*'; + } else { + this.usfmString += `\\${marker}*`; + } + } + + if (objType === 'sidebar') { + this.usfmString += '\n\\esbe\n'; + } + } } exports.USFMGenerator = USFMGenerator; diff --git a/node-usfm-parser/src/usfmParser.js b/node-usfm-parser/src/usfmParser.js index de72c2b0..9eccbf33 100644 --- a/node-usfm-parser/src/usfmParser.js +++ b/node-usfm-parser/src/usfmParser.js @@ -1,4 +1,5 @@ const Parser = require('tree-sitter'); +const assert = require('assert'); const {USFMGenerator} = require("./usfmGenerator"); const {USJGenerator} = require("./usjGenerator"); @@ -40,7 +41,7 @@ Only one of USFM, USJ or USX is supported in one object.`) this.usfm = this.convertUSJToUSFM() } else if (fromUsx !== null) { this.usx = fromUsx; - // this.usfm = this.convertUSXToUSFM() + this.usfm = this.convertUSXToUSFM() } this.parser = null; this.initializeParser(); @@ -137,6 +138,33 @@ Only one of USFM, USJ or USX is supported in one object.`) return outputUSFM; } + convertUSXToUSFM() { + try { + assert(1 <= this.usx.nodeType && this.usx.nodeType <= 12 , + 'Input must be an instance of xmldom Document or Element' + ); + if (this.usx.tagName !== "usx") { + assert(this.usx.getElementsByTagName('usx').length === 1, + 'Expects a node. Refer docs: https://docs.usfm.bible/usfm/3.1/syntax.html#_usx_usfm_xml'); + + this.usx = this.usx.getElementsByTagName('usx')[0] + } + // assert(this.usx.childNodes[0].tagName === 'book', " expected as first element in ") + + } catch(err) { + throw new Error("USX not in expected format. "+err.message) + } + try { + const usfmGen = new USFMGenerator() + usfmGen.usxToUsfm(this.usx); + // console.log(usfmGen.usfmString) + return usfmGen.usfmString; + } catch(err) { + let message = "Unable to do the conversion from USX to USFM. "; + throw new Error(message, { cause: err }); + } + } + convertUSFMToUSJ( excludeMarkers = null, includeMarkers = null, diff --git a/node-usfm-parser/src/utils/types.js b/node-usfm-parser/src/utils/types.js index 423abc7e..d6484a19 100644 --- a/node-usfm-parser/src/utils/types.js +++ b/node-usfm-parser/src/utils/types.js @@ -15,4 +15,18 @@ exports.NON_ATTRIB_USJ_KEYS = [ "pubnumber", "category", ]; + +exports.NON_ATTRIB_USX_KEYS = [ + "style", + "number", + "sid", + "code", + "caller", + "align", + "version", + "altnumber", + "pubnumber", + "category", +]; exports.NO_NEWLINE_USJ_TYPES = ["char", "note", "verse", "table:cell"]; +exports.NO_NEWLINE_USX_TYPES = ["char", "note", "verse", "cell"]; From 7f1d2cee049038aaa046b0b18b603897e986661c Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Sat, 19 Oct 2024 13:17:40 +0530 Subject: [PATCH 30/32] Node: Add tests for roundtripping USFM via USX --- node-usfm-parser/test/test_usx_conversion.js | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/node-usfm-parser/test/test_usx_conversion.js b/node-usfm-parser/test/test_usx_conversion.js index e85b9596..180e6b59 100644 --- a/node-usfm-parser/test/test_usx_conversion.js +++ b/node-usfm-parser/test/test_usx_conversion.js @@ -46,6 +46,30 @@ describe("Ensure all markers are in USX", () => { }); +describe("Test USFM-USX-USFM roundtripping", () => { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Roundtrip ${value} via USX`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + assert(usx.nodeType === 1); + + const testParser2 = new USFMParser(usfmString=null, fromUsj=null, fromUsx=usx); + const generatedUSFM = testParser2.usfm.trim(); + assert.strictEqual(typeof generatedUSFM, 'string'); + assert(generatedUSFM.startsWith("\\id")); + + const inputMarkers = findAllMarkers(testParser.usfm) + const finalMarkers = findAllMarkers(generatedUSFM) + assert.deepStrictEqual(inputMarkers, finalMarkers, `Markers in input and generated USFMs differ`) + + }); + } + }); + +}); + // describe("Compare generated USX with testsuite sample", () => { From 0afce5976b05c3c896c3be029ce62127fa8fcd00 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Sat, 19 Oct 2024 13:18:55 +0530 Subject: [PATCH 31/32] Web: Implement USFM Generation from USX and fromUsx initialization of parser --- web-usfm-parser/src/usfmGenerator.js | 125 +++++++++++++++++++++++++++ web-usfm-parser/src/usfmParser.js | 31 ++++++- web-usfm-parser/src/utils/types.js | 16 +++- 3 files changed, 170 insertions(+), 2 deletions(-) diff --git a/web-usfm-parser/src/usfmGenerator.js b/web-usfm-parser/src/usfmGenerator.js index b91750c3..0ceac2d2 100644 --- a/web-usfm-parser/src/usfmGenerator.js +++ b/web-usfm-parser/src/usfmGenerator.js @@ -1,4 +1,6 @@ import { NO_USFM_USJ_TYPES, CLOSING_USJ_TYPES, NON_ATTRIB_USJ_KEYS, NO_NEWLINE_USJ_TYPES } from "./utils/types.js"; +import { NON_ATTRIB_USX_KEYS, NO_NEWLINE_USX_TYPES } from "./utils/types.js"; + class USFMGenerator { constructor() { this.usfmString = ""; @@ -73,6 +75,129 @@ class USFMGenerator { } return this.usfmString; } + + usxToUsfm(xmlObj, nested=false) { + // Check if xmlObj is a string + // if (typeof xmlObj === 'string') { + // // this.usfmString += xmlObj; + // return; + // } + + const objType = xmlObj.tagName; + let marker = null; + let usfmAttributes = []; + + if (['verse', 'chapter'].includes(objType) && xmlObj.hasAttribute('eid')) { + return; + } + + if (!NO_NEWLINE_USX_TYPES.includes(objType)) { + this.usfmString += '\n'; + } + + if (objType === 'optbreak') { + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += '// '; + } + + if (xmlObj.hasAttribute('style')) { + marker = xmlObj.getAttribute('style'); + if (nested && objType === 'char' && !['xt', 'fv', 'ref'].includes(marker)) { + marker = `+${marker}`; + } + this.usfmString += `\\${marker} `; + } else if (objType === 'ref') { + marker = 'ref' + this.usfmString += `\\${marker} `; + } + + if (xmlObj.hasAttribute('code')) { + this.usfmString += xmlObj.getAttribute('code'); + } + + if (xmlObj.hasAttribute('number')) { + this.usfmString += `${xmlObj.getAttribute('number')} `; + } + + if (xmlObj.hasAttribute('caller')) { + this.usfmString += `${xmlObj.getAttribute('caller')} `; + } + + if (xmlObj.hasAttribute('altnumber')) { + if (objType === 'verse') { + this.usfmString += `\\va ${xmlObj.getAttribute('altnumber')}\\va*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\ca ${xmlObj.getAttribute('altnumber')}\\ca*`; + } + } + + if (xmlObj.hasAttribute('pubnumber')) { + if (objType === 'verse') { + this.usfmString += `\\vp ${xmlObj.getAttribute('pubnumber')}\\vp*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\cp ${xmlObj.getAttribute('pubnumber')}`; + } + } + + if (xmlObj.hasAttribute('category')) { + this.usfmString += `\n\\cat ${xmlObj.getAttribute('category')} \\cat*`; + } + + const children = Array.from(xmlObj.childNodes); + for (const child of children) { + if (child.nodeType === 1) { // Check if child is an element node + if (objType === 'char') { + this.usxToUsfm(child, true); + } else { + this.usxToUsfm(child, false); + } + } + if (child.nodeType === 3 && child.nodeValue.trim()) { // Check if child is a text node with content + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += child.nodeValue.trim(); + } + } + + const attributes = Array.from(xmlObj.attributes); + for (const attrNode of attributes) { + let key = attrNode.name; + let val = attrNode.value.replace(/"/g, ''); + if (key === 'file' && objType === 'figure') { + usfmAttributes.push(`src="${val}"`); + } else if (!NON_ATTRIB_USX_KEYS.includes(key)) { + usfmAttributes.push(`${key}="${val}"`); + } + if (['sid', 'eid'].includes(key) && objType === 'ms') { + usfmAttributes.push(`${key}="${val}"`); + } + } + + if (usfmAttributes.length > 0) { + this.usfmString += '|'; + this.usfmString += usfmAttributes.join(' '); + } + + if ((xmlObj.hasAttribute('closed') && xmlObj.getAttribute('closed') === 'true') + || CLOSING_USJ_TYPES.includes(objType) + || usfmAttributes.length > 0) { + if (objType === 'ms') { + this.usfmString += '\\*'; + } else { + this.usfmString += `\\${marker}*`; + } + } + + if (objType === 'sidebar') { + this.usfmString += '\n\\esbe\n'; + } + } + + + } export default USFMGenerator; diff --git a/web-usfm-parser/src/usfmParser.js b/web-usfm-parser/src/usfmParser.js index 60988e77..81c3ad60 100644 --- a/web-usfm-parser/src/usfmParser.js +++ b/web-usfm-parser/src/usfmParser.js @@ -1,3 +1,4 @@ +import assert from 'assert'; import Parser from './web-tree-sitter/tree-sitter.js'; import USFMGenerator from "./usfmGenerator.js"; @@ -50,7 +51,7 @@ Only one of USFM, USJ or USX is supported in one object.`) this.usfm = this.convertUSJToUSFM() } else if (fromUsx !== null) { this.usx = fromUsx; - // this.usfm = this.convertUSXToUSFM() + this.usfm = this.convertUSXToUSFM() } this.parser = null; this.initializeParser(); @@ -101,6 +102,34 @@ Only one of USFM, USJ or USX is supported in one object.`) return this.usfm; } + convertUSXToUSFM() { + try { + assert(1 <= this.usx.nodeType && this.usx.nodeType <= 12 , + 'Input must be an instance of xmldom Document or Element' + ); + if (this.usx.tagName !== "usx") { + assert(this.usx.getElementsByTagName('usx').length === 1, + 'Expects a node. Refer docs: https://docs.usfm.bible/usfm/3.1/syntax.html#_usx_usfm_xml'); + + this.usx = this.usx.getElementsByTagName('usx')[0] + } + // assert(this.usx.childNodes[0].tagName === 'book', " expected as first element in ") + + } catch(err) { + throw new Error("USX not in expected format. "+err.message) + } + try { + const usfmGen = new USFMGenerator() + usfmGen.usxToUsfm(this.usx); + // console.log(usfmGen.usfmString) + return usfmGen.usfmString; + } catch(err) { + let message = "Unable to do the conversion from USX to USFM. "; + throw new Error(message, { cause: err }); + } + } + + parseUSFM() { let tree = null; try { diff --git a/web-usfm-parser/src/utils/types.js b/web-usfm-parser/src/utils/types.js index b6e48b6c..a0aaee5e 100644 --- a/web-usfm-parser/src/utils/types.js +++ b/web-usfm-parser/src/utils/types.js @@ -15,4 +15,18 @@ export const NON_ATTRIB_USJ_KEYS = [ "pubnumber", "category", ]; -export const NO_NEWLINE_USJ_TYPES = ["char", "note", "verse", "table:cell"]; \ No newline at end of file + +export const NON_ATTRIB_USX_KEYS = [ + "style", + "number", + "sid", + "code", + "caller", + "align", + "version", + "altnumber", + "pubnumber", + "category", +]; +export const NO_NEWLINE_USJ_TYPES = ["char", "note", "verse", "table:cell"]; +export const NO_NEWLINE_USX_TYPES = ["char", "note", "verse", "cell"]; From 6689d2bcdc024984126a8b3e41b3df66bfa8fba5 Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Sat, 19 Oct 2024 13:19:24 +0530 Subject: [PATCH 32/32] Web: Add tests for roundtripping USFM via USX --- web-usfm-parser/test.js | 24 +++++++++++++++++---- web-usfm-parser/test/test_usx_conversion.js | 23 ++++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/web-usfm-parser/test.js b/web-usfm-parser/test.js index 13df0b41..7ab8d4dc 100644 --- a/web-usfm-parser/test.js +++ b/web-usfm-parser/test.js @@ -1,5 +1,6 @@ import {USFMParser} from './src/index.js'; import { readFile } from 'fs/promises'; +import { DOMParser } from 'xmldom'; (async () => { await USFMParser.init("tree-sitter-usfm.wasm", "tree-sitter.wasm"); @@ -12,15 +13,30 @@ import { readFile } from 'fs/promises'; // const output2 = usfmParser.usfm; // console.log({ output2 }); - const filePath = "../tests/usfmjsTests/missing_verses/origin.usfm"; + // const filePath = "../tests/usfmjsTests/missing_verses/origin.usfm"; + // const content = await readFile(filePath, 'utf-8'); // Specify encoding + // console.log(content); + + // await USFMParser.init("tree-sitter-usfm.wasm", "tree-sitter.wasm"); + // const usfmParser = new USFMParser(content); + // const output = usfmParser.toUSJ(null, null, true); + // console.log({output}) + + const filePath = "../tests/usfmjsTests/missing_verses/origin.xml"; const content = await readFile(filePath, 'utf-8'); // Specify encoding console.log(content); + console.log("*************************"); + + const doc = new DOMParser().parseFromString(content); + const usfmParser = new USFMParser(null, null, doc); + console.log(usfmParser.usfm) + console.log("*************************"); - await USFMParser.init("tree-sitter-usfm.wasm", "tree-sitter.wasm"); - const usfmParser = new USFMParser(content); const output = usfmParser.toUSJ(null, null, true); console.log({output}) - + console.log("*************************"); + + })(); diff --git a/web-usfm-parser/test/test_usx_conversion.js b/web-usfm-parser/test/test_usx_conversion.js index f1815559..d74a91e5 100644 --- a/web-usfm-parser/test/test_usx_conversion.js +++ b/web-usfm-parser/test/test_usx_conversion.js @@ -47,6 +47,29 @@ describe("Ensure all markers are in USX", () => { }); +describe("Test USFM-USX-USFM roundtripping", () => { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Roundtrip ${value} via USX`, async (inputUsfmPath=value) => { + const testParser = await initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + assert(usx.nodeType === 1); + + const testParser2 = new USFMParser(null, null, usx); + const generatedUSFM = testParser2.usfm.trim(); + assert.strictEqual(typeof generatedUSFM, 'string'); + assert(generatedUSFM.startsWith("\\id")); + + const inputMarkers = findAllMarkers(testParser.usfm) + const finalMarkers = findAllMarkers(generatedUSFM) + assert.deepStrictEqual(inputMarkers, finalMarkers, `Markers in input and generated USFMs differ`) + + }); + } + }); + +}); // describe("Compare generated USX with testsuite sample", () => {