diff --git a/node-usfm-parser/package.json b/node-usfm-parser/package.json index eb10cf79..1711652a 100644 --- a/node-usfm-parser/package.json +++ b/node-usfm-parser/package.json @@ -6,7 +6,7 @@ "module": "./dist/es/index.mjs", "scripts": { "build": "parcel build ./src/index.js", - "test": "mocha --timeout 40000" + "test": "mocha --timeout 40000 --parallel" }, "repository": { "type": "git", @@ -27,13 +27,14 @@ ], "dependencies": { "tree-sitter": "0.21.1", - "tree-sitter-usfm3": "file:../tree-sitter-usfm3" + "tree-sitter-usfm3": "file:../tree-sitter-usfm3", + "xmldom": "^0.6.0", + "xpath": "^0.0.34" }, "devDependencies": { "ajv": "^8.17.1", "glob": "^11.0.0", "mocha": "^10.7.3", - "parcel": "^2.12.0", - "xml2js": "^0.6.2" + "parcel": "^2.12.0" } } diff --git a/node-usfm-parser/src/usfmGenerator.js b/node-usfm-parser/src/usfmGenerator.js index 3a866595..8c8ee550 100644 --- a/node-usfm-parser/src/usfmGenerator.js +++ b/node-usfm-parser/src/usfmGenerator.js @@ -1,4 +1,7 @@ const { NO_USFM_USJ_TYPES, CLOSING_USJ_TYPES, NON_ATTRIB_USJ_KEYS, NO_NEWLINE_USJ_TYPES } = require("./utils/types"); +const { NON_ATTRIB_USX_KEYS, NO_NEWLINE_USX_TYPES } = require("./utils/types"); +const { DOMParser } = require('xmldom'); + class USFMGenerator { constructor() { this.usfmString = ""; @@ -73,6 +76,126 @@ class USFMGenerator { } return this.usfmString; } + + usxToUsfm(xmlObj, nested=false) { + // Check if xmlObj is a string + // if (typeof xmlObj === 'string') { + // // this.usfmString += xmlObj; + // return; + // } + + const objType = xmlObj.tagName; + let marker = null; + let usfmAttributes = []; + + if (['verse', 'chapter'].includes(objType) && xmlObj.hasAttribute('eid')) { + return; + } + + if (!NO_NEWLINE_USX_TYPES.includes(objType)) { + this.usfmString += '\n'; + } + + if (objType === 'optbreak') { + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += '// '; + } + + if 
(xmlObj.hasAttribute('style')) { + marker = xmlObj.getAttribute('style'); + if (nested && objType === 'char' && !['xt', 'fv', 'ref'].includes(marker)) { + marker = `+${marker}`; + } + this.usfmString += `\\${marker} `; + } else if (objType === 'ref') { + marker = 'ref' + this.usfmString += `\\${marker} `; + } + + if (xmlObj.hasAttribute('code')) { + this.usfmString += xmlObj.getAttribute('code'); + } + + if (xmlObj.hasAttribute('number')) { + this.usfmString += `${xmlObj.getAttribute('number')} `; + } + + if (xmlObj.hasAttribute('caller')) { + this.usfmString += `${xmlObj.getAttribute('caller')} `; + } + + if (xmlObj.hasAttribute('altnumber')) { + if (objType === 'verse') { + this.usfmString += `\\va ${xmlObj.getAttribute('altnumber')}\\va*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\ca ${xmlObj.getAttribute('altnumber')}\\ca*`; + } + } + + if (xmlObj.hasAttribute('pubnumber')) { + if (objType === 'verse') { + this.usfmString += `\\vp ${xmlObj.getAttribute('pubnumber')}\\vp*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\cp ${xmlObj.getAttribute('pubnumber')}`; + } + } + + if (xmlObj.hasAttribute('category')) { + this.usfmString += `\n\\cat ${xmlObj.getAttribute('category')} \\cat*`; + } + + const children = Array.from(xmlObj.childNodes); + for (const child of children) { + if (child.nodeType === 1) { // Check if child is an element node + if (objType === 'char') { + this.usxToUsfm(child, true); + } else { + this.usxToUsfm(child, false); + } + } + if (child.nodeType === 3 && child.nodeValue.trim()) { // Check if child is a text node with content + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += child.nodeValue.trim(); + } + } + + const attributes = Array.from(xmlObj.attributes); + for (const attrNode of attributes) { + let key = attrNode.name; + let val = attrNode.value.replace(/"/g, ''); + if (key === 'file' && objType === 
'figure') { + usfmAttributes.push(`src="${val}"`); + } else if (!NON_ATTRIB_USX_KEYS.includes(key)) { + usfmAttributes.push(`${key}="${val}"`); + } + if (['sid', 'eid'].includes(key) && objType === 'ms') { + usfmAttributes.push(`${key}="${val}"`); + } + } + + if (usfmAttributes.length > 0) { + this.usfmString += '|'; + this.usfmString += usfmAttributes.join(' '); + } + + if ((xmlObj.hasAttribute('closed') && xmlObj.getAttribute('closed') === 'true') + || CLOSING_USJ_TYPES.includes(objType) + || usfmAttributes.length > 0) { + if (objType === 'ms') { + this.usfmString += '\\*'; + } else { + this.usfmString += `\\${marker}*`; + } + } + + if (objType === 'sidebar') { + this.usfmString += '\n\\esbe\n'; + } + } } exports.USFMGenerator = USFMGenerator; diff --git a/node-usfm-parser/src/usfmParser.js b/node-usfm-parser/src/usfmParser.js index 94bf26c7..9eccbf33 100644 --- a/node-usfm-parser/src/usfmParser.js +++ b/node-usfm-parser/src/usfmParser.js @@ -1,8 +1,10 @@ const Parser = require('tree-sitter'); +const assert = require('assert'); const {USFMGenerator} = require("./usfmGenerator"); const {USJGenerator} = require("./usjGenerator"); const {ListGenerator} = require("./listGenerator"); +const {USXGenerator} = require("./usxGenerator") const { includeMarkersInUsj, excludeMarkersInUsj, Filter } = require("./filters.js"); const USFM3 = require('tree-sitter-usfm3'); const { Query } = Parser; @@ -39,7 +41,7 @@ Only one of USFM, USJ or USX is supported in one object.`) this.usfm = this.convertUSJToUSFM() } else if (fromUsx !== null) { this.usx = fromUsx; - // this.usfm = this.convertUSXToUSFM() + this.usfm = this.convertUSXToUSFM() } this.parser = null; this.initializeParser(); @@ -48,6 +50,7 @@ Only one of USFM, USJ or USX is supported in one object.`) this.errors = []; this.warnings = []; this.parseUSFM(); + } initializeParser() { this.parser = new Parser(); @@ -135,6 +138,33 @@ Only one of USFM, USJ or USX is supported in one object.`) return outputUSFM; } + 
convertUSXToUSFM() { + try { + assert(1 <= this.usx.nodeType && this.usx.nodeType <= 12 , + 'Input must be an instance of xmldom Document or Element' + ); + if (this.usx.tagName !== "usx") { + assert(this.usx.getElementsByTagName('usx').length === 1, + 'Expects a node. Refer docs: https://docs.usfm.bible/usfm/3.1/syntax.html#_usx_usfm_xml'); + + this.usx = this.usx.getElementsByTagName('usx')[0] + } + // assert(this.usx.childNodes[0].tagName === 'book', " expected as first element in ") + + } catch(err) { + throw new Error("USX not in expected format. "+err.message) + } + try { + const usfmGen = new USFMGenerator() + usfmGen.usxToUsfm(this.usx); + // console.log(usfmGen.usfmString) + return usfmGen.usfmString; + } catch(err) { + let message = "Unable to do the conversion from USX to USFM. "; + throw new Error(message, { cause: err }); + } + } + convertUSFMToUSJ( excludeMarkers = null, includeMarkers = null, @@ -187,9 +217,9 @@ Only one of USFM, USJ or USX is supported in one object.`) /* Uses the toJSON function and converts JSON to CSV To be re-implemented to work with the flat JSON schema */ - if (!ignoreErrors && this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - throw new Error(`Errors present:\n\t${errStr}\nUse ignoreErrors=true to generate output despite errors`); + if (!ignoreErrors && this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`); } try { @@ -201,15 +231,48 @@ Only one of USFM, USJ or USX is supported in one object.`) } catch (exe) { let message = "Unable to do the conversion. 
"; - if (this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - message += `Could be due to an error in the USFM\n\t${errStr}`; + if (this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errorString}`; + } + throw new Error(message, { cause: exe }); + } + + } + + toUSX(ignoreErrors = false) { + /* Convert the syntax_tree to the XML format (USX) */ + + if (!ignoreErrors && this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`); + } + let xmlContent = null; + + try { + // Initialize the USX generator (assuming the constructor is already implemented in JS) + const usxGenerator = new USXGenerator(USFM3, + this.usfm); + + // Process the syntax tree and convert to USX format + usxGenerator.node2Usx(this.syntaxTree, usxGenerator.xmlRootNode); + + // xmlContent = usxSerializer.serializeToString(usxGenerator.xmlRootNode); + xmlContent = usxGenerator.xmlRootNode; + } catch (exe) { + let message = "Unable to do the conversion. 
"; + if (this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errorString}`; } throw new Error(message, { cause: exe }); } + // Return the generated XML structure (in JSON format) + return xmlContent; } + } diff --git a/node-usfm-parser/src/usjGenerator.js b/node-usfm-parser/src/usjGenerator.js index 13531a3f..edf9f9bf 100644 --- a/node-usfm-parser/src/usjGenerator.js +++ b/node-usfm-parser/src/usjGenerator.js @@ -418,21 +418,21 @@ class USJGenerator { let style = this.usfm.substring(tagNode.startIndex, tagNode.endIndex); if (style.startsWith("\\")) { style = style.replace("\\", "").trim(); - } else { - style = node.type; + // } else { + // style = node.type; } // console.log(node.children.length, node.children[0].type, node.children[1].type) let childrenRangeStart = 1; - if ( - node.children.length > 1 && - node.children[1].type.startsWith("numbered") - ) { - const numNode = node.children[1]; - const num = this.usfm.substring(numNode.startIndex, numNode.endIndex); - style += num; - childrenRangeStart = 2; - } + // if ( + // node.children.length > 1 && + // node.children[1].type.startsWith("numbered") + // ) { + // const numNode = node.children[1]; + // const num = this.usfm.substring(numNode.startIndex, numNode.endIndex); + // style += num; + // childrenRangeStart = 2; + // } const paraJsonObj = { type: "para", marker: style, content: [] }; parentJsonObj.content.push(paraJsonObj); diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js new file mode 100644 index 00000000..37c61c1d --- /dev/null +++ b/node-usfm-parser/src/usxGenerator.js @@ -0,0 +1,573 @@ +//Logics for syntax-tree to xml(USX) conversions +const { DOMImplementation, XMLSerializer } = require('xmldom'); +const xpath = require('xpath'); +const Parser = require("tree-sitter"); +const {Query} = Parser; + +const { PARA_STYLE_MARKERS, NOTE_MARKERS, CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, 
DEFAULT_ATTRIB_MAP, TABLE_CELL_MARKERS, MISC_MARKERS } = require("./utils/markers"); + + +class USXGenerator { + /** + * A binding for all methods used in generating USX from Syntax tree + * @param {object} treeSitterLanguageObj - The Tree-sitter language object + * @param {Buffer} usfmString - The USFM byte data + * @param {Element} [usxRootElement] - The root element of the USX (optional) + */ + constructor(treeSitterLanguageObj, usfmString, usxRootElement = null) { + this.usfmLanguage = treeSitterLanguageObj; + this.usfm = usfmString; + + const domImpl = new DOMImplementation(); + const doc = domImpl.createDocument(null, 'usx', null); + + if (usxRootElement === null) { + this.xmlRootNode = doc.documentElement; + this.xmlRootNode.setAttribute('version', '3.1'); + } else { + this.xmlRootNode = usxRootElement; + } + } + + /** + * Builds the ID node in USX + * @param {SyntaxNode} node - The syntax node + * @param {Element} parentXmlNode - The parent XML node to append the ID to + */ + node2UsxId(node, parentXmlNode) { + const idCaptures = new Query(this.usfmLanguage, + "(id (bookcode) @book-code (description)? 
@desc)") + .captures(node); + + let code = null; + let desc = null; + + idCaptures.forEach(capture => { + if (capture.name === 'book-code') { + code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } else if (capture.name === 'desc') { + desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } + }); + + const bookXmlNode = parentXmlNode.ownerDocument.createElement('book'); + bookXmlNode.setAttribute('code', code); + bookXmlNode.setAttribute('style', 'id'); + + if (desc && desc.trim() !== '') { + const textNode = parentXmlNode.ownerDocument.createTextNode(desc.trim()); + bookXmlNode.appendChild(textNode); + } + + parentXmlNode.appendChild(bookXmlNode); + } + + node2UsxC(node, parentXmlNode) { + // Build c, the chapter milestone node in usj + const chapCap = new Query(this.usfmLanguage, + `(c (chapterNumber) @chap-num + (ca (chapterNumber) @alt-num)? + (cp (text) @pub-num)?)`, + ) + .captures(node); + const chapNum = this.usfm.slice( + chapCap[0].node.startIndex, + chapCap[0].node.endIndex, + ); + const bookNode = xpath.select1("book", parentXmlNode); + const bookCode = bookNode.getAttribute("code"); + const chapRef = `${bookCode} ${chapNum}`; + + // Create the 'chapter' element + const chapXmlNode = parentXmlNode.ownerDocument.createElement('chapter'); + chapXmlNode.setAttribute("number", chapNum); + chapXmlNode.setAttribute("style", "c"); + chapXmlNode.setAttribute("sid", chapRef); + + chapCap.forEach((cap) => { + if (cap.name === "alt-num") { + const altNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('altnumber', altNum); + } + if (cap.name === "pub-num") { + const pubNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('pubnumber', pubNum); + } + }); + + parentXmlNode.appendChild(chapXmlNode); + + node.children.forEach((child) => { + if (["cl", "cd"].includes(child.type)) { + this.node2Usx(child, parentXmlNode); + 
} + }); + } + + + + node2UsxChapter(node, parentXmlNode) { + // Build chapter node in USJ + node.children.forEach((child) => { + if (child.type === "c") { + this.node2UsxC(child, parentXmlNode); + } else { + this.node2Usx(child, parentXmlNode); + } + }); + + const prevVerses = xpath.select("//verse", this.xmlRootNode); + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + const vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + const sibblingCount = parentXmlNode.childNodes.length; + const lastSibbling = parentXmlNode.childNodes[sibblingCount-1]; + if (lastSibbling.tagName === "para") { + lastSibbling.appendChild(vEndXmlNode); + } else if (lastSibbling.tagName === "table") { + const rows = lastSibbling.getElementsByTagName('row'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + parentXmlNode.appendChild(vEndXmlNode); + } + } + + } + + findPrevUncle(parentXmlNode) { + // Get the grandparent node + const grandParent = parentXmlNode.parentNode; + let uncleIndex = grandParent.childNodes.length - 2; // Start from the previous sibling + + while (uncleIndex >= 0) { + const uncle = grandParent.childNodes[uncleIndex]; + + // Skip 'sidebar' and 'ms' elements + if (uncle.tagName === "sidebar" || uncle.tagName === "ms") { + uncleIndex--; + } + // Skip elements with 'ca' or 'cp' in the style attribute + else if (uncle.getAttribute('style') === 'ca' || uncle.getAttribute('style') === 'cp') { + uncleIndex--; + } + // Return the found uncle element + else { + return uncle; + } + } + return null; // No suitable uncle found + } + + node2UsxVerse(node, parentXmlNode) { + // Find all previous 'verse' elements + const prevVerses = xpath.select("//verse", this.xmlRootNode); + + // Check if there are previous verses and if the last one has a 'sid' attribute + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 
1].hasAttribute('sid')) { + let vEndXmlNode; + if (parentXmlNode.textContent.trim() !== "") { + // If there is verse text in the current parent + vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vEndXmlNode); + } else { + // If no text, find the previous uncle and attach the end verse + const prevUncle = this.findPrevUncle(parentXmlNode); + if (prevUncle.tagName === "para") { + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + prevUncle.appendChild(vEndXmlNode); + } else if (prevUncle.tagName === "table") { + const rows = prevUncle.getElementsByTagName('row'); + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + throw new Error(`prev_uncle is ${String(prevUncle)}`); + } + } + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + } + + // Query to capture verse-related elements + const verseNumCap = new Query(this.usfmLanguage, + ` + (v + (verseNumber) @vnum + (va (verseNumber) @alt)? + (vp (text) @vp)? 
+ )`, + ) + .captures(node); + + const verseNum = this.usfm.substring( + verseNumCap[0].node.startIndex, + verseNumCap[0].node.endIndex, + ); + const vXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vXmlNode); + + // Loop through the captured elements and set the attributes + verseNumCap.forEach(capture => { + if (capture.name === 'alt') { + const altNum = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + vXmlNode.setAttribute('altnumber', altNum); + } else if (capture.name === 'vp') { + const vpText = this.usfm.slice(capture.node.startIndex, capture.node.endIndex).trim(); + vXmlNode.setAttribute('pubnumber', vpText); + } + }); + + // Get the last chapter's 'sid' attribute to form the verse reference + const chapterSid = xpath.select("//chapter", this.xmlRootNode).pop().getAttribute('sid'); + const ref = `${chapterSid}:${verseNum}`; + + // Set attributes on the newly created 'verse' element + vXmlNode.setAttribute('number', verseNum.trim()); + vXmlNode.setAttribute('style', 'v'); + vXmlNode.setAttribute('sid', ref.trim()); + } + + node2UsxCaVa(node, parentXmlNode) { + // Build elements for independent ca and va away from c and v + const style = node.type; + + // Create a new 'char' element under the parent XML node + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + charXmlNode.setAttribute('style', style); + + // Query to capture chapterNumber or verseNumber + const altNumMatch = new Query(this.usfmLanguage, + `([ + (chapterNumber) + (verseNumber) + ] @alt-num)`, + ) + .captures(node); + + // Extract the alternate number from the captured range + const altNum = this.usfm + .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex) + .trim(); + + // Set the attributes on the 'char' element + charXmlNode.setAttribute('altnumber', altNum); + charXmlNode.setAttribute('closed', 'true'); + + // Append the 'char' element to the parent XML node + 
parentXmlNode.appendChild(charXmlNode); + } + + node2UsxPara(node, parentXmlNode) { + // Build paragraph nodes in USX + if (node.children[0].type.endsWith('Block')) { + for (const child of node.children[0].children) { + this.node2UsxPara(child, parentXmlNode); + } + } else if (node.type === 'paragraph') { + const paraTagCap = new Query(this.usfmLanguage, + "(paragraph (_) @para-marker)").captures(node)[0]; + const paraMarker = paraTagCap.node.type; + + if (!paraMarker.endsWith("Block")) { + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of paraTagCap.node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } else if (['pi', 'ph'].includes(node.type)) { + const paraMarker = this.usfm.slice(node.children[0].startIndex, node.children[0].endIndex) + .replace("\\", "") + .trim(); + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } + + + node2UsxNotes(node, parentXmlNode) { + // Build USJ nodes for footnotes and cross-references + const tagNode = node.children[0]; + const callerNode = node.children[1]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const noteXmlNode = parentXmlNode.ownerDocument.createElement('note'); + noteXmlNode.setAttribute('style', style); + const caller = this.usfm + .substring(callerNode.startIndex, callerNode.endIndex) + .trim(); + noteXmlNode.setAttribute('caller', caller); + parentXmlNode.appendChild(noteXmlNode); + for (let i = 2; i < node.children.length - 1; i++) { + this.node2Usx(node.children[i], noteXmlNode); + } + + } + + node2UsxChar(node, parentXmlNode) { + // Build USJ nodes for character markups, both regular 
and nested + const tagNode = node.children[0]; + let childrenRange = node.children.length; + if (node.children[node.children.length - 1].type.startsWith("\\")) { + childrenRange -= 1; // Exclude the last node if it starts with '\', treating it as a closing node + } + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .replace("+", "") + .trim(); + charXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(charXmlNode); + + for (let i = 1; i < childrenRange; i++) { + this.node2Usx(node.children[i], charXmlNode); + } + + } + + node2UsxAttrib(node, parentXmlNode) { + // Add attribute values to USJ elements + const attribNameNode = node.children[0]; + let attribName = this.usfm + .slice(attribNameNode.startIndex, attribNameNode.endIndex) + .trim(); + + // Handling special cases for attribute names + if (attribName === "|") { + attribName = DEFAULT_ATTRIB_MAP[node.parent.type]; + } + if (attribName === "src") { + // for \fig + attribName = "file"; + } + + const attribValCap = new Query(this.usfmLanguage, + "((attributeValue) @attrib-val)") + .captures(node); + + let attribValue = ""; + if (attribValCap.length > 0) { + attribValue = this.usfm + .substring( + attribValCap[0].node.startIndex, + attribValCap[0].node.endIndex, + ) + .trim(); + } + + parentXmlNode.setAttribute(attribName, attribValue); + } + + node2UsxTable(node, parentXmlNode) { + // Handle table related components and convert to USJ + if (node.type === "table") { + const tableXmlNode = parentXmlNode.ownerDocument.createElement('table'); + parentXmlNode.appendChild(tableXmlNode); + node.children.forEach((child) => { + this.node2Usx(child, tableXmlNode); + }); + } else if (node.type === "tr") { + const rowXmlNode = parentXmlNode.ownerDocument.createElement('row'); + rowXmlNode.setAttribute("style", "tr"); + parentXmlNode.appendChild(rowXmlNode); + 
node.children.slice(1).forEach((child) => { + this.node2Usx(child, rowXmlNode); + }); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + const tagNode = node.children[0]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const cellXmlNode = parentXmlNode.ownerDocument.createElement("cell"); + cellXmlNode.setAttribute("style", style); + cellXmlNode.setAttribute("align", style.includes("r") ? "end" : "start"); + parentXmlNode.appendChild(cellXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, cellXmlNode); + }); + } + } + + node2UsxMilestone(node, parentXmlNode) { + // Create ms node in USJ + + const msNameCap = new Query(this.usfmLanguage, + `( + [(milestoneTag) + (milestoneStartTag) + (milestoneEndTag) + (zSpaceTag) + ] @ms-name)`, + ) + .captures(node)[0]; + + const style = this.usfm + .slice(msNameCap.node.startIndex, msNameCap.node.endIndex) + .replace("\\", "") + .trim(); + const msXmlNode = parentXmlNode.ownerDocument.createElement("ms"); + msXmlNode.setAttribute("style", style); + parentXmlNode.appendChild(msXmlNode); + node.children.forEach((child) => { + if (child.type.endsWith("Attribute")) { + this.node2Usx(child, msXmlNode); + } + }); + } + + node2UsxSpecial(node, parentXmlNode) { + // Build nodes for esb, cat, fig, optbreak in USJ + + if (node.type === "esb") { + const sidebarXmlNode = parentXmlNode.ownerDocument.createElement('sidebar'); + sidebarXmlNode.setAttribute('style', "esb"); + parentXmlNode.appendChild(sidebarXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, sidebarXmlNode); + }); + } else if (node.type === "cat") { + const catCap = new Query(this.usfmLanguage, + "((category) @category)") + .captures(node)[0]; + const category = this.usfm + .substring(catCap.node.startIndex, catCap.node.endIndex) + .trim(); + parentXmlNode.setAttribute("category", category); + } else if (node.type === "fig") { + const figXmlNode = 
parentXmlNode.ownerDocument.createElement('figure'); + figXmlNode.setAttribute("style", "fig"); + parentXmlNode.appendChild(figXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, figXmlNode); + }); + } else if (node.type === "ref") { + const refXmlNode = parentXmlNode.ownerDocument.createElement('ref'); + parentXmlNode.appendChild(refXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, refXmlNode); + }); + } + } + + node2UsxGeneric(node, parentXmlNode) { + const tagNode = node.children[0]; + let style = this.usfm.slice(tagNode.startIndex, tagNode.endIndex).trim(); + + // Strip leading backslashes from the style or use node type + if (style.startsWith('\\')) { + style = style.replace('\\', ''); + // } else { + // style = node.type; + } + + if (style === "usfm") { + return + } + + let childrenRangeStart = 1; + + // Create a 'para' element and set its style attribute + const paraXmlNode = parentXmlNode.ownerDocument.createElement('para'); + paraXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(paraXmlNode); + + // Loop through the child nodes and recursively process them + for (let i = childrenRangeStart; i < node.children.length; i++) { + const child = node.children[i]; + if ( + CHAR_STYLE_MARKERS.includes(child.type) || + NESTED_CHAR_STYLE_MARKERS.includes(child.type) || + [ + "text", + "footnote", + "crossref", + "verseText", + "v", + "b", + "milestone", + "zNameSpace", + ].includes(child.type) + ) { + // If the child is of one of the allowed types, nest it inside the para node + this.node2Usx(child, paraXmlNode); + } else { + // Otherwise, append the child to the parent XML node + this.node2Usx(child, parentXmlNode); + } + } + + // Append the created para node to the parent XML node + } + + node2Usx(node, parentXmlNode) { + // Handling node types with respective functions + if (node.type === "id") { + this.node2UsxId(node, parentXmlNode); + } else if (node.type === "chapter") { + 
this.node2UsxChapter(node, parentXmlNode); + } else if (["cl", "cp", "cd", "vp"].includes(node.type)) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["ca", "va"].includes(node.type)) { + this.node2UsxCaVa(node, parentXmlNode); + } else if (node.type === "v") { + this.node2UsxVerse(node, parentXmlNode); + } else if (node.type === "verseText") { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } else if (["paragraph", "pi", "ph"].includes(node.type)) { + this.node2UsxPara(node, parentXmlNode); + } else if (NOTE_MARKERS.includes(node.type)) { + this.node2UsxNotes(node, parentXmlNode); + } else if ( + CHAR_STYLE_MARKERS.concat(NESTED_CHAR_STYLE_MARKERS, ["xt_standalone"]).includes(node.type) + ) { + this.node2UsxChar(node, parentXmlNode); + } else if (node.type.endsWith("Attribute")) { + this.node2UsxAttrib(node, parentXmlNode); + } else if (node.type === "text") { + let textVal = this.usfm.slice(node.startIndex, node.endIndex).trim(); + textVal = textVal.replace("~", " ") + const textNode = parentXmlNode.ownerDocument.createTextNode(textVal); + parentXmlNode.appendChild(textNode); + } else if (["table", "tr"].concat(TABLE_CELL_MARKERS).includes(node.type)) { + this.node2UsxTable(node, parentXmlNode); + } else if (node.type === "milestone" || node.type === "zNameSpace") { + this.node2UsxMilestone(node, parentXmlNode); + } else if (["esb", "cat", "fig", "ref"].includes(node.type)) { + this.node2UsxSpecial(node, parentXmlNode); + } else if ( + PARA_STYLE_MARKERS.includes(node.type) || + PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim()) + ) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["", "|"].includes(node.type.trim())) { + // Skip whitespace nodes + } else if (node.children.length > 0) { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } + // else { + // throw new Error(`Encountered unknown element: ${node}`); + // } + } +} + + +exports.USXGenerator = 
USXGenerator; diff --git a/node-usfm-parser/src/utils/types.js b/node-usfm-parser/src/utils/types.js index 423abc7e..d6484a19 100644 --- a/node-usfm-parser/src/utils/types.js +++ b/node-usfm-parser/src/utils/types.js @@ -15,4 +15,18 @@ exports.NON_ATTRIB_USJ_KEYS = [ "pubnumber", "category", ]; + +exports.NON_ATTRIB_USX_KEYS = [ + "style", + "number", + "sid", + "code", + "caller", + "align", + "version", + "altnumber", + "pubnumber", + "category", +]; exports.NO_NEWLINE_USJ_TYPES = ["char", "note", "verse", "table:cell"]; +exports.NO_NEWLINE_USX_TYPES = ["char", "note", "verse", "cell"]; diff --git a/node-usfm-parser/test/config.js b/node-usfm-parser/test/config.js index c2dd098d..815f3f3d 100644 --- a/node-usfm-parser/test/config.js +++ b/node-usfm-parser/test/config.js @@ -1,6 +1,6 @@ const {glob} = require('glob'); const fs = require('node:fs'); -const xml2js = require('xml2js'); +const { DOMParser } = require('xmldom') const {USFMParser} = require("../src/index"); let allUsfmFiles = []; @@ -105,6 +105,28 @@ let excludeUSJs = [ ] +let excludeUSXs = [ + `${TEST_DIR}/specExamples/extended/contentCatogories2/origin.xml`, + // \ef not treated as inline content of paragraph + `${TEST_DIR}/specExamples/extended/sectionIntroductions/origin.xml`, + // verse number="+"!!! + `${TEST_DIR}/specExamples/character/origin.xml`, + // lit element treated as a body paragraph enclosing a verse! + `${TEST_DIR}/usfmjsTests/esb/origin.xml`, + // last verse text given outside of paragraph. + `${TEST_DIR}/special-cases/nbsp/origin.xml`, + // ~ not being replaced by nbsp in usfm-grammar + `${TEST_DIR}/special-cases/empty-attributes/origin.xml`, + // attributes treated as text content of marker + `${TEST_DIR}/biblica/CategoriesOnNotes/origin.xml`, + `${TEST_DIR}/biblica/CrossRefWithPipe/origin.xml`, + // ref node has type ref. Is it char or ref? + `${TEST_DIR}/usfmjsTests/usfmBodyTestD/origin.xml`, + // \v and other contents contained inside \lit. 
New docs doesnt have \lit + `${TEST_DIR}/usfmjsTests/usfm-body-testF/origin.xml`, + // does the ms go inside \s5 or after it? +] + const initialiseParser = function (inputUsfmPath){ `Open and parse the given file` try { @@ -132,13 +154,9 @@ const checkValidUsfm = function (inputUsfmPath) { let metaFilePath = inputUsfmPath.replace("origin.usfm", "metadata.xml") let metadata = fs.readFileSync(metaFilePath, 'utf8') - xml2js.parseString(metadata, (err, result) => { - if (err) { - console.error('Error parsing XML:', err); - return; - } - value = result['test-metadata']['validated'][0]; - }); + const doc = new DOMParser().parseFromString(metadata, 'text/xml'); + + value = doc.getElementsByTagName("validated")[0].textContent; if (value === "fail"){ return false @@ -202,5 +220,6 @@ module.exports = { initialiseParser: initialiseParser, isValidUsfm: isValidUsfm, excludeUSJs: excludeUSJs, + excludeUSXs: excludeUSXs, findAllMarkers: findAllMarkers }; diff --git a/node-usfm-parser/test/test_usx_conversion.js b/node-usfm-parser/test/test_usx_conversion.js new file mode 100644 index 00000000..180e6b59 --- /dev/null +++ b/node-usfm-parser/test/test_usx_conversion.js @@ -0,0 +1,140 @@ +const assert = require('assert'); +const fs = require('node:fs'); +const { DOMImplementation, XMLSerializer, DOMParser } = require('xmldom'); +const {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSXs, findAllMarkers} = require('./config'); +const {USFMParser, Filter} = require("../src/index"); + +describe("Check successful USFM-USX conversion for positive samples", () => { + const domImpl = new DOMImplementation(); + const sampleDoc = domImpl.createDocument(null, 'usx', null); + allUsfmFiles.forEach(function(value) { + + if (isValidUsfm[value]) { + it(`Convert ${value} to USX`, (inputUsfmPath=value) => { + //Tests if input parses without errors + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + // assert(usx 
instanceof DOMImplementation.Document); + assert(usx.tagName === "usx"); + assert(usx.getAttribute("version") === "3.1"); + assert(usx.childNodes[0].tagName === "book"); + assert(usx.childNodes[0].getAttribute("style") === "id"); + }); + } + }); +}); + + + +describe("Ensure all markers are in USX", () => { + // Tests if all markers in USFM are present in output also + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Check for markers of ${value} in USX`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + + const inputMarkers = [... new Set(findAllMarkers(testParser.usfm, keepId=true))] + const allUSXNodes = getNodes(usx); + + assert.deepStrictEqual(inputMarkers, allUSXNodes, `Markers in input and generated USJ differ`) + }); + } + }); + +}); + +describe("Test USFM-USX-USFM roundtripping", () => { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Roundtrip ${value} via USX`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + assert(usx.nodeType === 1); + + const testParser2 = new USFMParser(usfmString=null, fromUsj=null, fromUsx=usx); + const generatedUSFM = testParser2.usfm.trim(); + assert.strictEqual(typeof generatedUSFM, 'string'); + assert(generatedUSFM.startsWith("\\id")); + + const inputMarkers = findAllMarkers(testParser.usfm) + const finalMarkers = findAllMarkers(generatedUSFM) + assert.deepStrictEqual(inputMarkers, finalMarkers, `Markers in input and generated USFMs differ`) + + }); + } + }); + +}); + + +// describe("Compare generated USX with testsuite sample", () => { + +// allUsfmFiles.forEach(function(value) { +// const usxPath = value.replace(".usfm", ".xml"); +// if (isValidUsfm[value] && ! 
/**
 * Collect the USFM marker names represented in a USX DOM subtree.
 *
 * Walks the tree in document order and records, for every element:
 *   - its `style` attribute (when non-empty),
 *   - "ref" for <ref> elements,
 *   - "ca"/"va" when `altnumber` is set (chapter vs. anything else),
 *   - "cp"/"vp" when `pubnumber` is set (chapter vs. anything else),
 *   - "cat" when `category` is set.
 *
 * @param {object} element - DOM node to start from (element, or any node
 *     exposing `childNodes`).
 * @param {boolean} [keepNumber=true] - When false, trailing digits are
 *     stripped from every marker (e.g. "s1" -> "s").
 * @returns {string[]} Unique marker names in order of first appearance.
 */
function getNodes(element, keepNumber=true) {
    const ELEMENT_NODE = 1;
    const collected = [];

    const visit = (node) => {
        // Only elements carry attributes; text, comment and other node types
        // have no getAttribute() and must not be queried.
        if (node.nodeType === ELEMENT_NODE) {
            const isChapter = node.tagName === 'chapter';
            const style = node.getAttribute('style');
            if (style) {
                collected.push(style);
            }
            if (node.tagName === "ref") {
                collected.push("ref");
            }
            if (node.getAttribute('altnumber')) {
                collected.push(isChapter ? 'ca' : 'va');
            }
            if (node.getAttribute('pubnumber')) {
                collected.push(isChapter ? 'cp' : 'vp');
            }
            if (node.getAttribute('category')) {
                collected.push('cat');
            }
        }
        // childNodes may be absent on leaf nodes, hence the fallback.
        for (const child of Array.from(node.childNodes ?? [])) {
            visit(child);
        }
    };
    visit(element);

    // Strip numbers BEFORE de-duplicating, otherwise "s1" and "s2" would
    // both survive as "s".
    const markers = keepNumber
        ? collected
        : collected.map((item) => item.replace(/\d+$/, ''));
    return [...new Set(markers)];
}
xmlObj.getAttribute('code'); + } + + if (xmlObj.hasAttribute('number')) { + this.usfmString += `${xmlObj.getAttribute('number')} `; + } + + if (xmlObj.hasAttribute('caller')) { + this.usfmString += `${xmlObj.getAttribute('caller')} `; + } + + if (xmlObj.hasAttribute('altnumber')) { + if (objType === 'verse') { + this.usfmString += `\\va ${xmlObj.getAttribute('altnumber')}\\va*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\ca ${xmlObj.getAttribute('altnumber')}\\ca*`; + } + } + + if (xmlObj.hasAttribute('pubnumber')) { + if (objType === 'verse') { + this.usfmString += `\\vp ${xmlObj.getAttribute('pubnumber')}\\vp*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\cp ${xmlObj.getAttribute('pubnumber')}`; + } + } + + if (xmlObj.hasAttribute('category')) { + this.usfmString += `\n\\cat ${xmlObj.getAttribute('category')} \\cat*`; + } + + const children = Array.from(xmlObj.childNodes); + for (const child of children) { + if (child.nodeType === 1) { // Check if child is an element node + if (objType === 'char') { + this.usxToUsfm(child, true); + } else { + this.usxToUsfm(child, false); + } + } + if (child.nodeType === 3 && child.nodeValue.trim()) { // Check if child is a text node with content + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += child.nodeValue.trim(); + } + } + + const attributes = Array.from(xmlObj.attributes); + for (const attrNode of attributes) { + let key = attrNode.name; + let val = attrNode.value.replace(/"/g, ''); + if (key === 'file' && objType === 'figure') { + usfmAttributes.push(`src="${val}"`); + } else if (!NON_ATTRIB_USX_KEYS.includes(key)) { + usfmAttributes.push(`${key}="${val}"`); + } + if (['sid', 'eid'].includes(key) && objType === 'ms') { + usfmAttributes.push(`${key}="${val}"`); + } + } + + if (usfmAttributes.length > 0) { + this.usfmString += '|'; + this.usfmString += usfmAttributes.join(' '); + } + + if 
((xmlObj.hasAttribute('closed') && xmlObj.getAttribute('closed') === 'true') + || CLOSING_USJ_TYPES.includes(objType) + || usfmAttributes.length > 0) { + if (objType === 'ms') { + this.usfmString += '\\*'; + } else { + this.usfmString += `\\${marker}*`; + } + } + + if (objType === 'sidebar') { + this.usfmString += '\n\\esbe\n'; + } + } + + + } export default USFMGenerator; diff --git a/web-usfm-parser/src/usfmParser.js b/web-usfm-parser/src/usfmParser.js index 9e8ffd51..81c3ad60 100644 --- a/web-usfm-parser/src/usfmParser.js +++ b/web-usfm-parser/src/usfmParser.js @@ -1,8 +1,10 @@ +import assert from 'assert'; import Parser from './web-tree-sitter/tree-sitter.js'; import USFMGenerator from "./usfmGenerator.js"; import USJGenerator from "./usjGenerator.js"; import ListGenerator from "./listGenerator.js" +import USXGenerator from "./usxGenerator.js"; import { Filter } from "./filters.js"; @@ -49,7 +51,7 @@ Only one of USFM, USJ or USX is supported in one object.`) this.usfm = this.convertUSJToUSFM() } else if (fromUsx !== null) { this.usx = fromUsx; - // this.usfm = this.convertUSXToUSFM() + this.usfm = this.convertUSXToUSFM() } this.parser = null; this.initializeParser(); @@ -100,6 +102,34 @@ Only one of USFM, USJ or USX is supported in one object.`) return this.usfm; } + convertUSXToUSFM() { + try { + assert(1 <= this.usx.nodeType && this.usx.nodeType <= 12 , + 'Input must be an instance of xmldom Document or Element' + ); + if (this.usx.tagName !== "usx") { + assert(this.usx.getElementsByTagName('usx').length === 1, + 'Expects a node. Refer docs: https://docs.usfm.bible/usfm/3.1/syntax.html#_usx_usfm_xml'); + + this.usx = this.usx.getElementsByTagName('usx')[0] + } + // assert(this.usx.childNodes[0].tagName === 'book', " expected as first element in ") + + } catch(err) { + throw new Error("USX not in expected format. 
"+err.message) + } + try { + const usfmGen = new USFMGenerator() + usfmGen.usxToUsfm(this.usx); + // console.log(usfmGen.usfmString) + return usfmGen.usfmString; + } catch(err) { + let message = "Unable to do the conversion from USX to USFM. "; + throw new Error(message, { cause: err }); + } + } + + parseUSFM() { let tree = null; try { @@ -202,9 +232,9 @@ Only one of USFM, USJ or USX is supported in one object.`) /* Uses the toJSON function and converts JSON to CSV To be re-implemented to work with the flat JSON schema */ - if (!ignoreErrors && this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - throw new Error(`Errors present:\n\t${errStr}\nUse ignoreErrors=true to generate output despite errors`); + if (!ignoreErrors && this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`); } try { @@ -216,13 +246,45 @@ Only one of USFM, USJ or USX is supported in one object.`) } catch (exe) { let message = "Unable to do the conversion. 
"; - if (this.errors && this.errors.length > 0) { - const errStr = this.errors.map(err => err.join(":")).join("\n\t"); - message += `Could be due to an error in the USFM\n\t${errStr}`; + if (this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errorString}`; + } + throw new Error(message, { cause: exe }); + } + + } + + toUSX(ignoreErrors = false) { + /* Convert the syntax_tree to the XML format (USX) */ + + if (!ignoreErrors && this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`); + } + let xmlContent = null; + + try { + // Initialize the USX generator (assuming the constructor is already implemented in JS) + const usxGenerator = new USXGenerator(USFMParser.language, + this.usfm); + + // Process the syntax tree and convert to USX format + usxGenerator.node2Usx(this.syntaxTree, usxGenerator.xmlRootNode); + + // xmlContent = usxSerializer.serializeToString(usxGenerator.xmlRootNode); + xmlContent = usxGenerator.xmlRootNode; + } catch (exe) { + let message = "Unable to do the conversion. 
"; + if (this.errors.length > 0) { + let errorString = this.errors.join("\n\t"); + message += `Could be due to an error in the USFM\n\t${errorString}`; } throw new Error(message, { cause: exe }); } + // Return the generated XML structure (in JSON format) + return xmlContent; } } diff --git a/web-usfm-parser/src/usxGenerator.js b/web-usfm-parser/src/usxGenerator.js new file mode 100644 index 00000000..00ade8a1 --- /dev/null +++ b/web-usfm-parser/src/usxGenerator.js @@ -0,0 +1,576 @@ +//Logics for syntax-tree to xml(USX) conversions +import { DOMImplementation, XMLSerializer } from 'xmldom'; +import xpath from 'xpath'; + +import { PARA_STYLE_MARKERS, NOTE_MARKERS, CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, DEFAULT_ATTRIB_MAP, TABLE_CELL_MARKERS, MISC_MARKERS } from "./utils/markers.js"; + + +class USXGenerator { + /** + * A binding for all methods used in generating USX from Syntax tree + * @param {object} treeSitterLanguageObj - The Tree-sitter language object + * @param {Buffer} usfmString - The USFM byte data + * @param {Element} [usxRootElement] - The root element of the USX (optional) + */ + constructor(treeSitterLanguageObj, usfmString, usxRootElement = null) { + this.usfmLanguage = treeSitterLanguageObj; + this.usfm = usfmString; + + const domImpl = new DOMImplementation(); + const doc = domImpl.createDocument(null, 'usx', null); + + if (usxRootElement === null) { + this.xmlRootNode = doc.documentElement; + this.xmlRootNode.setAttribute('version', '3.1'); + } else { + this.xmlRootNode = usxRootElement; + } + } + + /** + * Builds the ID node in USX + * @param {SyntaxNode} node - The syntax node + * @param {Element} parentXmlNode - The parent XML node to append the ID to + */ + node2UsxId(node, parentXmlNode) { + const idCaptures = this.usfmLanguage + .query("(id (bookcode) @book-code (description)? 
@desc)") + .captures(node); + + let code = null; + let desc = null; + + idCaptures.forEach(capture => { + if (capture.name === 'book-code') { + code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } else if (capture.name === 'desc') { + desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } + }); + + const bookXmlNode = parentXmlNode.ownerDocument.createElement('book'); + bookXmlNode.setAttribute('code', code); + bookXmlNode.setAttribute('style', 'id'); + + if (desc && desc.trim() !== '') { + const textNode = parentXmlNode.ownerDocument.createTextNode(desc.trim()); + bookXmlNode.appendChild(textNode); + } + + parentXmlNode.appendChild(bookXmlNode); + } + + node2UsxC(node, parentXmlNode) { + // Build c, the chapter milestone node in usj + const chapCap = this.usfmLanguage + .query( + `(c (chapterNumber) @chap-num + (ca (chapterNumber) @alt-num)? + (cp (text) @pub-num)?)`, + ) + .captures(node); + const chapNum = this.usfm.slice( + chapCap[0].node.startIndex, + chapCap[0].node.endIndex, + ); + const bookNode = xpath.select1("book", parentXmlNode); + const bookCode = bookNode.getAttribute("code"); + const chapRef = `${bookCode} ${chapNum}`; + + // Create the 'chapter' element + const chapXmlNode = parentXmlNode.ownerDocument.createElement('chapter'); + chapXmlNode.setAttribute("number", chapNum); + chapXmlNode.setAttribute("style", "c"); + chapXmlNode.setAttribute("sid", chapRef); + + chapCap.forEach((cap) => { + if (cap.name === "alt-num") { + const altNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('altnumber', altNum); + } + if (cap.name === "pub-num") { + const pubNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('pubnumber', pubNum); + } + }); + + parentXmlNode.appendChild(chapXmlNode); + + node.children.forEach((child) => { + if (["cl", "cd"].includes(child.type)) { + this.node2Usx(child, parentXmlNode); + 
} + }); + } + + + + node2UsxChapter(node, parentXmlNode) { + // Build chapter node in USJ + node.children.forEach((child) => { + if (child.type === "c") { + this.node2UsxC(child, parentXmlNode); + } else { + this.node2Usx(child, parentXmlNode); + } + }); + + const prevVerses = xpath.select("//verse", this.xmlRootNode); + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + const vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + const sibblingCount = parentXmlNode.childNodes.length; + const lastSibbling = parentXmlNode.childNodes[sibblingCount-1]; + if (lastSibbling.tagName === "para") { + lastSibbling.appendChild(vEndXmlNode); + } else if (lastSibbling.tagName === "table") { + const rows = lastSibbling.getElementsByTagName('row'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + parentXmlNode.appendChild(vEndXmlNode); + } + } + + } + + findPrevUncle(parentXmlNode) { + // Get the grandparent node + const grandParent = parentXmlNode.parentNode; + let uncleIndex = grandParent.childNodes.length - 2; // Start from the previous sibling + + while (uncleIndex >= 0) { + const uncle = grandParent.childNodes[uncleIndex]; + + // Skip 'sidebar' and 'ms' elements + if (uncle.tagName === "sidebar" || uncle.tagName === "ms") { + uncleIndex--; + } + // Skip elements with 'ca' or 'cp' in the style attribute + else if (uncle.getAttribute('style') === 'ca' || uncle.getAttribute('style') === 'cp') { + uncleIndex--; + } + // Return the found uncle element + else { + return uncle; + } + } + return null; // No suitable uncle found + } + + node2UsxVerse(node, parentXmlNode) { + // Find all previous 'verse' elements + const prevVerses = xpath.select("//verse", this.xmlRootNode); + + // Check if there are previous verses and if the last one has a 'sid' attribute + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 
1].hasAttribute('sid')) { + let vEndXmlNode; + if (parentXmlNode.textContent.trim() !== "") { + // If there is verse text in the current parent + vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vEndXmlNode); + } else { + // If no text, find the previous uncle and attach the end verse + const prevUncle = this.findPrevUncle(parentXmlNode); + if (prevUncle.tagName === "para") { + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + prevUncle.appendChild(vEndXmlNode); + } else if (prevUncle.tagName === "table") { + const rows = prevUncle.getElementsByTagName('row'); + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + throw new Error(`prev_uncle is ${String(prevUncle)}`); + } + } + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + } + + // Query to capture verse-related elements + const verseNumCap = this.usfmLanguage + .query( + ` + (v + (verseNumber) @vnum + (va (verseNumber) @alt)? + (vp (text) @vp)? 
+ )`, + ) + .captures(node); + + const verseNum = this.usfm.substring( + verseNumCap[0].node.startIndex, + verseNumCap[0].node.endIndex, + ); + const vXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vXmlNode); + + // Loop through the captured elements and set the attributes + verseNumCap.forEach(capture => { + if (capture.name === 'alt') { + const altNum = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + vXmlNode.setAttribute('altnumber', altNum); + } else if (capture.name === 'vp') { + const vpText = this.usfm.slice(capture.node.startIndex, capture.node.endIndex).trim(); + vXmlNode.setAttribute('pubnumber', vpText); + } + }); + + // Get the last chapter's 'sid' attribute to form the verse reference + const chapterSid = xpath.select("//chapter", this.xmlRootNode).pop().getAttribute('sid'); + const ref = `${chapterSid}:${verseNum}`; + + // Set attributes on the newly created 'verse' element + vXmlNode.setAttribute('number', verseNum.trim()); + vXmlNode.setAttribute('style', 'v'); + vXmlNode.setAttribute('sid', ref.trim()); + } + + node2UsxCaVa(node, parentXmlNode) { + // Build elements for independent ca and va away from c and v + const style = node.type; + + // Create a new 'char' element under the parent XML node + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + charXmlNode.setAttribute('style', style); + + // Query to capture chapterNumber or verseNumber + const altNumMatch = this.usfmLanguage + .query( + `([ + (chapterNumber) + (verseNumber) + ] @alt-num)`, + ) + .captures(node); + + // Extract the alternate number from the captured range + const altNum = this.usfm + .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex) + .trim(); + + // Set the attributes on the 'char' element + charXmlNode.setAttribute('altnumber', altNum); + charXmlNode.setAttribute('closed', 'true'); + + // Append the 'char' element to the parent XML node + 
parentXmlNode.appendChild(charXmlNode); + } + + node2UsxPara(node, parentXmlNode) { + // Build paragraph nodes in USX + if (node.children[0].type.endsWith('Block')) { + for (const child of node.children[0].children) { + this.node2UsxPara(child, parentXmlNode); + } + } else if (node.type === 'paragraph') { + const paraTagCap = this.usfmLanguage + .query("(paragraph (_) @para-marker)") + .captures(node)[0]; + const paraMarker = paraTagCap.node.type; + + if (!paraMarker.endsWith("Block")) { + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of paraTagCap.node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } else if (['pi', 'ph'].includes(node.type)) { + const paraMarker = this.usfm.slice(node.children[0].startIndex, node.children[0].endIndex) + .replace("\\", "") + .trim(); + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } + + + node2UsxNotes(node, parentXmlNode) { + // Build USJ nodes for footnotes and cross-references + const tagNode = node.children[0]; + const callerNode = node.children[1]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const noteXmlNode = parentXmlNode.ownerDocument.createElement('note'); + noteXmlNode.setAttribute('style', style); + const caller = this.usfm + .substring(callerNode.startIndex, callerNode.endIndex) + .trim(); + noteXmlNode.setAttribute('caller', caller); + parentXmlNode.appendChild(noteXmlNode); + for (let i = 2; i < node.children.length - 1; i++) { + this.node2Usx(node.children[i], noteXmlNode); + } + + } + + node2UsxChar(node, parentXmlNode) { + // Build USJ nodes for character markups, both regular 
and nested + const tagNode = node.children[0]; + let childrenRange = node.children.length; + if (node.children[node.children.length - 1].type.startsWith("\\")) { + childrenRange -= 1; // Exclude the last node if it starts with '\', treating it as a closing node + } + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .replace("+", "") + .trim(); + charXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(charXmlNode); + + for (let i = 1; i < childrenRange; i++) { + this.node2Usx(node.children[i], charXmlNode); + } + + } + + node2UsxAttrib(node, parentXmlNode) { + // Add attribute values to USJ elements + const attribNameNode = node.children[0]; + let attribName = this.usfm + .slice(attribNameNode.startIndex, attribNameNode.endIndex) + .trim(); + + // Handling special cases for attribute names + if (attribName === "|") { + attribName = DEFAULT_ATTRIB_MAP[node.parent.type]; + } + if (attribName === "src") { + // for \fig + attribName = "file"; + } + + const attribValCap = this.usfmLanguage + .query("((attributeValue) @attrib-val)") + .captures(node); + + let attribValue = ""; + if (attribValCap.length > 0) { + attribValue = this.usfm + .substring( + attribValCap[0].node.startIndex, + attribValCap[0].node.endIndex, + ) + .trim(); + } + + parentXmlNode.setAttribute(attribName, attribValue); + } + + node2UsxTable(node, parentXmlNode) { + // Handle table related components and convert to USJ + if (node.type === "table") { + const tableXmlNode = parentXmlNode.ownerDocument.createElement('table'); + parentXmlNode.appendChild(tableXmlNode); + node.children.forEach((child) => { + this.node2Usx(child, tableXmlNode); + }); + } else if (node.type === "tr") { + const rowXmlNode = parentXmlNode.ownerDocument.createElement('row'); + rowXmlNode.setAttribute("style", "tr"); + parentXmlNode.appendChild(rowXmlNode); + 
node.children.slice(1).forEach((child) => { + this.node2Usx(child, rowXmlNode); + }); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + const tagNode = node.children[0]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const cellXmlNode = parentXmlNode.ownerDocument.createElement("cell"); + cellXmlNode.setAttribute("style", style); + cellXmlNode.setAttribute("align", style.includes("r") ? "end" : "start"); + parentXmlNode.appendChild(cellXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, cellXmlNode); + }); + } + } + + node2UsxMilestone(node, parentXmlNode) { + // Create ms node in USJ + + const msNameCap = this.usfmLanguage + .query( + `( + [(milestoneTag) + (milestoneStartTag) + (milestoneEndTag) + (zSpaceTag) + ] @ms-name)`, + ) + .captures(node)[0]; + + const style = this.usfm + .slice(msNameCap.node.startIndex, msNameCap.node.endIndex) + .replace("\\", "") + .trim(); + const msXmlNode = parentXmlNode.ownerDocument.createElement("ms"); + msXmlNode.setAttribute("style", style); + parentXmlNode.appendChild(msXmlNode); + node.children.forEach((child) => { + if (child.type.endsWith("Attribute")) { + this.node2Usx(child, msXmlNode); + } + }); + } + + node2UsxSpecial(node, parentXmlNode) { + // Build nodes for esb, cat, fig, optbreak in USJ + + if (node.type === "esb") { + const sidebarXmlNode = parentXmlNode.ownerDocument.createElement('sidebar'); + sidebarXmlNode.setAttribute('style', "esb"); + parentXmlNode.appendChild(sidebarXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, sidebarXmlNode); + }); + } else if (node.type === "cat") { + const catCap = this.usfmLanguage + .query("((category) @category)") + .captures(node)[0]; + const category = this.usfm + .substring(catCap.node.startIndex, catCap.node.endIndex) + .trim(); + parentXmlNode.setAttribute("category", category); + } else if (node.type === "fig") { + const figXmlNode = 
parentXmlNode.ownerDocument.createElement('figure'); + figXmlNode.setAttribute("style", "fig"); + parentXmlNode.appendChild(figXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, figXmlNode); + }); + } else if (node.type === "ref") { + const refXmlNode = parentXmlNode.ownerDocument.createElement('ref'); + parentXmlNode.appendChild(refXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, refXmlNode); + }); + } + } + + node2UsxGeneric(node, parentXmlNode) { + const tagNode = node.children[0]; + let style = this.usfm.slice(tagNode.startIndex, tagNode.endIndex).trim(); + + // Strip leading backslashes from the style or use node type + if (style.startsWith('\\')) { + style = style.replace('\\', ''); + // } else { + // style = node.type; + } + + if (style === "usfm") { + return + } + + let childrenRangeStart = 1; + + // Create a 'para' element and set its style attribute + const paraXmlNode = parentXmlNode.ownerDocument.createElement('para'); + paraXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(paraXmlNode); + + // Loop through the child nodes and recursively process them + for (let i = childrenRangeStart; i < node.children.length; i++) { + const child = node.children[i]; + if ( + CHAR_STYLE_MARKERS.includes(child.type) || + NESTED_CHAR_STYLE_MARKERS.includes(child.type) || + [ + "text", + "footnote", + "crossref", + "verseText", + "v", + "b", + "milestone", + "zNameSpace", + ].includes(child.type) + ) { + // If the child is of one of the allowed types, nest it inside the para node + this.node2Usx(child, paraXmlNode); + } else { + // Otherwise, append the child to the parent XML node + this.node2Usx(child, parentXmlNode); + } + } + + // Append the created para node to the parent XML node + } + + node2Usx(node, parentXmlNode) { + // Handling node types with respective functions + if (node.type === "id") { + this.node2UsxId(node, parentXmlNode); + } else if (node.type === "chapter") { + 
this.node2UsxChapter(node, parentXmlNode); + } else if (["cl", "cp", "cd", "vp"].includes(node.type)) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["ca", "va"].includes(node.type)) { + this.node2UsxCaVa(node, parentXmlNode); + } else if (node.type === "v") { + this.node2UsxVerse(node, parentXmlNode); + } else if (node.type === "verseText") { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } else if (["paragraph", "pi", "ph"].includes(node.type)) { + this.node2UsxPara(node, parentXmlNode); + } else if (NOTE_MARKERS.includes(node.type)) { + this.node2UsxNotes(node, parentXmlNode); + } else if ( + CHAR_STYLE_MARKERS.concat(NESTED_CHAR_STYLE_MARKERS, ["xt_standalone"]).includes(node.type) + ) { + this.node2UsxChar(node, parentXmlNode); + } else if (node.type.endsWith("Attribute")) { + this.node2UsxAttrib(node, parentXmlNode); + } else if (node.type === "text") { + let textVal = this.usfm.slice(node.startIndex, node.endIndex).trim(); + textVal = textVal.replace("~", " ") + const textNode = parentXmlNode.ownerDocument.createTextNode(textVal); + parentXmlNode.appendChild(textNode); + } else if (["table", "tr"].concat(TABLE_CELL_MARKERS).includes(node.type)) { + this.node2UsxTable(node, parentXmlNode); + } else if (node.type === "milestone" || node.type === "zNameSpace") { + this.node2UsxMilestone(node, parentXmlNode); + } else if (["esb", "cat", "fig", "ref"].includes(node.type)) { + this.node2UsxSpecial(node, parentXmlNode); + } else if ( + PARA_STYLE_MARKERS.includes(node.type) || + PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim()) + ) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["", "|"].includes(node.type.trim())) { + // Skip whitespace nodes + } else if (node.children.length > 0) { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } + // else { + // throw new Error(`Encountered unknown element: ${node}`); + // } + } +} + + +export default USXGenerator; diff 
--git a/web-usfm-parser/src/utils/types.js b/web-usfm-parser/src/utils/types.js index b6e48b6c..a0aaee5e 100644 --- a/web-usfm-parser/src/utils/types.js +++ b/web-usfm-parser/src/utils/types.js @@ -15,4 +15,18 @@ export const NON_ATTRIB_USJ_KEYS = [ "pubnumber", "category", ]; -export const NO_NEWLINE_USJ_TYPES = ["char", "note", "verse", "table:cell"]; \ No newline at end of file + +export const NON_ATTRIB_USX_KEYS = [ + "style", + "number", + "sid", + "code", + "caller", + "align", + "version", + "altnumber", + "pubnumber", + "category", +]; +export const NO_NEWLINE_USJ_TYPES = ["char", "note", "verse", "table:cell"]; +export const NO_NEWLINE_USX_TYPES = ["char", "note", "verse", "cell"]; diff --git a/web-usfm-parser/test.js b/web-usfm-parser/test.js index 13df0b41..7ab8d4dc 100644 --- a/web-usfm-parser/test.js +++ b/web-usfm-parser/test.js @@ -1,5 +1,6 @@ import {USFMParser} from './src/index.js'; import { readFile } from 'fs/promises'; +import { DOMParser } from 'xmldom'; (async () => { await USFMParser.init("tree-sitter-usfm.wasm", "tree-sitter.wasm"); @@ -12,15 +13,30 @@ import { readFile } from 'fs/promises'; // const output2 = usfmParser.usfm; // console.log({ output2 }); - const filePath = "../tests/usfmjsTests/missing_verses/origin.usfm"; + // const filePath = "../tests/usfmjsTests/missing_verses/origin.usfm"; + // const content = await readFile(filePath, 'utf-8'); // Specify encoding + // console.log(content); + + // await USFMParser.init("tree-sitter-usfm.wasm", "tree-sitter.wasm"); + // const usfmParser = new USFMParser(content); + // const output = usfmParser.toUSJ(null, null, true); + // console.log({output}) + + const filePath = "../tests/usfmjsTests/missing_verses/origin.xml"; const content = await readFile(filePath, 'utf-8'); // Specify encoding console.log(content); + console.log("*************************"); + + const doc = new DOMParser().parseFromString(content); + const usfmParser = new USFMParser(null, null, doc); + 
console.log(usfmParser.usfm) + console.log("*************************"); - await USFMParser.init("tree-sitter-usfm.wasm", "tree-sitter.wasm"); - const usfmParser = new USFMParser(content); const output = usfmParser.toUSJ(null, null, true); console.log({output}) - + console.log("*************************"); + + })(); diff --git a/web-usfm-parser/test/config.js b/web-usfm-parser/test/config.js index 869d8adf..cb375cbf 100644 --- a/web-usfm-parser/test/config.js +++ b/web-usfm-parser/test/config.js @@ -104,6 +104,28 @@ let excludeUSJs = [ `${TEST_DIR}/specExamples/character/origin.json`,// lit element treated as a body paragraph enclosing a verse! Issue from USX ] + +let excludeUSXs = [ + `${TEST_DIR}/specExamples/extended/contentCatogories2/origin.xml`, + // \ef not treated as inline content of paragraph + `${TEST_DIR}/specExamples/extended/sectionIntroductions/origin.xml`, + // verse number="+"!!! + `${TEST_DIR}/specExamples/character/origin.xml`, + // lit element treated as a body paragraph enclosing a verse! + `${TEST_DIR}/usfmjsTests/esb/origin.xml`, + // last verse text given outside of paragraph. + `${TEST_DIR}/special-cases/nbsp/origin.xml`, + // ~ not being replaced by nbsp in usfm-grammar + `${TEST_DIR}/special-cases/empty-attributes/origin.xml`, + // attributes treated as text content of marker + `${TEST_DIR}/biblica/CategoriesOnNotes/origin.xml`, + `${TEST_DIR}/biblica/CrossRefWithPipe/origin.xml`, + // ref node has type ref. Is it char or ref? + `${TEST_DIR}/usfmjsTests/usfmBodyTestD/origin.xml`, + // \v and other contents contained inside \lit. New docs doesnt have \lit + `${TEST_DIR}/usfmjsTests/usfm-body-testF/origin.xml`, + // does the ms go inside \s5 or after it? 
+] await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm"); @@ -204,5 +226,6 @@ export{ initialiseParser, isValidUsfm, excludeUSJs, + excludeUSXs, findAllMarkers }; diff --git a/web-usfm-parser/test/test_usx_conversion.js b/web-usfm-parser/test/test_usx_conversion.js new file mode 100644 index 00000000..d74a91e5 --- /dev/null +++ b/web-usfm-parser/test/test_usx_conversion.js @@ -0,0 +1,140 @@ + +import assert from 'assert'; +import fs from "node:fs"; +import { DOMImplementation, XMLSerializer, DOMParser } from 'xmldom'; +import {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSXs, findAllMarkers} from './config.js' +import {USFMParser, Filter} from '../src/index.js'; + +describe("Check successful USFM-USX conversion for positive samples", () => { + const domImpl = new DOMImplementation(); + const sampleDoc = domImpl.createDocument(null, 'usx', null); + allUsfmFiles.forEach(function(value) { + + if (isValidUsfm[value]) { + it(`Convert ${value} to USX`, async (inputUsfmPath=value) => { + //Tests if input parses without errors + const testParser = await initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + // assert(usx instanceof DOMImplementation.Document); + assert(usx.tagName === "usx"); + assert(usx.getAttribute("version") === "3.1"); + assert(usx.childNodes[0].tagName === "book"); + assert(usx.childNodes[0].getAttribute("style") === "id"); + }); + } + }); +}); + + + +describe("Ensure all markers are in USX", () => { + // Tests if all markers in USFM are present in output also + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Check for markers of ${value} in USX`, async (inputUsfmPath=value) => { + const testParser = await initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + + const inputMarkers = [... 
new Set(findAllMarkers(testParser.usfm, true))] + const allUSXNodes = getNodes(usx); + + assert.deepStrictEqual(inputMarkers, allUSXNodes, `Markers in input and generated USJ differ`) + }); + } + }); + +}); + +describe("Test USFM-USX-USFM roundtripping", () => { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Roundtrip ${value} via USX`, async (inputUsfmPath=value) => { + const testParser = await initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + assert(usx.nodeType === 1); + + const testParser2 = new USFMParser(null, null, usx); + const generatedUSFM = testParser2.usfm.trim(); + assert.strictEqual(typeof generatedUSFM, 'string'); + assert(generatedUSFM.startsWith("\\id")); + + const inputMarkers = findAllMarkers(testParser.usfm) + const finalMarkers = findAllMarkers(generatedUSFM) + assert.deepStrictEqual(inputMarkers, finalMarkers, `Markers in input and generated USFMs differ`) + + }); + } + }); + +}); + +// describe("Compare generated USX with testsuite sample", () => { + +// allUsfmFiles.forEach(function(value) { +// const usxPath = value.replace(".usfm", ".xml"); +// if (isValidUsfm[value] && ! 
//     excludeUSXs.includes(usxPath)) {
//       it(`Compare generated USX to ${usxPath}`, async (inputUsfmPath=value) => {
//         const testParser = await initialiseParser(inputUsfmPath)
//         const generatedUSX = testParser.toUSX();
//         const filePath = usxPath;
//         let fileData = null;
//         try {
//           fileData = fs.readFileSync(filePath, "utf8");
//         } catch(err) {
//           if (err.code === "ENOENT") {
//             return
//           }
//         }
//         const testsuiteUSX = new DOMParser().parseFromString(
//               fileData, 'text/xml').getElementsByTagName("usx")[0];

//         assert.deepEqual(generatedUSX, testsuiteUSX);
//       });
//     }
//   });
// });

/**
 * Collect the USFM marker names represented by a USX DOM subtree.
 *
 * The tree is walked in document order (pre-order). For every element the
 * following are recorded, in this order:
 *   - the value of its `style` attribute, when non-empty;
 *   - "ref" when the element's tag name is `ref`;
 *   - "ca" (on `chapter`) or "va" (on any other tag) when `altnumber` is set;
 *   - "cp" (on `chapter`) or "vp" (on any other tag) when `pubnumber` is set;
 *   - "cat" when `category` is set.
 *
 * @param {object} element - Root DOM node of the subtree to inspect.
 * @param {boolean} [keepNumber=true] - When false, trailing digits are
 *   stripped from every marker (e.g. "q1" becomes "q") before de-duplication.
 * @returns {string[]} Unique marker names in order of first appearance.
 */
function getNodes(element, keepNumber=true) {
  const ELEMENT_NODE = 1;
  const markers = [];

  const visit = (node) => {
    // Text, comment, CDATA and processing-instruction nodes carry no markers
    // and have no getAttribute(); skip anything that is not an element.
    if (node.nodeType !== ELEMENT_NODE) {
      return;
    }

    const style = node.getAttribute('style');
    if (style) {
      markers.push(style);
    }
    if (node.tagName === "ref") {
      markers.push("ref");
    }
    if (node.getAttribute('altnumber')) {
      markers.push(node.tagName === 'chapter' ? 'ca' : 'va');
    }
    if (node.getAttribute('pubnumber')) {
      markers.push(node.tagName === 'chapter' ? 'cp' : 'vp');
    }
    if (node.getAttribute('category')) {
      markers.push('cat');
    }

    for (const child of Array.from(node.childNodes)) {
      visit(child);
    }
  };

  visit(element);

  // Strip numbers BEFORE de-duplicating so that e.g. "q1" and "q2" collapse
  // into a single "q" entry instead of yielding duplicates.
  const normalised = keepNumber
    ? markers
    : markers.map(item => item.replace(/\d+$/, ''));
  return [...new Set(normalised)];
}