-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Python: Add a Validator class with validate_usfm() and validate_usj() methods * Python: Try implementing auto fix for common issues * Add some test cases for auto fix * Implement validation and autofix in node * Add tests for autofix in node * Fix an issue in empty attribute case in python module * Implement validation and autofix in web * Add tests for autofix in web module * Fix issue in USJ error reporting in node and web. Also remove print statements * Handle pylint issues * Add tests for USJ validation in node, web and python
- Loading branch information
1 parent
df9da06
commit edccb2c
Showing
24 changed files
with
1,127 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,13 +26,13 @@ | |
"Samuel JD <[email protected]> (https://github.com/samueljd)" | ||
], | ||
"dependencies": { | ||
"ajv": "^8.17.1", | ||
"tree-sitter": "0.21.1", | ||
"tree-sitter-usfm3": "3.0.0-beta.9", | ||
"xmldom": "^0.6.0", | ||
"xpath": "^0.0.34" | ||
}, | ||
"devDependencies": { | ||
"ajv": "^8.17.1", | ||
"glob": "^11.0.0", | ||
"mocha": "^10.7.3", | ||
"parcel": "^2.12.0" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
const {USFMParser, Filter, Format } = require("./usfmParser"); | ||
const {Validator} = require("./validator"); | ||
|
||
// Re-export the parser names and the validator from this module.
exports.USFMParser = USFMParser;
exports.Filter = Filter;
exports.Format = Format;
// NOTE(review): the repeated assignment below looks like the removed/added
// pair of a diff rendering rather than intentional code — confirm and drop one.
exports.Format = Format;
exports.Validator = Validator;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
const Parser = require('tree-sitter'); | ||
const USFM3 = require('tree-sitter-usfm3'); | ||
|
||
const fs = require('node:fs'); | ||
const Ajv = require('ajv'); | ||
|
||
const { Query } = Parser; | ||
|
||
/**
 * Validates USFM text (using the tree-sitter USFM3 grammar) and USJ objects
 * (using a JSON schema compiled with Ajv), and attempts to repair a handful
 * of common USFM mistakes automatically.
 *
 * Results are reported through instance state: `message` holds the latest
 * human-readable report and `USFMErrors` holds the syntax nodes flagged by
 * the most recent `isValidUSFM` call.
 */
class Validator {
  /**
   * @param {string} [usjSchemaPath='../schemas/usj.js'] Path of the USJ JSON
   *   schema file, passed to `fs.readFileSync` as given. If the schema cannot
   *   be read or compiled the error is logged and `USJValidator` stays `null`.
   */
  constructor(usjSchemaPath = '../schemas/usj.js') {
    this.USFMParser = new Parser();
    this.USFMParser.setLanguage(USFM3);
    // Parse options are held per instance so the shared tree-sitter module
    // object is not modified.
    this.parserOptions = {
      bufferSize: 1024 * 1024,
    };

    this.USFMErrors = [];

    // Load the schema for validation
    this.USJValidator = null;
    try {
      const ajv = new Ajv();
      const schemaStr = fs.readFileSync(usjSchemaPath, 'utf8');
      const schema = JSON.parse(schemaStr);
      this.USJValidator = ajv.compile(schema);
    } catch (error) {
      console.error("Error loading schema:", error);
    }

    this.message = "";
    this.modifiedUSFM = "";
    this.usfm = "";
  }

  /**
   * Checks a USJ object against the loaded schema.
   *
   * @param {object} usj Parsed USJ document.
   * @returns {boolean} `true` when the schema validator accepts `usj`;
   *   otherwise `false`, with one line per validator error in `this.message`.
   * @throws {Error} If no schema validator is available (loading failed in
   *   the constructor).
   */
  isValidUSJ(usj) {
    this.message = "";

    if (typeof this.USJValidator !== "function") {
      throw new Error("USJ schema is not loaded; cannot validate USJ");
    }

    if (this.USJValidator(usj) === true) {
      return true;
    }
    for (const err of this.USJValidator.errors || []) {
      this.message += `Error at ${err.instancePath}: ${err.message}\n`;
    }
    return false;
  }

  /**
   * Parses `usfm` and collects every ERROR node and every missing node.
   *
   * @param {string} usfm USFM text to check.
   * @returns {boolean} `true` when no problem nodes were found; otherwise
   *   `false`, with the formatted report in `this.message` and the offending
   *   nodes in `this.USFMErrors`.
   */
  isValidUSFM(usfm) {
    this.usfm = usfm;
    this.USFMErrors = [];

    // Inputs longer than 25000 characters are parsed with the explicit
    // bufferSize option; shorter ones use the parser defaults.
    const tree = usfm.length > 25000
      ? this.USFMParser.parse(usfm, null, this.parserOptions)
      : this.USFMParser.parse(usfm);

    const errorQuery = new Query(USFM3, "(ERROR) @errors");
    for (const capture of errorQuery.captures(tree.rootNode)) {
      this.USFMErrors.push(capture.node);
    }

    this.checkForMissing(tree.rootNode);

    if (this.USFMErrors.length > 0) {
      this.message = this.formatErrors();
      return false;
    }
    return true;
  }

  /**
   * Walks the tree below `node` and records every node flagged `isMissing`
   * in `this.USFMErrors`. Children of a missing node are not visited.
   *
   * @param {object} node Syntax node whose descendants are inspected.
   */
  checkForMissing(node) {
    for (const child of node.children) {
      if (child.isMissing) {
        this.USFMErrors.push(child);
      } else {
        this.checkForMissing(child);
      }
    }
  }

  /**
   * Builds the human-readable report for the nodes in `this.USFMErrors`,
   * quoting the relevant part of `this.usfm` for each.
   *
   * @returns {string} Report starting with "Errors present:".
   */
  formatErrors() {
    const errLines = this.USFMErrors.map((err) => {
      if (err.isMissing) {
        // Show a little surrounding text, since a missing node has no text of its own.
        const start = Math.max(0, err.startIndex - 3);
        const end = Math.min(this.usfm.length, err.startIndex + 10);
        return `At ${err.startIndex}:Missing something here:${this.usfm.slice(start, end)}`;
      }
      return `At ${err.startPosition.row}:${err.startPosition.column}, Error: ${this.usfm.substring(err.startIndex, err.endIndex)}`;
    });
    return `Errors present:\n\t${errLines.join('\n\t')}`;
  }

  /**
   * Tries to repair known classes of USFM errors. Each pass validates the
   * text, applies at most one fix per reported node, and then recurses on the
   * result until the text validates or a pass changes nothing.
   *
   * NOTE(review): there is no cap on the number of passes; a fix that keeps
   * changing the text without resolving the error would recurse indefinitely.
   *
   * @param {string} usfm USFM text to repair.
   * @param {boolean} [fixed=false] Internal flag: `true` on recursive calls,
   *   so that success is reported as "Fixed Errors in USFM".
   * @returns {string} The repaired text, or the text as it stood when no
   *   further fix applied (`this.message` then starts with
   *   "Cannot fix these errors:").
   */
  autoFixUSFM(usfm, fixed = false) {
    if (this.isValidUSFM(usfm)) {
      this.message = fixed ? "Fixed Errors in USFM" : "No Errors in USFM";
      return usfm;
    }

    let modifiedUSFM = usfm;
    let changed = false;

    // Replaces the first literal occurrence of `target`. The replacer
    // function keeps `$` sequences in the inserted text from being expanded.
    const replaceLiteral = (text, target, replacement) =>
      text.replace(target, () => replacement);

    // Node offsets refer to `usfm` (the text that was parsed in this pass),
    // so context is always sliced from `usfm`, never from the edited copy.
    const sliceOriginal = (start, end) =>
      usfm.slice(Math.max(0, start), Math.min(usfm.length, end));

    for (const error of this.USFMErrors) {
      const errorText = usfm.substring(error.startIndex, error.endIndex);
      const parentType = error.parent?.type;
      const previousType = error.previousSibling?.type;
      const lacksParagraph = () => !error.children.some((ch) => ch.type === "paragraph");

      // No \p after \s5
      if (error.isError && errorText.startsWith("\\s5") && lacksParagraph()) {
        modifiedUSFM = modifiedUSFM.replace(/\\s5[\s\n\r]*/g, '\\s5 \n\\p\n');
        changed = true;
      }
      // Missing space after \s5
      else if (error.isMissing && parentType === "sTag" && error.toString() === '(MISSING " ")') {
        modifiedUSFM = modifiedUSFM.replace(/\\s5\n/g, '\\s5 \n');
        changed = true;
      }
      // Book code is missing (empty id marker)
      else if (bookCodeMissingPattern.test(modifiedUSFM)) {
        modifiedUSFM = modifiedUSFM.replace(/\\id[\s\n\r]*\\/g, '\\id XXX xxx\n\\');
        changed = true;
      }
      // \p not given after section heading
      else if (error.isError && errorText.startsWith("\\v") && parentType === "s" && lacksParagraph()) {
        const toReplace = sliceOriginal(error.parent.startIndex, error.startIndex);
        modifiedUSFM = replaceLiteral(modifiedUSFM, toReplace, `${toReplace}\\p\n`);
        changed = true;
      }
      // Space missing between \v and number
      else if (vWithoutSpacePattern.test(errorText)) {
        modifiedUSFM = modifiedUSFM.replace(vWithoutSpacePattern, "$1 $2");
        changed = true;
      }
      // Space missing between \c and number
      else if (cWithoutSpacePattern.test(errorText)) {
        modifiedUSFM = modifiedUSFM.replace(cWithoutSpacePattern, "$1 $2");
        changed = true;
      }
      // \p not given at chapter start
      else if (error.isError && errorText.startsWith("\\v") && previousType === "chapter" && lacksParagraph()) {
        const toReplace = sliceOriginal(error.previousSibling.startIndex, error.startIndex);
        modifiedUSFM = replaceLiteral(modifiedUSFM, toReplace, `${toReplace}\\p\n`);
        changed = true;
      }
      // Stray slash not with a valid marker
      else if (errorText.startsWith("\\") && !validMarkersPattern.test(errorText)) {
        modifiedUSFM = replaceLiteral(modifiedUSFM, errorText, errorText.slice(1));
        changed = true;
      }
      // Just a single problematic marker (could be w/o text)
      else if (errorText.startsWith("\\") && validMarkersPattern.test(errorText)) {
        // Match on a few characters of context so the intended occurrence is removed.
        const toReplace = sliceOriginal(error.startIndex - 5, error.endIndex + 5);
        const replacement = toReplace.replace(errorText, "");
        modifiedUSFM = replaceLiteral(modifiedUSFM, toReplace, replacement);
        changed = true;
      }
      // Empty attribute
      else if (errorText.trim() === "|") {
        const toReplace = sliceOriginal(error.startIndex - 5, error.endIndex + 5);
        const replacement = toReplace.replace(errorText, "");
        modifiedUSFM = replaceLiteral(modifiedUSFM, toReplace, replacement);
        changed = true;
      }
      // Stray content in the chapter line
      else if (parentType === "chapter" && previousType === "c" && !errorText.includes("\\")) {
        modifiedUSFM = modifiedUSFM.replace(errorText, "");
        changed = true;
      }
    }

    if (!changed || modifiedUSFM === usfm) {
      const errStr = this.formatErrors();
      this.message = `Cannot fix these errors:\n\t${errStr}`;
      return modifiedUSFM;
    }

    // Something changed: validate again and fix whatever is still reported.
    return this.autoFixUSFM(modifiedUSFM, true);
  }
}
|
||
// Patterns used by Validator.autoFixUSFM to recognise fixable problems.

// An \id marker followed only by whitespace and then the next backslash,
// i.e. no book code after \id.
const bookCodeMissingPattern = /\\id[\s\n\r]*\\/;
// \v directly followed by digits; group 1 is the marker, group 2 the number.
const vWithoutSpacePattern = /(\\v)(\d+)/;
// \c directly followed by digits; group 1 is the marker, group 2 the number.
const cWithoutSpacePattern = /(\\c)(\d+)/;
// One of the listed markers followed by a digit, whitespace or end of input.
// The pattern is not anchored, so it matches a marker anywhere in the tested text.
const validMarkersPattern = /(\\id|\\usfm|\\ide|\\ref|\\h|\\toc|\\toca|\\sts|\\rem|\\restore|\\lit|\\iqt|\\imt|\\imte|\\is|\\io|\\ior|\\iot|\\ip|\\im|\\ipi|\\imi|\\ili|\\ipq|\\imq|\\ipr|\\ib|\\iq|\\ie|\\iex|\\v|\\va|\\vp|\\c|\\cl|\\ca|\\cp|\\cd|\\mt|\\mte|\\ms|\\mr|\\s|\\sr|\\r|\\sp|\\d|\\sd|\\p|\\m|\\po|\\pr|\\cls|\\pmo|\\pm|\\pmc|\\pmr|\\pi|\\mi|\\nb|\\pc|\\ph|\\phi|\\b|\\q|\\qr|\\qc|\\qs|\\qa|\\qac|\\qm|\\qd|\\lh|\\lf|\\li|\\lim|\\liv|\\lik|\\litl|\\tr|\\th|\\thr|\\tc|\\tcr|\\f|\\fe|\\ef|\\fr|\\fq|\\fqa|\\fk|\\fl|\\fw|\\fp|\\ft|\\fdc|\\fv|\\fm|\\x|\\xo|\\xk|\\xq|\\xt|\\xta|\\xop|\\xot|\\xnt|\\xdc|\\rq|\\add|\\bk|\\dc|\\k|\\nd|\\ord|\\pn|\\png|\\addpn|\\qt|\\sig|\\sls|\\tl|\\wj|\\em|\\bd|\\it|\\bdit|\\no|\\sc|\\sup|\\ndx|\\pro|\\rb|\\w|\\wg|\\wh|\\wa|\\fig|\\jmp|\\pb|\\z|\\esb|\\esbe|\\cat)(\d|\s|\n|\r|$)/;
|
||
exports.Validator = Validator; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
const assert = require('assert'); | ||
const fs = require('node:fs'); | ||
const {glob} = require('glob'); | ||
const {Validator} = require("../src/index"); | ||
|
||
// Fixture locations: USFM files to auto-fix and the USJ spec examples.
const TEST_DIR = "../tests";
const allUSFMFiles = glob.sync(`${TEST_DIR}/autofix/*`);
const sampleUSJs = glob.sync(`${TEST_DIR}/specExamples/*/origin.json`);
|
||
// One test per fixture under autofix/: after auto-fixing, the text must validate.
describe("Try autofixing errors in USFM", () => {
  for (const usfmPath of allUSFMFiles) {
    it(`Fix ${usfmPath}`, () => {
      const validator = new Validator();
      assert(validator instanceof Validator);

      const inputUsfm = fs.readFileSync(usfmPath, 'utf8');
      // The input is validated first, but its result is deliberately not asserted.
      validator.isValidUSFM(inputUsfm);

      const fixedUsfm = validator.autoFixUSFM(inputUsfm);
      assert.ok(validator.isValidUSFM(fixedUsfm));
    });
  }
});
|
||
describe("Validate USJ", () => {
  // Every spec example is expected to pass schema validation unchanged.
  for (const usjPath of sampleUSJs) {
    it(`Validate ${usjPath}`, () => {
      const validator = new Validator();
      assert(validator instanceof Validator);

      const usj = JSON.parse(fs.readFileSync(usjPath, 'utf8'));
      assert.ok(validator.isValidUSJ(usj));
    });
  }

  // Renaming the first "code" and the first "content" in the raw JSON text
  // must make validation fail and leave a non-empty message.
  for (const usjPath of sampleUSJs) {
    it(`Validate ${usjPath} and report error`, () => {
      const validator = new Validator();
      assert(validator instanceof Validator);

      const corrupted = fs.readFileSync(usjPath, 'utf8')
        .replace("code", "cooode")
        .replace("content", "contents");
      const usj = JSON.parse(corrupted);

      assert.ok(!validator.isValidUSJ(usj));
      assert(validator.message !== "");
    });
  }
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,12 @@ | ||
'''Entry point of the package with its public values''' | ||
|
||
from usfm_grammar import usfm_parser | ||
from usfm_grammar import usfm_generator | ||
from usfm_grammar import validator | ||
|
||
# Re-export the parser module's names at package level.
Filter = usfm_parser.Filter
Format = usfm_parser.Format
USFMParser = usfm_parser.USFMParser

# Expose the validator class next to the parser names.
Validator = validator.Validator

__version__ = "3.0.0-beta.10"
Oops, something went wrong.