Skip to content

Commit

Permalink
Validator class (#268)
Browse files Browse the repository at this point in the history
* Python: Add a Validator class with validate_usfm() and validate_usj() methods

* Python: Try implementing auto fix for common issues

* Add some test cases for auto fix

* Implement validation and autofix in node

* Add tests for autofix in node

* Fix an issue in empty attribute case in python module

* Implement validation and autofix in web

* Add tests for autofix in web module

* Fix issue in USJ error reporting in node and web. Also remove print statements

* Handle pylint issues

* Add tests for USJ validation in node, web and python
  • Loading branch information
kavitharaju authored Nov 5, 2024
1 parent df9da06 commit edccb2c
Show file tree
Hide file tree
Showing 24 changed files with 1,127 additions and 5 deletions.
2 changes: 1 addition & 1 deletion node-usfm-parser/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@
"Samuel JD <[email protected]> (https://github.com/samueljd)"
],
"dependencies": {
"ajv": "^8.17.1",
"tree-sitter": "0.21.1",
"tree-sitter-usfm3": "3.0.0-beta.9",
"xmldom": "^0.6.0",
"xpath": "^0.0.34"
},
"devDependencies": {
"ajv": "^8.17.1",
"glob": "^11.0.0",
"mocha": "^10.7.3",
"parcel": "^2.12.0"
Expand Down
4 changes: 3 additions & 1 deletion node-usfm-parser/src/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// Package entry point: re-exports the parser API and the Validator class.
const { USFMParser, Filter, Format } = require("./usfmParser");
const { Validator } = require("./validator");

exports.USFMParser = USFMParser;
exports.Filter = Filter;
exports.Format = Format;
exports.Validator = Validator;
215 changes: 215 additions & 0 deletions node-usfm-parser/src/validator.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
const Parser = require('tree-sitter');
const USFM3 = require('tree-sitter-usfm3');

const fs = require('node:fs');
const Ajv = require('ajv');

const { Query } = Parser;

/**
 * Validates USFM text and USJ objects, and attempts to repair a fixed set of
 * common USFM mistakes.
 *
 * USFM is checked by parsing it with tree-sitter (USFM3 grammar) and
 * collecting ERROR and MISSING nodes. USJ is checked with a validator that
 * Ajv compiles from a JSON schema file. Human-readable results of the last
 * operation are left in `this.message`.
 */
class Validator {
  /**
   * @param {string} [usjSchemaPath='../schemas/usj.js'] Path of the JSON
   *   schema file used for USJ validation. It is handed to `fs.readFileSync`
   *   unchanged, so a relative path is resolved against the process working
   *   directory.
   */
  constructor(usjSchemaPath = '../schemas/usj.js') {
    this.USFMParser = new Parser();
    this.USFMParser.setLanguage(USFM3);
    // Options passed to the parser for large inputs (see isValidUSFM).
    // Kept on the instance only; the imported Parser class is not modified.
    this.parserOptions = {
      bufferSize: 1024 * 1024,
    };

    this.USFMErrors = [];

    // Schema loading is best-effort: on failure the problem is logged and
    // USJValidator stays null, which isValidUSJ reports through `message`.
    this.USJValidator = null;
    try {
      const ajv = new Ajv();
      const schemaStr = fs.readFileSync(usjSchemaPath, 'utf8');
      const schema = JSON.parse(schemaStr);
      this.USJValidator = ajv.compile(schema);
    } catch (error) {
      console.error("Error loading schema:", error);
    }

    this.message = "";
    this.modifiedUSFM = "";
    this.usfm = "";
  }

  /**
   * Checks a USJ object against the loaded schema.
   *
   * @param {*} usj Parsed USJ value to validate.
   * @returns {boolean} true when the value satisfies the schema. Otherwise
   *   false, with one "Error at <path>: <reason>" line per schema error (or a
   *   note that no schema is loaded) in `this.message`.
   */
  isValidUSJ(usj) {
    this.message = "";

    if (typeof this.USJValidator !== "function") {
      this.message = "USJ schema could not be loaded, so USJ cannot be validated";
      return false;
    }
    if (this.USJValidator(usj) === true) {
      return true;
    }
    const schemaErrors = this.USJValidator.errors || [];
    this.message = schemaErrors
      .map((err) => `Error at ${err.instancePath}: ${err.message}\n`)
      .join("");
    return false;
  }

  /**
   * Parses USFM text and records every ERROR and MISSING node found.
   *
   * @param {string} usfm USFM text to check.
   * @returns {boolean} true when no problem nodes were found. Otherwise
   *   false, with the nodes in `this.USFMErrors` and a formatted report in
   *   `this.message`.
   */
  isValidUSFM(usfm) {
    this.usfm = usfm;
    this.USFMErrors = [];
    // Do not leave a report from an earlier call behind when this text is valid.
    this.message = "";

    // Inputs above 25000 characters are parsed with the enlarged buffer.
    const tree = usfm.length > 25000
      ? this.USFMParser.parse(usfm, null, this.parserOptions)
      : this.USFMParser.parse(usfm);

    const errorQuery = new Query(USFM3, "(ERROR) @errors");
    for (const capture of errorQuery.captures(tree.rootNode)) {
      this.USFMErrors.push(capture.node);
    }

    this.checkForMissing(tree.rootNode);

    if (this.USFMErrors.length > 0) {
      this.message = this.formatErrors();
      return false;
    }
    return true;
  }

  /**
   * Walks the tree depth-first and appends every node flagged `isMissing`
   * to `this.USFMErrors`. Children of a missing node are not visited.
   *
   * @param {object} node Syntax node whose descendants are inspected.
   */
  checkForMissing(node) {
    for (const child of node.children) {
      if (child.isMissing) {
        this.USFMErrors.push(child);
      } else {
        this.checkForMissing(child);
      }
    }
  }

  /**
   * Renders `this.USFMErrors` as a multi-line report, using `this.usfm` to
   * quote the offending text.
   *
   * @returns {string} "Errors present:" followed by one tab-indented line
   *   per recorded node.
   */
  formatErrors() {
    const errLines = this.USFMErrors.map((err) => {
      if (err.isMissing) {
        // A missing node has no text of its own, so quote a small window
        // (3 characters before, 10 after) around its position instead.
        const start = Math.max(0, err.startIndex - 3);
        const end = Math.min(this.usfm.length, err.startIndex + 10);
        return `At ${err.startIndex}:Missing something here:${this.usfm.slice(start, end)}`;
      }
      return `At ${err.startPosition.row}:${err.startPosition.column}, Error: ${this.usfm.substring(err.startIndex, err.endIndex)}`;
    });
    return `Errors present:\n\t${errLines.join('\n\t')}`;
  }

  /**
   * Repeatedly validates the text and applies the known repairs until the
   * text is valid, a pass changes nothing, or `maxPasses` is reached.
   *
   * @param {string} usfm USFM text to repair.
   * @param {boolean} [fixed=false] Whether the text has already been changed
   *   by an earlier repair; only affects the success message.
   * @param {number} [maxPasses=100] Upper bound on repair passes. It stops a
   *   repair that keeps altering the text without clearing the error from
   *   running forever.
   * @returns {string} The repaired text when it became valid, otherwise the
   *   text as it stood after the last pass. `this.message` says which.
   */
  autoFixUSFM(usfm, fixed = false, maxPasses = 100) {
    let current = usfm;
    let wasFixed = fixed;

    for (let pass = 0; ; pass += 1) {
      if (this.isValidUSFM(current)) {
        this.message = wasFixed ? "Fixed Errors in USFM" : "No Errors in USFM";
        return current;
      }

      let modifiedUSFM = current;
      // Node indexes refer to `current` (the text that was just parsed), so
      // error text and context windows are always sliced from `current`,
      // never from the partially modified copy. Replacements are supplied
      // through functions so "$" sequences in the text are taken literally.
      const errorsToFix = pass < maxPasses ? this.USFMErrors : [];
      for (const error of errorsToFix) {
        const errorText = current.substring(error.startIndex, error.endIndex);
        // parent / previousSibling can be null (e.g. for the root node).
        const parentType = error.parent?.type;
        const previousType = error.previousSibling?.type;
        const lacksParagraph = () => !error.children.some((ch) => ch.type === "paragraph");

        if (error.isError && errorText.startsWith("\\s5") && lacksParagraph()) {
          // No \p after \s5
          modifiedUSFM = modifiedUSFM.replace(/\\s5[\s\n\r]*/g, '\\s5 \n\\p\n');
        } else if (error.isMissing && parentType === "sTag" && error.toString() === '(MISSING " ")') {
          // Missing space after \s5
          modifiedUSFM = modifiedUSFM.replace(/\\s5\n/g, '\\s5 \n');
        } else if (bookCodeMissingPattern.test(modifiedUSFM)) {
          // Book code is missing (empty id marker)
          modifiedUSFM = modifiedUSFM.replace(/\\id[\s\n\r]*\\/g, '\\id XXX xxx\n\\');
        } else if (error.isError && errorText.startsWith("\\v") && parentType === "s" && lacksParagraph()) {
          // \p not given after section heading
          const toReplace = current.slice(error.parent.startIndex, error.startIndex);
          modifiedUSFM = modifiedUSFM.replace(toReplace, () => `${toReplace}\\p\n`);
        } else if (vWithoutSpacePattern.test(errorText)) {
          // Space missing between \v and number
          modifiedUSFM = modifiedUSFM.replace(vWithoutSpacePattern, "$1 $2");
        } else if (cWithoutSpacePattern.test(errorText)) {
          // Space missing between \c and number
          modifiedUSFM = modifiedUSFM.replace(cWithoutSpacePattern, "$1 $2");
        } else if (error.isError && errorText.startsWith("\\v") && previousType === "chapter" && lacksParagraph()) {
          // \p not given at chapter start
          const toReplace = current.slice(error.previousSibling.startIndex, error.startIndex);
          modifiedUSFM = modifiedUSFM.replace(toReplace, () => `${toReplace}\\p\n`);
        } else if (errorText.startsWith("\\") && !validMarkersPattern.test(errorText)) {
          // Stray slash not with a valid marker: drop the backslash
          modifiedUSFM = modifiedUSFM.replace(errorText, () => errorText.slice(1));
        } else if (
          (errorText.startsWith("\\") && validMarkersPattern.test(errorText)) ||
          errorText.trim() === "|"
        ) {
          // Just a single problematic marker (could be w/o text), or an
          // empty attribute: remove the error text, located via a window of
          // 5 characters on either side so the right occurrence is hit.
          const start = Math.max(0, error.startIndex - 5);
          const end = Math.min(current.length, error.endIndex + 5);
          const toReplace = current.slice(start, end);
          const replacement = toReplace.replace(errorText, "");
          modifiedUSFM = modifiedUSFM.replace(toReplace, () => replacement);
        } else if (parentType === "chapter" && previousType === "c" && !errorText.includes("\\")) {
          // Stray content in the chapter line
          modifiedUSFM = modifiedUSFM.replace(errorText, "");
        }
      }

      if (modifiedUSFM === current) {
        // Nothing applicable (or the pass limit was hit): report what is left.
        const errStr = this.formatErrors();
        this.message = `Cannot fix these errors:\n\t${errStr}`;
        return current;
      }

      current = modifiedUSFM;
      wasFixed = true;
    }
  }
}

// Patterns used by Validator.autoFixUSFM to recognise repairable mistakes.
// None of them is anchored, so each matches anywhere in the tested string.

// An \id marker followed only by whitespace before the next backslash,
// i.e. an \id line with no book code.
const bookCodeMissingPattern = /\\id[\s\n\r]*\\/;
// \v directly followed by digits; group 1 is the marker, group 2 the number.
const vWithoutSpacePattern = /(\\v)(\d+)/;
// \c directly followed by digits; group 1 is the marker, group 2 the number.
const cWithoutSpacePattern = /(\\c)(\d+)/;
// One of the listed marker names followed by a digit, whitespace, or the end
// of the string (so numbered forms such as \s5 or \toc1 also match).
const validMarkersPattern = /(\\id|\\usfm|\\ide|\\ref|\\h|\\toc|\\toca|\\sts|\\rem|\\restore|\\lit|\\iqt|\\imt|\\imte|\\is|\\io|\\ior|\\iot|\\ip|\\im|\\ipi|\\imi|\\ili|\\ipq|\\imq|\\ipr|\\ib|\\iq|\\ie|\\iex|\\v|\\va|\\vp|\\c|\\cl|\\ca|\\cp|\\cd|\\mt|\\mte|\\ms|\\mr|\\s|\\sr|\\r|\\sp|\\d|\\sd|\\p|\\m|\\po|\\pr|\\cls|\\pmo|\\pm|\\pmc|\\pmr|\\pi|\\mi|\\nb|\\pc|\\ph|\\phi|\\b|\\q|\\qr|\\qc|\\qs|\\qa|\\qac|\\qm|\\qd|\\lh|\\lf|\\li|\\lim|\\liv|\\lik|\\litl|\\tr|\\th|\\thr|\\tc|\\tcr|\\f|\\fe|\\ef|\\fr|\\fq|\\fqa|\\fk|\\fl|\\fw|\\fp|\\ft|\\fdc|\\fv|\\fm|\\x|\\xo|\\xk|\\xq|\\xt|\\xta|\\xop|\\xot|\\xnt|\\xdc|\\rq|\\add|\\bk|\\dc|\\k|\\nd|\\ord|\\pn|\\png|\\addpn|\\qt|\\sig|\\sls|\\tl|\\wj|\\em|\\bd|\\it|\\bdit|\\no|\\sc|\\sup|\\ndx|\\pro|\\rb|\\w|\\wg|\\wh|\\wa|\\fig|\\jmp|\\pb|\\z|\\esb|\\esbe|\\cat)(\d|\s|\n|\r|$)/;

exports.Validator = Validator;
54 changes: 54 additions & 0 deletions node-usfm-parser/test/test_auto_fix.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
const assert = require('assert');
const fs = require('node:fs');
const {glob} = require('glob');
const {Validator} = require("../src/index");

const TEST_DIR = "../tests";
const allUSFMFiles = glob.sync(TEST_DIR+'/autofix/*');
const sampleUSJs = glob.sync(TEST_DIR+'/specExamples/*/origin.json')

describe("Try autofixing errors in USFM", () => {
  // One test per file found under the autofix fixture directory: after
  // autoFixUSFM has run, the returned text must validate cleanly.
  allUSFMFiles.forEach((inputUsfmPath) => {
    it(`Fix ${inputUsfmPath}`, () => {
      const testValidator = new Validator();
      assert(testValidator instanceof Validator);

      const inputUsfm = fs.readFileSync(inputUsfmPath, 'utf8');
      const fixedUsfm = testValidator.autoFixUSFM(inputUsfm);

      // Surface the validator's own report when the fix did not succeed.
      assert.ok(testValidator.isValidUSFM(fixedUsfm), testValidator.message);
    });
  });
});

describe("Validate USJ", () => {
  sampleUSJs.forEach((inputUsjPath) => {
    // The sample USJ file must satisfy the schema as-is.
    it(`Validate ${inputUsjPath}`, () => {
      const testValidator = new Validator();
      assert(testValidator instanceof Validator);

      const usj = JSON.parse(fs.readFileSync(inputUsjPath, 'utf8'));
      assert.ok(testValidator.isValidUSJ(usj), testValidator.message);
    });
  });

  sampleUSJs.forEach((inputUsjPath) => {
    // Renaming the first "code" and "content" occurrences in the raw JSON
    // text must make validation fail and produce a non-empty message.
    it(`Validate ${inputUsjPath} and report error`, () => {
      const testValidator = new Validator();
      assert(testValidator instanceof Validator);

      const brokenUsjText = fs.readFileSync(inputUsjPath, 'utf8')
        .replace("code", "cooode")
        .replace("content", "contents");
      const usj = JSON.parse(brokenUsjText);

      assert.ok(!testValidator.isValidUSJ(usj));
      assert.notStrictEqual(testValidator.message, "");
    });
  });
});
4 changes: 3 additions & 1 deletion py-usfm-parser/src/usfm_grammar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
'''Entry point of the package with its public values'''

# Submodules of the package; usfm_generator is imported here but nothing
# from it is aliased below.
from usfm_grammar import usfm_parser
from usfm_grammar import usfm_generator
from usfm_grammar import validator

# Package-level aliases for the parser module's names.
Filter = usfm_parser.Filter
Format = usfm_parser.Format
USFMParser = usfm_parser.USFMParser

# Package-level alias for the validator class.
Validator = validator.Validator

__version__ = "3.0.0-beta.10"
Loading

0 comments on commit edccb2c

Please sign in to comment.