Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validator class #268

Merged
merged 11 commits into from
Nov 5, 2024
2 changes: 1 addition & 1 deletion node-usfm-parser/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@
"Samuel JD <[email protected]> (https://github.com/samueljd)"
],
"dependencies": {
"ajv": "^8.17.1",
"tree-sitter": "0.21.1",
"tree-sitter-usfm3": "3.0.0-beta.9",
"xmldom": "^0.6.0",
"xpath": "^0.0.34"
},
"devDependencies": {
"ajv": "^8.17.1",
"glob": "^11.0.0",
"mocha": "^10.7.3",
"parcel": "^2.12.0"
Expand Down
4 changes: 3 additions & 1 deletion node-usfm-parser/src/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// Package entry point: re-exports the parser API from ./usfmParser together
// with the Validator class from ./validator.
const { USFMParser, Filter, Format } = require("./usfmParser");
const { Validator } = require("./validator");

exports.USFMParser = USFMParser;
exports.Filter = Filter;
exports.Format = Format;
exports.Validator = Validator;
215 changes: 215 additions & 0 deletions node-usfm-parser/src/validator.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
const Parser = require('tree-sitter');
const USFM3 = require('tree-sitter-usfm3');

const fs = require('node:fs');
const Ajv = require('ajv');

const { Query } = Parser;

/**
 * Validates USFM text with the tree-sitter USFM3 grammar, validates USJ
 * objects against an Ajv-compiled JSON schema, and offers a rule-based
 * auto-fix for a fixed set of USFM mistakes.
 *
 * After each validation call, `this.message` holds a human-readable report.
 */
class Validator {
  /**
   * @param {string} [usjSchemaPath='../schemas/usj.js'] Path of the file
   *   holding the USJ JSON schema. Its content is read as UTF-8 and parsed
   *   with JSON.parse. If loading fails the error is logged and
   *   `USJValidator` stays null.
   */
  constructor(usjSchemaPath = '../schemas/usj.js') {
    this.USFMParser = new Parser();
    this.USFMParser.setLanguage(USFM3);
    // Options handed to parse() for long inputs (see isValidUSFM). Kept on
    // the instance only; nothing is written onto the shared Parser class.
    this.parserOptions = {
      bufferSize: 1024 * 1024,
    };

    this.USFMErrors = [];

    // Load the schema for validation
    this.USJValidator = null;
    try {
      const ajv = new Ajv();
      // Honour the caller-supplied path instead of a hard-coded location.
      const schemaStr = fs.readFileSync(usjSchemaPath, 'utf8');
      const schema = JSON.parse(schemaStr);
      this.USJValidator = ajv.compile(schema);
    } catch (error) {
      console.error("Error loading schema:", error);
    }

    this.message = "";
    this.modifiedUSFM = "";
    this.usfm = "";
  }

  /**
   * Validate a USJ object against the loaded schema.
   *
   * @param {*} usj Parsed USJ value to validate.
   * @returns {boolean} true when the schema accepts `usj`. Otherwise false,
   *   with `this.message` listing one line per schema error, or explaining
   *   that no schema is loaded.
   */
  isValidUSJ(usj) {
    this.message = "";

    // The constructor leaves USJValidator null when the schema could not be
    // loaded; report that instead of failing with a TypeError.
    if (typeof this.USJValidator !== 'function') {
      this.message = "USJ schema is not loaded; cannot validate USJ\n";
      return false;
    }

    if (this.USJValidator(usj) === true) {
      return true;
    }
    for (const err of this.USJValidator.errors ?? []) {
      this.message += `Error at ${err.instancePath}: ${err.message}\n`;
    }
    return false;
  }

  /**
   * Parse `usfm` and collect every ERROR node and every missing node.
   *
   * Side effects: stores the input in `this.usfm`, refills
   * `this.USFMErrors`, and sets `this.message` when errors are found.
   *
   * @param {string} usfm USFM text to check.
   * @returns {boolean} true when the parse tree has no error or missing nodes.
   */
  isValidUSFM(usfm) {
    this.usfm = usfm;
    this.USFMErrors = [];
    // Long inputs are parsed with the enlarged buffer from parserOptions.
    const tree = usfm.length > 25000
      ? this.USFMParser.parse(usfm, null, this.parserOptions)
      : this.USFMParser.parse(usfm);

    const errorQuery = new Query(USFM3, "(ERROR) @errors");
    for (const capture of errorQuery.captures(tree.rootNode)) {
      this.USFMErrors.push(capture.node);
    }

    this.checkForMissing(tree.rootNode);

    if (this.USFMErrors.length > 0) {
      this.message = this.formatErrors();
      return false;
    }
    return true;
  }

  /**
   * Depth-first walk that appends every node flagged `isMissing` to
   * `this.USFMErrors`. Children of a missing node are not visited.
   *
   * @param {object} node Node exposing a `children` array.
   */
  checkForMissing(node) {
    for (const child of node.children) {
      if (child.isMissing) {
        this.USFMErrors.push(child);
      } else {
        this.checkForMissing(child);
      }
    }
  }

  /**
   * Render `this.USFMErrors` as a multi-line report using `this.usfm`.
   *
   * Missing nodes show a snippet from 3 characters before to 10 characters
   * after the position; other nodes show row:column and the offending text.
   *
   * @returns {string} Report starting with "Errors present:".
   */
  formatErrors() {
    const errLines = this.USFMErrors.map((err) => {
      if (err.isMissing) {
        const start = Math.max(0, err.startIndex - 3);
        const end = Math.min(this.usfm.length, err.startIndex + 10);
        return `At ${err.startIndex}:Missing something here:${this.usfm.slice(start, end)}`;
      }
      return `At ${err.startPosition.row}:${err.startPosition.column}, Error: ${this.usfm.substring(err.startIndex, err.endIndex)}`;
    });
    return `Errors present:\n\t${errLines.join('\n\t')}`;
  }

  /**
   * Try to repair known USFM mistakes, re-validating and recursing after
   * each pass that changed the text.
   *
   * @param {string} usfm USFM text to repair.
   * @param {boolean} [fixed=false] Internal flag: true on recursive calls,
   *   so that a clean result is reported as "Fixed" rather than "No Errors".
   * @returns {string} The input when already valid, the repaired text when
   *   all errors could be fixed, or the last attempted text when a pass made
   *   no progress (then `this.message` starts with "Cannot fix these errors").
   */
  autoFixUSFM(usfm, fixed = false) {
    if (this.isValidUSFM(usfm)) {
      this.message = fixed ? "Fixed Errors in USFM" : "No Errors in USFM";
      return usfm;
    }
    let modifiedUSFM = usfm;
    let changed = false;

    for (const error of this.USFMErrors) {
      // Node offsets come from the parse of `usfm`, so every slice that uses
      // them is taken from `usfm`; `modifiedUSFM` may already have shifted.
      const errorText = usfm.substring(error.startIndex, error.endIndex);
      // Either link can be null (e.g. a root-level or first-child node).
      const parent = error.parent;
      const previous = error.previousSibling;
      const parentType = parent?.type;
      const previousType = previous?.type;
      const lacksParagraph = () => !error.children.some((ch) => ch.type === "paragraph");
      // Insert "\p" after the text between `from` and the error start.
      // A replacer function is used so "$" sequences inside the document
      // text are inserted literally rather than expanded by replace().
      const insertParagraphAfter = (from) => {
        const toReplace = usfm.slice(from, error.startIndex);
        return modifiedUSFM.replace(toReplace, () => `${toReplace}\\p\n`);
      };
      // Drop the error text from a small (+/-5 characters) context window.
      const removeWithinContext = () => {
        const start = Math.max(0, error.startIndex - 5);
        const end = Math.min(usfm.length, error.endIndex + 5);
        const toReplace = usfm.slice(start, end);
        const replacement = toReplace.replace(errorText, "");
        return modifiedUSFM.replace(toReplace, () => replacement);
      };

      // No \p after \s5
      if (error.isError && errorText.startsWith("\\s5") && lacksParagraph()) {
        modifiedUSFM = modifiedUSFM.replace(/\\s5[\s\n\r]*/g, '\\s5 \n\\p\n');
        changed = true;
      }
      // Missing space after \s5
      else if (error.isMissing && parentType === "sTag" && error.toString() === '(MISSING " ")') {
        modifiedUSFM = modifiedUSFM.replace(/\\s5\n/g, '\\s5 \n');
        changed = true;
      }
      // Book code is missing (empty id marker)
      else if (bookCodeMissingPattern.test(modifiedUSFM)) {
        modifiedUSFM = modifiedUSFM.replace(/\\id[\s\n\r]*\\/g, '\\id XXX xxx\n\\');
        changed = true;
      }
      // \p not given after section heading
      else if (error.isError && errorText.startsWith("\\v") && parentType === "s" && lacksParagraph()) {
        modifiedUSFM = insertParagraphAfter(parent.startIndex);
        changed = true;
      }
      // Space missing between \v and number
      else if (vWithoutSpacePattern.test(errorText)) {
        modifiedUSFM = modifiedUSFM.replace(vWithoutSpacePattern, "$1 $2");
        changed = true;
      }
      // Space missing between \c and number
      else if (cWithoutSpacePattern.test(errorText)) {
        modifiedUSFM = modifiedUSFM.replace(cWithoutSpacePattern, "$1 $2");
        changed = true;
      }
      // \p not given at chapter start
      else if (error.isError && errorText.startsWith("\\v") && previousType === "chapter" && lacksParagraph()) {
        modifiedUSFM = insertParagraphAfter(previous.startIndex);
        changed = true;
      }
      // Stray slash not with a valid marker
      else if (errorText.startsWith("\\") && !validMarkersPattern.test(errorText)) {
        modifiedUSFM = modifiedUSFM.replace(errorText, () => errorText.slice(1));
        changed = true;
      }
      // Just a single problematic marker (could be w/o text)
      else if (errorText.startsWith("\\") && validMarkersPattern.test(errorText)) {
        modifiedUSFM = removeWithinContext();
        changed = true;
      }
      // Empty attribute
      else if (errorText.trim() === "|") {
        modifiedUSFM = removeWithinContext();
        changed = true;
      }
      // Stray content in the chapter line
      else if (parentType === "chapter" && previousType === "c" && !errorText.includes("\\")) {
        modifiedUSFM = modifiedUSFM.replace(errorText, "");
        changed = true;
      }
    }

    // No rule applied, or the rules left the text untouched: stop here
    // rather than recursing on identical input.
    if (!changed || modifiedUSFM === usfm) {
      const errStr = this.formatErrors();
      this.message = `Cannot fix these errors:\n\t${errStr}`;
      return modifiedUSFM;
    }

    return this.autoFixUSFM(modifiedUSFM, true);
  }
}

// Patterns used by Validator.autoFixUSFM. None carries the `g` flag, so
// repeated .test() calls do not depend on lastIndex state.

// `\id` followed only by optional whitespace and then the next backslash,
// i.e. an id marker with no book code after it.
const bookCodeMissingPattern = /\\id[\s\n\r]*\\/;
// `\v` immediately followed by digits; group 1 is the marker and group 2 the
// number, so a "$1 $2" replacement inserts the missing space.
const vWithoutSpacePattern = /(\\v)(\d+)/;
// Same as above for the chapter marker `\c`.
const cWithoutSpacePattern = /(\\c)(\d+)/;
// One of the listed markers followed by a digit, whitespace or end of input.
// The pattern is not anchored, so it matches a marker anywhere in the tested
// string, not only at its start.
const validMarkersPattern = /(\\id|\\usfm|\\ide|\\ref|\\h|\\toc|\\toca|\\sts|\\rem|\\restore|\\lit|\\iqt|\\imt|\\imte|\\is|\\io|\\ior|\\iot|\\ip|\\im|\\ipi|\\imi|\\ili|\\ipq|\\imq|\\ipr|\\ib|\\iq|\\ie|\\iex|\\v|\\va|\\vp|\\c|\\cl|\\ca|\\cp|\\cd|\\mt|\\mte|\\ms|\\mr|\\s|\\sr|\\r|\\sp|\\d|\\sd|\\p|\\m|\\po|\\pr|\\cls|\\pmo|\\pm|\\pmc|\\pmr|\\pi|\\mi|\\nb|\\pc|\\ph|\\phi|\\b|\\q|\\qr|\\qc|\\qs|\\qa|\\qac|\\qm|\\qd|\\lh|\\lf|\\li|\\lim|\\liv|\\lik|\\litl|\\tr|\\th|\\thr|\\tc|\\tcr|\\f|\\fe|\\ef|\\fr|\\fq|\\fqa|\\fk|\\fl|\\fw|\\fp|\\ft|\\fdc|\\fv|\\fm|\\x|\\xo|\\xk|\\xq|\\xt|\\xta|\\xop|\\xot|\\xnt|\\xdc|\\rq|\\add|\\bk|\\dc|\\k|\\nd|\\ord|\\pn|\\png|\\addpn|\\qt|\\sig|\\sls|\\tl|\\wj|\\em|\\bd|\\it|\\bdit|\\no|\\sc|\\sup|\\ndx|\\pro|\\rb|\\w|\\wg|\\wh|\\wa|\\fig|\\jmp|\\pb|\\z|\\esb|\\esbe|\\cat)(\d|\s|\n|\r|$)/;

// Public export of this module.
exports.Validator = Validator;
54 changes: 54 additions & 0 deletions node-usfm-parser/test/test_auto_fix.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
const assert = require('assert');
const fs = require('node:fs');
const {glob} = require('glob');
const {Validator} = require("../src/index");

// Root of the shared test fixtures.
// NOTE(review): the relative path is presumably resolved against the
// directory the test runner is started from — confirm.
const TEST_DIR = "../tests";
// Every entry under autofix/ is used as a USFM input for the auto-fix suite.
const allUSFMFiles = glob.sync(TEST_DIR+'/autofix/*');
// One origin.json per spec example directory; used as USJ validation input.
const sampleUSJs = glob.sync(TEST_DIR+'/specExamples/*/origin.json')

// One test per fixture file: the output of autoFixUSFM must validate cleanly.
describe("Try autofixing errors in USFM", () => {
  for (const value of allUSFMFiles) {
    it(`Fix ${value}`, () => {
      const validator = new Validator();
      assert(validator instanceof Validator);

      const rawUsfm = fs.readFileSync(value, 'utf8');
      // Validate the untouched input first; the outcome is not asserted.
      validator.isValidUSFM(rawUsfm);

      const repairedUsfm = validator.autoFixUSFM(rawUsfm);
      const repairedIsValid = validator.isValidUSFM(repairedUsfm);
      assert.ok(repairedIsValid);
    });
  }
});

// USJ schema checks: every sample must validate as-is, and must be rejected
// (with a non-empty message) once two of its keys are corrupted.
describe("Validate USJ", () => {
  for (const value of sampleUSJs) {
    it(`Validate ${value}`, () => {
      const validator = new Validator();
      assert(validator instanceof Validator);

      const usj = JSON.parse(fs.readFileSync(value, 'utf8'));
      assert.ok(validator.isValidUSJ(usj));
    });
  }

  for (const value of sampleUSJs) {
    it(`Validate ${value} and report error`, () => {
      const validator = new Validator();
      assert(validator instanceof Validator);

      // Corrupt the first "code" and the first "content" occurrence in the
      // raw JSON text before parsing it.
      const brokenText = fs
        .readFileSync(value, 'utf8')
        .replace("code", "cooode")
        .replace("content", "contents");
      const usj = JSON.parse(brokenText);

      assert.ok(!validator.isValidUSJ(usj));
      assert(validator.message !== "");
    });
  }
});
4 changes: 3 additions & 1 deletion py-usfm-parser/src/usfm_grammar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
'''Entry point of the package with its public values'''

# Submodules of the package; usfm_generator is imported here but none of its
# names are re-exported below.
from usfm_grammar import usfm_parser
from usfm_grammar import usfm_generator
from usfm_grammar import validator

# Public names re-exported from usfm_parser.
Filter = usfm_parser.Filter
Format = usfm_parser.Format
USFMParser = usfm_parser.USFMParser

# Public name re-exported from validator.
Validator = validator.Validator

# Package version string.
__version__ = "3.0.0-beta.10"
Loading
Loading