diff --git a/package.json b/package.json
index aabca56..3e9fd86 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "word-aligner-rcl",
-  "version": "1.1.3",
+  "version": "1.1.4-beta",
   "main": "dist/index.cjs.js",
   "module": "dist/index.es.js",
   "repository": "https://github.com/unfoldingWord/word-aligner-rcl.git",
@@ -61,6 +61,7 @@
     "string-punctuation-tokenizer": "2.2.0",
     "usfm-js": "3.4.3",
     "word-aligner": "1.0.2",
+    "word-aligner-lib": "0.9.1-beta",
     "wordmap-lexer": "^0.3.6"
   },
   "resolutions": {
diff --git a/src/__tests__/testUpdateAlignments.test.js b/src/__tests__/testUpdateAlignments.test.js
deleted file mode 100644
index 543b450..0000000
--- a/src/__tests__/testUpdateAlignments.test.js
+++ /dev/null
@@ -1,246 +0,0 @@
-/* eslint-env jest */
-import {describe, expect, test} from '@jest/globals'
-import {
-  areAlgnmentsComplete,
-  extractAlignmentsFromTargetVerse,
-  parseUsfmToWordAlignerData,
-  updateAlignmentsToTargetVerse,
-  updateAlignmentsToTargetVerseWithOriginal,
-} from "../utils/alignmentHelpers";
-import {removeUsfmMarkers, usfmVerseToJson} from "../utils/usfmHelpers";
-import Lexer from "wordmap-lexer";
-import {migrateTargetAlignmentsToOriginal} from "../utils/migrateOriginalLanguageHelpers";
-import {convertVerseDataToUSFM, getUsfmForVerseContent} from "../utils/UsfmFileConversionHelpers";
-import path from "path-extra";
-import fs from 'fs-extra';
-import cloneDeep from "lodash.clonedeep";
-
-jest.unmock('fs-extra');
-
-const simpleUpdatesPath = path.join(__dirname, './fixtures/alignments/simpleEditsTests.json');
-const otMigrationUpdatesPath = path.join(__dirname, './fixtures/alignments/otMigrationEditsTests.json');
-const ntMigrationUpdatesPath = path.join(__dirname, './fixtures/alignments/ntMigrationEditsTests.json');
-
-
-describe('testing edit of aligned target text', () => {
-  const tests = fs.readJsonSync(simpleUpdatesPath)
-  const testNames = Object.keys(tests)
-  // console.log(tests)
-  for (const testName of testNames) {
-    const test_ = tests[testName]
-    test(`${testName}`, () => {
-      let {
-        initialAlignedUsfm,
-        initialEditText,
-        steps,
-      } = test_
-
-      const initialVerseObjects = usfmVerseToJson(initialAlignedUsfm);
-      let currentVerseObjects = cloneDeep(initialVerseObjects); // set initial test conditions
-      const expectedInitialEditText = getUsfmForVerseContent({ verseObjects: currentVerseObjects })
-      expect(initialEditText).toEqual(expectedInitialEditText)
-      expect(currentVerseObjects).toEqual(initialVerseObjects) // check for object mod
-
-      for (const step of steps) {
-        ////////////
-        // Given
-
-        const {newEditText, expectedFinalUsfm} = step
-        const startingVerseObjects = cloneDeep(currentVerseObjects); // save initial object
-
-        ////////////
-        // When
-
-        const results = updateAlignmentsToTargetVerse(currentVerseObjects, newEditText)
-
-        ////////////
-        // Then
-
-        expect(results.targetVerseText).toEqual(expectedFinalUsfm)
-
-        const initialWords = Lexer.tokenize(removeUsfmMarkers(newEditText))
-        const { targetWords: targetWords } = parseUsfmToWordAlignerData(results.targetVerseText, null)
-        expect(targetWords.length).toEqual(initialWords.length)
-        expect(currentVerseObjects).toEqual(startingVerseObjects) // check for object mod
-
-        // final conditions of step become initial conditions for next step
-        currentVerseObjects = results.targetVerseObjects
-      }
-    })
-  }
-})
-
-describe('testing alignment operations', () => {
-  const testaments = [
-    {
-      name: "New Testament",
-      path: ntMigrationUpdatesPath,
-    },
-    {
-      name: "Old Testament",
-      path: otMigrationUpdatesPath,
-    }]
-
-  for (const testament of testaments) {
-
-    // create a describe block for each testament
-    const {name: testamentName, path: testamentPath} = testament
-    console.log(testamentName)
-
-    describe(`${testamentName} edit tests with original language validation`, () => {
-      const tests = fs.readJsonSync(testamentPath)
-      const testNames = Object.keys(tests)
-      // console.log(tests)
-      for (const testName of testNames) {
-        const test_ = tests[testName]
-
-        test(`${testName}`, () => {
-          let {
-            initialAlignedUsfm,
-            initialEditText,
-            originalLanguageUsfm,
-            steps,
-          } = test_
-
-          const initialVerseObjects = usfmVerseToJson(initialAlignedUsfm);
-          let currentVerseObjects = cloneDeep(initialVerseObjects); // set initial test conditions
-          // make sure initial text matches the expected
-          const expectedInitialEditText = getUsfmForVerseContent({ verseObjects: currentVerseObjects })
-          expect(initialEditText).toEqual(expectedInitialEditText)
-          expect(currentVerseObjects).toEqual(initialVerseObjects) // check for object mod
-          const originalLanguageVerseObjects = usfmVerseToJson(originalLanguageUsfm); // set initial test conditions
-
-          for (const step of steps) {
-
-            ////////////
-            // Given
-
-            const {newEditText, expectedFinalUsfm} = step
-            const startingVerseObjects = cloneDeep(currentVerseObjects); // save initial object
-
-            ////////////
-            // when
-
-            // apply edited text
-            const results = updateAlignmentsToTargetVerseWithOriginal(currentVerseObjects, newEditText, originalLanguageVerseObjects)
-
-            ////////////
-            // then
-
-            expect(results.targetVerseText).toEqual(expectedFinalUsfm)
-            expect(currentVerseObjects).toEqual(startingVerseObjects) // check for object mod
-
-            // final conditions of step become initial conditions for next step
-            currentVerseObjects = results.targetVerseObjects
-          }
-        })
-      }
-    })
-
-    describe(`${testamentName} migration tests`, () => {
-      const tests = fs.readJsonSync(testamentPath)
-      const testNames = Object.keys(tests)
-
-      // create a test for each item in json file
-      for (const testName of testNames) {
-        const test_ = tests[testName]
-
-        test(`${testName}`, () => {
-          let {
-            initialAlignedUsfm,
-            originalLanguageUsfm,
-            steps,
-          } = test_
-
-          let currentVerseObjects = usfmVerseToJson(initialAlignedUsfm); // set initial test conditions
-          const originalLanguageVerseObjects = usfmVerseToJson(originalLanguageUsfm); // set initial test conditions
-
-          for (const step of steps) {
-
-            ////////////
-            // Given
-
-            const {newEditText, migrationExpected} = step
-
-            ////////////
-            // When
-
-            const targetVerseObjects = migrateTargetAlignmentsToOriginal(currentVerseObjects, originalLanguageVerseObjects)
-
-            ////////////
-            // Then
-
-            validateMigrations(currentVerseObjects, targetVerseObjects, migrationExpected);
-          }
-        })
-      }
-    })
-  }
-})
-
-//////////////////////////////
-// Testing Support functions
-//////////////////////////////
-
-function getWordCountFromVerseObjects(verseObjects) {
-  let count = 0
-  for (const vo of verseObjects) {
-    if (vo?.type === 'word') {
-      count++
-    }
-    if (vo?.children) {
-      const _count = getWordCountFromVerseObjects(vo.children)
-      count += _count
-    }
-  }
-  return count
-}
-
-function getWordCountFromAlignments(verseAlignments) {
-  let count = 0
-  for (const alignment of verseAlignments) {
-    if (alignment?.sourceNgram) {
-      count += alignment?.sourceNgram?.length
-    }
-  }
-  return count
-}
-
-function _areAlgnmentsComplete(targetVerseUSFM, originalVerseObjects) {
-  const {
-    alignments,
-    wordBank
-  } = extractAlignmentsFromTargetVerse(targetVerseUSFM, originalVerseObjects)
-  return areAlgnmentsComplete(wordBank, alignments)
-}
-
-function getVerseObjectsFromUsfms(initialAlignment) {
-  const initialVerseObjects = usfmVerseToJson(initialAlignment);
-  const originalLanguageVerseObjects = usfmVerseToJson(psa_73_5_originalVerseText);
-  const areInitialAlignmentsComplete = _areAlgnmentsComplete(initialAlignment, originalLanguageVerseObjects)
-  return {initialVerseObjects, originalLanguageVerseObjects, areInitialAlignmentsComplete};
-}
-
-function validateFinalAlignment(areInitialAlignmentsComplete, expectInitialAlignmentsValid, results, newText, expectedOriginalWords, expectFinalAlignmentsValid, originalLanguageVerseObjects) {
-  expect(areInitialAlignmentsComplete).toEqual(expectInitialAlignmentsValid)
-  expect(results).toMatchSnapshot()
-  const initialWords = Lexer.tokenize(removeUsfmMarkers(newText));
-  const alignerResults = parseUsfmToWordAlignerData(results.targetVerseText, psa_73_5_originalVerseText);
-  expect(alignerResults).toMatchSnapshot()
-  const {targetWords, verseAlignments} = alignerResults;
-  expect(targetWords.length).toEqual(initialWords.length)
-  const finalOriginalWords = getWordCountFromAlignments(verseAlignments)
-  expect(finalOriginalWords).toEqual(expectedOriginalWords)
-  const areAlignmentsComplete = _areAlgnmentsComplete(results.targetVerseText, originalLanguageVerseObjects)
-  expect(areAlignmentsComplete).toEqual(expectFinalAlignmentsValid)
-}
-
-function validateMigrations(initialVerseObjects, migratedVerseObjects, expectMigration) {
-  const initialVerseText = convertVerseDataToUSFM({verseObjects: initialVerseObjects})
-  const migratedVerseText = convertVerseDataToUSFM({verseObjects: migratedVerseObjects});
-  if (expectMigration) {
-    expect(migratedVerseText).not.toEqual(initialVerseText)
-  } else {
-    expect(migratedVerseText).toEqual(initialVerseText)
-  }
-}
diff --git a/src/__tests__/verseSpan.test.js b/src/__tests__/verseSpan.test.js
deleted file mode 100644
index ac00fa8..0000000
--- a/src/__tests__/verseSpan.test.js
+++ /dev/null
@@ -1,69 +0,0 @@
-/* eslint-disable array-callback-return */
-import {getParsedUSFM, usfmVerseToJson} from "../utils/usfmHelpers";
-import {
-  convertAlignmentsFromVerseSpansToVerse,
-  convertAlignmentFromVerseToVerseSpan,
-} from "../utils/alignmentHelpers";
-import {convertVerseDataToUSFM } from "../utils/UsfmFileConversionHelpers";
-
-const en_ust_gal_2_data = require(`./fixtures/alignments/en_gal_2_data.json`);
-const en_ust_gal_2_usfm = en_ust_gal_2_data.ust_usfm;
-const en_ust_gal_2_no_alignments_usfm = en_ust_gal_2_data.ust_usfm_no_alignments;
-const en_ust_gal_2_partial_alignments_usfm = en_ust_gal_2_data.ust_usfm_partial_alignments;
-const ugnt_gal_2_verseObjects = en_ust_gal_2_data.ugnt_verseObjects;
-const en_ust_gal_2_verseObjects = getParsedUSFM(en_ust_gal_2_usfm);
-const en_ust_gal_2_no_alignments_verseObjects = getParsedUSFM(en_ust_gal_2_no_alignments_usfm);
-const en_ust_gal_2_partial_alignments_verseObjects = getParsedUSFM(en_ust_gal_2_partial_alignments_usfm);
-
-describe('test verseSpans round trip conversions', () => {
-
-  it('check verse span support for alignments', () => {
-    const chapter = '2';
-    const verseSpan = '11-13';
-    const en_ust_gal_2_11_to_13 = en_ust_gal_2_verseObjects.chapters[chapter][verseSpan];
-
-    const targetLanguageVerse = en_ust_gal_2_11_to_13;
-    const originalLanguageChapterData = ugnt_gal_2_verseObjects;
-    const { alignedTargetVerseObjects, originalLanguageVerseObjects } = 
convertAlignmentFromVerseToVerseSpan(targetLanguageVerse, originalLanguageChapterData, chapter, verseSpan);
-    expect(originalLanguageVerseObjects.verseObjects.length).toBeTruthy();
-    expect(alignedTargetVerseObjects.verseObjects.length).toBeTruthy();
-
-    const finalUSFM = convertAlignmentsFromVerseSpansToVerse(originalLanguageChapterData, alignedTargetVerseObjects, chapter, verseSpan);
-    expect(finalUSFM).toEqual(convertVerseDataToUSFM(en_ust_gal_2_11_to_13))
-  });
-
-  it('check verse span support for no alignments', () => {
-    const chapter = '2';
-    const verseSpan = '11-13';
-    const en_ust_gal_2_11_to_13 = en_ust_gal_2_no_alignments_verseObjects.chapters[chapter][verseSpan];
-
-    const targetLanguageVerse = en_ust_gal_2_11_to_13;
-    const originalLanguageChapterData = ugnt_gal_2_verseObjects;
-    const { alignedTargetVerseObjects, originalLanguageVerseObjects } = convertAlignmentFromVerseToVerseSpan(targetLanguageVerse, originalLanguageChapterData, chapter, verseSpan);
-    expect(originalLanguageVerseObjects.verseObjects.length).toBeTruthy();
-    expect(alignedTargetVerseObjects.verseObjects.length).toBeTruthy();
-
-    const finalUSFM = convertAlignmentsFromVerseSpansToVerse(originalLanguageChapterData, alignedTargetVerseObjects, chapter, verseSpan);
-    expect(finalUSFM).toEqual(convertVerseDataToUSFM(en_ust_gal_2_11_to_13))
-  });
-
-  it('check verse span support for partial alignments', () => {
-    const chapter = '2';
-    const verseSpan = '11-13';
-    const en_ust_gal_2_11_to_13 = en_ust_gal_2_partial_alignments_verseObjects.chapters[chapter][verseSpan];
-
-    const targetLanguageVerse = en_ust_gal_2_11_to_13;
-    const originalLanguageChapterData = ugnt_gal_2_verseObjects;
-    const { alignedTargetVerseObjects, originalLanguageVerseObjects } = convertAlignmentFromVerseToVerseSpan(targetLanguageVerse, originalLanguageChapterData, chapter, verseSpan);
-    expect(originalLanguageVerseObjects.verseObjects.length).toBeTruthy();
-    expect(alignedTargetVerseObjects.verseObjects.length).toBeTruthy();
-
-    const finalUSFM = convertAlignmentsFromVerseSpansToVerse(originalLanguageChapterData, alignedTargetVerseObjects, chapter, verseSpan);
-    expect(finalUSFM).toEqual(convertVerseDataToUSFM(en_ust_gal_2_11_to_13))
-  });
-});
-
-//
-// Helpers
-//
-
diff --git a/src/components/WordAligner.md b/src/components/WordAligner.md
index 4a831fe..f95fe3f 100644
--- a/src/components/WordAligner.md
+++ b/src/components/WordAligner.md
@@ -3,12 +3,8 @@ Word Aligner Example:
 ```js
 import React, {useState} from 'react';
 import {
-  addAlignmentsToVerseUSFM,
-  areAlgnmentsComplete,
-  parseUsfmToWordAlignerData,
-  resetAlignments,
-} from "../utils/alignmentHelpers";
-import {convertVerseDataToUSFM} from "../utils/UsfmFileConversionHelpers";
+  AlignmentHelpers
+} from "word-aligner-lib";
 import {NT_ORIG_LANG} from "../common/constants";
 
 // a fully aligned example
@@ -29,9 +25,9 @@ const translate = (key) => {
 const targetVerseUSFM = alignedVerseJson.usfm;
 const sourceVerseUSFM = originalVerseJson.usfm;
 
-const {targetWords: targetWords_, verseAlignments: verseAlignments_} = parseUsfmToWordAlignerData(targetVerseUSFM, sourceVerseUSFM);
+const {targetWords: targetWords_, verseAlignments: verseAlignments_} = AlignmentHelpers.parseUsfmToWordAlignerData(targetVerseUSFM, sourceVerseUSFM);
 
-const alignmentComplete = areAlgnmentsComplete(targetWords_, verseAlignments_);
+const alignmentComplete = AlignmentHelpers.areAlgnmentsComplete(targetWords_, verseAlignments_);
 console.log(`Alignments are ${alignmentComplete ? 'COMPLETE!' : 'incomplete'}`);
 
 const App = () => {
@@ -66,15 +62,15 @@ const App = () => {
   function onChange(results) {
     console.log(`WordAligner() - alignment changed, results`, results);// merge alignments into target verse and convert to USFM
     const {targetWords, verseAlignments} = results;
-    const verseUsfm = addAlignmentsToVerseUSFM(targetWords, verseAlignments, targetVerseUSFM);
+    const verseUsfm = AlignmentHelpers.addAlignmentsToVerseUSFM(targetWords, verseAlignments, targetVerseUSFM);
     console.log(verseUsfm);
-    const alignmentComplete = areAlgnmentsComplete(targetWords, verseAlignments);
+    const alignmentComplete = AlignmentHelpers.areAlgnmentsComplete(targetWords, verseAlignments);
     console.log(`Alignments are ${alignmentComplete ? 'COMPLETE!' : 'incomplete'}`);
   }
 
   function onReset() {
     console.log("WordAligner() - reset Alignments")
-    const alignmentData = resetAlignments(verseAlignments, targetWords)
+    const alignmentData = AlignmentHelpers.resetAlignments(verseAlignments, targetWords)
     setState({
       verseAlignments: alignmentData.verseAlignments,
       targetWords: alignmentData.targetWords,
@@ -105,7 +101,7 @@ const App = () => {
       loadLexiconEntry={loadLexiconEntry}
      onChange={onChange}
      getLexiconData={getLexiconData_}
-      resetAlignments={resetAlignments}
+      resetAlignments={AlignmentHelpers.resetAlignments}
     />
diff --git a/src/index.js b/src/index.js
index 16c7dbc..afc0e0a 100644
--- a/src/index.js
+++ b/src/index.js
@@ -1,12 +1,4 @@
 import WordAligner from './components/WordAligner'
-import * as AlignmentHelpers from './utils/alignmentHelpers'
-import * as UsfmFileConversionHelpers from './utils/UsfmFileConversionHelpers'
-import * as usfmHelpers from './utils/usfmHelpers'
-import * as migrateOriginalLanguageHelpers from './utils/migrateOriginalLanguageHelpers'
 export {
-  AlignmentHelpers,
-  migrateOriginalLanguageHelpers,
-  UsfmFileConversionHelpers,
-  usfmHelpers,
   WordAligner,
 }
diff --git a/src/utils/UsfmFileConversionHelpers.js b/src/utils/UsfmFileConversionHelpers.js
deleted file mode 100644
index f44f4f8..0000000
--- a/src/utils/UsfmFileConversionHelpers.js
+++ /dev/null
@@ -1,269 +0,0 @@
-/* eslint-disable no-async-promise-executor, no-throw-literal */
-import usfmjs from 'usfm-js';
-import cloneDeep from "lodash.clonedeep";
-import {getVerseAlignments, getWordCountInVerse} from "./alignmentHelpers";
-
-/**
- * test to see if verse is a verseSpan
- * @param {string|number} verse
- * @return {boolean}
- */
-export function isVerseSpan(verse) {
-  return verse.toString().includes('-');
-}
-
-/**
- * called in case of invalid alignment that is not valid for the verse span, Sets alignment occurrence to high value
- *    so that alignment will be invalidated and has to be reviewed.
- * @param alignment - */ -export function invalidateAlignment(alignment) { - delete alignment.ref; - alignment.occurrences = 100000; - alignment.occurrence = 100000; -} - -/** - * business logic for convertAlignmentFromVerseToVerseSpan: - * for each alignment converts mapping to original verse by ref to be mapped to original language verse span by removing ref and updating occurrence(s) - * @param {object} originalVerseSpanData - original bible merged to verse span - * @param {object} alignedVerseObjects - aligned verse objects for current verse (modified) - * @param {number|string} chapter - * @param {number} low - low verse number of span - * @param {number} hi - high verse number of span - * @param blankVerseAlignments - raw verse alignments for extracting word counts for each verse - * @return {{verseObjects}} - original verse span data - */ -export function convertAlignmentFromVerseToVerseSpanSub(originalVerseSpanData, alignedVerseObjects, chapter, low, hi, blankVerseAlignments) { - const bibleVerse = { verseObjects: originalVerseSpanData }; - const alignments = getVerseAlignments(alignedVerseObjects.verseObjects); - - for (let alignment of alignments) { - const ref = alignment.ref || ''; - const refParts = ref.split(':'); - let verseRef; - let chapterRef = chapter; // default to current chapter - const word = alignment.content; - let occurrence = alignment.occurrence; - let occurrences = 0; - - if (refParts.length > 1) { // if both chapter and verse - verseRef = parseInt(refParts[1]); - chapterRef = refParts[0]; - } else { // verse only - verseRef = parseInt(refParts[0]); - } - - if (chapterRef.toString() !== chapter.toString()) { - console.warn(`convertAlignmentFromVerseToVerseSpan() - alignment of word "${word}:${occurrence}" - chapter in ref "${ref}" does not match current chapter ${chapter} for verse span "${low}-${hi}" - skipping`); - invalidateAlignment(alignment); - continue; - } - - if (!(occurrence > 0)) { - console.warn(`convertAlignmentFromVerseToVerseSpan() - alignment of word "${word}:${occurrence}" - invalid occurrence in current verse span "${low}-${hi}" - skipping`); - invalidateAlignment(alignment); - continue; - } - - if (!((verseRef >= low) || (verseRef <= hi))) { - console.warn(`convertAlignmentFromVerseToVerseSpan() - alignment of word "${word}:${occurrence}" - verse in ref ${ref} is not within current verse span "${low}-${hi}" - skipping`); - invalidateAlignment(alignment); - continue; - } - - // transform occurrence(s) from verse based to verse span - for (let verse = low; verse <= hi; verse++) { - const wordCount = getWordCountInVerse(blankVerseAlignments, verse, word); - occurrences += wordCount; - - if (verse < verseRef) { - occurrence += wordCount; // add word counts for lower verses to occurrence - } - } - - if ((occurrence > occurrences)) { - console.warn(`convertAlignmentFromVerseToVerseSpan() - alignment of word "${word}:${occurrence}" - beyond ocurrences ${occurrences} in current verse span "${low}-${hi}" - skipping`); - invalidateAlignment(alignment); - } else { - delete alignment.ref; - alignment.occurrences = occurrences; - alignment.occurrence = occurrence; - } - } - return bibleVerse; -} - -/** - * dive down into milestone to extract words and text - * @param {Object} verseObject - milestone to parse - * @return {string} text content of milestone - */ -const parseMilestone = verseObject => { - let text = verseObject.text || ''; - let wordSpacing = ''; - const length = verseObject.children ? 
verseObject.children.length : 0; - - for (let i = 0; i < length; i++) { - let child = verseObject.children[i]; - - switch (child.type) { - case 'word': - text += wordSpacing + child.text; - wordSpacing = ' '; - break; - - case 'milestone': - text += wordSpacing + parseMilestone(child); - wordSpacing = ' '; - break; - - default: - if (child.text) { - text += child.text; - const lastChar = text.substr(-1); - - if ((lastChar !== ',') && (lastChar !== '.') && (lastChar !== '?') && (lastChar !== ';')) { // legacy support, make sure padding before word - wordSpacing = ''; - } - } - break; - } - } - return text; -}; - -/** - * get text from word and milestone markers - * @param {Object} verseObject - to parse - * @param {String} wordSpacing - spacing to use before next word - * @return {*} new verseObject and word spacing - */ -const replaceWordsAndMilestones = (verseObject, wordSpacing) => { - let text = ''; - - if (verseObject.type === 'word') { - text = wordSpacing + verseObject.text; - } else if (verseObject.type === 'milestone') { - text = wordSpacing + parseMilestone(verseObject); - } - - if (text) { // replace with text object - verseObject = { - type: 'text', - text, - }; - wordSpacing = ' '; - } else { - wordSpacing = ' '; - - if (verseObject.nextChar) { - wordSpacing = ''; // no need for spacing before next word if this item has it - } else if (verseObject.text) { - const lastChar = verseObject.text.substr(-1); - - if (![',', '.', '?', ';'].includes(lastChar)) { // legacy support, make sure padding before next word if punctuation - wordSpacing = ''; - } - } - - if (verseObject.children) { // handle nested - const verseObject_ = cloneDeep(verseObject); - let wordSpacing_ = ''; - const length = verseObject.children.length; - - for (let i = 0; i < length; i++) { - const flattened = - replaceWordsAndMilestones(verseObject.children[i], wordSpacing_); - wordSpacing_ = flattened.wordSpacing; - verseObject_.children[i] = flattened.verseObject; - } - verseObject = verseObject_; - } - } - return { verseObject, wordSpacing }; -}; - -/** - * check if string has alignment markers - * @param {String} usfmData - * @return {Boolean} true if string has alignment markers - */ -export const hasAlignments = usfmData => { - const hasAlignment = usfmData.includes('\\zaln-s') || usfmData.includes('\\w'); - return hasAlignment; -}; - -/** - * @description verseObjects with occurrences via string - * @param {String} usfmData - The string to search in - * @return {String} - cleaned USFM - */ -export const cleanAlignmentMarkersFromString = usfmData => { - if (hasAlignments(usfmData)) { - // convert string using usfm to JSON - const verseObjects = usfmjs.toJSON('\\v 1 ' + usfmData, { chunk: true }).verses['1']; - return getUsfmForVerseContent(verseObjects); - } - return usfmData; -}; - -/** - * converts verse from verse objects to USFM string - * @param verseData - * @return {string} - */ -export function convertVerseDataToUSFM(verseData) { - const outputData = { - 'chapters': {}, - 'headers': [], - 'verses': { '1': verseData }, - }; - const USFM = usfmjs.toUSFM(outputData, { chunk: true, forcedNewLines: true }); - const split = USFM.split('\\v 1'); - - if (split.length > 1) { - let content = split[1]; - - if (content.substr(0, 1) === ' ') { // remove space separator - content = content.substr(1); - } - return content; - } - return ''; // error on JSON to USFM -} - -/** - * @description remove milestones and word markers - * @param {Object|Array} verseData - * @return {Object} - */ -export function 
removeMilestonesAndWordMarkers(verseData) { - const verseObjects = verseData?.verseObjects || verseData; - if (verseObjects) { - let wordSpacing = ''; - const flattenedData = []; - const length = verseObjects.length; - - for (let i = 0; i < length; i++) { - const verseObject = verseObjects[i]; - const flattened = replaceWordsAndMilestones(verseObject, wordSpacing); - wordSpacing = flattened.wordSpacing; - flattenedData.push(flattened.verseObject); - } - verseData = { // use flattened data - verseObjects: flattenedData, - }; - } - return verseData; -} - -/** - * @description convert verse from verse objects to USFM string, removing milestones and word markers - * @param {Object|Array} verseData - * @return {String} - */ -export const getUsfmForVerseContent = (verseData) => { - verseData = removeMilestonesAndWordMarkers(verseData); - return convertVerseDataToUSFM(verseData); -}; diff --git a/src/utils/alignmentHelpers.js b/src/utils/alignmentHelpers.js deleted file mode 100644 index fb5ad93..0000000 --- a/src/utils/alignmentHelpers.js +++ /dev/null @@ -1,628 +0,0 @@ -import cloneDeep from "lodash.clonedeep"; -import {removeUsfmMarkers, usfmVerseToJson} from "./usfmHelpers"; -import wordaligner from "word-aligner"; -import * as UsfmFileConversionHelpers from "./UsfmFileConversionHelpers"; -import { - convertAlignmentFromVerseToVerseSpanSub, - convertVerseDataToUSFM, - getUsfmForVerseContent -} from "./UsfmFileConversionHelpers"; -import { - getAlignedWordListFromAlignments, - getOriginalLanguageListForVerseData, - migrateTargetAlignmentsToOriginal, - updateAlignedWordsFromOriginalWordList -} from "./migrateOriginalLanguageHelpers"; -import Lexer from "wordmap-lexer"; -import { getVerseSpanRange } from './verseObjects'; - -/** - * get all the alignments for verse from nested array (finds zaln objects) - * @param {array} verseSpanAlignments - * @return {*[]} - */ -export function getVerseAlignments(verseSpanAlignments) { - let alignments = []; - - if (verseSpanAlignments) { - for (let alignment of verseSpanAlignments) { - if (alignment.tag === 'zaln') { - alignments.push(alignment); - } - - if (alignment.children) { - const subAlignments = getVerseAlignments(alignment.children); - - if (subAlignments.length) { - alignments = alignments.concat(subAlignments); - } - } - } - } - return alignments; -} - -/** - * search through verseAlignments for word and get occurrences - * @param {object} verseAlignments - * @param {string|number} matchVerse - * @param {string} word - * @return {number} - */ -export function getWordCountInVerse(verseAlignments, matchVerse, word) { - let matchedAlignment = null; - - for (let alignment of verseAlignments[matchVerse]) { - for (let topWord of alignment.topWords) { - if (topWord.word === word) { - matchedAlignment = topWord; - break; - } - } - - if (matchedAlignment) { - break; - } - } - - const wordCount = matchedAlignment && matchedAlignment.occurrences; - return wordCount || 0; -} - -/** - * convert to number if string - * @param {string|number} value - * @returns {number|*} - */ -function parseStrToNumber(value) { - if (typeof value === 'string') { - const number = parseInt(value); - return number; - } - return value; -} - -/** - * convert occurrence(s) in word to numbers - * @param {object} item - * @returns {object} - new word with occurrence(s) converted to numbers - */ -function convertOccurrencesInWord(item) { - const occurrence = parseStrToNumber(item.occurrence); - const occurrences = parseStrToNumber(item.occurrences); - if ( - (occurrence !== 
item.occurrence) - || (occurrences !== item.occurrences) - ) { // if occurrence(s) changed, create new word - return { - ...item, - occurrence, - occurrences, - } - } - - return item; -} - -/** - * for each item in word list convert occurrence(s) to numbers - * @param {array} wordlist - * @returns {array} - */ -function convertOccurrences(wordlist) { - const wordlist_ = wordlist.map(item => { - return convertOccurrencesInWord(item); - }) - return wordlist_; -} - -/** - * get the word list from alignments - * @param {array} verseObjects - * @return {array} - */ -export function getWordListFromVerseObjects(verseObjects) { - const targetVerseUSFM = getUsfmForVerseContent({verseObjects}) - let targetTokens = Lexer.tokenize(removeUsfmMarkers(targetVerseUSFM)); - targetTokens = targetTokens.map(token => ( - { // exclude unneeded data - text: token.text || token.word, - occurrence: token.tokenOccurrence, - occurrences: token.tokenOccurrences, - index: token.tokenPos, - })) - return targetTokens; -} - -/** - * extract alignments from target verse USFM using sourceVerse for word ordering - * @param {string} alignedTargetVerse - * @param {object|null} sourceVerse - optional source verse in verseObject format to maintain source language word order - * @return {array} list of alignments in target text - */ -export function extractAlignmentsFromTargetVerse(alignedTargetVerse, sourceVerse) { - try { - const targetVerse = usfmVerseToJson(alignedTargetVerse); - const alignments = wordaligner.unmerge(targetVerse, sourceVerse); - const originalLangWordList = sourceVerse && getOriginalLanguageListForVerseData(sourceVerse); - const alignmentsWordList = getAlignedWordListFromAlignments(alignments.alignment); - const targetTokens = getWordListFromVerseObjects(targetVerse); - // clean up metadata in alignments - originalLangWordList && updateAlignedWordsFromOriginalWordList(originalLangWordList, alignmentsWordList); - if (alignments.alignment) { // for compatibility change alignment to alignments - // convert occurrence(s) from string to number - const alignments_ = alignments.alignment.map(alignment => { - const topWords = convertOccurrences(alignment.topWords); - const bottomWords = convertOccurrences(alignment.bottomWords); - return { - sourceNgram: topWords.map(topWord => { // word aligner uses sourceNgram instead of topWord - if (originalLangWordList) { - const pos = originalLangWordList.findIndex(item => ( - topWord.word === (item.word || item.text) && - topWord.occurrence == item.occurrence //Tricky: we want to allow automatic conversion between string and integer because occurrence could be either - )); - const newSource = { - ...topWord, - index: pos, - text: topWord.text || topWord.word, - }; - delete newSource.word - return newSource - } - const newSource = { - ...topWord, - text: topWord.text || topWord.word, - }; - delete newSource.word - delete newSource.position - return newSource - }), - targetNgram: bottomWords.map(bottomWord => { // word aligner uses targetNgram instead of bottomWords - const word = bottomWord.text || bottomWord.word - // noinspection EqualityComparisonWithCoercionJS - const pos = targetTokens.findIndex(item => ( - word === item.text && - // eslint-disable-next-line eqeqeq - bottomWord.occurrence == item.occurrence - )); - - const newTarget = { - ...bottomWord, - index: pos, - text: word, - }; - delete newTarget.word - return newTarget; - }), - } - }) - alignments.alignments = alignments_; - } - return alignments; - } catch (e) { - 
console.warn(`extractAlignmentsFromTargetVerse()`,e) - return null - } -} - -/** - * merge alignments into target verse - * @param {string} targetVerseText - target verse to receive alignments - * @param {{alignments, wordBank}} verseAlignments - contains all the alignments and wordbank is list of unaligned target words - * @return {string|null} target verse in USFM format - */ -export function addAlignmentsToTargetVerseUsingMerge(targetVerseText, verseAlignments) { - const verseString = UsfmFileConversionHelpers.cleanAlignmentMarkersFromString(targetVerseText); - let verseObjects; - - try { - verseObjects = wordaligner.merge( - verseAlignments.alignments, verseAlignments.wordBank, verseString, true, - ); - } catch (e) { - console.error(`addAlignmentsToTargetVerseUsingMerge() - invalid alignment`, e); - } - - if (verseObjects) { - const targetVerse = convertVerseDataToUSFM({verseObjects}); - return targetVerse; - } - - return null; -} - -/** - * iterate through the target words marking words as disabled if they are already used in alignments - * @param {array} targetWordList - * @param {array} alignments - * @returns {array} updated target word list - */ -export function markTargetWordsAsDisabledIfAlreadyUsedForAlignments(targetWordList, alignments) { - return targetWordList.map(token => { - let isUsed = false; - - for (const alignment of alignments) { - for (const usedToken of alignment.targetNgram) { - if (token.text.toString() === usedToken.text.toString() - && token.occurrence === usedToken.occurrence - && token.occurrences === usedToken.occurrences) { - isUsed = true; - break; - } - } - if (isUsed) { - break; - } - } - - const targetWord = { // exclude unneeded data - disabled: isUsed, - text: token.text, - occurrence: token.tokenOccurrence, - occurrences: token.tokenOccurrences, - index: token.tokenPos, - } - return targetWord; - }); -} - -/** - * create wordbank of unused target words, transform alignments, and then merge alignments into target verse - * @param {array} wordBankWords - list of all target words in word bank - the disabled flag indicates word is aligned - * @param {array} verseAlignments - * @param {string} targetVerseText - target verse to receive alignments - * @return {string|null} target verse in USFM format - */ -export function addAlignmentsToVerseUSFM(wordBankWords, verseAlignments, targetVerseText) { - let wordBank = wordBankWords.filter(item => (!item.disabled)) - wordBank = wordBank.map(item => ({ - ...item, - word: item.word || item.text, - occurrence: item.occurrence || item.occurrence, - occurrences: item.occurrences || item.occurrences, - })) - // remap sourceNgram:topWords, targetNgram:bottomWords, - const alignments_ = verseAlignments.map(item => ({ - ...item, - topWords: item.sourceNgram.map(item => ({ - strong: item.strong, - lemma: item.lemma, - morph: item.morph, - occurrence: item.occurrence, - occurrences: item.occurrences, - word: item.word || item.text, - })), - bottomWords: item.targetNgram.map(item => ({ - ...item, - word: item.word || item.text - })), - })); - const alignments = { - alignments: alignments_, - wordBank, - } - const verseUsfm = addAlignmentsToTargetVerseUsingMerge(targetVerseText, alignments); - return verseUsfm; -} - -/** - * parse target language and original language USFM text into the structures needed by the word-aligner - * @param {string} targetVerseUSFM - * @param {string|null} sourceVerseUSFM - * @returns {{targetWords: *[], verseAlignments: *}} - */ -export function parseUsfmToWordAlignerData(targetVerseUSFM, 
sourceVerseUSFM) { - let targetTokens = []; - if (targetVerseUSFM) { - targetTokens = Lexer.tokenize(removeUsfmMarkers(targetVerseUSFM)); - } - - const sourceVerseObjects = sourceVerseUSFM && usfmVerseToJson(sourceVerseUSFM); - let targetWords = []; - const targetVerseAlignments = extractAlignmentsFromTargetVerse(targetVerseUSFM, sourceVerseObjects); - const verseAlignments = targetVerseAlignments?.alignments; - targetWords = markTargetWordsAsDisabledIfAlreadyUsedForAlignments(targetTokens, verseAlignments); - return {targetWords, verseAlignments}; -} - -/** - * iterate through target word list to make sure all words are used, and then iterate through all alignments to make sure all source alignments have target words - * @param {array} targetWords - * @param {array} verseAlignments - * @returns {boolean} - */ -export function areAlgnmentsComplete(targetWords, verseAlignments) { - let alignmentComplete = true; - for (const word of targetWords) { - if (!word.disabled) { - alignmentComplete = false; - break; - } - } - - if (alignmentComplete) { - for (const alignment of verseAlignments) { - const sourceWordCount = alignment.sourceNgram?.length || 0; - const targetWordCount = alignment.targetNgram?.length || 0; - if (!targetWordCount && sourceWordCount) { // if no target words, but we have source words, then incomplete alignment - alignmentComplete = false; - break; - } - } - } - return alignmentComplete; -} - -/** - * iterates through target words looking for words not in wordBankList. If an added word is found, it is added to - * wordbank. And if there are other instances are found, then occurrence counts are updated. - * @param {array} targetWordList - list of target words - * @param {array} wordBankList - list of target words in the word bank - * @param {array} verseAlignments - list of verse alignments that may need updating - */ -function handleAddedWordsInNewText(targetWordList, wordBankList, verseAlignments) { - for (const targetToken of targetWordList) { - const pos = wordBankList.findIndex(word => ( - word.text === targetToken.text && - word.occurrence === targetToken.occurrence - )) - if (pos < 0) { - const occurrences = targetToken.occurrences; - const tokenWord = targetToken.text; - // update occurrence count for all aligned instances of this word - for (const alignment of verseAlignments) { - for (const targetWord of alignment.targetNgram) { - var word_ = targetWord.word || targetWord.text; - if (word_ === tokenWord) { - targetWord.occurrences = occurrences - } - } - } - // update occurrence count for all wordbank instances of this word - for (const wordBank of wordBankList) { - var word_ = wordBank.word || wordBank.text; - if (word_ === tokenWord) { - wordBank.occurrences = occurrences - } - } - wordBankList.push(targetToken); - } - } -} - -/** - * iterates through verse alignments looking for target words not in target word list. If an extra word is found, it - * is removed from the verse alignments and occurrence(s) are updated. 
- * @param {array} verseAlignments - list of verse alignments that may need updating - * @param {array} targetWordList - list of target words in new verse text - * @param {array} targetWords - list of target words in alignments - * */ -function handleDeletedWords(verseAlignments, targetWordList, targetWords) { - for (const alignment of verseAlignments) { - let delete_ = []; - for (let i = 0, l = alignment.targetNgram.length; i < l; i++) { - let wordFound = false; - const targetWord = alignment.targetNgram[i]; - const word = targetWord.text; - for (const targetToken of targetWordList) { - if (targetToken.text === word) { - if (targetWord.occurrence > targetToken.occurrences) { - delete_.push(i); // extra aligned word - } else if (targetWord.occurrences !== targetToken.occurrences) { - // fixup counts - targetWord.occurrences = targetToken.occurrences; - } - wordFound = true; - break; - } - } - if (!wordFound) { - delete_.push(i); // extra aligned word - } - } - while (delete_.length) { - const remove = delete_.pop(); - alignment.targetNgram.splice(remove, 1); - } - } - - // remove extra target words that are not in targetWordList - for (let i = 0; i < targetWords.length; i++) { - let newOccurrences = 0 - const wordBankWord = targetWords[i] - const found = targetWordList.findIndex(word => { - if (word.text === wordBankWord.text ) { - if (word.occurrence === wordBankWord.occurrence) { - return true - } - newOccurrences = word.occurrences - } - return false; - }) - if (found < 0) { - // update occurrences - if (newOccurrences) { - for (const word of targetWords) { - if (word.text === wordBankWord.text) { - if (word.occurrences !== newOccurrences) { - // fixup counts - word.occurrences = newOccurrences - } - } - } - } - // remove extra word - targetWords.splice(i, 1) - i-- - } - } -} - -/** - * merge alignments into target verse - * @return {string|null} target verse in USFM format - * @param {object[]} targetVerseObjects - * @param {string} newTargetVerse - */ -export function updateAlignmentsToTargetVerse(targetVerseObjects, newTargetVerse) { - let targetVerseText = convertVerseDataToUSFM(targetVerseObjects); - let { targetWords, verseAlignments } = parseUsfmToWordAlignerData(targetVerseText, null); - const targetTokens = getWordListFromVerseObjects(usfmVerseToJson(newTargetVerse)); - handleAddedWordsInNewText(targetTokens, targetWords, verseAlignments); - handleDeletedWords(verseAlignments, targetTokens, targetWords); - targetVerseText = addAlignmentsToVerseUSFM(targetWords, verseAlignments, newTargetVerse); - if (targetVerseText === null) { - console.warn(`updateAlignmentsToTargetVerse() - alignment FAILED for ${newTargetVerse}, removing all alignments`); - targetVerseText = newTargetVerse; - } - const alignedVerseObjects = usfmVerseToJson(targetVerseText) - return { - targetVerseObjects: alignedVerseObjects, - targetVerseText, - }; -} - -/** - * migrate alignments to match original language words, and then merge alignments into target verse - * @return {string|null} target verse in USFM format - * @param {object[]} targetVerseObjects - * @param {string} newTargetVerse - * @param {object[]} originalLanguageVerseObjects - */ -export function updateAlignmentsToTargetVerseWithOriginal(targetVerseObjects, newTargetVerse, originalLanguageVerseObjects) { - // migrate the initial alignments to current original source - const migratedTargetVerseObjects = migrateTargetAlignmentsToOriginal(targetVerseObjects, originalLanguageVerseObjects) - - // apply new verse text - const results = 
updateAlignmentsToTargetVerse(migratedTargetVerseObjects, newTargetVerse) - return results -} - -/** - * generate blank alignments for all the verses in a verse span - * @param {string} verseSpan - * @param {object} origLangChapterJson - * @param {object} blankVerseAlignments - object to return verse alignments - * @return {{low, hi}} get range of verses in verse span - */ -function getRawAlignmentsForVerseSpan(verseSpan, origLangChapterJson, blankVerseAlignments) { - const { low, high } = getVerseSpanRange(verseSpan); - - // generate raw alignment data for each verse in range - for (let verse = low; verse <= high; verse++) { - const originalVerse = origLangChapterJson[verse]; - - if (originalVerse) { - const blankAlignments = wordaligner.generateBlankAlignments(originalVerse); - blankVerseAlignments[verse] = blankAlignments; - } - } - - return { low, hi: high }; -} - -/** - * business logic for convertAlignmentsFromVerseSpansToVerse: - * for each alignment converts mapping to original language verse span to be mapped to original language verse by adding ref and updating occurrence(s) - * @param {object} verseSpanData - aligned output data - will be modified with verse span fixes - * @param {number} low - low verse number of span - * @param {number} hi - high verse number of span - * @param {object} blankVerseAlignments - raw verse alignments for extracting word counts for each verse - * @param {number|string} chapterNumber - */ -function convertAlignmentsFromVerseSpansToVerseSub(verseSpanData, low, hi, blankVerseAlignments, chapterNumber) { - const verseSpanAlignments = verseSpanData && verseSpanData.verseObjects; - const alignments = getVerseAlignments(verseSpanAlignments); - - for (let alignment of alignments) { - const word = alignment.content; - let occurrence = alignment.occurrence; - - // transform occurrence(s) from verse span based to verse reference - for (let verse = low; verse <= hi; verse++) { - const wordCount = getWordCountInVerse(blankVerseAlignments, verse, word); - - if (occurrence <= wordCount) { // if inside this verse, add reference - alignment.ref = `${chapterNumber}:${verse}`; - alignment.occurrences = wordCount; - alignment.occurrence = occurrence; - break; - } else { - occurrence -= wordCount; // subtract counts for this verse - } - } - } -} - -/** - * for each alignment converts mapping to original verse by ref to be mapped to original language verse span by removing ref and updating occurrence(s) - * @param {object} targetLanguageVerse - in verseObjects format - * @param {object} originalLanguageChapterData - verseObjects for the current chapter - * @param {string} chapter - current chapter (used for sanity checking refs for original language alignments) - * @param {string} verseSpan - range of verses (e.g. 
'11-13') - * @return {{alignedTargetVerseObjects: *, originalLanguageVerseObjects: {verseObjects}}} - */ -export function convertAlignmentFromVerseToVerseSpan(targetLanguageVerse, originalLanguageChapterData, chapter, verseSpan) { - const blankVerseAlignments = {}; - const alignedTargetVerseObjects = cloneDeep(targetLanguageVerse) - const {low, hi} = getRawAlignmentsForVerseSpan(verseSpan, originalLanguageChapterData, blankVerseAlignments); - let mergedUgntData = []; - for (let verse = low; verse <= hi; verse++) { - const verseObjectsForVerse = originalLanguageChapterData?.[verse]?.verseObjects; - mergedUgntData = mergedUgntData.concat(verseObjectsForVerse || []) - } - const originalLanguageVerseObjects = convertAlignmentFromVerseToVerseSpanSub(mergedUgntData, alignedTargetVerseObjects, chapter, low, hi, blankVerseAlignments) - return {alignedTargetVerseObjects, originalLanguageVerseObjects}; -} - -/** - * for each alignment converts mapping to original language verse span to be mapped to original language verse by adding ref and updating occurrence(s) - * @param originalLanguageChapterData - * @param alignedTargetVerseObjects - * @param {string} chapter - * @param {string} verseSpan - * @return {string} - */ -export function convertAlignmentsFromVerseSpansToVerse(originalLanguageChapterData, alignedTargetVerseObjects, chapter, verseSpan) { - const blankVerseAlignments = {}; - const {low, hi} = getRawAlignmentsForVerseSpan(verseSpan, originalLanguageChapterData, blankVerseAlignments); - const verseSpanData = cloneDeep(alignedTargetVerseObjects) - convertAlignmentsFromVerseSpansToVerseSub(verseSpanData, low, hi, blankVerseAlignments, chapter) - const finalUSFM = convertVerseDataToUSFM(verseSpanData) - return finalUSFM; -} - -/** - * reset the alignments in verseAlignments_ and targetWords_ - returns new arrays with alignments reset - * @param {array[AlignmentType]} verseAlignments_ - * @param {array[TargetWordBankType]} targetWords_ - * @returns {{words: array[TargetWordBankType], verseAlignments: array[AlignmentType] }} - */ -export function resetAlignments(verseAlignments_, targetWords_) { - if (verseAlignments_?.length) { - const verseAlignments = cloneDeep(verseAlignments_) - const targetWords = cloneDeep(targetWords_) - - for (const alignment of verseAlignments) { // clear out each alignment - alignment.targetNgram = [] // remove target words for each alignment - if (alignment.sourceNgram?.length > 1) { // if there are multiple source words, split each into separate alignment - for (let i = 1; i < alignment.sourceNgram?.length; i++) { - const sourceNgram = alignment.sourceNgram[i] - const newAlignment = { - sourceNgram: [sourceNgram], - targetNgram: [] - } - verseAlignments.push(newAlignment) - } - - alignment.sourceNgram = [alignment.sourceNgram[0]] - } - } - - for (const word of targetWords) { // clear all words marked used - word.disabled = false - } - return {verseAlignments, targetWords} - } - return { } -} diff --git a/src/utils/bibleHelpers.js b/src/utils/bibleHelpers.js deleted file mode 100644 index a2ffe57..0000000 --- a/src/utils/bibleHelpers.js +++ /dev/null @@ -1,72 +0,0 @@ -/* eslint-disable no-console */ -import * as Bible from '../common/BooksOfTheBible'; - -/** - * - * @param {string} bookAbbr - The book abbreviation to convert - */ -export function convertToFullBookName(bookAbbr) { - if (!bookAbbr) { - return; - } - return Bible.ALL_BIBLE_BOOKS[bookAbbr.toString().toLowerCase()]; -} - -/** - * tests if book is a Old Testament book - * @param bookId - * @return 
{boolean} - */ -export function isOldTestament(bookId) { - return bookId in Bible.BIBLE_BOOKS.oldTestament; -} - -/** - * tests if book is a New Testament book - * @param bookId - * @return {boolean} - */ -export function isNewTestament(bookId) { - return bookId in Bible.BIBLE_BOOKS.newTestament; -} - -/** - * tests if book is in Old or New Testament - * @param bookId - * @return {boolean} - */ -export function isValidBibleBook(bookId) { - return (isNewTestament(bookId) || isOldTestament(bookId)) ; -} - -/** - * returns true if this bookId and languageId match the original language bible - * @param {String} languageId - * @param {String} bookId - * @return {boolean} - */ -export function isOriginalLanguageBible(languageId, bookId) { - return ((languageId.toLowerCase() === Bible.NT_ORIG_LANG && bookId.toLowerCase() === Bible.NT_ORIG_LANG_BIBLE) || - (languageId.toLowerCase() === Bible.OT_ORIG_LANG && bookId.toLowerCase() === Bible.OT_ORIG_LANG_BIBLE)); -} - -/** - * returns true if this bookId and languageId match the original language bible - * @param {String} languageId - * @return {boolean} - */ -export function isOriginalLanguage(languageId) { - return (languageId.toLowerCase() === Bible.NT_ORIG_LANG || languageId.toLowerCase() === Bible.OT_ORIG_LANG); -} - -/** - * determine Original Language and Original Language bible for book - * @param bookId - * @return {{resourceLanguage: string, bibleID: string}} - */ -export function getOrigLangforBook(bookId) { - const isOT = isOldTestament(bookId); - const languageId = (isOT) ? Bible.OT_ORIG_LANG : Bible.NT_ORIG_LANG; - const bibleId = (isOT) ? Bible.OT_ORIG_LANG_BIBLE : Bible.NT_ORIG_LANG_BIBLE; - return { languageId, bibleId }; -} diff --git a/src/utils/migrateOriginalLanguageHelpers.js b/src/utils/migrateOriginalLanguageHelpers.js deleted file mode 100644 index 8ed96a9..0000000 --- a/src/utils/migrateOriginalLanguageHelpers.js +++ /dev/null @@ -1,589 +0,0 @@ -import { normalizer } from 'string-punctuation-tokenizer'; -import { referenceHelpers } from 'bible-reference-range'; -import { - convertVerseDataToUSFM, - getUsfmForVerseContent, - removeMilestonesAndWordMarkers -} from "./UsfmFileConversionHelpers"; -import { - addAlignmentsToVerseUSFM, - extractAlignmentsFromTargetVerse, - parseUsfmToWordAlignerData, -} from "./alignmentHelpers"; -import {usfmVerseToJson} from "./usfmHelpers"; - -const ignoreFields = [ 'tag', 'type', 'text' ]; -const ignoreOrig = [ 'tw' ]; -export const QUOTE_MARK = '\u2019'; - -/** - * convert value to int if string, otherwise just return value - * @param {string|int} value - * @returns {int} - */ -export function toInt(value) { - return (typeof value === 'string') ? 
parseInt(value, 10) : value; -} - -/** - * extract words from wordlist - * @param {array} wordList - * @param {array} verseObjects - */ -function arrayToWordList(wordList, verseObjects) { - if (Array.isArray(verseObjects)) { - for (const item of verseObjects) { - if ((item?.type === 'word') && item.text) { - const newWord = {}; - - for (const key of Object.keys(item)) { - if (!ignoreFields.includes(key)) { - newWord[key] = item[key]; - } - } - newWord.word = item.text; - wordList.push(newWord); - } else { - if (item?.children) { - arrayToWordList(wordList, item?.children); - } - } - } - } -} - -/** - * find occurrence and occurrences for word at pos in wordList - * @param {string} word - * @param {number} pos - * @param {array} wordList - * @returns {{occurrences: number, occurrence: number}} - */ -function getOccurrencesForWord(word, pos, wordList) { - let occurrence = 0; - let occurrences = 0; - - for (let i = 0, l = wordList.length; i < l; i ++) { - const item = wordList[i]; - - if (item.word === word) { - if (i <= pos) { - occurrence++; - } - occurrences++; - } - } - return { occurrence, occurrences }; -} - -/** - * calculate the occurrence and occurrences for each word in verse - * @param {array} wordList - */ -function getOccurrencesForWordList(wordList) { - for (let i = 0, l = wordList.length; i < l; i ++) { - const item = wordList[i]; - const { occurrence, occurrences } = getOccurrencesForWord(item.word, i, wordList); - item.occurrence = occurrence; - item.occurrences = occurrences; - } -} - -/** - * get the word list for the original language in the format used by alignment data - @param {array} verseObjects - * @returns {*[]} - */ -export function getOriginalLanguageListForVerseData(verseObjects) { - const wordList = []; - arrayToWordList(wordList, verseObjects); - getOccurrencesForWordList(wordList); - return wordList; -} - -/** - * get the word list for the original language in the format used by alignment data - * @param {object} chapterJson - * @param {string|number} verseRef - * @returns {*[]} - */ -export function getOrigLangWordListForVerse(chapterJson, verseRef) { - const verseObjects = chapterJson?.[verseRef]?.verseObjects; - return getOriginalLanguageListForVerseData(verseObjects); -} - -/** - * get the word list from alignments - * @param alignments - * @return {array} - */ -export function getAlignedWordListFromAlignments(alignments) { - const wordList = []; - for (const alignment of alignments) { - for (const topWord of alignment.topWords) { - topWord.unmatched = true; - - if (alignment.bottomWords.length) { // this is a data bug, if there are no bottom words, this is not an alignment so skip word - wordList.push(topWord); - } - } - } - return wordList; -} - -/** - * get the word list for the aligned original words - * @param {object} chapterJson - * @param {string|number} verseRef - * @returns {*[]} - */ -export function getAlignedWordListForVerse(chapterJson, verseRef) { - const alignments = chapterJson?.[verseRef]?.alignments || []; - const wordList = getAlignedWordListFromAlignments(alignments); - return wordList; -} - -/** - * normalize word by doing unicode normalization, converting to lower case, and then fixing the trailing accent - * @param word - * @return {{length}} - */ -function normalize(word) { - let word_ = normalizer(word || ''); - word_ = word_.toLowerCase(); - - if (word_.length) { - const lastCharPos = word_.length - 1; - const lastChar = word_[lastCharPos]; - - if (lastChar === QUOTE_MARK) { // handle invalid accent at end of word - word_ = 
word_.substring(0, lastCharPos) + '\u02BC'; - } - } - - return word_; -} - -/** - * iterate through the word list normalizing words and then fixing occurrences using the normalized word text - * @param {array} originalWordList - list of words to normalize - * @param {array} normalOrig - array to populate with normalized words - */ -function normalizeList(originalWordList, normalOrig) { - for (const origWord of originalWordList) { - const origWord_ = { // shallow copy - ...origWord, - word: normalize(origWord.word), - }; - normalOrig.push(origWord_); - } -} - -/** - * update the attributes for the aligned words from latest original language words - * @param {array} originalLangWordList - * @param {array} alignmentsWordList - * @return {boolean} true if verse attributes updated - */ -export function updateAlignedWordsFromOriginalWordList(originalLangWordList, alignmentsWordList) { - let changed = false; - let normalOrig = []; // an array to keep normalized original words - let normalAlign = []; // an array to keep normalized aligned words - - for (let i = 0, l = alignmentsWordList.length; i < l; i++) { - const alignedWord = alignmentsWordList[i]; - // eslint-disable-next-line eqeqeq - let foundOrig = originalLangWordList.find(item => (item.word === alignedWord.word) && (item.occurrence == alignedWord.occurrence) && (item.occurrences == alignedWord.occurrences)); //Tricky: we are allowing automatic type coercion between string and integer because occurrence could be either - - if (!foundOrig) { // fall back to normalized matching - if (!normalOrig.length) { // if not initialized - normalizeList(originalLangWordList, normalOrig); - getOccurrencesForWordList(normalOrig); - normalizeList(alignmentsWordList, normalAlign); - } - - const normalWord = normalAlign[i]; - const foundPos = normalOrig.findIndex(item => (item.word === normalWord.word) && (item.occurrence == normalWord.occurrence) && (item.occurrences == normalWord.occurrences)); //Tricky: we are allowing automatic type coercion between string and integer because occurrence could be either - - if (foundPos >= 0) { - foundOrig = originalLangWordList[foundPos]; - } - } - - if (foundOrig) { - const keys = Object.keys(foundOrig); - - for (const key of keys) { - if (ignoreOrig.includes(key)) { - continue; // skip over ignored keys - } - - if (foundOrig[key] !== alignedWord[key]) { - alignedWord[key] = foundOrig[key]; // update attribute - changed = true; - } - } - delete alignedWord.unmatched; - } - } - - return changed; -} - -/** - * remove aligned words no longer in original or target language and update word bank to enable target words not used - * @param {array} alignments * @return { extraWordFound: boolean, emptyAlignmentsFound:boolean } - * @param {array} wordBank - */ -export function updateAlignmentData(alignments, wordBank) { - const toRemove = []; - let extraWordFound = false; - - for (let j = 0, l = alignments.length; j < l; j++) { - const alignment = alignments[j]; - let sourceNgram = alignment.sourceNgram; - - for (let i = 0; i < sourceNgram.length; i++) { - const topWord = sourceNgram[i]; - - if (topWord.unmatched || !alignment.targetNgram.length) { // remove extra word or unaligned word - sourceNgram.splice(i, 1); - i--; - - if (topWord.unmatched) { - extraWordFound = true; - } - } - } - - if (!sourceNgram.length) { // if empty, remove alignment - toRemove.push(j); - } - } - - if (toRemove.length) { - for (let j = toRemove.length - 1; j >= 0; j--) { // reverse order so remaining indices not messed up by removals - const 
removeIdx = toRemove[j]; - alignments.splice(removeIdx, 1); - } - } - - for (let j = 0, l = alignments.length; j < l; j++) { - const alignment = alignments[j]; - let targetNgram = alignment.targetNgram; - for (let i = 0; i < targetNgram.length; i++) { - const bottomWord = targetNgram[i]; - - const foundtarget = wordBank.find(item => (item.text === bottomWord.text) && (item.occurrence == bottomWord.occurrence)); - - if (foundtarget) { - foundtarget.used = true; - } else { - targetNgram.splice(i, 1); - } - } - } - - wordBank.forEach(item => { - const used = item.used; - if (!used) { - delete item.disabled; - } - delete item.used - }) - - const emptyAlignmentsFound = !!toRemove.length; - return {extraWordFound, emptyAlignmentsFound}; -} - -/** - * remove aligned words no longer in original language - * @param {array} alignments - * @return { extraWordFound: boolean, emptyAlignmentsFound:boolean } - */ -function removeExtraWordsFromAlignments(alignments) { - const toRemove = []; - let extraWordFound = false; - - for (let j = 0, l = alignments.length; j < l; j++) { - const alignment = alignments[j]; - let topWords = alignment.topWords; - - for (let i = 0; i < topWords.length; i++) { - const topWord = topWords[i]; - - if (topWord.unmatched || !alignment.bottomWords.length) { // remove extra word or unaligned word - topWords.splice(i, 1); - i--; - - if (topWord.unmatched) { - extraWordFound = true; - } - } - } - - if (!topWords.length) { // if empty, remove alignment - toRemove.push(j); - } - } - - if (toRemove.length) { - for (let j = toRemove.length - 1; j >= 0; j--) { // reverse order so remaining indices not messed up by removals - const removeIdx = toRemove[j]; - alignments.splice(removeIdx, 1); - } - } - - const emptyAlignmentsFound = !!toRemove.length; - return {extraWordFound, emptyAlignmentsFound}; -} - -/** - * remove aligned words no longer in original language - * @param {array} alignmentsChapter - * @param {string|number} verseRef - * @return { extraWordFound: boolean, emptyAlignmentsFound:boolean } - */ -function removeExtraWordsFromChapterAlignments(alignmentsChapter, verseRef) { - const alignments = alignmentsChapter?.[verseRef]?.alignments || []; - let {extraWordFound, emptyAlignmentsFound} = removeExtraWordsFromAlignments(alignments); - return { extraWordFound, emptyAlignmentsFound }; -} - -/** - * check if verseRef is a verse span - * @param verseRef - * @return {*|boolean} - */ -function isValidVerseSpan(verseRef) { - return referenceHelpers.isVerseSpan(verseRef) && !isNaN(referenceHelpers.toInt(verseRef)); -} - -/** - * get all verses included in verse range - * @param {string} verseRef - number to look up - * @param {object} chapterData - * @return {null|{verseObjects: *[]}} - */ -function getVersesForSpan(verseRef, chapterData) { - // coerce to look like a book so we can use library call - const chapter = 1; - const bookData = { [chapter]: chapterData }; - const ref = `${chapter}:${verseRef}`; - const verses = referenceHelpers.getVerses(bookData, ref); - - if (verses?.length) { - let verseData = []; - - for (const verse_ of verses) { - if (verse_.verseData?.verseObjects) { - Array.prototype.push.apply(verseData, verse_.verseData.verseObjects); - } - } - return { verseObjects: verseData }; - } - return null; -} - -/** - * finds best match for verseRef of original language and alignments - * @param {object} originalLangChapter - * @param {object} alignmentsChapter - * @param {string|number} verseRef - */ -export function getBestMatchForVerse(originalLangChapter, 
alignmentsChapter, verseRef) { - let verse_ = null; - let originalLangWordList = null; - let alignmentsWordList = null; - - if (originalLangChapter?.[verseRef]?.verseObjects?.length && alignmentsChapter?.[verseRef]?.alignments?.length) { - verse_ = verseRef; // exact match is best - originalLangWordList = getOrigLangWordListForVerse(originalLangChapter, verseRef); - alignmentsWordList = getAlignedWordListForVerse(alignmentsChapter, verseRef); - } else if (isValidVerseSpan(verseRef)) { - const verseData = getVersesForSpan(verseRef, originalLangChapter); - - if (verseData) { - alignmentsWordList = getAlignedWordListForVerse(alignmentsChapter, verseRef); - - if (alignmentsWordList?.length) { - originalLangWordList = getOriginalLanguageListForVerseData(verseData?.verseObjects); - verse_ = verseRef; - } - } - } - - return { - verse: verse_, - originalLangWordList, - alignmentsWordList, - }; -} - -/** - * if flag is true, increment and return count - * @param {number} count - * @param {boolean} flag - * @return {number} new count - */ -function increment(count, flag) { - if (flag) { - count++; - } - return count; -} - -/** - * get the aligned word attributes for verse from latest original language - * @param {Object} originalLangChapter - * @param {Object} alignmentsChapter - * @param {string|number} verse - * @return {{removedExtraWords: number, emptyAlignments: number, changed: number}} - */ -export function updateAlignedWordsFromOriginalForVerse(originalLangChapter, alignmentsChapter, verse) { - let changed = 0; - let removedExtraWords = 0; - let emptyAlignments = 0; - const { - verse: verse_, - originalLangWordList, - alignmentsWordList, - } = getBestMatchForVerse(originalLangChapter, alignmentsChapter, verse); - - if (originalLangWordList?.length && alignmentsWordList?.length) { - const changed_ = updateAlignedWordsFromOriginalWordList(originalLangWordList, alignmentsWordList); - changed = increment(changed, changed_); - - if (alignmentsChapter?.[verse_]?.alignments) { - // clear word bank so it will be regenerated - alignmentsChapter[verse_].wordBank = []; - const { extraWordFound, emptyAlignmentsFound } = removeExtraWordsFromChapterAlignments(alignmentsChapter, verse_); - removedExtraWords = increment(removedExtraWords, extraWordFound); - emptyAlignments = increment(emptyAlignments, emptyAlignmentsFound); - } - } - return { - changed, - removedExtraWords, - emptyAlignments, - }; -} - -/** - * for a chapter update the aligned word attributes for verse from latest original language - * @param {Object} originalLangChapter - * @param {Object} alignmentsChapter - * @return {{emptyAlignmentsVerses: *[], changedVerses: *[], removedExtraWordsVerses: *[]}} - */ -export function updateAlignedWordAttribFromOriginalForChapter(originalLangChapter, alignmentsChapter) { - const changedVerses = []; - const removedExtraWordsVerses = []; - const emptyAlignmentsVerses = []; - const verses = Object.keys(alignmentsChapter); - - for (const verse of verses) { - const { - changed, - removedExtraWords, - emptyAlignments, - } = updateAlignedWordsFromOriginalForVerse(originalLangChapter, alignmentsChapter, verse); - - if (emptyAlignments > 0) { - emptyAlignmentsVerses.push({ verse, count: emptyAlignments }); - } - - if (removedExtraWords > 0) { - removedExtraWordsVerses.push({ verse, count: removedExtraWords }); - } - - if (changed > 0) { - changedVerses.push({ verse, count: changed }); - } - } - return { - changedVerses, - removedExtraWordsVerses, - emptyAlignmentsVerses, - }; -} - -/** - * for a book update the 
aligned word attributes for verse from latest original language - * @param {object} origBook - * @param {object} alignments - * @param {string} bookID - * @return {{removedExtraWordsChapters: {}, emptyAlignmentsChapters: {}, changedChapters: {}}} - */ -export function updateAlignedWordAttribFromOriginalForBook(origBook, alignments, bookID) { - const changedChapters = {}; - const removedExtraWordsChapters = {}; - const emptyAlignmentsChapters = {}; - - if (origBook) { - const chapters = Object.keys(origBook); - - for (const chapter of chapters) { - const originalLangChapter = origBook[chapter]; - const alignmentsChapter = alignments[chapter]; - - if (originalLangChapter && alignmentsChapter) { - const { - changedVerses, - removedExtraWordsVerses, - emptyAlignmentsVerses, - } = updateAlignedWordAttribFromOriginalForChapter(originalLangChapter, alignmentsChapter); - - if (changedVerses?.length) { - changedChapters[chapter] = changedVerses; - } - - if (emptyAlignmentsVerses?.length) { - emptyAlignmentsChapters[chapter] = emptyAlignmentsVerses; - } - - if (removedExtraWordsVerses?.length) { - removedExtraWordsChapters[chapter] = removedExtraWordsVerses; - } - } else { - console.log(`updateAlignedWordsFromOriginalForBook(${bookID}) - missing chapter ${chapter} data OriginalLang = ${!!originalLangChapter}, alignments = ${alignmentsChapter}`); - } - } - } - return { - changedChapters, - removedExtraWordsChapters, - emptyAlignmentsChapters, - }; -} - -/** - * check if original words in target alignment have changed, if so make an effort to update alignments based on normalized text. Otherwise remove any original words from alignments that are not found in the original verse objects. - * @param {Object[]} targetVerseObjects - * @param {Object[]} originalVerseObjects - * @return {Object[]}} migrated targetVerseObjects - */ -export function migrateTargetAlignmentsToOriginal(targetVerseObjects, originalVerseObjects) { - const originalLangWordList = getOriginalLanguageListForVerseData(originalVerseObjects); - const targetVerseText = convertVerseDataToUSFM({ verseObjects: targetVerseObjects}) - const alignments = extractAlignmentsFromTargetVerse(targetVerseText, originalVerseObjects) - // const alignments = wordaligner.unmerge(targetVerseObjects, originalVerseObjects); - const alignmentsWordList = getAlignedWordListFromAlignments(alignments.alignment); - if (originalLangWordList?.length && alignmentsWordList?.length) { - const changed_ = updateAlignedWordsFromOriginalWordList(originalLangWordList, alignmentsWordList); - // let results = updateAlignmentsToTargetVerse(initialVerseObjects, newText) - const results = parseUsfmToWordAlignerData(targetVerseText, null); - const targetWords = results.targetWords - const {extraWordFound, emptyAlignmentsFound} = updateAlignmentData(alignments.alignments, targetWords); - const bareTargetText = getUsfmForVerseContent({ verseObjects: targetVerseObjects }) - const verseUsfm = addAlignmentsToVerseUSFM(targetWords, alignments.alignments, bareTargetText) - if (verseUsfm) { - const alignedVerseObjects = usfmVerseToJson(verseUsfm) - return alignedVerseObjects - } else { - const unalignedVerseObjects = removeMilestonesAndWordMarkers(targetVerseObjects)?.verseObjects || []; - console.warn(`migrateTargetAlignmentsToOriginal() - separating alignments failed, removing alignments`) - return unalignedVerseObjects - } - } - return targetVerseObjects; -} diff --git a/src/utils/usfmHelpers.js b/src/utils/usfmHelpers.js deleted file mode 100644 index 4b41d4d..0000000 --- 
a/src/utils/usfmHelpers.js +++ /dev/null @@ -1,160 +0,0 @@ -/* eslint-disable no-console */ -import usfmjs from 'usfm-js'; -// helpers -import * as bibleHelpers from './bibleHelpers'; - -/** - * @description Parses the usfm file using usfm-parse library. - * @param {string} usfmData - USFM data to parse - */ -export function getParsedUSFM(usfmData) { - try { - if (usfmData) { - return usfmjs.toJSON(usfmData, { convertToInt: ['occurrence', 'occurrences'] }); - } - } catch (e) { - console.error(e); - } -} - -/** - * @description get tag item from headers array - * @param headers - * @param tag - * @return {String} content of tag if found, else null - */ -export function getHeaderTag(headers, tag) { - if (headers) { - const retVal = headers.find(header => header.tag === tag); - - if (retVal) { - return retVal.content; - } - } - return null; -} - -/** - * parses only the header information of a valid usfm file. Header information is the part before the first chapter marker. - * @param {String} usfmData - USFM data to parse - * @return {*} - */ -export function getUsfmHeaderInfo(usfmData) { - const pos = usfmData.indexOf('\\c '); - const header = (pos >= 0) ? usfmData.substr(0, pos) : usfmData; // if chapter marker is found, process only that part - return getParsedUSFM(header); -} - -/** - * parse USFM header details from usfm data - * @param {String} usfmData - USFM data to parse - * @return {*} - */ -export function parseUsfmDetails(usfmData) { - const usfmObject = getUsfmHeaderInfo(usfmData); - return getUSFMDetails(usfmObject); -} - -/** - * Most important function for creating a project from a USFM file alone. This function gets the - * book name, id, language name and direction for starting a tC project. - * @param {object} usfmObject - Object created by USFM to JSON module. Contains information - * for parsing and using in tC such as book name. - */ -export function getUSFMDetails(usfmObject) { - let details = { - book: { - id: undefined, - name: undefined, - }, - language: { - id: undefined, - name: undefined, - direction: 'ltr', - }, - target_languge: { book: { name: undefined } }, - }; - - // adding target language book name from usfm headers - const targetLangugeBookName = getHeaderTag(usfmObject.headers, 'h'); - details.target_languge.book.name = targetLangugeBookName; - - let headerIDArray = []; - const tag = 'id'; - const id = getHeaderTag(usfmObject.headers, tag); - - if (id) { - // Conditional to determine how USFM should be parsed. - let isSpaceDelimited = id.split(' ').length > 1; - let isCommaDelimited = id.split(',').length > 1; - - if (isSpaceDelimited) { - // i.e. TIT EN_ULB sw_Kiswahili_ltr Wed Jul 26 2017 22:14:55 GMT-0700 (PDT) tc. - // Could have attached commas if both comma delimited and space delimited - headerIDArray = id.split(' '); - headerIDArray.forEach((element, index) => { - headerIDArray[index] = element.replace(',', ''); - }); - details.book.id = headerIDArray[0].trim().toLowerCase(); - } else if (isCommaDelimited) { - // i.e. TIT, sw_Kiswahili_ltr, EN_ULB, Thu Jul 20 2017 16:03:48 GMT-0700 (PDT), tc. - headerIDArray = id.split(','); - details.book.id = headerIDArray[0].trim().toLowerCase(); - } else { - // i.e. 
EPH - details.book.id = id.toLowerCase(); - } - - let fullBookName = bibleHelpers.convertToFullBookName(details.book.id); - - if (fullBookName) { - details.book.name = fullBookName; - } else { - fullBookName = bibleHelpers.convertToFullBookName(usfmObject.book); - - if (fullBookName) { - details.book.name = fullBookName; - } else { - details.book.id = null; - } - } - - let tcField = headerIDArray[headerIDArray.length - 1] || ''; - - if (tcField.trim() === 'tc') { - details.repo = headerIDArray[1]; - - // Checking for tC field to parse with more information than standard usfm. - for (let index in headerIDArray) { - let languageCodeArray = headerIDArray[index].trim().split('_'); - - if (languageCodeArray.length === 3) { - details.language.id = languageCodeArray[0].toLowerCase(); - details.language.name = languageCodeArray[1].split('⋅').join(' '); // restore spaces - details.language.direction = languageCodeArray[2].toLowerCase(); - } - } - } - } - return details; -} -/** - * Removes any USFM markers from a string. - * @param {String} targetVerseText - verse text that may or may not contain USFM markers. - * @returns {String} printable text from USFM. - */ -export const removeUsfmMarkers = (targetVerseText) => { - const cleaned = usfmjs.removeMarker(targetVerseText); - return cleaned; -}; - -export function usfmVerseToJson(verseUSFM) { - if (verseUSFM) { - const verseObjects = usfmjs.toJSON('\\v 1 ' + verseUSFM, {chunk: true}); - - if (verseObjects?.verses?.[1]?.verseObjects) { - return verseObjects.verses[1].verseObjects; - } - } - return null; -} diff --git a/src/utils/verseObjects.js b/src/utils/verseObjects.js deleted file mode 100644 index 6ea4c15..0000000 --- a/src/utils/verseObjects.js +++ /dev/null @@ -1,65 +0,0 @@ -import {Token} from 'wordmap-lexer'; -import {VerseObjectUtils} from 'word-aligner'; - -/** - * Converts verse objects (as in from the source language verse) into {@link Token}s. - * @param verseObjects - */ -export const tokenizeVerseObjects = (verseObjects) => { - const tokens = []; - const completeTokens = []; // includes occurrences - const occurrences = {}; - let position = 0; - const words = VerseObjectUtils.getWordList(verseObjects); - let sentenceCharLength = 0; - for (const word of words) { - if (typeof occurrences[word.text] === 'undefined') { - occurrences[word.text] = 0; - } - sentenceCharLength += word.text.length; - occurrences[word.text]++; - tokens.push({ - text: word.text, - strong: (word.strong || word.strongs), - morph: word.morph, - lemma: word.lemma, - position: position, - occurrence: occurrences[word.text] - }); - position++; - } - // inject occurrences - for (const token of tokens) { - completeTokens.push(new Token({ - text: token.text, - strong: token.strong, - morph: token.morph, - lemma: token.lemma, - position: token.position, - occurrence: token.occurrence, - occurrences: occurrences[token.text], - sentenceTokenLen: tokens.length, - sentenceCharLen: sentenceCharLength - })); - } - return completeTokens; -}; - -/** - * get verse range from span. 
Copied from lib tc-ui-toolkit - * @param {string} verseSpan - * @return {{high: number, low: number}} - */ -export function getVerseSpanRange(verseSpan) { - let [low, high] = verseSpan.split('-'); - - if (low && high) { - low = parseInt(low, 10); - high = parseInt(high, 10); - - if ((low > 0) && (high >= low)) { - return { low, high }; - } - } - return {}; -} diff --git a/yarn.lock b/yarn.lock index b1a0916..50ce4b1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -7200,6 +7200,19 @@ which@^2.0.1: dependencies: isexe "^2.0.0" +word-aligner-lib@0.9.1-beta: + version "0.9.1-beta" + resolved "https://registry.yarnpkg.com/word-aligner-lib/-/word-aligner-lib-0.9.1-beta.tgz#53f4ecc9ed5bffca63497050416768c3c81d1b98" + integrity sha512-P4YZbKo/y+cROLoQhzc/Su7VhzLoKV/49+7s6HVJYstUmlywq+rmf/EmMabu5BmP31lBz/qS+PfIoOJAEN+4zA== + dependencies: + deep-equal "^2.0.5" + file-loader "^6.2.0" + lodash.clonedeep "^4.5.0" + string-punctuation-tokenizer "2.2.0" + usfm-js "3.4.3" + word-aligner "1.0.2" + wordmap-lexer "^0.3.6" + word-aligner@1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/word-aligner/-/word-aligner-1.0.2.tgz#7b47a913421337c44b340b424b420cb5ce3ac945"
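The bulk of this change removes src/utils/migrateOriginalLanguageHelpers.js, whose core bookkeeping is the occurrence/occurrences pair attached to every word. For reference, here is a minimal standalone sketch of that logic, mirroring the deleted getOccurrencesForWord() rather than any word-aligner-lib API:

```js
// Occurrence bookkeeping as in the removed helper: `occurrence` is the 1-based
// count of identical words up to and including position `pos`; `occurrences`
// is the total number of identical words in the verse word list.
function getOccurrencesForWord(word, pos, wordList) {
  let occurrence = 0;
  let occurrences = 0;
  for (let i = 0; i < wordList.length; i++) {
    if (wordList[i].word === word) {
      if (i <= pos) {
        occurrence++;
      }
      occurrences++;
    }
  }
  return { occurrence, occurrences };
}

const wordList = ['ὁ', 'λόγος', 'ὁ'].map(word => ({ word }));
console.log(getOccurrencesForWord('ὁ', 2, wordList));     // { occurrence: 2, occurrences: 2 }
console.log(getOccurrencesForWord('λόγος', 1, wordList)); // { occurrence: 1, occurrences: 1 }
```

The removed getOccurrencesForWordList() simply runs this for every entry, which is what lets updateAlignedWordsFromOriginalWordList() re-match aligned words against the latest original-language text when only occurrence metadata differs.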
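src/utils/usfmHelpers.js is also removed. Its usfmVerseToJson() relies on a small trick: a bare verse string is prefixed with a \v 1 marker so usfm-js can parse it in chunk mode. A sketch of that round trip, using only usfm-js, which remains a direct dependency of this package:

```js
import usfmjs from 'usfm-js';

// Sketch of the removed usfmVerseToJson(): wrap the verse in a `\v 1` marker,
// parse in chunk mode, then pull the verseObjects back out of verse 1.
function usfmVerseToJson(verseUSFM) {
  if (verseUSFM) {
    const parsed = usfmjs.toJSON('\\v 1 ' + verseUSFM, { chunk: true });
    if (parsed?.verses?.[1]?.verseObjects) {
      return parsed.verses[1].verseObjects;
    }
  }
  return null;
}

console.log(usfmVerseToJson('In the beginning was the Word'));
```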
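The removed src/utils/verseObjects.js carried one small parser, getVerseSpanRange(), whose behaviour is easy to pin down. A standalone sketch with its edge cases:

```js
// Sketch of the removed getVerseSpanRange(): split a verse span such as "4-6"
// into numeric bounds; malformed or reversed spans yield an empty object.
function getVerseSpanRange(verseSpan) {
  let [low, high] = verseSpan.split('-');
  if (low && high) {
    low = parseInt(low, 10);
    high = parseInt(high, 10);
    if (low > 0 && high >= low) {
      return { low, high };
    }
  }
  return {};
}

console.log(getVerseSpanRange('4-6')); // { low: 4, high: 6 }
console.log(getVerseSpanRange('6-4')); // {} (reversed span is rejected)
console.log(getVerseSpanRange('5'));   // {} (not a span)
```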
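Taken together with the word-aligner-lib@0.9.1-beta entry added to yarn.lock above, the intent appears to be that these helpers now come from word-aligner-lib instead of local utils. The import sketch below assumes the new package re-exports the removed helpers under the same names; that is an assumption for illustration only, since this diff does not show word-aligner-lib's API:

```js
// ASSUMPTION: word-aligner-lib re-exports the removed helpers under their old
// names. Verify against the package's actual exports before relying on this.
import {
  migrateTargetAlignmentsToOriginal,
  usfmVerseToJson,
  removeUsfmMarkers,
} from 'word-aligner-lib';

// Previously these were local modules deleted in this diff:
// import { usfmVerseToJson, removeUsfmMarkers } from '../utils/usfmHelpers';
// import { migrateTargetAlignmentsToOriginal } from '../utils/migrateOriginalLanguageHelpers';
```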