diff --git a/tools/determineDeltaBetweenVersions.ipynb b/tools/determineDeltaBetweenVersions.ipynb
index 0bf2b87..d57df65 100644
--- a/tools/determineDeltaBetweenVersions.ipynb
+++ b/tools/determineDeltaBetweenVersions.ipynb
@@ -3179,7 +3179,7 @@
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": 190,
"id": "7f5005f5-5a74-4e8e-84e7-9cae7c629b55",
"metadata": {
"tags": []
@@ -3188,10 +3188,10 @@
{
"data": {
"text/html": [
- "\n",
- "\n",
- "
\n",
"\n",
+ " \n",
+ " \n",
+ " \n",
" \n",
" \n",
" Delta Report \n",
@@ -3225,42 +3225,37 @@
" });\n",
" }\n",
" \n",
+ " \n",
+ " \n",
" \n",
- "\n",
- "\n",
"Delta Report \n",
- "Dataset 1: tonyjurg/Nestle1904LFT - 0.7
\n",
- "Dataset 2: saulocantanhede/tfgreek2 - 0.5.7
\n",
- "Expand All \n",
+ "Dataset 1: tonyjurg/Nestle1904LFT/tf v:0.7(rv0.8=#g95357e8bf298b090341cf277596be01f7f1f5ce9 offline under C:/Users/tonyj/text-fabric-data/github)
\n",
+ "Dataset 2: saulocantanhede/tfgreek2/tf v:0.5.7(r0.5.8=#1a251a4a8daacae4cd5e02294a95d806b3964000 offline under C:/Users/tonyj/text-fabric-data/github)
\n",
"Collapse All \n",
- "Expand Features \n",
- "Collapse Features \n",
- "Expand Nodetypes \n",
- "Collapse Nodetypes \n",
- " Nodenames only in Dataset 2 \n",
+ "Expand up to second level \n",
+ "Expand up to third level \n",
+ "Expand All \n",
+ " Nodenames only in Dataset 2 \n",
"subphrase \n",
"clause \n",
"phrase \n",
"group \n",
" \n",
- " \n",
- "Differences in nodenumber range for common nodenames \n",
- "Nodename verse \n",
+ "Differences in nodenumber range for common nodenames \n",
+ "Nodename verse \n",
"Dataset 1: (146078, 154020) \n",
"Dataset 2: (382714, 390657) \n",
- " \n",
- "Nodename wg \n",
+ "Nodename wg \n",
"Dataset 1: (154021, 259450) \n",
"Dataset 2: (390658, 497525) \n",
- " \n",
- "Nodename sentence \n",
+ "Nodename sentence \n",
"Dataset 1: (138067, 146077) \n",
"Dataset 2: (246833, 266535) \n",
+ " \n",
" \n",
- " \n",
- "\n",
- "Features only in Dataset 1 \n",
- "\n",
+ "Features only in Dataset 1 \n",
"booknumber \n",
"containedclause \n",
"gn \n",
@@ -3292,9 +3287,7 @@
"wordtranslit \n",
"wordunacc \n",
" \n",
- "\n",
- "Features only in Dataset 2 \n",
- "\n",
+ "Features only in Dataset 2 \n",
"appositioncontainer \n",
"articular \n",
"before \n",
@@ -3328,615 +3321,614 @@
"typ \n",
"unaccent \n",
"variant \n",
- " \n",
- " \n",
- "\n",
+ " \n",
+ "\n",
"Differences in Common Features \n",
"\n",
- "Feature: verse \n",
+ "Feature: verse \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Verse number inside chapter \n",
"Dataset 2: verse number, from ref attribute in xml \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: verse \n",
+ "Frequency List Differences \n",
+ "Nodetype: verse \n",
"\n",
"Dataset 1: 1: 260 \n",
"Dataset 2: 1: 261 \n",
" \n",
" \n",
" \n",
- "Feature: morph \n",
+ "Feature: morph \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Morphological tag (Sandborg-Petersen morphology) \n",
"Dataset 2: morphological code \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: V-PAI-3S: 2226, ADV: 2081, PRT-N: 1977, V-2AAI-3S: 1244, V-AAI-3S: 1225, V-PAP-NSM: 880, V-PAI-1S: 765, P-ASM: 746, V-PAN: 730, P-DSM: 700 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: CONJ: 16316, PREP: 10568, ADV: 3808, N-NSM: 3475, N-GSM: 2935, T-NSM: 2905, N-ASF: 2870, PRT-N: 2701, N-ASM: 2456, V-PAI-3S: 2271 \n",
" \n",
" \n",
" \n",
- "Feature: punctuation \n",
+ "Feature: punctuation \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Punctuation after word \n",
"Dataset 2: this is XML attribute punctuation \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: : 37660, ,: 3903, .: 2731, ·: 1189, ;: 589 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: : 119264, ,: 9462, .: 5717, ·: 2359, ;: 971 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 119270 \n",
"Dataset 2: : 119264 \n",
" \n",
" \n",
" \n",
- "Feature: number \n",
+ "Feature: number \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Gramatical number of the verb (e.g. singular, plural) \n",
"Dataset 2: grammatical number \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: singular: 26293, plural: 12967 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: singular: 69846, plural: 29091 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 38842 \n",
"Dataset 2: None \n",
" \n",
" \n",
" \n",
- "Feature: book \n",
+ "Feature: book \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Book name (in English language) \n",
"Dataset 2: book name (full name) \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: clause \n",
+ "Frequency List Differences \n",
+ "Nodetype: clause \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: Luke: 4880, Matthew: 4364, Acts: 4237, John: 3699, Mark: 2860, Revelation: 1803, I_Corinthians: 1487, Romans: 1401, Hebrews: 1040, II_Corinthians: 909 \n",
" \n",
- "Nodetype: group \n",
+ "Nodetype: group \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: Acts: 1288, Luke: 1232, Matthew: 1165, Revelation: 909, John: 882, Mark: 753, I_Corinthians: 431, Romans: 362, Hebrews: 325, II_Corinthians: 222 \n",
" \n",
- "Nodetype: wg \n",
+ "Nodetype: wg \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: Luke: 8945, Matthew: 8165, Acts: 7770, John: 7207, Mark: 5363, Revelation: 3895, I_Corinthians: 3160, Romans: 2799, Hebrews: 1977, II_Corinthians: 1852 \n",
" \n",
- "Nodetype: sentence \n",
+ "Nodetype: sentence \n",
"\n",
"Dataset 1: Luke: 1155, Matthew: 1133, John: 1038, Acts: 883, Mark: 727, I_Corinthians: 524, Revelation: 466, Romans: 465, II_Corinthians: 253, Hebrews: 241 \n",
"Dataset 2: Luke: 2833, Matthew: 2636, John: 2626, Acts: 2245, Mark: 1750, I_Corinthians: 1242, Revelation: 1183, Romans: 1036, II_Corinthians: 721, Hebrews: 612 \n",
" \n",
" \n",
" \n",
- "Feature: tense \n",
+ "Feature: tense \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Gramatical tense of the verb (e.g. Present, Aorist) \n",
"Dataset 2: verbal tense \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: aorist: 11503, present: 11175, future: 1592, imperfect: 1547, perfect: 1450, pluperfect: 88 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: aorist: 11803, present: 11579, imperfect: 1689, future: 1626, perfect: 1572, pluperfect: 88 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 109422 \n",
"Dataset 2: None \n",
" \n",
" \n",
" \n",
- "Feature: gloss \n",
+ "Feature: gloss \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ English gloss \n",
"Dataset 2: English gloss (BGVB) \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: word \n",
+ "Frequency List Differences \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: the: 9857, and: 6212, -: 5496, in: 2320, And: 2218, not: 2042, of the: 1551, for: 1501, that: 1498, you: 1226 \n",
"Dataset 2: the: 19783, and, also, likewise: 8978, he, she, it, himself, herself, itself; even, very; same: 5550, you: 2892, but, and: 2787, (with dat.) in: 2743, I: 2567, am, exist: 2457, say, tell: 2255, no, not: 1622 \n",
" \n",
" \n",
" \n",
- "Feature: ref \n",
+ "Feature: ref \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Value of the ref ID (taken from XML sourcedata) \n",
"Dataset 2: biblical reference with word counting \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: 1CO 10:1!1: 1, 1CO 10:1!15: 1, 1CO 10:1!17: 1, 1CO 10:1!2: 1, 1CO 10:1!21: 1, 1CO 10:1!4: 1, 1CO 10:1!5: 1, 1CO 10:10!2: 1, 1CO 10:10!6: 1, 1CO 10:10!8: 1 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: 1CO 10:1!1: 1, 1CO 10:1!10: 1, 1CO 10:1!11: 1, 1CO 10:1!12: 1, 1CO 10:1!13: 1, 1CO 10:1!14: 1, 1CO 10:1!15: 1, 1CO 10:1!16: 1, 1CO 10:1!17: 1, 1CO 10:1!18: 1 \n",
" \n",
" \n",
" \n",
- "Feature: ln \n",
+ "Feature: ln \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Lauw-Nida lexical classification (not present everywhere?) \n",
"Dataset 2: ln \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: 92.11: 2617, 33.69: 2334, 69.3: 1399, 92.1: 920, 92.27: 812, 92.7: 812, 13.1: 699, 13.4: 535, 92.29: 522, 15.81: 471 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: 92.24: 19738, 92.11: 4718, 89.92: 2903, 89.87: 2756, 33.69: 2336, 69.3: 1736, 92.1: 1732, 92.7: 1494, 12.1: 1247, 92.29: 1090 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: 92.24: 19781, : 10488 \n",
"Dataset 2: 92.24: 19738, 92.29: 1090 \n",
" \n",
" \n",
" \n",
- "Feature: degree \n",
+ "Feature: degree \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Degree (e.g. Comparitative, Superlative) \n",
"Dataset 2: grammatical degree \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: comparative: 119, superlative: 32 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: comparative: 313, superlative: 200 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 137266 \n",
"Dataset 2: None \n",
" \n",
" \n",
" \n",
- "Feature: normalized \n",
+ "Feature: normalized \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Surface word with accents normalized and trailing punctuations removed \n",
"Dataset 2: lemma normalized \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: αὐτόν: 746, μή: 717, αὐτῷ: 710, οὐκ: 660, εἶπεν: 586, ἐστιν: 556, αὐτοῖς: 491, ὑμῖν: 475, οὐ: 378, λέγει: 331 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: καί: 8576, ὁ: 2769, δέ: 2764, ἐν: 2684, τοῦ: 2497, εἰς: 1755, τό: 1664, τόν: 1562, τήν: 1523, αὐτοῦ: 1411 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: καί: 8576, δέ: 2764, τό: 1664, τόν: 1562, τήν: 1523 \n",
"Dataset 2: καί: 8576, δέ: 2764, τό: 1664, τόν: 1562, τήν: 1523 \n",
" \n",
" \n",
" \n",
- "Feature: lemma \n",
+ "Feature: lemma \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Lexeme (lemma) \n",
"Dataset 2: lexical lemma \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: αὐτός: 2839, λέγω: 2252, εἰμί: 2251, σύ: 1468, ἐγώ: 1247, οὐ: 1182, ὅς: 1111, μή: 779, ἔχω: 707, γίνομαι: 663 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: ὁ: 19783, καί: 8978, αὐτός: 5561, σύ: 2892, δέ: 2787, ἐν: 2743, ἐγώ: 2567, εἰμί: 2457, λέγω: 2255, εἰς: 1766 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: καί: 8978, αὐτός: 5561, σύ: 2892, δέ: 2787, ἐγώ: 2567, εἰμί: 2457, λέγω: 2255 \n",
"Dataset 2: καί: 8978, αὐτός: 5561, σύ: 2892, δέ: 2787, ἐγώ: 2567, εἰμί: 2457, λέγω: 2255 \n",
" \n",
" \n",
" \n",
- "Feature: type \n",
+ "Feature: type \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Gramatical type of noun or pronoun (e.g. Common, Personal) \n",
"Dataset 2: morphological type (on word), syntactical type (on sentence, group, clause, phrase or wg) \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: clause \n",
+ "Frequency List Differences \n",
+ "Nodetype: clause \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: wrapper-clause-scope: 191, group: 107, apposition-group: 20 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 93321 \n",
"Dataset 2: None \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: modifier-scope: 29645, common: 23644, personal: 11521, wrapper-scope: 11264, proper: 4639, group: 2325, demonstrative: 1722, modifier-clause-scope: 1712, relative: 1674, interrogative: 633 \n",
" \n",
- "Nodetype: wg \n",
+ "Nodetype: wg \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: modifier-scope: 29645, wrapper-clause-scope: 12166, wrapper-scope: 11264, conjuncted-wg: 8075, group: 4957, modifier-clause-scope: 1712, apposition-group: 891 \n",
" \n",
- "Nodetype: group \n",
+ "Nodetype: group \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: conjuncted-wg: 8075, apposition-group: 870 \n",
" \n",
- "Nodetype: phrase \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: modifier-scope: 10484, wrapper-scope: 9535, personal: 5885, common: 2120, relative: 1364, group: 952, modifier-clause-scope: 755, demonstrative: 744, proper: 683, interrogative: 480 \n",
" \n",
- "Nodetype: sentence \n",
+ "Nodetype: sentence \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: wrapper-clause-scope: 11975, group: 2525, apposition-group: 1 \n",
" \n",
" \n",
" \n",
- "Feature: unicode \n",
+ "Feature: unicode \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Word as it apears in the text in Unicode (incl. punctuations) \n",
"Dataset 2: word in unicode characters plus material after it \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: μὴ: 669, οὐκ: 660, αὐτῷ: 602, εἶπεν: 560, αὐτὸν: 519, αὐτοῖς: 420, ἐστιν: 383, οὐ: 378, λέγει: 318, ὑμῖν: 283 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: καὶ: 8541, ὁ: 2768, ἐν: 2683, δὲ: 2619, τοῦ: 2497, εἰς: 1755, τὸ: 1657, τὸν: 1556, τὴν: 1518, τῆς: 1300 \n",
" \n",
" \n",
" \n",
- "Feature: case \n",
+ "Feature: case \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Gramatical case (Nominative, Genitive, Dative, Accusative, Vocative) \n",
"Dataset 2: grammatical case \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: nominative: 9609, accusative: 6170, dative: 3265, genitive: 1408, vocative: 1 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: nominative: 24197, accusative: 23031, genitive: 19515, dative: 12126, vocative: 649 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 58261 \n",
"Dataset 2: None \n",
" \n",
" \n",
" \n",
- "Feature: after \n",
+ "Feature: after \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Characters (eg. punctuations) following the word \n",
"Dataset 2: material after the end of the word \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: : 37661, ,: 3892, .: 2724, ·: 1187, ;: 588, ,—: 8, ).: 4, —: 3, ,): 2, ·—: 2 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: : 119261, ,: 9439, .: 5704, ·: 2355, ;: 969, ,—: 18, —: 7, ).: 6, .]]: 4, ·—: 4 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 119270, , : 9462, . : 5717, · : 2359, ; : 971 \n",
"Dataset 2: : 119261, ,: 9439, .: 5704, ·: 2355, ;: 969, ,—: 18, —: 7, ).: 6, .]]: 4, ·—: 4 \n",
" \n",
" \n",
" \n",
- "Feature: chapter \n",
+ "Feature: chapter \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Chapter number inside book \n",
"Dataset 2: chapter number, from ref attribute in xml \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: verse \n",
+ "Frequency List Differences \n",
+ "Nodetype: verse \n",
"\n",
"Dataset 1: 4: 509 \n",
"Dataset 2: 4: 510 \n",
" \n",
- "Nodetype: sentence \n",
+ "Nodetype: sentence \n",
"\n",
"Dataset 1: 1: 519, 3: 497, 4: 496, 2: 489, 5: 481, 6: 404, 12: 399, 9: 398, 11: 390, 8: 386 \n",
"Dataset 2: None \n",
" \n",
" \n",
" \n",
- "Feature: voice \n",
+ "Feature: voice \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Gramatical voice of the verb (e.g. active,passive) \n",
"Dataset 2: verbal voice \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: active: 20154, passive: 3345, middle: 2187, middlepassive: 1669 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: active: 20742, passive: 3493, middle: 2408, middlepassive: 1714 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 109422 \n",
"Dataset 2: None \n",
" \n",
" \n",
" \n",
- "Feature: person \n",
+ "Feature: person \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Gramatical person of the verb (first, second, third) \n",
"Dataset 2: grammatical person \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: third: 12474, second: 3447, first: 2886 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: third: 12747, second: 3729, first: 2943 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 118360 \n",
"Dataset 2: None \n",
" \n",
" \n",
" \n",
- "Feature: clausetype \n",
+ "Feature: clausetype \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Clause type details (e.g. Verbless, Minor) \n",
"Dataset 2: clause type \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: sentence \n",
+ "Frequency List Differences \n",
+ "Nodetype: sentence \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: nominalized: 59 \n",
" \n",
- "Nodetype: clause \n",
+ "Nodetype: clause \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: nominalized: 5237 \n",
" \n",
- "Nodetype: wg \n",
+ "Nodetype: wg \n",
"\n",
"Dataset 1: : 102662, VerbElided: 1009, Verbless: 929, Minor: 830 \n",
"Dataset 2: nominalized: 5296 \n",
" \n",
" \n",
" \n",
- "Feature: sp \n",
+ "Feature: sp \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Part of Speech (abbreviated) \n",
"Dataset 2: part-of-speach \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: verb: 27355, pron: 8751, advb: 4384, subs: 2822, adjv: 2304, art: 257, intj: 90, conj: 85, num: 25, prep: 4 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: subs: 28455, verb: 28357, art: 19786, conj: 18227, pron: 16177, prep: 10914, adjv: 8452, advb: 6147, intj: 788, num: 476 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: noun: 28455, det: 19786, adj: 8452, adv: 6147, ptcl: 773 \n",
"Dataset 2: subs: 28455, art: 19786, adjv: 8452, advb: 6147, intj: 788 \n",
" \n",
" \n",
" \n",
- "Feature: junction \n",
+ "Feature: junction \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Junction data related to a wordgroup \n",
"Dataset 2: type of junction \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: clause \n",
+ "Frequency List Differences \n",
+ "Nodetype: clause \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: coordinate: 8186, subordinate: 7449 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: subordinate: 116, coordinate: 64 \n",
" \n",
- "Nodetype: wg \n",
+ "Nodetype: wg \n",
"\n",
"Dataset 1: : 103128, apposition: 2302 \n",
"Dataset 2: coordinate: 9367, subordinate: 8554 \n",
" \n",
- "Nodetype: phrase \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: subordinate: 57 \n",
" \n",
- "Nodetype: sentence \n",
+ "Nodetype: sentence \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: coordinate: 1117, subordinate: 989 \n",
" \n",
" \n",
" \n",
- "Feature: mood \n",
+ "Feature: mood \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Gramatical mood of the verb (passive, etc) \n",
"Dataset 2: verbal mood \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: phrase \n",
+ "Frequency List Differences \n",
+ "Nodetype: phrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: indicative: 15245, participle: 6320, infinitive: 2228, subjunctive: 1832, imperative: 1663, optative: 67 \n",
" \n",
- "Nodetype: subphrase \n",
+ "Nodetype: subphrase \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: indicative: 15617, participle: 6653, infinitive: 2285, imperative: 1877, subjunctive: 1856, optative: 69 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: : 109422 \n",
"Dataset 2: None \n",
" \n",
" \n",
" \n",
- "Feature: bookshort \n",
+ "Feature: bookshort \n",
"\n",
"Descr Difference: \n",
"\n",
"Dataset 1: ✅ Book name (abbreviated) \n",
"Dataset 2: book name (abbreviated) from ref attribute in xml \n",
" \n",
- "Frequency List Differences \n",
- "Nodetype: clause \n",
+ "Frequency List Differences \n",
+ "Nodetype: clause \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: LUK: 4880, MAT: 4364, ACT: 4237, JHN: 3699, MRK: 2860, REV: 1803, 1CO: 1487, ROM: 1401, HEB: 1040, 2CO: 909 \n",
" \n",
- "Nodetype: word \n",
+ "Nodetype: word \n",
"\n",
"Dataset 1: Luke: 19456, Acts: 18393, Matt: 18299, John: 15643, Mark: 11277, Rev: 9832, Rom: 7100, 1Cor: 6820, Heb: 4955, 2Cor: 4469 \n",
"Dataset 2: LUK: 19456, ACT: 18393, MAT: 18299, JHN: 15643, MRK: 11277, REV: 9832, ROM: 7100, 1CO: 6820, HEB: 4955, 2CO: 4469 \n",
" \n",
- "Nodetype: book \n",
+ "Nodetype: book \n",
"\n",
"Dataset 1: 1Cor: 1, 1John: 1, 1Pet: 1, 1Thess: 1, 1Tim: 1, 2Cor: 1, 2John: 1, 2Pet: 1, 2Thess: 1, 2Tim: 1 \n",
"Dataset 2: 1CO: 1, 1JN: 1, 1PE: 1, 1TH: 1, 1TI: 1, 2CO: 1, 2JN: 1, 2PE: 1, 2TH: 1, 2TI: 1 \n",
" \n",
- "Nodetype: wg \n",
+ "Nodetype: wg \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: LUK: 8945, MAT: 8165, ACT: 7770, JHN: 7207, MRK: 5363, REV: 3895, 1CO: 3160, ROM: 2799, HEB: 1977, 2CO: 1852 \n",
" \n",
- "Nodetype: group \n",
+ "Nodetype: group \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: ACT: 1288, LUK: 1232, MAT: 1165, REV: 909, JHN: 882, MRK: 753, 1CO: 431, ROM: 362, HEB: 325, 2CO: 222 \n",
" \n",
- "Nodetype: sentence \n",
+ "Nodetype: sentence \n",
"\n",
"Dataset 1: None \n",
"Dataset 2: LUK: 2833, MAT: 2636, JHN: 2626, ACT: 2245, MRK: 1750, 1CO: 1242, REV: 1183, ROM: 1036, 2CO: 721, HEB: 612 \n",
@@ -3944,7 +3936,7 @@
" \n",
" \n",
" \n",
- "Created on 2024-09-26 18:22:40 with Doc4TF tool displayDeltaBetweenVersions version 0.2.
\n",
+ "Created on 2024-09-26 21:07:30 with Doc4TF tool displayDeltaBetweenVersions version 0.2.
\n",
""
],
"text/plain": [
@@ -4024,10 +4016,10 @@
" making both features and nodetypes collapsible.\n",
" \"\"\"\n",
" html = []\n",
- " html.append(\"\")\n",
- " html.append(\"\")\n",
- " html.append(\"\")\n",
" html.append(\"\"\"\n",
+ " \n",
+ " \n",
+ " \n",
" \n",
" \n",
" Delta Report \n",
@@ -4061,22 +4053,39 @@
" });\n",
" }\n",
" \n",
+ " \n",
+ " \n",
" \"\"\")\n",
- " html.append(\"\")\n",
- " html.append(\"\")\n",
+ " \n",
+ " # get details on the two Text-Fabric datasets\n",
+ " liveName1=f'{A1.appName} - {A1.version}'\n",
+ " if A1.provenance: \n",
+ " for parts in A1.provenance[0]:\n",
+ " if isinstance(parts, tuple):\n",
+ " key, value = parts[0], parts[1]\n",
+ " if key == 'live': \n",
+ " liveName1=value[0]\n",
+ " break\n",
+ " \n",
+ " liveName2=f'{A2.appName} - {A2.version}'\n",
+ " if A2.provenance: \n",
+ " for parts in A2.provenance[0]:\n",
+ " if isinstance(parts, tuple):\n",
+ " key, value = parts[0], parts[1]\n",
+ " if key == 'live': \n",
+ " liveName2=value[0]\n",
+ " break\n",
"\n",
" html.append(\"Delta Report \")\n",
- " html.append(f\"Dataset 1: {A1.appName} - {A1.version}
\")\n",
- " html.append(f\"Dataset 2: {A2.appName} - {A2.version}
\")\n",
+ " html.append(f\"Dataset 1: {liveName1}
\")\n",
+ " html.append(f\"Dataset 2: {liveName2}
\")\n",
" \n",
" # Add buttons to expand or collapse all details\n",
- " html.append(\"Expand All \")\n",
" html.append(\"Collapse All \")\n",
- " html.append(\"Expand Features \")\n",
- " html.append(\"Collapse Features \")\n",
- " html.append(\"Expand Nodetypes \")\n",
- " html.append(\"Collapse Nodetypes \")\n",
- "\n",
+ " html.append(\"Expand up to second level \") \n",
+ " html.append(\"Expand up to third level \")\n",
+ " html.append(\"Expand All \")\n",
+ " \n",
" # check for node name and number-range differences\n",
" \n",
" # Initialize empty dictionaries\n",
@@ -4093,57 +4102,49 @@
" # Check if either set is not empty and print if true\n",
" if nodes_in_1_not_in_2 or nodes_in_2_not_in_1: \n",
" if nodes_in_1_not_in_2:\n",
- " html.append(\" Nodenames only in Dataset 1 \")\n",
+ " html.append(\" Nodenames only in Dataset 1 \")\n",
" for node in nodes_in_1_not_in_2:\n",
" html.append(f\"{node} \")\n",
" html.append(\" \")\n",
" if nodes_in_2_not_in_1:\n",
- " html.append(\" Nodenames only in Dataset 2 \")\n",
+ " html.append(\" Nodenames only in Dataset 2 \")\n",
" for node in nodes_in_2_not_in_1:\n",
" html.append(f\"{node} \")\n",
" html.append(\" \")\n",
- " html.append(\" \")\n",
"\n",
" # Compare tuple content for node number differences\n",
" common_keys = set(nodeIntervals1.keys()) & set(nodeIntervals2.keys())\n",
" different_values = {key: {'nodeIntervals1': nodeIntervals1[key], 'nodeIntervals2': nodeIntervals2[key]} \n",
" for key in common_keys if nodeIntervals1[key] != nodeIntervals2[key]}\n",
" if different_values:\n",
- " html.append(\"Differences in nodenumber range for common nodenames \")\n",
+ " html.append(\"Differences in nodenumber range for common nodenames \")\n",
" for key, diff in different_values.items():\n",
- " html.append(f\"Nodename {key} \")\n",
+ " html.append(f\"Nodename {key} \")\n",
" html.append(f\"Dataset 1: {diff['nodeIntervals1']} \")\n",
" html.append(f\"Dataset 2: {diff['nodeIntervals2']} \")\n",
- " html.append(f\" \")\n",
+ " html.append(f\" \")\n",
" html.append(\" \")\n",
" \n",
" # check for feature differences\n",
" # Features only in dict1\n",
" if report['only_in_dict1']:\n",
- " html.append(\"\")\n",
- " html.append(\"Features only in Dataset 1 \")\n",
- " html.append(\"\")\n",
- " for feature in report['only_in_dict1']:\n",
- " html.append(f\"{feature} \")\n",
+ " html.append(\"Features only in Dataset 1 \")\n",
+ " for feature in report['only_in_dict1']: html.append(f\"{feature} \")\n",
" html.append(\" \")\n",
"\n",
" # Features only in dict2\n",
" if report['only_in_dict2']:\n",
- " html.append(\"\")\n",
- " html.append(\"Features only in Dataset 2 \")\n",
- " html.append(\"\")\n",
- " for feature in report['only_in_dict2']:\n",
- " html.append(f\"{feature} \")\n",
- " html.append(\" \")\n",
- " html.append(\" \")\n",
+ " html.append(\"Features only in Dataset 2 \")\n",
+ " for feature in report['only_in_dict2']: html.append(f\"{feature} \")\n",
+ " html.append(\" \")\n",
"\n",
" # Differences in common features\n",
" if report['differences_in_common']:\n",
- " html.append(\"\")\n",
+ " html.append(\"\")\n",
" html.append(\"Differences in Common Features \")\n",
" html.append(\"\")\n",
" for feature, diffs in report['differences_in_common'].items():\n",
- " html.append(f\"Feature: {feature} \")\n",
+ " html.append(f\"Feature: {feature} \")\n",
" html.append(\"\")\n",
" for key, change in diffs.items():\n",
" if key in ['descr', 'type', 'datatype']:\n",
@@ -4154,9 +4155,9 @@
" html.append(\" \")\n",
" elif key == 'freqlist':\n",
" freqlist = change\n",
- " html.append(\"Frequency List Differences \")\n",
+ " html.append(\"Frequency List Differences \")\n",
" for nodetype, freq_diff in freqlist.items():\n",
- " html.append(f\"Nodetype: {nodetype} \")\n",
+ " html.append(f\"Nodetype: {nodetype} \")\n",
" html.append(\"\")\n",
" dataset1_val = ', '.join([f\"{t[0]}: {t[1]}\" for t in freq_diff['Dataset1']]) if freq_diff['Dataset1'] else 'None'\n",
" dataset2_val = ', '.join([f\"{t[0]}: {t[1]}\" for t in freq_diff['Dataset2']]) if freq_diff['Dataset2'] else 'None'\n",
@@ -4182,9 +4183,7 @@
"report_html = generate_html_delta_report(delta_report)\n",
"\n",
"# Display the report in the Jupyter Notebook\n",
- "display_html_report(report_html)\n",
- "\n",
- "\n"
+ "display_html_report(report_html)"
]
},
{
@@ -4202,7 +4201,7 @@
},
{
"cell_type": "code",
- "execution_count": 61,
+ "execution_count": 192,
"id": "ad12cf7d-4853-447a-b2f7-395ac738467a",
"metadata": {},
"outputs": [
@@ -4210,7 +4209,7 @@
"data": {
"text/html": [
"\n",
- " \n",
+ " \n",
" Download HTML File \n",
" \n",
" "
@@ -4219,7 +4218,7 @@
""
]
},
- "execution_count": 61,
+ "execution_count": 192,
"metadata": {},
"output_type": "execute_result"
}
@@ -4240,7 +4239,6 @@
" '''\n",
" return HTML(download_link)\n",
"\n",
- "\n",
"# Display the download link in the notebook\n",
"create_download_link(report_html, 'report.html')"
]
@@ -4287,14 +4285,6 @@
"source": [
"Licenced under [Creative Commons Attribution 4.0 International (CC BY 4.0)](https://github.com/tonyjurg/Doc4TF/blob/main/LICENCE.md)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7f920334-abe1-41ef-b98a-4e3f5ff04f67",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {