Skip to content

Commit

Permalink
Implement readings auto-lookup
Browse files Browse the repository at this point in the history
  • Loading branch information
justinsilvestre committed Feb 12, 2024
1 parent a6e08f3 commit 01ad4f7
Show file tree
Hide file tree
Showing 21 changed files with 1,286 additions and 138 deletions.
580 changes: 556 additions & 24 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
},
"dependencies": {
"@floating-ui/react": "^0.26.9",
"@silvestre/cjk-unihan": "^0.0.3-0",
"markdown-to-jsx": "^7.4.1",
"next": "14.1.0",
"react": "^18",
Expand Down
213 changes: 203 additions & 10 deletions src/app/prebuild.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import {
LexiconEntry,
Passage,
PassageVocab,
VocabEntryPronunciationKey,
parsePassage,
parsePassageVocabList,
vocabFileColumns,
} from "@/app/texts/Passage";
import {
getPassageFileContents,
getPassageVocabFileContents,
getPassageVocabFilePath,
getTextsIds,
} from "@/app/texts/files";
import * as fs from "fs";
Expand All @@ -18,17 +21,24 @@ import {
toEnMatchKeyword,
} from "./lexiconEntryEnKeywords";

// @ts-expect-error no typings
import unihan from "@silvestre/cjk-unihan";
import { normalizeText } from "./texts/[textId]/punctuation";

// Directory where all prebuild artifacts (lexicon.json + per-passage vocab JSONs) land.
const prebuildDirectoryPath = path.join(process.cwd(), "prebuild");

if (!fs.existsSync(prebuildDirectoryPath)) {
  fs.mkdirSync(prebuildDirectoryPath);
}

// First backfill any missing readings into the vocab TSVs (async Unihan lookups),
// then aggregate the now-complete TSVs into the lexicon and per-passage JSONs.
fillInMissingReadingsInTsvs()
  .then(() => {
    const lexicon = aggregateVocabulary();
    const lexiconFilePath = path.join(prebuildDirectoryPath, "lexicon.json");
    fs.writeFileSync(lexiconFilePath, JSON.stringify(lexicon, null, 2));
    console.log(`Wrote lexicon to ${lexiconFilePath}`);
    writePassageVocabularyJsons(lexicon);
    console.log(`Done writing vocab jsons`);
  })
  .catch((err) => {
    // Fail loudly with a nonzero exit so a broken prebuild can't pass silently in CI.
    console.error(err);
    process.exit(1);
  });

function aggregateVocabulary() {
const textsIds = getTextsIds();
Expand All @@ -41,28 +51,176 @@ function aggregateVocabulary() {
return lexicon;
}

/**
 * Scans every text's vocab TSV and fills in missing pronunciation fields
 * (pinyin, jyutping, Korean, Vietnamese) via Unihan lookups, rewriting the
 * TSV on disk whenever its content changed.
 *
 * For "brandt-" texts, characters appearing for the first time across the
 * brandt sequence are also auto-added even when the TSV had no row for them.
 */
async function fillInMissingReadingsInTsvs() {
  // Characters already seen in earlier brandt passages; only new ones get auto-added.
  const registeredChars = new Set<string>();
  for (const textId of getTextsIds()) {
    const isBrandtPassage = textId.startsWith("brandt-");
    const vocabFileContents = getPassageVocabFileContents(textId);
    const vocab = parsePassageVocabList(vocabFileContents);
    const passage = parsePassage(getPassageFileContents(textId));
    const passageChars = getPassageChars(passage);

    const newCharsInPassage = [...passageChars].filter(
      (char) => !registeredChars.has(char)
    );

    // Characters needing an entry: everything already in the TSV, plus (for
    // brandt passages) characters not yet seen in any earlier brandt passage.
    const featuredChars = new Set(
      Object.keys(vocab).concat(isBrandtPassage ? newCharsInPassage : [])
    );

    for (const char of featuredChars) {
      const hasIncompleteEntry = vocab[char]?.some((e) =>
        vocabFileColumns.some((k) => !e[k.key])
      );
      if (!vocab[char] || hasIncompleteEntry) {
        const unihanResult = await getUnihan(char);
        vocab[char] = (
          vocab[char] || [
            {
              en: null,
              jyutping: null,
              kr: null,
              pinyin: null,
              vi: null,
            },
          ]
        ).map((e) => ({
          en: e.en || null,
          // When Unihan offers multiple candidate readings, each is suffixed
          // with "?" to flag it for manual review, and they are comma-joined.
          jyutping:
            e.jyutping ||
            unihanResult?.kCantonese
              ?.split(/\s/)
              .map((r, _, segments) =>
                segments.length > 1
                  ? `${convertToneNumbersToSuperscript(r)}?`
                  : convertToneNumbersToSuperscript(r)
              )
              .join(", ") ||
            null,
          pinyin:
            e.pinyin ||
            getMandarinReadings(char, unihanResult)
              .map((r, _, segments) => (segments.length > 1 ? `${r}?` : r))
              .join(", ") ||
            null,
          vi:
            e.vi ||
            unihanResult?.kVietnamese
              ?.split(/\s/)
              .map((r, _, segments) => (segments.length > 1 ? `${r}?` : r))
              .join(", ") ||
            null,
          kr:
            e.kr ||
            unihanResult?.kHangul
              ?.split(/\s/)
              .map((r) => r.split(":")[0]) // drop the source tag after ":"
              .map((r, _, segments) => (segments.length > 1 ? `${r}?` : r))
              .join(", ") ||
            null,
        }));
      }
    }

    const newVocabFileContents = makeVocabTsvContent(vocab);

    if (
      Object.keys(vocab).length &&
      newVocabFileContents !== vocabFileContents
    ) {
      const vocabFilePath = getPassageVocabFilePath(textId);
      // Check existence BEFORE writing so the log message is accurate;
      // the original checked after writeFileSync, so "Created new file"
      // was unreachable.
      const fileExisted = fs.existsSync(vocabFilePath);
      fs.writeFileSync(vocabFilePath, newVocabFileContents);
      if (fileExisted) console.log(`Overwrote file ${vocabFilePath}`);
      else console.log(`Created new file ${vocabFilePath}`);
    }

    if (isBrandtPassage)
      for (const char of passageChars) {
        registeredChars.add(char);
      }
  }
}

/**
 * Collects the set of characters appearing in a passage: its normalized main
 * lines, plus backtick-quoted segments in the frontmatter description and notes.
 */
function getPassageChars(passage: Passage) {
  const descriptionChars = getEmbeddedChineseSegments(
    passage.frontmatter.description
  ).join("");
  const lineChars = passage.lines
    .map((line) => normalizeText(line.chinese))
    .join("");
  const noteChars = Object.values(passage.notes)
    .flatMap((note) => getEmbeddedChineseSegments(note))
    .join("");
  return new Set(descriptionChars + lineChars + noteChars);
}

/**
 * Extracts backtick-delimited segments from markup text, normalized.
 * Segments containing whitespace are excluded by the pattern.
 */
function getEmbeddedChineseSegments(text: string) {
  const matches = Array.from(text.matchAll(/(?<=`)[^`\s\n]+(?=`)/g));
  return matches.map((match) => normalizeText(match[0]));
}

function writePassageVocabularyJsons(lexicon: PassageVocab) {
for (const textId of getTextsIds()) {
const vocab = parsePassageVocabList(getPassageVocabFileContents(textId));
const vocabFileContents = getPassageVocabFileContents(textId);
const vocab = parsePassageVocabList(vocabFileContents);
const passage = parsePassage(getPassageFileContents(textId));

const vocabFilePath = path.join(
const vocabJsonPath = path.join(
prebuildDirectoryPath,
`${textId}.vocab.json`
);
const passageChars = new Set(passage.lines.map((l) => l.chinese).join(""));
const passageChars = getPassageChars(passage);

for (const char of passageChars) {
if (!vocab[char]) {
vocab[char] = lexicon[char];
}
}

fs.writeFileSync(vocabFilePath, JSON.stringify(vocab, null, 2), "utf-8");
console.log(`Wrote vocab for ${textId} to ${vocabFilePath}`);
fs.writeFileSync(vocabJsonPath, JSON.stringify(vocab, null, 2), "utf-8");
console.log(`Wrote vocab for ${textId} to ${vocabJsonPath}`);
}
}

/**
 * Pulls candidate Mandarin readings from a Unihan record: `kMandarin` when
 * present, otherwise readings parsed out of `kHanyuPinyin` entries of the
 * form "location:reading,reading". Returns [] when neither field is present.
 */
function getMandarinReadings(
  char: string,
  unihanResult: Partial<Record<string, string>>
) {
  const mandarinField = unihanResult?.kMandarin;
  if (mandarinField !== undefined) {
    const readings = mandarinField.split(/\s/);
    if (readings.length) return readings;
  }
  const hanyuPinyinReadings =
    unihanResult?.kHanyuPinyin
      ?.split(/\s/)
      .flatMap((segment) => segment.split(":")[1]?.split(",") || segment) || [];
  return hanyuPinyinReadings;
}

/** Serializes a vocab map back into TSV text, one row per entry per character. */
function makeVocabTsvContent(
  vocab: Partial<Record<string, LexiconEntry[]>>
): string | NodeJS.ArrayBufferView {
  const header = `Traditional\tHanyu Pinyin\tJyutping\tKorean\tVietnamese\tEnglish`;
  const rows: string[] = [];
  for (const [char, entries] of Object.entries(vocab)) {
    for (const entry of entries || []) {
      rows.push(
        [char, entry.pinyin, entry.jyutping, entry.kr, entry.vi, entry.en].join(
          "\t"
        )
      );
    }
  }
  return [header, ...rows].join("\n");
}

function mergeVocab(a: PassageVocab, b: PassageVocab): PassageVocab {
const merged: PassageVocab = { ...a };
for (const chinese in b) {
Expand Down Expand Up @@ -155,3 +313,38 @@ function mergeEntryPronunciation(
const mergedSegments = [...new Set([...aSegments, ...bSegments])].join(",");
return mergedSegments;
}

/**
 * Promisified wrapper around the callback-based unihan lookup.
 * Empty/falsy fields are stripped from the record before resolving.
 */
async function getUnihan(
  char: string
): Promise<Partial<Record<string, string>>> {
  return new Promise((resolve, reject) => {
    unihan.get(char, (err: any, result: any) => {
      if (err) {
        reject(err);
        return;
      }
      for (const key in result) {
        if (!result[key]) delete result[key];
      }
      resolve(result);
    });
  });
}

// Map from ASCII digit to its Unicode superscript form.
const SUPERSCRIPT_NUMBERS = {
  "0": "⁰",
  "1": "¹",
  "2": "²",
  "3": "³",
  "4": "⁴",
  "5": "⁵",
  "6": "⁶",
  "7": "⁷",
  "8": "⁸",
  "9": "⁹",
};
/**
 * Converts a single trailing tone digit to superscript, e.g. "zung1" → "zung¹".
 * Strings without a trailing digit are returned unchanged.
 */
function convertToneNumbersToSuperscript(string: string) {
  if (!/[0-9]$/.test(string)) return string;
  const digit = string.slice(-1) as keyof typeof SUPERSCRIPT_NUMBERS;
  return string.slice(0, -1) + SUPERSCRIPT_NUMBERS[digit];
}
8 changes: 7 additions & 1 deletion src/app/texts/Passage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@ export type LexiconEntry = Record<VocabEntryPronunciationKey, string | null>;

export type PassageVocab = Partial<Record<string, LexiconEntry[]>>;

// Keys under which a vocab entry stores its readings/translation.
export type VocabEntryPronunciationKey =
  | "vi"
  | "jyutping"
  | "pinyin"
  | "en"
  | "kr";
// Column layout of the vocab TSV files, mapping heading text to entry key.
export const vocabFileColumns = [
  { heading: "Vietnamese", key: "vi" },
  { heading: "Jyutping", key: "jyutping" },
  { heading: "English", key: "en" },
  { heading: "Hanyu Pinyin", key: "pinyin" },
  { heading: "Korean", key: "kr" },
] as const;

export function parsePassageVocabList(vocabFileContents: string | null) {
Expand Down
Loading

0 comments on commit 01ad4f7

Please sign in to comment.