Skip to content

Commit

Permalink
Implement readings auto-lookup
Browse files Browse the repository at this point in the history
  • Loading branch information
justinsilvestre committed Feb 12, 2024
1 parent a6e08f3 commit 01ad4f7
Show file tree
Hide file tree
Showing 21 changed files with 1,286 additions and 138 deletions.
580 changes: 556 additions & 24 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
},
"dependencies": {
"@floating-ui/react": "^0.26.9",
"@silvestre/cjk-unihan": "^0.0.3-0",
"markdown-to-jsx": "^7.4.1",
"next": "14.1.0",
"react": "^18",
Expand Down
213 changes: 203 additions & 10 deletions src/app/prebuild.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import {
LexiconEntry,
Passage,
PassageVocab,
VocabEntryPronunciationKey,
parsePassage,
parsePassageVocabList,
vocabFileColumns,
} from "@/app/texts/Passage";
import {
getPassageFileContents,
getPassageVocabFileContents,
getPassageVocabFilePath,
getTextsIds,
} from "@/app/texts/files";
import * as fs from "fs";
Expand All @@ -18,17 +21,24 @@ import {
toEnMatchKeyword,
} from "./lexiconEntryEnKeywords";

// @ts-expect-error no typings
import unihan from "@silvestre/cjk-unihan";
import { normalizeText } from "./texts/[textId]/punctuation";

// Directory where all prebuild artifacts (lexicon.json + per-passage vocab JSONs) land.
const prebuildDirectoryPath = path.join(process.cwd(), "prebuild");

if (!fs.existsSync(prebuildDirectoryPath)) {
  fs.mkdirSync(prebuildDirectoryPath);
}

// First backfill any missing readings into the vocab TSVs (async Unihan lookups),
// then aggregate the now-complete TSVs into the lexicon and per-passage JSONs.
fillInMissingReadingsInTsvs()
  .then(() => {
    const lexicon = aggregateVocabulary();
    const lexiconFilePath = path.join(prebuildDirectoryPath, "lexicon.json");
    fs.writeFileSync(lexiconFilePath, JSON.stringify(lexicon, null, 2));
    console.log(`Wrote lexicon to ${lexiconFilePath}`);
    writePassageVocabularyJsons(lexicon);
    console.log(`Done writing vocab jsons`);
  })
  .catch((err) => {
    // Fail loudly with a nonzero exit so a broken prebuild can't pass silently in CI.
    console.error(err);
    process.exit(1);
  });

function aggregateVocabulary() {
const textsIds = getTextsIds();
Expand All @@ -41,28 +51,176 @@ function aggregateVocabulary() {
return lexicon;
}

/**
 * Scans every text's vocab TSV and fills in missing pronunciation fields
 * (pinyin, jyutping, Korean, Vietnamese) via Unihan lookups, rewriting the
 * TSV on disk whenever its content changed.
 *
 * For "brandt-" texts, characters appearing for the first time across the
 * brandt sequence are also auto-added even when the TSV had no row for them.
 */
async function fillInMissingReadingsInTsvs() {
  // Characters already seen in earlier brandt passages; only new ones get auto-added.
  const registeredChars = new Set<string>();
  for (const textId of getTextsIds()) {
    const isBrandtPassage = textId.startsWith("brandt-");
    const vocabFileContents = getPassageVocabFileContents(textId);
    const vocab = parsePassageVocabList(vocabFileContents);
    const passage = parsePassage(getPassageFileContents(textId));
    const passageChars = getPassageChars(passage);

    const newCharsInPassage = [...passageChars].filter(
      (char) => !registeredChars.has(char)
    );

    // Characters needing an entry: everything already in the TSV, plus (for
    // brandt passages) characters not yet seen in any earlier brandt passage.
    const featuredChars = new Set(
      Object.keys(vocab).concat(isBrandtPassage ? newCharsInPassage : [])
    );

    for (const char of featuredChars) {
      const hasIncompleteEntry = vocab[char]?.some((e) =>
        vocabFileColumns.some((k) => !e[k.key])
      );
      if (!vocab[char] || hasIncompleteEntry) {
        const unihanResult = await getUnihan(char);
        vocab[char] = (
          vocab[char] || [
            {
              en: null,
              jyutping: null,
              kr: null,
              pinyin: null,
              vi: null,
            },
          ]
        ).map((e) => ({
          en: e.en || null,
          // When Unihan offers multiple candidate readings, each is suffixed
          // with "?" to flag it for manual review, and they are comma-joined.
          jyutping:
            e.jyutping ||
            unihanResult?.kCantonese
              ?.split(/\s/)
              .map((r, _, segments) =>
                segments.length > 1
                  ? `${convertToneNumbersToSuperscript(r)}?`
                  : convertToneNumbersToSuperscript(r)
              )
              .join(", ") ||
            null,
          pinyin:
            e.pinyin ||
            getMandarinReadings(char, unihanResult)
              .map((r, _, segments) => (segments.length > 1 ? `${r}?` : r))
              .join(", ") ||
            null,
          vi:
            e.vi ||
            unihanResult?.kVietnamese
              ?.split(/\s/)
              .map((r, _, segments) => (segments.length > 1 ? `${r}?` : r))
              .join(", ") ||
            null,
          kr:
            e.kr ||
            unihanResult?.kHangul
              ?.split(/\s/)
              .map((r) => r.split(":")[0]) // drop the source tag after ":"
              .map((r, _, segments) => (segments.length > 1 ? `${r}?` : r))
              .join(", ") ||
            null,
        }));
      }
    }

    const newVocabFileContents = makeVocabTsvContent(vocab);

    if (
      Object.keys(vocab).length &&
      newVocabFileContents !== vocabFileContents
    ) {
      const vocabFilePath = getPassageVocabFilePath(textId);
      // Check existence BEFORE writing so the log message is accurate;
      // the original checked after writeFileSync, so "Created new file"
      // was unreachable.
      const fileExisted = fs.existsSync(vocabFilePath);
      fs.writeFileSync(vocabFilePath, newVocabFileContents);
      if (fileExisted) console.log(`Overwrote file ${vocabFilePath}`);
      else console.log(`Created new file ${vocabFilePath}`);
    }

    if (isBrandtPassage)
      for (const char of passageChars) {
        registeredChars.add(char);
      }
  }
}

/**
 * Collects the set of characters appearing in a passage: its normalized main
 * lines, plus backtick-quoted segments in the frontmatter description and notes.
 */
function getPassageChars(passage: Passage) {
  const descriptionChars = getEmbeddedChineseSegments(
    passage.frontmatter.description
  ).join("");
  const lineChars = passage.lines
    .map((line) => normalizeText(line.chinese))
    .join("");
  const noteChars = Object.values(passage.notes)
    .flatMap((note) => getEmbeddedChineseSegments(note))
    .join("");
  return new Set(descriptionChars + lineChars + noteChars);
}

/**
 * Extracts backtick-delimited segments from markup text, normalized.
 * Segments containing whitespace are excluded by the pattern.
 */
function getEmbeddedChineseSegments(text: string) {
  const matches = Array.from(text.matchAll(/(?<=`)[^`\s\n]+(?=`)/g));
  return matches.map((match) => normalizeText(match[0]));
}

function writePassageVocabularyJsons(lexicon: PassageVocab) {
for (const textId of getTextsIds()) {
const vocab = parsePassageVocabList(getPassageVocabFileContents(textId));
const vocabFileContents = getPassageVocabFileContents(textId);
const vocab = parsePassageVocabList(vocabFileContents);
const passage = parsePassage(getPassageFileContents(textId));

const vocabFilePath = path.join(
const vocabJsonPath = path.join(
prebuildDirectoryPath,
`${textId}.vocab.json`
);
const passageChars = new Set(passage.lines.map((l) => l.chinese).join(""));
const passageChars = getPassageChars(passage);

for (const char of passageChars) {
if (!vocab[char]) {
vocab[char] = lexicon[char];
}
}

fs.writeFileSync(vocabFilePath, JSON.stringify(vocab, null, 2), "utf-8");
console.log(`Wrote vocab for ${textId} to ${vocabFilePath}`);
fs.writeFileSync(vocabJsonPath, JSON.stringify(vocab, null, 2), "utf-8");
console.log(`Wrote vocab for ${textId} to ${vocabJsonPath}`);
}
}

/**
 * Pulls candidate Mandarin readings from a Unihan record: `kMandarin` when
 * present, otherwise readings parsed out of `kHanyuPinyin` entries of the
 * form "location:reading,reading". Returns [] when neither field is present.
 */
function getMandarinReadings(
  char: string,
  unihanResult: Partial<Record<string, string>>
) {
  const mandarinField = unihanResult?.kMandarin;
  if (mandarinField !== undefined) {
    const readings = mandarinField.split(/\s/);
    if (readings.length) return readings;
  }
  const hanyuPinyinReadings =
    unihanResult?.kHanyuPinyin
      ?.split(/\s/)
      .flatMap((segment) => segment.split(":")[1]?.split(",") || segment) || [];
  return hanyuPinyinReadings;
}

/** Serializes a vocab map back into TSV text, one row per entry per character. */
function makeVocabTsvContent(
  vocab: Partial<Record<string, LexiconEntry[]>>
): string | NodeJS.ArrayBufferView {
  const header = `Traditional\tHanyu Pinyin\tJyutping\tKorean\tVietnamese\tEnglish`;
  const rows: string[] = [];
  for (const [char, entries] of Object.entries(vocab)) {
    for (const entry of entries || []) {
      rows.push(
        [char, entry.pinyin, entry.jyutping, entry.kr, entry.vi, entry.en].join(
          "\t"
        )
      );
    }
  }
  return [header, ...rows].join("\n");
}

function mergeVocab(a: PassageVocab, b: PassageVocab): PassageVocab {
const merged: PassageVocab = { ...a };
for (const chinese in b) {
Expand Down Expand Up @@ -155,3 +313,38 @@ function mergeEntryPronunciation(
const mergedSegments = [...new Set([...aSegments, ...bSegments])].join(",");
return mergedSegments;
}

/**
 * Promisified wrapper around the callback-based unihan lookup.
 * Empty/falsy fields are stripped from the record before resolving.
 */
async function getUnihan(
  char: string
): Promise<Partial<Record<string, string>>> {
  return new Promise((resolve, reject) => {
    unihan.get(char, (err: any, result: any) => {
      if (err) {
        reject(err);
        return;
      }
      for (const key in result) {
        if (!result[key]) delete result[key];
      }
      resolve(result);
    });
  });
}

// Map from ASCII digit to its Unicode superscript form.
const SUPERSCRIPT_NUMBERS = {
  "0": "⁰",
  "1": "¹",
  "2": "²",
  "3": "³",
  "4": "⁴",
  "5": "⁵",
  "6": "⁶",
  "7": "⁷",
  "8": "⁸",
  "9": "⁹",
};
/**
 * Converts a single trailing tone digit to superscript, e.g. "zung1" → "zung¹".
 * Strings without a trailing digit are returned unchanged.
 */
function convertToneNumbersToSuperscript(string: string) {
  if (!/[0-9]$/.test(string)) return string;
  const digit = string.slice(-1) as keyof typeof SUPERSCRIPT_NUMBERS;
  return string.slice(0, -1) + SUPERSCRIPT_NUMBERS[digit];
}
8 changes: 7 additions & 1 deletion src/app/texts/Passage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@ export type LexiconEntry = Record<VocabEntryPronunciationKey, string | null>;

export type PassageVocab = Partial<Record<string, LexiconEntry[]>>;

// Keys under which a vocab entry stores its readings/translation.
export type VocabEntryPronunciationKey =
  | "vi"
  | "jyutping"
  | "pinyin"
  | "en"
  | "kr";
// Column layout of the vocab TSV files, mapping heading text to entry key.
export const vocabFileColumns = [
  { heading: "Vietnamese", key: "vi" },
  { heading: "Jyutping", key: "jyutping" },
  { heading: "English", key: "en" },
  { heading: "Hanyu Pinyin", key: "pinyin" },
  { heading: "Korean", key: "kr" },
] as const;

export function parsePassageVocabList(vocabFileContents: string | null) {
Expand Down
Loading

0 comments on commit 01ad4f7

Please sign in to comment.