diff --git a/app/lib/getCharacterDerivationsChain.test.ts b/app/lib/getCharacterDerivationsChain.test.ts index 64f734ddc..fad4d5dd7 100644 --- a/app/lib/getCharacterDerivationsChain.test.ts +++ b/app/lib/getCharacterDerivationsChain.test.ts @@ -62,4 +62,31 @@ describe("getCharacterDerivationsChain", () => { new CharacterOriginReference("𤯞", "生", CharacterOriginType.phonetic), ]); }); + + it("works for 党", async () => { + const chain = await getCharacterDerivationsChain( + "党", + new CharacterOriginReference( + "党", + "黨", + CharacterOriginType.simplification, + ), + async (id) => { + const map = new Map([ + ["党", "→黨 簡体"], + ["黨", "⿰黑尚 尚聲 3840230"], + ]); + return map.get(id) ?? null; + }, + ); + console.log(chain); + expect(chain).toEqual([ + new CharacterOriginReference( + "党", + "黨", + CharacterOriginType.simplification, + ), + new CharacterOriginReference("黨", "尚", CharacterOriginType.phonetic), + ]); + }); }); diff --git a/prisma/external/seedKanjiDbComposition.ts b/prisma/external/seedKanjiDbComposition.ts index 4bce89d37..d4d68e4f3 100644 --- a/prisma/external/seedKanjiDbComposition.ts +++ b/prisma/external/seedKanjiDbComposition.ts @@ -59,7 +59,11 @@ async function getDbInput() { const [, figureId, etymology] = line.match(/\S+\t&?([^&;\s]+);?\t(.+)/u)!; if (!figureId || !etymology) throw new Error(line); if (!dbInput[figureId]) console.warn(`no id for ${figureId} in ${line}`); - if (dbInput[figureId]) dbInput[figureId].etymology = etymology; + if (dbInput[figureId]?.etymology) { + console.warn( + `duplicate etymology for ${figureId} prioritizing first: ${dbInput[figureId].etymology}`, + ); + } else if (dbInput[figureId]) dbInput[figureId].etymology = etymology; }); const sbgyJson = readJsonSync< diff --git a/prisma/kanjisense/parseEtymologyText.ts b/prisma/kanjisense/parseEtymologyText.ts index 78f9962b1..a56f64658 100644 --- a/prisma/kanjisense/parseEtymologyText.ts +++ b/prisma/kanjisense/parseEtymologyText.ts @@ -41,7 +41,7 @@ export class CharacterOriginReference { export function parseEtymologyText(character: string, text: string) { if (text.includes(" 或字 ")) return null; - const [, parentMatch] = text.match(/^→(\S+)[^簡体]*(?#.+)?$/u) || []; + const [, parentMatch] = text.match(/^→(\S+)(\t(簡体))?/u) || []; if (parentMatch) { return new CharacterOriginReference( character, @@ -51,7 +51,7 @@ export function parseEtymologyText(character: string, text: string) { } const [, soundMarkMatch] = - text.match(/[\s】/]([^形])[省亦]?[聲声](.*)?$/u) || []; + text.match(/^\S+[\s】/]([^形])[省亦]?[聲声](.*)?$/u) || []; if (soundMarkMatch) { return new CharacterOriginReference( character, @@ -60,5 +60,21 @@ export function parseEtymologyText(character: string, text: string) { ); } + if (text === "象形") return null; + if (/^\S+\t(象形|指事|指示|會意)([\t].+)?/.test(text)) return null; + + if (/^←\S+/.test(text)) return null; + + if (/^