Skip to content

Commit

Permalink
Get first entry in kdb etym data
Browse files Browse the repository at this point in the history
  • Loading branch information
justinsilvestre committed Nov 16, 2023
1 parent dc154c3 commit effddbd
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 4 deletions.
27 changes: 27 additions & 0 deletions app/lib/getCharacterDerivationsChain.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,31 @@ describe("getCharacterDerivationsChain", () => {
new CharacterOriginReference("𤯞", "生", CharacterOriginType.phonetic),
]);
});

// Regression case: 党 is listed as a simplified form pointing at 黨
// ("→黨 簡体"), and the entry for 黨 names 尚 as its sound component (尚聲).
// The chain is expected to contain the simplification link followed by the
// phonetic link taken from the parent's entry.
it("works for 党", async () => {
  // Fixture etymology entries keyed by figure id; built once rather than on
  // every lookup call.
  const etymologyTexts = new Map<string, string>([
    ["党", "→黨 簡体"],
    ["黨", "⿰黑尚 尚聲 3840230"],
  ]);
  // Stand-in lookup: resolves to the fixture text, or null for unknown ids.
  const lookUpEtymologyText = async (id: string) =>
    etymologyTexts.get(id) ?? null;

  const chain = await getCharacterDerivationsChain(
    "党",
    new CharacterOriginReference(
      "党",
      "黨",
      CharacterOriginType.simplification,
    ),
    lookUpEtymologyText,
  );

  expect(chain).toEqual([
    new CharacterOriginReference(
      "党",
      "黨",
      CharacterOriginType.simplification,
    ),
    new CharacterOriginReference("黨", "尚", CharacterOriginType.phonetic),
  ]);
});
});
6 changes: 5 additions & 1 deletion prisma/external/seedKanjiDbComposition.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,11 @@ async function getDbInput() {
const [, figureId, etymology] = line.match(/\S+\t&?([^&;\s]+);?\t(.+)/u)!;
if (!figureId || !etymology) throw new Error(line);
if (!dbInput[figureId]) console.warn(`no id for ${figureId} in ${line}`);
if (dbInput[figureId]) dbInput[figureId].etymology = etymology;
if (dbInput[figureId]?.etymology) {
console.warn(
`duplicate etymology for ${figureId} prioritizing first: ${dbInput[figureId].etymology}`,
);
} else if (dbInput[figureId]) dbInput[figureId].etymology = etymology;
});

const sbgyJson = readJsonSync<
Expand Down
20 changes: 18 additions & 2 deletions prisma/kanjisense/parseEtymologyText.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ export class CharacterOriginReference {
export function parseEtymologyText(character: string, text: string) {
if (text.includes(" 或字 ")) return null;

const [, parentMatch] = text.match(/^→(\S+)[^簡体]*(?<comment>#.+)?$/u) || [];
const [, parentMatch] = text.match(/^→(\S+)(\t(簡体))?/u) || [];
if (parentMatch) {
return new CharacterOriginReference(
character,
Expand All @@ -51,7 +51,7 @@ export function parseEtymologyText(character: string, text: string) {
}

const [, soundMarkMatch] =
text.match(/[\s】/]([^形])[省亦]?[聲声](.*)?$/u) || [];
text.match(/^\S+[\s】/]([^形])[省亦]?[聲声](.*)?$/u) || [];
if (soundMarkMatch) {
return new CharacterOriginReference(
character,
Expand All @@ -60,5 +60,21 @@ export function parseEtymologyText(character: string, text: string) {
);
}

if (text === "象形") return null;
if (/^\S+\t(象形|指事|指示|會意)([\t].+)?/.test(text)) return null;

if (/^←\S+/.test(text)) return null;

if (/^</.test(text)) return null;

// ⿰亻志 国字
if (/^\S+\t(\S+\t)?国字/.test(text)) return null;

//單 闕 0240050
if (/^\S+\t闕(\t|$)/.test(text)) return null;

if (/^\S(\t\d+)?/.test(text)) return null;

console.error(`Problem parsing etym text for ${character} ${text}`);
return null;
}
2 changes: 1 addition & 1 deletion prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ model KanjisenseFigure {
shuowenImage ShuowenImage? @relation(fields: [shuowenImageId], references: [id])
variantGroup KanjisenseVariantGroup? @relation(fields: [variantGroupId], references: [id])
meaning KanjisenseFigureMeaning?
reading KanjisenseFigureReading? @relation(fields: [readingId], references: [id], onDelete: NoAction, onUpdate: NoAction)
reading KanjisenseFigureReading? @relation(fields: [readingId], references: [id], onDelete: SetNull, onUpdate: SetNull)
// identical to id, but needed to allow figures to be deleted without deleting readings during updates
readingId String? @unique
allComponents KanjisenseComponent[] @relation("allComponents")
Expand Down

0 comments on commit effddbd

Please sign in to comment.