diff --git a/.gitignore b/.gitignore index d9d2e6354..22435d046 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,6 @@ node_modules /postgres-data *.log +*.log.json .DS_Store diff --git a/.vscode/settings.json b/.vscode/settings.json index 84db28ce5..ef7e15e1f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,7 @@ { "editor.formatOnSave": true, "editor.codeActionsOnSave": { - "source.fixAll": true + "source.fixAll": "explicit" }, "eslint.validate": ["javascript"] } diff --git a/app/components/AppLink.tsx b/app/components/AppLink.tsx index 38e416c53..23514e10f 100644 --- a/app/components/AppLink.tsx +++ b/app/components/AppLink.tsx @@ -4,6 +4,7 @@ import { useEffect, type ReactNode, useRef } from "react"; type LinkProps = T & { children: ReactNode; className?: string; + newWindow?: boolean; }; function AppLink({ @@ -11,12 +12,18 @@ function AppLink({ children, className = "underline hover:text-orange-600", linkRef, + newWindow, }: LinkProps<{ to: string; linkRef?: React.Ref; }>) { return ( - + {children} ); @@ -27,11 +34,13 @@ export function DictLink({ figureId, focusOnLoad, className, + newWindow, }: { children?: ReactNode; className?: string; } & Omit< LinkProps<{ + newWindow?: boolean; figureId: string; focusOnLoad?: boolean; }>, @@ -44,7 +53,12 @@ export function DictLink({ } }, [figureId, focusOnLoad]); return ( - + {children || figureId} ); diff --git a/app/components/FigureBadgeLink.tsx b/app/components/FigureBadgeLink.tsx new file mode 100644 index 000000000..e5f45a3e9 --- /dev/null +++ b/app/components/FigureBadgeLink.tsx @@ -0,0 +1,21 @@ +import { DictLink } from "~/components/AppLink"; +import { FigureBadge } from "~/components/FigureBadge"; +import { BadgeProps } from "~/features/dictionary/badgeFigure"; + +export function FigureBadgeLink({ + id: figureId, + badgeProps, + width, + newWindow, +}: { + id: string; + badgeProps: BadgeProps; + width?: number; + newWindow?: boolean; +}) { + return ( + + + + ); +} diff --git 
a/app/features/curate/CharactersProgress.tsx b/app/features/curate/CharactersProgress.tsx new file mode 100644 index 000000000..ea3cbc41b --- /dev/null +++ b/app/features/curate/CharactersProgress.tsx @@ -0,0 +1,432 @@ +import { KanjisenseFigure } from "@prisma/client"; +import { useState, Fragment } from "react"; +import type { PropsWithChildren, ReactNode } from "react"; + +import { CurationState } from "./getCurationState"; + +export function CharactersProgress({ + allFiguresKeysSet, + seenChars, + seenFigures, + getOnClickFigure, + remainingKanjisenseCharacters, + remainingMeaningfulComponents, + seenMeaningfulAtomicComponents, + nonAtomicCharactersSeenOnlyAsComponents, + atomicCharactersSeenOnlyAsComponents, +}: { + allFiguresKeysSet: Set; + seenChars: CurationState["seenCharacters"]; + seenFigures: CurationState["seenFigures"]; + getOnClickFigure: (char: string) => () => void; + + remainingKanjisenseCharacters: (KanjisenseFigure & { + asComponent: { + allUses: Pick[]; + } | null; + })[]; + remainingMeaningfulComponents: CurationState["remainingMeaningfulComponents"]; + atomicCharactersSeenOnlyAsComponents: Set; + seenMeaningfulFigures: CurationState["seenFigures"]; + seenMeaningfulAtomicComponents: CurationState["seenFigures"]; + nonAtomicCharactersSeenOnlyAsComponents: Set; +}) { + const [ + specialRemainingCharactersSortMethod, + setSpecialRemainingCharactersSortMethod, + ] = useState( + null, + ); + + function getListPriority(figure: KanjisenseFigure) { + if ( + figure.listsAsCharacter.includes("1") || + figure.listsAsComponent.includes("1") + ) + return 10; + if ( + figure.listsAsCharacter.includes("2") || + figure.listsAsComponent.includes("2") + ) + return 9; + if ( + figure.listsAsCharacter.includes("3") || + figure.listsAsComponent.includes("3") + ) + return 8; + if ( + figure.listsAsCharacter.includes("4") || + figure.listsAsComponent.includes("4") + ) + return 7; + if ( + figure.listsAsCharacter.includes("5") || + 
figure.listsAsComponent.includes("5") + ) + return 6; + if ( + figure.listsAsCharacter.includes("6") || + figure.listsAsComponent.includes("6") + ) + return 5; + if ( + figure.listsAsCharacter.includes("j") || + figure.listsAsComponent.includes("j") + ) + return 4; + if ( + figure.listsAsCharacter.includes("h") || + figure.listsAsComponent.includes("h") + ) + return 3; + if ( + figure.listsAsCharacter.includes("m") || + figure.listsAsComponent.includes("m") + ) + return 2; + return 1; + } + // Sort a copy: Array.prototype.sort reorders in place, which would mutate the prop array and keep the last special order even after the sort method is reset to null. + const sortedRemainingKanjisenseCharacters = + specialRemainingCharactersSortMethod + ? [...remainingKanjisenseCharacters].sort((figureA, figureB) => { + if (!figureA || !figureB) return 0; + switch (specialRemainingCharactersSortMethod) { + case "byLists": + return getListPriority(figureB) - getListPriority(figureA); + case "byUsesAppearances": + return ( + (figureB.asComponent?.allUses.reduce( + (acc, use) => acc + use.aozoraAppearances, + 0, + ) ?? 0) - + (figureA.asComponent?.allUses.reduce( + (acc, use) => acc + use.aozoraAppearances, + 0, + ) ?? 0) + ); + case "byFewestComponents": + return ( + ((figureA.componentsTree as [string, string][] | null) + ?.length ?? 0) - + ((figureB.componentsTree as [string, string][] | null) + ?.length ?? 0) + ); + default: + return 0; + } + }) + : remainingKanjisenseCharacters; + + // const remainingMeaningfulComponents = allFiguresKeys.filter((char) => { + // const figure = getFigure(char)!; + // return ( + // figure.keyword && + // figure.isKanjisenseMeaningfulComponent() && + // !seenComponents.some(c=>c.id === char) + // ); + // }); + const remainingMeaningfulAtomicComponents = + remainingMeaningfulComponents.filter(isFigureAtomic); + + return ( +
+

+ {seenMeaningfulAtomicComponents.length}/ + {remainingMeaningfulAtomicComponents.length + + seenMeaningfulAtomicComponents.length}{" "} + atomic components seen ({remainingMeaningfulAtomicComponents.length}{" "} + remaining) +
+ {seenChars.length} characters seen ( + { + // priority + seenChars.filter((c) => c.isPriority).length + }{" "} + priority figures, {seenChars.filter((c) => !c.isPriority).length}{" "} + other):{" "} + {[...seenChars] + .sort((a, b) => { + return b.aozoraAppearances - a.aozoraAppearances; + }) + .map((entry, i) => { + const char = typeof entry === "string" ? entry : entry.id; + if (allFiguresKeysSet.has(char)) + return ( + + + {entry.isStandaloneCharacter ? null : "!"} + + ); + return ( + + *{char} + + ); + })} +

+

+ {seenFigures.length} components seen: + {[...seenFigures] + .sort((a, b) => { + return b.aozoraAppearances - a.aozoraAppearances; + }) + .map((c, i) => ( + + ))} +

+
+ {seenMeaningfulAtomicComponents.length} atomic components seen: + {seenMeaningfulAtomicComponents.map((c, i) => ( + + ))} +
+
+ {atomicCharactersSeenOnlyAsComponents.size} atomic characters + encountered only as components: + {atomicCharactersSeenOnlyAsComponents} +
+ {nonAtomicCharactersSeenOnlyAsComponents.size} other characters + encountered only as components: + {nonAtomicCharactersSeenOnlyAsComponents} +
+
+ {/* TODO: only count those non-joyo components which are used in non-joyo kanji that are NOT variants of joyo kanji */} + + {remainingMeaningfulAtomicComponents.length} remaining atomic + components: + {" "} +
+ {remainingMeaningfulAtomicComponents.map((figure) => { + return ( + + ); + })} +
+
+ + {remainingKanjisenseCharacters.length} priority characters left: + } + > +
+ {/* eslint-disable-next-line jsx-a11y/click-events-have-key-events, jsx-a11y/no-static-element-interactions */} + { + if (!specialRemainingCharactersSortMethod) + setSpecialRemainingCharactersSortMethod("byLists"); + else if (specialRemainingCharactersSortMethod === "byLists") + setSpecialRemainingCharactersSortMethod("byUsesAppearances"); + else if ( + specialRemainingCharactersSortMethod === "byUsesAppearances" + ) + setSpecialRemainingCharactersSortMethod("byFewestComponents"); + else if ( + specialRemainingCharactersSortMethod === "byFewestComponents" + ) + setSpecialRemainingCharactersSortMethod(null); + }} + > + order: {specialRemainingCharactersSortMethod || "frequency"}{" "} + + + {sortedRemainingKanjisenseCharacters.map((c, i) => { + return ( + sc.id === c.id) + ? "text-opacity-50" + : "" + } + > + + + ); + })} +
+
+ {remainingMeaningfulComponents.length} remaining components + } + > +
+ only components:{" "} + { + remainingMeaningfulComponents.filter( + (c) => !c.isStandaloneCharacter, + ).length + }{" "} + figures{" "} + {remainingMeaningfulComponents + .filter((c) => !c.isStandaloneCharacter) + .map((c) => { + const primaryVariant = c.variantGroupId ?? c.id; + return ( + sc.id === primaryVariant) || + seenFigures.some((sc) => sc.id === primaryVariant)) + ? "bg-slate-300 opacity-50" + : "" + } + > + + + ); + })} +
+ also standalone characters:{" "} + { + remainingMeaningfulComponents.filter((c) => c.isStandaloneCharacter) + .length + }{" "} + figures{" "} + {remainingMeaningfulComponents + .filter((c) => c.isStandaloneCharacter) + .map((c) => { + const primaryVariant = c.variantGroupId ?? c.id; + return ( + sc.id === primaryVariant) || + seenFigures.some((sc) => sc.id === primaryVariant)) + ? "opacity-30" + : "" + } + > + + + ); + })} +
+
+
+ ); +} + +function ColorCodedComponentWithUses({ + figure, + getOnClickFigure, +}: { + figure: CurationState["remainingMeaningfulComponents"][number]; + getOnClickFigure: (char: string) => () => void; +}) { + return ( +
+ {" "} + -{" "} + {figure.asComponent?.allUses.map((u, i) => { + return ( + + + + ); + })} +
+ ); +} + +function Collapsible({ + summary, + open = false, + children, +}: PropsWithChildren<{ summary: ReactNode; open?: boolean }>) { + const [isOpen, setIsOpen] = useState(open); + return ( + + {/* eslint-disable-next-line jsx-a11y/no-static-element-interactions, jsx-a11y/click-events-have-key-events */} +
{ + setIsOpen(!isOpen); + }} + > + {isOpen ? "-" : "+"} {summary} +
+ {isOpen ? children : null} +
+ ); +} + +function ColorCodedFigure({ + display, + lists, + onClick, +}: { + display: string; + lists: string[]; + onClick: () => void; +}) { + return ( + // eslint-disable-next-line jsx-a11y/click-events-have-key-events, jsx-a11y/no-static-element-interactions + + {display} + + ); +} + +function isFigureAtomic( + figure: Pick, +): boolean { + return Array.isArray(figure.componentsTree) + ? figure.componentsTree.length === 0 + : false; +} diff --git a/app/features/curate/CuratorCorpusText.ts b/app/features/curate/CuratorCorpusText.ts new file mode 100644 index 000000000..3eb22683c --- /dev/null +++ b/app/features/curate/CuratorCorpusText.ts @@ -0,0 +1,52 @@ +export type BaseCorpus = Record; +export type TextId = string; + +export class CuratorCorpusText { + title: string | null; + author: string | null; + source: string; + section?: string; + dynasty?: string; + urls?: string[] | null; + text: string; + uniqueChars: string; + normalizedText: string; + + constructor( + jsonProps: { + title: string | null; + author: string | null; + source: string; + section?: string; + dynasty?: string; + urls: string[]; + text: string; + uniqueChars: string; + }, + normalizedText: string, + ) { + this.title = jsonProps.title; + this.author = jsonProps.author; + this.source = jsonProps.source; + this.section = jsonProps.section; + this.dynasty = jsonProps.dynasty; + this.urls = jsonProps.urls; + this.text = jsonProps.text; + this.uniqueChars = jsonProps.uniqueChars; + this.normalizedText = normalizedText; + } + + toJSON() { + return { + title: this.title, + author: this.author, + source: this.source, + section: this.section, + dynasty: this.dynasty, + urls: this.urls, + text: this.text, + uniqueChars: this.uniqueChars, + normalizedText: this.normalizedText, + }; + } +} diff --git a/app/features/curate/getCurationState.ts b/app/features/curate/getCurationState.ts new file mode 100644 index 000000000..a6e2f5b58 --- /dev/null +++ b/app/features/curate/getCurationState.ts @@ -0,0 
+1,733 @@ +import { writeFileSync } from "fs"; +import { join } from "path"; + +import { + KanjisenseFigure, + KanjiDbVariantType, + SbgyXiaoyun, +} from "@prisma/client"; + +import { prisma } from "~/db.server"; +import { + badgeFigureSelect, + isAtomicFigure, +} from "~/features/dictionary/badgeFigure"; +import { baseKanji, baseKanjiSet, joyoKanji } from "~/lib/baseKanji"; + +import { BadgeFigure } from "../dictionary/getDictionaryPageFigure.server"; +import { transcribeSbgyXiaoyun } from "../dictionary/transcribeSbgyXiaoyun"; + +export type CurationState = Awaited>; + +export async function getCurationState(courseId: string, page: number) { + const priorityCharacters = [...baseKanji]; + const priorityCharactersSet = baseKanjiSet; + const nonPriorityVariants1 = await prisma.kanjiDbVariant.findMany({ + where: { + variantType: { + in: [KanjiDbVariantType.OldStyle, KanjiDbVariantType.TwEduVariant], + }, + base: { + in: priorityCharacters, + }, + variant: { + notIn: priorityCharacters, + }, + }, + }); + const nonPriorityVariants2 = await prisma.unihan14.findMany({ + where: { + id: { + notIn: priorityCharacters, + }, + OR: [ + { + kSemanticVariant: { + hasSome: priorityCharacters, + }, + }, + { + kZVariant: { + hasSome: priorityCharacters, + }, + }, + ], + }, + }); + const nonPriorityToPriority: Record = {}; + for (const { base, variant } of nonPriorityVariants1) { + nonPriorityToPriority[variant] ||= []; + if ( + priorityCharactersSet.has(base) && + !nonPriorityToPriority[variant].includes(base) + ) + nonPriorityToPriority[variant].push(base); + } + for (const { id, kSemanticVariant, kZVariant } of nonPriorityVariants2) { + nonPriorityToPriority[id] ||= []; + if (kSemanticVariant) { + for (const variant of kSemanticVariant) { + if ( + priorityCharactersSet.has(variant) && + !nonPriorityToPriority[id].includes(variant) + ) + nonPriorityToPriority[id].push(variant); + } + } + if (kZVariant) { + for (const variant of kZVariant) { + if ( + 
priorityCharactersSet.has(variant) && + !nonPriorityToPriority[id].includes(variant) + ) + nonPriorityToPriority[id].push(variant); + } + } + } + const joyo = new Set(joyoKanji); + for (const variants of Object.values(nonPriorityToPriority)) { + variants.sort((a, b) => { + if (joyo.has(a) && !joyo.has(b)) return -1; + if (joyo.has(b) && !joyo.has(a)) return 1; + return 0; + }); + } + + const joyoKanjiWithVariants = await prisma.kanjisenseFigure.findMany({ + where: { + isPriority: true, + listsAsCharacter: { has: "j" }, + variantGroupId: { + not: null, + }, + }, + select: { + id: true, + variantGroup: { + select: { + id: true, + figures: { + where: { + isStandaloneCharacter: true, + }, + }, + }, + }, + }, + }); + const oldToNew: Record = {}; + for (const { id, variantGroup } of joyoKanjiWithVariants) { + const variants = variantGroup!.figures.map((f) => f.id); + for (const variant of variants) { + if (id !== variant) oldToNew[variant] = id; + } + } + + const nonJoyoLessCommonPriorityToMoreCommonPriority: Record = + {}; + const nonJoyoPriorityCharsWithVariants = + await prisma.kanjisenseFigure.findMany({ + where: { + isPriority: true, + listsAsCharacter: { isEmpty: false }, + variantGroupId: { + not: null, + }, + variantGroup: { + figures: { + none: { + listsAsCharacter: { has: "j" }, + }, + some: { + listsAsCharacter: { isEmpty: false }, + }, + }, + }, + }, + select: { + id: true, + variantGroup: { + select: { + id: true, + figures: { + where: { + isStandaloneCharacter: true, + listsAsCharacter: { isEmpty: false }, + }, + }, + }, + }, + }, + }); + + for (const { variantGroup } of nonJoyoPriorityCharsWithVariants) { + const variants = variantGroup!.figures!.sort( + // by aozora appearances desc + (a, b) => b.aozoraAppearances - a.aozoraAppearances, + ); + if (variants.length > 1) { + const [mostCommonVariant, ...otherVariants] = variants; + for (const variant of otherVariants) { + nonJoyoLessCommonPriorityToMoreCommonPriority[variant.id] = + mostCommonVariant.id; + } + } + } + // Dump the variant maps only now, after the loop above has filled nonJoyoLessCommonPriorityToMoreCommonPriority; written earlier, that key was always an empty object. + writeFileSync( + join(process.cwd(), "kanjiVariants.log.json"), + JSON.stringify({ + oldToNew, + nonPriorityToPriority, + nonJoyoLessCommonPriorityToMoreCommonPriority, + }), + ); + console.log(join(process.cwd(), "kanjiVariants.log.json"));
+ + await prisma.course.upsert({ + where: { + id: "kj2x", + }, + update: {}, + create: { + id: "kj2x", + }, + }); + + const allFiguresKeys = await prisma.kanjisenseFigure + .findMany({ + select: { + id: true, + }, + }) + .then((figures) => figures.map((f) => f.id)); + const course = await prisma.course.findUnique({ + where: { + id: courseId, + }, + }); + if (!course) { + throw new Error(`Course ${courseId} not found`); + } + + const keysToSeenTexts = new Map( + await prisma.baseCorpusText + .findMany({ + where: { + key: { + in: ((course?.seenTexts || []) as string[][]).flat() || [], + }, + }, + include: { + uniqueCharacters: true, + uniqueComponents: true, + }, + }) + .then((texts) => texts.map((t) => [t.key, t])), + ); + const seenTexts = ((course?.seenTexts || []) as string[][]).map((keys) => + keys.map((key) => { + const text = keysToSeenTexts.get(key)!; + if (!text) console.log(`Missing text ${key}`); + return text; + }), + ); + // excluding characters not in kanjisense + const seenCharacters = await prisma.kanjisenseFigure.findMany({ + where: { + id: { + in: [ + ...new Set( + seenTexts + .flat() + .flatMap((t) => + t.uniqueCharacters.flatMap((c) => c.figureId || []), + ), + ), + ], + }, + }, + }); + const seenFigures = await prisma.kanjisenseFigure.findMany({ + where: { + id: { + in: [ + ...new Set( + await getComponentsFromCharsAsync( + (id) => + prisma.kanjisenseFigure.findUnique({ + where: { id }, + select: { + ...badgeFigureSelect, + componentsTree: true, + }, + }), + new Set(seenCharacters.map((c) => c.id)), + ), + ), + ], + }, + }, + select: { + ...badgeFigureSelect, + isPriority: true, + componentsTree: true, + image: true, + }, + }); + + const seenCharsTangReadings = await 
prisma.kanjisenseFigureReading.findMany({ + where: { + id: { + in: seenCharacters.map((c) => c.id), + }, + sbgyXiaoyuns: { + some: {}, + }, + }, + select: { + id: true, + sbgyXiaoyunsMatchingExemplars: true, + sbgyXiaoyuns: { + select: { + sbgyXiaoyun: true, + }, + }, + }, + }); + const seenCharsTangReadingsMap = new Map( + seenCharsTangReadings.map((r) => [r.id, r]), + ); + const defaultTangReadings = Object.fromEntries( + seenTexts.flat().map((text) => { + return [ + text.key, + getDefaultTangReadings(text.normalizedText, (id) => { + return ( + seenCharsTangReadingsMap + .get(id) + ?.sbgyXiaoyuns.map((x) => x.sbgyXiaoyun) || [] + ); + }), + ]; + }), + ); + + const remainingKanjisenseCharacters = await prisma.kanjisenseFigure.findMany({ + where: { + id: { + notIn: seenCharacters.map((c) => c.id), + }, + isPriority: true, + // should include those without directUses + listsAsCharacter: { + isEmpty: false, + }, + }, + orderBy: { + aozoraAppearances: "desc", + }, + include: { + image: true, + asComponent: { + select: { + id: true, + allUses: { + select: { + id: true, + aozoraAppearances: true, + }, + where: { + isPriority: true, + }, + orderBy: { + aozoraAppearances: "desc", + }, + }, + }, + }, + }, + }); + + const remainingMeaningfulComponents = await prisma.kanjisenseFigure.findMany({ + where: { + id: { + notIn: seenFigures.map((c) => c.id), + }, + isPriority: true, + // should include those without directUses + listsAsComponent: { + isEmpty: false, + }, + }, + orderBy: { + aozoraAppearances: "desc", + }, + select: { + ...badgeFigureSelect, + + isPriorityComponent: true, + componentsTree: true, + + isPriority: true, + + image: true, + + asComponent: { + select: { + id: true, + allUses: { + select: { + id: true, + aozoraAppearances: true, + listsAsCharacter: true, + listsAsComponent: true, + }, + where: { + isPriority: true, + }, + orderBy: { + aozoraAppearances: "desc", + }, + }, + }, + }, + }, + }); + + const soughtCharacters = + course?.wantedCharacters || 
course.normalizedTextSearchQuery + ? [ + ...new Set([ + ...course.wantedCharacters, + ...course.normalizedTextSearchQuery.replaceAll("|", ""), + ]), + ] + : null; + + console.log({ + seenCharacters: seenCharacters.map((c) => c.id).join(""), + seenFigures: seenFigures.map((c) => c.id).join(" "), + remainingKanjisenseCharacters: remainingKanjisenseCharacters + .map((c) => c.id) + .join(""), + remainingMeaningfulComponents: remainingMeaningfulComponents + .map((c) => c.id) + .join(" "), + }); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // getting all characters + // in texts containing the given characters and in the given length range + // EXCEPT seen characters. + // then, group them by text, and sum these unseen characters' frequency scores to get the TEXT SCORE. + // a higher score means either + // - more unique unseen characters/components (i.e. higher raw ROI) + // - more frequent unseen characters/components (i.e. contains some especially important/easy characters) + // these texts could be further sorted by "ease" by further ordering them by the number of SEEN characters they contain. + const wantedAuthors = course?.authors.length + ? course.authors.filter((a) => !a.startsWith("-")) + : null; + const unwantedAuthors = course?.authors.length + ? course.authors.filter((a) => a.startsWith("-")).map((a) => a.slice(1)) + : null; + const wantedSources = course?.sources.length + ? course.sources.filter((s) => !s.startsWith("-")) + : null; + const unwantedSources = course?.sources.length + ? 
course.sources.filter((s) => s.startsWith("-")).map((s) => s.slice(1)) + : null; + + const charactersNotNeededAnymore = seenCharacters + .map((c) => c.id) + .filter((id) => !soughtCharacters?.includes(id)); + + // todo: extract querying for reuse with count below + const textGroups = await prisma.characterUsagesOnBaseCorpusText.groupBy({ + by: [ + "baseCorpusTextId", + "baseCorpusTextLength", + "baseCorpusUniqueCharactersCount", + "baseCorpusUniqueComponentsCount", + "baseCorpusTextNonPriorityCharactersCount", + ], + _count: { + baseCorpusTextId: true, + }, + + where: { + baseCorpusText: { + normalizedLength: { + gte: course?.minLength || undefined, + lte: course?.maxLength || undefined, + }, + id: { + notIn: seenTexts.flatMap((ts) => ts.map((t) => t.id)), + }, + AND: [ + { + author: + wantedAuthors?.length || unwantedAuthors?.length + ? { + in: wantedAuthors?.length ? wantedAuthors : undefined, + notIn: unwantedAuthors?.length + ? unwantedAuthors + : undefined, + } + : undefined, + }, + { + source: + wantedSources?.length || unwantedSources?.length + ? { + in: wantedSources?.length ? wantedSources : undefined, + notIn: unwantedSources?.length + ? unwantedSources + : undefined, + } + : undefined, + }, + ], + // uniqueCharacters: { + // none: { + // figure: { + // isPriority: false, + // }, + // }, + // }, + // uniqueCharacters: soughtCharacters + // ? { + // some: { + // figureId: { + // in: soughtCharacters?.length ? soughtCharacters : undefined, + // notIn: charactersNotNeededAnymore, + // }, + // }, + // } + // : undefined, + // author: + // wantedAuthors?.length || unwantedAuthors?.length + // ? { + // in: wantedAuthors?.length ? wantedAuthors : undefined, + // notIn: unwantedAuthors?.length ? unwantedAuthors : undefined, + // } + // : undefined, + // source: + // wantedSources?.length || unwantedSources?.length + // ? { + // in: wantedSources?.length ? wantedSources : undefined, + // notIn: unwantedSources?.length ? 
unwantedSources : undefined, + // } + // : undefined, + OR: course?.normalizedTextSearchQuery + ? course.normalizedTextSearchQuery + .split("|") + .map((q) => ({ normalizedText: { contains: q } })) + : undefined, + }, + figureId: { + notIn: charactersNotNeededAnymore, + in: course?.wantedCharacters.length + ? course.wantedCharacters.split("") + : undefined, + }, + }, + _sum: { + frequencyScore: true, + }, + orderBy: [ + { + baseCorpusTextNonPriorityCharactersCount: "asc", + }, + { + // useful to switch with baseCorpusUniqueCharactersCount, baseCorpusTextNonPriorityCharactersCount + baseCorpusUniqueComponentsCount: "asc", + }, + { + baseCorpusUniqueCharactersCount: "asc", + }, + { + _sum: { + frequencyScore: "desc", + }, + }, + { baseCorpusTextLength: "desc" }, + ], + take: 500, + skip: (page - 1) * 500, + }); + + console.log("geting text groups"); + const textGroupsCount = ( + await prisma.characterUsagesOnBaseCorpusText.groupBy({ + by: ["baseCorpusTextId"], + _count: { + baseCorpusTextId: true, + }, + + where: { + baseCorpusText: { + normalizedLength: { + gte: course?.minLength || undefined, + lte: course?.maxLength || undefined, + }, + id: { + notIn: seenTexts.flatMap((ts) => ts.map((t) => t.id)), + }, + AND: [ + { + author: + wantedAuthors?.length || unwantedAuthors?.length + ? { + in: wantedAuthors?.length ? wantedAuthors : undefined, + notIn: unwantedAuthors?.length + ? unwantedAuthors + : undefined, + } + : undefined, + }, + { + source: + wantedSources?.length || unwantedSources?.length + ? { + in: wantedSources?.length ? wantedSources : undefined, + notIn: unwantedSources?.length + ? unwantedSources + : undefined, + } + : undefined, + }, + ], + // uniqueCharacters: { + // none: { + // figure: { + // isPriority: false, + // }, + // }, + // }, + // uniqueCharacters: soughtCharacters + // ? { + // some: { + // figureId: { + // in: soughtCharacters?.length ? 
soughtCharacters : undefined, + // notIn: charactersNotNeededAnymore, + // }, + // }, + // } + // : undefined, + // author: + // wantedAuthors?.length || unwantedAuthors?.length + // ? { + // in: wantedAuthors?.length ? wantedAuthors : undefined, + // notIn: unwantedAuthors?.length ? unwantedAuthors : undefined, + // } + // : undefined, + // source: + // wantedSources?.length || unwantedSources?.length + // ? { + // in: wantedSources?.length ? wantedSources : undefined, + // notIn: unwantedSources?.length ? unwantedSources : undefined, + // } + // : undefined, + OR: course?.normalizedTextSearchQuery + ? course.normalizedTextSearchQuery + .split("|") + .map((q) => ({ normalizedText: { contains: q } })) + : undefined, + }, + figureId: { + notIn: charactersNotNeededAnymore, + in: course?.wantedCharacters.length + ? course.wantedCharacters.split("") + : undefined, + }, + }, + }) + ).length; + + console.log("geting texts"); + const unseenTexts = await prisma.baseCorpusText.findMany({ + where: { + id: { + in: textGroups.map((g) => g.baseCorpusTextId), + }, + // uniqueCharacters: { + // some: { + // figureId: { + // in: course?.wantedCharacters.length + // ? 
course.wantedCharacters.split("") + // : undefined, + // }, + // }, + // }, + }, + include: { + uniqueCharacters: true, + uniqueComponents: true, + }, + }); + + return { + course, + seenTexts, + seenCharacters, + seenFigures, + defaultTangReadings, + remainingKanjisenseCharacters, + remainingMeaningfulComponents, + allFiguresKeys, + unseenTexts, + textGroups, + textGroupsCount, + }; +} + +function getDefaultTangReadings( + normalizedShinjitai: string, + getFigureReadings: (k: string) => SbgyXiaoyun[], +): string { + return Array.from(normalizedShinjitai, (char) => { + const readings = getFigureReadings(char); + const tangReadings = readings.map((r) => transcribeSbgyXiaoyun(r)); + if (char === "不") return tangReadings[1]; + + if (!tangReadings.length) return "X"; + if (tangReadings.length === 1) return tangReadings[0]; + return tangReadings.join("/"); + }).join(" "); +} + +async function getComponentsFromCharsAsync( + getFigure: ( + key: string, + ) => Promise<(BadgeFigure & Pick) | null>, + seenChars: Set, +) { + return new Set( + await asyncFlatMap(seenChars, async (char: string) => { + const figure = await getFigure(char); + if (!figure) { + return []; + } + const components: string[] = []; + + if (figure.isPriorityComponent || isAtomicFigure(figure)) + components.push(char); + const componentsTree = (figure.componentsTree || []) as [ + string, + string, + ][]; + components.push( + ...componentsTree.map((c: [parent: string, component: string]) => c[1]), + ); + return components; + }), + ); +} + +function asyncFlatMap( + arr: Iterable, + fn: (item: T) => Promise, +): Promise { + return Promise.all(Array.from(arr, fn)).then((arrs) => arrs.flat()); +} diff --git a/app/features/curate/getGuangyunRhymeCycleHead.ts b/app/features/curate/getGuangyunRhymeCycleHead.ts new file mode 100644 index 000000000..4bc869e3c --- /dev/null +++ b/app/features/curate/getGuangyunRhymeCycleHead.ts @@ -0,0 +1,130 @@ +import { LABIAL_INITIALS } from "prisma/external/getYuntuJson"; +import 
{ QieyunRhymeCycleHead } from "~/lib/qys/QieyunRhymeCycleHead"; +import { QysInitial } from "~/lib/qys/QysInitial"; +import { DengOrChongniu, Kaihe, Tone } from "~/lib/qys/QysSyllableProfile"; + +export function getGuangyunCycleHead( + qieyunCycleHead: QieyunRhymeCycleHead, + initial: QysInitial, + kaihe: Kaihe | null, + deng: DengOrChongniu | null, +): GuangyunCycleHead { + if (qieyunCycleHead === "眞" && kaihe === Kaihe.Closed && deng !== "A") + return "諄"; + + if ( + qieyunCycleHead === "寒" && + (kaihe === Kaihe.Closed || LABIAL_INITIALS.has(initial)) + ) + return "桓"; + + if ( + qieyunCycleHead === "歌" && + (kaihe === Kaihe.Closed || LABIAL_INITIALS.has(initial) || deng === "三") + ) + return "戈"; + + return qieyunCycleHead; +} + +export function getGuangyunFinal( + qieyunCycleHead: QieyunRhymeCycleHead, + initial: QysInitial, + kaihe: Kaihe | null, + deng: DengOrChongniu | null, + tone: Tone, +) { + const cycle = getGuangyunCycle(qieyunCycleHead, initial, kaihe, deng); + if (!cycle) + throw new Error(`無廣韻韻目: ${qieyunCycleHead}${kaihe}${deng}${tone}`); + if (tone === Tone.平) return cycle[0]; + if (tone === Tone.上) return cycle[1]; + if (tone === Tone.去) return cycle[2]; + if (tone === Tone.入) return cycle[3]!; + throw new Error(`無廣韻韻目: ${qieyunCycleHead}${kaihe}${deng}${tone}`); +} + +export function getGuangyunCycle( + qieyunCycleHead: QieyunRhymeCycleHead, + initial: QysInitial, + kaihe: Kaihe | null, + deng: DengOrChongniu | null, +) { + const guangyunCycleHead = getGuangyunCycleHead( + qieyunCycleHead, + initial, + kaihe, + deng, + ); + return prettySbgyRhymeCycles.find((c) => c.includes(guangyunCycleHead)); +} + +type GuangyunCycleHead = QieyunRhymeCycleHead | "諄" | "桓" | "戈"; + +const prettySbgyRhymeCycles: ( + | [GuangyunCycleHead, string, string] + | [GuangyunCycleHead, string, string, string] + | ["", "", GuangyunCycleHead] +)[] = [ + ["東", "董", "送", "屋"], + ["冬", "湩", "宋", "沃"], + ["鍾", "腫", "用", "燭"], + ["江", "講", "絳", "覺"], + ["支", "紙", "寘"], + ["脂", 
"旨", "至"], + ["之", "止", "志"], + ["微", "尾", "未"], + ["魚", "語", "御"], + ["虞", "麌", "遇"], + ["模", "姥", "暮"], + ["齊", "薺", "霽"], + ["佳", "蟹", "卦"], + ["皆", "駭", "怪"], + ["灰", "賄", "隊"], + ["咍", "海", "代"], + ["眞", "軫", "震", "質"], + ["諄", "準", "稕", "術"], + ["臻", "", "", "櫛"], + ["文", "吻", "問", "物"], + ["欣", "隱", "焮", "迄"], + ["元", "阮", "願", "月"], + ["魂", "混", "慁", "沒"], + ["痕", "很", "恨", "麧"], + ["寒", "旱", "翰", "曷"], + ["桓", "緩", "換", "末"], + ["刪", "潸", "諫", "鎋"], + ["山", "産", "襉", "黠"], + ["先", "銑", "霰", "屑"], + ["仙", "獮", "線", "薛"], + ["蕭", "篠", "嘯"], + ["宵", "小", "笑"], + ["肴", "巧", "效"], + ["豪", "晧", "号"], + ["歌", "哿", "箇"], + ["戈", "果", "過"], + ["麻", "馬", "禡"], + ["陽", "養", "漾", "藥"], + ["唐", "蕩", "宕", "鐸"], + ["庚", "梗", "映", "陌"], + ["耕", "耿", "諍", "麥"], + ["清", "靜", "勁", "昔"], + ["青", "迥", "徑", "錫"], + ["蒸", "拯", "證", "職"], + ["登", "等", "嶝", "德"], + ["尤", "有", "宥"], + ["侯", "厚", "候"], + ["幽", "黝", "幼"], + ["侵", "寑", "沁", "緝"], + ["覃", "感", "勘", "合"], + ["談", "敢", "闞", "盍"], + ["鹽", "琰", "豔", "葉"], + ["添", "忝", "㮇", "怗"], + ["咸", "豏", "陷", "洽"], + ["銜", "檻", "鑑", "狎"], + ["嚴", "儼", "釅", "業"], + ["凡", "范", "梵", "乏"], + ["", "", "祭"], + ["", "", "泰"], + ["", "", "夬"], + ["", "", "廢"], +]; diff --git a/app/features/dictionary/Finals.tsx b/app/features/dictionary/Finals.tsx index dead6b38e..0f3766c95 100644 --- a/app/features/dictionary/Finals.tsx +++ b/app/features/dictionary/Finals.tsx @@ -445,15 +445,16 @@ export function Finals({ Div. III - - (w/y/ẁ)eng - (w/y/ẁ)ek - (w/y/ẁ)ei - (w/y/ẁ)en - (w/y/ẁ)et - (y)eu - (y)em - (y)ep + ◌̇ y- ẁ- + + (w/y/ẁ)ėng + (w/y/ẁ)ėk + (w/y/ẁ)ėi + (w/y/ẁ)ėn + (w/y/ẁ)ėt + (y)ėu + (y)ėm + (y)ėp {" "} @@ -481,17 +482,16 @@ export function Finals({ Div. IV
- ◌̀ - (w)èng - (w)èk - (w)èi - (w)èn - (w)èt - èu - èm - èp + (w)eng + (w)ek + (w)ei + (w)en + (w)et + eu + em + ep diff --git a/app/features/dictionary/QysDialogContent.tsx b/app/features/dictionary/QysDialogContent.tsx index 99cb05634..8ab24494f 100644 --- a/app/features/dictionary/QysDialogContent.tsx +++ b/app/features/dictionary/QysDialogContent.tsx @@ -131,7 +131,6 @@ export const QysDialogContent = ({ const medial = /^[yŷẁw]/.test(initial) ? null : final.match(/^[yŷẁẃw]/); - console.log({ initial, medial: medial?.[0], final }); const medialHint = medial ? ( ) : null; diff --git a/app/features/dictionary/QysHints.tsx b/app/features/dictionary/QysHints.tsx index 7b08d4ffd..9ef6615fd 100644 --- a/app/features/dictionary/QysHints.tsx +++ b/app/features/dictionary/QysHints.tsx @@ -3,14 +3,17 @@ import { PropsWithChildren } from "react"; import { MiddleChineseTranscriptionLink } from "~/components/AppLink"; import { IpaLink, IpaSymbols } from "~/features/dictionary/IpaLink"; -import { Kaihe, QysSyllableProfile } from "~/lib/qys/inferOnyomi"; import { QieyunRhymeCycleHead } from "~/lib/qys/QieyunRhymeCycleHead"; import { initialGroups } from "~/lib/qys/QysInitial"; +import { Kaihe, QysSyllableProfile } from "~/lib/qys/QysSyllableProfile"; import A from "../../components/ExternalLink"; -const noMDentilabializationFinals = new Set(["尤", "東"]); -const alwaysDentilabializationFinals = new Set([ +export const noMDentilabializationFinals = new Set([ + "尤", + "東", +]); +export const alwaysDentilabializationFinals = new Set([ "元", "陽", "凡", @@ -21,7 +24,13 @@ const alwaysDentilabializationFinals = new Set([ "鍾", ]); -function hasDentilabialization({ cycleHead, initial }: QysSyllableProfile) { +function hasDentilabialization({ + cycleHead, + initial, + dengOrChongniu, +}: QysSyllableProfile) { + if (cycleHead === "東" && dengOrChongniu !== "三") return false; + if (!initialGroups.幫.has(initial)) return false; if (noMDentilabializationFinals.has(cycleHead)) { @@ -950,8 +959,7 
@@ export const MedialHints = { like or . It may have been fronted to something like or {" "} in certain contexts, especially before front vowels (written here{" "} - and ) and when written with the acute accent as{" "} - . + and ) and when written before . ), Y: () => ( @@ -1395,10 +1403,15 @@ export const VowelHints = { ), E3: () => ( <> - Bare marks finals in the category Division III in this - notation. Scholars tend to reconstruct the vowel of these finals with an - onset something like /i/ or /j/, and a main vowel like{" "} - or . + For finals written with E in this notation, the dot above is + the usual mark of the category called Division III. Scholars tend + to reconstruct the vowel of these finals with an onset something like /i/ + or /j/, and a main vowel like or{" "} + . When written with a{" "} + + leading or + + , the dot above is omitted for the sake of brevity. ), E3Circumflex: () => ( @@ -1429,12 +1442,11 @@ export const VowelHints = { ), E4: () => ( <> - The grave accent over E in this notation - marks finals in the category Division IV. At earlier stages of - Middle Chinese, these finals were probably pronounced with a vowel - something like or . That is, - they were identical to the corresponding finals of Division III (with bare{" "} - + The bare letter E in this notation marks finals in the category{" "} + Division IV. At earlier stages of Middle Chinese, these finals were + probably pronounced with a vowel something like or{" "} + . That is, they were identical to the corresponding + finals of Division III (with bare ), but without the /i/-like glide before the vowel. Eventually, this Division IV series also was pronounced with a glide, and most (if not all) of these pairs merged. 
diff --git a/app/features/dictionary/SingleFigureDictionaryEntry.tsx b/app/features/dictionary/SingleFigureDictionaryEntry.tsx index 8245a5680..7e91c9b46 100644 --- a/app/features/dictionary/SingleFigureDictionaryEntry.tsx +++ b/app/features/dictionary/SingleFigureDictionaryEntry.tsx @@ -1,5 +1,5 @@ /* eslint-disable jsx-a11y/no-noninteractive-element-interactions */ -import { KanjisenseFigure, KanjisenseFigureImageType } from "@prisma/client"; +import { KanjisenseFigureImageType } from "@prisma/client"; import { clsx } from "clsx"; import { useState } from "react"; @@ -8,6 +8,7 @@ import { FigurePopoverBadge } from "~/components/FigurePopover"; import { BadgeProps, getBadgeProps, + isAtomicFigure, isPrioritySoundMark, } from "~/features/dictionary/badgeFigure"; import type { DictionaryPageFigureWithPriorityUses } from "~/features/dictionary/getDictionaryPageFigure.server"; @@ -50,7 +51,7 @@ export function SingleFigureDictionaryEntry({ const kvgImage = figure.image?.type === KanjisenseFigureImageType.Kvg ? figure.image : null; const isUnicodeCharacter = [...figure.id].length === 1; - const figureIsAtomic = isFigureAtomic(figure); + const figureIsAtomic = isAtomicFigure(figure); const glyphsJson = figure.glyphImage ? (figure.glyphImage.json as GlyphsJson) @@ -252,14 +253,6 @@ export function SingleFigureDictionaryEntry({ ); } -function isFigureAtomic( - figure: Pick, -): boolean { - return Array.isArray(figure.componentsTree) - ? 
figure.componentsTree.length === 0 - : false; -} - function parseRadicalNumbers(unicodeRadicalText: string) { try { // one apostrophe: chinese-simplified radical diff --git a/app/features/dictionary/getActiveSoundMarkValueText.ts b/app/features/dictionary/getActiveSoundMarkValueText.ts index 5d8f0aba3..0a467baac 100644 --- a/app/features/dictionary/getActiveSoundMarkValueText.ts +++ b/app/features/dictionary/getActiveSoundMarkValueText.ts @@ -2,6 +2,13 @@ import { SbgyXiaoyun } from "@prisma/client"; import type { OnReadingToTypeToXiaoyuns } from "~/lib/OnReadingToTypeToXiaoyuns"; import { InferredOnyomiType } from "~/lib/qys/inferOnyomi"; +import { QieyunRhymeCycleHead } from "~/lib/qys/QieyunRhymeCycleHead"; +import { + DengOrChongniu, + Kaihe, + QysSyllableProfile, + Tone, +} from "~/lib/qys/QysSyllableProfile"; import { transcribeSbgyXiaoyun } from "./transcribeSbgyXiaoyun"; @@ -11,14 +18,15 @@ export function serializeXiaoyunProfile(xiaoyun: SbgyXiaoyun) { }${xiaoyun.cycleHead}${xiaoyun.tone}`; } -export function deserializeXiaoyunProfile(profile: string) { +export function deserializeXiaoyunProfile(profile: string): QysSyllableProfile { const [initial, kaihe, dengOrChongniu, cycleHead, tone] = profile.split(""); return { - initial, - kaihe: kaihe === "x" ? null : kaihe, - dengOrChongniu: dengOrChongniu === "x" ? null : dengOrChongniu, - cycleHead, - tone, + initial: initial as QysSyllableProfile["initial"], + kaihe: kaihe === "x" ? null : (kaihe as Kaihe), + dengOrChongniu: + dengOrChongniu === "x" ? 
null : (dengOrChongniu as DengOrChongniu), + cycleHead: cycleHead as QieyunRhymeCycleHead, + tone: tone as Tone, }; } diff --git a/app/features/dictionary/sbgyXiaoyunToQysSyllableProfile.ts b/app/features/dictionary/sbgyXiaoyunToQysSyllableProfile.ts index 995d4410b..e899af3f3 100644 --- a/app/features/dictionary/sbgyXiaoyunToQysSyllableProfile.ts +++ b/app/features/dictionary/sbgyXiaoyunToQysSyllableProfile.ts @@ -1,8 +1,8 @@ import type { SbgyXiaoyun } from "@prisma/client"; -import { Kaihe, QysSyllableProfile, Tone } from "~/lib/qys/inferOnyomi"; import { QieyunRhymeCycleHead } from "~/lib/qys/QieyunRhymeCycleHead"; import { QysInitial } from "~/lib/qys/QysInitial"; +import { Kaihe, QysSyllableProfile, Tone } from "~/lib/qys/QysSyllableProfile"; export function sbgyXiaoyunToQysSyllableProfile( xiaoyun: SbgyXiaoyun, diff --git a/app/features/dictionary/transcribeSbgyXiaoyun.ts b/app/features/dictionary/transcribeSbgyXiaoyun.ts index a175cfe07..3c4774fa1 100644 --- a/app/features/dictionary/transcribeSbgyXiaoyun.ts +++ b/app/features/dictionary/transcribeSbgyXiaoyun.ts @@ -1,7 +1,7 @@ import type { SbgyXiaoyun } from "@prisma/client"; -import { Kaihe } from "~/lib/qys/inferOnyomi"; import { QysInitial } from "~/lib/qys/QysInitial"; +import { Kaihe } from "~/lib/qys/QysSyllableProfile"; import { QysTranscriptionProfile, transcribe, diff --git a/app/features/qysInfo/Diacritics.tsx b/app/features/qysInfo/Diacritics.tsx index 757358a6a..64626ad23 100644 --- a/app/features/qysInfo/Diacritics.tsx +++ b/app/features/qysInfo/Diacritics.tsx @@ -79,8 +79,8 @@ export function Diacritics({ only show up in syllables placed in the third row.
  • - The grave accent only shows up in syllables - placed in the fourth row. + The grave accent and the bare letter E{" "} + only show up in syllables placed in the fourth row.
  • @@ -148,21 +148,21 @@ export function Diacritics({ /a/ default ◌̣ - y- ẃ- ◌̂ + y- ◌̇ ◌̂ /e/ + ◌̇ default - ◌̀ /i/ - default + invariant @@ -176,7 +176,7 @@ export function Diacritics({ /u/ - default + invariant @@ -270,7 +270,7 @@ export function Diacritics({ - + /e/ /e/ /a/ diff --git a/app/features/qysInfo/InitialConsonants.tsx b/app/features/qysInfo/InitialConsonants.tsx index 71cf12487..3e052f6aa 100644 --- a/app/features/qysInfo/InitialConsonants.tsx +++ b/app/features/qysInfo/InitialConsonants.tsx @@ -81,7 +81,7 @@ export function InitialConsonants({ - 並 bèngˬ + 並 bengˬ @@ -129,7 +129,7 @@ export function InitialConsonants({ - 定 dèngˎ + 定 dengˎ @@ -138,7 +138,7 @@ export function InitialConsonants({ - 泥 nèiˎ + 泥 neiˎ @@ -165,7 +165,7 @@ export function InitialConsonants({ - 溪 kʻèi + 溪 kʻei @@ -250,7 +250,7 @@ export function InitialConsonants({ 幫 pang
    端 twan -
    見 kènˎ +
    見 kenˎ

    {" "} @@ -353,7 +353,7 @@ export function InitialConsonants({ - 禪 dźenˎ + 禪 dźėnˎ @@ -363,7 +363,7 @@ export function InitialConsonants({ - 穿 tśʻwen + 穿 tśʻwėn @@ -383,7 +383,7 @@ export function InitialConsonants({ - 照 tśeuˎ + 照 tśėuˎ @@ -444,7 +444,7 @@ export function InitialConsonants({ - 照 tśeuˎ + 照 tśėuˎ @@ -462,7 +462,7 @@ export function InitialConsonants({ - 穿 tśʻwen + 穿 tśʻwėn @@ -502,7 +502,7 @@ export function InitialConsonants({ - 禪 dźenˎ + 禪 dźėnˎ @@ -545,7 +545,7 @@ export function InitialConsonants({ - 曉 khèuˬ + 曉 kheuˬ {" "} @@ -606,10 +606,10 @@ export function InitialConsonants({ "tongue-head" sounds, and the other as 舌上 "tongue-up" sounds.

    - In general, before any vowel sound written with the{" "} - underdot , and before the{" "} - letters Y, I, E, and U, these letters represent the - "tongue-up" sounds, which were likely pronounced with{" "} + In general, whenever any syllable is written with a{" "} + dot above or below the main vowel, or immediately before the + letters I and Y, these letters represent the "tongue-up" + sounds, which were likely pronounced with{" "} retroflex {" "} @@ -621,9 +621,8 @@ export function InitialConsonants({ have been pronounced slightly differently in different environments.

    - Before other vowels, and before the vowel {" "} - with grave accent, these letters represent the - "tongue-head" sounds, which were likely pronounced with{" "} + In all other syllables, these letters represent the "tongue-head" + sounds, which were likely pronounced with{" "} dental{" "} articulation, i.e. with the tongue closer to the teeth.

    @@ -648,7 +647,7 @@ export function InitialConsonants({ - 徹 tʻet + 徹 tʻėt @@ -669,7 +668,7 @@ export function InitialConsonants({ - 定 dèngˎ + 定 dengˎ @@ -684,7 +683,7 @@ export function InitialConsonants({ - 泥 nèiˎ + 泥 neiˎ @@ -692,9 +691,10 @@ export function InitialConsonants({

    - In general, the vowel determines whether to choose between - "tongue-head" or "tongue-up" sounds. (Linguists might describe these - sounds as being in nearly{" "} + This spelling rule is in place because, in general, the vowel + determines whether to choose between "tongue-head" or "tongue-up" + sounds. (Linguists might describe these sounds as being in{" "} + nearly{" "} complementary distribution @@ -817,7 +817,7 @@ export function InitialConsonants({ - 並 bèngˬ + 並 bengˬ @@ -839,7 +839,7 @@ export function InitialConsonants({ - 定 dèngˎ + 定 dengˎ @@ -947,7 +947,7 @@ export function InitialConsonants({ - 禪 dźenˎ + 禪 dźėnˎ @@ -1042,7 +1042,7 @@ export function InitialConsonants({ - 並 bèngˬ + 並 bengˬ as in{" "} diff --git a/app/lib/dic/componentsDictionary.yml b/app/lib/dic/componentsDictionary.yml index da9aeec77..75b05826f 100644 --- a/app/lib/dic/componentsDictionary.yml +++ b/app/lib/dic/componentsDictionary.yml @@ -395,7 +395,7 @@ CDP-89AE: # CDP-8C41,氺,楽,渋,CDP-88D2,摂,率,塁,函,𠕒,CDP-8D6F,褱,CDP- tag: nature 巳: historical: year of the Snake - mnemonic: snake + mnemonic: python tag: animals 西: historical: west @@ -486,9 +486,8 @@ CDP-8BF5: # 介,齐,粛,弗,界,芥,堺,斎,繍,費,沸,佛,拂 tag: tools 非: historical: not - mnemonic: sad - standin: 悲 - tag: feeling + mnemonic: flying wings + tag: null 疒: historical: illness mnemonic: sickbed @@ -781,7 +780,7 @@ CDP-8BD6: # 奥,向,CDP-8D7C,襖,奧 mnemonic: chopping hand tag: hands 夭: - historical: calamity + historical: youth tag: null CDP-8BB8: # 愛,受,舜,曖,授,綬,瞬 historical: (various) @@ -1315,9 +1314,9 @@ GWS-U2FF1-U2008A-U65E7: # 陥,焰,閻,陷 tag: place 它: historical: other - mnemonic: snake pole + mnemonic: garden snake reference: 蛇 - tag: tools + tag: animals 匊: historical: handful, scoop mnemonic: chrysanthemum @@ -1349,10 +1348,8 @@ GWS-U23A8A-VAR-001: # 殻,穀,穀,殼 standin: 誇 tag: actions 賁: - historical: energetic, bright - mnemonic: tomb - standin: 墳 - tag: place + mnemonic: ornament + tag: null 茲: historical: here mnemonic: magnet @@ -1600,7 
+1597,7 @@ CDP-8DBF: # 廌,焉,薦 tag: personal 戊: historical: fifth Heavenly Stem - mnemonic: garrison + mnemonic: guard reference: 戌 tag: weapons 豆: @@ -1684,8 +1681,8 @@ CDP-8DBF: # 廌,焉,薦 tag: animals 曽: historical: once before - mnemonic: monk - standin: 僧 + mnemonic: steamer basket + reference: 甑 tag: personal 左: historical: left-hand @@ -1983,7 +1980,8 @@ CDP-8DBF: # 廌,焉,薦 tag: personal 韋: historical: soft leather - tag: tools + mnemonic: patrol + tag: null 色: historical: color tag: color @@ -3165,7 +3163,7 @@ GWS-U914B-G: historical: rule tag: feeling 戒: - historical: guard against + historical: ward off tag: geo 曹: historical: officer diff --git a/app/lib/qys/QysSyllableProfile.ts b/app/lib/qys/QysSyllableProfile.ts new file mode 100644 index 000000000..beafbf410 --- /dev/null +++ b/app/lib/qys/QysSyllableProfile.ts @@ -0,0 +1,23 @@ +import type { QieyunRhymeCycleHead } from "~/lib/qys/QieyunRhymeCycleHead"; +import type { QysInitial } from "~/lib/qys/QysInitial"; + +export type DengOrChongniu = "一" | "二" | "三" | "四" | "A" | "B"; +export enum Kaihe { + Open = "開", + Closed = "合", +} + +export enum Tone { + 平 = "平", + 上 = "上", + 去 = "去", + 入 = "入", +} + +export interface QysSyllableProfile { + initial: QysInitial; + dengOrChongniu: DengOrChongniu | null; + kaihe: Kaihe | null; + tone: Tone; + cycleHead: QieyunRhymeCycleHead; +} diff --git a/app/lib/qys/inferOnyomi.test.ts b/app/lib/qys/inferOnyomi.test.ts index 2631ee01c..f20cf49e1 100644 --- a/app/lib/qys/inferOnyomi.test.ts +++ b/app/lib/qys/inferOnyomi.test.ts @@ -1,11 +1,9 @@ import { InferredOnyomiType, - Kaihe, - QysSyllableProfile, - Tone, inferOnyomi, toModernKatakana, } from "./inferOnyomi"; +import { Kaihe, QysSyllableProfile, Tone } from "./QysSyllableProfile"; describe("getAttestedOnFinals", () => { it("infers onyomi from 生 in default format", () => { diff --git a/app/lib/qys/inferOnyomi.ts b/app/lib/qys/inferOnyomi.ts index 395f09079..9402cbb4c 100644 --- a/app/lib/qys/inferOnyomi.ts +++ 
b/app/lib/qys/inferOnyomi.ts @@ -3,29 +3,9 @@ import { attestedFinals, getCategoriesBySpecificityDescending, } from "~/lib/qys/attestedOnFinals"; -import type { QieyunRhymeCycleHead } from "~/lib/qys/QieyunRhymeCycleHead"; import type { QysInitial } from "~/lib/qys/QysInitial"; -type DengOrChongniu = "一" | "二" | "三" | "四" | "A" | "B"; -export enum Kaihe { - Open = "開", - Closed = "合", -} - -export enum Tone { - 平 = "平", - 上 = "上", - 去 = "去", - 入 = "入", -} - -export interface QysSyllableProfile { - initial: QysInitial; - dengOrChongniu: DengOrChongniu | null; - kaihe: Kaihe | null; - tone: Tone; - cycleHead: QieyunRhymeCycleHead; -} +import { QysSyllableProfile, Kaihe, Tone } from "./QysSyllableProfile"; export type ClassifiedOnyomi = | { diff --git a/app/lib/qys/transcribeXiaoyun.ts b/app/lib/qys/transcribeXiaoyun.ts index 9b514a6dc..2caeec8c7 100644 --- a/app/lib/qys/transcribeXiaoyun.ts +++ b/app/lib/qys/transcribeXiaoyun.ts @@ -6,7 +6,7 @@ import { QysInitial, } from "~/lib/qys/QysInitial"; -import { Kaihe, QysSyllableProfile } from "./inferOnyomi"; +import { Kaihe, QysSyllableProfile } from "./QysSyllableProfile"; export interface QysTranscriptionProfile { is合口: boolean; @@ -44,8 +44,9 @@ const asciiFinals = { won: "won", on: "on", an: "an", + wan: "wan", au: "au", - ẃa: "ywa", + wȧ: "wia", wa: "wa", ya: "ya", a: "a", @@ -72,6 +73,7 @@ const asciiFinals = { wạ: "rwa", ạ: "ra", wâng: "wvang", + ŷang: "vyang", wẹng: "rweng", ẹng: "reng", wạng: "rwang", @@ -80,14 +82,14 @@ const asciiFinals = { ạ̈ng: "raeng", äm: "aem", ạm: "ram", - wèi: "waei", - èi: "aei", - èm: "aem", - wèn: "waen", - èn: "aen", - èu: "aeu", - wèng: "waeng", - èng: "aeng", + wei: "wei", + ei: "ei", + em: "em", + wen: "wen", + en: "en", + eu: "eu", + weng: "weng", + eng: "eng", uï: "uie", ẁï: "ywie", wï: "wie", @@ -107,12 +109,12 @@ const asciiFinals = { u: "u", yu: "yu", ẁei: "ywei", - wei: "wei", + wėi: "wiei", yei: "yei", - ei: "ei", + ėi: "iei", âi: "vai", yeu: "yeu", - eu: "eu", + ėu: 
"ieu", ū: "uu", iū: "iuu", ông: "vong", @@ -129,29 +131,30 @@ const asciiFinals = { wên: "wven", ên: "ven", ẁen: "ywen", - wen: "wen", + wėn: "wien", yen: "yen", - en: "en", - âng: "wvang", + ėn: "ien", + âng: "vang", yang: "yang", ẁeng: "yweng", - weng: "weng", + wėng: "wieng", yeng: "yeng", - eng: "eng", + ėng: "ieng", wĭng: "wcing", ŷŏng: "vycong", yŏng: "ycong", yim: "yim", im: "im", yem: "yem", - em: "em", + ėm: "iem", êm: "vem", âm: "vam", }; const rhymes: Record< QieyunRhymeCycleHead, - string | ((syllable: QysTranscriptionProfile) => string) + | keyof typeof asciiFinals + | ((syllable: QysTranscriptionProfile) => keyof typeof asciiFinals) > = { 東: (s) => { if (s.tone聲 === "入" && s.contrastiveRow等 === "三") { @@ -175,10 +178,10 @@ const rhymes: Record< 咍: "ai", 魂: "won", 痕: "on", - 寒: "an", + 寒: (s) => (s.is合口 ? "wan" : "an"), 豪: "au", 歌: (s) => { - if (s.is合口) return s.contrastiveRow等 === "三" ? "ẃa" : "wa"; + if (s.is合口) return s.contrastiveRow等 === "三" ? "wȧ" : "wa"; return s.contrastiveRow等 === "三" ? "ya" : "a"; }, 唐: (s) => (s.is合口 ? "wang" : "ang"), @@ -208,11 +211,11 @@ const rhymes: Record< 銜: "ạm", // 四等韻 - 齊: (s) => (s.is合口 ? "wèi" : "èi"), - 先: (s) => (s.is合口 ? "wèn" : "èn"), - 蕭: "èu", - 青: (s) => (s.is合口 ? "wèng" : "èng"), - 添: "èm", + 齊: (s) => (s.is合口 ? "wei" : "ei"), + 先: (s) => (s.is合口 ? "wen" : "en"), + 蕭: "eu", + 青: (s) => (s.is合口 ? "weng" : "eng"), + 添: "em", // 三等陰聲韻 支: (s) => { @@ -285,14 +288,14 @@ const rhymes: Record< initialGroups["見"].has(s.canonical母) || initialGroups["影"].has(s.canonical母))) ? "ẁei" - : "wei"; + : "wėi"; return s.canonical母 === "以" || (s.is重紐A類 && (initialGroups["幫"].has(s.canonical母) || initialGroups["見"].has(s.canonical母) || initialGroups["影"].has(s.canonical母))) ? "yei" - : "ei"; + : "ėi"; }, 廢: "âi", 宵: (s) => @@ -302,7 +305,7 @@ const rhymes: Record< initialGroups["見"].has(s.canonical母) || initialGroups["影"].has(s.canonical母))) ? "yeu" - : "eu", + : "ėu", 尤: (s) => (initialGroups["幫"].has(s.canonical母) ? 
"ū" : "iū"), 幽: "iu", @@ -352,17 +355,22 @@ const rhymes: Record< initialGroups["見"].has(s.canonical母) || initialGroups["影"].has(s.canonical母))) ? "ẁen" - : "wen"; + : "wėn"; return s.canonical母 === "以" || (s.is重紐A類 && (initialGroups["幫"].has(s.canonical母) || initialGroups["見"].has(s.canonical母) || initialGroups["影"].has(s.canonical母))) ? "yen" - : "en"; + : "ėn"; }, 陽: (s) => { - if (s.is合口) return "wâng"; + if (s.is合口) + return s.canonical母 === "影" || + s.canonical母 === "以" || + s.canonical母 === "云" + ? "wâng" + : "ŷang"; return initialGroups["幫"].has(s.canonical母) || initialGroups["莊"].has(s.canonical母) ? "âng" @@ -376,14 +384,14 @@ const rhymes: Record< initialGroups["見"].has(s.canonical母) || initialGroups["影"].has(s.canonical母))) ? "ẁeng" - : "weng"; + : "wėng"; return s.canonical母 === "以" || (s.is重紐A類 && (initialGroups["幫"].has(s.canonical母) || initialGroups["見"].has(s.canonical母) || initialGroups["影"].has(s.canonical母))) ? "yeng" - : "eng"; + : "ėng"; }, 蒸: (s) => { if (s.is合口) return s.canonical母 === "云" ? "wĭng" : "ŷŏng"; @@ -404,7 +412,7 @@ const rhymes: Record< initialGroups["見"].has(s.canonical母) || initialGroups["影"].has(s.canonical母))) ? "yem" - : "em", + : "ėm", 嚴: "êm", 凡: "âm", }; diff --git a/app/root.tsx b/app/root.tsx index 630ce7feb..ae2c6fd87 100644 --- a/app/root.tsx +++ b/app/root.tsx @@ -68,7 +68,7 @@ export function ErrorBoundary() { Error {error.status}: {error.statusText} ) : ( - "Something went wrong" + "Something went wrong on the server. Please try again later." 
) } /> diff --git a/app/routes/browse.sound-components.tsx b/app/routes/browse.sound-components.tsx index 32568c3f1..945d522ba 100644 --- a/app/routes/browse.sound-components.tsx +++ b/app/routes/browse.sound-components.tsx @@ -17,7 +17,6 @@ import { } from "~/components/AppLink"; import DictionaryLayout from "~/components/DictionaryLayout"; import A from "~/components/ExternalLink"; -import { FigureBadge } from "~/components/FigureBadge"; import { prisma } from "~/db.server"; import CollapsibleInfoSection from "~/features/browse/CollapsibleInfoSection"; import { abbreviateTranscriptions } from "~/features/dictionary/abbreviateTranscriptions"; @@ -31,6 +30,8 @@ import { transcribeSerializedXiaoyunProfile, } from "~/features/dictionary/getActiveSoundMarkValueText"; +import { FigureBadgeLink } from "../components/FigureBadgeLink"; + type LoaderData = Awaited>; const groupsThresholds = [10, 8, 7, 5, 4, 3, 2, 1]; @@ -552,20 +553,6 @@ export default function FigureDetailsPage() { ); } -function FigureBadgeLink({ - id: figureId, - badgeProps, -}: { - id: string; - badgeProps: BadgeProps; -}) { - return ( - - - - ); -} - export function ErrorBoundary() { const error = useRouteError(); diff --git a/app/routes/curate.$courseId.tsx b/app/routes/curate.$courseId.tsx new file mode 100644 index 000000000..cd03adb43 --- /dev/null +++ b/app/routes/curate.$courseId.tsx @@ -0,0 +1,1279 @@ +import type { BaseCorpusText, KanjisenseFigure } from "@prisma/client"; +import { useActionData, useLoaderData, useSubmit } from "@remix-run/react"; +import { + ActionFunctionArgs, + LoaderFunctionArgs, + json, +} from "@remix-run/server-runtime"; +import { + Fragment, + ReactNode, + useEffect, + useMemo, + useRef, + useState, +} from "react"; + +import { FigureBadgeLink } from "~/components/FigureBadgeLink"; +import { prisma } from "~/db.server"; +import { CharactersProgress } from "~/features/curate/CharactersProgress"; +import { + BadgePropsFigure, + getBadgeProps, + isAtomicFigure, +} from 
"~/features/dictionary/badgeFigure"; + +import { + CurationState, + getCurationState, +} from "../features/curate/getCurationState"; + +export const action = async ({ request }: ActionFunctionArgs) => { + const formData = await request.formData(); + + const seenCharactersParam = formData.get("sc") as string; + const seenTextsParam = formData.get("st") as string; + const wantedCharacters = formData.get("wc") as string; + const normalizedTextSearchQuery = formData.get("ntsq") as string; + const authors = (formData.get("a") as string)?.split(",")?.filter((a) => a); + const sources = + (formData.get("s") as string)?.split(",")?.filter((a) => a) || []; + const lengthRangeParam = formData.get("l") as string; + const courseId = (formData.get("courseId") as string) || "kj"; + + const seenTexts = JSON.parse(seenTextsParam) as string[][]; + + const invalidParams = []; + if (typeof seenCharactersParam !== "string") invalidParams.push("seenChars"); + const seenTextsIds = seenTexts.flatMap((keys) => + keys.map((k) => hashString(k)), + ); + if (seenTextsIds.some(Number.isNaN)) invalidParams.push("seenTexts"); + const lengthRange = lengthRangeParam?.split("-").map(Number) as [ + number, + number, + ]; + if (!lengthRange?.every((n) => !Number.isNaN(n))) + invalidParams.push("length"); + if (invalidParams.length) + throw new Error("Invalid params: " + invalidParams.join(", ")); + + const updatedCourse = await prisma.course.update({ + where: { + id: courseId, + }, + data: { + seenTexts, + wantedCharacters: wantedCharacters || "", + minLength: lengthRange[0], + maxLength: lengthRange[1], + authors: authors || [], + sources: sources || [], + normalizedTextSearchQuery: normalizedTextSearchQuery || "", + }, + }); + console.log(`Updated course ${courseId}`); + console.log({ + updated: { + wantedCharacters: updatedCourse.wantedCharacters, + minLength: updatedCourse.minLength, + maxLength: updatedCourse.maxLength, + }, + }); + + return json({ + courseId, + }); +}; + +export const loader = 
async ({ request, params }: LoaderFunctionArgs) => { + const courseId = params.courseId!; + + const queryStringParams = new URL(request.url).searchParams; + const page = queryStringParams.get("p") + ? parseInt(queryStringParams.get("p")!) + : 1; + + const { + course, + seenTexts, + seenCharacters, + seenFigures, + defaultTangReadings, + remainingKanjisenseCharacters, + remainingMeaningfulComponents, + allFiguresKeys, + unseenTexts, + textGroups, + textGroupsCount, + } = await getCurationState(courseId, page); + + return json({ + course, + page, + seenTexts, + seenCharacters, + seenFigures, + defaultTangReadings, + remainingKanjisenseCharacters, + remainingMeaningfulComponents, + + allFiguresKeys, + + unseenTexts, + textGroups, + count: textGroupsCount, + + priorityFiguresIds: await prisma.kanjisenseFigure + .findMany({ + select: { + id: true, + }, + where: { + isPriority: true, + }, + }) + .then((figures) => figures.map((f) => f.id)), + }); +}; + +export default function CuratePage() { + const actionData = useActionData(); + const loaderData = useLoaderData(); + + const { + course, + defaultTangReadings, + remainingKanjisenseCharacters, + remainingMeaningfulComponents, + allFiguresKeys, + seenTexts, + seenCharacters, + seenFigures, + unseenTexts, + priorityFiguresIds, + } = loaderData; + const allFiguresKeysSet = useMemo( + () => new Set(allFiguresKeys), + [allFiguresKeys], + ); + + const seenFiguresMap = useMemo( + () => new Map(seenFigures.map((c) => [c.id, c])), + [seenFigures], + ); + const remainingComponentsMap = useMemo( + () => new Map(remainingMeaningfulComponents.map((c) => [c.id, c])), + [remainingMeaningfulComponents], + ); + const remainingKanjisenseCharactersMap = useMemo( + () => new Map(remainingKanjisenseCharacters.map((c) => [c.id, c])), + [remainingKanjisenseCharacters], + ); + + const { + seenTextsFlat, + seenTextsState, + addToSeenTexts, + removeFromSeenTexts, + moveWithinSeenTexts, + runningSeenCharacters, + componentsToFirstSighting, + 
handleSubmit, + setFilterState, + filterState, + + setTextGroupDescription, + insertTextGroup, + removeTextGroup, + moveTextGroup, + } = useSeenTextsState(course, seenTexts, seenCharacters, unseenTexts); + + const [mouseovered, setMouseovered] = useState>(new Set()); + function getOnMouseoverText(id: number) { + return () => { + console.log("mouseover", id); + setMouseovered((s) => new Set(s).add(id)); + }; + } + + const wantedCharactersSet = useMemo( + () => new Set(filterState.wantedCharacters), + [filterState.wantedCharacters], + ); + const seenCharactersSet = useMemo( + () => new Set(seenCharacters.map((c) => c.id)), + [seenCharacters], + ); + const priorityCharactersSet = useMemo( + () => new Set(priorityFiguresIds), + [priorityFiguresIds], + ); + + const seenMeaningfulFigures = seenFigures.filter((char) => { + const figure = char; + return figure.isPriority; + }); + const seenMeaningfulAtomicComponents = + seenMeaningfulFigures.filter(isFigureAtomic); + const nonAtomicCharactersSeenOnlyAsComponents = new Set( + remainingKanjisenseCharacters.flatMap((c) => { + return seenMeaningfulFigures + .filter((sc) => sc.id === c.id && !isFigureAtomic(sc)) + .map((sc) => sc.id); + }), + ); + const atomicCharactersSeenOnlyAsComponents = new Set( + remainingKanjisenseCharacters.flatMap((c) => { + return seenMeaningfulAtomicComponents + .filter((sc) => sc.id === c.id) + .map((sc) => sc.id); + }), + ); + + return ( +

    +
    + {actionData?.courseId ? ( +

    Updated course {actionData.courseId}

    + ) : null} +

    {seenTextsFlat.length} seen texts

    +
    + { + seenTextsState.reduce( + (acc, group, groupIndex) => { + acc.nodes.push( +
    +

    + group {groupIndex + 1} + + {!group.texts.length ? ( + + ) : null} + + +
    + {group.texts.length} texts +

    +
    + +
    +
    + {group.texts.map((textKey, seenTextIndex) => { + acc.runningTotalTexts++; + + const seenText = + seenTextsFlat.find((t) => t.key === textKey) || + loaderData?.unseenTexts.find( + (t) => t.key === textKey, + ); + if (!seenText) return null; + + const tg = loaderData?.textGroups.find( + (tg) => tg.baseCorpusTextId === seenText.id, + ); + // const unseenCharactersRegex = new RegExp( + // `([^${[...runningSeenCharacters[textKey]].join("")}])`, + // "g", + // ); + + return ( +
    + {tg ? ( +

    + unique chars:{" "} + {tg.baseCorpusUniqueCharactersCount};{" "} + components: {tg.baseCorpusUniqueComponentsCount} + ; total length: {tg.baseCorpusTextLength} score:{" "} + {tg._sum?.frequencyScore?.toLocaleString( + "en-US", + )} +

    + ) : null} +

    + #{acc.runningTotalTexts} {seenText.author} -{" "} + {seenText.title} ({seenText.source}) +

    + {seenText.text} +
    + + + +
    +
    + {/*

    $&', + ), + }} + /> */} + +

    + { + const seen = seenFiguresMap.get(id); + if (!seen) return null; + if ( + componentsToFirstSighting.get(id) + ?.textKey !== seenText.key + ) + return null; + return seen || null; + }} + newAtomicCharactersSeenOnlyAsComponents={ + // get characters from uniqueCharacters which are NOT in runningSeenCharacters + // and which are NOT in seenFigures + new Set( + seenText.uniqueCharacters.flatMap( + ({ character }) => { + if ( + !runningSeenCharacters[textKey].has( + character, + ) + ) { + const figureFirstSighting = + componentsToFirstSighting.get( + character, + ); + const firstSightingWasBeforeThisText = + figureFirstSighting && + (figureFirstSighting.textGroupIndex < + groupIndex || + (figureFirstSighting.textGroupIndex === + groupIndex && + figureFirstSighting.textIndex < + seenTextIndex)); + if ( + firstSightingWasBeforeThisText + ) { + const figure = + seenFiguresMap.get(character) || + null; + return figure && + isAtomicFigure(figure) + ? [character] + : []; + } + } + return []; + }, + ), + ) + } + newNonAtomicCharactersSeenOnlyAsComponents={ + new Set( + seenText.uniqueCharacters.flatMap( + ({ character }) => { + if ( + !runningSeenCharacters[textKey].has( + character, + ) + ) { + const figureFirstSighting = + componentsToFirstSighting.get( + character, + ); + const firstSightingWasBeforeThisText = + figureFirstSighting && + (figureFirstSighting.textGroupIndex < + groupIndex || + (figureFirstSighting.textGroupIndex === + groupIndex && + figureFirstSighting.textIndex < + seenTextIndex)); + + if ( + firstSightingWasBeforeThisText + ) { + const figure = + seenFiguresMap.get(character) || + null; + return figure && + !isAtomicFigure(figure) + ? [character] + : []; + } + } + return []; + }, + ), + ) + } + /> +
    +
    +
    + ); + })} +
    +
    +
    , + ); + return acc; + }, + { + nodes: [] as ReactNode[], + runningTotalTexts: 0, + }, + ).nodes + } +
    +
    +
    +
    + + setFilterState((s) => ({ + ...s, + wantedCharacters: [...new Set(e.target.value)].join(""), + })) + } + /> +
    + + { + const number = parseInt(e.target.value); + if ( + !Number.isNaN(number) && + number <= filterState.lengthRange[1] && + number >= 0 + ) { + setFilterState((s) => ({ + ...s, + lengthRange: [number, s.lengthRange[1]], + })); + } + }} + /> + { + const number = parseInt(e.target.value); + if ( + !Number.isNaN(number) && + number >= filterState.lengthRange[0] + ) { + setFilterState((s) => ({ + ...s, + lengthRange: [s.lengthRange[0], number], + })); + } + }} + /> +
    + { + setFilterState((s) => ({ + ...s, + authors: e.target.value.split(","), + })); + }} + /> +
    + { + setFilterState((s) => ({ + ...s, + sources: e.target.value.split(","), + })); + }} + /> +
    + { + setFilterState((s) => ({ + ...s, + normalizedTextSearchQuery: e.target.value, + })); + }} + /> +
    + +
    + + {loaderData ? ( + () => { + console.log("clicked figure"); + }} + /> + ) : null} +
    +
    +
    {loaderData?.count} texts total
    + {loaderData.count > loaderData.unseenTexts.length ? ( +
    + page {loaderData.page} of {Math.ceil(loaderData.count / 500)}:{" "} + {Array.from( + Array(Math.ceil(loaderData.count / 500)).keys(), + (i) => ( + + + {i + 1} + {" "} + + ), + )} +
    + ) : null} +
    + {loaderData?.unseenTexts.length} +
    + {loaderData?.textGroups.map((tg, tgi) => { + const ti = loaderData.unseenTexts.findIndex( + (t) => t.id === tg.baseCorpusTextId, + )!; + const unseenText = loaderData.unseenTexts[ti]; + return ( + // eslint-disable-next-line jsx-a11y/mouse-events-have-key-events +
    +

    + #{tgi + 1} unique chars ={" "} + + {tg.baseCorpusUniqueCharactersCount} + {" "} + components ={" "} + + {tg.baseCorpusUniqueComponentsCount} + {" "} +
    + non-priority chars ={" "} + + {tg.baseCorpusTextNonPriorityCharactersCount} + {" "} +
    + length = {tg.baseCorpusTextLength}; score ={" "} + {tg._sum?.frequencyScore?.toLocaleString("en-US")} +

    +

    + {unseenText.author} - {unseenText.title} ({unseenText.source}) +
    +

    +

    {unseenText.text}

    +
    + + +
    + {mouseovered.has(unseenText.id) ? ( +
    + +
    + + remainingComponentsMap.get(id) || null + } + newAtomicCharactersSeenOnlyAsComponents={ + new Set( + unseenText.uniqueCharacters.flatMap( + ({ character }) => + atomicCharactersSeenOnlyAsComponents.has( + character, + ) + ? [character] + : [], + ), + ) + } + newNonAtomicCharactersSeenOnlyAsComponents={ + new Set( + unseenText.uniqueCharacters.flatMap( + ({ character }) => + nonAtomicCharactersSeenOnlyAsComponents.has( + character, + ) && + remainingKanjisenseCharactersMap.get(character) + ? [character] + : [], + ), + ) + } + /> +
    +
    + ) : ( +

    {unseenText.normalizedText}

    + )} +
    + ); + })} +
    +
    +
    + ); +} + +function ColoredCharactersByInterest({ + normalizedText, + // textUniqueCharacters, + wantedCharacters, + seenCharacters, + priorityFiguresIds, + defaultTangReadings, +}: { + // textUniqueCharacters: { figureId: string | null }[]; + normalizedText: string; + wantedCharacters: Set; + seenCharacters: Set; + priorityFiguresIds: Set; + defaultTangReadings?: string; +}) { + const tangReadingsArray = defaultTangReadings?.split(" "); + let factor: number | null = null; + if (normalizedText.length % 5 === 0) factor = 5; + else if (normalizedText.length % 7 === 0) factor = 7; + + return ( +
    + {Array.from(normalizedText, (figureId, i) => { + let className = ""; + if (wantedCharacters.has(figureId) && seenCharacters.has(figureId)) + className = "bg-[#bbffff] "; + else if (wantedCharacters.has(figureId)) className = "bg-[#00ffff] "; + else if (seenCharacters.has(figureId)) className = "text-gray-300 "; + else if (!priorityFiguresIds.has(figureId)) + className = "text-amber-700/60 "; + else className = "text-black "; + let punctuation = ""; + if (factor && (i + 1) % (factor * 2) === 0) { + punctuation = "。"; + if (factor && (i + 1) % (factor * 4) === 0) punctuation += "\n"; + } else if (factor && (i + 1) % factor === 0) { + punctuation = ","; + } + if (tangReadingsArray) { + return ( +
    +
    + {tangReadingsArray[i]?.replaceAll("/", "\n")}{" "} +
    + {figureId} + {punctuation} +
    + ); + } + + if (className || punctuation) { + return ( + + {figureId} + {punctuation} + + ); + } + + return figureId; + })} +
    + ); +} +function TextUniqueComponents({ + text, + getFigure, + newAtomicCharactersSeenOnlyAsComponents, + newNonAtomicCharactersSeenOnlyAsComponents, +}: { + text: ( + | CurationState["unseenTexts"] + | CurationState["seenTexts"][number] + )[number]; + getFigure: (figureId: string) => BadgePropsFigure | null; + newAtomicCharactersSeenOnlyAsComponents: Set; + newNonAtomicCharactersSeenOnlyAsComponents: Set; +}) { + const newAtomic: React.ReactNode[] = []; + const newNonAtomic: React.ReactNode[] = []; + + text.uniqueComponents.forEach((c) => { + const figure = getFigure(c.figureId); + const badgeProps = figure && getBadgeProps(figure); + const newNode = !badgeProps ? null : ( +
    + +
    + ); + + if (figure && newNode) { + const isAtomic = isAtomicFigure(figure); + if (isAtomic) { + newAtomic.push(newNode); + } else { + newNonAtomic.push(newNode); + } + } + }); + + return ( + <> + {newAtomic.length ? ( + <> + {newAtomic.length} atomic: {newAtomic} + + ) : ( + <>no new atomic figures + )} +
    + {newAtomicCharactersSeenOnlyAsComponents.size ? ( + <> +
    + {newAtomicCharactersSeenOnlyAsComponents.size} atomic newly seen as + character: {Array.from(newAtomicCharactersSeenOnlyAsComponents)} + + ) : null} +
    + {newNonAtomic.length ? ( + <> + {newNonAtomic.length} compound: {newNonAtomic} + + ) : ( + <>no new non-atomic figures + )} + {newNonAtomicCharactersSeenOnlyAsComponents.size ? ( + <> +
    + {newNonAtomicCharactersSeenOnlyAsComponents.size} compound newly seen + as character: {Array.from(newNonAtomicCharactersSeenOnlyAsComponents)} + + ) : null} + + ); +} + +function hashString(string: string) { + let hash = 0, + i, + chr; + if (string.length === 0) return hash; + for (i = 0; i < string.length; i++) { + chr = string.charCodeAt(i); + hash = (hash << 5) - hash + chr; + hash |= 0; + } + return hash; +} + +function CopyYmlButton({ + text, + defaultTangReadings, +}: { + text: BaseCorpusText & { + uniqueCharacters: { figureId: string | null }[]; + }; + defaultTangReadings?: string; +}) { + const { author, title, source } = text; + return ( + + ); +} + +type SeenTextsState = { + texts: TextId[]; + description: string; +}[]; + +type TextId = string; + +const useSeenTextsState = ( + course: CurationState["course"], + seenTexts: CurationState["seenTexts"], + seenCharacters: CurationState["seenCharacters"], + unseenTexts: CurationState["unseenTexts"], +) => { + const seenTextsFlat = useMemo(() => seenTexts.flat() || [], [seenTexts]); + + const [seenTextsState, setSeenTextsState] = useState(() => + ((course?.seenTexts || []) as string[][]).map((texts) => ({ + texts, + description: "", + })), + ); + + const storageKey = useRef(null); + useEffect(() => { + const newStorageKey = `seenTextsGroupsDescriptions-${course.id}`; + if (storageKey.current !== newStorageKey) { + storageKey.current = newStorageKey; + const descriptionsFromLocalStorage: string[] = localStorage.getItem( + newStorageKey, + ) + ? JSON.parse(localStorage.getItem(newStorageKey)!) 
+ : []; + setSeenTextsState( + seenTextsState.map((group, i) => ({ + ...group, + description: descriptionsFromLocalStorage[i] || "", + })), + ); + } else { + localStorage.setItem( + newStorageKey, + JSON.stringify(seenTextsState.map((g) => g.description)), + ); + } + }, [course.id, seenTextsState, storageKey]); + + useEffect(() => { + Object.assign( + window as unknown as { + summary: { + groupNumber: number; + description: string; + texts: string[]; + }[]; + getPrettySummary: () => string; + }, + { + summary: seenTextsState.map((group, i) => ({ + groupNumber: i + 1, + description: group.description, + texts: group.texts, + })), + getPrettySummary: () => + seenTexts + .flatMap((group, groupIndex) => { + const description = seenTextsState[groupIndex]?.description || ""; + return [ + `# ${groupIndex + 1}. ${description}`, + "", + ...group.map((text, textIndex) => { + return [ + `## ${textIndex + 1}. (${text.author} - ${text.title} (${ + text.source + }))`, + "", + text.normalizedText, + "", + text.text, + "", + "", + ].join("\n"); + }), + ]; + }) + .join("\n"), + }, + ); + }); + + const addToSeenTexts = ({ + textKey, + defaultGroupNumber, + }: { + textKey: string; + defaultGroupNumber?: number; + }) => { + const existingGroupIndex = seenTextsState.findIndex((group) => + group.texts.includes(textKey), + ); + if (existingGroupIndex !== -1) { + alert(`Text already in group ${existingGroupIndex + 1}`); + return; + } + const groupNumberInput = prompt( + "Which group to add text to?", + String( + defaultGroupNumber ?? + 1 + (seenTextsState.length ? seenTextsState.length - 1 : 0), + ), + ); + const groupNumber = groupNumberInput ? parseInt(groupNumberInput) : NaN; + if (!Number.isNaN(groupNumber)) { + const groupIndex = groupNumber - 1; + const newSeenTexts = [ + ...seenTextsState.slice(0, groupIndex), + seenTextsState[groupIndex] + ? 
{ + ...seenTextsState[groupIndex], + texts: [...seenTextsState[groupIndex].texts, textKey], + } + : { texts: [textKey], description: "" }, + ...seenTextsState.slice(groupIndex + 1), + ]; + setSeenTextsState(newSeenTexts); + } + }; + + const removeFromSeenTexts = (textKey: string) => { + const newSeenTexts = seenTextsState.map((group) => ({ + ...group, + texts: group.texts.filter((t) => t !== textKey), + })); + setSeenTextsState(newSeenTexts); + }; + const moveWithinSeenTexts = ( + textKey: string, + defaultGroupNumber?: number, + ) => { + const groupNumberInput = prompt( + "Which group to move text to?", + String( + defaultGroupNumber ?? + 1 + (seenTextsState.length ? seenTextsState.length - 1 : 0), + ), + ); + const newGroupNumber = groupNumberInput + ? Math.min(parseInt(groupNumberInput), seenTextsState.length) + : NaN; + if (!Number.isNaN(newGroupNumber)) { + const newGroupIndex = newGroupNumber - 1; + const newSeenTexts = seenTextsState.map((group) => ({ + ...group, + texts: group.texts.filter((t) => t !== textKey), + })); + newSeenTexts[newGroupIndex] = { + ...newSeenTexts[newGroupIndex], + texts: [...(newSeenTexts[newGroupIndex]?.texts || []), textKey], + }; + setSeenTextsState(newSeenTexts); + } + }; + + const runningSeenCharacters = useMemo>>(() => { + let runningTotal = new Set(); + return Object.fromEntries( + seenTextsState.flatMap((g) => { + return g.texts.map((t) => { + const text = + seenTextsFlat.find((t2) => t2.key === t) || + unseenTexts.find((t2) => t2.key === t); + + const oldSeenChars = new Set(runningTotal); + if (!text) return [t, oldSeenChars]; + runningTotal = new Set([ + ...runningTotal, + ...text.uniqueCharacters.flatMap((c) => c.figureId || []), + ]); + return [t, oldSeenChars]; + }); + }), + ); + }, [seenTextsState, seenTextsFlat, unseenTexts]); + + const componentsToFirstSighting = useMemo< + Map< + string, + { + textGroupIndex: number; + textIndex: number; + textKey: string; + } + > + >(() => { + const seenSoFar = new Set(); + const 
map = new Map< + string, + { textGroupIndex: number; textIndex: number; textKey: string } + >(); + + seenTextsState.forEach(({ texts }, textGroupIndex) => { + texts.forEach((textKey, textIndex) => { + const text = seenTextsFlat.find((t) => t.key === textKey); + if (!text) return; + text.uniqueCharacters.forEach((c) => { + if (!c.figureId) return; + if (seenSoFar.has(c.figureId)) return; + seenSoFar.add(c.figureId); + map.set(c.figureId, { textGroupIndex, textIndex, textKey }); + }); + text.uniqueComponents.forEach((c) => { + if (seenSoFar.has(c.figureId)) return; + seenSoFar.add(c.figureId); + map.set(c.figureId, { textGroupIndex, textIndex, textKey }); + }); + }); + }); + return map; + }, [seenTextsState, seenTextsFlat]); + + const [filterState, setFilterState] = useState({ + authors: course.authors || [], + sources: course.sources || [], + wantedCharacters: course.wantedCharacters || "", + normalizedTextSearchQuery: course.normalizedTextSearchQuery || "", + lengthRange: [course.minLength ?? 0, course.maxLength ?? 1000], + }); + + const submit = useSubmit(); + + function handleSubmit() { + const formData = new FormData(); + formData.append( + "sc", + seenCharacters.length ? 
seenCharacters.map((c) => c.id).join("") : "", + ); + formData.append("wc", filterState.wantedCharacters); + + formData.append("a", filterState.authors.filter((a) => a.trim()).join(",")); + formData.append("s", filterState.sources.filter((s) => s.trim()).join(",")); + + formData.append("ntsq", filterState.normalizedTextSearchQuery); + + formData.append("l", filterState.lengthRange.join("-")); + formData.append("courseId", course.id); + + formData.append( + "st", + // JSON.stringify(seenTextsState.map(({ texts }) => texts)) + JSON.stringify(seenTextsState.map(({ texts }) => texts) || []), + ); + + submit(formData, { method: "post" }); + } + + const setTextGroupDescription = (groupIndex: number, description: string) => { + const newSeenTextsState = [...seenTextsState]; + newSeenTextsState[groupIndex] = { + ...newSeenTextsState[groupIndex], + description, + }; + setSeenTextsState(newSeenTextsState); + }; + + const insertTextGroup = (newGroupIndex: number) => { + const newSeenTextsState = [...seenTextsState]; + newSeenTextsState.splice(newGroupIndex, 0, { + texts: [], + description: "", + }); + setSeenTextsState(newSeenTextsState); + }; + const removeTextGroup = (groupIndex: number) => () => { + const newSeenTextsState = [...seenTextsState]; + newSeenTextsState.splice(groupIndex, 1); + setSeenTextsState(newSeenTextsState); + }; + const moveTextGroup = (groupIndex: number) => { + const newGroupNumberString = prompt( + "Which position to move text group to?", + String(groupIndex + 1), + ); + const newGroupNumber = newGroupNumberString + ? 
parseInt(newGroupNumberString) + : null; + + if (newGroupNumber != null && !Number.isNaN(newGroupNumber)) { + const newGroupIndex = newGroupNumber - 1; + const group = seenTextsState[groupIndex]; + const newSeenTextsState = [...seenTextsState]; + newSeenTextsState.splice(groupIndex, 1); + newSeenTextsState.splice(newGroupIndex, 0, group); + + setSeenTextsState(newSeenTextsState); + console.log(newSeenTextsState); + } else if (newGroupNumberString != null) { + alert("Invalid group number"); + } + }; + + return { + seenTextsFlat, + seenTextsState, + addToSeenTexts, + removeFromSeenTexts, + moveWithinSeenTexts, + runningSeenCharacters, + componentsToFirstSighting, + handleSubmit, + setFilterState, + filterState, + setTextGroupDescription, + insertTextGroup, + removeTextGroup, + moveTextGroup, + }; +}; + +function TextGroupDescriptionInput({ + groupIndex, + setTextGroupDescription, + description, +}: { + groupIndex: number; + setTextGroupDescription: (groupIndex: number, description: string) => void; + description: string; +}) { + const [value, setValue] = useState(description); + const ref = useRef(null); + useEffect(() => { + setValue(description); + }, [description]); + return ( +
    + { + setValue(e.target.value); + }} + /> + +
    + ); +} + +function isFigureAtomic( + figure: Pick, +): boolean { + return Array.isArray(figure.componentsTree) + ? figure.componentsTree.length === 0 + : false; +} diff --git a/app/routes/dict.$figureId.tsx b/app/routes/dict.$figureId.tsx index 6590cc689..e95a799a3 100644 --- a/app/routes/dict.$figureId.tsx +++ b/app/routes/dict.$figureId.tsx @@ -93,16 +93,21 @@ export default function FigureDetailsPage() { ); } - const { searchedFigure: figure } = loaderData; - const variants = figure.variantGroup?.variants + const { searchedFigure } = loaderData; + const variants = searchedFigure.variantGroup?.variants .flatMap((vid) => { - return figure.variantGroup?.figures.find((f) => f.id === vid) || []; + return ( + searchedFigure.variantGroup?.figures.find((f) => f.id === vid) || [] + ); }) .map((f) => getBadgeProps(f)); return (
    - +
    ); diff --git a/app/routes/dict.middle-chinese.tsx b/app/routes/dict.middle-chinese.tsx index 3895e7355..4491bd523 100644 --- a/app/routes/dict.middle-chinese.tsx +++ b/app/routes/dict.middle-chinese.tsx @@ -477,7 +477,7 @@ export default function MiddleChinese() {
    - +
    diff --git a/prisma/external/seedKanjiDbVariants.ts b/prisma/external/seedKanjiDbVariants.ts index d735c067d..bdb251410 100644 --- a/prisma/external/seedKanjiDbVariants.ts +++ b/prisma/external/seedKanjiDbVariants.ts @@ -171,12 +171,15 @@ async function getkanjiDbOldStyleDbInput( deregisterOldAndNewVariants(dbInput, "紋"); //given traditional form doesnt seem valid deregisterOldAndNewVariants(dbInput, "棚"); - // 簾 and 廉 seem to be clearly distinguished in modern Japanese + + // 簾 and 廉 seem to be clearly distinguished in modern Japanese; + // must have actually been mistake for 廉 deregisterOldAndNewVariants(dbInput, "簾"); // we don't want 欲 to be considered a component // if its only usage as a component is in a variant of itself // and that variant isn't even a base character (from our lists of important kanji). deregisterOldAndNewVariants(dbInput, "欲"); + // "擔" as old variant for "栃" is probably a mistake deregisterOldAndNewVariants(dbInput, "栃"); diff --git a/prisma/kanjisense/findGuangyunEntriesByShinjitai.ts b/prisma/kanjisense/findGuangyunEntriesByShinjitai.ts index 70873c2fa..ddbff0e8d 100644 --- a/prisma/kanjisense/findGuangyunEntriesByShinjitai.ts +++ b/prisma/kanjisense/findGuangyunEntriesByShinjitai.ts @@ -24,6 +24,7 @@ const entrySourceOverrides: Partial> = { "GWS-U6EA5-VAR-003": ["溥"], "GWS-U5C03-VAR-001": ["尃"], 円: ["圎"], // was taking 元 in addition to 圎 + 万: ["萬"], // was taking 万 in addition to 萬 }; export async function findGuangyunEntriesByShinjitai( diff --git a/prisma/kanjisense/seedBaseCorpus.ts b/prisma/kanjisense/seedBaseCorpus.ts new file mode 100644 index 000000000..ecf4118d7 --- /dev/null +++ b/prisma/kanjisense/seedBaseCorpus.ts @@ -0,0 +1,283 @@ +import { createReadStream } from "fs"; +import readline from "readline"; + +import { type PrismaClient } from "@prisma/client"; + +import { + BaseCorpus, + CuratorCorpusText, +} from "~/features/curate/CuratorCorpusText"; + +import { executeAndLogTime } from "./executeAndLogTime"; + +const 
COURSE = "kj"; + +export async function seedCorpus(prisma: PrismaClient, corpusTextPath: string) { + const startTime = Date.now(); + + console.log("Seeding corpus"); + + const fileStream = createReadStream(corpusTextPath, "utf8"); + const rl = readline.createInterface({ + input: fileStream, + crlfDelay: Infinity, + }); + const corpusJson: BaseCorpus = {}; + for await (const line of rl) { + const poem = JSON.parse(line) as CuratorCorpusText; + corpusJson[poem.normalizedText] = poem; + } + await fileStream.close(); + + await executeAndLogTime("Deleting all baseCorpusTexts", () => + prisma.baseCorpusText.deleteMany({ + where: { + course: COURSE, + }, + }), + ); + console.log("Creating baseCorpusTexts..."); + + const priorityFiguresKeys = await prisma.kanjisenseFigure + .findMany({ + select: { id: true }, + where: { isPriority: true }, + }) + .then((figures) => new Set(figures.map(({ id }) => id))); + const priorityComponentsKeys = await prisma.kanjisenseFigure + .findMany({ + select: { id: true }, + where: { isPriorityComponent: true, isPriority: true }, + }) + .then((figures) => new Set(figures.map(({ id }) => id))); + const allFiguresWithTrees = await prisma.kanjisenseFigure + .findMany({ + select: { id: true, componentsTree: true }, + }) + .then((figures) => { + const figuresMap = new Map< + string, + [parent: string, component: string][] | null + >(); + for (const { id, componentsTree } of figures) { + figuresMap.set( + id, + (componentsTree as unknown as [ + parent: string, + component: string, + ][]) ?? 
null, + ); + } + return figuresMap; + }); + const allFiguresFrequencyScores = await prisma.kanjisenseFigure + .findMany({ + select: { id: true, aozoraAppearances: true }, + }) + .then((figures) => { + const figuresMap = new Map(); + for (const { id, aozoraAppearances } of figures) { + figuresMap.set(id, aozoraAppearances); + } + return figuresMap; + }); + + const figuresToUniquePriorityComponents = new Map(); + for (const [id] of allFiguresWithTrees) { + const components = getAllUniquePriorityComponents( + (id) => { + return allFiguresWithTrees.get(id) ?? null; + }, + priorityFiguresKeys, + priorityComponentsKeys, + id, + ); + figuresToUniquePriorityComponents.set(id, components); + } + + const totalTexts = Object.keys(corpusJson).length; + let seeded = 0; + await inBatchesOf(2000, Object.entries(corpusJson), async (batch) => { + const lengthCache = new Array(batch.length); + const nonPriorityCharactersCountCache = new Array(batch.length); + const hashCache = new Array(batch.length); + const uniquePriorityComponentsCache = new Array(batch.length); + const hashAndCache = (text: string, i: number) => { + if (hashCache[i]) return hashCache[i]; + const hash = hashString(text); + hashCache[i] = hash; + return hash; + }; + const getAllUniqueComponentsAndCache = (uniqueChars: string, i: number) => { + if (uniquePriorityComponentsCache[i]) + return uniquePriorityComponentsCache[i]; + const uniqueComponents = new Set(); + for (const char of uniqueChars) { + if (priorityFiguresKeys.has(char)) { + const components = figuresToUniquePriorityComponents.get(char) ?? 
[]; + for (const component of components) { + uniqueComponents.add(component); + } + } + } + uniquePriorityComponentsCache[i] = [...uniqueComponents]; + + return uniquePriorityComponentsCache[i]; + }; + + const priorityFigures = await prisma.kanjisenseFigure + .findMany({ + where: { + isPriority: true, + }, + }) + .then((figures) => new Set(figures.map(({ id }) => id))); + + const batchData = batch.map(([key, value], i) => { + lengthCache[i] = value.normalizedText.length; + let nonPriorityCharactersCount = 0; + for (const char of value.normalizedText) { + if (!priorityFigures.has(char)) nonPriorityCharactersCount++; + } + nonPriorityCharactersCountCache[i] = nonPriorityCharactersCount; + return { + id: hashAndCache(key, i), + key, + course: COURSE, + title: value.title, + author: value.author, + source: value.source, + section: value.section, + dynasty: value.dynasty, + urls: value.urls ?? [], + text: value.text, + + normalizedText: value.normalizedText, + normalizedLength: value.normalizedText.length, + nonPriorityCharactersCount, + }; + }); + const result = await prisma.baseCorpusText.createMany({ + data: batchData, + }); + + seeded += result.count; + console.log(`Seeded ${seeded} baseCorpusTexts of ${totalTexts}`); + + console.log("Seeding character relations..."); + + const characterUsagesData = batch.flatMap(([, { uniqueChars }], i) => { + const uniqueComponents = getAllUniqueComponentsAndCache(uniqueChars, i); + + return Array.from(uniqueChars, (character) => { + return { + character, + baseCorpusTextId: hashCache[i], + figureId: allFiguresWithTrees.has(character) ? character : null, + frequencyScore: allFiguresFrequencyScores.get(character) ?? 
0, + baseCorpusTextLength: lengthCache[i], + baseCorpusUniqueCharactersCount: uniqueChars.length, + baseCorpusUniqueComponentsCount: uniqueComponents.length, + baseCorpusTextNonPriorityCharactersCount: + nonPriorityCharactersCountCache[i], + }; + }); + }); + await prisma.characterUsagesOnBaseCorpusText.createMany({ + data: characterUsagesData, + }); + + console.log("Seeding component relations..."); + + const createdComponentUsages = + await prisma.componentUsagesOnBaseCorpusText.createMany({ + data: batch.reduce( + (all, [, { uniqueChars }], i) => { + const uniqueComponents = uniquePriorityComponentsCache[i]; + for (const figureId of uniqueComponents) { + all.push({ + figureId, + baseCorpusTextId: hashCache[i], + frequencyScore: allFiguresFrequencyScores.get(figureId) ?? 0, + baseCorpusTextLength: lengthCache[i], + baseCorpusUniqueCharactersCount: uniqueChars.length, + baseCorpusUniqueComponentsCount: uniqueComponents.length, + }); + } + return all; + }, + [] as { + figureId: string; + baseCorpusTextId: number; + frequencyScore: number; + baseCorpusTextLength: number; + baseCorpusUniqueCharactersCount: number; + baseCorpusUniqueComponentsCount: number; + }[], + ), + }); + console.log(createdComponentUsages.count, "component usages created"); + + const endTime = Date.now(); + const ms = endTime - startTime; + const seconds = ms / 1000; + console.log( + `Seeded character and component relations for batch in ${seconds.toFixed( + 3, + )} seconds.`, + ); + }); + + console.log(`Seeded ${seeded} baseCorpusTexts.}`); + const endTime = Date.now(); + const ms = endTime - startTime; + const seconds = ms / 1000; + console.log(`Finished in ${seconds.toFixed(3)} seconds.`); +} + +async function inBatchesOf( + count: number, + array: T[], + action: (batch: T[]) => Promise, +) { + for (let i = 0; i < array.length; i += count) { + const batch = array.slice(i, i + count); + await action(batch); + } +} + +function hashString(string: string) { + let hash = 0, + i, + chr; + if 
(string.length === 0) return hash; + for (i = 0; i < string.length; i++) { + chr = string.charCodeAt(i); + hash = (hash << 5) - hash + chr; + hash |= 0; + } + return hash; +} + +function getAllUniquePriorityComponents( + getComponentsTree: ( + key: string, + ) => [parent: string, component: string][] | null, + priorityFiguresKeys: Set, + priorityComponentsKeys: Set, + character: string, +) { + if (!priorityFiguresKeys.has(character)) return []; + const components = new Set(); + + const componentsTreeJson = getComponentsTree(character); + if (!componentsTreeJson?.length) { + components.add(character); + } else { + for (const [, component] of componentsTreeJson) { + if (priorityComponentsKeys.has(component)) components.add(component); + } + } + + return [...components]; +} diff --git a/prisma/schema.prisma b/prisma/schema.prisma index a29f804de..fc02151b4 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -1,5 +1,6 @@ generator client { - provider = "prisma-client-js" + provider = "prisma-client-js" + previewFeatures = ["fullTextSearch"] } datasource db { @@ -158,6 +159,9 @@ model KanjisenseFigure { allComponents KanjisenseComponent[] @relation("allComponents") searchProperties SearchPropertiesOnFigure[] + + baseCorpusUsesAsCharacter CharacterUsagesOnBaseCorpusText[] + baseCorpusUsesAsComponent ComponentUsagesOnBaseCorpusText[] } model KanjisenseComponent { @@ -326,3 +330,68 @@ enum FigureSearchPropertyType { TRANSLATION_ENGLISH MNEMONIC_ENGLISH } + +model BaseCorpusText { + id Int @id + + course String + + key String + title String? + author String? + source String + section String? + dynasty String? + urls String[] + text String + normalizedText String + normalizedLength Int + + nonPriorityCharactersCount Int @default(0) + + uniqueCharacters CharacterUsagesOnBaseCorpusText[] + uniqueComponents ComponentUsagesOnBaseCorpusText[] +} + +model CharacterUsagesOnBaseCorpusText { + character String + figureId String? + figure KanjisenseFigure? 
@relation(fields: [figureId], references: [id], onDelete: Cascade, onUpdate: Cascade) + baseCorpusTextId Int + baseCorpusText BaseCorpusText @relation(fields: [baseCorpusTextId], references: [id], onDelete: Cascade, onUpdate: Cascade) + + frequencyScore Int + baseCorpusTextLength Int + baseCorpusUniqueCharactersCount Int + baseCorpusUniqueComponentsCount Int + baseCorpusTextNonPriorityCharactersCount Int @default(0) + + @@id([baseCorpusTextId, character]) +} + +model ComponentUsagesOnBaseCorpusText { + figureId String + figure KanjisenseFigure @relation(fields: [figureId], references: [id], onDelete: Cascade, onUpdate: Cascade) + baseCorpusTextId Int + baseCorpusText BaseCorpusText @relation(fields: [baseCorpusTextId], references: [id], onDelete: Cascade, onUpdate: Cascade) + + frequencyScore Int + baseCorpusTextLength Int + baseCorpusUniqueCharactersCount Int + baseCorpusUniqueComponentsCount Int + + @@id([baseCorpusTextId, figureId]) +} + +model Course { + id String @id + + // only for curator + authors String[] @default([]) + sources String[] @default([]) + seenTexts Json? + normalizedTextSearchQuery String @default("") + wantedCharacters String @default("") + minLength Int? + maxLength Int? +}