From d46e45e92e1a08b13c9bd064afcae13d5214615c Mon Sep 17 00:00:00 2001 From: justinsilvestre Date: Sat, 9 Mar 2024 17:52:12 +0100 Subject: [PATCH] Add versioning fields, make migration including curator tables --- app/features/curate/getCurationState.ts | 6 +- app/isComponentFirstClass.ptest.ts | 18 ++-- app/routes/_index.tsx | 8 ++ app/routes/curate.$courseId.tsx | 36 ++++--- prisma/kanjisense/seedBaseCorpus.ts | 8 +- .../seedKanjisenseActiveSoundMarks.ts | 7 +- prisma/kanjisense/seedKanjisenseFigures.ts | 86 ++++++++-------- .../migration.sql | 99 +++++++++++++++++++ prisma/schema.prisma | 43 +++++--- prisma/seedScript.ts | 13 +++ 10 files changed, 238 insertions(+), 86 deletions(-) create mode 100644 prisma/migrations/20240309115908_add_versioning_fields_and_curator_tables/migration.sql diff --git a/app/features/curate/getCurationState.ts b/app/features/curate/getCurationState.ts index a6e2f5b5..ab4823bc 100644 --- a/app/features/curate/getCurationState.ts +++ b/app/features/curate/getCurationState.ts @@ -241,7 +241,7 @@ export async function getCurationState(courseId: string, page: number) { seenTexts .flat() .flatMap((t) => - t.uniqueCharacters.flatMap((c) => c.figureId || []), + t.uniqueCharacters.flatMap((c) => c.character || []), ), ), ], @@ -525,7 +525,7 @@ export async function getCurationState(courseId: string, page: number) { .map((q) => ({ normalizedText: { contains: q } })) : undefined, }, - figureId: { + character: { notIn: charactersNotNeededAnymore, in: course?.wantedCharacters.length ? course.wantedCharacters.split("") @@ -635,7 +635,7 @@ export async function getCurationState(courseId: string, page: number) { .map((q) => ({ normalizedText: { contains: q } })) : undefined, }, - figureId: { + character: { notIn: charactersNotNeededAnymore, in: course?.wantedCharacters.length ? course.wantedCharacters.split("") diff --git a/app/isComponentFirstClass.ptest.ts b/app/isComponentFirstClass.ptest.ts index 10bfe11e..836c755e 100644 --- a/app/isComponentFirstClass.ptest.ts +++ b/app/isComponentFirstClass.ptest.ts @@ -51,13 +51,13 @@ describe("isComponentFirstClass", () => { ); it("works with CDP-8CAB (left of 歸)", async () => { - const priorityFiguresIds = await prisma.kanjisenseFigure + const priorityFiguresKeys = await prisma.kanjisenseFigure .findMany({ where: { isPriority: true, }, }) - .then((fs) => fs.map((f) => f.id)); + .then((fs) => fs.map((f) => f.key!)); const parent = "歸"; const component = "CDP-8CAB"; const componentsToDirectUsesPrimaryVariants = new Map>([ @@ -65,7 +65,7 @@ describe("isComponentFirstClass", () => { ]); const figuresToVariantGroups = await getFiguresToVariantGroups(prisma); const result = isComponentFirstClass( - new Set(priorityFiguresIds), + new Set(priorityFiguresKeys), parent, component, componentsToDirectUsesPrimaryVariants, @@ -76,7 +76,7 @@ describe("isComponentFirstClass", () => { }); it("works with 𠚍", async () => { - const priorityFiguresIds = await prisma.kanjisenseFigure + const priorityFiguresKeys = await prisma.kanjisenseFigure .findMany({ where: { isPriority: true, @@ -91,7 +91,7 @@ describe("isComponentFirstClass", () => { ]); const figuresToVariantGroups = await getFiguresToVariantGroups(prisma); const result = isComponentFirstClass( - new Set(priorityFiguresIds), + new Set(priorityFiguresKeys), parent, component, componentsToDirectUsesPrimaryVariants, @@ -102,7 +102,7 @@ describe("isComponentFirstClass", () => { }); it("works with 旡", async () => { - const priorityFiguresIds = await prisma.kanjisenseFigure + const priorityFiguresKeys = await prisma.kanjisenseFigure .findMany({ where: { isPriority: true, @@ -117,7 +117,7 @@ describe("isComponentFirstClass", () => { ]); const figuresToVariantGroups = await getFiguresToVariantGroups(prisma); const result = isComponentFirstClass( - new Set(priorityFiguresIds), + new Set(priorityFiguresKeys), parent, component, componentsToDirectUsesPrimaryVariants, @@ -128,7 +128,7 @@ describe("isComponentFirstClass", () => { }); it("works with 卂", async () => { - const priorityFiguresIds = await prisma.kanjisenseFigure + const priorityFiguresKeys = await prisma.kanjisenseFigure .findMany({ where: { isPriority: true, @@ -143,7 +143,7 @@ describe("isComponentFirstClass", () => { ]); const figuresToVariantGroups = await getFiguresToVariantGroups(prisma); const result = isComponentFirstClass( - new Set(priorityFiguresIds), + new Set(priorityFiguresKeys), parent, component, componentsToDirectUsesPrimaryVariants, diff --git a/app/routes/_index.tsx b/app/routes/_index.tsx index 68f58b83..7d884554 100644 --- a/app/routes/_index.tsx +++ b/app/routes/_index.tsx @@ -67,6 +67,8 @@ const nichi: BadgeProps = { id: "日", image: { id: "日", + key: "日", + version: 0, type: "Kvg", content: { n: [ @@ -96,6 +98,8 @@ const getsu: BadgeProps = { id: "月", image: { id: "月", + key: "月", + version: 0, type: "Kvg", content: { n: [ @@ -125,6 +129,8 @@ const akarui: BadgeProps = { id: "明", image: { id: "明", + key: "明", + version: 1, type: "Kvg", content: { n: [ @@ -162,6 +168,8 @@ const mei: BadgeProps = { id: "盟", image: { id: "盟", + key: "盟", + version: 1, type: "Kvg", content: { n: [ diff --git a/app/routes/curate.$courseId.tsx b/app/routes/curate.$courseId.tsx index cd03adb4..1974aa74 100644 --- a/app/routes/curate.$courseId.tsx +++ b/app/routes/curate.$courseId.tsx @@ -1,3 +1,5 @@ +import { writeFileSync } from "fs"; + import type { BaseCorpusText, KanjisenseFigure } from "@prisma/client"; import { useActionData, useLoaderData, useSubmit } from "@remix-run/react"; import { @@ -94,6 +96,16 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { ? parseInt(queryStringParams.get("p")!) : 1; + writeFileSync( + __dirname + "/curatorCoursesArchive.json", + JSON.stringify( + (await prisma.course.findMany()).map((c) => ({ + id: c.id, + seenTexts: c.seenTexts, + })), + ), + ); + console.log(__dirname + "/curatorCoursesArchive.json"); const { course, seenTexts, @@ -794,13 +806,13 @@ function TextUniqueComponents({ const newNonAtomic: React.ReactNode[] = []; text.uniqueComponents.forEach((c) => { - const figure = getFigure(c.figureId); + const figure = getFigure(c.figureKey); const badgeProps = figure && getBadgeProps(figure); const newNode = !badgeProps ? null : ( -
+
c.figureId || []), + ...text.uniqueCharacters.flatMap((c) => c.character || []), ]); return [t, oldSeenChars]; }); @@ -1113,15 +1125,15 @@ const useSeenTextsState = ( const text = seenTextsFlat.find((t) => t.key === textKey); if (!text) return; text.uniqueCharacters.forEach((c) => { - if (!c.figureId) return; - if (seenSoFar.has(c.figureId)) return; - seenSoFar.add(c.figureId); - map.set(c.figureId, { textGroupIndex, textIndex, textKey }); + if (!c.character) return; + if (seenSoFar.has(c.character)) return; + seenSoFar.add(c.character); + map.set(c.character, { textGroupIndex, textIndex, textKey }); }); text.uniqueComponents.forEach((c) => { - if (seenSoFar.has(c.figureId)) return; - seenSoFar.add(c.figureId); - map.set(c.figureId, { textGroupIndex, textIndex, textKey }); + if (seenSoFar.has(c.figureKey)) return; + seenSoFar.add(c.figureKey); + map.set(c.figureKey, { textGroupIndex, textIndex, textKey }); }); }); }); diff --git a/prisma/kanjisense/seedBaseCorpus.ts b/prisma/kanjisense/seedBaseCorpus.ts index ecf4118d..7dfb3e35 100644 --- a/prisma/kanjisense/seedBaseCorpus.ts +++ b/prisma/kanjisense/seedBaseCorpus.ts @@ -194,11 +194,11 @@ export async function seedCorpus(prisma: PrismaClient, corpusTextPath: string) { data: batch.reduce( (all, [, { uniqueChars }], i) => { const uniqueComponents = uniquePriorityComponentsCache[i]; - for (const figureId of uniqueComponents) { + for (const figureKey of uniqueComponents) { all.push({ - figureId, + figureKey, baseCorpusTextId: hashCache[i], - frequencyScore: allFiguresFrequencyScores.get(figureId) ?? 0, + frequencyScore: allFiguresFrequencyScores.get(figureKey) ?? 0, baseCorpusTextLength: lengthCache[i], baseCorpusUniqueCharactersCount: uniqueChars.length, baseCorpusUniqueComponentsCount: uniqueComponents.length, @@ -207,7 +207,7 @@ export async function seedCorpus(prisma: PrismaClient, corpusTextPath: string) { return all; }, [] as { - figureId: string; + figureKey: string; baseCorpusTextId: number; frequencyScore: number; baseCorpusTextLength: number; diff --git a/prisma/kanjisense/seedKanjisenseActiveSoundMarks.ts b/prisma/kanjisense/seedKanjisenseActiveSoundMarks.ts index 471d3c27..236f825c 100644 --- a/prisma/kanjisense/seedKanjisenseActiveSoundMarks.ts +++ b/prisma/kanjisense/seedKanjisenseActiveSoundMarks.ts @@ -60,7 +60,12 @@ export async function registerActiveSoundMarks( function getPrimaryVariantId(id: string) { return allVariantsToVariantGroupHead[id] || id; } - for (const [id, tree] of componentsTrees.entries()) { + let visitedCount = 0; + for (const [id, tree] of componentsTrees) { + visitedCount++; + if (visitedCount % 500 === 0 || visitedCount === componentsTrees.size) { + console.log(`|| processed ${visitedCount} / ${componentsTrees.size}`); + } const derivation = (await prisma.kanjiDbCharacterDerivation.findUnique({ where: { diff --git a/prisma/kanjisense/seedKanjisenseFigures.ts b/prisma/kanjisense/seedKanjisenseFigures.ts index 1e8004c2..d5724521 100644 --- a/prisma/kanjisense/seedKanjisenseFigures.ts +++ b/prisma/kanjisense/seedKanjisenseFigures.ts @@ -192,8 +192,9 @@ export async function seedKanjisenseFigures( ); }); - console.log("cleaning slate before creating figures"); - await prisma.kanjisenseFigure.deleteMany({}); + await executeAndLogTime("cleaning slate before creating figures", () => + prisma.kanjisenseFigure.deleteMany({}), + ); await executeAndLogTime("seeding figures", async () => { await inBatchesOf({ @@ -426,44 +427,49 @@ async function connectComponentsTreesEntries( } >, ) { - for (const [id, componentsTree] of componentsTreesInput) { - const figureUsesAsComponent = getComponentUses(id); - try { - const combinedAozoraAppearances = - (allAozoraCharacterFrequencies[id]?.appearances ?? 0) + - (figureUsesAsComponent - ? setReduce( - figureUsesAsComponent, - (acc, parentId) => - acc + - (allAozoraCharacterFrequencies[parentId]?.appearances ?? 0), - 0, - ) - : 0); - - await prisma.kanjisenseFigure.update({ - where: { id }, - data: { - aozoraAppearances: combinedAozoraAppearances, - componentsTree: componentsTree.map((c) => c.toJSON()), - asComponent: figureUsesAsComponent?.size - ? { - create: { - allUses: { - connect: Array.from(figureUsesAsComponent, (parentId) => ({ - id: parentId, - })), + return await Promise.all( + Array.from(componentsTreesInput, async ([id, componentsTree]) => { + const figureUsesAsComponent = getComponentUses(id); + try { + const combinedAozoraAppearances = + (allAozoraCharacterFrequencies[id]?.appearances ?? 0) + + (figureUsesAsComponent + ? setReduce( + figureUsesAsComponent, + (acc, parentId) => + acc + + (allAozoraCharacterFrequencies[parentId]?.appearances ?? 0), + 0, + ) + : 0); + + await prisma.kanjisenseFigure.update({ + where: { id }, + data: { + aozoraAppearances: combinedAozoraAppearances, + componentsTree: componentsTree.map((c) => c.toJSON()), + asComponent: figureUsesAsComponent?.size + ? { + create: { + allUses: { + connect: Array.from( + figureUsesAsComponent, + (parentId) => ({ + id: parentId, + }), + ), + }, }, - }, - } - : undefined, - }, - }); - } catch (e) { - console.log({ id, componentsTree, figureUsesAsComponent }); - throw e; - } - } + } + : undefined, + }, + }); + } catch (e) { + console.log({ id, componentsTree, figureUsesAsComponent }); + throw e; + } + }), + ); } async function prepareFiguresForMeaningAssignments( @@ -681,10 +687,8 @@ async function getAllComponentsTrees( visitedFigures++; if (visitedFigures % 1000 === 0 || visitedFigures === figuresKeys.length) { console.log(`|| ${visitedFigures} / ${figuresKeys.length} processed`); - console.dir(componentsTree); } }); - console.log("WOP!"); return { componentsTreesInput, diff --git a/prisma/migrations/20240309115908_add_versioning_fields_and_curator_tables/migration.sql b/prisma/migrations/20240309115908_add_versioning_fields_and_curator_tables/migration.sql new file mode 100644 index 00000000..1d0596c7 --- /dev/null +++ b/prisma/migrations/20240309115908_add_versioning_fields_and_curator_tables/migration.sql @@ -0,0 +1,99 @@ +-- AlterTable +ALTER TABLE "GlyphImage" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- AlterTable +ALTER TABLE "KanjisenseComponent" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- AlterTable +ALTER TABLE "KanjisenseFigure" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- AlterTable +ALTER TABLE "KanjisenseFigureImage" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- AlterTable +ALTER TABLE "KanjisenseFigureMeaning" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- AlterTable +ALTER TABLE "KanjisenseFigureReading" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- AlterTable +ALTER TABLE "KanjisenseFigureRelation" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- AlterTable +ALTER TABLE "KanjisenseVariantGroup" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- AlterTable +ALTER TABLE "ShuowenImage" ADD COLUMN "key" TEXT, +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 0; + +-- CreateTable +CREATE TABLE "BaseCorpusText" ( + "id" INTEGER NOT NULL, + "course" TEXT NOT NULL, + "key" TEXT NOT NULL, + "title" TEXT, + "author" TEXT, + "source" TEXT NOT NULL, + "section" TEXT, + "dynasty" TEXT, + "urls" TEXT[], + "text" TEXT NOT NULL, + "normalizedText" TEXT NOT NULL, + "normalizedLength" INTEGER NOT NULL, + "nonPriorityCharactersCount" INTEGER NOT NULL DEFAULT 0, + + CONSTRAINT "BaseCorpusText_pkey" PRIMARY KEY ("id") +); + +-- CreateTable +CREATE TABLE "CharacterUsagesOnBaseCorpusText" ( + "character" TEXT NOT NULL, + "baseCorpusTextId" INTEGER NOT NULL, + "frequencyScore" INTEGER NOT NULL, + "baseCorpusTextLength" INTEGER NOT NULL, + "baseCorpusUniqueCharactersCount" INTEGER NOT NULL, + "baseCorpusUniqueComponentsCount" INTEGER NOT NULL, + "baseCorpusTextNonPriorityCharactersCount" INTEGER NOT NULL DEFAULT 0, + + CONSTRAINT "CharacterUsagesOnBaseCorpusText_pkey" PRIMARY KEY ("baseCorpusTextId","character") +); + +-- CreateTable +CREATE TABLE "ComponentUsagesOnBaseCorpusText" ( + "figureKey" TEXT NOT NULL, + "baseCorpusTextId" INTEGER NOT NULL, + "frequencyScore" INTEGER NOT NULL, + "baseCorpusTextLength" INTEGER NOT NULL, + "baseCorpusUniqueCharactersCount" INTEGER NOT NULL, + "baseCorpusUniqueComponentsCount" INTEGER NOT NULL, + + CONSTRAINT "ComponentUsagesOnBaseCorpusText_pkey" PRIMARY KEY ("baseCorpusTextId","figureKey") +); + +-- CreateTable +CREATE TABLE "Course" ( + "id" TEXT NOT NULL, + "authors" TEXT[] DEFAULT ARRAY[]::TEXT[], + "sources" TEXT[] DEFAULT ARRAY[]::TEXT[], + "seenTexts" JSONB, + "normalizedTextSearchQuery" TEXT NOT NULL DEFAULT '', + "wantedCharacters" TEXT NOT NULL DEFAULT '', + "minLength" INTEGER, + "maxLength" INTEGER, + + CONSTRAINT "Course_pkey" PRIMARY KEY ("id") +); + +-- AddForeignKey +ALTER TABLE "CharacterUsagesOnBaseCorpusText" ADD CONSTRAINT "CharacterUsagesOnBaseCorpusText_baseCorpusTextId_fkey" FOREIGN KEY ("baseCorpusTextId") REFERENCES "BaseCorpusText"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "ComponentUsagesOnBaseCorpusText" ADD CONSTRAINT "ComponentUsagesOnBaseCorpusText_baseCorpusTextId_fkey" FOREIGN KEY ("baseCorpusTextId") REFERENCES "BaseCorpusText"("id") ON DELETE CASCADE ON UPDATE CASCADE; diff --git a/prisma/schema.prisma b/prisma/schema.prisma index fc02151b..24c08a5d 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -102,6 +102,8 @@ model ScriptinAozoraFrequency { model KanjisenseVariantGroup { id String @id + key String? + version Int @default(0) variants String[] // built after variants are set figures KanjisenseFigure[] @@ -110,6 +112,8 @@ model KanjisenseVariantGroup { model KanjisenseFigureRelation { id String @id + key String? + version Int @default(0) variantGroupId String? directUses String[] listsAsComponent String[] @@ -127,6 +131,8 @@ model KanjiDbCharacterDerivation { model KanjisenseFigure { id String @id + key String? + version Int @default(0) keyword String isPriority Boolean isStandaloneCharacter Boolean @default(false) @@ -159,13 +165,12 @@ model KanjisenseFigure { allComponents KanjisenseComponent[] @relation("allComponents") searchProperties SearchPropertiesOnFigure[] - - baseCorpusUsesAsCharacter CharacterUsagesOnBaseCorpusText[] - baseCorpusUsesAsComponent ComponentUsagesOnBaseCorpusText[] } model KanjisenseComponent { id String @id + key String? + version Int @default(0) componentFigure KanjisenseFigure @relation("asComponent", fields: [id], references: [id], onDelete: Cascade, onUpdate: Cascade) soundMarkUses KanjisenseFigure[] @relation("activeSoundMark") allUses KanjisenseFigure[] @relation("allComponents") @@ -184,6 +189,8 @@ model KanjisenseComponentUse { model KanjisenseFigureReading { id String @id + key String? + version Int @default(0) sbgyXiaoyunsMatchingExemplars Json? inferredOnReadingCandidates Json kanjidicEntryId String? @unique @@ -207,15 +214,18 @@ model KanjisenseFigureReadingToSbgyXiaoyun { model KanjisenseFigureMeaning { id String @id + key String? + version Int @default(0) unihanDefinition String? kanjidicEnglish String[] figure KanjisenseFigure @relation(fields: [id], references: [id], onDelete: Cascade, onUpdate: Cascade) } model KanjisenseFigureImage { - id String @id - - type KanjisenseFigureImageType + id String @id + key String? + version Int @default(0) + type KanjisenseFigureImageType content Json figure KanjisenseFigure @relation(fields: [id], references: [id], onDelete: Cascade, onUpdate: Cascade) @@ -228,14 +238,18 @@ enum KanjisenseFigureImageType { model ShuowenImage { id String @id + key String? + version Int @default(0) paths String[] figures KanjisenseFigure[] } model GlyphImage { - id String @id - json Json - figure KanjisenseFigure @relation(fields: [id], references: [id], onDelete: Cascade, onUpdate: Cascade) + id String @id + key String? + version Int @default(0) + json Json + figure KanjisenseFigure @relation(fields: [id], references: [id], onDelete: Cascade, onUpdate: Cascade) } model User { @@ -355,10 +369,8 @@ model BaseCorpusText { model CharacterUsagesOnBaseCorpusText { character String - figureId String? - figure KanjisenseFigure? @relation(fields: [figureId], references: [id], onDelete: Cascade, onUpdate: Cascade) baseCorpusTextId Int - baseCorpusText BaseCorpusText @relation(fields: [baseCorpusTextId], references: [id], onDelete: Cascade, onUpdate: Cascade) + baseCorpusText BaseCorpusText @relation(fields: [baseCorpusTextId], references: [id], onDelete: Cascade, onUpdate: Cascade) frequencyScore Int baseCorpusTextLength Int @@ -370,17 +382,16 @@ model CharacterUsagesOnBaseCorpusText { } model ComponentUsagesOnBaseCorpusText { - figureId String - figure KanjisenseFigure @relation(fields: [figureId], references: [id], onDelete: Cascade, onUpdate: Cascade) + figureKey String baseCorpusTextId Int - baseCorpusText BaseCorpusText @relation(fields: [baseCorpusTextId], references: [id], onDelete: Cascade, onUpdate: Cascade) + baseCorpusText BaseCorpusText @relation(fields: [baseCorpusTextId], references: [id], onDelete: Cascade, onUpdate: Cascade) frequencyScore Int baseCorpusTextLength Int baseCorpusUniqueCharactersCount Int baseCorpusUniqueComponentsCount Int - @@id([baseCorpusTextId, figureId]) + @@id([baseCorpusTextId, figureKey]) } model Course { diff --git a/prisma/seedScript.ts b/prisma/seedScript.ts index b8b2cea5..c01a61a3 100644 --- a/prisma/seedScript.ts +++ b/prisma/seedScript.ts @@ -97,6 +97,19 @@ export async function seed(prisma: PrismaClient) { seedFigureSearchProperties(prisma, 100, false), ); + console.log("updating keys"); + for (const { id } of await prisma.kanjisenseFigure.findMany({ + select: { id: true }, + })) { + await prisma.kanjisenseFigure.update({ + where: { id }, + data: { + key: id, + }, + }); + } + console.log("keys updated"); + console.log( "disk usage after:", await prisma.$queryRaw`SELECT datname as db_name, pg_size_pretty(pg_database_size(datname)) as db_usage FROM pg_database`,