Skip to content

Commit

Permalink
More batching
Browse files Browse the repository at this point in the history
  • Loading branch information
justinsilvestre committed Dec 10, 2023
1 parent 2caf100 commit 260b000
Show file tree
Hide file tree
Showing 6 changed files with 285 additions and 246 deletions.
57 changes: 31 additions & 26 deletions prisma/external/seedKanjiDbComposition.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { PrismaClient } from "@prisma/client";

import { inBatchesOf } from "prisma/kanjisense/inBatchesOf";
import { files, readJsonSync } from "~/lib/files.server";
import { forEachLine } from "~/lib/forEachLine.server";

Expand All @@ -19,15 +20,15 @@ export async function seedKanjiDbComposition(
const dbInput = await getDbInput();

await prisma.kanjiDbComposition.deleteMany({});
await prisma.kanjiDbComposition.createMany({
data: Object.values(dbInput).map(
({ id, ids, etymology, sbgySyllables }) => ({
id,
ids: ids || null,
etymology: etymology || null,
sbgySyllables: sbgySyllables || [],
}),
),
await inBatchesOf({
count: 500,
collection: dbInput,
getBatchItem: (entry) => entry[1],
action: async (batch) => {
await prisma.kanjiDbComposition.createMany({
data: batch,
});
},
});

await registerSeeded(prisma, "KanjiDbComposition");
Expand All @@ -36,34 +37,39 @@ export async function seedKanjiDbComposition(
console.log(`KanjiDbComposition seeded. 🌱`);
}

/**
 * Mutable record describing a single figure while the seed input is being
 * assembled. Instances are stored in the `Map` built by `getDbInput`, keyed
 * by `id`, and are later handed to `prisma.kanjiDbComposition.createMany`.
 *
 * Only `id` and `ids` are required positionally; `etymology` falls back to
 * `null` and `sbgySyllables` to a fresh empty array when omitted.
 */
class KanjiDbComposition {
  /** Figure identifier; the same value is used as the map key. */
  public id: string;
  /** IDS string read for the figure, or `null` when none was read. */
  public ids: string | null;
  /** Etymology text for the figure, or `null` until one is assigned. */
  public etymology: string | null;
  /** Syllable numbers collected for the figure; appended to in place. */
  public sbgySyllables: number[];

  constructor(
    id: string,
    ids: string | null,
    etymology: string | null = null,
    sbgySyllables: number[] = [],
  ) {
    // Assign in declaration order so own-property order matches the fields.
    this.id = id;
    this.ids = ids;
    this.etymology = etymology;
    this.sbgySyllables = sbgySyllables;
  }
}

async function getDbInput() {
const dbInput: Record<
string,
{ id: string; ids?: string; etymology?: string; sbgySyllables?: number[] }
> = {};
const dbInput = new Map<string, KanjiDbComposition>();

await forEachLine(files.kanjiDbIdsCdpTxt, async (line) => {
if (!line || /^#|^;;/.test(line)) return;

const [, figureId, ids] = line.match(/\S+\t&?([^&;\s]+);?\t(.+)/u)!;
if (!figureId || !ids) throw new Error(line);
dbInput[figureId] = {
id: figureId,
ids,
};

dbInput.set(figureId, new KanjiDbComposition(figureId, ids, null, []));
});

await forEachLine(files.kanjiDbAnalysisTxt, async (line) => {
if (!line || /^#|^;;/.test(line)) return;

const [, figureId, etymology] = line.match(/\S+\t&?([^&;\s]+);?\t(.+)/u)!;
if (!figureId || !etymology) throw new Error(line);
if (!dbInput[figureId]) console.warn(`no id for ${figureId} in ${line}`);
if (dbInput[figureId]?.etymology) {
const entry = dbInput.get(figureId);
if (!entry) console.warn(`no id for ${figureId} in ${line}`);
if (entry?.etymology) {
console.warn(
`duplicate etymology for ${figureId} prioritizing first: ${dbInput[figureId].etymology}`,
`duplicate etymology for ${figureId} prioritizing first: ${entry.etymology}`,
);
} else if (dbInput[figureId]) dbInput[figureId].etymology = etymology;
} else if (entry) entry.etymology = etymology;
});

const sbgyJson = readJsonSync<
Expand All @@ -80,12 +86,11 @@ async function getDbInput() {
for (const character of characters.split(",")) {
if (!character)
console.warn(`no character for ${syllableNumber} ${fanqie}`);
dbInput[character] ||= {
id: character,
};
const entry =
dbInput.get(character) || new KanjiDbComposition(character, null);
dbInput.set(character, entry);

dbInput[character].sbgySyllables ||= [];
dbInput[character].sbgySyllables!.push(syllableNumber);
entry.sbgySyllables.push(syllableNumber);
}
}
return dbInput;
Expand Down
27 changes: 16 additions & 11 deletions prisma/external/seedUnihan14.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,17 +72,22 @@ export async function seedUnihan14(prisma: PrismaClient, force = false) {
registerVariant(dbInput, "倶", "kZVariant", "俱");
registerVariant(dbInput, "俱", "kZVariant", "倶");

await inBatchesOf(10000, dbInput, async (batch) => {
await prisma.unihan14.createMany({
data: Array.from(batch, ([id, fields]) => ({
id,
kSemanticVariant: fields.kSemanticVariant || [],
kSimplifiedVariant: fields.kSimplifiedVariant || [],
kSpecializedSemanticVariant: fields.kSpecializedSemanticVariant || [],
kTraditionalVariant: fields.kTraditionalVariant || [],
kZVariant: fields.kZVariant || [],
})),
});
await inBatchesOf({
count: 10000,
collection: dbInput,
getBatchItem: ([id, fields]) => ({
id,
kSemanticVariant: fields.kSemanticVariant || [],
kSimplifiedVariant: fields.kSimplifiedVariant || [],
kSpecializedSemanticVariant: fields.kSpecializedSemanticVariant || [],
kTraditionalVariant: fields.kTraditionalVariant || [],
kZVariant: fields.kZVariant || [],
}),
action: async (batch) => {
await prisma.unihan14.createMany({
data: batch,
});
},
});

await registerSeeded(prisma, "Unihan14");
Expand Down
20 changes: 12 additions & 8 deletions prisma/external/seedUnihan15.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ export async function seedUnihan15(prisma: PrismaClient, force = false) {
});
});

await inBatchesOf(500, dbInput, async (batch) => {
const data = Array.from(batch, ([id, fields]) => ({
await inBatchesOf({
count: 500,
collection: dbInput,
getBatchItem: ([id, fields]) => ({
id,
kDefinition: fields.kDefinition || null,
kCantonese: fields.kCantonese?.split(" ") || [],
Expand All @@ -81,12 +83,14 @@ export async function seedUnihan15(prisma: PrismaClient, force = false) {
kXHC1983: fields.kXHC1983?.split(" ") || [],
kRSAdobe_Japan1_6: fields.kRSAdobe_Japan1_6?.split(" ") || [],
kRSUnicode: fields.kRSUnicode?.split(" ") || [],
}));

const x = await prisma.unihan15.createMany({
data,
});
console.log(`${x.count} created.`);
}),
action: async (batch) => {
const created = await prisma.unihan15.createMany({
data: batch,
});

console.log(` ${created.count} created.`);
},
});

await executeAndLogTime("connecting readings", async () => {
Expand Down
28 changes: 21 additions & 7 deletions prisma/kanjisense/inBatchesOf.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,32 @@
export async function inBatchesOf<T, U>(
count: number,
collection: (Iterable<T> & { size: number }) | T[],
action: (batch: T[]) => Promise<U>,
) {
/**
 * A collection that `inBatchesOf` can walk: either a plain array, or any
 * iterable that also reports its element count through a numeric `size`
 * property (for example a `Map` or `Set`).
 */
export type BatchCollection<T> = T[] | (Iterable<T> & { size: number });

/**
 * Options object accepted by `inBatchesOf`.
 *
 * Type parameters:
 * - `T`: element type yielded by `collection`.
 * - `V`: value the promise returned by `action` resolves to.
 * - `U`: element type of the batches passed to `action`, i.e. the result of
 *   `getBatchItem`.
 */
interface BatchOptions<T, V, U> {
// Maximum number of items gathered into one batch.
count: number;
// Source items; an array, or an iterable exposing a numeric `size`.
collection: BatchCollection<T>;
// Optional mapper applied to each source item before it is added to a batch.
// When omitted, `inBatchesOf` falls back to passing the item through as-is.
getBatchItem?: (item: T) => U;
// Async callback that receives a batch of mapped items.
action: (batch: U[]) => Promise<V>;
}

export async function inBatchesOf<T, V, U = T>({
count,
collection,
getBatchItem = (x: T): U => x as unknown as U,
action,
}: BatchOptions<T, V, U>) {
const collectionSize =
"length" in collection ? collection.length : collection.size;
const totalStartTime = Date.now() / 1000;
const totalBatches = Math.ceil(collectionSize / count);

let itemIndex = 0;
let batchIndex = 0;
let batch: T[] = [];
let batch: U[] = [];
for (const item of collection) {
batch.push(item);
batch.push(getBatchItem(item));
itemIndex++;

if (batch.length === count || itemIndex === collectionSize) {
Expand Down
Loading

0 comments on commit 260b000

Please sign in to comment.