From f711cefca3ca87f81eee825dfce4e8c4fb3bb2f8 Mon Sep 17 00:00:00 2001 From: Arvin Xu Date: Tue, 19 Nov 2024 21:52:44 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20perf:=20fix=20slow=20delet?= =?UTF-8?q?e=20file=20sql=20(#4738)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/database/server/models/file.ts | 37 ++++++++++++++++++++++++++++++ src/server/routers/lambda/file.ts | 5 ---- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/database/server/models/file.ts b/src/database/server/models/file.ts index f26b7e23071c..189065490174 100644 --- a/src/database/server/models/file.ts +++ b/src/database/server/models/file.ts @@ -1,5 +1,6 @@ import { asc, count, eq, ilike, inArray, notExists, or, sum } from 'drizzle-orm'; import { and, desc, like } from 'drizzle-orm/expressions'; +import type { PgTransaction } from 'drizzle-orm/pg-core'; import { serverDBEnv } from '@/config/db'; import { serverDB } from '@/database/server/core/db'; @@ -9,6 +10,9 @@ import { FileItem, NewFile, NewGlobalFile, + chunks, + embeddings, + fileChunks, files, globalFiles, knowledgeBaseFiles, @@ -68,6 +72,10 @@ export class FileModel { const fileHash = file.fileHash!; return await serverDB.transaction(async (trx) => { + // 1. 删除相关的 chunks + await this.deleteFileChunks(trx as any, [id]); + + // 2. 删除文件记录 await trx.delete(files).where(and(eq(files.id, id), eq(files.userId, this.userId))); const result = await trx @@ -107,6 +115,9 @@ export class FileModel { const hashList = fileList.map((file) => file.fileHash!); return await serverDB.transaction(async (trx) => { + // 1. 删除相关的 chunks + await this.deleteFileChunks(trx as any, ids); + // delete the files await trx.delete(files).where(and(inArray(files.id, ids), eq(files.userId, this.userId))); @@ -289,4 +300,30 @@ export class FileModel { ), }); } + + // 抽象出通用的删除 chunks 方法 + private async deleteFileChunks(trx: PgTransaction, fileIds: string[]) { + const BATCH_SIZE = 1000; // 每批处理的数量 + + // 1. 获取所有关联的 chunk IDs + const relatedChunks = await trx + .select({ chunkId: fileChunks.chunkId }) + .from(fileChunks) + .where(inArray(fileChunks.fileId, fileIds)); + + const chunkIds = relatedChunks.map((c) => c.chunkId).filter(Boolean) as string[]; + + if (chunkIds.length === 0) return; + + // 2. 分批处理删除 + for (let i = 0; i < chunkIds.length; i += BATCH_SIZE) { + const batchChunkIds = chunkIds.slice(i, i + BATCH_SIZE); + + await trx.delete(embeddings).where(inArray(embeddings.chunkId, batchChunkIds)); + + await trx.delete(chunks).where(inArray(chunks.id, batchChunkIds)); + } + + return chunkIds; + } } diff --git a/src/server/routers/lambda/file.ts b/src/server/routers/lambda/file.ts index 10b84ef09a57..8b73cc715ca7 100644 --- a/src/server/routers/lambda/file.ts +++ b/src/server/routers/lambda/file.ts @@ -154,8 +154,6 @@ export const fileRouter = router({ removeFile: fileProcedure.input(z.object({ id: z.string() })).mutation(async ({ input, ctx }) => { const file = await ctx.fileModel.delete(input.id); - // delete the orphan chunks - await ctx.chunkModel.deleteOrphanChunks(); if (!file) return; // delele the file from remove from S3 if it is not used by other files @@ -187,9 +185,6 @@ export const fileRouter = router({ .mutation(async ({ input, ctx }) => { const needToRemoveFileList = await ctx.fileModel.deleteMany(input.ids); - // delete the orphan chunks - await ctx.chunkModel.deleteOrphanChunks(); - if (!needToRemoveFileList || needToRemoveFileList.length === 0) return; // remove from S3