Skip to content

Commit

Permalink
Refactor document processing methods to improve readability and maintainability.
Browse files Browse the repository at this point in the history
  • Loading branch information
anpigon committed Nov 17, 2024
1 parent c67fd9e commit 61a127e
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 122 deletions.
2 changes: 1 addition & 1 deletion manifest.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"id": "smart-seeker",
"name": "Smart Seeker",
"version": "0.0.12",
"version": "0.0.13",
"minAppVersion": "0.15.0",
"description": "Demonstrates some of the capabilities of the Obsidian API.",
"author": "Obsidian",
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "smart-seeker",
"version": "0.0.12",
"version": "0.0.13",
"description": "An Obsidian plugin that enables fast and intelligent note search using RAG (Retrieval Augmented Generation) with Pinecone vector database and OpenAI",
"main": "main.js",
"scripts": {
Expand Down
126 changes: 38 additions & 88 deletions src/helpers/document/DocumentProcessor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,6 @@ import { getFileNameSafe } from "../utils/fileUtils";
import getEmbeddingModel from "../utils/getEmbeddingModel";
import { createContentHash, createHash } from "../utils/hash";

// Summary of one batch indexing run over a set of documents.
interface ProcessingResult {
totalDocuments: number; // total number of input documents
processedDocuments: number; // number of documents actually processed
skippedDocuments: number; // number of documents skipped (already indexed)
processedChunks: number; // number of chunks written to the vector store
}

interface DocumentChunk {
ids: string[];
chunks: Document[];
Expand Down Expand Up @@ -70,81 +63,11 @@ export default class DocumentProcessor {
return pinecone.Index(settings.selectedIndex);
}

// Filter out documents that already exist in the Pinecone DB.
// On lookup failure the error is logged and an empty list is returned,
// so callers treat the batch as "nothing new to index".
async filterDocuments(documents: Document[]): Promise<Document[]> {
	if (!documents?.length) return [];

	try {
		const ids = this.generateDocumentIds(documents);
		const knownHashes = await this.fetchExistingHashes(ids);
		return documents.filter(
			(document) => !knownHashes.has(document.metadata.hash),
		);
	} catch (error) {
		this.logger.error("Error filtering documents:", error);
	}

	return [];
}

private async fetchExistingHashes(
documentIds: string[],
): Promise<Set<string>> {
const { records } = await this.pineconeIndex.fetch(documentIds);
return new Set(
Object.values(records).map(
(record) => (record.metadata as { hash: string }).hash,
),
);
}

// Build the Pinecone record id for each document; "-0" is the suffix of
// a document's first chunk, which is enough to test for existence.
private generateDocumentIds(documents: Document[]): string[] {
	const ids: string[] = [];
	for (const document of documents) {
		ids.push(`${document.metadata.id}-0`);
	}
	return ids;
}

// Chunk a single document and persist the chunks to the vector store.
async processSingleDocument(document: Document) {
	const chunked = await this.createChunks([document]);
	this.logger.debug("chunks", chunked.chunks);
	return await this.saveToVectorStore(chunked.chunks, chunked.ids);
}

// Turn one vault file into a document, chunk it, and store the chunks.
async processSingleFile(file: TFile) {
	const doc = await this.createDocument(file);
	const chunked = await this.createChunks([doc]);
	return await this.saveToVectorStore(chunked.chunks, chunked.ids);
}

/**
 * Filters out documents already present in Pinecone, chunks the
 * remainder, and saves the chunks to the vector store.
 *
 * @param documents - Candidate documents to index.
 * @returns Counts of total, processed, and skipped documents plus chunks.
 * @throws Re-throws any error raised while chunking or saving.
 */
async processDocuments(documents: Document[]): Promise<ProcessingResult> {
	try {
		const totalDocuments = documents.length;
		const filteredDocs = await this.filterDocuments(documents);
		this.logger.debug("Filtered documents count:", filteredDocs.length);

		// Nothing new to index — report the whole batch as skipped.
		if (!filteredDocs.length) {
			return {
				totalDocuments,
				processedDocuments: 0,
				skippedDocuments: totalDocuments,
				processedChunks: 0,
			};
		}

		const { ids, chunks } = await this.createChunks(filteredDocs);
		// Use the plugin logger (not console.log) for consistency with the
		// debug call above.
		this.logger.debug("chunks", chunks);
		await this.saveToVectorStore(chunks, ids);

		return {
			totalDocuments,
			processedDocuments: filteredDocs.length,
			skippedDocuments: totalDocuments - filteredDocs.length,
			processedChunks: chunks.length,
		};
	} catch (error) {
		this.logger.error("Error processing documents:", error);
		throw error;
	}
}

async createDocumentsFromFiles(
private async createDocumentsFromFiles(
files: TFile[],
): Promise<Document<NoteMetadata>[]> {
const documents: Document<NoteMetadata>[] = [];
Expand All @@ -155,13 +78,7 @@ export default class DocumentProcessor {
return documents;
}

async processMultiFiles(files: TFile[]) {
const documents = await this.createDocumentsFromFiles(files);
const { ids, chunks } = await this.createChunks(documents);
await this.saveToVectorStore(chunks, ids);
}

async createChunks(documents: Document[]): Promise<DocumentChunk> {
private async createChunks(documents: Document[]): Promise<DocumentChunk> {
const result: DocumentChunk = { ids: [], chunks: [] };

for (const document of documents) {
Expand All @@ -181,7 +98,7 @@ export default class DocumentProcessor {
return result;
}

async saveToVectorStore(
private async saveToVectorStore(
chunks: Document[],
ids: string[],
): Promise<string[]> {
Expand All @@ -193,7 +110,7 @@ export default class DocumentProcessor {
return await vectorStore.addDocuments(chunks, { ids });
}

private async filterDocumentsByQuery(documents: Document[]) {
async filterDocumentsByQuery(documents: Document[]) {
const filterPromises = documents.map(async (doc) => {
try {
const queryResult = await this.pineconeIndex.query({
Expand Down Expand Up @@ -223,7 +140,7 @@ export default class DocumentProcessor {
return results.filter((doc): doc is Document => doc !== null);
}

public async createDocument(file: TFile) {
private async createDocument(file: TFile) {
const content = await this.plugin.app.vault.cachedRead(file);
const hash = await createContentHash(content);
const id = await createHash(file.path);
Expand Down Expand Up @@ -254,4 +171,37 @@ export default class DocumentProcessor {
console.log("--→ document", document);
return document;
}

// 기존 파인콘DB에 있는 문서는 필터링한다.
public async filterDocuments(documents: Document[]): Promise<Document[]> {
if (!documents?.length) return [];

try {
const documentIds = this.generateDocumentIds(documents);
const { records } = await this.pineconeIndex.fetch(documentIds);
const existingHashes = new Set(
Object.values(records).map(
(record) => (record.metadata as { hash: string }).hash,
),
);

return documents.filter((doc) => !existingHashes.has(doc.metadata.hash));
} catch (error) {
this.logger.error("Error filtering documents:", error);
}

return [];
}

// Index one vault file: build its document, chunk it, persist the chunks.
public async processSingleFile(file: TFile) {
	const doc = await this.createDocument(file);
	const chunked = await this.createChunks([doc]);
	return await this.saveToVectorStore(chunked.chunks, chunked.ids);
}

// Index several vault files at once; resolves to the stored chunk ids.
public async processMultiFiles(files: TFile[]): Promise<string[]> {
	const docs = await this.createDocumentsFromFiles(files);
	const { chunks, ids } = await this.createChunks(docs);
	return await this.saveToVectorStore(chunks, ids);
}
}
39 changes: 8 additions & 31 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -332,10 +332,7 @@ export default class SmartSeekerPlugin extends Plugin {
return;
}

if (Object.keys(this.taskQueue).length === 0) {
this.logger.debug("📭 처리할 taskQueue가 없습니다.");
return;
}
if (Object.keys(this.taskQueue).length === 0) return;

this.isProcessing = true;

Expand All @@ -345,41 +342,21 @@ export default class SmartSeekerPlugin extends Plugin {
}

const files = Object.values(this.taskQueue);
const documents =
await this.documentProcessor.createDocumentsFromFiles(files);
const filteredDocs =
await this.documentProcessor.filterDocuments(documents);
const totalCount = documents.length;
const filterdCount = filteredDocs.length;
if (filteredDocs.length === 0) {
new Notice(
this.createResultMessage(totalCount, filterdCount, totalCount),
5000,
);
}
await this.documentProcessor.processMultiFiles(files);
const totalCount = files.length;

const { ids, chunks } =
await this.documentProcessor.createChunks(filteredDocs);
await this.documentProcessor.saveToVectorStore(chunks, ids);
this.logger.debug(`${totalCount} notes successfully saved to PineconeDB`);

this.logger.debug(
`${filterdCount} notes successfully saved to PineconeDB`,
);

new Notice(
this.createResultMessage(totalCount, filterdCount, totalCount),
5000,
);
new Notice(`📊 총 ${totalCount}개 노트 처리`, 5000);

// 처리된 노트 제거
for (const file of files) {
delete this.taskQueue[file.path];
}
} catch (error) {
const errorMessage =
error instanceof Error ? error.message : "Unknown error";
this.logger.error(`Failed to process notes: ${errorMessage}`);
new Notice(`Failed to save notes: ${errorMessage}`);
this.logger.error(
`Failed to process notes: ${error?.message || error.toString()}`,
);
} finally {
this.isProcessing = false;
}
Expand Down
3 changes: 2 additions & 1 deletion versions.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@
"0.0.9": "0.15.0",
"0.0.10": "0.15.0",
"0.0.11": "0.15.0",
"0.0.12": "0.15.0"
"0.0.12": "0.15.0",
"0.0.13": "0.15.0"
}

0 comments on commit 61a127e

Please sign in to comment.