diff --git a/packages/llm.gblib/services/ChatServices.ts b/packages/llm.gblib/services/ChatServices.ts index a6a38612..b642e242 100644 --- a/packages/llm.gblib/services/ChatServices.ts +++ b/packages/llm.gblib/services/ChatServices.ts @@ -180,16 +180,24 @@ export class ChatServices { if (sanitizedQuestion === '' || !vectorStore) { return ''; } - - let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments); + let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments * 10); const uniqueDocuments = {}; + const MAX_DOCUMENTS = numDocuments; for (const document of documents) { + if (!GBUtil.isContentPage(document.pageContent)) { + continue; + } + if (!uniqueDocuments[document.metadata.source]) { uniqueDocuments[document.metadata.source] = document; } - } + // Stop once we have max unique documents + if (Object.keys(uniqueDocuments).length >= MAX_DOCUMENTS) { + break; + } + } let output = ''; for (const filePaths of Object.keys(uniqueDocuments)) { @@ -197,10 +205,6 @@ export class ChatServices { const metadata = doc.metadata; const filename = path.basename(metadata.source); - if (!GBUtil.isContentPage(doc.pageContent)){ - continue; - } - let page = 0; if (metadata.source.endsWith('.pdf')) { page = await ChatServices.findPageForText(metadata.source, doc.pageContent); diff --git a/src/util.ts b/src/util.ts index ebd23a13..ee040262 100644 --- a/src/util.ts +++ b/src/util.ts @@ -365,15 +365,7 @@ export class GBUtil { // Common patterns that indicate non-content pages const nonContentPatterns = [ /^index$/i, - /^contents$/i, /^table of contents$/i, - /^appendix/i, - /^glossary$/i, - /^bibliography$/i, - /^references$/i, - /^acknowledgments?$/i, - /^copyright/i, - /^about the author/i ]; // Check if page is mostly dots, numbers or blank