From 23923721eafca3abacbf3a541bcb9f11ae00a896 Mon Sep 17 00:00:00 2001 From: lopagela Date: Sat, 4 Nov 2023 18:35:50 +0100 Subject: [PATCH] Endpoint to delete documents ingested A file that is ingested will be transformed into several documents (that are organized into nodes). This endpoint deletes documents (bits of a file). These bits can be retrieved via the endpoint that lists all the documents. --- private_gpt/__main__.py | 9 ++++---- .../node_store/node_store_component.py | 6 +++++ private_gpt/server/ingest/ingest_router.py | 18 +++++++++++++++ private_gpt/server/ingest/ingest_service.py | 23 +++++++++++++++++-- 4 files changed, 50 insertions(+), 6 deletions(-) diff --git a/private_gpt/__main__.py b/private_gpt/__main__.py index 6bf2f156e..9e897f4be 100644 --- a/private_gpt/__main__.py +++ b/private_gpt/__main__.py @@ -5,7 +5,8 @@ from private_gpt.main import app from private_gpt.settings.settings import settings -# Set log_config=None to do not use the uvicorn logging configuration, and -# use ours instead. For reference, see below: -# https://github.com/tiangolo/fastapi/discussions/7457#discussioncomment-5141108 -uvicorn.run(app, host="0.0.0.0", port=settings.server.port, log_config=None) +if __name__ == "__main__": + # Set log_config=None to do not use the uvicorn logging configuration, and + # use ours instead. 
For reference, see below: + # https://github.com/tiangolo/fastapi/discussions/7457#discussioncomment-5141108 + uvicorn.run(app, host="0.0.0.0", port=settings.server.port, log_config=None) diff --git a/private_gpt/components/node_store/node_store_component.py b/private_gpt/components/node_store/node_store_component.py index c20f98c5e..c039bf502 100644 --- a/private_gpt/components/node_store/node_store_component.py +++ b/private_gpt/components/node_store/node_store_component.py @@ -1,3 +1,5 @@ +import logging + from injector import inject, singleton from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore from llama_index.storage.index_store import SimpleIndexStore @@ -5,6 +7,8 @@ from private_gpt.paths import local_data_path +logger = logging.getLogger(__name__) + @singleton class NodeStoreComponent: @@ -18,6 +22,7 @@ def __init__(self) -> None: persist_dir=str(local_data_path) ) except FileNotFoundError: + logger.debug("Local index store not found, creating a new one") self.index_store = SimpleIndexStore() try: @@ -25,4 +30,5 @@ def __init__(self) -> None: persist_dir=str(local_data_path) ) except FileNotFoundError: + logger.debug("Local document store not found, creating a new one") self.doc_store = SimpleDocumentStore() diff --git a/private_gpt/server/ingest/ingest_router.py b/private_gpt/server/ingest/ingest_router.py index dd49b5a8a..4841fa82a 100644 --- a/private_gpt/server/ingest/ingest_router.py +++ b/private_gpt/server/ingest/ingest_router.py @@ -47,3 +47,21 @@ def list_ingested() -> IngestResponse: service = root_injector.get(IngestService) ingested_documents = service.list_ingested() return IngestResponse(object="list", model="private-gpt", data=ingested_documents) + + +@ingest_router.delete("/ingest/{doc_id}", tags=["Ingestion"]) +def delete_ingested(doc_id: str) -> None: + """Delete the specified ingested Document. 
+ + The `doc_id` can be obtained from the `GET /ingest/list` endpoint + The document will be effectively deleted from the document store (i.e. + from the directory specified in your configuration) + """ + service = root_injector.get(IngestService) + try: + service.delete(doc_id) + except ValueError as err: + raise HTTPException( + 404, f"Document={doc_id} not found in the datastore" + ) from err + return diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py index 6a34e6fbb..b805cd8e6 100644 --- a/private_gpt/server/ingest/ingest_service.py +++ b/private_gpt/server/ingest/ingest_service.py @@ -1,3 +1,4 @@ +import logging import tempfile from pathlib import Path from typing import TYPE_CHECKING, Any, AnyStr @@ -25,6 +26,8 @@ if TYPE_CHECKING: from llama_index.readers.base import BaseReader +logger = logging.getLogger(__name__) + class IngestedDoc(BaseModel): object: str = Field(enum=["ingest.document"]) @@ -70,6 +73,7 @@ def __init__( ) def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]: + logger.info("Ingesting file_name=%s", file_name) extension = Path(file_name).suffix reader_cls = DEFAULT_FILE_READER_CLS.get(extension) documents: list[Document] @@ -100,7 +104,9 @@ def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]: else: path_to_tmp.write_text(str(file_data)) documents = reader.load_data(path_to_tmp) - + logger.info( + "Transformed file=%s into count=%s documents", file_data, len(documents) + ) for document in documents: document.metadata["file_name"] = file_name return self._save_docs(documents) @@ -153,7 +159,20 @@ def list_ingested(self) -> list[IngestedDoc]: doc_metadata=doc_metadata, ) ) - return ingested_docs except ValueError: + logger.warning("Got an exception when getting list of docs", exc_info=True) pass + logger.debug("Found count=%s ingested documents", len(ingested_docs)) return ingested_docs + + def delete(self, doc_id: str) -> None: + 
"""Delete an ingested document. + + :raises ValueError: if the document does not exist + """ + logger.info("Deleting the ingested document=%s in the doc store", doc_id) + self.storage_context.docstore.delete_ref_doc(doc_id) + # FIXME the documents are only deleted, and not in the vector store + # or index store + # self.storage_context.vector_store.delete(doc_id) + # self.storage_context.index_store.delete_index_struct(doc_id)