Skip to content

Commit

Permalink
Endpoint to delete ingested documents
Browse files Browse the repository at this point in the history
A file that is ingested is transformed into several documents (which
are organized into nodes).
This endpoint deletes documents (pieces of a file). These pieces can be
retrieved via the endpoint that lists all the ingested documents.
  • Loading branch information
lopagela committed Nov 4, 2023
1 parent f29df84 commit 2392372
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 6 deletions.
9 changes: 5 additions & 4 deletions private_gpt/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from private_gpt.main import app
from private_gpt.settings.settings import settings

# Set log_config=None so that uvicorn's own logging configuration is not
# used, and ours is used instead. For reference, see below:
# https://github.com/tiangolo/fastapi/discussions/7457#discussioncomment-5141108
uvicorn.run(app, host="0.0.0.0", port=settings.server.port, log_config=None)
if __name__ == "__main__":
    # Passing log_config=None disables uvicorn's default logging setup so
    # that our own logging configuration stays in effect. Reference:
    # https://github.com/tiangolo/fastapi/discussions/7457#discussioncomment-5141108
    bind_host = "0.0.0.0"
    bind_port = settings.server.port
    uvicorn.run(app, host=bind_host, port=bind_port, log_config=None)
6 changes: 6 additions & 0 deletions private_gpt/components/node_store/node_store_component.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import logging

from injector import inject, singleton
from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.storage.index_store.types import BaseIndexStore

from private_gpt.paths import local_data_path

logger = logging.getLogger(__name__)


@singleton
class NodeStoreComponent:
Expand All @@ -18,11 +22,13 @@ def __init__(self) -> None:
persist_dir=str(local_data_path)
)
except FileNotFoundError:
logger.debug("Local index store not found, creating a new one")
self.index_store = SimpleIndexStore()

try:
self.doc_store = SimpleDocumentStore.from_persist_dir(
persist_dir=str(local_data_path)
)
except FileNotFoundError:
logger.debug("Local document store not found, creating a new one")
self.doc_store = SimpleDocumentStore()
18 changes: 18 additions & 0 deletions private_gpt/server/ingest/ingest_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,21 @@ def list_ingested() -> IngestResponse:
service = root_injector.get(IngestService)
ingested_documents = service.list_ingested()
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)


@ingest_router.delete("/ingest/{doc_id}", tags=["Ingestion"])
def delete_ingested(doc_id: str) -> None:
    """Delete the specified ingested Document.

    The `doc_id` can be obtained from the `GET /ingest/list` endpoint.
    The document will be effectively deleted from the document store (i.e.
    from the directory specified in your configuration).

    :param doc_id: id of the ingested document to delete.
    :raises HTTPException: 404 if no document with `doc_id` exists.
    """
    service = root_injector.get(IngestService)
    try:
        service.delete(doc_id)
    except ValueError as err:
        # The service signals an unknown doc_id with ValueError; translate it
        # into an HTTP 404 so API clients get a meaningful status code.
        raise HTTPException(
            404, f"Document={doc_id} not found in the datastore"
        ) from err
23 changes: 21 additions & 2 deletions private_gpt/server/ingest/ingest_service.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Any, AnyStr
Expand Down Expand Up @@ -25,6 +26,8 @@
if TYPE_CHECKING:
from llama_index.readers.base import BaseReader

logger = logging.getLogger(__name__)


class IngestedDoc(BaseModel):
object: str = Field(enum=["ingest.document"])
Expand Down Expand Up @@ -70,6 +73,7 @@ def __init__(
)

def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]:
logger.info("Ingesting file_name=%s", file_name)
extension = Path(file_name).suffix
reader_cls = DEFAULT_FILE_READER_CLS.get(extension)
documents: list[Document]
Expand Down Expand Up @@ -100,7 +104,9 @@ def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]:
else:
path_to_tmp.write_text(str(file_data))
documents = reader.load_data(path_to_tmp)

logger.info(
"Transformed file=%s into count=%s documents", file_data, len(documents)
)
for document in documents:
document.metadata["file_name"] = file_name
return self._save_docs(documents)
Expand Down Expand Up @@ -153,7 +159,20 @@ def list_ingested(self) -> list[IngestedDoc]:
doc_metadata=doc_metadata,
)
)
return ingested_docs
except ValueError:
logger.warning("Got an exception when getting list of docs", exc_info=True)
pass
logger.debug("Found count=%s ingested documents", len(ingested_docs))
return ingested_docs

def delete(self, doc_id: str) -> None:
    """Remove an ingested document from the document store.

    :param doc_id: id of the document to remove.
    :raises ValueError: if the document does not exist
    """
    logger.info("Deleting the ingested document=%s in the doc store", doc_id)
    docstore = self.storage_context.docstore
    docstore.delete_ref_doc(doc_id)
    # FIXME: only the doc store entry is removed here; the vector store and
    # the index store still hold data for this document, e.g.:
    # self.storage_context.vector_store.delete(doc_id)
    # self.storage_context.index_store.delete_index_struct(doc_id)

0 comments on commit 2392372

Please sign in to comment.