From 07b6bb893f47385219f1583eba0367c2f518fd85 Mon Sep 17 00:00:00 2001 From: Hadrien Bertrand Date: Wed, 7 Feb 2024 16:53:11 -0500 Subject: [PATCH] pinecone update (#159) * pinecone update * new black --- buster/documents_manager/service.py | 13 +++++-------- buster/parsers/parser.py | 3 +-- buster/retriever/service.py | 5 ++--- requirements.txt | 2 +- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/buster/documents_manager/service.py b/buster/documents_manager/service.py index 6d36230..9683a19 100644 --- a/buster/documents_manager/service.py +++ b/buster/documents_manager/service.py @@ -17,7 +17,6 @@ class DocumentsService(DocumentsManager): def __init__( self, pinecone_api_key: str, - pinecone_env: str, pinecone_index: str, pinecone_namespace: str, mongo_uri: str, @@ -37,9 +36,9 @@ def __init__( """ super().__init__(**kwargs) - pinecone.init(api_key=pinecone_api_key, environment=pinecone_env) + pc = pinecone.Pinecone(api_key=pinecone_api_key) - self.index = pinecone.Index(pinecone_index) + self.index = pc.Index(pinecone_index) self.namespace = pinecone_namespace self.mongo_db_name = mongo_db_name @@ -98,11 +97,9 @@ def _add_documents(self, df: pd.DataFrame): to_upsert.append(vector) - # Current (November 2023) Pinecone upload rules: - # - Max 1000 vectors per batch - # - Max 2 MB per batch - # Sparse vectors are heavier, so we reduce the batch size when using them. - MAX_PINECONE_BATCH_SIZE = 100 if use_sparse_vector else 1000 + # Current (February 2024) Pinecone upload rules: + # - Max 100 vectors per batch + MAX_PINECONE_BATCH_SIZE = 100 for i in range(0, len(to_upsert), MAX_PINECONE_BATCH_SIZE): self.index.upsert(vectors=to_upsert[i : i + MAX_PINECONE_BATCH_SIZE], namespace=self.namespace) diff --git a/buster/parsers/parser.py b/buster/parsers/parser.py index fe3b596..6ceb626 100644 --- a/buster/parsers/parser.py +++ b/buster/parsers/parser.py @@ -94,8 +94,7 @@ def relative_path(self) -> str: return self._relative_path @abstractmethod - def find_sections(self) -> Iterator[Section]: - ... + def find_sections(self) -> Iterator[Section]: ... def parse(self) -> list[Section]: """Parse the documents into sections, respecting the lenght constraints.""" diff --git a/buster/retriever/service.py b/buster/retriever/service.py index b068689..5155cb0 100644 --- a/buster/retriever/service.py +++ b/buster/retriever/service.py @@ -18,7 +18,6 @@ class ServiceRetriever(Retriever): def __init__( self, pinecone_api_key: str, - pinecone_env: str, pinecone_index: str, pinecone_namespace: str, mongo_uri: str, @@ -43,9 +42,9 @@ def __init__( """ super().__init__(**kwargs) - pinecone.init(api_key=pinecone_api_key, environment=pinecone_env) + pc = pinecone.Pinecone(api_key=pinecone_api_key) - self.index = pinecone.Index(pinecone_index) + self.index = pc.Index(pinecone_index) self.namespace = pinecone_namespace self.client = MongoClient(mongo_uri, server_api=ServerApi("1")) diff --git a/requirements.txt b/requirements.txt index 40f53dc..95d387b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ matplotlib numpy>=1.25 openai>=1.0 pandas>=2.1.3 -pinecone-client +pinecone-client>=3.0.2 pinecone-text>=0.6.0 pymongo pytest