-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #153 from TogetherCrew/feat/add-qdrant
feat: Adding Qdrant vectorstore db support!
- Loading branch information
Showing 17 changed files with 169 additions and 233 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,63 +1,70 @@ | ||
from dags.hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db | ||
from dags.hivemind_etl_helpers.src.utils.redis import RedisSingleton | ||
from llama_index.core import MockEmbedding | ||
import logging | ||
|
||
from hivemind_etl_helpers.src.utils.credentials import load_redis_credentials | ||
from hivemind_etl_helpers.src.utils.mongo import get_mongo_uri | ||
from hivemind_etl_helpers.src.utils.redis import RedisSingleton | ||
from llama_index.core import Document, MockEmbedding | ||
from llama_index.core.ingestion import ( | ||
DocstoreStrategy, | ||
IngestionCache, | ||
IngestionPipeline, | ||
) | ||
from llama_index.core.node_parser import SemanticSplitterNodeParser | ||
from llama_index.storage.docstore.postgres import PostgresDocumentStore | ||
from llama_index.storage.docstore.mongodb import MongoDocumentStore | ||
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache | ||
from llama_index.vector_stores.postgres import PGVectorStore | ||
from tc_hivemind_backend.db.credentials import load_postgres_credentials | ||
from tc_hivemind_backend.db.qdrant import QdrantSingleton | ||
from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams | ||
from tc_hivemind_backend.embeddings.cohere import CohereEmbedding | ||
from tc_hivemind_backend.qdrant_vector_access import QDrantVectorAccess | ||
|
||
|
||
class CustomIngestionPipeline: | ||
def __init__(self, community_id: str, table_name: str, testing: bool = False): | ||
self.postgres_credentials = load_postgres_credentials() | ||
self.table_name = table_name | ||
self.dbname = f"community_{community_id}" | ||
def __init__(self, community_id: str, collection_name: str, testing: bool = False): | ||
self.community_id = community_id | ||
self.qdrant_client = QdrantSingleton.get_instance().client | ||
|
||
_, self.embedding_dim = load_model_hyperparams() | ||
self.pg_creds = load_postgres_credentials() | ||
self.redis_cred = load_redis_credentials() | ||
self.collection_name = community_id | ||
self.platform_name = collection_name | ||
|
||
self.embed_model = ( | ||
CohereEmbedding() if not testing else MockEmbedding(embed_dim=1024) | ||
CohereEmbedding() | ||
if not testing | ||
else MockEmbedding(embed_dim=self.embedding_dim) | ||
) | ||
self.redis_client = RedisSingleton.get_instance().get_client() | ||
|
||
def run_pipeline(self, docs): | ||
_, embedding_dim = load_model_hyperparams() | ||
setup_db(community_id=self.community_id) | ||
def run_pipeline(self, docs: list[Document]): | ||
# qdrant is just collection based and doesn't have any database | ||
qdrant_collection_name = f"{self.collection_name}_{self.platform_name}" | ||
vector_access = QDrantVectorAccess(collection_name=qdrant_collection_name) | ||
vector_store = vector_access.setup_qdrant_vector_store() | ||
|
||
pipeline = IngestionPipeline( | ||
transformations=[ | ||
SemanticSplitterNodeParser(embed_model=self.embed_model), | ||
self.embed_model, | ||
], | ||
docstore=PostgresDocumentStore.from_params( | ||
host=self.postgres_credentials["host"], | ||
port=self.postgres_credentials["port"], | ||
user=self.postgres_credentials["user"], | ||
password=self.postgres_credentials["password"], | ||
database=self.dbname, | ||
table_name=self.table_name + "_docstore", | ||
), | ||
vector_store=PGVectorStore.from_params( | ||
host=self.postgres_credentials["host"], | ||
port=self.postgres_credentials["port"], | ||
user=self.postgres_credentials["user"], | ||
password=self.postgres_credentials["password"], | ||
database=self.dbname, | ||
table_name=self.table_name, | ||
embed_dim=embedding_dim, | ||
docstore=MongoDocumentStore.from_uri( | ||
uri=get_mongo_uri(), | ||
db_name=f"docstore_{self.collection_name}", | ||
namespace=self.platform_name, | ||
), | ||
vector_store=vector_store, | ||
cache=IngestionCache( | ||
cache=RedisCache.from_redis_client(self.redis_client), | ||
collection=self.dbname + f"_{self.table_name}" + "_ingestion_cache", | ||
collection=f"{self.collection_name}_{self.platform_name}_ingestion_cache", | ||
docstore_strategy=DocstoreStrategy.UPSERTS, | ||
), | ||
docstore_strategy=DocstoreStrategy.UPSERTS, | ||
) | ||
|
||
nodes = pipeline.run(documents=docs, show_progress=True) | ||
|
||
return nodes | ||
try: | ||
nodes = pipeline.run(documents=docs, show_progress=True) | ||
return nodes | ||
except Exception as e: | ||
logging.error( | ||
f"An error occurred while running the pipeline: {e}", exc_info=True | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.