From 0005225d25a6b3a0d75da99a225eea165bdf704a Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 23 May 2024 16:28:22 +0330 Subject: [PATCH 1/3] feat: Add logs to ingestion pipeline! --- dags/hivemind_etl_helpers/ingestion_pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dags/hivemind_etl_helpers/ingestion_pipeline.py b/dags/hivemind_etl_helpers/ingestion_pipeline.py index a49ffbcd..7e6cd046 100644 --- a/dags/hivemind_etl_helpers/ingestion_pipeline.py +++ b/dags/hivemind_etl_helpers/ingestion_pipeline.py @@ -39,6 +39,7 @@ def __init__(self, community_id: str, collection_name: str, testing: bool = Fals def run_pipeline(self, docs: list[Document]): # qdrant is just collection based and doesn't have any database + logging.info(f"{len(docs)} docuemnts was extracted and now loading into QDrant DB!") qdrant_collection_name = f"{self.collection_name}_{self.platform_name}" vector_access = QDrantVectorAccess(collection_name=qdrant_collection_name) vector_store = vector_access.setup_qdrant_vector_store() @@ -61,6 +62,7 @@ def run_pipeline(self, docs: list[Document]): ), docstore_strategy=DocstoreStrategy.UPSERTS, ) + logging.info("Pipeline created, now inserting documents into pipeline!") try: nodes = pipeline.run(documents=docs, show_progress=True) return nodes From 6414fdd9afdf71c8cbbd14f1966f5476df337c3a Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 23 May 2024 16:33:24 +0330 Subject: [PATCH 2/3] feat: printing qdrant creds! HAVE TO BE REMOVED! --- dags/hivemind_etl_helpers/debugs.py | 8 ++++++++ dags/hivemind_mediawiki_etl.py | 3 +++ 2 files changed, 11 insertions(+) create mode 100644 dags/hivemind_etl_helpers/debugs.py diff --git a/dags/hivemind_etl_helpers/debugs.py b/dags/hivemind_etl_helpers/debugs.py new file mode 100644 index 00000000..1b91d59f --- /dev/null +++ b/dags/hivemind_etl_helpers/debugs.py @@ -0,0 +1,8 @@ +import logging + +from tc_hivemind_backend.db.credentials import load_qdrant_credentials + + +def pring_qdrant_creds(): + qdrant_creds = load_qdrant_credentials() + logging.info(f"qdrant_creds: {qdrant_creds}") \ No newline at end of file diff --git a/dags/hivemind_mediawiki_etl.py b/dags/hivemind_mediawiki_etl.py index 339ec73e..2dec6019 100644 --- a/dags/hivemind_mediawiki_etl.py +++ b/dags/hivemind_mediawiki_etl.py @@ -5,6 +5,7 @@ from airflow import DAG from airflow.decorators import task +from hivemind_etl_helpers.debugs import pring_qdrant_creds from hivemind_etl_helpers.mediawiki_etl import process_mediawiki_etl from hivemind_etl_helpers.src.utils.modules import ModulesMediaWiki @@ -19,6 +20,8 @@ def get_mediawiki_communities() -> list[dict[str, str | list[str]]]: """ Getting all communities having mediawiki from database """ + # TODO: REMOVE!!! + pring_qdrant_creds() communities = ModulesMediaWiki().get_learning_platforms() return communities From 9a285e596d853decbd224254811fd93e04983430 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 23 May 2024 16:43:37 +0330 Subject: [PATCH 3/3] fix: lint issues! --- dags/hivemind_etl_helpers/debugs.py | 2 +- dags/hivemind_etl_helpers/ingestion_pipeline.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dags/hivemind_etl_helpers/debugs.py b/dags/hivemind_etl_helpers/debugs.py index 1b91d59f..799bd95c 100644 --- a/dags/hivemind_etl_helpers/debugs.py +++ b/dags/hivemind_etl_helpers/debugs.py @@ -5,4 +5,4 @@ def pring_qdrant_creds(): qdrant_creds = load_qdrant_credentials() - logging.info(f"qdrant_creds: {qdrant_creds}") \ No newline at end of file + logging.info(f"qdrant_creds: {qdrant_creds}") diff --git a/dags/hivemind_etl_helpers/ingestion_pipeline.py b/dags/hivemind_etl_helpers/ingestion_pipeline.py index 7e6cd046..2d8080b7 100644 --- a/dags/hivemind_etl_helpers/ingestion_pipeline.py +++ b/dags/hivemind_etl_helpers/ingestion_pipeline.py @@ -39,7 +39,9 @@ def __init__(self, community_id: str, collection_name: str, testing: bool = Fals def run_pipeline(self, docs: list[Document]): # qdrant is just collection based and doesn't have any database - logging.info(f"{len(docs)} docuemnts was extracted and now loading into QDrant DB!") + logging.info( + f"{len(docs)} docuemnts was extracted and now loading into QDrant DB!" + ) qdrant_collection_name = f"{self.collection_name}_{self.platform_name}" vector_access = QDrantVectorAccess(collection_name=qdrant_collection_name) vector_store = vector_access.setup_qdrant_vector_store()