From a40acc9a4500c817211754e6d9764b2be6d323da Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 7 Mar 2024 17:53:59 +0330 Subject: [PATCH 01/10] feat: llama-index code migration! - We needed to update the code based on the latest release of the llama-index library. - We also added skips to the previously written GDrive ETL tests, because that task was paused and is yet to be completed. --- .../discord_mongo_summary_etl.py | 12 ++++--- .../discord_mongo_vector_store_etl.py | 22 ++++--------- .../discourse_summary_etl.py | 18 +++++++--- .../discourse_vectorstore_etl.py | 9 +++-- dags/hivemind_etl_helpers/github_etl.py | 2 +- .../discord_raw_message_to_document.py | 2 +- .../src/db/discord/discord_summary.py | 33 +++++++++++-------- .../db/discord/summary/prepare_summaries.py | 15 +++++---- .../src/db/discord/summary/summary_utils.py | 2 +- .../utils/transform_discord_raw_messges.py | 2 +- .../src/db/discourse/raw_post_to_documents.py | 2 +- .../db/discourse/summary/prepare_summary.py | 14 ++++---- .../src/db/discourse/summary/summary_utils.py | 2 +- .../utils/transform_raw_to_documents.py | 2 +- .../src/db/gdrive/retrieve_documents.py | 2 +- .../src/db/github/load/load_raw_data.py | 12 ++++--- .../src/db/github/load/prepare_deletion.py | 2 +- .../src/db/github/transform/comments.py | 2 +- .../src/db/github/transform/commits.py | 2 +- .../src/db/github/transform/issues.py | 2 +- .../src/db/github/transform/pull_requests.py | 2 +- .../src/document_node_parser.py | 11 ++++--- .../src/utils/check_documents.py | 2 +- .../src/utils/sort_summary_docs.py | 2 +- .../src/utils/summary_base.py | 33 +++++++------------ .../test_discord_prepare_summary.py | 29 +++++----------- .../test_discord_prepare_thread_summaries.py | 18 ++++------ .../integration/test_discord_summary_base.py | 22 +++++-------- ...scord_summary_prepare_channel_summaries.py | 19 ++++------- ...discord_summary_prepare_daily_summaries.py | 19 ++++------- ...scord_summary_transform_daily_summaries.py | 2 +- .../integration/test_discord_summary_utils.py | 2 +- .../test_gdrive_convert_doc_to_id_date.py | 4 ++- .../test_gdrive_db_fetch_files_modified_at.py | 4 ++- .../integration/test_gdrive_delete_records.py | 4 ++- .../test_gdrive_documents_to_delete.py | 4 ++- .../test_gdrive_documents_to_insert.py | 4 ++- .../test_pg_vector_access_with_discord.py | 2 +- ...test_discourse_summary_prepare_category.py | 17 +++------- .../test_discourse_summary_prepare_daily.py | 17 +++------- ...scourse_summary_prepare_daily_documents.py | 15 +++------ .../test_discourse_summary_prepare_topic.py | 15 +++------ ...drive_files_modified_at_process_results.py | 2 ++ .../unit/test_github_etl_prepare_deletion.py | 2 +- .../unit/test_github_transform_comments.py | 2 +- .../unit/test_github_transform_commits.py | 2 +- .../unit/test_github_transform_issues.py | 2 +- .../tests/unit/test_github_transform_prs.py | 2 +- .../tests/unit/test_sort_summary_docs.py | 2 +- requirements.txt | 3 +- 50 files changed, 196 insertions(+), 227 deletions(-) diff --git a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py index 9fd72f1f..375e560b 100644 --- a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py +++ b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py @@ -10,7 +10,9 @@ ) from hivemind_etl_helpers.src.document_node_parser import configure_node_parser from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily -from llama_index.response_synthesizers import get_response_synthesizer +from 
llama_index.core.response_synthesizers import get_response_synthesizer +from llama_index.llms.openai import OpenAI +from llama_index.core import Settings from tc_hivemind_backend.db.pg_db_utils import setup_db from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams from tc_hivemind_backend.embeddings.cohere import CohereEmbedding @@ -93,16 +95,16 @@ def process_discord_summaries(community_id: str, verbose: bool = False) -> None: node_parser = configure_node_parser(chunk_size=chunk_size) pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) - embed_model = CohereEmbedding() + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") pg_vector.save_documents_in_batches( community_id=community_id, documents=docs_daily_sorted, batch_size=100, - node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, - embed_dim=embedding_dim, request_per_minute=10000, deletion_query=deletion_query, ) diff --git a/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py b/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py index 60d3d02d..584c3963 100644 --- a/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py +++ b/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py @@ -1,4 +1,3 @@ -import argparse import logging from datetime import timedelta @@ -13,6 +12,8 @@ from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from tc_hivemind_backend.pg_vector_access import PGVectorAccess +from llama_index.core import Settings +from llama_index.llms.openai import OpenAI def process_discord_guild_mongo(community_id: str) -> None: @@ -52,7 +53,10 @@ def process_discord_guild_mongo(community_id: str) -> None: node_parser = configure_node_parser(chunk_size=chunk_size) pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) - embed_model = CohereEmbedding() + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") pg_vector.save_documents_in_batches( community_id=community_id, @@ -60,19 +64,5 @@ def process_discord_guild_mongo(community_id: str) -> None: batch_size=100, node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, - embed_dim=embedding_dim, request_per_minute=10000, - # max_request_per_day=REQUEST_PER_DAY, ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser() - parser.add_argument( - "community_id", type=str, help="the Community that the guild is related to" - ) - args = parser.parse_args() - - process_discord_guild_mongo(community_id=args.community_id) diff --git a/dags/hivemind_etl_helpers/discourse_summary_etl.py b/dags/hivemind_etl_helpers/discourse_summary_etl.py index b620aba5..5344b146 100644 --- a/dags/hivemind_etl_helpers/discourse_summary_etl.py +++ b/dags/hivemind_etl_helpers/discourse_summary_etl.py @@ -10,8 +10,9 @@ from hivemind_etl_helpers.src.db.discourse.utils.get_forums import get_forum_uuid from hivemind_etl_helpers.src.document_node_parser import configure_node_parser from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily -from llama_index import Document -from llama_index.response_synthesizers import get_response_synthesizer +from llama_index.core import Document, Settings +from 
llama_index.core.response_synthesizers import get_response_synthesizer +from llama_index.llms.openai import OpenAI from neo4j._data import Record from tc_hivemind_backend.db.pg_db_utils import setup_db from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams @@ -145,13 +146,16 @@ def process_forum( f"{log_prefix} Saving discourse summaries (extracting the embedding and saving)" ) + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") + pg_vector.save_documents_in_batches( community_id=community_id, documents=sorted_daily_docs, batch_size=100, - node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, embed_dim=embedding_dim, request_per_minute=10000, deletion_query=deletion_query, @@ -162,7 +166,11 @@ def process_forum( def get_summary_documents( forum_id: str, raw_data_grouped: list[Record], forum_endpoint: str -) -> tuple[list[Document], list[Document], list[Document],]: +) -> tuple[ + list[Document], + list[Document], + list[Document], +]: """ prepare the summary documents for discourse based on given raw data diff --git a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py index e1be6d8b..0fca0ec2 100644 --- a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py +++ b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py @@ -11,6 +11,8 @@ from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from tc_hivemind_backend.pg_vector_access import PGVectorAccess +from llama_index.core import Settings +from llama_index.llms.openai import OpenAI def process_discourse_vectorstore( @@ -127,15 +129,16 @@ def process_forum( WHERE (metadata_->>'postId')::float IN {deletion_ids}; """ - embed_model = CohereEmbedding() + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") pg_vector.save_documents_in_batches( community_id=community_id, documents=documents, batch_size=100, - node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, embed_dim=embedding_dim, doc_file_ids_to_delete=deletion_query, ) diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index fd5844e5..ddae97f4 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -21,7 +21,7 @@ transform_issues, transform_prs, ) -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.pg_db_utils import setup_db diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py index dde1e129..524d126a 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py +++ b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py @@ -4,7 +4,7 @@ from hivemind_etl_helpers.src.db.discord.utils.transform_discord_raw_messges import ( transform_discord_raw_messages, ) -from llama_index import Document +from llama_index.core import Document def discord_raw_to_docuemnts( diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py index 3461fd77..6901fffd 100644 --- 
a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py +++ b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py @@ -10,44 +10,51 @@ from hivemind_etl_helpers.src.db.discord.summary.summary_utils import ( transform_daily_summary_to_document, ) -from llama_index import Document, ServiceContext -from llama_index.llms import LLM -from llama_index.response_synthesizers.base import BaseSynthesizer +from llama_index.core import Document, Settings +from llama_index.core.llms import LLM +from llama_index.core.response_synthesizers.base import BaseSynthesizer class DiscordSummary(PrepareSummaries): def __init__( self, - service_context: ServiceContext | None = None, response_synthesizer: BaseSynthesizer | None = None, - llm: LLM | None = None, verbose: bool = False, + **kwargs, ) -> None: """ initialize the summary preparation class Parameters ----------- - service_context : llama_index.ServiceContext | None - the service context for llama_index to work - if nothing passed will be to `llm=gpt-3.5-turbo` and `chunk_size = 512` set_response_synthesizer : BaseSynthesizer | None whether to set a response_synthesizer to refine the summaries or not if nothing passed would be set to `None` verbose : bool whether to show the progress of summarizing or not + **kwargs : + llm : LLM + the llm to use + if nothing passed, it would use the default `llama_index.core.Settings.llm` + + Note: `chunk_size` is read from `llama_index.core.Settings`. """ - super().__init__(service_context, response_synthesizer, llm, verbose) + llm = kwargs.get("llm", Settings.llm) + + super().__init__( + llm=llm, response_synthesizer=response_synthesizer, verbose=verbose + ) def prepare_summaries( self, guild_id: str, summarization_prefix: str, from_date: datetime | None = None, - ) -> tuple[list[Document], list[Document], list[Document],]: + ) -> tuple[ + list[Document], + list[Document], + list[Document], + ]: """ prepare per thread summaries of discord messages. Note: This will always process the data until 1 day ago. 
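The hunk above is the core of the migration: llama-index 0.10 deprecates ServiceContext in favour of the global Settings object, and the llm is now taken from kwargs with Settings.llm as the fallback. A minimal sketch of the new configuration pattern, assuming llama-index >= 0.10 with the llama-index-llms-openai package installed (SentenceSplitter stands in here for this repo's configure_node_parser helper):

from llama_index.core import Document, Settings, SummaryIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

# global defaults replace ServiceContext.from_defaults(llm=..., chunk_size=512)
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.chunk_size = 512
Settings.node_parser = SentenceSplitter(chunk_size=512)

# indices now read the global Settings implicitly instead of a service_context ...
index = SummaryIndex.from_documents(documents=[Document(text="hello world")])
# ... or accept an explicit per-call override, as SummaryBase.retrieve_summary
# does later in this patch via as_query_engine(..., llm=self.llm)
engine = index.as_query_engine(response_mode="tree_summarize", llm=Settings.llm)

The same four Settings assignments recur in each ETL entrypoint touched by this patch.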
diff --git a/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py b/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py index 6bd4f3fc..b6701d31 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py +++ b/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py @@ -8,20 +8,23 @@ transform_discord_raw_messages, ) from hivemind_etl_helpers.src.utils.summary_base import SummaryBase -from llama_index import Document, ServiceContext -from llama_index.llms import LLM -from llama_index.response_synthesizers.base import BaseSynthesizer +from llama_index.core import Document, Settings +from llama_index.core.llms import LLM +from llama_index.core.response_synthesizers.base import BaseSynthesizer class PrepareSummaries(SummaryBase): def __init__( self, - service_context: ServiceContext | None = None, response_synthesizer: BaseSynthesizer | None = None, - llm: LLM | None = None, verbose: bool = False, + **kwargs, ) -> None: - super().__init__(service_context, response_synthesizer, llm, verbose) + llm = kwargs.get("llm", Settings.llm) + + super().__init__( + llm=llm, response_synthesizer=response_synthesizer, verbose=verbose + ) # initialization self.prefix: str = "" diff --git a/dags/hivemind_etl_helpers/src/db/discord/summary/summary_utils.py b/dags/hivemind_etl_helpers/src/db/discord/summary/summary_utils.py index 561b8af5..a6f7b6ff 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/summary/summary_utils.py +++ b/dags/hivemind_etl_helpers/src/db/discord/summary/summary_utils.py @@ -1,4 +1,4 @@ -from llama_index import Document +from llama_index.core import Document def transform_thread_summary_to_document( diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index 5d5db78d..f6194145 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -20,7 +20,7 @@ prepare_raction_ids, ) from hivemind_etl_helpers.src.db.globals import DATE_FORMAT -from llama_index import Document +from llama_index.core import Document def transform_discord_raw_messages( diff --git a/dags/hivemind_etl_helpers/src/db/discourse/raw_post_to_documents.py b/dags/hivemind_etl_helpers/src/db/discourse/raw_post_to_documents.py index 3f812996..565d8f1f 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/raw_post_to_documents.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/raw_post_to_documents.py @@ -4,7 +4,7 @@ from hivemind_etl_helpers.src.db.discourse.utils.transform_raw_to_documents import ( transform_raw_to_documents, ) -from llama_index import Document +from llama_index.core import Document def fetch_discourse_documents( diff --git a/dags/hivemind_etl_helpers/src/db/discourse/summary/prepare_summary.py b/dags/hivemind_etl_helpers/src/db/discourse/summary/prepare_summary.py index 4fe0aee2..6aec9bff 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/summary/prepare_summary.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/summary/prepare_summary.py @@ -8,9 +8,8 @@ transform_summary_to_document, ) from hivemind_etl_helpers.src.utils.summary_base import SummaryBase -from llama_index import Document, ServiceContext -from llama_index.llms import LLM -from llama_index.response_synthesizers.base import BaseSynthesizer +from llama_index.core import Document, Settings +from 
llama_index.core.response_synthesizers.base import BaseSynthesizer class DiscourseSummary(SummaryBase): @@ -18,12 +17,15 @@ def __init__( self, forum_id: str, forum_endpoint: str, - service_context: ServiceContext | None = None, response_synthesizer: BaseSynthesizer | None = None, - llm: LLM | None = None, verbose: bool = False, + **kwargs, ) -> None: - super().__init__(service_context, response_synthesizer, llm, verbose) + llm = kwargs.get("llm", Settings.llm) + + super().__init__( + llm=llm, response_synthesizer=response_synthesizer, verbose=verbose + ) self.prefix = f"FORUM_ID: {forum_id} " self.forum_endpoint = forum_endpoint diff --git a/dags/hivemind_etl_helpers/src/db/discourse/summary/summary_utils.py b/dags/hivemind_etl_helpers/src/db/discourse/summary/summary_utils.py index 10746213..30de1364 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/summary/summary_utils.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/summary/summary_utils.py @@ -1,4 +1,4 @@ -from llama_index import Document +from llama_index.core import Document def transform_summary_to_document( diff --git a/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py b/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py index dfce2b4d..b9e4042a 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py @@ -1,4 +1,4 @@ -from llama_index import Document +from llama_index.core import Document from neo4j import Record diff --git a/dags/hivemind_etl_helpers/src/db/gdrive/retrieve_documents.py b/dags/hivemind_etl_helpers/src/db/gdrive/retrieve_documents.py index 048964d9..d08bfa3e 100644 --- a/dags/hivemind_etl_helpers/src/db/gdrive/retrieve_documents.py +++ b/dags/hivemind_etl_helpers/src/db/gdrive/retrieve_documents.py @@ -1,4 +1,4 @@ -from llama_index import Document, download_loader +from llama_index.core import Document, download_loader def retrieve_folder_documents(folder_id: str) -> list[Document]: diff --git a/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py b/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py index 17db0a7f..ea6756a2 100644 --- a/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py +++ b/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py @@ -1,8 +1,10 @@ from hivemind_etl_helpers.src.document_node_parser import configure_node_parser -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from tc_hivemind_backend.pg_vector_access import PGVectorAccess +from llama_index.core import Settings +from llama_index.llms.openai import OpenAI def load_documents_into_pg_database( @@ -37,16 +39,18 @@ def load_documents_into_pg_database( pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) - embed_model = CohereEmbedding() node_parser = configure_node_parser(chunk_size=chunk_size) + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") + pg_vector.save_documents_in_batches( community_id=community_id, documents=documents, batch_size=100, - node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, embed_dim=embedding_dim, request_per_minute=10000, deletion_query=kwargs.get("deletion_query", ""), diff 
--git a/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py b/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py index f0cd1af2..971abd70 100644 --- a/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py +++ b/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.utils.check_documents import check_documents -from llama_index import Document +from llama_index.core import Document class PrepareDeletion: diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/comments.py b/dags/hivemind_etl_helpers/src/db/github/transform/comments.py index 243f6cf8..b4559b6b 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/comments.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/comments.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubComment -from llama_index import Document +from llama_index.core import Document def transform_comments(data: list[GitHubComment]) -> list[Document]: diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/commits.py b/dags/hivemind_etl_helpers/src/db/github/transform/commits.py index bef52627..4efbd880 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/commits.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/commits.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubCommit -from llama_index import Document +from llama_index.core import Document def transform_commits(data: list[GitHubCommit]) -> list[Document]: diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/issues.py b/dags/hivemind_etl_helpers/src/db/github/transform/issues.py index 38dc1789..8c7045ab 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/issues.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubIssue -from llama_index import Document +from llama_index.core import Document def transform_issues(data: list[GitHubIssue]) -> tuple[list[Document], list[Document]]: diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/pull_requests.py b/dags/hivemind_etl_helpers/src/db/github/transform/pull_requests.py index 4f35552e..73b56e7b 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/pull_requests.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/pull_requests.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubPullRequest -from llama_index import Document +from llama_index.core import Document def transform_prs(data: list[GitHubPullRequest]) -> list[Document]: diff --git a/dags/hivemind_etl_helpers/src/document_node_parser.py b/dags/hivemind_etl_helpers/src/document_node_parser.py index 29bc11a9..3d6bace1 100644 --- a/dags/hivemind_etl_helpers/src/document_node_parser.py +++ b/dags/hivemind_etl_helpers/src/document_node_parser.py @@ -1,9 +1,12 @@ from typing import Callable -from llama_index.node_parser import SimpleNodeParser -from llama_index.node_parser.text.sentence import CHUNKING_REGEX, DEFAULT_PARAGRAPH_SEP -from llama_index.text_splitter import SentenceSplitter -from llama_index.utils import get_tokenizer +from llama_index.core.node_parser import SimpleNodeParser +from llama_index.core.node_parser.text.sentence import ( + CHUNKING_REGEX, + DEFAULT_PARAGRAPH_SEP, +) +from llama_index.core.text_splitter import SentenceSplitter +from llama_index.core.utils import get_tokenizer def configure_node_parser( diff --git 
a/dags/hivemind_etl_helpers/src/utils/check_documents.py b/dags/hivemind_etl_helpers/src/utils/check_documents.py index 903b59dc..11159d75 100644 --- a/dags/hivemind_etl_helpers/src/utils/check_documents.py +++ b/dags/hivemind_etl_helpers/src/utils/check_documents.py @@ -3,7 +3,7 @@ from dateutil import parser from hivemind_etl_helpers.src.db.gdrive.db_utils import fetch_files_date_field -from llama_index import Document +from llama_index.core import Document def check_documents( diff --git a/dags/hivemind_etl_helpers/src/utils/sort_summary_docs.py b/dags/hivemind_etl_helpers/src/utils/sort_summary_docs.py index dbbd2059..762ed79e 100644 --- a/dags/hivemind_etl_helpers/src/utils/sort_summary_docs.py +++ b/dags/hivemind_etl_helpers/src/utils/sort_summary_docs.py @@ -1,6 +1,6 @@ from datetime import datetime -from llama_index import Document +from llama_index.core import Document def sort_summaries_daily( diff --git a/dags/hivemind_etl_helpers/src/utils/summary_base.py b/dags/hivemind_etl_helpers/src/utils/summary_base.py index fa969007..ad85d634 100644 --- a/dags/hivemind_etl_helpers/src/utils/summary_base.py +++ b/dags/hivemind_etl_helpers/src/utils/summary_base.py @@ -1,14 +1,13 @@ -from llama_index import Document, ServiceContext, SummaryIndex -from llama_index.llms import LLM, OpenAI -from llama_index.response_synthesizers.base import BaseSynthesizer +from llama_index.core import Document, SummaryIndex +from llama_index.core.llms import LLM +from llama_index.core.response_synthesizers.base import BaseSynthesizer class SummaryBase: def __init__( self, - service_context: ServiceContext | None = None, + llm: LLM, response_synthesizer: BaseSynthesizer | None = None, - llm: LLM | None = None, verbose: bool = False, ) -> None: """ Parameters ----------- - service_context : llama_index.ServiceContext | None - the service context for llama_index to work - if nothing passed will be to `llm=gpt-3.5-turbo` and `chunk_size = 512` set_response_synthesizer : bool | None whether to set a response_synthesizer to refine the summaries or not if nothing passed would be set to None verbose : bool whether to show the progress of summarizing or not - testing : bool - testing mode would use a MockLLM - """ - if llm is None: - llm = OpenAI(temperature=0, model="gpt-3.5-turbo") - - if service_context is None: - service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512) + llm : LLM + the llm to use + subclasses default it to `llama_index.core.Settings.llm` when nothing is passed to them - self.service_context = service_context - self.response_synthesizer = response_synthesizer + Note: `chunk_size` is read from `llama_index.core.Settings`. 
+ """ self.llm = llm + self.response_synthesizer = response_synthesizer self.verbose = verbose def _get_summary( @@ -50,7 +39,6 @@ def _get_summary( summary_index = SummaryIndex.from_documents( documents=messages_document, response_synthesizer=self.response_synthesizer, - service_context=self.service_context, show_progress=self.verbose, ) summary_response = self.retrieve_summary(summary_index, summarization_query) @@ -74,6 +62,7 @@ def retrieve_summary( query_engine = doc_summary_index.as_query_engine( response_mode="tree_summarize", response_synthesizer=self.response_synthesizer, + llm=self.llm, ) response = query_engine.query(query) return response.response diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py index 24088490..cb642354 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py @@ -4,18 +4,15 @@ from bson import ObjectId from hivemind_etl_helpers.src.db.discord.discord_summary import DiscordSummary from hivemind_etl_helpers.src.utils.mongo import MongoSingleton -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscordGroupedDataPreparation(TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), - chunk_size=512, - embed_model=MockEmbedding(embed_dim=1024), - ) + Settings.llm = MockLLM() + Settings.chunk_size = 512 + Settings.embed_model = MockEmbedding(embed_dim=1024) def setup_db( self, @@ -101,9 +98,7 @@ def test_empty_data_prepare_without_per_date(self): client[guild_id].drop_collection("rawinfos") self.setUp() - discord_summary = DiscordSummary( - service_context=self.service_context, llm=self.mock_llm - ) + discord_summary = DiscordSummary() ( thread_summary_docs, channel_summary_docs, @@ -127,9 +122,7 @@ def test_empty_data_prepare_with_from_date(self): from_date = datetime(2023, 8, 1) self.setUp() - discord_summary = DiscordSummary( - service_context=self.service_context, llm=self.mock_llm - ) + discord_summary = DiscordSummary() ( thread_summary_docs, channel_summary_docs, @@ -221,9 +214,7 @@ def test_some_data_prepare_with_from_date(self): client[guild_id]["rawinfos"].insert_many(raw_data) self.setUp() - discord_summary = DiscordSummary( - service_context=self.service_context, llm=self.mock_llm - ) + discord_summary = DiscordSummary() ( thread_summary_docs, channel_summary_docs, @@ -329,9 +320,7 @@ def test_some_data_prepare_after_from_date(self): client[guild_id]["rawinfos"].insert_many(raw_data) self.setUp() - discord_summary = DiscordSummary( - service_context=self.service_context, llm=self.mock_llm - ) + discord_summary = DiscordSummary() ( thread_summary_docs, channel_summary_docs, diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py index b1f5eb71..3ec1eb5a 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py @@ -5,24 +5,22 @@ PrepareSummaries, ) from hivemind_etl_helpers.src.utils.mongo import MongoSingleton -from llama_index import 
MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestPrepareSummaries(unittest.TestCase): def setUp(self): self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_thread_summaries_empty_data(self): self.setUp() guild_id = "1234" - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries = prepare_summaries.prepare_thread_summaries( guild_id=guild_id, raw_data_grouped={}, @@ -111,9 +109,7 @@ def test_prepare_thread_summaries_some_data(self): } } - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries = prepare_summaries.prepare_thread_summaries( guild_id=guild_id, raw_data_grouped=sample_grouped_data, diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_base.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_base.py index 8a6f840e..5cfdf091 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_base.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_base.py @@ -2,23 +2,22 @@ from unittest.mock import Mock from hivemind_etl_helpers.src.utils.summary_base import SummaryBase -from llama_index import Document, MockEmbedding, ServiceContext, SummaryIndex -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings, SummaryIndex +from llama_index.core.llms import MockLLM class TestSummaryBase(unittest.TestCase): def setUp(self): - # Set up a sample ServiceContext for testing - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + # Set up a sample Settings for testing + Settings.llm = MockLLM() + Settings.chunk_size = 512 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_summary_base_default_values(self): # Test the default values of the SummaryBase class # We need to set the service_context as we need the MockEmbedding model self.setUp() - summary_base = SummaryBase(llm=MockLLM(), service_context=self.service_context) - self.assertIsNotNone(summary_base.service_context) + summary_base = SummaryBase(llm=MockLLM()) self.assertIsNone(summary_base.response_synthesizer) self.assertIsNotNone(summary_base.llm) self.assertFalse(summary_base.verbose) @@ -28,19 +27,17 @@ def test_summary_base_custom_values(self): llm_mock = MockLLM() response_synthesizer_mock = Mock() summary_base = SummaryBase( - service_context=self.service_context, response_synthesizer=response_synthesizer_mock, llm=llm_mock, verbose=True, ) - self.assertEqual(summary_base.service_context, self.service_context) self.assertEqual(summary_base.response_synthesizer, response_synthesizer_mock) self.assertEqual(summary_base.llm, llm_mock) self.assertTrue(summary_base.verbose) def test_get_summary(self): # Test the _get_summary method - summary_base = SummaryBase(service_context=self.service_context) + summary_base = SummaryBase(llm=Settings.llm) messages_document = [Document(text="Document 1"), Document(text="Document 2")] result = summary_base._get_summary( @@ -51,10 +48,9 @@ def 
test_get_summary(self): def test_retrieve_summary(self): # Test the retrieve_summary method - summary_base = SummaryBase(service_context=self.service_context, llm=MockLLM()) + summary_base = SummaryBase(llm=MockLLM()) doc_summary_index = SummaryIndex.from_documents( documents=[Document(text="Document 1"), Document(text="Document 2")], - service_context=self.service_context, ) query = "Summarize this" result = summary_base.retrieve_summary(doc_summary_index, query) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_channel_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_channel_summaries.py index 678c86ce..04ef700d 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_channel_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_channel_summaries.py @@ -3,23 +3,20 @@ from hivemind_etl_helpers.src.db.discord.summary.prepare_summaries import ( PrepareSummaries, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestPrepareSummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_channel_summaries_empty_data(self): self.setUp() - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries, thread_docs = prepare_summaries.prepare_channel_summaries( thread_summaries={}, summarization_query="Please give a summary of the data you have.", @@ -44,9 +41,7 @@ def test_prepare_channel_summaries_some_data(self): } } - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries, thread_docs = prepare_summaries.prepare_channel_summaries( thread_summaries=sample_thread_summary, summarization_query="Please give a summary of the data you have.", diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_daily_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_daily_summaries.py index 391d8ec3..020d6d0c 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_daily_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_daily_summaries.py @@ -3,23 +3,20 @@ from hivemind_etl_helpers.src.db.discord.summary.prepare_summaries import ( PrepareSummaries, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestPrepareSummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_daily_summaries_empty_data(self): self.setUp() - prepare_summaries = PrepareSummaries( - 
service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries, channel_docs = prepare_summaries.prepare_daily_summaries( channel_summaries={}, summarization_query="Please give a summary of the data you have.", @@ -39,9 +36,7 @@ def test_prepare_daily_summaries_some_data(self): } } - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries, channel_docs = prepare_summaries.prepare_daily_summaries( channel_summaries=sample_channel_summary, summarization_query="Please give a summary of the data you have.", diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py index 34c639dd..56f1fab1 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.discord.summary.summary_utils import ( transform_daily_summary_to_document, ) -from llama_index import Document +from llama_index.core import Document class TestTransformDailySummaryToDocument(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_utils.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_utils.py index e891b044..45171eaf 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_utils.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_utils.py @@ -4,7 +4,7 @@ transform_channel_summary_to_document, transform_thread_summary_to_document, ) -from llama_index import Document +from llama_index.core import Document class TestTransformSummaryToDocument(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py index b9cd69ff..8dd3b2ab 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py @@ -1,10 +1,12 @@ from datetime import datetime, timezone from unittest import TestCase +import pytest from hivemind_etl_helpers.src.utils.check_documents import process_doc_to_id_date -from llama_index import Document +from llama_index.core import Document +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestDocumentGdriveProcessing(TestCase): def test_empty_documents(self): result = process_doc_to_id_date( diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py index 077a6e8b..4da7fcc2 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py @@ -1,13 +1,15 @@ from datetime import datetime from unittest import TestCase +import pytest import psycopg2 from hivemind_etl_helpers.src.db.gdrive.db_utils import fetch_files_date_field, setup_db -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.pg_vector_access import PGVectorAccess 
+@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestFetchGdriveFileIds(TestCase): def setUpDB(self, community: str): db_name = f"community_{community}" diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py index 20830e2a..38e39840 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py @@ -1,15 +1,17 @@ from datetime import datetime from unittest import TestCase +import pytest import psycopg2 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db from hivemind_etl_helpers.src.db.gdrive.delete_records import delete_records -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.db.pg_db_utils import convert_tuple_str from tc_hivemind_backend.pg_vector_access import PGVectorAccess +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestDeleteRecords(TestCase): def setUpDB(self, community_id: str): db_name = f"community_{community_id}" diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py index f30ede2a..86780db3 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py @@ -1,14 +1,16 @@ from datetime import datetime from unittest import TestCase +import pytest import psycopg2 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db from hivemind_etl_helpers.src.utils.check_documents import check_documents -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.pg_vector_access import PGVectorAccess +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestGdriveDocumentsToDelete(TestCase): def setUpDB(self, community_id: str): db_name = f"community_{community_id}" diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py index 18fe5819..56749d33 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py @@ -1,14 +1,16 @@ from datetime import datetime from unittest import TestCase +import pytest import psycopg2 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db from hivemind_etl_helpers.src.utils.check_documents import check_documents -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.pg_vector_access import PGVectorAccess +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestGdriveDocumentsToInsert(TestCase): def setUpDB(self, community_id: str): db_name = f"community_{community_id}" diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py index 9034bccb..f13a35d6 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py +++ 
b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py @@ -7,7 +7,7 @@ discord_raw_to_docuemnts, ) from hivemind_etl_helpers.src.utils.mongo import MongoSingleton -from llama_index.indices.vector_store import VectorStoreIndex +from llama_index.core.indices.vector_store import VectorStoreIndex from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.db.pg_db_utils import setup_db from tc_hivemind_backend.pg_vector_access import PGVectorAccess diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_category.py b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_category.py index 0770cf98..7c92f26a 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_category.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_category.py @@ -3,16 +3,15 @@ from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import ( DiscourseSummary, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscoursePrepareDailySummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_daily_summaries_empty_data(self): self.setUp() @@ -20,8 +19,6 @@ def test_prepare_daily_summaries_empty_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -40,8 +37,6 @@ def test_prepare_daily_summaries_some_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -80,8 +75,6 @@ def test_prepare_daily_summaries_some_data_check_documents(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily.py b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily.py index 13f2e0ed..b9480536 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily.py @@ -3,16 +3,15 @@ from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import ( DiscourseSummary, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscoursePrepareCategorySummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_category_summaries_empty_data(self): self.setUp() @@ -20,8 +19,6 @@ def 
test_prepare_category_summaries_empty_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -40,8 +37,6 @@ def test_prepare_category_summaries_some_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -105,8 +100,6 @@ def test_prepare_category_summaries_some_data_check_documents(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily_documents.py b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily_documents.py index 0b0510d9..600263ee 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily_documents.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily_documents.py @@ -3,16 +3,15 @@ from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import ( DiscourseSummary, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscoursePrepareDailySummaryDocuments(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_documents_empty_input(self): self.setUp() @@ -20,8 +19,6 @@ def test_prepare_documents_empty_input(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -36,8 +33,6 @@ def test_prepare_documents_some_inputs(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_topic.py b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_topic.py index 59a62655..7641d391 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_topic.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_topic.py @@ -4,16 +4,15 @@ from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import ( DiscourseSummary, ) -from llama_index import MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscoursePrepareTopicSummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_topic_summaries_empty_data(self): self.setUp() @@ -21,8 +20,6 @@ def 
test_prepare_topic_summaries_empty_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -67,8 +64,6 @@ def test_prepare_topic_summaries_some_data(self): raw_data_grouped.append(data) prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py b/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py index 3a5246c4..0a67c36d 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py @@ -1,9 +1,11 @@ from datetime import datetime from unittest import TestCase +import pytest from hivemind_etl_helpers.src.db.gdrive.db_utils import postprocess_results +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestProcessGdriveFileRetreivalResultsProcess(TestCase): def test_empty_list(self): self.assertEqual(postprocess_results([]), {}) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_etl_prepare_deletion.py b/dags/hivemind_etl_helpers/tests/unit/test_github_etl_prepare_deletion.py index ba78a24c..25235a54 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_etl_prepare_deletion.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_etl_prepare_deletion.py @@ -3,7 +3,7 @@ from unittest.mock import patch from hivemind_etl_helpers.src.db.github.load import PrepareDeletion -from llama_index import Document +from llama_index.core import Document class TestPrepareDeletion(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_comments.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_comments.py index 47f6a6bf..dc0edc61 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_comments.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_comments.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubComment from hivemind_etl_helpers.src.db.github.transform.comments import transform_comments -from llama_index import Document +from llama_index.core import Document class TestGithubTransformcomComments(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py index 73f2bbcd..ac23d49c 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubCommit from hivemind_etl_helpers.src.db.github.transform.commits import transform_commits -from llama_index import Document +from llama_index.core import Document class TestGithubTransformCommits(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_issues.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_issues.py index a833141e..d73e4a12 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_issues.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_issues.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubIssue from 
hivemind_etl_helpers.src.db.github.transform.issues import transform_issues -from llama_index import Document +from llama_index.core import Document class TestGithubTransformIssues(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_prs.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_prs.py index bc6cdd0a..07d9e1e6 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_prs.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_prs.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubPullRequest from hivemind_etl_helpers.src.db.github.transform.pull_requests import transform_prs -from llama_index import Document +from llama_index.core import Document class TestGithubTransformPRs(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_sort_summary_docs.py b/dags/hivemind_etl_helpers/tests/unit/test_sort_summary_docs.py index 2cc500f2..a9a5f304 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_sort_summary_docs.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_sort_summary_docs.py @@ -1,7 +1,7 @@ import unittest from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily -from llama_index import Document +from llama_index.core import Document class TestSortSummariesDaily(unittest.TestCase): diff --git a/requirements.txt b/requirements.txt index f5b8d815..a067b2b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ numpy -llama-index==0.9.48 pymongo python-dotenv pgvector @@ -17,5 +16,5 @@ coverage>=7.3.3, <8.0.0 pytest>=7.4.3, <8.0.0 python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 -tc-hivemind-backend==1.1.3 +tc-hivemind-backend==1.1.4 traceloop-sdk==0.9.4 From 39d350d2957955f0f105096b6acc609bb9f470a8 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 7 Mar 2024 18:10:45 +0330 Subject: [PATCH 02/10] fix: linter issues based on superlinter rules! 
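The changes below are import reshuffling plus collapsing the exploded tuple return annotations back onto one line. For reference, a sketch of the import order the diffs converge on, assuming super-linter drives isort with its default profile (the grouping comments are ours, not isort output):

# standard library first (isort default grouping)
import logging
from datetime import datetime

# then one alphabetized third-party group: psycopg2 sorts before pytest,
# llama_index.core before llama_index.llms, tc_hivemind_backend last
import psycopg2
import pytest
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from tc_hivemind_backend.pg_vector_access import PGVectorAccess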
---
 dags/hivemind_etl_helpers/discord_mongo_summary_etl.py    | 2 +-
 .../discord_mongo_vector_store_etl.py                     | 4 ++--
 dags/hivemind_etl_helpers/discourse_summary_etl.py        | 8 +-------
 dags/hivemind_etl_helpers/discourse_vectorstore_etl.py    | 4 ++--
 .../src/db/discord/discord_summary.py                     | 7 +------
 .../src/db/discord/summary/prepare_summaries.py           | 1 -
 .../src/db/github/load/load_raw_data.py                   | 5 ++---
 .../integration/test_gdrive_convert_doc_to_id_date.py     | 2 +-
 .../integration/test_gdrive_db_fetch_files_modified_at.py | 2 +-
 .../tests/integration/test_gdrive_delete_records.py       | 2 +-
 .../tests/integration/test_gdrive_documents_to_delete.py  | 2 +-
 .../tests/integration/test_gdrive_documents_to_insert.py  | 2 +-
 .../unit/test_gdrive_files_modified_at_process_results.py | 2 +-
 13 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py
index 375e560b..e7d449ae 100644
--- a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py
+++ b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py
@@ -10,9 +10,9 @@
 )
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
 from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily
+from llama_index.core import Settings
 from llama_index.core.response_synthesizers import get_response_synthesizer
 from llama_index.llms.openai import OpenAI
-from llama_index.core import Settings
 from tc_hivemind_backend.db.pg_db_utils import setup_db
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
diff --git a/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py b/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py
index 584c3963..1d7d9b1f 100644
--- a/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py
+++ b/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py
@@ -8,12 +8,12 @@
     find_guild_id_by_community_id,
 )
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
+from llama_index.core import Settings
+from llama_index.llms.openai import OpenAI
 from tc_hivemind_backend.db.pg_db_utils import setup_db
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
 from tc_hivemind_backend.pg_vector_access import PGVectorAccess
-from llama_index.core import Settings
-from llama_index.llms.openai import OpenAI


 def process_discord_guild_mongo(community_id: str) -> None:
diff --git a/dags/hivemind_etl_helpers/discourse_summary_etl.py b/dags/hivemind_etl_helpers/discourse_summary_etl.py
index 5344b146..25b83c25 100644
--- a/dags/hivemind_etl_helpers/discourse_summary_etl.py
+++ b/dags/hivemind_etl_helpers/discourse_summary_etl.py
@@ -134,8 +134,6 @@
     node_parser = configure_node_parser(chunk_size=chunk_size)
     pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname)

-    embed_model = CohereEmbedding()
-
     sorted_daily_docs = sort_summaries_daily(
         level1_docs=topic_summary_documents,
         level2_docs=category_summary_documenets,
@@ -166,11 +164,7 @@
 def get_summary_documents(
     forum_id: str, raw_data_grouped: list[Record], forum_endpoint: str
-) -> tuple[
-    list[Document],
-    list[Document],
-    list[Document],
-]:
+) -> tuple[list[Document], list[Document], list[Document],]:
     """
     prepare the summary documents for discourse based on given raw data
diff --git a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py
index 0fca0ec2..2af29ad5 100644
--- a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py
+++ b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py
@@ -7,12 +7,12 @@
 from hivemind_etl_helpers.src.db.discourse.utils.get_forums import get_forum_uuid
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
 from hivemind_etl_helpers.src.utils.check_documents import check_documents
+from llama_index.core import Settings
+from llama_index.llms.openai import OpenAI
 from tc_hivemind_backend.db.pg_db_utils import setup_db
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
 from tc_hivemind_backend.pg_vector_access import PGVectorAccess
-from llama_index.core import Settings
-from llama_index.llms.openai import OpenAI


 def process_discourse_vectorstore(
diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
index 6901fffd..f6cad40e 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
@@ -11,7 +11,6 @@
     transform_daily_summary_to_document,
 )
 from llama_index.core import Document, Settings
-from llama_index.core.llms import LLM
 from llama_index.core.response_synthesizers.base import BaseSynthesizer


@@ -50,11 +49,7 @@
         guild_id: str,
         summarization_prefix: str,
         from_date: datetime | None = None,
-    ) -> tuple[
-        list[Document],
-        list[Document],
-        list[Document],
-    ]:
+    ) -> tuple[list[Document], list[Document], list[Document],]:
         """
         prepare per thread summaries of discord messages.
         Note: This will always process the data until 1 day ago.
diff --git a/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py b/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py
index b6701d31..8853fefb 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py
@@ -9,7 +9,6 @@
 )
 from hivemind_etl_helpers.src.utils.summary_base import SummaryBase
 from llama_index.core import Document, Settings
-from llama_index.core.llms import LLM
 from llama_index.core.response_synthesizers.base import BaseSynthesizer


diff --git a/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py b/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py
index ea6756a2..1034e1a4 100644
--- a/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py
+++ b/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py
@@ -1,10 +1,9 @@
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
-from llama_index.core import Document
+from llama_index.core import Document, Settings
+from llama_index.llms.openai import OpenAI
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
 from tc_hivemind_backend.pg_vector_access import PGVectorAccess
-from llama_index.core import Settings
-from llama_index.llms.openai import OpenAI


 def load_documents_into_pg_database(
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py
index 8dd3b2ab..12690210 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py
@@ -1,7 +1,7 @@
 from datetime import datetime, timezone
 from unittest import TestCase

-import pytest
+import pytest
 from hivemind_etl_helpers.src.utils.check_documents import process_doc_to_id_date
 from llama_index.core import Document

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py
index 4da7fcc2..7fa0f127 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
 import psycopg2
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import fetch_files_date_field, setup_db
 from llama_index.core import Document
 from tc_hivemind_backend.db.credentials import load_postgres_credentials
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py
index 38e39840..25c0d092 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
 import psycopg2
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db
 from hivemind_etl_helpers.src.db.gdrive.delete_records import delete_records
 from llama_index.core import Document
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py
index 86780db3..8ce64235 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
 import psycopg2
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db
 from hivemind_etl_helpers.src.utils.check_documents import check_documents
 from llama_index.core import Document
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py
index 56749d33..9d930969 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
 import psycopg2
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db
 from hivemind_etl_helpers.src.utils.check_documents import check_documents
 from llama_index.core import Document
diff --git a/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py b/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py
index 0a67c36d..797ecf48 100644
--- a/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py
+++ b/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import postprocess_results

From 64f7d5c55e393daff6d1e16de406ac7ccef6680a Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Thu, 7 Mar 2024 18:20:45 +0330
Subject: [PATCH 03/10] fix: wrong docstring!
---
 dags/hivemind_etl_helpers/src/db/discord/discord_summary.py | 2 +-
 dags/hivemind_etl_helpers/src/utils/summary_base.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
index f6cad40e..56f765da 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
@@ -36,7 +36,7 @@ def __init__(
             the llm to use
             if nothing passed, it would use the default `llama_index.core.Setting.llm`

-        Note: `chunk_size` is read from `llama_index.core.Setting.llm`.
+        Note: `chunk_size` is read from `llama_index.core.Setting.chunk_size`.
         """
         llm = kwargs.get("llm", Settings.llm)
diff --git a/dags/hivemind_etl_helpers/src/utils/summary_base.py b/dags/hivemind_etl_helpers/src/utils/summary_base.py
index ad85d634..2a3e3d02 100644
--- a/dags/hivemind_etl_helpers/src/utils/summary_base.py
+++ b/dags/hivemind_etl_helpers/src/utils/summary_base.py
@@ -24,7 +24,7 @@ def __init__(
             the llm to use
             if nothing passed, it would use the default `llama_index.core.Setting.llm`

-        Note: `chunk_size` is read from `llama_index.core.Setting.llm`.
+        Note: `chunk_size` is read from `llama_index.core.Setting.chunk_size`.
""" self.llm = llm self.response_synthesizer = response_synthesizer From 1a0a3a06f4cc2920319d8150896f265f3f069575 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 12 Mar 2024 13:40:38 +0330 Subject: [PATCH 04/10] feat: increase backend lib depdendency version! The backend lib now contains the llama-index-legacy which is for using its PGVectorStore and airflow was not supporting the new vectorstore version because of sqlalchemy! --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a067b2b7..b76573f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,5 +16,5 @@ coverage>=7.3.3, <8.0.0 pytest>=7.4.3, <8.0.0 python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 -tc-hivemind-backend==1.1.4 +tc-hivemind-backend==1.1.5 traceloop-sdk==0.9.4 From cfeefd8e5d7dd878049b495bf6e5aa13b17df45c Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 12 Mar 2024 14:19:24 +0330 Subject: [PATCH 05/10] feat: raise error if tests failing! --- docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index e95edd19..5db146c3 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -2,7 +2,7 @@ echo "chang dir to dags" cd dags || exit -python3 -m coverage run --omit=tests/* -m pytest . +python3 -m coverage run --omit=tests/* -m pytest . && echo "Tests Passed" || exit 1 cp .coverage ../.coverage cd .. || exit From 41e889615c28eb52168b5ea7b399ec06c30b5ad5 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 12 Mar 2024 18:15:30 +0330 Subject: [PATCH 06/10] feat: more separation between dags! Following best practices to make each DAG file more efficient. That was because airflow was raising an error and couldn't bring the previous dags up. 
---
 dags/hivemind_discord_etl.py   |  69 ++++++++++++++++
 dags/hivemind_discourse_etl.py |  58 +++++++++++++
 dags/hivemind_etl.py           | 146 ---------------------------------
 dags/hivemind_github_etl.py    |  34 ++++++++
 4 files changed, 161 insertions(+), 146 deletions(-)
 create mode 100644 dags/hivemind_discord_etl.py
 create mode 100644 dags/hivemind_discourse_etl.py
 delete mode 100644 dags/hivemind_etl.py
 create mode 100644 dags/hivemind_github_etl.py

diff --git a/dags/hivemind_discord_etl.py b/dags/hivemind_discord_etl.py
new file mode 100644
index 00000000..c4a22209
--- /dev/null
+++ b/dags/hivemind_discord_etl.py
@@ -0,0 +1,69 @@
+import logging
+from datetime import datetime
+
+from airflow import DAG
+from airflow.decorators import task
+
+from dotenv import load_dotenv
+from hivemind_etl_helpers.discord_mongo_summary_etl import process_discord_summaries
+from hivemind_etl_helpers.discord_mongo_vector_store_etl import (
+    process_discord_guild_mongo,
+)
+from hivemind_etl_helpers.src.utils.mongo_discord_communities import (
+    get_all_discord_communities,
+)
+
+
+with DAG(
+    dag_id="discord_vector_store_update",
+    start_date=datetime(2024, 1, 1),
+    schedule_interval="0 2 * * *",
+    catchup=False,
+) as dag:
+
+    @task
+    def get_discord_communities() -> list[str]:
+        """
+        Getting all communities having discord from database
+        """
+        communities = get_all_discord_communities()
+        return communities
+
+    @task
+    def start_discord_vectorstore(community_id: str):
+        load_dotenv()
+        logging.info(f"Working on community, {community_id}")
+        process_discord_guild_mongo(community_id=community_id)
+        logging.info(f"Community {community_id} Job finished!")
+
+    communities = get_discord_communities()
+    # `start_discord_vectorstore` will be called multiple times
+    # with the length of the list
+    start_discord_vectorstore.expand(community_id=communities)
+
+
+with DAG(
+    dag_id="discord_summary_vector_store",
+    start_date=datetime(2024, 1, 1),
+    schedule_interval="0 2 * * *",
+) as dag:
+
+    @task
+    def get_mongo_discord_communities() -> list[str]:
+        """
+        Getting all communities having discord from database
+        this function is the same with `get_discord_communities`
+        we just changed the name for the pylint
+        """
+        communities = get_all_discord_communities()
+        return communities
+
+    @task
+    def start_discord_summary_vectorstore(community_id: str):
+        load_dotenv()
+        logging.info(f"Working on community, {community_id}")
+        process_discord_summaries(community_id=community_id, verbose=False)
+        logging.info(f"Community {community_id} Job finished!")
+
+    communities = get_mongo_discord_communities()
+    start_discord_summary_vectorstore.expand(community_id=communities)
diff --git a/dags/hivemind_discourse_etl.py b/dags/hivemind_discourse_etl.py
new file mode 100644
index 00000000..d169b287
--- /dev/null
+++ b/dags/hivemind_discourse_etl.py
@@ -0,0 +1,58 @@
+import logging
+from datetime import datetime
+
+from airflow import DAG
+from airflow.decorators import task
+
+from hivemind_etl_helpers.discourse_summary_etl import process_discourse_summary
+from hivemind_etl_helpers.discourse_vectorstore_etl import process_discourse_vectorstore
+from hivemind_etl_helpers.src.utils.get_communities_data import (
+    get_discourse_communities,
+)
+
+with DAG(
+    dag_id="discourse_vector_store",
+    start_date=datetime(2024, 3, 1),
+    schedule_interval="0 2 * * *",
+) as dag:
+
+    @task
+    def process_discourse_community(community_information: dict[str, str | datetime]):
+        community_id = community_information["community_id"]
+        forum_endpoint = community_information["endpoint"]
+        from_date = community_information["from_date"]
+
+        logging.info(f"Starting Discourse ETL | community_id: {community_id}")
+        process_discourse_vectorstore(
+            community_id=community_id,
+            forum_endpoint=forum_endpoint,
+            from_starting_date=from_date,
+        )
+
+    communities_info = get_discourse_communities()
+    process_discourse_community.expand(community_information=communities_info)
+
+
+with DAG(
+    dag_id="discourse_summary_vector_store",
+    start_date=datetime(2024, 2, 21),
+    schedule_interval="0 2 * * *",
+) as dag:
+
+    @task
+    def process_discourse_community_summary(
+        community_information: dict[str, str | datetime]
+    ):
+        community_id = community_information["community_id"]
+        forum_endpoint = community_information["endpoint"]
+        from_date = community_information["from_date"]
+
+        logging.info(f"Starting Discourse ETL | community_id: {community_id}")
+        process_discourse_summary(
+            community_id=community_id,
+            forum_endpoint=forum_endpoint,
+            from_starting_date=from_date,
+        )
+
+    communities_info = get_discourse_communities()
+    process_discourse_community_summary.expand(community_information=communities_info)
diff --git a/dags/hivemind_etl.py b/dags/hivemind_etl.py
deleted file mode 100644
index d288e0e3..00000000
--- a/dags/hivemind_etl.py
+++ /dev/null
@@ -1,146 +0,0 @@
-from __future__ import annotations
-
-import logging
-from datetime import datetime
-
-from airflow import DAG
-from airflow.decorators import task
-from dotenv import load_dotenv
-from hivemind_etl_helpers.discord_mongo_summary_etl import process_discord_summaries
-from hivemind_etl_helpers.discord_mongo_vector_store_etl import (
-    process_discord_guild_mongo,
-)
-from hivemind_etl_helpers.discourse_summary_etl import process_discourse_summary
-from hivemind_etl_helpers.discourse_vectorstore_etl import process_discourse_vectorstore
-from hivemind_etl_helpers.github_etl import process_github_vectorstore
-from hivemind_etl_helpers.src.utils.get_communities_data import (
-    get_discourse_communities,
-    get_github_communities_data,
-)
-from hivemind_etl_helpers.src.utils.mongo_discord_communities import (
-    get_all_discord_communities,
-)
-
-with DAG(
-    dag_id="discord_vector_store_update",
-    start_date=datetime(2024, 1, 1),
-    schedule_interval="0 2 * * *",
-    catchup=False,
-) as dag:
-
-    @task
-    def get_discord_communities() -> list[str]:
-        """
-        Getting all communities having discord from database
-        """
-        communities = get_all_discord_communities()
-        return communities
-
-    @task
-    def start_discord_vectorstore(community_id: str):
-        load_dotenv()
-        logging.info(f"Working on community, {community_id}")
-        process_discord_guild_mongo(community_id=community_id)
-        logging.info(f"Community {community_id} Job finished!")
-
-    communities = get_discord_communities()
-    # `start_discord_vectorstore` will be called multiple times
-    # with the length of the list
-    start_discord_vectorstore.expand(community_id=communities)
-
-with DAG(
-    dag_id="discord_summary_vector_store",
-    start_date=datetime(2024, 1, 1),
-    schedule_interval="0 2 * * *",
-) as dag:
-
-    @task
-    def get_mongo_discord_communities() -> list[str]:
-        """
-        Getting all communities having discord from database
-        this function is the same with `get_discord_communities`
-        we just changed the name for the pylint
-        """
-        communities = get_all_discord_communities()
-        return communities
-
-    @task
-    def start_discord_summary_vectorstore(community_id: str):
-        load_dotenv()
-        logging.info(f"Working on community, {community_id}")
-        process_discord_summaries(community_id=community_id, verbose=False)
-        logging.info(f"Community {community_id} Job finished!")
-
-    communities = get_mongo_discord_communities()
-    start_discord_summary_vectorstore.expand(community_id=communities)
-
-
-with DAG(
-    dag_id="github_vector_store",
-    start_date=datetime(2024, 2, 21),
-    schedule_interval="0 2 * * *",
-) as dag:
-
-    @task
-    def process_github_community(community_information: dict[str, str | datetime]):
-        community_id = community_information["community_id"]
-        organization_id = community_information["organization_id"]
-        from_date = community_information["from_date"]
-
-        logging.info(f"Starting Github ETL | community_id: {community_id}")
-        process_github_vectorstore(
-            community_id=community_id,
-            github_org_id=organization_id,
-            from_starting_date=from_date,
-        )
-
-    communities_info = get_github_communities_data()
-    process_github_community.expand(community_information=communities_info)
-
-
-with DAG(
-    dag_id="discourse_vector_store",
-    start_date=datetime(2024, 3, 1),
-    schedule_interval="0 2 * * *",
-) as dag:
-
-    @task
-    def process_discourse_community(community_information: dict[str, str | datetime]):
-        community_id = community_information["community_id"]
-        forum_endpoint = community_information["endpoint"]
-        from_date = community_information["from_date"]
-
-        logging.info(f"Starting Discourse ETL | community_id: {community_id}")
-        process_discourse_vectorstore(
-            community_id=community_id,
-            forum_endpoint=forum_endpoint,
-            from_starting_date=from_date,
-        )
-
-    communities_info = get_discourse_communities()
-    process_discourse_community.expand(community_information=communities_info)
-
-
-with DAG(
-    dag_id="discourse_summary_vector_store",
-    start_date=datetime(2024, 2, 21),
-    schedule_interval="0 2 * * *",
-) as dag:
-
-    @task
-    def process_discourse_community_summary(
-        community_information: dict[str, str | datetime]
-    ):
-        community_id = community_information["community_id"]
-        forum_endpoint = community_information["endpoint"]
-        from_date = community_information["from_date"]
-
-        logging.info(f"Starting Discourse ETL | community_id: {community_id}")
-        process_discourse_summary(
-            community_id=community_id,
-            forum_endpoint=forum_endpoint,
-            from_starting_date=from_date,
-        )
-
-    communities_info = get_discourse_communities()
-    process_discourse_community_summary.expand(community_information=communities_info)
diff --git a/dags/hivemind_github_etl.py b/dags/hivemind_github_etl.py
new file mode 100644
index 00000000..e381a5ab
--- /dev/null
+++ b/dags/hivemind_github_etl.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import logging
+from datetime import datetime
+
+from airflow import DAG
+from airflow.decorators import task
+
+from hivemind_etl_helpers.github_etl import process_github_vectorstore
+from hivemind_etl_helpers.src.utils.get_communities_data import (
+    get_github_communities_data,
+)
+
+with DAG(
+    dag_id="github_vector_store",
+    start_date=datetime(2024, 2, 21),
+    schedule_interval="0 2 * * *",
+) as dag:
+
+    @task
+    def process_github_community(community_information: dict[str, str | datetime]):
+        community_id = community_information["community_id"]
+        organization_id = community_information["organization_id"]
+        from_date = community_information["from_date"]
+
+        logging.info(f"Starting Github ETL | community_id: {community_id}")
+        process_github_vectorstore(
+            community_id=community_id,
+            github_org_id=organization_id,
+            from_starting_date=from_date,
+        )
+
+    communities_info = get_github_communities_data()
+    process_github_community.expand(community_information=communities_info)

From 22f87a87993f65f39861a49c70e5a6eb8bbb8eb1 Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Wed, 13 Mar 2024 08:16:25 +0330
Subject: [PATCH 07/10] fix: fixed minor issues!

- One of the GitHub commit data relationships was renamed
  (COMMITED -> COMMITTED).
- When no data was available, the deletion query was not an empty
  string; this is fixed now.
---
 .../src/db/github/extract/commit.py        |  2 +-
 .../src/db/github/load/prepare_deletion.py | 11 +++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/commit.py b/dags/hivemind_etl_helpers/src/db/github/extract/commit.py
index 0641b341..e29c34bc 100644
--- a/dags/hivemind_etl_helpers/src/db/github/extract/commit.py
+++ b/dags/hivemind_etl_helpers/src/db/github/extract/commit.py
@@ -27,7 +27,7 @@ def fetch_raw_commits(
     """
     neo4j_connection = Neo4jConnection()
    neo4j_driver = neo4j_connection.connect_neo4j()
-    query = """MATCH (co:Commit)<-[:COMMITED]-(user:GitHubUser)
+    query = """MATCH (co:Commit)<-[:COMMITTED]-(user:GitHubUser)
     WHERE
         co.repository_id IN $repoIds
     """
diff --git a/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py b/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py
index 971abd70..43583377 100644
--- a/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py
+++ b/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py
@@ -54,10 +54,11 @@ def prepare(
         )

         documents_to_save = docs_to_save + pr_docsuments_to_save
-
-        deletion_query = self._create_deletion_query(
-            docs_file_ids_to_delete + pr_docs_file_ids_to_delete
-        )
+        doc_ids_to_delete = docs_file_ids_to_delete + pr_docs_file_ids_to_delete
+        if len(doc_ids_to_delete):
+            deletion_query = self._create_deletion_query(doc_ids_to_delete)
+        else:
+            deletion_query = ""

         return documents_to_save, deletion_query

@@ -125,6 +126,8 @@ def _create_deletion_query(
         self,
         doc_ids_to_delete: list[str],
     ) -> str:
+        if len(doc_ids_to_delete) == 0:
+            return ""
         if len(doc_ids_to_delete) == 1:
             deletion_ids = f"({doc_ids_to_delete[0]})"
         else:

From d52661eea5037302f8a7c2b5e07113d07a71ae97 Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Wed, 13 Mar 2024 08:19:31 +0330
Subject: [PATCH 08/10] fix: isort linter issues!
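
isort groups airflow, dotenv, and the hivemind helpers into a single
third-party block, so the blank lines splitting them are removed.
Roughly, the grouping it expects (illustrative only) looks like:

    import logging
    from datetime import datetime

    from airflow import DAG
    from airflow.decorators import task
    from dotenv import load_dotenv
    from hivemind_etl_helpers.discord_mongo_summary_etl import (
        process_discord_summaries,
    )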
---
 dags/hivemind_discord_etl.py   | 2 --
 dags/hivemind_discourse_etl.py | 1 -
 dags/hivemind_github_etl.py    | 1 -
 3 files changed, 4 deletions(-)

diff --git a/dags/hivemind_discord_etl.py b/dags/hivemind_discord_etl.py
index c4a22209..615f3ae2 100644
--- a/dags/hivemind_discord_etl.py
+++ b/dags/hivemind_discord_etl.py
@@ -3,7 +3,6 @@

 from airflow import DAG
 from airflow.decorators import task
-
 from dotenv import load_dotenv
 from hivemind_etl_helpers.discord_mongo_summary_etl import process_discord_summaries
 from hivemind_etl_helpers.discord_mongo_vector_store_etl import (
@@ -13,7 +12,6 @@
     get_all_discord_communities,
 )

-
 with DAG(
     dag_id="discord_vector_store_update",
     start_date=datetime(2024, 1, 1),
diff --git a/dags/hivemind_discourse_etl.py b/dags/hivemind_discourse_etl.py
index d169b287..25c834ce 100644
--- a/dags/hivemind_discourse_etl.py
+++ b/dags/hivemind_discourse_etl.py
@@ -3,7 +3,6 @@

 from airflow import DAG
 from airflow.decorators import task
-
 from hivemind_etl_helpers.discourse_summary_etl import process_discourse_summary
 from hivemind_etl_helpers.discourse_vectorstore_etl import process_discourse_vectorstore
 from hivemind_etl_helpers.src.utils.get_communities_data import (
diff --git a/dags/hivemind_github_etl.py b/dags/hivemind_github_etl.py
index e381a5ab..15de6758 100644
--- a/dags/hivemind_github_etl.py
+++ b/dags/hivemind_github_etl.py
@@ -5,7 +5,6 @@

 from airflow import DAG
 from airflow.decorators import task
-
 from hivemind_etl_helpers.github_etl import process_github_vectorstore
 from hivemind_etl_helpers.src.utils.get_communities_data import (
     get_github_communities_data,

From 9e1884b71314207388b7eaabe520d91ae7a41ce9 Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Wed, 13 Mar 2024 16:50:03 +0330
Subject: [PATCH 09/10] fix: test cases based on previous updates!

- The summary `type` metadata was added in earlier commits.
- The COMMITED relationship typo was fixed on the staging database.
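
For reference, the fixtures now create the relationship with the
corrected spelling; a minimal sketch of the pattern used in the tests
below (properties trimmed):

    session.execute_write(
        lambda tx: tx.run(
            """
            CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
            SET co.`commit.author.name` = "Author#1"
            """
        )
    )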
---
 .../test_discord_summary_transform_daily_summaries.py  |  6 +++---
 .../tests/integration/test_github_etl_fetch_commits.py | 10 +++++-----
 .../integration/test_github_etl_fetch_raw_commits.py   |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py
index 56f1fab1..3ea81ace 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py
@@ -25,15 +25,15 @@ def test_transform_daily_summary_to_document(self):
         expected_documents = [
             Document(
                 text="Summary 1",
-                metadata={"date": "2023-01-01", "channel": None, "thread": None},
+                metadata={"date": "2023-01-01", "type": "day"},
             ),
             Document(
                 text="Summary 2",
-                metadata={"date": "2023-01-02", "channel": None, "thread": None},
+                metadata={"date": "2023-01-02", "type": "day"},
             ),
             Document(
                 text="Summary 3",
-                metadata={"date": "2023-01-03", "channel": None, "thread": None},
+                metadata={"date": "2023-01-03", "type": "day"},
             ),
         ]

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
index 0a589638..7656279b 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
@@ -29,7 +29,7 @@ def test_get_single_commit_single_repo_no_from_date(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(user:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",
@@ -68,7 +68,7 @@ def test_get_single_commit_single_repo_with_from_date(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(user:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",
@@ -108,7 +108,7 @@ def test_get_multiple_commit_multi_repo_with_from_date_filter(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",
@@ -120,7 +120,7 @@
                         co.`commit.author.date` = "2024-01-01T10:23:50Z",
                         co.`commit.verification.reason` = "invalid"

-                    CREATE (co2:Commit)<-[:COMMITED]-(:GitHubUser {login: "author #2"})
+                    CREATE (co2:Commit)<-[:COMMITTED]-(:GitHubUser {login: "author #2"})
                     SET
                         co2.`commit.author.name` = "Author#2",
                         co2.`commit.message` = "Issue #2 is resolved!",
@@ -132,7 +132,7 @@
                         co2.`commit.author.date` = "2023-01-01T10:23:50Z",
                         co2.`commit.verification.reason` = "invalid"

-                    CREATE (co3:Commit)<-[:COMMITED]-(:GitHubUser {login: "author #3"})
+                    CREATE (co3:Commit)<-[:COMMITTED]-(:GitHubUser {login: "author #3"})
                     SET
                         co3.`commit.author.name` = "Author#3",
                         co3.`commit.message` = "Issue #3 is resolved!",
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py
index c6bb05cd..62edd6d1 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py
@@ -29,7 +29,7 @@ def test_get_single_commit_single_repo_no_from_date(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(user:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",
@@ -70,7 +70,7 @@ def test_get_single_commit_single_repo_with_from_date(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(user:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",

From 43cdc7b9da6a6b02044e3c9affd9e87fb2437f0c Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Wed, 13 Mar 2024 17:14:14 +0330
Subject: [PATCH 10/10] fix: converted list to set so element order is not
 compared!
---
 .../tests/integration/test_github_etl_get_org_repos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_get_org_repos.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_get_org_repos.py
index 88e17f4b..41dab6e0 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_get_org_repos.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_get_org_repos.py
@@ -51,4 +51,4 @@ def test_fetch_multiple_repo(self):
             )
         )
         repo_ids = get_github_organization_repos(github_organization_id=org_id)
-        self.assertEqual(repo_ids, [100, 101, 102])
+        self.assertEqual(set(repo_ids), set([100, 101, 102]))
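
Note on the fix above: comparing sets checks membership only, so a
duplicated repo id would go unnoticed. An order-insensitive assertion
that also checks multiplicity (an alternative sketch, not part of this
patch; assertCountEqual is the unittest built-in for this) would be:

    self.assertCountEqual(repo_ids, [100, 101, 102])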