From a40acc9a4500c817211754e6d9764b2be6d323da Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 7 Mar 2024 17:53:59 +0330 Subject: [PATCH 01/10] feat: llama-index code migration! - We needed to update the code based on the latest release of the llama-index library. - We also added skips to the previously written GDrive ETL tests, because that task was paused and is yet to be completed. --- .../discord_mongo_summary_etl.py | 12 ++++--- .../discord_mongo_vector_store_etl.py | 22 ++++--------- .../discourse_summary_etl.py | 18 +++++++--- .../discourse_vectorstore_etl.py | 9 +++-- dags/hivemind_etl_helpers/github_etl.py | 2 +- .../discord_raw_message_to_document.py | 2 +- .../src/db/discord/discord_summary.py | 33 +++++++++++-------- .../db/discord/summary/prepare_summaries.py | 15 +++++---- .../src/db/discord/summary/summary_utils.py | 2 +- .../utils/transform_discord_raw_messges.py | 2 +- .../src/db/discourse/raw_post_to_documents.py | 2 +- .../db/discourse/summary/prepare_summary.py | 14 ++++---- .../src/db/discourse/summary/summary_utils.py | 2 +- .../utils/transform_raw_to_documents.py | 2 +- .../src/db/gdrive/retrieve_documents.py | 2 +- .../src/db/github/load/load_raw_data.py | 12 ++++--- .../src/db/github/load/prepare_deletion.py | 2 +- .../src/db/github/transform/comments.py | 2 +- .../src/db/github/transform/commits.py | 2 +- .../src/db/github/transform/issues.py | 2 +- .../src/db/github/transform/pull_requests.py | 2 +- .../src/document_node_parser.py | 11 ++++--- .../src/utils/check_documents.py | 2 +- .../src/utils/sort_summary_docs.py | 2 +- .../src/utils/summary_base.py | 33 +++++++------------ .../test_discord_prepare_summary.py | 29 +++++----------- .../test_discord_prepare_thread_summaries.py | 18 ++++------ .../integration/test_discord_summary_base.py | 22 +++++-------- ...scord_summary_prepare_channel_summaries.py | 19 ++++------- ...discord_summary_prepare_daily_summaries.py | 19 ++++------- ...scord_summary_transform_daily_summaries.py | 2 +- .../integration/test_discord_summary_utils.py | 2 +- .../test_gdrive_convert_doc_to_id_date.py | 4 ++- .../test_gdrive_db_fetch_files_modified_at.py | 4 ++- .../integration/test_gdrive_delete_records.py | 4 ++- .../test_gdrive_documents_to_delete.py | 4 ++- .../test_gdrive_documents_to_insert.py | 4 ++- .../test_pg_vector_access_with_discord.py | 2 +- ...test_discourse_summary_prepare_category.py | 17 +++------- .../test_discourse_summary_prepare_daily.py | 17 +++------- ...scourse_summary_prepare_daily_documents.py | 15 +++------ .../test_discourse_summary_prepare_topic.py | 15 +++------ ...drive_files_modified_at_process_results.py | 2 ++ .../unit/test_github_etl_prepare_deletion.py | 2 +- .../unit/test_github_transform_comments.py | 2 +- .../unit/test_github_transform_commits.py | 2 +- .../unit/test_github_transform_issues.py | 2 +- .../tests/unit/test_github_transform_prs.py | 2 +- .../tests/unit/test_sort_summary_docs.py | 2 +- requirements.txt | 3 +- 50 files changed, 196 insertions(+), 227 deletions(-) diff --git a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py index 9fd72f1f..375e560b 100644 --- a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py +++ b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py @@ -10,7 +10,9 @@ ) from hivemind_etl_helpers.src.document_node_parser import configure_node_parser from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily -from llama_index.response_synthesizers import get_response_synthesizer +from 
llama_index.core.response_synthesizers import get_response_synthesizer +from llama_index.llms.openai import OpenAI +from llama_index.core import Settings from tc_hivemind_backend.db.pg_db_utils import setup_db from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams from tc_hivemind_backend.embeddings.cohere import CohereEmbedding @@ -93,16 +95,16 @@ def process_discord_summaries(community_id: str, verbose: bool = False) -> None: node_parser = configure_node_parser(chunk_size=chunk_size) pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) - embed_model = CohereEmbedding() + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") pg_vector.save_documents_in_batches( community_id=community_id, documents=docs_daily_sorted, batch_size=100, - node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, - embed_dim=embedding_dim, request_per_minute=10000, deletion_query=deletion_query, ) diff --git a/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py b/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py index 60d3d02d..584c3963 100644 --- a/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py +++ b/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py @@ -1,4 +1,3 @@ -import argparse import logging from datetime import timedelta @@ -13,6 +12,8 @@ from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from tc_hivemind_backend.pg_vector_access import PGVectorAccess +from llama_index.core import Settings +from llama_index.llms.openai import OpenAI def process_discord_guild_mongo(community_id: str) -> None: @@ -52,7 +53,10 @@ def process_discord_guild_mongo(community_id: str) -> None: node_parser = configure_node_parser(chunk_size=chunk_size) pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) - embed_model = CohereEmbedding() + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") pg_vector.save_documents_in_batches( community_id=community_id, @@ -60,19 +64,5 @@ def process_discord_guild_mongo(community_id: str) -> None: batch_size=100, node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, - embed_dim=embedding_dim, request_per_minute=10000, - # max_request_per_day=REQUEST_PER_DAY, ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser() - parser.add_argument( - "community_id", type=str, help="the Community that the guild is related to" - ) - args = parser.parse_args() - - process_discord_guild_mongo(community_id=args.community_id) diff --git a/dags/hivemind_etl_helpers/discourse_summary_etl.py b/dags/hivemind_etl_helpers/discourse_summary_etl.py index b620aba5..5344b146 100644 --- a/dags/hivemind_etl_helpers/discourse_summary_etl.py +++ b/dags/hivemind_etl_helpers/discourse_summary_etl.py @@ -10,8 +10,9 @@ from hivemind_etl_helpers.src.db.discourse.utils.get_forums import get_forum_uuid from hivemind_etl_helpers.src.document_node_parser import configure_node_parser from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily -from llama_index import Document -from llama_index.response_synthesizers import get_response_synthesizer +from llama_index.core import Document, Settings +from 
llama_index.core.response_synthesizers import get_response_synthesizer +from llama_index.llms.openai import OpenAI from neo4j._data import Record from tc_hivemind_backend.db.pg_db_utils import setup_db from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams @@ -145,13 +146,16 @@ def process_forum( f"{log_prefix} Saving discourse summaries (extracting the embedding and saving)" ) + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") + pg_vector.save_documents_in_batches( community_id=community_id, documents=sorted_daily_docs, batch_size=100, - node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, embed_dim=embedding_dim, request_per_minute=10000, deletion_query=deletion_query, @@ -162,7 +166,11 @@ def process_forum( def get_summary_documents( forum_id: str, raw_data_grouped: list[Record], forum_endpoint: str -) -> tuple[list[Document], list[Document], list[Document],]: +) -> tuple[ + list[Document], + list[Document], + list[Document], +]: """ prepare the summary documents for discourse based on given raw data diff --git a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py index e1be6d8b..0fca0ec2 100644 --- a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py +++ b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py @@ -11,6 +11,8 @@ from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from tc_hivemind_backend.pg_vector_access import PGVectorAccess +from llama_index.core import Settings +from llama_index.llms.openai import OpenAI def process_discourse_vectorstore( @@ -127,15 +129,16 @@ def process_forum( WHERE (metadata_->>'postId')::float IN {deletion_ids}; """ - embed_model = CohereEmbedding() + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") pg_vector.save_documents_in_batches( community_id=community_id, documents=documents, batch_size=100, - node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, embed_dim=embedding_dim, doc_file_ids_to_delete=deletion_query, ) diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index fd5844e5..ddae97f4 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -21,7 +21,7 @@ transform_issues, transform_prs, ) -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.pg_db_utils import setup_db diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py index dde1e129..524d126a 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py +++ b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py @@ -4,7 +4,7 @@ from hivemind_etl_helpers.src.db.discord.utils.transform_discord_raw_messges import ( transform_discord_raw_messages, ) -from llama_index import Document +from llama_index.core import Document def discord_raw_to_docuemnts( diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py index 3461fd77..6901fffd 100644 --- 
a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py +++ b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py @@ -10,44 +10,51 @@ from hivemind_etl_helpers.src.db.discord.summary.summary_utils import ( transform_daily_summary_to_document, ) -from llama_index import Document, ServiceContext -from llama_index.llms import LLM -from llama_index.response_synthesizers.base import BaseSynthesizer +from llama_index.core import Document, Settings +from llama_index.core.llms import LLM +from llama_index.core.response_synthesizers.base import BaseSynthesizer class DiscordSummary(PrepareSummaries): def __init__( self, - service_context: ServiceContext | None = None, response_synthesizer: BaseSynthesizer | None = None, - llm: LLM | None = None, verbose: bool = False, + **kwargs, ) -> None: """ initialize the summary preparation class Parameters ----------- - service_context : llama_index.ServiceContext | None - the service context for llama_index to work - if nothing passed will be to `llm=gpt-3.5-turbo` and `chunk_size = 512` set_response_synthesizer : BaseSynthesizer | None whether to set a response_synthesizer to refine the summaries or not if nothing passed would be set to `None` verbose : bool whether to show the progress of summarizing or not + **kwargs : + llm : LLM + the llm to use + if nothing passed, it would use the default `llama_index.core.Settings.llm` + + Note: `chunk_size` is read from `llama_index.core.Settings`. """ - super().__init__(service_context, response_synthesizer, llm, verbose) + llm = kwargs.get("llm", Settings.llm) + + super().__init__( + llm=llm, response_synthesizer=response_synthesizer, verbose=verbose + ) def prepare_summaries( self, guild_id: str, summarization_prefix: str, from_date: datetime | None = None, - ) -> tuple[list[Document], list[Document], list[Document],]: + ) -> tuple[ + list[Document], + list[Document], + list[Document], + ]: """ prepare per thread summaries of discord messages. Note: This will always process the data until 1 day ago. 
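The hunk above is the core of the migration: llama-index 0.10 deprecates ServiceContext in favour of the global Settings object, and the llm is now taken from kwargs with Settings.llm as the fallback. A minimal sketch of the new configuration pattern, assuming llama-index >= 0.10 with the llama-index-llms-openai package installed (SentenceSplitter stands in here for this repo's configure_node_parser helper):

from llama_index.core import Document, Settings, SummaryIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

# global defaults replace ServiceContext.from_defaults(llm=..., chunk_size=512)
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.chunk_size = 512
Settings.node_parser = SentenceSplitter(chunk_size=512)

# indices now read the global Settings implicitly instead of a service_context ...
index = SummaryIndex.from_documents(documents=[Document(text="hello world")])
# ... or accept an explicit per-call override, as SummaryBase.retrieve_summary
# does later in this patch via as_query_engine(..., llm=self.llm)
engine = index.as_query_engine(response_mode="tree_summarize", llm=Settings.llm)

The same four Settings assignments recur in each ETL entrypoint touched by this patch.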
diff --git a/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py b/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py index 6bd4f3fc..b6701d31 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py +++ b/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py @@ -8,20 +8,23 @@ transform_discord_raw_messages, ) from hivemind_etl_helpers.src.utils.summary_base import SummaryBase -from llama_index import Document, ServiceContext -from llama_index.llms import LLM -from llama_index.response_synthesizers.base import BaseSynthesizer +from llama_index.core import Document, Settings +from llama_index.core.llms import LLM +from llama_index.core.response_synthesizers.base import BaseSynthesizer class PrepareSummaries(SummaryBase): def __init__( self, - service_context: ServiceContext | None = None, response_synthesizer: BaseSynthesizer | None = None, - llm: LLM | None = None, verbose: bool = False, + **kwargs, ) -> None: - super().__init__(service_context, response_synthesizer, llm, verbose) + llm = kwargs.get("llm", Settings.llm) + + super().__init__( + llm=llm, response_synthesizer=response_synthesizer, verbose=verbose + ) # initialization self.prefix: str = "" diff --git a/dags/hivemind_etl_helpers/src/db/discord/summary/summary_utils.py b/dags/hivemind_etl_helpers/src/db/discord/summary/summary_utils.py index 561b8af5..a6f7b6ff 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/summary/summary_utils.py +++ b/dags/hivemind_etl_helpers/src/db/discord/summary/summary_utils.py @@ -1,4 +1,4 @@ -from llama_index import Document +from llama_index.core import Document def transform_thread_summary_to_document( diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index 5d5db78d..f6194145 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -20,7 +20,7 @@ prepare_raction_ids, ) from hivemind_etl_helpers.src.db.globals import DATE_FORMAT -from llama_index import Document +from llama_index.core import Document def transform_discord_raw_messages( diff --git a/dags/hivemind_etl_helpers/src/db/discourse/raw_post_to_documents.py b/dags/hivemind_etl_helpers/src/db/discourse/raw_post_to_documents.py index 3f812996..565d8f1f 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/raw_post_to_documents.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/raw_post_to_documents.py @@ -4,7 +4,7 @@ from hivemind_etl_helpers.src.db.discourse.utils.transform_raw_to_documents import ( transform_raw_to_documents, ) -from llama_index import Document +from llama_index.core import Document def fetch_discourse_documents( diff --git a/dags/hivemind_etl_helpers/src/db/discourse/summary/prepare_summary.py b/dags/hivemind_etl_helpers/src/db/discourse/summary/prepare_summary.py index 4fe0aee2..6aec9bff 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/summary/prepare_summary.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/summary/prepare_summary.py @@ -8,9 +8,8 @@ transform_summary_to_document, ) from hivemind_etl_helpers.src.utils.summary_base import SummaryBase -from llama_index import Document, ServiceContext -from llama_index.llms import LLM -from llama_index.response_synthesizers.base import BaseSynthesizer +from llama_index.core import Document, Settings +from 
llama_index.core.response_synthesizers.base import BaseSynthesizer class DiscourseSummary(SummaryBase): @@ -18,12 +17,15 @@ def __init__( self, forum_id: str, forum_endpoint: str, - service_context: ServiceContext | None = None, response_synthesizer: BaseSynthesizer | None = None, - llm: LLM | None = None, verbose: bool = False, + **kwargs, ) -> None: - super().__init__(service_context, response_synthesizer, llm, verbose) + llm = kwargs.get("llm", Settings.llm) + + super().__init__( + llm=llm, response_synthesizer=response_synthesizer, verbose=verbose + ) self.prefix = f"FORUM_ID: {forum_id} " self.forum_endpoint = forum_endpoint diff --git a/dags/hivemind_etl_helpers/src/db/discourse/summary/summary_utils.py b/dags/hivemind_etl_helpers/src/db/discourse/summary/summary_utils.py index 10746213..30de1364 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/summary/summary_utils.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/summary/summary_utils.py @@ -1,4 +1,4 @@ -from llama_index import Document +from llama_index.core import Document def transform_summary_to_document( diff --git a/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py b/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py index dfce2b4d..b9e4042a 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py @@ -1,4 +1,4 @@ -from llama_index import Document +from llama_index.core import Document from neo4j import Record diff --git a/dags/hivemind_etl_helpers/src/db/gdrive/retrieve_documents.py b/dags/hivemind_etl_helpers/src/db/gdrive/retrieve_documents.py index 048964d9..d08bfa3e 100644 --- a/dags/hivemind_etl_helpers/src/db/gdrive/retrieve_documents.py +++ b/dags/hivemind_etl_helpers/src/db/gdrive/retrieve_documents.py @@ -1,4 +1,4 @@ -from llama_index import Document, download_loader +from llama_index.core import Document, download_loader def retrieve_folder_documents(folder_id: str) -> list[Document]: diff --git a/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py b/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py index 17db0a7f..ea6756a2 100644 --- a/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py +++ b/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py @@ -1,8 +1,10 @@ from hivemind_etl_helpers.src.document_node_parser import configure_node_parser -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from tc_hivemind_backend.pg_vector_access import PGVectorAccess +from llama_index.core import Settings +from llama_index.llms.openai import OpenAI def load_documents_into_pg_database( @@ -37,16 +39,18 @@ def load_documents_into_pg_database( pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) - embed_model = CohereEmbedding() node_parser = configure_node_parser(chunk_size=chunk_size) + Settings.node_parser = node_parser + Settings.embed_model = CohereEmbedding() + Settings.chunk_size = chunk_size + Settings.llm = OpenAI(model="gpt-3.5-turbo") + pg_vector.save_documents_in_batches( community_id=community_id, documents=documents, batch_size=100, - node_parser=node_parser, max_request_per_minute=None, - embed_model=embed_model, embed_dim=embedding_dim, request_per_minute=10000, deletion_query=kwargs.get("deletion_query", ""), diff 
--git a/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py b/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py index f0cd1af2..971abd70 100644 --- a/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py +++ b/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.utils.check_documents import check_documents -from llama_index import Document +from llama_index.core import Document class PrepareDeletion: diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/comments.py b/dags/hivemind_etl_helpers/src/db/github/transform/comments.py index 243f6cf8..b4559b6b 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/comments.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/comments.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubComment -from llama_index import Document +from llama_index.core import Document def transform_comments(data: list[GitHubComment]) -> list[Document]: diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/commits.py b/dags/hivemind_etl_helpers/src/db/github/transform/commits.py index bef52627..4efbd880 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/commits.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/commits.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubCommit -from llama_index import Document +from llama_index.core import Document def transform_commits(data: list[GitHubCommit]) -> list[Document]: diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/issues.py b/dags/hivemind_etl_helpers/src/db/github/transform/issues.py index 38dc1789..8c7045ab 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/issues.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubIssue -from llama_index import Document +from llama_index.core import Document def transform_issues(data: list[GitHubIssue]) -> tuple[list[Document], list[Document]]: diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/pull_requests.py b/dags/hivemind_etl_helpers/src/db/github/transform/pull_requests.py index 4f35552e..73b56e7b 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/pull_requests.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/pull_requests.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubPullRequest -from llama_index import Document +from llama_index.core import Document def transform_prs(data: list[GitHubPullRequest]) -> list[Document]: diff --git a/dags/hivemind_etl_helpers/src/document_node_parser.py b/dags/hivemind_etl_helpers/src/document_node_parser.py index 29bc11a9..3d6bace1 100644 --- a/dags/hivemind_etl_helpers/src/document_node_parser.py +++ b/dags/hivemind_etl_helpers/src/document_node_parser.py @@ -1,9 +1,12 @@ from typing import Callable -from llama_index.node_parser import SimpleNodeParser -from llama_index.node_parser.text.sentence import CHUNKING_REGEX, DEFAULT_PARAGRAPH_SEP -from llama_index.text_splitter import SentenceSplitter -from llama_index.utils import get_tokenizer +from llama_index.core.node_parser import SimpleNodeParser +from llama_index.core.node_parser.text.sentence import ( + CHUNKING_REGEX, + DEFAULT_PARAGRAPH_SEP, +) +from llama_index.core.text_splitter import SentenceSplitter +from llama_index.core.utils import get_tokenizer def configure_node_parser( diff --git 
a/dags/hivemind_etl_helpers/src/utils/check_documents.py b/dags/hivemind_etl_helpers/src/utils/check_documents.py index 903b59dc..11159d75 100644 --- a/dags/hivemind_etl_helpers/src/utils/check_documents.py +++ b/dags/hivemind_etl_helpers/src/utils/check_documents.py @@ -3,7 +3,7 @@ from dateutil import parser from hivemind_etl_helpers.src.db.gdrive.db_utils import fetch_files_date_field -from llama_index import Document +from llama_index.core import Document def check_documents( diff --git a/dags/hivemind_etl_helpers/src/utils/sort_summary_docs.py b/dags/hivemind_etl_helpers/src/utils/sort_summary_docs.py index dbbd2059..762ed79e 100644 --- a/dags/hivemind_etl_helpers/src/utils/sort_summary_docs.py +++ b/dags/hivemind_etl_helpers/src/utils/sort_summary_docs.py @@ -1,6 +1,6 @@ from datetime import datetime -from llama_index import Document +from llama_index.core import Document def sort_summaries_daily( diff --git a/dags/hivemind_etl_helpers/src/utils/summary_base.py b/dags/hivemind_etl_helpers/src/utils/summary_base.py index fa969007..ad85d634 100644 --- a/dags/hivemind_etl_helpers/src/utils/summary_base.py +++ b/dags/hivemind_etl_helpers/src/utils/summary_base.py @@ -1,14 +1,13 @@ -from llama_index import Document, ServiceContext, SummaryIndex -from llama_index.llms import LLM, OpenAI -from llama_index.response_synthesizers.base import BaseSynthesizer +from llama_index.core import Document, SummaryIndex +from llama_index.core.llms import LLM +from llama_index.core.response_synthesizers.base import BaseSynthesizer class SummaryBase: def __init__( self, - service_context: ServiceContext | None = None, + llm: LLM, response_synthesizer: BaseSynthesizer | None = None, - llm: LLM | None = None, verbose: bool = False, ) -> None: """ Parameters ----------- - service_context : llama_index.ServiceContext | None - the service context for llama_index to work - if nothing passed will be to `llm=gpt-3.5-turbo` and `chunk_size = 512` set_response_synthesizer : bool | None whether to set a response_synthesizer to refine the summaries or not if nothing passed would be set to None verbose : bool whether to show the progress of summarizing or not - testing : bool - testing mode would use a MockLLM - """ - if llm is None: - llm = OpenAI(temperature=0, model="gpt-3.5-turbo") - - if service_context is None: - service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512) + llm : LLM + the llm to use + subclasses default it to `llama_index.core.Settings.llm` when nothing is passed to them - self.service_context = service_context - self.response_synthesizer = response_synthesizer + Note: `chunk_size` is read from `llama_index.core.Settings`. 
+ """ self.llm = llm + self.response_synthesizer = response_synthesizer self.verbose = verbose def _get_summary( @@ -50,7 +39,6 @@ def _get_summary( summary_index = SummaryIndex.from_documents( documents=messages_document, response_synthesizer=self.response_synthesizer, - service_context=self.service_context, show_progress=self.verbose, ) summary_response = self.retrieve_summary(summary_index, summarization_query) @@ -74,6 +62,7 @@ def retrieve_summary( query_engine = doc_summary_index.as_query_engine( response_mode="tree_summarize", response_synthesizer=self.response_synthesizer, + llm=self.llm, ) response = query_engine.query(query) return response.response diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py index 24088490..cb642354 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py @@ -4,18 +4,15 @@ from bson import ObjectId from hivemind_etl_helpers.src.db.discord.discord_summary import DiscordSummary from hivemind_etl_helpers.src.utils.mongo import MongoSingleton -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscordGroupedDataPreparation(TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), - chunk_size=512, - embed_model=MockEmbedding(embed_dim=1024), - ) + Settings.llm = MockLLM() + Settings.chunk_size = 512 + Settings.embed_model = MockEmbedding(embed_dim=1024) def setup_db( self, @@ -101,9 +98,7 @@ def test_empty_data_prepare_without_per_date(self): client[guild_id].drop_collection("rawinfos") self.setUp() - discord_summary = DiscordSummary( - service_context=self.service_context, llm=self.mock_llm - ) + discord_summary = DiscordSummary() ( thread_summary_docs, channel_summary_docs, @@ -127,9 +122,7 @@ def test_empty_data_prepare_with_from_date(self): from_date = datetime(2023, 8, 1) self.setUp() - discord_summary = DiscordSummary( - service_context=self.service_context, llm=self.mock_llm - ) + discord_summary = DiscordSummary() ( thread_summary_docs, channel_summary_docs, @@ -221,9 +214,7 @@ def test_some_data_prepare_with_from_date(self): client[guild_id]["rawinfos"].insert_many(raw_data) self.setUp() - discord_summary = DiscordSummary( - service_context=self.service_context, llm=self.mock_llm - ) + discord_summary = DiscordSummary() ( thread_summary_docs, channel_summary_docs, @@ -329,9 +320,7 @@ def test_some_data_prepare_after_from_date(self): client[guild_id]["rawinfos"].insert_many(raw_data) self.setUp() - discord_summary = DiscordSummary( - service_context=self.service_context, llm=self.mock_llm - ) + discord_summary = DiscordSummary() ( thread_summary_docs, channel_summary_docs, diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py index b1f5eb71..3ec1eb5a 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py @@ -5,24 +5,22 @@ PrepareSummaries, ) from hivemind_etl_helpers.src.utils.mongo import MongoSingleton -from llama_index import 
MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestPrepareSummaries(unittest.TestCase): def setUp(self): self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_thread_summaries_empty_data(self): self.setUp() guild_id = "1234" - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries = prepare_summaries.prepare_thread_summaries( guild_id=guild_id, raw_data_grouped={}, @@ -111,9 +109,7 @@ def test_prepare_thread_summaries_some_data(self): } } - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries = prepare_summaries.prepare_thread_summaries( guild_id=guild_id, raw_data_grouped=sample_grouped_data, diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_base.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_base.py index 8a6f840e..5cfdf091 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_base.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_base.py @@ -2,23 +2,22 @@ from unittest.mock import Mock from hivemind_etl_helpers.src.utils.summary_base import SummaryBase -from llama_index import Document, MockEmbedding, ServiceContext, SummaryIndex -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings, SummaryIndex +from llama_index.core.llms import MockLLM class TestSummaryBase(unittest.TestCase): def setUp(self): - # Set up a sample ServiceContext for testing - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + # Set up a sample Settings for testing + Settings.llm = MockLLM() + Settings.chunk_size = 512 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_summary_base_default_values(self): # Test the default values of the SummaryBase class # We need to set the service_context as we need the MockEmbedding model self.setUp() - summary_base = SummaryBase(llm=MockLLM(), service_context=self.service_context) - self.assertIsNotNone(summary_base.service_context) + summary_base = SummaryBase(llm=MockLLM()) self.assertIsNone(summary_base.response_synthesizer) self.assertIsNotNone(summary_base.llm) self.assertFalse(summary_base.verbose) @@ -28,19 +27,17 @@ def test_summary_base_custom_values(self): llm_mock = MockLLM() response_synthesizer_mock = Mock() summary_base = SummaryBase( - service_context=self.service_context, response_synthesizer=response_synthesizer_mock, llm=llm_mock, verbose=True, ) - self.assertEqual(summary_base.service_context, self.service_context) self.assertEqual(summary_base.response_synthesizer, response_synthesizer_mock) self.assertEqual(summary_base.llm, llm_mock) self.assertTrue(summary_base.verbose) def test_get_summary(self): # Test the _get_summary method - summary_base = SummaryBase(service_context=self.service_context) + summary_base = SummaryBase(llm=Settings.llm) messages_document = [Document(text="Document 1"), Document(text="Document 2")] result = summary_base._get_summary( @@ -51,10 +48,9 @@ def 
test_get_summary(self): def test_retrieve_summary(self): # Test the retrieve_summary method - summary_base = SummaryBase(service_context=self.service_context, llm=MockLLM()) + summary_base = SummaryBase(llm=MockLLM()) doc_summary_index = SummaryIndex.from_documents( documents=[Document(text="Document 1"), Document(text="Document 2")], - service_context=self.service_context, ) query = "Summarize this" result = summary_base.retrieve_summary(doc_summary_index, query) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_channel_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_channel_summaries.py index 678c86ce..04ef700d 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_channel_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_channel_summaries.py @@ -3,23 +3,20 @@ from hivemind_etl_helpers.src.db.discord.summary.prepare_summaries import ( PrepareSummaries, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestPrepareSummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_channel_summaries_empty_data(self): self.setUp() - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries, thread_docs = prepare_summaries.prepare_channel_summaries( thread_summaries={}, summarization_query="Please give a summary of the data you have.", @@ -44,9 +41,7 @@ def test_prepare_channel_summaries_some_data(self): } } - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries, thread_docs = prepare_summaries.prepare_channel_summaries( thread_summaries=sample_thread_summary, summarization_query="Please give a summary of the data you have.", diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_daily_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_daily_summaries.py index 391d8ec3..020d6d0c 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_daily_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_prepare_daily_summaries.py @@ -3,23 +3,20 @@ from hivemind_etl_helpers.src.db.discord.summary.prepare_summaries import ( PrepareSummaries, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestPrepareSummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_daily_summaries_empty_data(self): self.setUp() - prepare_summaries = PrepareSummaries( - 
service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries, channel_docs = prepare_summaries.prepare_daily_summaries( channel_summaries={}, summarization_query="Please give a summary of the data you have.", @@ -39,9 +36,7 @@ def test_prepare_daily_summaries_some_data(self): } } - prepare_summaries = PrepareSummaries( - service_context=self.service_context, llm=self.mock_llm - ) + prepare_summaries = PrepareSummaries() summaries, channel_docs = prepare_summaries.prepare_daily_summaries( channel_summaries=sample_channel_summary, summarization_query="Please give a summary of the data you have.", diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py index 34c639dd..56f1fab1 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.discord.summary.summary_utils import ( transform_daily_summary_to_document, ) -from llama_index import Document +from llama_index.core import Document class TestTransformDailySummaryToDocument(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_utils.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_utils.py index e891b044..45171eaf 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_utils.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_utils.py @@ -4,7 +4,7 @@ transform_channel_summary_to_document, transform_thread_summary_to_document, ) -from llama_index import Document +from llama_index.core import Document class TestTransformSummaryToDocument(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py index b9cd69ff..8dd3b2ab 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py @@ -1,10 +1,12 @@ from datetime import datetime, timezone from unittest import TestCase +import pytest from hivemind_etl_helpers.src.utils.check_documents import process_doc_to_id_date -from llama_index import Document +from llama_index.core import Document +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestDocumentGdriveProcessing(TestCase): def test_empty_documents(self): result = process_doc_to_id_date( diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py index 077a6e8b..4da7fcc2 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py @@ -1,13 +1,15 @@ from datetime import datetime from unittest import TestCase +import pytest import psycopg2 from hivemind_etl_helpers.src.db.gdrive.db_utils import fetch_files_date_field, setup_db -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.pg_vector_access import PGVectorAccess 
+@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestFetchGdriveFileIds(TestCase): def setUpDB(self, community: str): db_name = f"community_{community}" diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py index 20830e2a..38e39840 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py @@ -1,15 +1,17 @@ from datetime import datetime from unittest import TestCase +import pytest import psycopg2 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db from hivemind_etl_helpers.src.db.gdrive.delete_records import delete_records -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.db.pg_db_utils import convert_tuple_str from tc_hivemind_backend.pg_vector_access import PGVectorAccess +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestDeleteRecords(TestCase): def setUpDB(self, community_id: str): db_name = f"community_{community_id}" diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py index f30ede2a..86780db3 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py @@ -1,14 +1,16 @@ from datetime import datetime from unittest import TestCase +import pytest import psycopg2 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db from hivemind_etl_helpers.src.utils.check_documents import check_documents -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.pg_vector_access import PGVectorAccess +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestGdriveDocumentsToDelete(TestCase): def setUpDB(self, community_id: str): db_name = f"community_{community_id}" diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py index 18fe5819..56749d33 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py @@ -1,14 +1,16 @@ from datetime import datetime from unittest import TestCase +import pytest import psycopg2 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db from hivemind_etl_helpers.src.utils.check_documents import check_documents -from llama_index import Document +from llama_index.core import Document from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.pg_vector_access import PGVectorAccess +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestGdriveDocumentsToInsert(TestCase): def setUpDB(self, community_id: str): db_name = f"community_{community_id}" diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py index 9034bccb..f13a35d6 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py +++ 
b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py @@ -7,7 +7,7 @@ discord_raw_to_docuemnts, ) from hivemind_etl_helpers.src.utils.mongo import MongoSingleton -from llama_index.indices.vector_store import VectorStoreIndex +from llama_index.core.indices.vector_store import VectorStoreIndex from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.db.pg_db_utils import setup_db from tc_hivemind_backend.pg_vector_access import PGVectorAccess diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_category.py b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_category.py index 0770cf98..7c92f26a 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_category.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_category.py @@ -3,16 +3,15 @@ from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import ( DiscourseSummary, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscoursePrepareDailySummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_daily_summaries_empty_data(self): self.setUp() @@ -20,8 +19,6 @@ def test_prepare_daily_summaries_empty_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -40,8 +37,6 @@ def test_prepare_daily_summaries_some_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -80,8 +75,6 @@ def test_prepare_daily_summaries_some_data_check_documents(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily.py b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily.py index 13f2e0ed..b9480536 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily.py @@ -3,16 +3,15 @@ from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import ( DiscourseSummary, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscoursePrepareCategorySummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_category_summaries_empty_data(self): self.setUp() @@ -20,8 +19,6 @@ def 
test_prepare_category_summaries_empty_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -40,8 +37,6 @@ def test_prepare_category_summaries_some_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -105,8 +100,6 @@ def test_prepare_category_summaries_some_data_check_documents(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily_documents.py b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily_documents.py index 0b0510d9..600263ee 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily_documents.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_daily_documents.py @@ -3,16 +3,15 @@ from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import ( DiscourseSummary, ) -from llama_index import Document, MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import Document, MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscoursePrepareDailySummaryDocuments(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_documents_empty_input(self): self.setUp() @@ -20,8 +19,6 @@ def test_prepare_documents_empty_input(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -36,8 +33,6 @@ def test_prepare_documents_some_inputs(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_topic.py b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_topic.py index 59a62655..7641d391 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_topic.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_discourse_summary_prepare_topic.py @@ -4,16 +4,15 @@ from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import ( DiscourseSummary, ) -from llama_index import MockEmbedding, ServiceContext -from llama_index.llms import MockLLM +from llama_index.core import MockEmbedding, Settings +from llama_index.core.llms import MockLLM class TestDiscoursePrepareTopicSummaries(unittest.TestCase): def setUp(self): - self.mock_llm = MockLLM() - self.service_context = ServiceContext.from_defaults( - llm=MockLLM(), chunk_size=256, embed_model=MockEmbedding(embed_dim=1024) - ) + Settings.llm = MockLLM() + Settings.chunk_size = 256 + Settings.embed_model = MockEmbedding(embed_dim=1024) def test_prepare_topic_summaries_empty_data(self): self.setUp() @@ -21,8 +20,6 @@ def 
test_prepare_topic_summaries_empty_data(self): forum_endpoint = "sample_endpoint" prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) @@ -67,8 +64,6 @@ def test_prepare_topic_summaries_some_data(self): raw_data_grouped.append(data) prepare_summaries = DiscourseSummary( - service_context=self.service_context, - llm=self.mock_llm, forum_id=forum_id, forum_endpoint=forum_endpoint, ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py b/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py index 3a5246c4..0a67c36d 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py @@ -1,9 +1,11 @@ from datetime import datetime from unittest import TestCase +import pytest from hivemind_etl_helpers.src.db.gdrive.db_utils import postprocess_results +@pytest.mark.skip(reason="GDrive ETL is not finished!") class TestProcessGdriveFileRetreivalResultsProcess(TestCase): def test_empty_list(self): self.assertEqual(postprocess_results([]), {}) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_etl_prepare_deletion.py b/dags/hivemind_etl_helpers/tests/unit/test_github_etl_prepare_deletion.py index ba78a24c..25235a54 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_etl_prepare_deletion.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_etl_prepare_deletion.py @@ -3,7 +3,7 @@ from unittest.mock import patch from hivemind_etl_helpers.src.db.github.load import PrepareDeletion -from llama_index import Document +from llama_index.core import Document class TestPrepareDeletion(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_comments.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_comments.py index 47f6a6bf..dc0edc61 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_comments.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_comments.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubComment from hivemind_etl_helpers.src.db.github.transform.comments import transform_comments -from llama_index import Document +from llama_index.core import Document class TestGithubTransformcomComments(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py index 73f2bbcd..ac23d49c 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubCommit from hivemind_etl_helpers.src.db.github.transform.commits import transform_commits -from llama_index import Document +from llama_index.core import Document class TestGithubTransformCommits(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_issues.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_issues.py index a833141e..d73e4a12 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_issues.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_issues.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubIssue from 
hivemind_etl_helpers.src.db.github.transform.issues import transform_issues -from llama_index import Document +from llama_index.core import Document class TestGithubTransformIssues(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_prs.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_prs.py index bc6cdd0a..07d9e1e6 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_prs.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_prs.py @@ -3,7 +3,7 @@ from hivemind_etl_helpers.src.db.github.schema import GitHubPullRequest from hivemind_etl_helpers.src.db.github.transform.pull_requests import transform_prs -from llama_index import Document +from llama_index.core import Document class TestGithubTransformPRs(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/unit/test_sort_summary_docs.py b/dags/hivemind_etl_helpers/tests/unit/test_sort_summary_docs.py index 2cc500f2..a9a5f304 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_sort_summary_docs.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_sort_summary_docs.py @@ -1,7 +1,7 @@ import unittest from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily -from llama_index import Document +from llama_index.core import Document class TestSortSummariesDaily(unittest.TestCase): diff --git a/requirements.txt b/requirements.txt index f5b8d815..a067b2b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ numpy -llama-index==0.9.48 pymongo python-dotenv pgvector @@ -17,5 +16,5 @@ coverage>=7.3.3, <8.0.0 pytest>=7.4.3, <8.0.0 python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 -tc-hivemind-backend==1.1.3 +tc-hivemind-backend==1.1.4 traceloop-sdk==0.9.4 From 39d350d2957955f0f105096b6acc609bb9f470a8 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 7 Mar 2024 18:10:45 +0330 Subject: [PATCH 02/10] fix: linter issues based on superlinter rules! 
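The changes below are import reshuffling plus collapsing the exploded tuple return annotations back onto one line. For reference, a sketch of the import order the diffs converge on, assuming super-linter drives isort with its default profile (the grouping comments are ours, not isort output):

# standard library first (isort default grouping)
import logging
from datetime import datetime

# then one alphabetized third-party group: psycopg2 sorts before pytest,
# llama_index.core before llama_index.llms, tc_hivemind_backend last
import psycopg2
import pytest
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from tc_hivemind_backend.pg_vector_access import PGVectorAccess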
---
 dags/hivemind_etl_helpers/discord_mongo_summary_etl.py    | 2 +-
 .../discord_mongo_vector_store_etl.py                     | 4 ++--
 dags/hivemind_etl_helpers/discourse_summary_etl.py        | 8 +-------
 dags/hivemind_etl_helpers/discourse_vectorstore_etl.py    | 4 ++--
 .../src/db/discord/discord_summary.py                     | 7 +------
 .../src/db/discord/summary/prepare_summaries.py           | 1 -
 .../src/db/github/load/load_raw_data.py                   | 5 ++---
 .../integration/test_gdrive_convert_doc_to_id_date.py     | 2 +-
 .../integration/test_gdrive_db_fetch_files_modified_at.py | 2 +-
 .../tests/integration/test_gdrive_delete_records.py       | 2 +-
 .../tests/integration/test_gdrive_documents_to_delete.py  | 2 +-
 .../tests/integration/test_gdrive_documents_to_insert.py  | 2 +-
 .../unit/test_gdrive_files_modified_at_process_results.py | 2 +-
 13 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py
index 375e560b..e7d449ae 100644
--- a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py
+++ b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py
@@ -10,9 +10,9 @@
 )
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
 from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily
+from llama_index.core import Settings
 from llama_index.core.response_synthesizers import get_response_synthesizer
 from llama_index.llms.openai import OpenAI
-from llama_index.core import Settings
 from tc_hivemind_backend.db.pg_db_utils import setup_db
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
diff --git a/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py b/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py
index 584c3963..1d7d9b1f 100644
--- a/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py
+++ b/dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py
@@ -8,12 +8,12 @@
     find_guild_id_by_community_id,
 )
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
+from llama_index.core import Settings
+from llama_index.llms.openai import OpenAI
 from tc_hivemind_backend.db.pg_db_utils import setup_db
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
 from tc_hivemind_backend.pg_vector_access import PGVectorAccess
-from llama_index.core import Settings
-from llama_index.llms.openai import OpenAI


 def process_discord_guild_mongo(community_id: str) -> None:
diff --git a/dags/hivemind_etl_helpers/discourse_summary_etl.py b/dags/hivemind_etl_helpers/discourse_summary_etl.py
index 5344b146..25b83c25 100644
--- a/dags/hivemind_etl_helpers/discourse_summary_etl.py
+++ b/dags/hivemind_etl_helpers/discourse_summary_etl.py
@@ -134,8 +134,6 @@
     node_parser = configure_node_parser(chunk_size=chunk_size)
     pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname)

-    embed_model = CohereEmbedding()
-
     sorted_daily_docs = sort_summaries_daily(
         level1_docs=topic_summary_documents,
         level2_docs=category_summary_documenets,
@@ -166,11 +164,7 @@
 def get_summary_documents(
     forum_id: str, raw_data_grouped: list[Record], forum_endpoint: str
-) -> tuple[
-    list[Document],
-    list[Document],
-    list[Document],
-]:
+) -> tuple[list[Document], list[Document], list[Document],]:
     """
     prepare the summary documents for discourse based on given raw data
diff --git a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py
index 0fca0ec2..2af29ad5 100644
--- a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py
+++ b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py
@@ -7,12 +7,12 @@
 from hivemind_etl_helpers.src.db.discourse.utils.get_forums import get_forum_uuid
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
 from hivemind_etl_helpers.src.utils.check_documents import check_documents
+from llama_index.core import Settings
+from llama_index.llms.openai import OpenAI
 from tc_hivemind_backend.db.pg_db_utils import setup_db
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
 from tc_hivemind_backend.pg_vector_access import PGVectorAccess
-from llama_index.core import Settings
-from llama_index.llms.openai import OpenAI


 def process_discourse_vectorstore(
diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
index 6901fffd..f6cad40e 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
@@ -11,7 +11,6 @@
     transform_daily_summary_to_document,
 )
 from llama_index.core import Document, Settings
-from llama_index.core.llms import LLM
 from llama_index.core.response_synthesizers.base import BaseSynthesizer


@@ -50,11 +49,7 @@
         guild_id: str,
         summarization_prefix: str,
         from_date: datetime | None = None,
-    ) -> tuple[
-        list[Document],
-        list[Document],
-        list[Document],
-    ]:
+    ) -> tuple[list[Document], list[Document], list[Document],]:
         """
         prepare per thread summaries of discord messages.
         Note: This will always process the data until 1 day ago.
diff --git a/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py b/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py
index b6701d31..8853fefb 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/summary/prepare_summaries.py
@@ -9,7 +9,6 @@
 )
 from hivemind_etl_helpers.src.utils.summary_base import SummaryBase
 from llama_index.core import Document, Settings
-from llama_index.core.llms import LLM
 from llama_index.core.response_synthesizers.base import BaseSynthesizer


diff --git a/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py b/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py
index ea6756a2..1034e1a4 100644
--- a/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py
+++ b/dags/hivemind_etl_helpers/src/db/github/load/load_raw_data.py
@@ -1,10 +1,9 @@
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
-from llama_index.core import Document
+from llama_index.core import Document, Settings
+from llama_index.llms.openai import OpenAI
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
 from tc_hivemind_backend.pg_vector_access import PGVectorAccess
-from llama_index.core import Settings
-from llama_index.llms.openai import OpenAI


 def load_documents_into_pg_database(
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py
index 8dd3b2ab..12690210 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_convert_doc_to_id_date.py
@@ -1,7 +1,7 @@
 from datetime import datetime, timezone
 from unittest import TestCase

-import pytest
+import pytest
 from hivemind_etl_helpers.src.utils.check_documents import process_doc_to_id_date
 from llama_index.core import Document

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py
index 4da7fcc2..7fa0f127 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_db_fetch_files_modified_at.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
 import psycopg2
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import fetch_files_date_field, setup_db
 from llama_index.core import Document
 from tc_hivemind_backend.db.credentials import load_postgres_credentials
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py
index 38e39840..25c0d092 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_delete_records.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
 import psycopg2
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db
 from hivemind_etl_helpers.src.db.gdrive.delete_records import delete_records
 from llama_index.core import Document
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py
index 86780db3..8ce64235 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_delete.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
 import psycopg2
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db
 from hivemind_etl_helpers.src.utils.check_documents import check_documents
 from llama_index.core import Document
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py
index 56749d33..9d930969 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_documents_to_insert.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
 import psycopg2
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db
 from hivemind_etl_helpers.src.utils.check_documents import check_documents
 from llama_index.core import Document
diff --git a/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py b/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py
index 0a67c36d..797ecf48 100644
--- a/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py
+++ b/dags/hivemind_etl_helpers/tests/unit/test_gdrive_files_modified_at_process_results.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 from unittest import TestCase

-import pytest
+import pytest
 from hivemind_etl_helpers.src.db.gdrive.db_utils import postprocess_results

From 64f7d5c55e393daff6d1e16de406ac7ccef6680a Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Thu, 7 Mar 2024 18:20:45 +0330
Subject: [PATCH 03/10] fix: wrong docstring!
---
 dags/hivemind_etl_helpers/src/db/discord/discord_summary.py | 2 +-
 dags/hivemind_etl_helpers/src/utils/summary_base.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
index f6cad40e..56f765da 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
@@ -36,7 +36,7 @@ def __init__(
             the llm to use
             if nothing passed, it would use the default `llama_index.core.Setting.llm`

-        Note: `chunk_size` is read from `llama_index.core.Setting.llm`.
+        Note: `chunk_size` is read from `llama_index.core.Setting.chunk_size`.
         """
         llm = kwargs.get("llm", Settings.llm)
diff --git a/dags/hivemind_etl_helpers/src/utils/summary_base.py b/dags/hivemind_etl_helpers/src/utils/summary_base.py
index ad85d634..2a3e3d02 100644
--- a/dags/hivemind_etl_helpers/src/utils/summary_base.py
+++ b/dags/hivemind_etl_helpers/src/utils/summary_base.py
@@ -24,7 +24,7 @@ def __init__(
             the llm to use
             if nothing passed, it would use the default `llama_index.core.Setting.llm`

-        Note: `chunk_size` is read from `llama_index.core.Setting.llm`.
+        Note: `chunk_size` is read from `llama_index.core.Setting.chunk_size`.
""" self.llm = llm self.response_synthesizer = response_synthesizer From 1a0a3a06f4cc2920319d8150896f265f3f069575 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 12 Mar 2024 13:40:38 +0330 Subject: [PATCH 04/10] feat: increase backend lib depdendency version! The backend lib now contains the llama-index-legacy which is for using its PGVectorStore and airflow was not supporting the new vectorstore version because of sqlalchemy! --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a067b2b7..b76573f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,5 +16,5 @@ coverage>=7.3.3, <8.0.0 pytest>=7.4.3, <8.0.0 python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 -tc-hivemind-backend==1.1.4 +tc-hivemind-backend==1.1.5 traceloop-sdk==0.9.4 From cfeefd8e5d7dd878049b495bf6e5aa13b17df45c Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 12 Mar 2024 14:19:24 +0330 Subject: [PATCH 05/10] feat: raise error if tests failing! --- docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index e95edd19..5db146c3 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -2,7 +2,7 @@ echo "chang dir to dags" cd dags || exit -python3 -m coverage run --omit=tests/* -m pytest . +python3 -m coverage run --omit=tests/* -m pytest . && echo "Tests Passed" || exit 1 cp .coverage ../.coverage cd .. || exit From 41e889615c28eb52168b5ea7b399ec06c30b5ad5 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 12 Mar 2024 18:15:30 +0330 Subject: [PATCH 06/10] feat: more separation between dags! Following best practices to make each DAG file more efficient. That was because airflow was raising an error and couldn't bring the previous dags up. 
---
 dags/hivemind_discord_etl.py   |  69 ++++++++++++++++
 dags/hivemind_discourse_etl.py |  58 +++++++++++++
 dags/hivemind_etl.py           | 146 ---------------------------------
 dags/hivemind_github_etl.py    |  34 ++++++++
 4 files changed, 161 insertions(+), 146 deletions(-)
 create mode 100644 dags/hivemind_discord_etl.py
 create mode 100644 dags/hivemind_discourse_etl.py
 delete mode 100644 dags/hivemind_etl.py
 create mode 100644 dags/hivemind_github_etl.py

diff --git a/dags/hivemind_discord_etl.py b/dags/hivemind_discord_etl.py
new file mode 100644
index 00000000..c4a22209
--- /dev/null
+++ b/dags/hivemind_discord_etl.py
@@ -0,0 +1,69 @@
+import logging
+from datetime import datetime
+
+from airflow import DAG
+from airflow.decorators import task
+
+from dotenv import load_dotenv
+from hivemind_etl_helpers.discord_mongo_summary_etl import process_discord_summaries
+from hivemind_etl_helpers.discord_mongo_vector_store_etl import (
+    process_discord_guild_mongo,
+)
+from hivemind_etl_helpers.src.utils.mongo_discord_communities import (
+    get_all_discord_communities,
+)
+
+
+with DAG(
+    dag_id="discord_vector_store_update",
+    start_date=datetime(2024, 1, 1),
+    schedule_interval="0 2 * * *",
+    catchup=False,
+) as dag:
+
+    @task
+    def get_discord_communities() -> list[str]:
+        """
+        Getting all communities having discord from database
+        """
+        communities = get_all_discord_communities()
+        return communities
+
+    @task
+    def start_discord_vectorstore(community_id: str):
+        load_dotenv()
+        logging.info(f"Working on community, {community_id}")
+        process_discord_guild_mongo(community_id=community_id)
+        logging.info(f"Community {community_id} Job finished!")
+
+    communities = get_discord_communities()
+    # `start_discord_vectorstore` will be called multiple times
+    # with the length of the list
+    start_discord_vectorstore.expand(community_id=communities)
+
+
+with DAG(
+    dag_id="discord_summary_vector_store",
+    start_date=datetime(2024, 1, 1),
+    schedule_interval="0 2 * * *",
+) as dag:
+
+    @task
+    def get_mongo_discord_communities() -> list[str]:
+        """
+        Getting all communities having discord from database
+        this function is the same with `get_discord_communities`
+        we just changed the name for the pylint
+        """
+        communities = get_all_discord_communities()
+        return communities
+
+    @task
+    def start_discord_summary_vectorstore(community_id: str):
+        load_dotenv()
+        logging.info(f"Working on community, {community_id}")
+        process_discord_summaries(community_id=community_id, verbose=False)
+        logging.info(f"Community {community_id} Job finished!")
+
+    communities = get_mongo_discord_communities()
+    start_discord_summary_vectorstore.expand(community_id=communities)
diff --git a/dags/hivemind_discourse_etl.py b/dags/hivemind_discourse_etl.py
new file mode 100644
index 00000000..d169b287
--- /dev/null
+++ b/dags/hivemind_discourse_etl.py
@@ -0,0 +1,58 @@
+import logging
+from datetime import datetime
+
+from airflow import DAG
+from airflow.decorators import task
+
+from hivemind_etl_helpers.discourse_summary_etl import process_discourse_summary
+from hivemind_etl_helpers.discourse_vectorstore_etl import process_discourse_vectorstore
+from hivemind_etl_helpers.src.utils.get_communities_data import (
+    get_discourse_communities,
+)
+
+with DAG(
+    dag_id="discourse_vector_store",
+    start_date=datetime(2024, 3, 1),
+    schedule_interval="0 2 * * *",
+) as dag:
+
+    @task
+    def process_discourse_community(community_information: dict[str, str | datetime]):
+        community_id = community_information["community_id"]
+        forum_endpoint = community_information["endpoint"]
+        from_date = community_information["from_date"]
+
+        logging.info(f"Starting Discourse ETL | community_id: {community_id}")
+        process_discourse_vectorstore(
+            community_id=community_id,
+            forum_endpoint=forum_endpoint,
+            from_starting_date=from_date,
+        )
+
+    communities_info = get_discourse_communities()
+    process_discourse_community.expand(community_information=communities_info)
+
+
+with DAG(
+    dag_id="discourse_summary_vector_store",
+    start_date=datetime(2024, 2, 21),
+    schedule_interval="0 2 * * *",
+) as dag:
+
+    @task
+    def process_discourse_community_summary(
+        community_information: dict[str, str | datetime]
+    ):
+        community_id = community_information["community_id"]
+        forum_endpoint = community_information["endpoint"]
+        from_date = community_information["from_date"]
+
+        logging.info(f"Starting Discourse ETL | community_id: {community_id}")
+        process_discourse_summary(
+            community_id=community_id,
+            forum_endpoint=forum_endpoint,
+            from_starting_date=from_date,
+        )
+
+    communities_info = get_discourse_communities()
+    process_discourse_community_summary.expand(community_information=communities_info)
diff --git a/dags/hivemind_etl.py b/dags/hivemind_etl.py
deleted file mode 100644
index d288e0e3..00000000
--- a/dags/hivemind_etl.py
+++ /dev/null
@@ -1,146 +0,0 @@
-from __future__ import annotations
-
-import logging
-from datetime import datetime
-
-from airflow import DAG
-from airflow.decorators import task
-from dotenv import load_dotenv
-from hivemind_etl_helpers.discord_mongo_summary_etl import process_discord_summaries
-from hivemind_etl_helpers.discord_mongo_vector_store_etl import (
-    process_discord_guild_mongo,
-)
-from hivemind_etl_helpers.discourse_summary_etl import process_discourse_summary
-from hivemind_etl_helpers.discourse_vectorstore_etl import process_discourse_vectorstore
-from hivemind_etl_helpers.github_etl import process_github_vectorstore
-from hivemind_etl_helpers.src.utils.get_communities_data import (
-    get_discourse_communities,
-    get_github_communities_data,
-)
-from hivemind_etl_helpers.src.utils.mongo_discord_communities import (
-    get_all_discord_communities,
-)
-
-with DAG(
-    dag_id="discord_vector_store_update",
-    start_date=datetime(2024, 1, 1),
-    schedule_interval="0 2 * * *",
-    catchup=False,
-) as dag:
-
-    @task
-    def get_discord_communities() -> list[str]:
-        """
-        Getting all communities having discord from database
-        """
-        communities = get_all_discord_communities()
-        return communities
-
-    @task
-    def start_discord_vectorstore(community_id: str):
-        load_dotenv()
-        logging.info(f"Working on community, {community_id}")
-        process_discord_guild_mongo(community_id=community_id)
-        logging.info(f"Community {community_id} Job finished!")
-
-    communities = get_discord_communities()
-    # `start_discord_vectorstore` will be called multiple times
-    # with the length of the list
-    start_discord_vectorstore.expand(community_id=communities)
-
-with DAG(
-    dag_id="discord_summary_vector_store",
-    start_date=datetime(2024, 1, 1),
-    schedule_interval="0 2 * * *",
-) as dag:
-
-    @task
-    def get_mongo_discord_communities() -> list[str]:
-        """
-        Getting all communities having discord from database
-        this function is the same with `get_discord_communities`
-        we just changed the name for the pylint
-        """
-        communities = get_all_discord_communities()
-        return communities
-
-    @task
-    def start_discord_summary_vectorstore(community_id: str):
-        load_dotenv()
-        logging.info(f"Working on community, {community_id}")
-        process_discord_summaries(community_id=community_id, verbose=False)
-        logging.info(f"Community {community_id} Job finished!")
-
-    communities = get_mongo_discord_communities()
-    start_discord_summary_vectorstore.expand(community_id=communities)
-
-
-with DAG(
-    dag_id="github_vector_store",
-    start_date=datetime(2024, 2, 21),
-    schedule_interval="0 2 * * *",
-) as dag:
-
-    @task
-    def process_github_community(community_information: dict[str, str | datetime]):
-        community_id = community_information["community_id"]
-        organization_id = community_information["organization_id"]
-        from_date = community_information["from_date"]
-
-        logging.info(f"Starting Github ETL | community_id: {community_id}")
-        process_github_vectorstore(
-            community_id=community_id,
-            github_org_id=organization_id,
-            from_starting_date=from_date,
-        )
-
-    communities_info = get_github_communities_data()
-    process_github_community.expand(community_information=communities_info)
-
-
-with DAG(
-    dag_id="discourse_vector_store",
-    start_date=datetime(2024, 3, 1),
-    schedule_interval="0 2 * * *",
-) as dag:
-
-    @task
-    def process_discourse_community(community_information: dict[str, str | datetime]):
-        community_id = community_information["community_id"]
-        forum_endpoint = community_information["endpoint"]
-        from_date = community_information["from_date"]
-
-        logging.info(f"Starting Discourse ETL | community_id: {community_id}")
-        process_discourse_vectorstore(
-            community_id=community_id,
-            forum_endpoint=forum_endpoint,
-            from_starting_date=from_date,
-        )
-
-    communities_info = get_discourse_communities()
-    process_discourse_community.expand(community_information=communities_info)
-
-
-with DAG(
-    dag_id="discourse_summary_vector_store",
-    start_date=datetime(2024, 2, 21),
-    schedule_interval="0 2 * * *",
-) as dag:
-
-    @task
-    def process_discourse_community_summary(
-        community_information: dict[str, str | datetime]
-    ):
-        community_id = community_information["community_id"]
-        forum_endpoint = community_information["endpoint"]
-        from_date = community_information["from_date"]
-
-        logging.info(f"Starting Discourse ETL | community_id: {community_id}")
-        process_discourse_summary(
-            community_id=community_id,
-            forum_endpoint=forum_endpoint,
-            from_starting_date=from_date,
-        )
-
-    communities_info = get_discourse_communities()
-    process_discourse_community_summary.expand(community_information=communities_info)
diff --git a/dags/hivemind_github_etl.py b/dags/hivemind_github_etl.py
new file mode 100644
index 00000000..e381a5ab
--- /dev/null
+++ b/dags/hivemind_github_etl.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import logging
+from datetime import datetime
+
+from airflow import DAG
+from airflow.decorators import task
+
+from hivemind_etl_helpers.github_etl import process_github_vectorstore
+from hivemind_etl_helpers.src.utils.get_communities_data import (
+    get_github_communities_data,
+)
+
+with DAG(
+    dag_id="github_vector_store",
+    start_date=datetime(2024, 2, 21),
+    schedule_interval="0 2 * * *",
+) as dag:
+
+    @task
+    def process_github_community(community_information: dict[str, str | datetime]):
+        community_id = community_information["community_id"]
+        organization_id = community_information["organization_id"]
+        from_date = community_information["from_date"]
+
+        logging.info(f"Starting Github ETL | community_id: {community_id}")
+        process_github_vectorstore(
+            community_id=community_id,
+            github_org_id=organization_id,
+            from_starting_date=from_date,
+        )
+
+    communities_info = get_github_communities_data()
+    process_github_community.expand(community_information=communities_info)

From 22f87a87993f65f39861a49c70e5a6eb8bbb8eb1 Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Wed, 13 Mar 2024 08:16:25 +0330
Subject: [PATCH 07/10] fix: fixed minor issues!

- One of the GitHub commit data relationships was renamed
  (COMMITED -> COMMITTED).
- When no data was available, the deletion query was not an empty
  string; this is fixed now.
---
 .../src/db/github/extract/commit.py        |  2 +-
 .../src/db/github/load/prepare_deletion.py | 11 +++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/commit.py b/dags/hivemind_etl_helpers/src/db/github/extract/commit.py
index 0641b341..e29c34bc 100644
--- a/dags/hivemind_etl_helpers/src/db/github/extract/commit.py
+++ b/dags/hivemind_etl_helpers/src/db/github/extract/commit.py
@@ -27,7 +27,7 @@ def fetch_raw_commits(
     """
     neo4j_connection = Neo4jConnection()
    neo4j_driver = neo4j_connection.connect_neo4j()
-    query = """MATCH (co:Commit)<-[:COMMITED]-(user:GitHubUser)
+    query = """MATCH (co:Commit)<-[:COMMITTED]-(user:GitHubUser)
     WHERE
         co.repository_id IN $repoIds
     """
diff --git a/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py b/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py
index 971abd70..43583377 100644
--- a/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py
+++ b/dags/hivemind_etl_helpers/src/db/github/load/prepare_deletion.py
@@ -54,10 +54,11 @@ def prepare(
         )

         documents_to_save = docs_to_save + pr_docsuments_to_save
-
-        deletion_query = self._create_deletion_query(
-            docs_file_ids_to_delete + pr_docs_file_ids_to_delete
-        )
+        doc_ids_to_delete = docs_file_ids_to_delete + pr_docs_file_ids_to_delete
+        if len(doc_ids_to_delete):
+            deletion_query = self._create_deletion_query(doc_ids_to_delete)
+        else:
+            deletion_query = ""

         return documents_to_save, deletion_query

@@ -125,6 +126,8 @@ def _create_deletion_query(
         self,
         doc_ids_to_delete: list[str],
     ) -> str:
+        if len(doc_ids_to_delete) == 0:
+            return ""
         if len(doc_ids_to_delete) == 1:
             deletion_ids = f"({doc_ids_to_delete[0]})"
         else:

From d52661eea5037302f8a7c2b5e07113d07a71ae97 Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Wed, 13 Mar 2024 08:19:31 +0330
Subject: [PATCH 08/10] fix: isort linter issues!
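
isort groups airflow, dotenv, and the hivemind helpers into a single
third-party block, so the blank lines splitting them are removed.
Roughly, the grouping it expects (illustrative only) looks like:

    import logging
    from datetime import datetime

    from airflow import DAG
    from airflow.decorators import task
    from dotenv import load_dotenv
    from hivemind_etl_helpers.discord_mongo_summary_etl import (
        process_discord_summaries,
    )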
---
 dags/hivemind_discord_etl.py   | 2 --
 dags/hivemind_discourse_etl.py | 1 -
 dags/hivemind_github_etl.py    | 1 -
 3 files changed, 4 deletions(-)

diff --git a/dags/hivemind_discord_etl.py b/dags/hivemind_discord_etl.py
index c4a22209..615f3ae2 100644
--- a/dags/hivemind_discord_etl.py
+++ b/dags/hivemind_discord_etl.py
@@ -3,7 +3,6 @@

 from airflow import DAG
 from airflow.decorators import task
-
 from dotenv import load_dotenv
 from hivemind_etl_helpers.discord_mongo_summary_etl import process_discord_summaries
 from hivemind_etl_helpers.discord_mongo_vector_store_etl import (
@@ -13,7 +12,6 @@
     get_all_discord_communities,
 )

-
 with DAG(
     dag_id="discord_vector_store_update",
     start_date=datetime(2024, 1, 1),
diff --git a/dags/hivemind_discourse_etl.py b/dags/hivemind_discourse_etl.py
index d169b287..25c834ce 100644
--- a/dags/hivemind_discourse_etl.py
+++ b/dags/hivemind_discourse_etl.py
@@ -3,7 +3,6 @@

 from airflow import DAG
 from airflow.decorators import task
-
 from hivemind_etl_helpers.discourse_summary_etl import process_discourse_summary
 from hivemind_etl_helpers.discourse_vectorstore_etl import process_discourse_vectorstore
 from hivemind_etl_helpers.src.utils.get_communities_data import (
diff --git a/dags/hivemind_github_etl.py b/dags/hivemind_github_etl.py
index e381a5ab..15de6758 100644
--- a/dags/hivemind_github_etl.py
+++ b/dags/hivemind_github_etl.py
@@ -5,7 +5,6 @@

 from airflow import DAG
 from airflow.decorators import task
-
 from hivemind_etl_helpers.github_etl import process_github_vectorstore
 from hivemind_etl_helpers.src.utils.get_communities_data import (
     get_github_communities_data,

From 9e1884b71314207388b7eaabe520d91ae7a41ce9 Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Wed, 13 Mar 2024 16:50:03 +0330
Subject: [PATCH 09/10] fix: test cases based on previous updates!

- The summary `type` metadata was added in earlier commits.
- The COMMITED relationship typo was fixed on the staging database.
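
For reference, the fixtures now create the relationship with the
corrected spelling; a minimal sketch of the pattern used in the tests
below (properties trimmed):

    session.execute_write(
        lambda tx: tx.run(
            """
            CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
            SET co.`commit.author.name` = "Author#1"
            """
        )
    )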
---
 .../test_discord_summary_transform_daily_summaries.py  |  6 +++---
 .../tests/integration/test_github_etl_fetch_commits.py | 10 +++++-----
 .../integration/test_github_etl_fetch_raw_commits.py   |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py
index 56f1fab1..3ea81ace 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_summary_transform_daily_summaries.py
@@ -25,15 +25,15 @@ def test_transform_daily_summary_to_document(self):
         expected_documents = [
             Document(
                 text="Summary 1",
-                metadata={"date": "2023-01-01", "channel": None, "thread": None},
+                metadata={"date": "2023-01-01", "type": "day"},
             ),
             Document(
                 text="Summary 2",
-                metadata={"date": "2023-01-02", "channel": None, "thread": None},
+                metadata={"date": "2023-01-02", "type": "day"},
             ),
             Document(
                 text="Summary 3",
-                metadata={"date": "2023-01-03", "channel": None, "thread": None},
+                metadata={"date": "2023-01-03", "type": "day"},
             ),
         ]

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
index 0a589638..7656279b 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
@@ -29,7 +29,7 @@ def test_get_single_commit_single_repo_no_from_date(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(user:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",
@@ -68,7 +68,7 @@ def test_get_single_commit_single_repo_with_from_date(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(user:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",
@@ -108,7 +108,7 @@ def test_get_multiple_commit_multi_repo_with_from_date_filter(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",
@@ -120,7 +120,7 @@
                         co.`commit.author.date` = "2024-01-01T10:23:50Z",
                         co.`commit.verification.reason` = "invalid"

-                    CREATE (co2:Commit)<-[:COMMITED]-(:GitHubUser {login: "author #2"})
+                    CREATE (co2:Commit)<-[:COMMITTED]-(:GitHubUser {login: "author #2"})
                     SET
                         co2.`commit.author.name` = "Author#2",
                         co2.`commit.message` = "Issue #2 is resolved!",
@@ -132,7 +132,7 @@
                         co2.`commit.author.date` = "2023-01-01T10:23:50Z",
                         co2.`commit.verification.reason` = "invalid"

-                    CREATE (co3:Commit)<-[:COMMITED]-(:GitHubUser {login: "author #3"})
+                    CREATE (co3:Commit)<-[:COMMITTED]-(:GitHubUser {login: "author #3"})
                     SET
                         co3.`commit.author.name` = "Author#3",
                         co3.`commit.message` = "Issue #3 is resolved!",
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py
index c6bb05cd..62edd6d1 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py
@@ -29,7 +29,7 @@ def test_get_single_commit_single_repo_no_from_date(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(user:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",
@@ -70,7 +70,7 @@ def test_get_single_commit_single_repo_with_from_date(self):
             session.execute_write(
                 lambda tx: tx.run(
                     """
-                    CREATE (co:Commit)<-[:COMMITED]-(user:GitHubUser {login: "author #1"})
+                    CREATE (co:Commit)<-[:COMMITTED]-(user:GitHubUser {login: "author #1"})
                     SET
                         co.`commit.author.name` = "Author#1",
                         co.`commit.message` = "Issue #1 is resolved!",

From 43cdc7b9da6a6b02044e3c9affd9e87fb2437f0c Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Wed, 13 Mar 2024 17:14:14 +0330
Subject: [PATCH 10/10] fix: converted list to set so element order is not
 compared!
---
 .../tests/integration/test_github_etl_get_org_repos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_get_org_repos.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_get_org_repos.py
index 88e17f4b..41dab6e0 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_get_org_repos.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_get_org_repos.py
@@ -51,4 +51,4 @@ def test_fetch_multiple_repo(self):
             )
         )
         repo_ids = get_github_organization_repos(github_organization_id=org_id)
-        self.assertEqual(repo_ids, [100, 101, 102])
+        self.assertEqual(set(repo_ids), set([100, 101, 102]))
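
Note on the fix above: comparing sets checks membership only, so a
duplicated repo id would go unnoticed. An order-insensitive assertion
that also checks multiplicity (an alternative sketch, not part of this
patch; assertCountEqual is the unittest built-in for this) would be:

    self.assertCountEqual(repo_ids, [100, 101, 102])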