From 4797380f4c78087cb409b929eeb6ce4ecfb73198 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 21 Nov 2024 10:05:59 +0330 Subject: [PATCH 1/7] feat: removed website ETL codes! those will be moved to temporal service. --- .../src/utils/modules/website.py | 63 ------------- dags/hivemind_etl_helpers/website_etl.py | 94 ------------------- dags/hivemind_website_ingestion.py | 57 ----------- 3 files changed, 214 deletions(-) delete mode 100644 dags/hivemind_etl_helpers/src/utils/modules/website.py delete mode 100644 dags/hivemind_etl_helpers/website_etl.py delete mode 100644 dags/hivemind_website_ingestion.py diff --git a/dags/hivemind_etl_helpers/src/utils/modules/website.py b/dags/hivemind_etl_helpers/src/utils/modules/website.py deleted file mode 100644 index d0901935..00000000 --- a/dags/hivemind_etl_helpers/src/utils/modules/website.py +++ /dev/null @@ -1,63 +0,0 @@ -import logging - -from .modules_base import ModulesBase - - -class ModulesWebsite(ModulesBase): - def __init__(self) -> None: - self.platform_name = "website" - super().__init__() - - def get_learning_platforms( - self, - ) -> list[dict[str, str | list[str]]]: - """ - Get all the website communities with their page titles. - - Returns - --------- - community_orgs : list[dict[str, str | list[str]]] = [] - a list of website data information - - example data output: - ``` - [{ - "community_id": "6579c364f1120850414e0dc5", - "platform_id": "6579c364f1120850414e0dc6", - "urls": ["link1", "link2"], - }] - ``` - """ - modules = self.query(platform=self.platform_name, projection={"name": 0}) - communities_data: list[dict[str, str | list[str]]] = [] - - for module in modules: - community = module["community"] - - # each platform of the community - for platform in module["options"]["platforms"]: - if platform["name"] != self.platform_name: - continue - - platform_id = platform["platform"] - - try: - website_links = self.get_platform_metadata( - platform_id=platform_id, - metadata_name="resources", - ) - - communities_data.append( - { - "community_id": str(community), - "platform_id": str(platform_id), - "urls": website_links, - } - ) - except Exception as exp: - logging.error( - "Exception while fetching website modules " - f"for platform: {platform_id} | exception: {exp}" - ) - - return communities_data diff --git a/dags/hivemind_etl_helpers/website_etl.py b/dags/hivemind_etl_helpers/website_etl.py deleted file mode 100644 index e66958fd..00000000 --- a/dags/hivemind_etl_helpers/website_etl.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Any - -from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline -from hivemind_etl_helpers.src.db.website.crawlee_client import CrawleeClient -from llama_index.core import Document - - -class WebsiteETL: - def __init__( - self, - community_id: str, - ) -> None: - """ - Parameters - ----------- - community_id : str - the community to save its data - """ - self.community_id = community_id - collection_name = "website" - - # preparing the data extractor and ingestion pipelines - self.crawlee_client = CrawleeClient() - self.ingestion_pipeline = CustomIngestionPipeline( - self.community_id, collection_name=collection_name - ) - - async def extract( - self, - urls: list[str], - ) -> list[dict[str, Any]]: - """ - Extract given urls - - Parameters - ----------- - urls : list[str] - a list of urls - - Returns - --------- - extracted_data : list[dict[str, Any]] - The crawled data from urls - """ - if not urls: - raise ValueError("No URLs provided for crawling") - extracted_data = await self.crawlee_client.crawl(urls) - - if not extracted_data: - raise ValueError(f"No data extracted from URLs: {urls}") - - return extracted_data - - def transform(self, raw_data: list[dict[str, Any]]) -> list[Document]: - """ - transform raw data to llama-index documents - - Parameters - ------------ - raw_data : list[dict[str, Any]] - crawled data - - Returns - --------- - documents : list[llama_index.Document] - list of llama-index documents - """ - documents: list[Document] = [] - - for data in raw_data: - doc_id = data["url"] - doc = Document( - doc_id=doc_id, - text=data["inner_text"], - metadata={ - "title": data["title"], - "url": data["url"], - }, - ) - documents.append(doc) - - return documents - - def load(self, documents: list[Document]) -> None: - """ - load the documents into the vector db - - Parameters - ------------- - documents: list[llama_index.Document] - the llama-index documents to be ingested - """ - # loading data into db - self.ingestion_pipeline.run_pipeline(docs=documents) diff --git a/dags/hivemind_website_ingestion.py b/dags/hivemind_website_ingestion.py deleted file mode 100644 index 52e26fcd..00000000 --- a/dags/hivemind_website_ingestion.py +++ /dev/null @@ -1,57 +0,0 @@ -import asyncio -import logging -from datetime import datetime - -from airflow import DAG -from airflow.decorators import task -from dotenv import load_dotenv -from hivemind_etl_helpers.src.utils.modules import ModulesWebsite -from hivemind_etl_helpers.website_etl import WebsiteETL - -with DAG( - dag_id="website_ingestion_embedding", - start_date=datetime(2024, 1, 1), - schedule_interval="0 3 * * *", - catchup=False, - max_active_runs=1, -) as dag: - - @task - def get_website_communities() -> list[dict[str, str | datetime | list]]: - """ - Retrieve all communities with associated website URLs from the database. - - Returns: - list[dict]: List of community information containing: - - community_id (str) - - platform_id (str) - - urls (list) - """ - communities = ModulesWebsite().get_learning_platforms() - return communities - - @task - def start_website_embedding(community_info: dict[str, str | datetime | list]): - load_dotenv() - community_id = community_info["community_id"] - platform_id = community_info["platform_id"] - urls = community_info["urls"] - - logging.info( - f"Processing community_id: {community_id} | platform_id: {platform_id}" - ) - website_etl = WebsiteETL(community_id=community_id) - - # Extract - raw_data = asyncio.run(website_etl.extract(urls=urls)) - # transform - documents = website_etl.transform(raw_data=raw_data) - # load into db - website_etl.load(documents=documents) - - logging.info( - f"Community {community_id} Job finished | platform_id: {platform_id}" - ) - - communities_info = get_website_communities() - start_website_embedding.expand(community_info=communities_info) From fca520ec8d32cae93ec5592b9db3c4f693d9fcae Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 21 Nov 2024 10:18:39 +0330 Subject: [PATCH 2/7] feat: Using shared codes and removing duplicate codes! --- .../analyzer_helper/common/fetch_platforms.py | 2 +- .../common/load_transformed_data.py | 2 +- .../common/load_transformed_members.py | 2 +- .../discord/discord_extract_raw_infos.py | 2 +- .../discord/discord_extract_raw_members.py | 2 +- .../discord/discord_load_transformed_data.py | 2 +- .../discord_load_transformed_members.py | 2 +- .../discord/fetch_discord_platforms.py | 2 +- .../discord/utils/is_user_bot.py | 2 +- .../discourse/extract_raw_data.py | 2 +- .../discourse/extract_raw_members.py | 2 +- .../telegram/extract_raw_data.py | 2 +- .../telegram/extract_raw_members.py | 2 +- .../test_telegram_extract_raw_data.py | 2 +- .../test_discord_extract_raw_info.py | 2 +- .../test_discord_extract_raw_members.py | 2 +- .../integration/test_discord_is_user_bot.py | 2 +- .../test_discord_load_transformed_data.py | 2 +- .../test_discord_load_transformed_members.py | 2 +- .../test_discord_transform_raw_data.py | 2 +- .../test_discourse_extract_raw_data.py | 2 +- ...est_integration_fetch_discord_platforms.py | 2 +- .../unit/test_unit_fetch_discord_platforms.py | 2 +- dags/hivemind_etl_helpers/github_etl.py | 2 +- .../ingestion_pipeline.py | 186 ------------------ dags/hivemind_etl_helpers/mediawiki_etl.py | 2 +- dags/hivemind_etl_helpers/notion_etl.py | 2 +- .../src/db/discord/fetch_raw_messages.py | 2 +- .../src/db/discord/find_guild_id.py | 2 +- .../src/db/discord/utils/id_transform.py | 2 +- .../src/db/telegram/utils/module.py | 2 +- .../src/db/telegram/utils/platform.py | 2 +- .../src/utils/credentials.py | 63 ------ .../src/utils/modules/discord.py | 2 +- .../src/utils/modules/discourse.py | 2 +- .../src/utils/modules/gdrive.py | 2 +- .../src/utils/modules/github.py | 2 +- .../src/utils/modules/mediawiki.py | 2 +- .../src/utils/modules/modules_base.py | 132 ------------- .../src/utils/modules/notion.py | 2 +- dags/hivemind_etl_helpers/src/utils/mongo.py | 44 ----- dags/hivemind_etl_helpers/src/utils/redis.py | 39 ---- .../test_discord_convert_role_id_to_name.py | 2 +- .../test_discord_convert_user_id_to_name.py | 2 +- .../test_discord_fetch_modules_channels.py | 2 +- .../test_discord_fetch_raw_messages.py | 2 +- ...test_discord_fetch_raw_messages_grouped.py | 2 +- .../integration/test_discord_find_guild_id.py | 2 +- ...test_discord_merge_user_ids_fetch_names.py | 2 +- .../test_discord_prepare_document_from_db.py | 2 +- .../test_discord_prepare_grouped_data.py | 2 +- .../integration/test_discord_prepare_llama.py | 2 +- .../test_discord_prepare_summary.py | 2 +- .../test_discord_prepare_thread_summaries.py | 2 +- .../test_gdrive_get_communities_org.py | 2 +- .../test_get_all_discord_communities.py | 2 +- .../test_get_discourse_community_data.py | 2 +- .../test_github_get_communities_org.py | 2 +- .../test_ingestion_pipeline_etl.py | 2 +- .../tests/integration/test_load_envs.py | 50 ----- .../integration/test_mediawiki_modules.py | 2 +- .../test_modules_base_query_token.py | 2 +- .../tests/integration/test_notion_modules.py | 2 +- .../test_pg_vector_access_with_discord.py | 2 +- .../integration/test_telegram_comminity.py | 2 +- dags/hivemind_google_drive_etl.py | 2 +- dags/hivemind_telegram_etl.py | 2 +- dags/violation_detection_helpers/extract.py | 2 +- dags/violation_detection_helpers/load.py | 2 +- dags/violation_detection_helpers/modules.py | 2 +- .../tests/integration/test_extract_data.py | 2 +- .../tests/integration/test_load_data.py | 2 +- .../integration/test_retrieve_modeules.py | 2 +- requirements.txt | 4 +- 74 files changed, 68 insertions(+), 584 deletions(-) delete mode 100644 dags/hivemind_etl_helpers/ingestion_pipeline.py delete mode 100644 dags/hivemind_etl_helpers/src/utils/credentials.py delete mode 100644 dags/hivemind_etl_helpers/src/utils/modules/modules_base.py delete mode 100644 dags/hivemind_etl_helpers/src/utils/mongo.py delete mode 100644 dags/hivemind_etl_helpers/src/utils/redis.py delete mode 100644 dags/hivemind_etl_helpers/tests/integration/test_load_envs.py diff --git a/dags/analyzer_helper/common/fetch_platforms.py b/dags/analyzer_helper/common/fetch_platforms.py index 89705daa..c1164d8c 100644 --- a/dags/analyzer_helper/common/fetch_platforms.py +++ b/dags/analyzer_helper/common/fetch_platforms.py @@ -1,5 +1,5 @@ from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class FetchPlatforms: diff --git a/dags/analyzer_helper/common/load_transformed_data.py b/dags/analyzer_helper/common/load_transformed_data.py index dc75fa95..5141007e 100644 --- a/dags/analyzer_helper/common/load_transformed_data.py +++ b/dags/analyzer_helper/common/load_transformed_data.py @@ -3,7 +3,7 @@ from analyzer_helper.common.base.load_transformed_data_base import ( LoadTransformedDataBase, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class LoadTransformedData(LoadTransformedDataBase): diff --git a/dags/analyzer_helper/common/load_transformed_members.py b/dags/analyzer_helper/common/load_transformed_members.py index eac64e19..7935f872 100644 --- a/dags/analyzer_helper/common/load_transformed_members.py +++ b/dags/analyzer_helper/common/load_transformed_members.py @@ -3,7 +3,7 @@ from analyzer_helper.common.base.load_transformed_members_base import ( LoadTransformedMembersBase, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class LoadTransformedMembers(LoadTransformedMembersBase): diff --git a/dags/analyzer_helper/discord/discord_extract_raw_infos.py b/dags/analyzer_helper/discord/discord_extract_raw_infos.py index 661ee3bf..c36a583b 100644 --- a/dags/analyzer_helper/discord/discord_extract_raw_infos.py +++ b/dags/analyzer_helper/discord/discord_extract_raw_infos.py @@ -2,7 +2,7 @@ from datetime import datetime, timezone from analyzer_helper.discord.extract_raw_info_base import ExtractRawInfosBase -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class DiscordExtractRawInfos(ExtractRawInfosBase): diff --git a/dags/analyzer_helper/discord/discord_extract_raw_members.py b/dags/analyzer_helper/discord/discord_extract_raw_members.py index 86cc5dff..b4f32ff3 100644 --- a/dags/analyzer_helper/discord/discord_extract_raw_members.py +++ b/dags/analyzer_helper/discord/discord_extract_raw_members.py @@ -1,7 +1,7 @@ import logging from analyzer_helper.discord.extract_raw_member_base import ExtractRawMembersBase -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class DiscordExtractRawMembers(ExtractRawMembersBase): diff --git a/dags/analyzer_helper/discord/discord_load_transformed_data.py b/dags/analyzer_helper/discord/discord_load_transformed_data.py index be3dd93d..5000e163 100644 --- a/dags/analyzer_helper/discord/discord_load_transformed_data.py +++ b/dags/analyzer_helper/discord/discord_load_transformed_data.py @@ -1,7 +1,7 @@ import logging from analyzer_helper.discord.load_transformed_data_base import LoadTransformedDataBase -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class DiscordLoadTransformedData(LoadTransformedDataBase): diff --git a/dags/analyzer_helper/discord/discord_load_transformed_members.py b/dags/analyzer_helper/discord/discord_load_transformed_members.py index d9c55e1d..52486e79 100644 --- a/dags/analyzer_helper/discord/discord_load_transformed_members.py +++ b/dags/analyzer_helper/discord/discord_load_transformed_members.py @@ -3,7 +3,7 @@ from analyzer_helper.discord.load_transformed_members_base import ( LoadTransformedMembersBase, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class DiscordLoadTransformedMembers(LoadTransformedMembersBase): diff --git a/dags/analyzer_helper/discord/fetch_discord_platforms.py b/dags/analyzer_helper/discord/fetch_discord_platforms.py index faebf4cd..58574513 100644 --- a/dags/analyzer_helper/discord/fetch_discord_platforms.py +++ b/dags/analyzer_helper/discord/fetch_discord_platforms.py @@ -1,5 +1,5 @@ from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class FetchDiscordPlatforms: diff --git a/dags/analyzer_helper/discord/utils/is_user_bot.py b/dags/analyzer_helper/discord/utils/is_user_bot.py index b5568302..41bd81df 100644 --- a/dags/analyzer_helper/discord/utils/is_user_bot.py +++ b/dags/analyzer_helper/discord/utils/is_user_bot.py @@ -1,4 +1,4 @@ -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class UserBotChecker: diff --git a/dags/analyzer_helper/discourse/extract_raw_data.py b/dags/analyzer_helper/discourse/extract_raw_data.py index a3ba78d4..54ec39df 100644 --- a/dags/analyzer_helper/discourse/extract_raw_data.py +++ b/dags/analyzer_helper/discourse/extract_raw_data.py @@ -5,7 +5,7 @@ DateTimeFormatConverter, ) from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class ExtractRawInfo: diff --git a/dags/analyzer_helper/discourse/extract_raw_members.py b/dags/analyzer_helper/discourse/extract_raw_members.py index a152a92e..62e16a80 100644 --- a/dags/analyzer_helper/discourse/extract_raw_members.py +++ b/dags/analyzer_helper/discourse/extract_raw_members.py @@ -4,7 +4,7 @@ DateTimeFormatConverter, ) from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class ExtractRawMembers: diff --git a/dags/analyzer_helper/telegram/extract_raw_data.py b/dags/analyzer_helper/telegram/extract_raw_data.py index 1c6a43bd..5303f9b4 100644 --- a/dags/analyzer_helper/telegram/extract_raw_data.py +++ b/dags/analyzer_helper/telegram/extract_raw_data.py @@ -5,7 +5,7 @@ DateTimeFormatConverter, ) from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class ExtractRawInfo: diff --git a/dags/analyzer_helper/telegram/extract_raw_members.py b/dags/analyzer_helper/telegram/extract_raw_members.py index 0ec2f382..fc624afe 100644 --- a/dags/analyzer_helper/telegram/extract_raw_members.py +++ b/dags/analyzer_helper/telegram/extract_raw_members.py @@ -2,7 +2,7 @@ DateTimeFormatConverter, ) from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from pymongo import DESCENDING diff --git a/dags/analyzer_helper/telegram/tests/integration/test_telegram_extract_raw_data.py b/dags/analyzer_helper/telegram/tests/integration/test_telegram_extract_raw_data.py index 7e07e000..fea1b512 100644 --- a/dags/analyzer_helper/telegram/tests/integration/test_telegram_extract_raw_data.py +++ b/dags/analyzer_helper/telegram/tests/integration/test_telegram_extract_raw_data.py @@ -3,7 +3,7 @@ from analyzer_helper.telegram.extract_raw_data import ExtractRawInfo from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestExtractRawInfo(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/integration/test_discord_extract_raw_info.py b/dags/analyzer_helper/tests/integration/test_discord_extract_raw_info.py index 9f395e65..1b42c554 100644 --- a/dags/analyzer_helper/tests/integration/test_discord_extract_raw_info.py +++ b/dags/analyzer_helper/tests/integration/test_discord_extract_raw_info.py @@ -3,7 +3,7 @@ from analyzer_helper.discord.discord_extract_raw_infos import DiscordExtractRawInfos from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestDiscordExtractRawInfos(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/integration/test_discord_extract_raw_members.py b/dags/analyzer_helper/tests/integration/test_discord_extract_raw_members.py index c7689c48..16a0217c 100644 --- a/dags/analyzer_helper/tests/integration/test_discord_extract_raw_members.py +++ b/dags/analyzer_helper/tests/integration/test_discord_extract_raw_members.py @@ -2,7 +2,7 @@ from datetime import datetime from analyzer_helper.discord.discord_extract_raw_members import DiscordExtractRawMembers -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestDiscordExtractRawMembers(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/integration/test_discord_is_user_bot.py b/dags/analyzer_helper/tests/integration/test_discord_is_user_bot.py index f35a9ed2..ce551eb0 100644 --- a/dags/analyzer_helper/tests/integration/test_discord_is_user_bot.py +++ b/dags/analyzer_helper/tests/integration/test_discord_is_user_bot.py @@ -3,7 +3,7 @@ from analyzer_helper.discord.utils.is_user_bot import UserBotChecker from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestUserBotChecker(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/integration/test_discord_load_transformed_data.py b/dags/analyzer_helper/tests/integration/test_discord_load_transformed_data.py index 17d70937..0515584f 100644 --- a/dags/analyzer_helper/tests/integration/test_discord_load_transformed_data.py +++ b/dags/analyzer_helper/tests/integration/test_discord_load_transformed_data.py @@ -4,7 +4,7 @@ from analyzer_helper.discord.discord_load_transformed_data import ( DiscordLoadTransformedData, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestDiscordLoadTransformedData(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/integration/test_discord_load_transformed_members.py b/dags/analyzer_helper/tests/integration/test_discord_load_transformed_members.py index 77e0f49d..4de76450 100644 --- a/dags/analyzer_helper/tests/integration/test_discord_load_transformed_members.py +++ b/dags/analyzer_helper/tests/integration/test_discord_load_transformed_members.py @@ -4,7 +4,7 @@ from analyzer_helper.discord.discord_load_transformed_members import ( DiscordLoadTransformedMembers, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestDiscordLoadTransformedMembers(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/integration/test_discord_transform_raw_data.py b/dags/analyzer_helper/tests/integration/test_discord_transform_raw_data.py index dc4a3849..0011d5c6 100644 --- a/dags/analyzer_helper/tests/integration/test_discord_transform_raw_data.py +++ b/dags/analyzer_helper/tests/integration/test_discord_transform_raw_data.py @@ -4,7 +4,7 @@ from analyzer_helper.discord.discord_transform_raw_data import DiscordTransformRawData from analyzer_helper.discord.utils.is_user_bot import UserBotChecker from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestDiscordTransformRawData(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py b/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py index 1c191f92..12f1978e 100644 --- a/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py +++ b/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py @@ -3,7 +3,7 @@ from analyzer_helper.discourse.extract_raw_data import ExtractRawInfo from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestExtractRawInfo(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/integration/test_integration_fetch_discord_platforms.py b/dags/analyzer_helper/tests/integration/test_integration_fetch_discord_platforms.py index 997e2b47..0183804d 100644 --- a/dags/analyzer_helper/tests/integration/test_integration_fetch_discord_platforms.py +++ b/dags/analyzer_helper/tests/integration/test_integration_fetch_discord_platforms.py @@ -3,7 +3,7 @@ from analyzer_helper.discord.fetch_discord_platforms import FetchDiscordPlatforms from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestFetchDiscordPlatforms(unittest.TestCase): diff --git a/dags/analyzer_helper/tests/unit/test_unit_fetch_discord_platforms.py b/dags/analyzer_helper/tests/unit/test_unit_fetch_discord_platforms.py index 4a0e2e46..c34a27cb 100644 --- a/dags/analyzer_helper/tests/unit/test_unit_fetch_discord_platforms.py +++ b/dags/analyzer_helper/tests/unit/test_unit_fetch_discord_platforms.py @@ -4,7 +4,7 @@ from analyzer_helper.discord.fetch_discord_platforms import FetchDiscordPlatforms from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestFetchDiscordPlatformsUnit(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index 648e462a..e747e2aa 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -2,7 +2,7 @@ from datetime import datetime from dotenv import load_dotenv -from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.github.extract import ( GithubExtraction, fetch_issues, diff --git a/dags/hivemind_etl_helpers/ingestion_pipeline.py b/dags/hivemind_etl_helpers/ingestion_pipeline.py deleted file mode 100644 index 5de2b933..00000000 --- a/dags/hivemind_etl_helpers/ingestion_pipeline.py +++ /dev/null @@ -1,186 +0,0 @@ -import logging -from datetime import datetime - -from dateutil.parser import parse -from hivemind_etl_helpers.src.utils.credentials import load_redis_credentials -from hivemind_etl_helpers.src.utils.mongo import get_mongo_uri -from hivemind_etl_helpers.src.utils.redis import RedisSingleton -from llama_index.core import Document, MockEmbedding -from llama_index.core.ingestion import ( - DocstoreStrategy, - IngestionCache, - IngestionPipeline, -) -from llama_index.core.node_parser import SemanticSplitterNodeParser -from llama_index.core.schema import BaseNode -from llama_index.storage.docstore.mongodb import MongoDocumentStore -from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache -from qdrant_client.conversions import common_types as qdrant_types -from qdrant_client.http import models -from tc_hivemind_backend.db.credentials import load_postgres_credentials -from tc_hivemind_backend.db.qdrant import QdrantSingleton -from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams -from tc_hivemind_backend.embeddings.cohere import CohereEmbedding -from tc_hivemind_backend.qdrant_vector_access import QDrantVectorAccess - - -class CustomIngestionPipeline: - def __init__(self, community_id: str, collection_name: str, testing: bool = False): - self.community_id = community_id - self.qdrant_client = QdrantSingleton.get_instance().client - - _, self.embedding_dim = load_model_hyperparams() - self.pg_creds = load_postgres_credentials() - self.redis_cred = load_redis_credentials() - self.collection_name = f"{community_id}_{collection_name}" - self.platform_name = collection_name - - self.embed_model = ( - CohereEmbedding() - if not testing - else MockEmbedding(embed_dim=self.embedding_dim) - ) - self.redis_client = RedisSingleton.get_instance().get_client() - - def run_pipeline(self, docs: list[Document]) -> list[BaseNode]: - """ - vectorize and ingest data into a qdrant collection - - Note: This will handle duplicate documents by doing an upsert operation. - - Parameters - ------------ - docs : list[llama_index.Document] - list of llama-index documents - - Returns - --------- - nodes : list[BaseNode] - The set of transformed and loaded Nodes/Documents - (transformation is chunking and embedding of data) - """ - # qdrant is just collection based and doesn't have any database - logging.info( - f"{len(docs)} docuemnts was extracted and now loading into QDrant DB!" - ) - vector_access = QDrantVectorAccess(collection_name=self.collection_name) - vector_store = vector_access.setup_qdrant_vector_store() - - pipeline = IngestionPipeline( - transformations=[ - SemanticSplitterNodeParser(embed_model=self.embed_model), - self.embed_model, - ], - docstore=MongoDocumentStore.from_uri( - uri=get_mongo_uri(), - db_name=f"docstore_{self.community_id}", - namespace=self.platform_name, - ), - vector_store=vector_store, - cache=IngestionCache( - cache=RedisCache.from_redis_client(self.redis_client), - collection=f"{self.collection_name}_ingestion_cache", - docstore_strategy=DocstoreStrategy.UPSERTS, - ), - docstore_strategy=DocstoreStrategy.UPSERTS, - ) - logging.info("Pipeline created, now inserting documents into pipeline!") - - nodes = pipeline.run(documents=docs, show_progress=True) - return nodes - - def _create_payload_index( - self, - field_name: str, - field_schema: qdrant_types.PayloadSchemaType, - ) -> qdrant_types.UpdateResult: - """ - Creates an index on a field under the payload of points in qdrant db - - Note: this could be used for payload fields that we want to scroll for after - - Parameters - ------------ - field_name : str - the field name under points' payload to create the index for - field_schema : qdrant_client.conversions.common_types.PayloadSchemaType - the schema type of the field - - Returns - ----------- - operation_result : qdrant_client.conversions.common_types.UpdateResult - the payload index creation type - """ - operation_result = self.qdrant_client.create_payload_index( - collection_name=self.collection_name, - field_name=field_name, - field_schema=field_schema, - ) - - return operation_result - - def get_latest_document_date( - self, - field_name: str, - field_schema: qdrant_types.PayloadSchemaType = models.PayloadSchemaType.FLOAT, - ) -> datetime | None: - """ - get the latest date for the most recent available document - - NOTE: the given `field_name` under the points' schema MUST CONTAIN A VALUE HAVING DATE FORMAT (or a string format). If not, errors might raise in result of this function - - Parameters - ------------ - field_name : str - the datetime field name in qdrant points' payload - field_schema : qdrant_client.conversions.common_types.PayloadSchemaType - the date field schema - for default we're assuming it is a float timestamp - but it also could be DATETIME - - Returns - --------- - latest_date : datetime.datetime | None - the datetime for the document containing the latest date - if no document or any errors raised, we would return `None` - """ - latest_date: datetime | None = None - try: - result = self._create_payload_index( - field_name=field_name, - field_schema=field_schema, - ) - if result.status.name == "COMPLETED": - latest_document = self.qdrant_client.scroll( - collection_name=self.collection_name, - limit=1, - with_payload=True, - order_by=models.OrderBy( - key=field_name, - direction=models.Direction.DESC, - ), - ) - - if not latest_document[0]: - logging.info("No documents found in the collection.") - latest_date = None - else: - date_field = latest_document[0][0].payload[field_name] - - # if it was float timestamp - if field_schema == models.PayloadSchemaType.FLOAT: - latest_date = datetime.fromtimestamp(date_field) - - # it should be datetime in any other case - else: - latest_date = parse(date_field) - - else: - raise ValueError( - f"Index not created successfully! index creation result: {result}" - ) - except Exception as exp: - logging.error(f"Error: {exp} while loading latest point!") - latest_date = None - - return latest_date diff --git a/dags/hivemind_etl_helpers/mediawiki_etl.py b/dags/hivemind_etl_helpers/mediawiki_etl.py index 7bbec510..f5b9d1b7 100644 --- a/dags/hivemind_etl_helpers/mediawiki_etl.py +++ b/dags/hivemind_etl_helpers/mediawiki_etl.py @@ -1,6 +1,6 @@ import logging -from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.mediawiki.extractor import MediaWikiExtractor diff --git a/dags/hivemind_etl_helpers/notion_etl.py b/dags/hivemind_etl_helpers/notion_etl.py index 1a859aaf..9a69419b 100644 --- a/dags/hivemind_etl_helpers/notion_etl.py +++ b/dags/hivemind_etl_helpers/notion_etl.py @@ -1,6 +1,6 @@ import logging -from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py index 7e02b12d..dfb540bd 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py @@ -1,6 +1,6 @@ from datetime import datetime -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton def fetch_raw_messages( diff --git a/dags/hivemind_etl_helpers/src/db/discord/find_guild_id.py b/dags/hivemind_etl_helpers/src/db/discord/find_guild_id.py index 6df3a30c..af4f88be 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/find_guild_id.py +++ b/dags/hivemind_etl_helpers/src/db/discord/find_guild_id.py @@ -1,5 +1,5 @@ from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton def find_guild_id_by_platform_id(platform_id: str) -> str: diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/id_transform.py b/dags/hivemind_etl_helpers/src/db/discord/utils/id_transform.py index 919395e1..271f17a0 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/id_transform.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/id_transform.py @@ -1,5 +1,5 @@ from hivemind_etl_helpers.src.db.discord.utils.sort_based_id import sort_based_on_id -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton def convert_user_id( diff --git a/dags/hivemind_etl_helpers/src/db/telegram/utils/module.py b/dags/hivemind_etl_helpers/src/db/telegram/utils/module.py index 2df34c7d..bef96550 100644 --- a/dags/hivemind_etl_helpers/src/db/telegram/utils/module.py +++ b/dags/hivemind_etl_helpers/src/db/telegram/utils/module.py @@ -2,7 +2,7 @@ from datetime import datetime, timezone from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TelegramModules: diff --git a/dags/hivemind_etl_helpers/src/db/telegram/utils/platform.py b/dags/hivemind_etl_helpers/src/db/telegram/utils/platform.py index a9b21aaf..42f21d7b 100644 --- a/dags/hivemind_etl_helpers/src/db/telegram/utils/platform.py +++ b/dags/hivemind_etl_helpers/src/db/telegram/utils/platform.py @@ -1,7 +1,7 @@ from datetime import datetime, timezone from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TelegramPlatform: diff --git a/dags/hivemind_etl_helpers/src/utils/credentials.py b/dags/hivemind_etl_helpers/src/utils/credentials.py deleted file mode 100644 index 5458f0a7..00000000 --- a/dags/hivemind_etl_helpers/src/utils/credentials.py +++ /dev/null @@ -1,63 +0,0 @@ -import os - -from dotenv import load_dotenv - - -def load_mongo_credentials() -> dict[str, str]: - """ - load mongo db credentials from .env - - Returns: - --------- - mongo_creds : dict[str, Any] - mongodb credentials - a dictionary representive of - `user`: str - `password` : str - `host` : str - `port` : int - """ - load_dotenv() - - mongo_creds = {} - - mongo_creds["user"] = os.getenv("MONGODB_USER", "root") - mongo_creds["password"] = os.getenv("MONGODB_PASS", "pass") - mongo_creds["host"] = os.getenv("MONGODB_HOST", "mongo") - mongo_creds["port"] = os.getenv("MONGODB_PORT", 27017) - - return mongo_creds - - -def load_redis_credentials() -> dict[str, str]: - """ - load redis db credentials from .env - - Returns: - --------- - redis_creds : dict[str, Any] - redis credentials - a dictionary representive of - `password` : str - `host` : str - `port` : int - """ - load_dotenv() - - host = os.getenv("REDIS_HOST") - port = os.getenv("REDIS_PORT") - password = os.getenv("REDIS_PASSWORD") - - if host is None: - raise ValueError("`REDIS_HOST` is not set in env credentials!") - if port is None: - raise ValueError("`REDIS_PORT` is not set in env credentials!") - if password is None: - raise ValueError("`REDIS_PASSWORD` is not set in env credentials!") - - redis_creds: dict[str, str] = { - "host": host, - "port": port, - "password": password, - } - return redis_creds diff --git a/dags/hivemind_etl_helpers/src/utils/modules/discord.py b/dags/hivemind_etl_helpers/src/utils/modules/discord.py index 080ad6d6..983714ce 100644 --- a/dags/hivemind_etl_helpers/src/utils/modules/discord.py +++ b/dags/hivemind_etl_helpers/src/utils/modules/discord.py @@ -1,6 +1,6 @@ from datetime import datetime -from .modules_base import ModulesBase +from tc_hivemind_backend.db.modules_base import ModulesBase class ModulesDiscord(ModulesBase): diff --git a/dags/hivemind_etl_helpers/src/utils/modules/discourse.py b/dags/hivemind_etl_helpers/src/utils/modules/discourse.py index fbff1c2e..2b94c5c3 100644 --- a/dags/hivemind_etl_helpers/src/utils/modules/discourse.py +++ b/dags/hivemind_etl_helpers/src/utils/modules/discourse.py @@ -1,6 +1,6 @@ from datetime import datetime -from .modules_base import ModulesBase +from tc_hivemind_backend.db.modules_base import ModulesBase class ModulesDiscourse(ModulesBase): diff --git a/dags/hivemind_etl_helpers/src/utils/modules/gdrive.py b/dags/hivemind_etl_helpers/src/utils/modules/gdrive.py index 685637c5..21fc7988 100644 --- a/dags/hivemind_etl_helpers/src/utils/modules/gdrive.py +++ b/dags/hivemind_etl_helpers/src/utils/modules/gdrive.py @@ -1,7 +1,7 @@ import logging from datetime import datetime -from .modules_base import ModulesBase +from tc_hivemind_backend.db.modules_base import ModulesBase class ModulesGDrive(ModulesBase): diff --git a/dags/hivemind_etl_helpers/src/utils/modules/github.py b/dags/hivemind_etl_helpers/src/utils/modules/github.py index f90ca8e6..e7b41d2c 100644 --- a/dags/hivemind_etl_helpers/src/utils/modules/github.py +++ b/dags/hivemind_etl_helpers/src/utils/modules/github.py @@ -1,7 +1,7 @@ import logging from datetime import datetime -from .modules_base import ModulesBase +from tc_hivemind_backend.db.modules_base import ModulesBase class ModulesGitHub(ModulesBase): diff --git a/dags/hivemind_etl_helpers/src/utils/modules/mediawiki.py b/dags/hivemind_etl_helpers/src/utils/modules/mediawiki.py index db7e3773..9353c6b4 100644 --- a/dags/hivemind_etl_helpers/src/utils/modules/mediawiki.py +++ b/dags/hivemind_etl_helpers/src/utils/modules/mediawiki.py @@ -1,6 +1,6 @@ import logging -from .modules_base import ModulesBase +from tc_hivemind_backend.db.modules_base import ModulesBase class ModulesMediaWiki(ModulesBase): diff --git a/dags/hivemind_etl_helpers/src/utils/modules/modules_base.py b/dags/hivemind_etl_helpers/src/utils/modules/modules_base.py deleted file mode 100644 index 586bc1ac..00000000 --- a/dags/hivemind_etl_helpers/src/utils/modules/modules_base.py +++ /dev/null @@ -1,132 +0,0 @@ -from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton - - -class ModulesBase: - def __init__(self) -> None: - pass - - def query(self, platform: str, **kwargs) -> list[dict]: - """ - query the modules database for to get platforms' metadata - - Parameters - ----------- - platform : str - the platform to choose - it can be `github`, `discourse`, `discord` or etc - **kwargs : dict - projection : dict[str, int] - feature projection on query - - Returns - --------- - modules_docs : list[dict] - all the module documents that have the `platform` within them - """ - client = MongoSingleton.get_instance().client - projection = kwargs.get("projection", {}) - - cursor = client["Core"]["modules"].find( - { - "options.platforms.name": platform, - "name": "hivemind", - }, - projection, - ) - modules_docs = list(cursor) - return modules_docs - - def get_platform_community_ids(self, platform_name: str) -> list[str]: - """ - get all community ids that a platform has - - Parameters - ------------ - platform_name : str - the platform having community id and available for hivemind module - - Returns - -------- - community_ids : list[str] - id of communities that has discord platform and hivemind module enabled - - """ - modules = self.query(platform=platform_name, projection={"community"}) - community_ids = list(map(lambda x: str(x["community"]), modules)) - - return community_ids - - def get_token(self, platform_id: ObjectId, token_type: str) -> str: - """ - get a specific type of token for a platform - This method is called when we needed a token for modules to extract its data - - Parameters - ------------ - platform_id : ObjectId - the platform id that we want their token - token_type : str - the type of token. i.e. `google_refresh` - - Returns - -------- - token : str - the token that was required for module's ETL process - """ - client = MongoSingleton.get_instance().client - - user_id = self.get_platform_metadata(platform_id, "userId") - user_id = ObjectId(user_id) - token_doc = client["Core"]["tokens"].find_one( - { - "user": user_id, - "type": token_type, - }, - { - "token": 1, - }, - sort=[("createdAt", -1)], - ) - if token_doc is None: - raise ValueError( - f"No Token for the given user {user_id} " - "in tokens collection of the Core database!" - ) - token = token_doc["token"] - return token - - def get_platform_metadata( - self, platform_id: ObjectId, metadata_name: str - ) -> str | dict | list: - """ - get the userid that belongs to a platform - - Parameters - ----------- - platform_id : bson.ObjectId - the platform id we need their owner user id - metadata_name : str - a specific field of metadata that we want - - Returns - --------- - metadata_value : Any - the values that the metadata belongs to - """ - client = MongoSingleton.get_instance().get_client() - - platform = client["Core"]["platforms"].find_one( - { - "_id": platform_id, - "disconnectedAt": None, - }, - { - f"metadata.{metadata_name}": 1, - }, - ) - if platform is None: - raise ValueError(f"No platform available given platform id: {platform_id}") - - metadata_field = platform["metadata"][metadata_name] - return metadata_field diff --git a/dags/hivemind_etl_helpers/src/utils/modules/notion.py b/dags/hivemind_etl_helpers/src/utils/modules/notion.py index 778a5ad5..81238112 100644 --- a/dags/hivemind_etl_helpers/src/utils/modules/notion.py +++ b/dags/hivemind_etl_helpers/src/utils/modules/notion.py @@ -1,6 +1,6 @@ import logging -from .modules_base import ModulesBase +from tc_hivemind_backend.db.modules_base import ModulesBase class ModulesNotion(ModulesBase): diff --git a/dags/hivemind_etl_helpers/src/utils/mongo.py b/dags/hivemind_etl_helpers/src/utils/mongo.py deleted file mode 100644 index 46e0e30f..00000000 --- a/dags/hivemind_etl_helpers/src/utils/mongo.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging - -from pymongo import MongoClient - -from .credentials import load_mongo_credentials - - -class MongoSingleton: - __instance = None - - def __init__(self): - if MongoSingleton.__instance is not None: - raise Exception("This class is a singleton!") - else: - connection_uri = get_mongo_uri() - self.client = MongoClient(connection_uri) - MongoSingleton.__instance = self - - @staticmethod - def get_instance(): - if MongoSingleton.__instance is None: - MongoSingleton() - try: - info = MongoSingleton.__instance.client.server_info() - logging.info(f"MongoDB Connected Successfully! server info: {info}") - except Exception as exp: - logging.error(f"MongoDB not connected! exp: {exp}") - - return MongoSingleton.__instance - - def get_client(self): - return self.client - - -def get_mongo_uri() -> str: - mongo_creds = load_mongo_credentials() - user = mongo_creds["user"] - password = mongo_creds["password"] - host = mongo_creds["host"] - port = mongo_creds["port"] - - connection = f"mongodb://{user}:{password}@{host}:{port}" - - return connection diff --git a/dags/hivemind_etl_helpers/src/utils/redis.py b/dags/hivemind_etl_helpers/src/utils/redis.py deleted file mode 100644 index 4554230e..00000000 --- a/dags/hivemind_etl_helpers/src/utils/redis.py +++ /dev/null @@ -1,39 +0,0 @@ -import logging - -import redis -from hivemind_etl_helpers.src.utils.credentials import load_redis_credentials - - -class RedisSingleton: - __instance = None - - def __init__(self): - if RedisSingleton.__instance is not None: - raise Exception("This class is a singleton!") - else: - creds = load_redis_credentials() - self.client = self.create_redis_client(creds) - RedisSingleton.__instance = self - - @staticmethod - def get_instance(): - if RedisSingleton.__instance is None: - RedisSingleton() - try: - info = RedisSingleton.__instance.client.ping() - logging.info(f"Redis Connected Successfully! Ping returned: {info}") - except Exception as exp: - logging.error(f"Redis not connected! exp: {exp}") - - return RedisSingleton.__instance - - def get_client(self): - return self.client - - def create_redis_client(self, redis_creds: dict[str, str]): - return redis.Redis( - host=redis_creds["host"], - port=int(redis_creds["port"]), - password=redis_creds["password"], - decode_responses=True, - ) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_convert_role_id_to_name.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_convert_role_id_to_name.py index 03380803..6bb53256 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_convert_role_id_to_name.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_convert_role_id_to_name.py @@ -1,7 +1,7 @@ import unittest from hivemind_etl_helpers.src.db.discord.utils.id_transform import convert_role_id -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestRoleIdConvert(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_convert_user_id_to_name.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_convert_user_id_to_name.py index 4a608c01..e35e69bd 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_convert_user_id_to_name.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_convert_user_id_to_name.py @@ -4,7 +4,7 @@ import numpy as np from hivemind_etl_helpers.src.db.discord.utils.id_transform import convert_user_id -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestUserIdConvert(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_modules_channels.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_modules_channels.py index 7e75f1f2..3204b2a6 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_modules_channels.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_modules_channels.py @@ -5,7 +5,7 @@ from hivemind_etl_helpers.src.db.discord.fetch_raw_messages import ( fetch_channels_and_from_date, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestDiscordFetchModulesChannels(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py index 04d97c4f..f70cef03 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py @@ -4,7 +4,7 @@ import numpy as np from bson import ObjectId from hivemind_etl_helpers.src.db.discord.fetch_raw_messages import fetch_raw_messages -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestFetchRawMessages(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages_grouped.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages_grouped.py index 25221b27..678cf3b3 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages_grouped.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages_grouped.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.db.discord.fetch_raw_messages import fetch_raw_msg_grouped -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestFetchRawMessagesGrouped(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_find_guild_id.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_find_guild_id.py index 35c10f26..65b8e319 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_find_guild_id.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_find_guild_id.py @@ -5,7 +5,7 @@ from hivemind_etl_helpers.src.db.discord.find_guild_id import ( find_guild_id_by_platform_id, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestFindDiscordGuildId(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_merge_user_ids_fetch_names.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_merge_user_ids_fetch_names.py index c7e75a8b..5cef3900 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_merge_user_ids_fetch_names.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_merge_user_ids_fetch_names.py @@ -6,7 +6,7 @@ from hivemind_etl_helpers.src.db.discord.utils.merge_user_ids_fetch_names import ( merge_user_ids_and_fetch_names, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestMergeUserIdsFetchNames(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py index 4f8a9b82..e72a2ca4 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py @@ -6,7 +6,7 @@ from hivemind_etl_helpers.src.db.discord.discord_raw_message_to_document import ( discord_raw_to_documents, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestTransformRawMsgToDocument(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_grouped_data.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_grouped_data.py index 1380b5dd..256f0471 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_grouped_data.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_grouped_data.py @@ -5,7 +5,7 @@ from hivemind_etl_helpers.src.db.discord.summary.prepare_grouped_data import ( prepare_grouped_data, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestDiscordGroupedDataPreparation(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py index ea1dcb9f..d6710b53 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py @@ -5,7 +5,7 @@ from hivemind_etl_helpers.src.db.discord.utils.transform_discord_raw_messges import ( transform_discord_raw_messages, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestTransformRawMsgToDocument(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py index fdee5426..916f0805 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.db.discord.discord_summary import DiscordSummary -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from llama_index.core import Document, MockEmbedding, Settings from llama_index.core.llms import MockLLM diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py index 3ec1eb5a..c9ceccc4 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py @@ -4,7 +4,7 @@ from hivemind_etl_helpers.src.db.discord.summary.prepare_summaries import ( PrepareSummaries, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from llama_index.core import MockEmbedding, Settings from llama_index.core.llms import MockLLM diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_get_communities_org.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_get_communities_org.py index cc53573d..a666f5c0 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_get_communities_org.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_get_communities_org.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.utils.modules import ModulesGDrive -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestQueryGDriveModulesDB(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_get_all_discord_communities.py b/dags/hivemind_etl_helpers/tests/integration/test_get_all_discord_communities.py index fc876ed8..f12206df 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_get_all_discord_communities.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_get_all_discord_communities.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.utils.modules import ModulesDiscord -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestGetAllDiscordCommunitites(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_get_discourse_community_data.py b/dags/hivemind_etl_helpers/tests/integration/test_get_discourse_community_data.py index 0aa4ec70..7fb534c0 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_get_discourse_community_data.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_get_discourse_community_data.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.utils.modules import ModulesDiscourse -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestGetDiscourseCommunityData(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_get_communities_org.py b/dags/hivemind_etl_helpers/tests/integration/test_github_get_communities_org.py index 73808667..54410640 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_get_communities_org.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_get_communities_org.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.utils.modules import ModulesGitHub -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestQueryGitHubModulesDB(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py b/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py index ad74008d..85529186 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py @@ -1,7 +1,7 @@ import unittest from unittest.mock import Mock -from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from llama_index.core.ingestion import IngestionPipeline from llama_index.core.schema import Document diff --git a/dags/hivemind_etl_helpers/tests/integration/test_load_envs.py b/dags/hivemind_etl_helpers/tests/integration/test_load_envs.py deleted file mode 100644 index daec8b08..00000000 --- a/dags/hivemind_etl_helpers/tests/integration/test_load_envs.py +++ /dev/null @@ -1,50 +0,0 @@ -import unittest - -from hivemind_etl_helpers.src.utils.credentials import ( - load_mongo_credentials, - load_redis_credentials, -) -from hivemind_etl_helpers.src.utils.mongo import get_mongo_uri - - -class TestCredentialLoadings(unittest.TestCase): - def test_mongo_envs_check_type(self): - mongo_creds = load_mongo_credentials() - - self.assertIsInstance(mongo_creds, dict) - - def test_mongo_envs_values(self): - mongo_creds = load_mongo_credentials() - - self.assertNotEqual(mongo_creds["user"], "") - self.assertNotEqual(mongo_creds["password"], "") - self.assertNotEqual(mongo_creds["host"], "") - self.assertNotEqual(mongo_creds["port"], "") - - self.assertIsInstance(mongo_creds["user"], str) - self.assertIsInstance(mongo_creds["password"], str) - self.assertIsInstance(mongo_creds["host"], str) - self.assertIsInstance(mongo_creds["port"], str) - - def test_redis_envs_check_type(self): - redis_creds = load_redis_credentials() - - self.assertIsInstance(redis_creds, dict) - - def test_redis_envs_values(self): - redis_creds = load_redis_credentials() - - self.assertIsNotNone(redis_creds["password"]) - self.assertIsNotNone(redis_creds["host"]) - self.assertIsNotNone(redis_creds["port"]) - - self.assertIsInstance(redis_creds["password"], str) - self.assertIsInstance(redis_creds["host"], str) - self.assertIsInstance(redis_creds["port"], str) - - def test_config_mongo_creds(self): - mongo_uri = get_mongo_uri() - - self.assertIsInstance(mongo_uri, str) - self.assertIn("mongodb://", mongo_uri) - self.assertNotIn("None", mongo_uri) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_mediawiki_modules.py b/dags/hivemind_etl_helpers/tests/integration/test_mediawiki_modules.py index 8e6152a4..8fbc7dea 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_mediawiki_modules.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_mediawiki_modules.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.utils.modules import ModulesMediaWiki -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestGetMediaWikiModules(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_modules_base_query_token.py b/dags/hivemind_etl_helpers/tests/integration/test_modules_base_query_token.py index 635f3910..e0590e01 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_modules_base_query_token.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_modules_base_query_token.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.utils.modules.modules_base import ModulesBase -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestModulesBaseQueryToken(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_notion_modules.py b/dags/hivemind_etl_helpers/tests/integration/test_notion_modules.py index 974ada0f..a4dc35c4 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_notion_modules.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_notion_modules.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.utils.modules import ModulesNotion -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestGetNotionModules(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py index ebde5115..1a28ee77 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py @@ -6,7 +6,7 @@ from hivemind_etl_helpers.src.db.discord.discord_raw_message_to_document import ( discord_raw_to_documents, ) -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from llama_index.core.indices.vector_store import VectorStoreIndex from tc_hivemind_backend.db.credentials import load_postgres_credentials from tc_hivemind_backend.db.pg_db_utils import setup_db diff --git a/dags/hivemind_etl_helpers/tests/integration/test_telegram_comminity.py b/dags/hivemind_etl_helpers/tests/integration/test_telegram_comminity.py index fde932d9..d351f9d7 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_telegram_comminity.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_telegram_comminity.py @@ -3,7 +3,7 @@ from bson import ObjectId from hivemind_etl_helpers.src.db.telegram.utils import TelegramPlatform -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class TestTelegramPlatform(TestCase): diff --git a/dags/hivemind_google_drive_etl.py b/dags/hivemind_google_drive_etl.py index 8a41f85b..21251af8 100644 --- a/dags/hivemind_google_drive_etl.py +++ b/dags/hivemind_google_drive_etl.py @@ -5,7 +5,7 @@ from airflow import DAG from airflow.decorators import task -from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.gdrive.gdrive_loader import GoogleDriveLoader from hivemind_etl_helpers.src.utils.modules import ModulesGDrive diff --git a/dags/hivemind_telegram_etl.py b/dags/hivemind_telegram_etl.py index 8d0d7101..d01b4f8d 100644 --- a/dags/hivemind_telegram_etl.py +++ b/dags/hivemind_telegram_etl.py @@ -5,7 +5,7 @@ from airflow import DAG from airflow.decorators import task from dotenv import load_dotenv -from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.telegram.extract import ( ExtractMessages, ExtractMessagesDaily, diff --git a/dags/violation_detection_helpers/extract.py b/dags/violation_detection_helpers/extract.py index 03cd61bc..f6ef20a2 100644 --- a/dags/violation_detection_helpers/extract.py +++ b/dags/violation_detection_helpers/extract.py @@ -1,7 +1,7 @@ import logging from datetime import datetime -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from pymongo.cursor import Cursor diff --git a/dags/violation_detection_helpers/load.py b/dags/violation_detection_helpers/load.py index d280a3a4..638f19cb 100644 --- a/dags/violation_detection_helpers/load.py +++ b/dags/violation_detection_helpers/load.py @@ -1,6 +1,6 @@ from typing import Any -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from pymongo import UpdateOne diff --git a/dags/violation_detection_helpers/modules.py b/dags/violation_detection_helpers/modules.py index 8afbb536..18921559 100644 --- a/dags/violation_detection_helpers/modules.py +++ b/dags/violation_detection_helpers/modules.py @@ -1,6 +1,6 @@ import logging -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton class ViolationDetectionModules: diff --git a/dags/violation_detection_helpers/tests/integration/test_extract_data.py b/dags/violation_detection_helpers/tests/integration/test_extract_data.py index 6e2a5497..547a9b82 100644 --- a/dags/violation_detection_helpers/tests/integration/test_extract_data.py +++ b/dags/violation_detection_helpers/tests/integration/test_extract_data.py @@ -1,7 +1,7 @@ from datetime import datetime from unittest import TestCase -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from violation_detection_helpers import ExtractPlatformRawData diff --git a/dags/violation_detection_helpers/tests/integration/test_load_data.py b/dags/violation_detection_helpers/tests/integration/test_load_data.py index 45454b69..049c6861 100644 --- a/dags/violation_detection_helpers/tests/integration/test_load_data.py +++ b/dags/violation_detection_helpers/tests/integration/test_load_data.py @@ -2,7 +2,7 @@ from unittest import TestCase from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from violation_detection_helpers import LoadPlatformLabeledData diff --git a/dags/violation_detection_helpers/tests/integration/test_retrieve_modeules.py b/dags/violation_detection_helpers/tests/integration/test_retrieve_modeules.py index 06bccc07..575f063a 100644 --- a/dags/violation_detection_helpers/tests/integration/test_retrieve_modeules.py +++ b/dags/violation_detection_helpers/tests/integration/test_retrieve_modeules.py @@ -2,7 +2,7 @@ from unittest import TestCase from bson import ObjectId -from hivemind_etl_helpers.src.utils.mongo import MongoSingleton +from tc_hivemind_backend.db.mongo import MongoSingleton from violation_detection_helpers.modules import ViolationDetectionModules diff --git a/requirements.txt b/requirements.txt index c4b8f11c..dd69b10f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ cohere==4.56 neo4j>=5.14.1, <6.0.0 python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 -tc-hivemind-backend==1.2.2 +tc-hivemind-backend==1.4.0 traceloop-sdk>=0.15.2, <0.16.0 beautifulsoup4==4.12.3 llama-index-readers-notion==0.1.6 @@ -22,6 +22,4 @@ tc-wikipedia-lib==1.0.1 llama-index-readers-file==0.1.22 docx2txt==0.8 tc-analyzer-lib==1.4.13 -crawlee[playwright]==0.3.8 -defusedxml==0.7.1 pydantic==2.9.2 \ No newline at end of file From b0e6e939c187270cfb930eec0d1bd7eda108f847 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 21 Nov 2024 10:24:56 +0330 Subject: [PATCH 3/7] feat: the related code has been moved to temporal service repo! --- .../tests/unit/test_website_etl.py | 81 ------------------- 1 file changed, 81 deletions(-) delete mode 100644 dags/hivemind_etl_helpers/tests/unit/test_website_etl.py diff --git a/dags/hivemind_etl_helpers/tests/unit/test_website_etl.py b/dags/hivemind_etl_helpers/tests/unit/test_website_etl.py deleted file mode 100644 index e625f1b3..00000000 --- a/dags/hivemind_etl_helpers/tests/unit/test_website_etl.py +++ /dev/null @@ -1,81 +0,0 @@ -from unittest import IsolatedAsyncioTestCase -from unittest.mock import AsyncMock, MagicMock - -from dotenv import load_dotenv -from hivemind_etl_helpers.website_etl import WebsiteETL -from llama_index.core import Document - - -class TestWebsiteETL(IsolatedAsyncioTestCase): - def setUp(self): - """ - Setup for the test cases. Initializes a WebsiteETL instance with mocked dependencies. - """ - load_dotenv() - self.community_id = "test_community" - self.website_etl = WebsiteETL(self.community_id) - self.website_etl.crawlee_client = AsyncMock() - self.website_etl.ingestion_pipeline = MagicMock() - - async def test_extract(self): - """ - Test the extract method. - """ - urls = ["https://example.com"] - mocked_data = [ - { - "url": "https://example.com", - "inner_text": "Example text", - "title": "Example", - } - ] - self.website_etl.crawlee_client.crawl.return_value = mocked_data - - extracted_data = await self.website_etl.extract(urls) - - self.assertEqual(extracted_data, mocked_data) - self.website_etl.crawlee_client.crawl.assert_awaited_once_with(urls) - - def test_transform(self): - """ - Test the transform method. - """ - raw_data = [ - { - "url": "https://example.com", - "inner_text": "Example text", - "title": "Example", - } - ] - expected_documents = [ - Document( - doc_id="https://example.com", - text="Example text", - metadata={"title": "Example", "url": "https://example.com"}, - ) - ] - - documents = self.website_etl.transform(raw_data) - - self.assertEqual(len(documents), len(expected_documents)) - self.assertEqual(documents[0].doc_id, expected_documents[0].doc_id) - self.assertEqual(documents[0].text, expected_documents[0].text) - self.assertEqual(documents[0].metadata, expected_documents[0].metadata) - - def test_load(self): - """ - Test the load method. - """ - documents = [ - Document( - doc_id="https://example.com", - text="Example text", - metadata={"title": "Example", "url": "https://example.com"}, - ) - ] - - self.website_etl.load(documents) - - self.website_etl.ingestion_pipeline.run_pipeline.assert_called_once_with( - docs=documents - ) From 77070589862ee74a831430165d686ba3f03b0c6d Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 21 Nov 2024 10:25:50 +0330 Subject: [PATCH 4/7] feat: remove deleted code import! --- dags/hivemind_etl_helpers/src/utils/modules/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/utils/modules/__init__.py b/dags/hivemind_etl_helpers/src/utils/modules/__init__.py index 98b5e5a4..89c563a4 100644 --- a/dags/hivemind_etl_helpers/src/utils/modules/__init__.py +++ b/dags/hivemind_etl_helpers/src/utils/modules/__init__.py @@ -5,4 +5,3 @@ from .github import ModulesGitHub from .mediawiki import ModulesMediaWiki from .notion import ModulesNotion -from .website import ModulesWebsite From 66d81575183aad9fd8d0f232392239de77040340 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 21 Nov 2024 10:41:49 +0330 Subject: [PATCH 5/7] feat: removed the migrated test case! --- .../test_modules_base_query_token.py | 121 ------------------ 1 file changed, 121 deletions(-) delete mode 100644 dags/hivemind_etl_helpers/tests/integration/test_modules_base_query_token.py diff --git a/dags/hivemind_etl_helpers/tests/integration/test_modules_base_query_token.py b/dags/hivemind_etl_helpers/tests/integration/test_modules_base_query_token.py deleted file mode 100644 index e0590e01..00000000 --- a/dags/hivemind_etl_helpers/tests/integration/test_modules_base_query_token.py +++ /dev/null @@ -1,121 +0,0 @@ -from datetime import datetime, timedelta -from unittest import TestCase - -from bson import ObjectId -from hivemind_etl_helpers.src.utils.modules.modules_base import ModulesBase -from tc_hivemind_backend.db.mongo import MongoSingleton - - -class TestModulesBaseQueryToken(TestCase): - def setUp(self) -> None: - self.client = MongoSingleton.get_instance().get_client() - self.client["Core"].drop_collection("tokens") - self.client["Core"].drop_collection("platforms") - - def test_one_token(self): - sample_user = ObjectId("5d7baf326c8a2e2400000000") - community_id = ObjectId("6579c364f1120850414e0dc5") - sample_token_type = "type1" - sample_token_value = "tokenid12345" - platform_id = ObjectId("6579c364f1120850414e0dc6") - - self.client["Core"]["platforms"].insert_one( - { - "_id": platform_id, - "name": "platform_name", - "metadata": { - "id": "113445975232201081511", - "userId": str(sample_user), - }, - "community": community_id, - "disconnectedAt": None, - "connectedAt": datetime.now(), - "createdAt": datetime.now(), - "updatedAt": datetime.now(), - } - ) - - sample_token_doc = { - "token": sample_token_value, - "user": sample_user, - "type": sample_token_type, - "expires": datetime.now() + timedelta(days=1), - "blacklisted": False, - "createdAt": datetime.now() - timedelta(days=1), - "updatedAt": datetime.now() - timedelta(days=1), - } - self.client["Core"]["tokens"].insert_one(sample_token_doc) - token = ModulesBase().get_token( - platform_id=platform_id, token_type=sample_token_type - ) - - self.assertEqual(token, sample_token_value) - - def test_empty_tokens_collection(self): - platform_id = ObjectId("6579c364f1120850414e0dc6") - sample_token_type = "type1" - with self.assertRaises(ValueError): - _ = ModulesBase().get_token( - platform_id=platform_id, token_type=sample_token_type - ) - - def test_no_platform(self): - sample_user = ObjectId("5d7baf326c8a2e2400000000") - platform_id = ObjectId("6579c364f1120850414e0dc6") - sample_token_type = "type1" - sample_token_value = "tokenid12345" - - sample_token_doc = { - "token": sample_token_value, - "user": sample_user, - "type": sample_token_type, - "expires": datetime.now() + timedelta(days=1), - "blacklisted": False, - "createdAt": datetime.now() - timedelta(days=1), - "updatedAt": datetime.now() - timedelta(days=1), - } - self.client["Core"]["tokens"].insert_one(sample_token_doc) - with self.assertRaises(ValueError): - _ = ModulesBase().get_token( - platform_id=platform_id, token_type=sample_token_type - ) - - def test_no_token(self): - sample_user = ObjectId("5d7baf326c8a2e2400000000") - sample_user_with_no_token = ObjectId("5d7baf326c8a2e2400000001") - - platform_id = ObjectId("6579c364f1120850414e0dc6") - sample_token_type = "type1" - sample_token_value = "tokenid12345" - community_id = ObjectId("6579c364f1120850414e0dc5") - - self.client["Core"]["platforms"].insert_one( - { - "_id": platform_id, - "name": "platform_name", - "metadata": { - "id": "113445975232201081511", - "userId": str(sample_user_with_no_token), - }, - "community": community_id, - "disconnectedAt": None, - "connectedAt": datetime.now(), - "createdAt": datetime.now(), - "updatedAt": datetime.now(), - } - ) - - sample_token_doc = { - "token": sample_token_value, - "user": sample_user, - "type": sample_token_type, - "expires": datetime.now() + timedelta(days=1), - "blacklisted": False, - "createdAt": datetime.now() - timedelta(days=1), - "updatedAt": datetime.now() - timedelta(days=1), - } - self.client["Core"]["tokens"].insert_one(sample_token_doc) - with self.assertRaises(ValueError): - _ = ModulesBase().get_token( - platform_id=platform_id, token_type=sample_token_type - ) From a5b85e7cba8757858529b81dbbb817cb36004d91 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 21 Nov 2024 10:47:45 +0330 Subject: [PATCH 6/7] fix: isort linter issue! --- dags/analyzer_helper/telegram/extract_raw_members.py | 2 +- dags/hivemind_etl_helpers/github_etl.py | 2 +- dags/hivemind_etl_helpers/mediawiki_etl.py | 2 +- dags/hivemind_etl_helpers/notion_etl.py | 2 +- .../tests/integration/test_discord_prepare_summary.py | 2 +- .../tests/integration/test_discord_prepare_thread_summaries.py | 2 +- .../tests/integration/test_ingestion_pipeline_etl.py | 2 +- .../tests/integration/test_pg_vector_access_with_discord.py | 2 +- dags/hivemind_google_drive_etl.py | 2 +- dags/hivemind_telegram_etl.py | 2 +- dags/violation_detection_helpers/extract.py | 2 +- dags/violation_detection_helpers/load.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dags/analyzer_helper/telegram/extract_raw_members.py b/dags/analyzer_helper/telegram/extract_raw_members.py index fc624afe..6b595ebc 100644 --- a/dags/analyzer_helper/telegram/extract_raw_members.py +++ b/dags/analyzer_helper/telegram/extract_raw_members.py @@ -2,8 +2,8 @@ DateTimeFormatConverter, ) from github.neo4j_storage.neo4j_connection import Neo4jConnection -from tc_hivemind_backend.db.mongo import MongoSingleton from pymongo import DESCENDING +from tc_hivemind_backend.db.mongo import MongoSingleton class ExtractRawMembers: diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index e747e2aa..27d7b45c 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -2,7 +2,6 @@ from datetime import datetime from dotenv import load_dotenv -from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.github.extract import ( GithubExtraction, fetch_issues, @@ -13,6 +12,7 @@ ) from hivemind_etl_helpers.src.db.github.transform import GitHubTransformation from llama_index.core import Document +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline def process_github_vectorstore( diff --git a/dags/hivemind_etl_helpers/mediawiki_etl.py b/dags/hivemind_etl_helpers/mediawiki_etl.py index f5b9d1b7..f130adfa 100644 --- a/dags/hivemind_etl_helpers/mediawiki_etl.py +++ b/dags/hivemind_etl_helpers/mediawiki_etl.py @@ -1,7 +1,7 @@ import logging -from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.mediawiki.extractor import MediaWikiExtractor +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline def process_mediawiki_etl( diff --git a/dags/hivemind_etl_helpers/notion_etl.py b/dags/hivemind_etl_helpers/notion_etl.py index 9a69419b..0b1ca673 100644 --- a/dags/hivemind_etl_helpers/notion_etl.py +++ b/dags/hivemind_etl_helpers/notion_etl.py @@ -1,7 +1,7 @@ import logging -from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline class NotionProcessor: diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py index 916f0805..db98e619 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_summary.py @@ -3,9 +3,9 @@ from bson import ObjectId from hivemind_etl_helpers.src.db.discord.discord_summary import DiscordSummary -from tc_hivemind_backend.db.mongo import MongoSingleton from llama_index.core import Document, MockEmbedding, Settings from llama_index.core.llms import MockLLM +from tc_hivemind_backend.db.mongo import MongoSingleton class TestDiscordGroupedDataPreparation(TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py index c9ceccc4..792b75eb 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_thread_summaries.py @@ -4,9 +4,9 @@ from hivemind_etl_helpers.src.db.discord.summary.prepare_summaries import ( PrepareSummaries, ) -from tc_hivemind_backend.db.mongo import MongoSingleton from llama_index.core import MockEmbedding, Settings from llama_index.core.llms import MockLLM +from tc_hivemind_backend.db.mongo import MongoSingleton class TestPrepareSummaries(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py b/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py index 85529186..281426ac 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py @@ -1,9 +1,9 @@ import unittest from unittest.mock import Mock -from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from llama_index.core.ingestion import IngestionPipeline from llama_index.core.schema import Document +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline class TestIngestionPipeline(unittest.TestCase): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py index 1a28ee77..4922fb5b 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py @@ -6,9 +6,9 @@ from hivemind_etl_helpers.src.db.discord.discord_raw_message_to_document import ( discord_raw_to_documents, ) -from tc_hivemind_backend.db.mongo import MongoSingleton from llama_index.core.indices.vector_store import VectorStoreIndex from tc_hivemind_backend.db.credentials import load_postgres_credentials +from tc_hivemind_backend.db.mongo import MongoSingleton from tc_hivemind_backend.db.pg_db_utils import setup_db from tc_hivemind_backend.pg_vector_access import PGVectorAccess diff --git a/dags/hivemind_google_drive_etl.py b/dags/hivemind_google_drive_etl.py index 21251af8..f6a37f7d 100644 --- a/dags/hivemind_google_drive_etl.py +++ b/dags/hivemind_google_drive_etl.py @@ -5,9 +5,9 @@ from airflow import DAG from airflow.decorators import task -from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.gdrive.gdrive_loader import GoogleDriveLoader from hivemind_etl_helpers.src.utils.modules import ModulesGDrive +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline with DAG( dag_id="gdrive_vector_store", diff --git a/dags/hivemind_telegram_etl.py b/dags/hivemind_telegram_etl.py index d01b4f8d..cfa43425 100644 --- a/dags/hivemind_telegram_etl.py +++ b/dags/hivemind_telegram_etl.py @@ -5,7 +5,6 @@ from airflow import DAG from airflow.decorators import task from dotenv import load_dotenv -from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline from hivemind_etl_helpers.src.db.telegram.extract import ( ExtractMessages, ExtractMessagesDaily, @@ -18,6 +17,7 @@ ) from hivemind_etl_helpers.src.db.telegram.utils import TelegramModules, TelegramPlatform from qdrant_client.http import models +from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline # Common DAG configuration default_args = { diff --git a/dags/violation_detection_helpers/extract.py b/dags/violation_detection_helpers/extract.py index f6ef20a2..c21c5cb9 100644 --- a/dags/violation_detection_helpers/extract.py +++ b/dags/violation_detection_helpers/extract.py @@ -1,8 +1,8 @@ import logging from datetime import datetime -from tc_hivemind_backend.db.mongo import MongoSingleton from pymongo.cursor import Cursor +from tc_hivemind_backend.db.mongo import MongoSingleton class ExtractPlatformRawData: diff --git a/dags/violation_detection_helpers/load.py b/dags/violation_detection_helpers/load.py index 638f19cb..51c836e3 100644 --- a/dags/violation_detection_helpers/load.py +++ b/dags/violation_detection_helpers/load.py @@ -1,7 +1,7 @@ from typing import Any -from tc_hivemind_backend.db.mongo import MongoSingleton from pymongo import UpdateOne +from tc_hivemind_backend.db.mongo import MongoSingleton class LoadPlatformLabeledData: From b51d5cab31c95ad8a487308ff4c31f53d5c83d53 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 21 Nov 2024 11:31:09 +0330 Subject: [PATCH 7/7] fix: test cases import! --- .../tests/unit/test_extract_raw_data.py | 2 +- .../tests/unit/test_extract_raw_data_latest_date.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/violation_detection_helpers/tests/unit/test_extract_raw_data.py b/dags/violation_detection_helpers/tests/unit/test_extract_raw_data.py index 5799e262..13415776 100644 --- a/dags/violation_detection_helpers/tests/unit/test_extract_raw_data.py +++ b/dags/violation_detection_helpers/tests/unit/test_extract_raw_data.py @@ -6,7 +6,7 @@ class TestExtractRawData(TestCase): - @patch("hivemind_etl_helpers.src.utils.mongo.MongoSingleton.get_instance") + @patch("tc_hivemind_backend.db.mongo.MongoSingleton.get_instance") def test_extract(self, mock_get_instance): mock_client = MagicMock() diff --git a/dags/violation_detection_helpers/tests/unit/test_extract_raw_data_latest_date.py b/dags/violation_detection_helpers/tests/unit/test_extract_raw_data_latest_date.py index da33e7a7..e7103419 100644 --- a/dags/violation_detection_helpers/tests/unit/test_extract_raw_data_latest_date.py +++ b/dags/violation_detection_helpers/tests/unit/test_extract_raw_data_latest_date.py @@ -6,7 +6,7 @@ class TestExtractRawDataLatestDate(TestCase): - @patch("hivemind_etl_helpers.src.utils.mongo.MongoSingleton.get_instance") + @patch("tc_hivemind_backend.db.mongo.MongoSingleton.get_instance") def test_extract_latest_date(self, mock_get_instance): mock_client = MagicMock()