diff --git a/Dockerfile b/Dockerfile index 9578f4ce..15303f18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,10 +4,12 @@ COPY . . RUN chmod +x init.sh USER airflow RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt +RUN python -m spacy download en_core_web_sm FROM python:3.11-bullseye AS test WORKDIR /project COPY . . RUN pip install -r requirements.txt +RUN python -m spacy download en_core_web_sm RUN chmod +x docker-entrypoint.sh CMD ["./docker-entrypoint.sh"] diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py index dfb540bd..0f262e6d 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py @@ -7,6 +7,7 @@ def fetch_raw_messages( guild_id: str, selected_channels: list[str], from_date: datetime, + **kwargs, ) -> list[dict]: """ fetch rawinfo messages from mongodb database @@ -20,6 +21,10 @@ def fetch_raw_messages( from_date : datetime get the raw data from a specific date default is None, meaning get all the messages + kwargs : dict + min_word_limit : int + the minimum words that the messages shuold contain + default is 8 characters Returns -------- @@ -29,6 +34,8 @@ def fetch_raw_messages( client = MongoSingleton.get_instance().get_client() user_ids = get_real_users(guild_id) + min_word_limit = kwargs.get("min_word_limit", 15) + cursor = ( client[guild_id]["rawinfos"] .find( @@ -38,6 +45,7 @@ def fetch_raw_messages( "createdDate": {"$gte": from_date}, "isGeneratedByWebhook": False, "channelId": {"$in": selected_channels}, + "$expr": {"$gt": [{"$strLenCP": "$content"}, min_word_limit]}, } ) .sort("createdDate", 1) diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index cf04ddbe..5bd865ba 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -3,7 +3,6 @@ from typing import Any from hivemind_etl_helpers.src.db.discord.utils.content_parser import ( - check_no_content_only_links, remove_empty_str, remove_none_from_list, ) @@ -14,14 +13,12 @@ from hivemind_etl_helpers.src.db.discord.utils.prepare_raw_message_ids import ( prepare_raw_message_ids, ) -from hivemind_etl_helpers.src.db.discord.utils.prepare_raw_message_urls import ( - prepare_raw_message_urls, -) from hivemind_etl_helpers.src.db.discord.utils.prepare_reactions_id import ( prepare_raction_ids, ) from hivemind_etl_helpers.src.db.globals import DATE_FORMAT from llama_index.core import Document +from tc_hivemind_backend.db.utils.preprocess_text import BasePreprocessor def transform_discord_raw_messages( @@ -160,7 +157,7 @@ def prepare_document( roles=dict(zip(role_ids, role_names)), users=dict(zip(mention_ids, mention_names)), ) - content_url_updated, url_reference = prepare_raw_message_urls(content) + # content_url_updated, url_reference = prepare_raw_message_urls(content) # always has length 1 assert len(author_name) == 1, "Either None or multiple authors!" @@ -202,8 +199,8 @@ def prepare_document( msg_meta_data["reactors_global_name"] = reactors_gname if reactors_nickname != []: msg_meta_data["reactors_nicknames"] = reactors_nickname - if url_reference != {}: - msg_meta_data["url_reference"] = url_reference + # if url_reference != {}: + # msg_meta_data["url_reference"] = url_reference if replier_name is not None: msg_meta_data["replier_username"] = replier_name[0] @@ -214,18 +211,18 @@ def prepare_document( if role_names != []: msg_meta_data["role_mentions"] = role_names - if content_url_updated == "": - raise ValueError("Message with Empty content!") + # if content_url_updated == "": + # raise ValueError("Message with Empty content!") - if check_no_content_only_links(content_url_updated): - raise ValueError("Message just did have urls") + if not BasePreprocessor().extract_main_content(text=content): + raise ValueError("Message didn't hold any valuable information!") # removing null characters - content_url_updated = re.sub(r"[\x00-\x1F\x7F]", "", content_url_updated) + content = re.sub(r"[\x00-\x1F\x7F]", "", content) doc: Document if not exclude_metadata: - doc = Document(text=content_url_updated, metadata=msg_meta_data) + doc = Document(text=content, metadata=msg_meta_data) doc.excluded_embed_metadata_keys = [ "channel", "date", @@ -239,7 +236,7 @@ def prepare_document( "reactors_global_name", "reactors_nicknames", "thread", - "url_reference", + # "url_reference", "replier_username", "replier_global_name", "replier_nickname", @@ -256,7 +253,7 @@ def prepare_document( "reactors_global_name", "reactors_nicknames", "thread", - "url_reference", + # "url_reference", "replier_username", "replier_global_name", "replier_nickname", @@ -264,6 +261,6 @@ def prepare_document( "url", ] else: - doc = Document(text=content_url_updated) + doc = Document(text=content) return doc diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py index f70cef03..4a5a1c91 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py @@ -122,7 +122,7 @@ def test_fetch_raw_messages_fetch_all(self): data = { "type": 0, "author": users_id[i], - "content": f"test_message {np.random.randint(0, 10)}", + "content": f"{np.random.randint(0, 10)} Apples are falling from trees!", "user_mentions": [], "role_mentions": [], "reactions": [], @@ -247,7 +247,7 @@ def test_fetch_raw_messages_fetch_from_date(self): data = { "type": 0, "author": users_id[i], - "content": f"test_message {np.random.randint(0, 10)}", + "content": f"Apples falling from trees {np.random.randint(0, 10)}", "user_mentions": [], "role_mentions": [], "reactions": [], @@ -286,3 +286,181 @@ def test_fetch_raw_messages_fetch_from_date(self): # Check if the fetched messages are equal to the expected messages self.assertCountEqual(messages, expected_messages) + + def test_fetch_raw_messages_fetch_limited_characters(self): + """ + fetch raw messages and do filtering + """ + client = MongoSingleton.get_instance().client + + guild_id = "1234" + channels = ["111111", "22222"] + users_id = ["user1", "user2", "user3", "user4", "user5"] + guild_id = "1234" + self.setup_db( + channels=channels, + guild_id=guild_id, + ) + + for user in users_id: + is_bot = False + if user == "user3": + is_bot = True + + client[guild_id]["guildmembers"].insert_one( + { + "discordId": user, + "username": f"username_{user}", + "roles": None, + "joinedAt": datetime(2023, 1, 1), + "avatar": None, + "isBot": is_bot, + "discriminator": "0", + "permissions": None, + "deletedAt": None, + "globalName": None, + "nickname": None, + } + ) + + # Dropping any previous data + client[guild_id].drop_collection("rawinfos") + + # Insert messages with different dates + raw_data = [] + + data = { + "type": 0, + "author": users_id[0], + "content": "AA", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime(2023, 10, 1), + "messageId": str(np.random.randint(1000000, 9999999)), + "channelId": channels[0], + "channelName": f"general {channels[0]}", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + raw_data.append(data) + + data = { + "type": 0, + "author": users_id[1], + "content": "A sample text with more than 15 characters!", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime(2023, 10, 1), + "messageId": str(np.random.randint(1000000, 9999999)), + "channelId": channels[1], + "channelName": f"general {channels[1]}", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + raw_data.append(data) + + client[guild_id]["rawinfos"].insert_many(raw_data) + + messages = fetch_raw_messages( + guild_id, + selected_channels=channels, + from_date=datetime(2023, 9, 20), + ) + # Check if the fetched messages are equal to the expected messages + self.assertEqual(len(messages), 1) + + def test_fetch_raw_messages_fetch_limited_characters_specified(self): + """ + fetch raw messages and do filtering with a specified value + """ + client = MongoSingleton.get_instance().client + + guild_id = "1234" + channels = ["111111", "22222"] + users_id = ["user1", "user2", "user3", "user4", "user5"] + guild_id = "1234" + self.setup_db( + channels=channels, + guild_id=guild_id, + ) + + for user in users_id: + is_bot = False + if user == "user3": + is_bot = True + + client[guild_id]["guildmembers"].insert_one( + { + "discordId": user, + "username": f"username_{user}", + "roles": None, + "joinedAt": datetime(2023, 1, 1), + "avatar": None, + "isBot": is_bot, + "discriminator": "0", + "permissions": None, + "deletedAt": None, + "globalName": None, + "nickname": None, + } + ) + + # Dropping any previous data + client[guild_id].drop_collection("rawinfos") + + # Insert messages with different dates + raw_data = [] + + data = { + "type": 0, + "author": users_id[0], + "content": "AA", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime(2023, 10, 1), + "messageId": str(np.random.randint(1000000, 9999999)), + "channelId": channels[0], + "channelName": f"general {channels[0]}", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + raw_data.append(data) + + data = { + "type": 0, + "author": users_id[1], + "content": "A sample text with more than 8 characters!", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime(2023, 10, 1), + "messageId": str(np.random.randint(1000000, 9999999)), + "channelId": channels[1], + "channelName": f"general {channels[1]}", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + raw_data.append(data) + + client[guild_id]["rawinfos"].insert_many(raw_data) + + # Fetch messages from a specific date (October 3, 2023) + messages = fetch_raw_messages( + guild_id, + selected_channels=channels, + from_date=datetime(2023, 9, 20), + min_word_limit=1, + ) + # Check if the fetched messages are equal to the expected messages + self.assertEqual(len(messages), 2) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py index f92ce950..87dbb4c4 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py @@ -17,7 +17,7 @@ def setup_db( create_platform: bool = True, guild_id: str = "1234", ): - client = MongoSingleton.get_instance().client + client = MongoSingleton.get_instance().get_client() community_id = ObjectId("9f59dd4f38f3474accdc8f24") platform_id = ObjectId("063a2a74282db2c00fbc2428") @@ -83,7 +83,7 @@ def setup_db( ) def test_transform_two_data(self): - client = MongoSingleton.get_instance().client + client = MongoSingleton.get_instance().get_client() channels = ["111111", "22222"] guild_id = "1234" @@ -99,7 +99,7 @@ def test_transform_two_data(self): data = { "type": 0, "author": "111", - "content": "test_message1", + "content": "test_message1 making it longer!", "user_mentions": [], "role_mentions": [], "reactions": [], @@ -308,18 +308,16 @@ def test_transform_two_data(self): "date": datetime(2023, 5, 8).strftime("%Y-%m-%d %H:%M:%S"), "author_username": "user1", "author_global_name": "user1_GlobalName", - "url_reference": {"[URL0]": "https://www.google.com"}, "thread": None, "url": "https://discord.com/channels/1234/111111/10000000003", } - print(documents[0].text) + self.assertDictEqual(documents[0].metadata, expected_metadata_0) self.assertDictEqual(documents[1].metadata, expected_metadata_1) self.assertDictEqual(documents[2].metadata, expected_metadata_2) self.assertDictEqual(documents[3].metadata, expected_metadata_3) - # Optionally, you can also check the text separately if needed - self.assertEqual(documents[0].text, "test_message1") + self.assertEqual(documents[0].text, "test_message1 making it longer!") self.assertEqual(documents[1].text, "mentioning a person user3") self.assertEqual(documents[2].text, "mentioning user3 user4 role1") - self.assertEqual(documents[3].text, "test_message1 [URL0]") + self.assertEqual(documents[3].text, "test_message1 https://www.google.com") diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py index 3c5fa5af..8c324460 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py @@ -205,7 +205,6 @@ def test_transform_two_data(self): "author_username": "user1", "author_global_name": "user1_GlobalName", "author_nickname": "user1_nickname", - "url_reference": {"[URL0]": "https://www.google.com"}, "url": f"https://discord.com/channels/{guild_id}/1313133/1111111113", "thread": None, } @@ -215,8 +214,7 @@ def test_transform_two_data(self): self.assertDictEqual(documents[2].metadata, expected_metadata_2) self.assertDictEqual(documents[3].metadata, expected_metadata_3) - # Optionally, you can also check the text separately if needed self.assertEqual(documents[0].text, "test_message1") self.assertEqual(documents[1].text, "mentioning a person user3") self.assertEqual(documents[2].text, "mentioning user3 user4 role1") - self.assertEqual(documents[3].text, "test_message1 [URL0]") + self.assertEqual(documents[3].text, "test_message1 https://www.google.com") diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py index 4922fb5b..b8855470 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py @@ -124,7 +124,7 @@ def _create_and_save_doc(self, table: str, guild_id: str, dbname: str): { "type": 0, "author": "111", - "content": "test_message1", + "content": "Apples are falling from trees 1", "user_mentions": [], "role_mentions": [], "reactions": [], @@ -185,7 +185,7 @@ def test_save_documents(self): text, _, metadata = data[0] # nickname was `None`, so it wasn't included in metadata - self.assertEqual(text, "test_message1") + self.assertEqual(text, "Apples are falling from trees 1") self.assertEqual(metadata["channel"], documents[0].metadata["channel"]) self.assertEqual( metadata["author_username"], documents[0].metadata["author_username"] diff --git a/requirements.txt b/requirements.txt index dd69b10f..82ab0e11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ cohere==4.56 neo4j>=5.14.1, <6.0.0 python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 -tc-hivemind-backend==1.4.0 +tc-hivemind-backend==1.4.2 traceloop-sdk>=0.15.2, <0.16.0 beautifulsoup4==4.12.3 llama-index-readers-notion==0.1.6 @@ -22,4 +22,5 @@ tc-wikipedia-lib==1.0.1 llama-index-readers-file==0.1.22 docx2txt==0.8 tc-analyzer-lib==1.4.13 -pydantic==2.9.2 \ No newline at end of file +pydantic==2.9.2 +spacy==3.7.5 \ No newline at end of file