From ba4c527e3d822647bc94108fee47f6d3bc06577e Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 5 Dec 2024 13:31:21 +0330 Subject: [PATCH 01/23] feat: Added word max limit! --- .../src/db/discord/fetch_raw_messages.py | 8 + .../test_discord_fetch_raw_messages.py | 174 ++++++++++++++++++ 2 files changed, 182 insertions(+) diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py index dfb540bd..17066a44 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py @@ -7,6 +7,7 @@ def fetch_raw_messages( guild_id: str, selected_channels: list[str], from_date: datetime, + **kwargs, ) -> list[dict]: """ fetch rawinfo messages from mongodb database @@ -20,6 +21,10 @@ def fetch_raw_messages( from_date : datetime get the raw data from a specific date default is None, meaning get all the messages + kwargs : dict + min_word_limit : int + the minimum words that the messages shuold contain + default is 8 characters Returns -------- @@ -29,6 +34,8 @@ def fetch_raw_messages( client = MongoSingleton.get_instance().get_client() user_ids = get_real_users(guild_id) + limit_max_words = kwargs.get("min_word_limit", 8) + cursor = ( client[guild_id]["rawinfos"] .find( @@ -38,6 +45,7 @@ def fetch_raw_messages( "createdDate": {"$gte": from_date}, "isGeneratedByWebhook": False, "channelId": {"$in": selected_channels}, + "$where": f"this.content.length > {limit_max_words}", } ) .sort("createdDate", 1) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py index f70cef03..66dfc77c 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py @@ -286,3 +286,177 @@ def test_fetch_raw_messages_fetch_from_date(self): # Check if the fetched messages are equal to the expected messages self.assertCountEqual(messages, expected_messages) + + def test_fetch_raw_messages_fetch_limited_characters(self): + """ + fetch raw messages and do filtering + """ + client = MongoSingleton.get_instance().client + + guild_id = "1234" + channels = ["111111", "22222"] + users_id = ["user1", "user2", "user3", "user4", "user5"] + guild_id = "1234" + self.setup_db( + channels=channels, + guild_id=guild_id, + ) + + for user in users_id: + is_bot = False + if user == "user3": + is_bot = True + + client[guild_id]["guildmembers"].insert_one( + { + "discordId": user, + "username": f"username_{user}", + "roles": None, + "joinedAt": datetime(2023, 1, 1), + "avatar": None, + "isBot": is_bot, + "discriminator": "0", + "permissions": None, + "deletedAt": None, + "globalName": None, + "nickname": None, + } + ) + + # Dropping any previous data + client[guild_id].drop_collection("rawinfos") + + # Insert messages with different dates + raw_data = [] + + data = { + "type": 0, + "author": users_id[0], + "content": "AA", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime(2023, 10, 1), + "messageId": str(np.random.randint(1000000, 9999999)), + "channelId": channels[0], + "channelName": f"general {channels[0]}", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + raw_data.append(data) + + data = { + "type": 0, + "author": users_id[1], + "content": "A sample text with more than 8 characters!", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime(2023, 10, 1), + "messageId": str(np.random.randint(1000000, 9999999)), + "channelId": channels[1], + "channelName": f"general {channels[1]}", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + raw_data.append(data) + + client[guild_id]["rawinfos"].insert_many(raw_data) + + # Fetch messages from a specific date (October 3, 2023) + messages = fetch_raw_messages(guild_id, selected_channels=channels) + # Check if the fetched messages are equal to the expected messages + self.assertCountEqual(messages, 1) + + def test_fetch_raw_messages_fetch_limited_characters(self): + """ + fetch raw messages and do filtering + """ + client = MongoSingleton.get_instance().client + + guild_id = "1234" + channels = ["111111", "22222"] + users_id = ["user1", "user2", "user3", "user4", "user5"] + guild_id = "1234" + self.setup_db( + channels=channels, + guild_id=guild_id, + ) + + for user in users_id: + is_bot = False + if user == "user3": + is_bot = True + + client[guild_id]["guildmembers"].insert_one( + { + "discordId": user, + "username": f"username_{user}", + "roles": None, + "joinedAt": datetime(2023, 1, 1), + "avatar": None, + "isBot": is_bot, + "discriminator": "0", + "permissions": None, + "deletedAt": None, + "globalName": None, + "nickname": None, + } + ) + + # Dropping any previous data + client[guild_id].drop_collection("rawinfos") + + # Insert messages with different dates + raw_data = [] + + data = { + "type": 0, + "author": users_id[0], + "content": "AA", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime(2023, 10, 1), + "messageId": str(np.random.randint(1000000, 9999999)), + "channelId": channels[0], + "channelName": f"general {channels[0]}", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + raw_data.append(data) + + data = { + "type": 0, + "author": users_id[1], + "content": "A sample text with more than 8 characters!", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime(2023, 10, 1), + "messageId": str(np.random.randint(1000000, 9999999)), + "channelId": channels[1], + "channelName": f"general {channels[1]}", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + raw_data.append(data) + + client[guild_id]["rawinfos"].insert_many(raw_data) + + # Fetch messages from a specific date (October 3, 2023) + messages = fetch_raw_messages( + guild_id, + selected_channels=channels, + min_word_limit=1, + ) + # Check if the fetched messages are equal to the expected messages + self.assertCountEqual(messages, 2) From 3a018eb4f9f5a7ec615651ed92d8b73c941d28d9 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 5 Dec 2024 14:28:36 +0330 Subject: [PATCH 02/23] fix: test case to align with the latest codes! --- .../integration/test_discord_fetch_raw_messages.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py index 66dfc77c..6ae30624 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py @@ -367,14 +367,17 @@ def test_fetch_raw_messages_fetch_limited_characters(self): client[guild_id]["rawinfos"].insert_many(raw_data) - # Fetch messages from a specific date (October 3, 2023) - messages = fetch_raw_messages(guild_id, selected_channels=channels) + messages = fetch_raw_messages( + guild_id, + selected_channels=channels, + from_date=datetime(2023, 9, 20), + ) # Check if the fetched messages are equal to the expected messages self.assertCountEqual(messages, 1) - def test_fetch_raw_messages_fetch_limited_characters(self): + def test_fetch_raw_messages_fetch_limited_characters_specified(self): """ - fetch raw messages and do filtering + fetch raw messages and do filtering with a specified value """ client = MongoSingleton.get_instance().client @@ -456,6 +459,7 @@ def test_fetch_raw_messages_fetch_limited_characters(self): messages = fetch_raw_messages( guild_id, selected_channels=channels, + from_date=datetime(2023, 9, 20), min_word_limit=1, ) # Check if the fetched messages are equal to the expected messages From cdcea542b2027413d04236414ad52d8c867d5375 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 5 Dec 2024 14:43:53 +0330 Subject: [PATCH 03/23] fix: naming typo! --- .../hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py index 17066a44..e982a2ee 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py @@ -34,7 +34,7 @@ def fetch_raw_messages( client = MongoSingleton.get_instance().get_client() user_ids = get_real_users(guild_id) - limit_max_words = kwargs.get("min_word_limit", 8) + min_word_limit = kwargs.get("min_word_limit", 8) cursor = ( client[guild_id]["rawinfos"] @@ -45,7 +45,7 @@ def fetch_raw_messages( "createdDate": {"$gte": from_date}, "isGeneratedByWebhook": False, "channelId": {"$in": selected_channels}, - "$where": f"this.content.length > {limit_max_words}", + "$where": f"this.content.length > {min_word_limit}", } ) .sort("createdDate", 1) From 70459fcb8121ab4a9fc095edd2fbd777c3d0ffb0 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 5 Dec 2024 14:44:11 +0330 Subject: [PATCH 04/23] fix: test case assertions! --- .../tests/integration/test_discord_fetch_raw_messages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py index 6ae30624..f5abcd19 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py @@ -373,7 +373,7 @@ def test_fetch_raw_messages_fetch_limited_characters(self): from_date=datetime(2023, 9, 20), ) # Check if the fetched messages are equal to the expected messages - self.assertCountEqual(messages, 1) + self.assertEqual(len(messages), 1) def test_fetch_raw_messages_fetch_limited_characters_specified(self): """ @@ -463,4 +463,4 @@ def test_fetch_raw_messages_fetch_limited_characters_specified(self): min_word_limit=1, ) # Check if the fetched messages are equal to the expected messages - self.assertCountEqual(messages, 2) + self.assertEqual(len(messages), 2) From 9b2beadc64f13314a7e94b11cfe2458e576151f3 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 12:41:39 +0330 Subject: [PATCH 05/23] feat: Added noise cancelling for discord messages! --- Dockerfile | 1 + .../src/db/common/__init__.py | 0 .../src/db/common/base_preprocessor.py | 43 ++++++ .../discord_raw_message_to_document.py | 34 ++++- .../src/db/discord/fetch_raw_messages.py | 4 +- .../src/db/discord/preprocessor.py | 79 ++++++++++ .../tests/unit/test_common_preprocessor.py | 62 ++++++++ .../tests/unit/test_discord_preprocess.py | 43 ++++++ .../unit/test_discord_update_raw_messages.py | 141 ++++++++++++++++++ 9 files changed, 404 insertions(+), 3 deletions(-) create mode 100644 dags/hivemind_etl_helpers/src/db/common/__init__.py create mode 100644 dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py create mode 100644 dags/hivemind_etl_helpers/src/db/discord/preprocessor.py create mode 100644 dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py create mode 100644 dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py create mode 100644 dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py diff --git a/Dockerfile b/Dockerfile index 9578f4ce..88958223 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,7 @@ COPY . . RUN chmod +x init.sh USER airflow RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt +RUN python -m spacy download en_core_web_lg FROM python:3.11-bullseye AS test WORKDIR /project diff --git a/dags/hivemind_etl_helpers/src/db/common/__init__.py b/dags/hivemind_etl_helpers/src/db/common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py b/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py new file mode 100644 index 00000000..1723b776 --- /dev/null +++ b/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py @@ -0,0 +1,43 @@ +import spacy + + +class BasePreprocessor: + def __init__(self) -> None: + pass + + def extract_main_content(self, text: str) -> str: + """ + extract main content of a message + + Parameters + ------------ + text : str + a discord message text + + Returns + -------- + cleaned_text : str + + """ + try: + nlp = spacy.load("en_core_web_lg") + except OSError as exp: + raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp + + doc = nlp(text) + + # Filter out punctuation, whitespace, and numerical values, then extract the lemma for each remaining token + main_content_tokens = [ + token.lemma_ + for token in doc + if not token.is_punct + and not token.is_space + and not token.is_stop + and not token.like_url + and not token.like_num + and token.is_ascii + ] + + # Join the tokens to form the cleaned sentence + cleaned_text = " ".join(main_content_tokens) + return cleaned_text diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py index 190a08c1..bde5ab1d 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py +++ b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py @@ -5,6 +5,7 @@ transform_discord_raw_messages, ) from llama_index.core import Document +from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor def discord_raw_to_documents( @@ -29,6 +30,37 @@ def discord_raw_to_documents( list of messages converted to documents """ raw_mongo_messages = fetch_raw_messages(guild_id, selected_channels, from_date) - messages_docuemnt = transform_discord_raw_messages(guild_id, raw_mongo_messages) + processed_messages = update_raw_messages(raw_data=raw_mongo_messages) + messages_docuemnt = transform_discord_raw_messages(guild_id, processed_messages) return messages_docuemnt + + +def update_raw_messages(raw_data: list[dict]) -> list[dict]: + """ + Update raw messages text by cleaning their data + + Parameters + ----------- + data : list[dict] + a list of raw data fetched from database + each dict hold a 'content' + + Returns + --------- + cleaned_data : list[dict] + a list of dictionaries but with cleaned data + """ + preprocessor = DiscordPreprocessor() + + cleaned_data: list[dict] = [] + for data in raw_data: + content = data.get("content") + if content: + cleaned_content = preprocessor.clean_text(content) + print(cleaned_content) + if cleaned_content: + data["content"] = cleaned_content + cleaned_data.append(data) + + return cleaned_data diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py index e982a2ee..0f262e6d 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py @@ -34,7 +34,7 @@ def fetch_raw_messages( client = MongoSingleton.get_instance().get_client() user_ids = get_real_users(guild_id) - min_word_limit = kwargs.get("min_word_limit", 8) + min_word_limit = kwargs.get("min_word_limit", 15) cursor = ( client[guild_id]["rawinfos"] @@ -45,7 +45,7 @@ def fetch_raw_messages( "createdDate": {"$gte": from_date}, "isGeneratedByWebhook": False, "channelId": {"$in": selected_channels}, - "$where": f"this.content.length > {min_word_limit}", + "$expr": {"$gt": [{"$strLenCP": "$content"}, min_word_limit]}, } ) .sort("createdDate", 1) diff --git a/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py b/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py new file mode 100644 index 00000000..6c01081b --- /dev/null +++ b/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py @@ -0,0 +1,79 @@ +import re + +from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor + + +class DiscordPreprocessor(BasePreprocessor): + """ + preprocess discord text messages + """ + + def __init__(self) -> None: + pass + + def clean_texts(self, texts: list[str]) -> list[str]: + """ + clean the given text + + Parameters + ------------ + texts : list[str] + a list of discord messages text + + Returns + --------- + texts_cleaned : str + the cleaned text + (discord ids removed) + """ + texts_cleaned: list[str] = [] + + for text in texts: + text_cleaned = self.clean_text(text=text) + texts_cleaned.append(text_cleaned) + + return texts_cleaned + + def clean_text(self, text: str) -> str: + """ + clean the given text + + Parameters + ------------ + text : str + a discord message text + + Returns + --------- + text_cleaned : str + the cleaned text + (discord ids removed) + """ + text_cleaned = self.remove_ids(text=text) + text_cleaned = self.extract_main_content(text=text_cleaned) + + return text_cleaned + + def remove_ids(self, text: str) -> str: + """ + remove the ids that are available in texts + user ids would be with the format of <@number> + and the role ids are in the format of <@&number> + + Parameters + ------------ + text : str + a discord message text + + Returns + --------- + cleaned_text : str + the texts with removed <@number> and <@&number> + """ + pattern = r"<@&?\d+>" + + # Removing matches + cleaned_text = re.sub(pattern, "", text) + + cleaned_text = " ".join(cleaned_text.split()) + return cleaned_text diff --git a/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py b/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py new file mode 100644 index 00000000..e268089e --- /dev/null +++ b/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py @@ -0,0 +1,62 @@ +import unittest +from unittest.mock import patch +import spacy + +from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor + + +class TestDiscordPreprocessor(unittest.TestCase): + def setUp(self): + """Set up test fixtures before each test method.""" + self.preprocessor = BasePreprocessor() + + # Tests for extract_main_content method + @unittest.skipIf( + not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model" + ) + def test_extract_main_content_basic(self): + """Test basic content extraction""" + input_text = "The quick brown fox jumps over the lazy dog!" + result = self.preprocessor.extract_main_content(input_text) + # Note: Expected result will contain lemmatized words without stop words + self.assertIn("quick brown fox jump lazy dog", result.lower()) + + @unittest.skipIf( + not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model" + ) + def test_extract_main_content_with_numbers(self): + """Test content extraction with numbers""" + input_text = "I have 5 apples and 3 oranges." + result = self.preprocessor.extract_main_content(input_text) + self.assertIn("apple orange", result.lower()) + self.assertNotIn("5", result) + self.assertNotIn("3", result) + + @unittest.skipIf( + not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model" + ) + def test_extract_main_content_with_urls(self): + """Test content extraction with URLs""" + input_text = "Check this link: https://example.com for more information" + result = self.preprocessor.extract_main_content(input_text) + self.assertNotIn("https://example.com", result) + self.assertIn("check link information", result.lower()) + + @unittest.skipIf( + not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model" + ) + def test_extract_main_content_empty_string(self): + """Test content extraction with empty string""" + self.assertEqual(self.preprocessor.extract_main_content(""), "") + + def test_extract_main_content_missing_model(self): + """Test handling of missing spacy model""" + with patch("spacy.load") as mock_load: + mock_load.side_effect = OSError("Model not found") + + with self.assertRaises(OSError) as context: + self.preprocessor.extract_main_content("Test text") + + self.assertIn( + "Model spacy `en_core_web_lg` is not installed!", str(context.exception) + ) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py b/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py new file mode 100644 index 00000000..34d0d4a1 --- /dev/null +++ b/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py @@ -0,0 +1,43 @@ +import unittest + +from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor + + +class TestDiscordPreprocessor(unittest.TestCase): + def setUp(self): + """Set up test fixtures before each test method.""" + self.preprocessor = DiscordPreprocessor() + + # Tests for remove_ids method + def test_remove_user_ids(self): + """Test removal of user IDs from text""" + input_text = "Hello <@123456> how are you?" + expected = "Hello how are you?" + self.assertEqual(self.preprocessor.remove_ids(input_text), expected) + + def test_remove_role_ids(self): + """Test removal of role IDs from text""" + input_text = "Welcome <@&789012> to the server!" + expected = "Welcome to the server!" + self.assertEqual(self.preprocessor.remove_ids(input_text), expected) + + def test_remove_multiple_ids(self): + """Test removal of multiple IDs from text""" + input_text = "Hey <@123456> please notify <@&789012> and <@345678>" + expected = "Hey please notify and" + self.assertEqual(self.preprocessor.remove_ids(input_text), expected) + + def test_text_without_ids(self): + """Test text that doesn't contain any IDs""" + input_text = "Regular message without any IDs" + self.assertEqual(self.preprocessor.remove_ids(input_text), input_text) + + def test_empty_string(self): + """Test empty string input""" + self.assertEqual(self.preprocessor.remove_ids(""), "") + + def test_only_whitespace(self): + """Test string with only whitespace""" + input_text = " \t \n " + expected = "" + self.assertEqual(self.preprocessor.remove_ids(input_text), expected) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py b/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py new file mode 100644 index 00000000..90ce9728 --- /dev/null +++ b/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py @@ -0,0 +1,141 @@ +import unittest +from datetime import datetime +import spacy + +from hivemind_etl_helpers.src.db.discord.discord_raw_message_to_document import ( + update_raw_messages, +) + + +class TestUpdateRawMessages(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Load spacy model once for all tests""" + try: + cls.nlp = spacy.load("en_core_web_lg") + except OSError as exp: + raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp + + def test_empty_list(self): + """Test function handles empty input list correctly""" + result = update_raw_messages([]) + self.assertEqual(result, [], "Empty list should return empty list") + + def test_with_content(self): + """Test function processes valid content correctly""" + test_data = [ + { + "type": 0, + "author": "123456", + "content": "test_message <@123456> Hello", + "user_mentions": [], + "role_mentions": [], + "reactions": [], + "replied_user": None, + "createdDate": datetime.now(), + "messageId": "1234567", + "channelId": "89101112", + "channelName": "general", + "threadId": None, + "threadName": None, + "isGeneratedByWebhook": False, + } + ] + + result = update_raw_messages(test_data) + + self.assertEqual(len(result), 1, "Should return one processed message") + self.assertIn("content", result[0], "Processed message should contain content") + # The actual content will depend on spaCy's processing, but should not contain Discord IDs + self.assertNotIn( + "<@123456>", result[0]["content"], "Discord IDs should be removed" + ) + self.assertNotEqual(result[0]["content"], "", "Content should not be empty") + + def test_without_content(self): + """Test function handles messages without content field""" + test_data = [ + { + "type": 0, + "author": "123456", + "user_mentions": [], + "role_mentions": [], + } + ] + + result = update_raw_messages(test_data) + self.assertEqual(result, [], "Message without content should be filtered out") + + def test_with_discord_ids(self): + """Test function removes Discord user and role IDs correctly""" + test_data = [ + { + "content": "Hello <@123456> and <@&789012> how are you?", + "type": 0, + } + ] + + result = update_raw_messages(test_data) + self.assertEqual(len(result), 1, "Should return one processed message") + # Check that Discord IDs are removed + self.assertNotIn("<@123456>", result[0]["content"], "User ID should be removed") + self.assertNotIn( + "<@&789012>", result[0]["content"], "Role ID should be removed" + ) + # Check that meaningful content remains + self.assertIn( + "hello", result[0]["content"].lower(), "Regular words should be preserved" + ) + + def test_consecutive_calls(self): + """Test function handles consecutive calls correctly""" + test_data_1 = [{"content": "Apples are falling from trees!"}] + test_data_2 = [{"content": "second message"}] + + result_1 = update_raw_messages(test_data_1) + result_2 = update_raw_messages(test_data_2) + + self.assertEqual(len(result_1), 1, "First call should return one message") + self.assertEqual(len(result_2), 1, "Second call should return one message") + self.assertNotEqual( + result_1[0]["content"], + result_2[0]["content"], + "Different inputs should produce different outputs", + ) + + def test_multiple_messages(self): + """Test function processes multiple messages correctly""" + test_data = [ + {"content": "first message about cats"}, + {"content": "second message about dogs"}, + {"content": "third message about birds"}, + ] + + result = update_raw_messages(test_data) + self.assertEqual(len(result), 3, "Should return three processed messages") + + # Check that each message has been processed + for i, msg in enumerate(result): + self.assertIn("content", msg, f"Message {i} should have content") + self.assertNotEqual( + msg["content"], "", f"Message {i} content should not be empty" + ) + + def test_special_characters(self): + """Test function handles special characters correctly""" + test_data = [ + { + "content": "Hello! This is a test... With some punctuation!?! @#$%", + "type": 0, + } + ] + + result = update_raw_messages(test_data) + self.assertEqual(len(result), 1, "Should return one processed message") + # The processed content should contain meaningful words but not punctuation + processed_content = result[0]["content"].lower() + self.assertIn("hello", processed_content, "Common words should be preserved") + self.assertIn("test", processed_content, "Common words should be preserved") + self.assertNotIn( + "!?!", processed_content, "Multiple punctuation should be removed" + ) From 3788374e40235e05f1e8c8238953d9f6b554ccb5 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 12:43:18 +0330 Subject: [PATCH 06/23] fix: remove unnecessary print! --- .../src/db/discord/discord_raw_message_to_document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py index bde5ab1d..0c5ddc45 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py +++ b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py @@ -58,7 +58,7 @@ def update_raw_messages(raw_data: list[dict]) -> list[dict]: content = data.get("content") if content: cleaned_content = preprocessor.clean_text(content) - print(cleaned_content) + if cleaned_content: data["content"] = cleaned_content cleaned_data.append(data) From b007eec7f299dafdf0d63bd2611dd32d2065491b Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 12:48:39 +0330 Subject: [PATCH 07/23] fix: Added missing spacy dependency! --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index dd69b10f..2f96c76a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,4 +22,5 @@ tc-wikipedia-lib==1.0.1 llama-index-readers-file==0.1.22 docx2txt==0.8 tc-analyzer-lib==1.4.13 -pydantic==2.9.2 \ No newline at end of file +pydantic==2.9.2 +spacy==3.7.5 \ No newline at end of file From 33d0fb46b2e723ac23d07f4c65fd85d9e67cc8cb Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 13:36:00 +0330 Subject: [PATCH 08/23] feat: updating the messages content! --- .../tests/integration/test_discord_fetch_raw_messages.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py index f5abcd19..4a5a1c91 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py @@ -122,7 +122,7 @@ def test_fetch_raw_messages_fetch_all(self): data = { "type": 0, "author": users_id[i], - "content": f"test_message {np.random.randint(0, 10)}", + "content": f"{np.random.randint(0, 10)} Apples are falling from trees!", "user_mentions": [], "role_mentions": [], "reactions": [], @@ -247,7 +247,7 @@ def test_fetch_raw_messages_fetch_from_date(self): data = { "type": 0, "author": users_id[i], - "content": f"test_message {np.random.randint(0, 10)}", + "content": f"Apples falling from trees {np.random.randint(0, 10)}", "user_mentions": [], "role_mentions": [], "reactions": [], @@ -350,7 +350,7 @@ def test_fetch_raw_messages_fetch_limited_characters(self): data = { "type": 0, "author": users_id[1], - "content": "A sample text with more than 8 characters!", + "content": "A sample text with more than 15 characters!", "user_mentions": [], "role_mentions": [], "reactions": [], From 32eaafb37c4096b81a190a7c9cedaf008e9b7c5c Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 13:40:32 +0330 Subject: [PATCH 09/23] fix: Added spacy model download in test environment! --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 88958223..a30ed742 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,5 +10,6 @@ FROM python:3.11-bullseye AS test WORKDIR /project COPY . . RUN pip install -r requirements.txt +RUN python -m spacy download en_core_web_lg RUN chmod +x docker-entrypoint.sh CMD ["./docker-entrypoint.sh"] From b2b64686828f23747d54d9d8b386dcfda3a47e96 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 13:49:01 +0330 Subject: [PATCH 10/23] fix: aligning with latest codes! we would remove dummy data from database, so we added a message content that holds some information. --- .../tests/integration/test_pg_vector_access_with_discord.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py index 4922fb5b..025d060d 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py @@ -124,7 +124,7 @@ def _create_and_save_doc(self, table: str, guild_id: str, dbname: str): { "type": 0, "author": "111", - "content": "test_message1", + "content": "Apples are falling from trees 1", "user_mentions": [], "role_mentions": [], "reactions": [], From 0a3a6dfdd8eded79cf398a13a4ee2f7f368ea3af Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 13:56:07 +0330 Subject: [PATCH 11/23] fix: align input output data! --- .../tests/integration/test_pg_vector_access_with_discord.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py index 025d060d..96ada0e4 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py @@ -185,7 +185,7 @@ def test_save_documents(self): text, _, metadata = data[0] # nickname was `None`, so it wasn't included in metadata - self.assertEqual(text, "test_message1") + self.assertEqual(text, "apple fall tree") self.assertEqual(metadata["channel"], documents[0].metadata["channel"]) self.assertEqual( metadata["author_username"], documents[0].metadata["author_username"] From 5b4057d0576889ee0749af1b15953d0cdedd1070 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 13:58:10 +0330 Subject: [PATCH 12/23] fix: Added more informative messages! --- .../integration/test_discord_prepare_document_from_db.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py index f92ce950..2c786ef2 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py @@ -99,7 +99,7 @@ def test_transform_two_data(self): data = { "type": 0, "author": "111", - "content": "test_message1", + "content": "Apples are falling from trees1", "user_mentions": [], "role_mentions": [], "reactions": [], @@ -319,7 +319,7 @@ def test_transform_two_data(self): self.assertDictEqual(documents[3].metadata, expected_metadata_3) # Optionally, you can also check the text separately if needed - self.assertEqual(documents[0].text, "test_message1") - self.assertEqual(documents[1].text, "mentioning a person user3") - self.assertEqual(documents[2].text, "mentioning user3 user4 role1") + self.assertEqual(documents[0].text, "apple fall trees1") + self.assertEqual(documents[1].text, "mention person user3") + self.assertEqual(documents[2].text, "mention user3 user4 role1") self.assertEqual(documents[3].text, "test_message1 [URL0]") From cd4e0c7ccd6f85dce8bcc2b5343927f54a94f71c Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 15:37:05 +0330 Subject: [PATCH 13/23] feat: removed the processors! the processor should move to the embedding part and we shouldn't change the actual content of the messages. They have been moved to hivemind_backend library. --- .../src/db/common/__init__.py | 0 .../src/db/common/base_preprocessor.py | 43 ---------- .../discord_raw_message_to_document.py | 34 +------- .../src/db/discord/preprocessor.py | 79 ------------------- .../test_discord_prepare_document_from_db.py | 9 +-- .../tests/unit/test_common_preprocessor.py | 62 --------------- 6 files changed, 5 insertions(+), 222 deletions(-) delete mode 100644 dags/hivemind_etl_helpers/src/db/common/__init__.py delete mode 100644 dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py delete mode 100644 dags/hivemind_etl_helpers/src/db/discord/preprocessor.py delete mode 100644 dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py diff --git a/dags/hivemind_etl_helpers/src/db/common/__init__.py b/dags/hivemind_etl_helpers/src/db/common/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py b/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py deleted file mode 100644 index 1723b776..00000000 --- a/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py +++ /dev/null @@ -1,43 +0,0 @@ -import spacy - - -class BasePreprocessor: - def __init__(self) -> None: - pass - - def extract_main_content(self, text: str) -> str: - """ - extract main content of a message - - Parameters - ------------ - text : str - a discord message text - - Returns - -------- - cleaned_text : str - - """ - try: - nlp = spacy.load("en_core_web_lg") - except OSError as exp: - raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp - - doc = nlp(text) - - # Filter out punctuation, whitespace, and numerical values, then extract the lemma for each remaining token - main_content_tokens = [ - token.lemma_ - for token in doc - if not token.is_punct - and not token.is_space - and not token.is_stop - and not token.like_url - and not token.like_num - and token.is_ascii - ] - - # Join the tokens to form the cleaned sentence - cleaned_text = " ".join(main_content_tokens) - return cleaned_text diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py index 0c5ddc45..190a08c1 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py +++ b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py @@ -5,7 +5,6 @@ transform_discord_raw_messages, ) from llama_index.core import Document -from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor def discord_raw_to_documents( @@ -30,37 +29,6 @@ def discord_raw_to_documents( list of messages converted to documents """ raw_mongo_messages = fetch_raw_messages(guild_id, selected_channels, from_date) - processed_messages = update_raw_messages(raw_data=raw_mongo_messages) - messages_docuemnt = transform_discord_raw_messages(guild_id, processed_messages) + messages_docuemnt = transform_discord_raw_messages(guild_id, raw_mongo_messages) return messages_docuemnt - - -def update_raw_messages(raw_data: list[dict]) -> list[dict]: - """ - Update raw messages text by cleaning their data - - Parameters - ----------- - data : list[dict] - a list of raw data fetched from database - each dict hold a 'content' - - Returns - --------- - cleaned_data : list[dict] - a list of dictionaries but with cleaned data - """ - preprocessor = DiscordPreprocessor() - - cleaned_data: list[dict] = [] - for data in raw_data: - content = data.get("content") - if content: - cleaned_content = preprocessor.clean_text(content) - - if cleaned_content: - data["content"] = cleaned_content - cleaned_data.append(data) - - return cleaned_data diff --git a/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py b/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py deleted file mode 100644 index 6c01081b..00000000 --- a/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py +++ /dev/null @@ -1,79 +0,0 @@ -import re - -from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor - - -class DiscordPreprocessor(BasePreprocessor): - """ - preprocess discord text messages - """ - - def __init__(self) -> None: - pass - - def clean_texts(self, texts: list[str]) -> list[str]: - """ - clean the given text - - Parameters - ------------ - texts : list[str] - a list of discord messages text - - Returns - --------- - texts_cleaned : str - the cleaned text - (discord ids removed) - """ - texts_cleaned: list[str] = [] - - for text in texts: - text_cleaned = self.clean_text(text=text) - texts_cleaned.append(text_cleaned) - - return texts_cleaned - - def clean_text(self, text: str) -> str: - """ - clean the given text - - Parameters - ------------ - text : str - a discord message text - - Returns - --------- - text_cleaned : str - the cleaned text - (discord ids removed) - """ - text_cleaned = self.remove_ids(text=text) - text_cleaned = self.extract_main_content(text=text_cleaned) - - return text_cleaned - - def remove_ids(self, text: str) -> str: - """ - remove the ids that are available in texts - user ids would be with the format of <@number> - and the role ids are in the format of <@&number> - - Parameters - ------------ - text : str - a discord message text - - Returns - --------- - cleaned_text : str - the texts with removed <@number> and <@&number> - """ - pattern = r"<@&?\d+>" - - # Removing matches - cleaned_text = re.sub(pattern, "", text) - - cleaned_text = " ".join(cleaned_text.split()) - return cleaned_text diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py index 2c786ef2..9d6ac778 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py @@ -312,14 +312,13 @@ def test_transform_two_data(self): "thread": None, "url": "https://discord.com/channels/1234/111111/10000000003", } - print(documents[0].text) + self.assertDictEqual(documents[0].metadata, expected_metadata_0) self.assertDictEqual(documents[1].metadata, expected_metadata_1) self.assertDictEqual(documents[2].metadata, expected_metadata_2) self.assertDictEqual(documents[3].metadata, expected_metadata_3) - # Optionally, you can also check the text separately if needed - self.assertEqual(documents[0].text, "apple fall trees1") - self.assertEqual(documents[1].text, "mention person user3") - self.assertEqual(documents[2].text, "mention user3 user4 role1") + self.assertEqual(documents[0].text, "test_message1") + self.assertEqual(documents[1].text, "mentioning a person user3") + self.assertEqual(documents[2].text, "mentioning user3 user4 role1") self.assertEqual(documents[3].text, "test_message1 [URL0]") diff --git a/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py b/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py deleted file mode 100644 index e268089e..00000000 --- a/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py +++ /dev/null @@ -1,62 +0,0 @@ -import unittest -from unittest.mock import patch -import spacy - -from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor - - -class TestDiscordPreprocessor(unittest.TestCase): - def setUp(self): - """Set up test fixtures before each test method.""" - self.preprocessor = BasePreprocessor() - - # Tests for extract_main_content method - @unittest.skipIf( - not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model" - ) - def test_extract_main_content_basic(self): - """Test basic content extraction""" - input_text = "The quick brown fox jumps over the lazy dog!" - result = self.preprocessor.extract_main_content(input_text) - # Note: Expected result will contain lemmatized words without stop words - self.assertIn("quick brown fox jump lazy dog", result.lower()) - - @unittest.skipIf( - not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model" - ) - def test_extract_main_content_with_numbers(self): - """Test content extraction with numbers""" - input_text = "I have 5 apples and 3 oranges." - result = self.preprocessor.extract_main_content(input_text) - self.assertIn("apple orange", result.lower()) - self.assertNotIn("5", result) - self.assertNotIn("3", result) - - @unittest.skipIf( - not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model" - ) - def test_extract_main_content_with_urls(self): - """Test content extraction with URLs""" - input_text = "Check this link: https://example.com for more information" - result = self.preprocessor.extract_main_content(input_text) - self.assertNotIn("https://example.com", result) - self.assertIn("check link information", result.lower()) - - @unittest.skipIf( - not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model" - ) - def test_extract_main_content_empty_string(self): - """Test content extraction with empty string""" - self.assertEqual(self.preprocessor.extract_main_content(""), "") - - def test_extract_main_content_missing_model(self): - """Test handling of missing spacy model""" - with patch("spacy.load") as mock_load: - mock_load.side_effect = OSError("Model not found") - - with self.assertRaises(OSError) as context: - self.preprocessor.extract_main_content("Test text") - - self.assertIn( - "Model spacy `en_core_web_lg` is not installed!", str(context.exception) - ) From d0fb0086409b6b5cabf3b97d07614897a60b9978 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 15:43:17 +0330 Subject: [PATCH 14/23] fix: remove the unrequired processor! --- .../tests/unit/test_discord_preprocess.py | 43 ------------------- 1 file changed, 43 deletions(-) delete mode 100644 dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py b/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py deleted file mode 100644 index 34d0d4a1..00000000 --- a/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py +++ /dev/null @@ -1,43 +0,0 @@ -import unittest - -from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor - - -class TestDiscordPreprocessor(unittest.TestCase): - def setUp(self): - """Set up test fixtures before each test method.""" - self.preprocessor = DiscordPreprocessor() - - # Tests for remove_ids method - def test_remove_user_ids(self): - """Test removal of user IDs from text""" - input_text = "Hello <@123456> how are you?" - expected = "Hello how are you?" - self.assertEqual(self.preprocessor.remove_ids(input_text), expected) - - def test_remove_role_ids(self): - """Test removal of role IDs from text""" - input_text = "Welcome <@&789012> to the server!" - expected = "Welcome to the server!" - self.assertEqual(self.preprocessor.remove_ids(input_text), expected) - - def test_remove_multiple_ids(self): - """Test removal of multiple IDs from text""" - input_text = "Hey <@123456> please notify <@&789012> and <@345678>" - expected = "Hey please notify and" - self.assertEqual(self.preprocessor.remove_ids(input_text), expected) - - def test_text_without_ids(self): - """Test text that doesn't contain any IDs""" - input_text = "Regular message without any IDs" - self.assertEqual(self.preprocessor.remove_ids(input_text), input_text) - - def test_empty_string(self): - """Test empty string input""" - self.assertEqual(self.preprocessor.remove_ids(""), "") - - def test_only_whitespace(self): - """Test string with only whitespace""" - input_text = " \t \n " - expected = "" - self.assertEqual(self.preprocessor.remove_ids(input_text), expected) From 7dfc963fccbc48e888c6e3c6587ee631f34cc210 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 15:56:30 +0330 Subject: [PATCH 15/23] feat: removed unused test cases! the test case didn't belong to any code. --- .../unit/test_discord_update_raw_messages.py | 141 ------------------ 1 file changed, 141 deletions(-) delete mode 100644 dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py b/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py deleted file mode 100644 index 90ce9728..00000000 --- a/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py +++ /dev/null @@ -1,141 +0,0 @@ -import unittest -from datetime import datetime -import spacy - -from hivemind_etl_helpers.src.db.discord.discord_raw_message_to_document import ( - update_raw_messages, -) - - -class TestUpdateRawMessages(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Load spacy model once for all tests""" - try: - cls.nlp = spacy.load("en_core_web_lg") - except OSError as exp: - raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp - - def test_empty_list(self): - """Test function handles empty input list correctly""" - result = update_raw_messages([]) - self.assertEqual(result, [], "Empty list should return empty list") - - def test_with_content(self): - """Test function processes valid content correctly""" - test_data = [ - { - "type": 0, - "author": "123456", - "content": "test_message <@123456> Hello", - "user_mentions": [], - "role_mentions": [], - "reactions": [], - "replied_user": None, - "createdDate": datetime.now(), - "messageId": "1234567", - "channelId": "89101112", - "channelName": "general", - "threadId": None, - "threadName": None, - "isGeneratedByWebhook": False, - } - ] - - result = update_raw_messages(test_data) - - self.assertEqual(len(result), 1, "Should return one processed message") - self.assertIn("content", result[0], "Processed message should contain content") - # The actual content will depend on spaCy's processing, but should not contain Discord IDs - self.assertNotIn( - "<@123456>", result[0]["content"], "Discord IDs should be removed" - ) - self.assertNotEqual(result[0]["content"], "", "Content should not be empty") - - def test_without_content(self): - """Test function handles messages without content field""" - test_data = [ - { - "type": 0, - "author": "123456", - "user_mentions": [], - "role_mentions": [], - } - ] - - result = update_raw_messages(test_data) - self.assertEqual(result, [], "Message without content should be filtered out") - - def test_with_discord_ids(self): - """Test function removes Discord user and role IDs correctly""" - test_data = [ - { - "content": "Hello <@123456> and <@&789012> how are you?", - "type": 0, - } - ] - - result = update_raw_messages(test_data) - self.assertEqual(len(result), 1, "Should return one processed message") - # Check that Discord IDs are removed - self.assertNotIn("<@123456>", result[0]["content"], "User ID should be removed") - self.assertNotIn( - "<@&789012>", result[0]["content"], "Role ID should be removed" - ) - # Check that meaningful content remains - self.assertIn( - "hello", result[0]["content"].lower(), "Regular words should be preserved" - ) - - def test_consecutive_calls(self): - """Test function handles consecutive calls correctly""" - test_data_1 = [{"content": "Apples are falling from trees!"}] - test_data_2 = [{"content": "second message"}] - - result_1 = update_raw_messages(test_data_1) - result_2 = update_raw_messages(test_data_2) - - self.assertEqual(len(result_1), 1, "First call should return one message") - self.assertEqual(len(result_2), 1, "Second call should return one message") - self.assertNotEqual( - result_1[0]["content"], - result_2[0]["content"], - "Different inputs should produce different outputs", - ) - - def test_multiple_messages(self): - """Test function processes multiple messages correctly""" - test_data = [ - {"content": "first message about cats"}, - {"content": "second message about dogs"}, - {"content": "third message about birds"}, - ] - - result = update_raw_messages(test_data) - self.assertEqual(len(result), 3, "Should return three processed messages") - - # Check that each message has been processed - for i, msg in enumerate(result): - self.assertIn("content", msg, f"Message {i} should have content") - self.assertNotEqual( - msg["content"], "", f"Message {i} content should not be empty" - ) - - def test_special_characters(self): - """Test function handles special characters correctly""" - test_data = [ - { - "content": "Hello! This is a test... With some punctuation!?! @#$%", - "type": 0, - } - ] - - result = update_raw_messages(test_data) - self.assertEqual(len(result), 1, "Should return one processed message") - # The processed content should contain meaningful words but not punctuation - processed_content = result[0]["content"].lower() - self.assertIn("hello", processed_content, "Common words should be preserved") - self.assertIn("test", processed_content, "Common words should be preserved") - self.assertNotIn( - "!?!", processed_content, "Multiple punctuation should be removed" - ) From 077ab8e1f03a730ad5ed2f466cef16cd5cd8c377 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 16:17:58 +0330 Subject: [PATCH 16/23] feat: updated data with not removing the urls from actual content! --- .../utils/transform_discord_raw_messges.py | 26 +++++++++---------- .../test_discord_prepare_document_from_db.py | 2 +- .../integration/test_discord_prepare_llama.py | 4 +-- .../test_pg_vector_access_with_discord.py | 2 +- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index cf04ddbe..0cee659f 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -14,9 +14,6 @@ from hivemind_etl_helpers.src.db.discord.utils.prepare_raw_message_ids import ( prepare_raw_message_ids, ) -from hivemind_etl_helpers.src.db.discord.utils.prepare_raw_message_urls import ( - prepare_raw_message_urls, -) from hivemind_etl_helpers.src.db.discord.utils.prepare_reactions_id import ( prepare_raction_ids, ) @@ -160,7 +157,7 @@ def prepare_document( roles=dict(zip(role_ids, role_names)), users=dict(zip(mention_ids, mention_names)), ) - content_url_updated, url_reference = prepare_raw_message_urls(content) + # content_url_updated, url_reference = prepare_raw_message_urls(content) # always has length 1 assert len(author_name) == 1, "Either None or multiple authors!" @@ -202,8 +199,8 @@ def prepare_document( msg_meta_data["reactors_global_name"] = reactors_gname if reactors_nickname != []: msg_meta_data["reactors_nicknames"] = reactors_nickname - if url_reference != {}: - msg_meta_data["url_reference"] = url_reference + # if url_reference != {}: + # msg_meta_data["url_reference"] = url_reference if replier_name is not None: msg_meta_data["replier_username"] = replier_name[0] @@ -214,18 +211,19 @@ def prepare_document( if role_names != []: msg_meta_data["role_mentions"] = role_names - if content_url_updated == "": - raise ValueError("Message with Empty content!") + # if content_url_updated == "": + # raise ValueError("Message with Empty content!") - if check_no_content_only_links(content_url_updated): + # TODO: clean the text with preprocessor and check if there's anything availale in it + if check_no_content_only_links(content): raise ValueError("Message just did have urls") # removing null characters - content_url_updated = re.sub(r"[\x00-\x1F\x7F]", "", content_url_updated) + content = re.sub(r"[\x00-\x1F\x7F]", "", content) doc: Document if not exclude_metadata: - doc = Document(text=content_url_updated, metadata=msg_meta_data) + doc = Document(text=content, metadata=msg_meta_data) doc.excluded_embed_metadata_keys = [ "channel", "date", @@ -239,7 +237,7 @@ def prepare_document( "reactors_global_name", "reactors_nicknames", "thread", - "url_reference", + # "url_reference", "replier_username", "replier_global_name", "replier_nickname", @@ -256,7 +254,7 @@ def prepare_document( "reactors_global_name", "reactors_nicknames", "thread", - "url_reference", + # "url_reference", "replier_username", "replier_global_name", "replier_nickname", @@ -264,6 +262,6 @@ def prepare_document( "url", ] else: - doc = Document(text=content_url_updated) + doc = Document(text=content) return doc diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py index 9d6ac778..09e13050 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py @@ -99,7 +99,7 @@ def test_transform_two_data(self): data = { "type": 0, "author": "111", - "content": "Apples are falling from trees1", + "content": "test_message1", "user_mentions": [], "role_mentions": [], "reactions": [], diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py index 3c5fa5af..8c324460 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py @@ -205,7 +205,6 @@ def test_transform_two_data(self): "author_username": "user1", "author_global_name": "user1_GlobalName", "author_nickname": "user1_nickname", - "url_reference": {"[URL0]": "https://www.google.com"}, "url": f"https://discord.com/channels/{guild_id}/1313133/1111111113", "thread": None, } @@ -215,8 +214,7 @@ def test_transform_two_data(self): self.assertDictEqual(documents[2].metadata, expected_metadata_2) self.assertDictEqual(documents[3].metadata, expected_metadata_3) - # Optionally, you can also check the text separately if needed self.assertEqual(documents[0].text, "test_message1") self.assertEqual(documents[1].text, "mentioning a person user3") self.assertEqual(documents[2].text, "mentioning user3 user4 role1") - self.assertEqual(documents[3].text, "test_message1 [URL0]") + self.assertEqual(documents[3].text, "test_message1 https://www.google.com") diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py index 96ada0e4..b8855470 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py @@ -185,7 +185,7 @@ def test_save_documents(self): text, _, metadata = data[0] # nickname was `None`, so it wasn't included in metadata - self.assertEqual(text, "apple fall tree") + self.assertEqual(text, "Apples are falling from trees 1") self.assertEqual(metadata["channel"], documents[0].metadata["channel"]) self.assertEqual( metadata["author_username"], documents[0].metadata["author_username"] From ed99d65c2c84b9302fde7f94eaa1b7783a961781 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 16:57:24 +0330 Subject: [PATCH 17/23] feat: using latest version of hivemind-backend lib! --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2f96c76a..66ea62eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ cohere==4.56 neo4j>=5.14.1, <6.0.0 python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 -tc-hivemind-backend==1.4.0 +tc-hivemind-backend==1.4.1 traceloop-sdk>=0.15.2, <0.16.0 beautifulsoup4==4.12.3 llama-index-readers-notion==0.1.6 From 866088d4c5165b5cc4021db308fdc1832f57051c Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 9 Dec 2024 16:57:45 +0330 Subject: [PATCH 18/23] feat: checking if there's any valid date in the text! --- .../src/db/discord/utils/transform_discord_raw_messges.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index 0cee659f..4688032e 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -18,6 +18,7 @@ prepare_raction_ids, ) from hivemind_etl_helpers.src.db.globals import DATE_FORMAT +from tc_hivemind_backend.db.utils.preprocess_text import BasePreprocessor from llama_index.core import Document @@ -214,9 +215,8 @@ def prepare_document( # if content_url_updated == "": # raise ValueError("Message with Empty content!") - # TODO: clean the text with preprocessor and check if there's anything availale in it - if check_no_content_only_links(content): - raise ValueError("Message just did have urls") + if BasePreprocessor().extract_main_content(text=content): + raise ValueError("Message didn't hold any valuable information!") # removing null characters content = re.sub(r"[\x00-\x1F\x7F]", "", content) From 96d7ab32fb86f67b617c7f3a6850cdd5365c2118 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 10 Dec 2024 10:08:37 +0330 Subject: [PATCH 19/23] feat: using smaller sized spacy model! --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index a30ed742..15303f18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,12 +4,12 @@ COPY . . RUN chmod +x init.sh USER airflow RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt -RUN python -m spacy download en_core_web_lg +RUN python -m spacy download en_core_web_sm FROM python:3.11-bullseye AS test WORKDIR /project COPY . . RUN pip install -r requirements.txt -RUN python -m spacy download en_core_web_lg +RUN python -m spacy download en_core_web_sm RUN chmod +x docker-entrypoint.sh CMD ["./docker-entrypoint.sh"] From 5634296d52658c67ac82675530db815dac927ca5 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 10 Dec 2024 11:26:44 +0330 Subject: [PATCH 20/23] fix: wrong if condition! --- .../src/db/discord/utils/transform_discord_raw_messges.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index 4688032e..5fd57629 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -215,7 +215,7 @@ def prepare_document( # if content_url_updated == "": # raise ValueError("Message with Empty content!") - if BasePreprocessor().extract_main_content(text=content): + if not BasePreprocessor().extract_main_content(text=content): raise ValueError("Message didn't hold any valuable information!") # removing null characters From d3f8d7d851fc043a0abdfe7ec3971dc62ecbdf4e Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 10 Dec 2024 11:27:01 +0330 Subject: [PATCH 21/23] fix: aligning with latest codes! --- .../test_discord_prepare_document_from_db.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py index 09e13050..87dbb4c4 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py @@ -17,7 +17,7 @@ def setup_db( create_platform: bool = True, guild_id: str = "1234", ): - client = MongoSingleton.get_instance().client + client = MongoSingleton.get_instance().get_client() community_id = ObjectId("9f59dd4f38f3474accdc8f24") platform_id = ObjectId("063a2a74282db2c00fbc2428") @@ -83,7 +83,7 @@ def setup_db( ) def test_transform_two_data(self): - client = MongoSingleton.get_instance().client + client = MongoSingleton.get_instance().get_client() channels = ["111111", "22222"] guild_id = "1234" @@ -99,7 +99,7 @@ def test_transform_two_data(self): data = { "type": 0, "author": "111", - "content": "test_message1", + "content": "test_message1 making it longer!", "user_mentions": [], "role_mentions": [], "reactions": [], @@ -308,7 +308,6 @@ def test_transform_two_data(self): "date": datetime(2023, 5, 8).strftime("%Y-%m-%d %H:%M:%S"), "author_username": "user1", "author_global_name": "user1_GlobalName", - "url_reference": {"[URL0]": "https://www.google.com"}, "thread": None, "url": "https://discord.com/channels/1234/111111/10000000003", } @@ -318,7 +317,7 @@ def test_transform_two_data(self): self.assertDictEqual(documents[2].metadata, expected_metadata_2) self.assertDictEqual(documents[3].metadata, expected_metadata_3) - self.assertEqual(documents[0].text, "test_message1") + self.assertEqual(documents[0].text, "test_message1 making it longer!") self.assertEqual(documents[1].text, "mentioning a person user3") self.assertEqual(documents[2].text, "mentioning user3 user4 role1") - self.assertEqual(documents[3].text, "test_message1 [URL0]") + self.assertEqual(documents[3].text, "test_message1 https://www.google.com") From ce0a9c230bf318fc0609c50f4d8accc42bc70ce8 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 10 Dec 2024 12:20:26 +0330 Subject: [PATCH 22/23] fix: using the fixed version of hivemind-backend lib! --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 66ea62eb..82ab0e11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ cohere==4.56 neo4j>=5.14.1, <6.0.0 python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 -tc-hivemind-backend==1.4.1 +tc-hivemind-backend==1.4.2 traceloop-sdk>=0.15.2, <0.16.0 beautifulsoup4==4.12.3 llama-index-readers-notion==0.1.6 From 67a3aae90356f78f2c0329fab4dd946b977004b6 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 10 Dec 2024 14:00:38 +0330 Subject: [PATCH 23/23] fix: isort linter issue! --- .../src/db/discord/utils/transform_discord_raw_messges.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index 5fd57629..5bd865ba 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -3,7 +3,6 @@ from typing import Any from hivemind_etl_helpers.src.db.discord.utils.content_parser import ( - check_no_content_only_links, remove_empty_str, remove_none_from_list, ) @@ -18,8 +17,8 @@ prepare_raction_ids, ) from hivemind_etl_helpers.src.db.globals import DATE_FORMAT -from tc_hivemind_backend.db.utils.preprocess_text import BasePreprocessor from llama_index.core import Document +from tc_hivemind_backend.db.utils.preprocess_text import BasePreprocessor def transform_discord_raw_messages(