From ba4c527e3d822647bc94108fee47f6d3bc06577e Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Thu, 5 Dec 2024 13:31:21 +0330
Subject: [PATCH 01/23] feat: Added word max limit!

---
 .../src/db/discord/fetch_raw_messages.py      |   8 +
 .../test_discord_fetch_raw_messages.py        | 174 ++++++++++++++++++
 2 files changed, 182 insertions(+)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
index dfb540bd..17066a44 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
@@ -7,6 +7,7 @@ def fetch_raw_messages(
     guild_id: str,
     selected_channels: list[str],
     from_date: datetime,
+    **kwargs,
 ) -> list[dict]:
     """
     fetch rawinfo messages from mongodb database
@@ -20,6 +21,10 @@ def fetch_raw_messages(
     from_date : datetime
         get the raw data from a specific date
         default is None, meaning get all the messages
+    kwargs : dict
+        min_word_limit : int
+            the minimum words that the messages shuold contain
+            default is 8 characters
 
     Returns
     --------
@@ -29,6 +34,8 @@ def fetch_raw_messages(
     client = MongoSingleton.get_instance().get_client()
     user_ids = get_real_users(guild_id)
 
+    limit_max_words = kwargs.get("min_word_limit", 8)
+
     cursor = (
         client[guild_id]["rawinfos"]
         .find(
@@ -38,6 +45,7 @@ def fetch_raw_messages(
                 "createdDate": {"$gte": from_date},
                 "isGeneratedByWebhook": False,
                 "channelId": {"$in": selected_channels},
+                "$where": f"this.content.length > {limit_max_words}",
             }
         )
         .sort("createdDate", 1)
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
index f70cef03..66dfc77c 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
@@ -286,3 +286,177 @@ def test_fetch_raw_messages_fetch_from_date(self):
 
         # Check if the fetched messages are equal to the expected messages
         self.assertCountEqual(messages, expected_messages)
+
+    def test_fetch_raw_messages_fetch_limited_characters(self):
+        """
+        fetch raw messages and do filtering
+        """
+        client = MongoSingleton.get_instance().client
+
+        guild_id = "1234"
+        channels = ["111111", "22222"]
+        users_id = ["user1", "user2", "user3", "user4", "user5"]
+        guild_id = "1234"
+        self.setup_db(
+            channels=channels,
+            guild_id=guild_id,
+        )
+
+        for user in users_id:
+            is_bot = False
+            if user == "user3":
+                is_bot = True
+
+            client[guild_id]["guildmembers"].insert_one(
+                {
+                    "discordId": user,
+                    "username": f"username_{user}",
+                    "roles": None,
+                    "joinedAt": datetime(2023, 1, 1),
+                    "avatar": None,
+                    "isBot": is_bot,
+                    "discriminator": "0",
+                    "permissions": None,
+                    "deletedAt": None,
+                    "globalName": None,
+                    "nickname": None,
+                }
+            )
+
+        # Dropping any previous data
+        client[guild_id].drop_collection("rawinfos")
+
+        # Insert messages with different dates
+        raw_data = []
+
+        data = {
+            "type": 0,
+            "author": users_id[0],
+            "content": "AA",
+            "user_mentions": [],
+            "role_mentions": [],
+            "reactions": [],
+            "replied_user": None,
+            "createdDate": datetime(2023, 10, 1),
+            "messageId": str(np.random.randint(1000000, 9999999)),
+            "channelId": channels[0],
+            "channelName": f"general {channels[0]}",
+            "threadId": None,
+            "threadName": None,
+            "isGeneratedByWebhook": False,
+        }
+        raw_data.append(data)
+
+        data = {
+            "type": 0,
+            "author": users_id[1],
+            "content": "A sample text with more than 8 characters!",
+            "user_mentions": [],
+            "role_mentions": [],
+            "reactions": [],
+            "replied_user": None,
+            "createdDate": datetime(2023, 10, 1),
+            "messageId": str(np.random.randint(1000000, 9999999)),
+            "channelId": channels[1],
+            "channelName": f"general {channels[1]}",
+            "threadId": None,
+            "threadName": None,
+            "isGeneratedByWebhook": False,
+        }
+        raw_data.append(data)
+
+        client[guild_id]["rawinfos"].insert_many(raw_data)
+
+        # Fetch messages from a specific date (October 3, 2023)
+        messages = fetch_raw_messages(guild_id, selected_channels=channels)
+        # Check if the fetched messages are equal to the expected messages
+        self.assertCountEqual(messages, 1)
+
+    def test_fetch_raw_messages_fetch_limited_characters(self):
+        """
+        fetch raw messages and do filtering
+        """
+        client = MongoSingleton.get_instance().client
+
+        guild_id = "1234"
+        channels = ["111111", "22222"]
+        users_id = ["user1", "user2", "user3", "user4", "user5"]
+        guild_id = "1234"
+        self.setup_db(
+            channels=channels,
+            guild_id=guild_id,
+        )
+
+        for user in users_id:
+            is_bot = False
+            if user == "user3":
+                is_bot = True
+
+            client[guild_id]["guildmembers"].insert_one(
+                {
+                    "discordId": user,
+                    "username": f"username_{user}",
+                    "roles": None,
+                    "joinedAt": datetime(2023, 1, 1),
+                    "avatar": None,
+                    "isBot": is_bot,
+                    "discriminator": "0",
+                    "permissions": None,
+                    "deletedAt": None,
+                    "globalName": None,
+                    "nickname": None,
+                }
+            )
+
+        # Dropping any previous data
+        client[guild_id].drop_collection("rawinfos")
+
+        # Insert messages with different dates
+        raw_data = []
+
+        data = {
+            "type": 0,
+            "author": users_id[0],
+            "content": "AA",
+            "user_mentions": [],
+            "role_mentions": [],
+            "reactions": [],
+            "replied_user": None,
+            "createdDate": datetime(2023, 10, 1),
+            "messageId": str(np.random.randint(1000000, 9999999)),
+            "channelId": channels[0],
+            "channelName": f"general {channels[0]}",
+            "threadId": None,
+            "threadName": None,
+            "isGeneratedByWebhook": False,
+        }
+        raw_data.append(data)
+
+        data = {
+            "type": 0,
+            "author": users_id[1],
+            "content": "A sample text with more than 8 characters!",
+            "user_mentions": [],
+            "role_mentions": [],
+            "reactions": [],
+            "replied_user": None,
+            "createdDate": datetime(2023, 10, 1),
+            "messageId": str(np.random.randint(1000000, 9999999)),
+            "channelId": channels[1],
+            "channelName": f"general {channels[1]}",
+            "threadId": None,
+            "threadName": None,
+            "isGeneratedByWebhook": False,
+        }
+        raw_data.append(data)
+
+        client[guild_id]["rawinfos"].insert_many(raw_data)
+
+        # Fetch messages from a specific date (October 3, 2023)
+        messages = fetch_raw_messages(
+            guild_id,
+            selected_channels=channels,
+            min_word_limit=1,
+        )
+        # Check if the fetched messages are equal to the expected messages
+        self.assertCountEqual(messages, 2)

From 3a018eb4f9f5a7ec615651ed92d8b73c941d28d9 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Thu, 5 Dec 2024 14:28:36 +0330
Subject: [PATCH 02/23] fix: test case to align with the latest codes!

---
 .../integration/test_discord_fetch_raw_messages.py   | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
index 66dfc77c..6ae30624 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
@@ -367,14 +367,17 @@ def test_fetch_raw_messages_fetch_limited_characters(self):
 
         client[guild_id]["rawinfos"].insert_many(raw_data)
 
-        # Fetch messages from a specific date (October 3, 2023)
-        messages = fetch_raw_messages(guild_id, selected_channels=channels)
+        messages = fetch_raw_messages(
+            guild_id,
+            selected_channels=channels,
+            from_date=datetime(2023, 9, 20),
+        )
         # Check if the fetched messages are equal to the expected messages
         self.assertCountEqual(messages, 1)
 
-    def test_fetch_raw_messages_fetch_limited_characters(self):
+    def test_fetch_raw_messages_fetch_limited_characters_specified(self):
         """
-        fetch raw messages and do filtering
+        fetch raw messages and do filtering with a specified value
         """
         client = MongoSingleton.get_instance().client
 
@@ -456,6 +459,7 @@ def test_fetch_raw_messages_fetch_limited_characters(self):
         messages = fetch_raw_messages(
             guild_id,
             selected_channels=channels,
+            from_date=datetime(2023, 9, 20),
             min_word_limit=1,
         )
         # Check if the fetched messages are equal to the expected messages

From cdcea542b2027413d04236414ad52d8c867d5375 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Thu, 5 Dec 2024 14:43:53 +0330
Subject: [PATCH 03/23] fix: naming typo!

---
 .../hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
index 17066a44..e982a2ee 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
@@ -34,7 +34,7 @@ def fetch_raw_messages(
     client = MongoSingleton.get_instance().get_client()
     user_ids = get_real_users(guild_id)
 
-    limit_max_words = kwargs.get("min_word_limit", 8)
+    min_word_limit = kwargs.get("min_word_limit", 8)
 
     cursor = (
         client[guild_id]["rawinfos"]
@@ -45,7 +45,7 @@ def fetch_raw_messages(
                 "createdDate": {"$gte": from_date},
                 "isGeneratedByWebhook": False,
                 "channelId": {"$in": selected_channels},
-                "$where": f"this.content.length > {limit_max_words}",
+                "$where": f"this.content.length > {min_word_limit}",
             }
         )
         .sort("createdDate", 1)

From 70459fcb8121ab4a9fc095edd2fbd777c3d0ffb0 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Thu, 5 Dec 2024 14:44:11 +0330
Subject: [PATCH 04/23] fix: test case assertions!

---
 .../tests/integration/test_discord_fetch_raw_messages.py      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
index 6ae30624..f5abcd19 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
@@ -373,7 +373,7 @@ def test_fetch_raw_messages_fetch_limited_characters(self):
             from_date=datetime(2023, 9, 20),
         )
         # Check if the fetched messages are equal to the expected messages
-        self.assertCountEqual(messages, 1)
+        self.assertEqual(len(messages), 1)
 
     def test_fetch_raw_messages_fetch_limited_characters_specified(self):
         """
@@ -463,4 +463,4 @@ def test_fetch_raw_messages_fetch_limited_characters_specified(self):
             min_word_limit=1,
         )
         # Check if the fetched messages are equal to the expected messages
-        self.assertCountEqual(messages, 2)
+        self.assertEqual(len(messages), 2)

From 9b2beadc64f13314a7e94b11cfe2458e576151f3 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 12:41:39 +0330
Subject: [PATCH 05/23] feat: Added noise cancelling for discord messages!

---
 Dockerfile                                    |   1 +
 .../src/db/common/__init__.py                 |   0
 .../src/db/common/base_preprocessor.py        |  43 ++++++
 .../discord_raw_message_to_document.py        |  34 ++++-
 .../src/db/discord/fetch_raw_messages.py      |   4 +-
 .../src/db/discord/preprocessor.py            |  79 ++++++++++
 .../tests/unit/test_common_preprocessor.py    |  62 ++++++++
 .../tests/unit/test_discord_preprocess.py     |  43 ++++++
 .../unit/test_discord_update_raw_messages.py  | 141 ++++++++++++++++++
 9 files changed, 404 insertions(+), 3 deletions(-)
 create mode 100644 dags/hivemind_etl_helpers/src/db/common/__init__.py
 create mode 100644 dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py
 create mode 100644 dags/hivemind_etl_helpers/src/db/discord/preprocessor.py
 create mode 100644 dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py
 create mode 100644 dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py
 create mode 100644 dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py

diff --git a/Dockerfile b/Dockerfile
index 9578f4ce..88958223 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,7 @@ COPY . .
 RUN chmod +x init.sh
 USER airflow
 RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt
+RUN python -m spacy download en_core_web_lg
 
 FROM python:3.11-bullseye AS test
 WORKDIR /project
diff --git a/dags/hivemind_etl_helpers/src/db/common/__init__.py b/dags/hivemind_etl_helpers/src/db/common/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py b/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py
new file mode 100644
index 00000000..1723b776
--- /dev/null
+++ b/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py
@@ -0,0 +1,43 @@
+import spacy
+
+
+class BasePreprocessor:
+    def __init__(self) -> None:
+        pass
+
+    def extract_main_content(self, text: str) -> str:
+        """
+        extract main content of a message
+
+        Parameters
+        ------------
+        text : str
+            a discord message text
+
+        Returns
+        --------
+        cleaned_text : str
+
+        """
+        try:
+            nlp = spacy.load("en_core_web_lg")
+        except OSError as exp:
+            raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp
+
+        doc = nlp(text)
+
+        # Filter out punctuation, whitespace, and numerical values, then extract the lemma for each remaining token
+        main_content_tokens = [
+            token.lemma_
+            for token in doc
+            if not token.is_punct
+            and not token.is_space
+            and not token.is_stop
+            and not token.like_url
+            and not token.like_num
+            and token.is_ascii
+        ]
+
+        # Join the tokens to form the cleaned sentence
+        cleaned_text = " ".join(main_content_tokens)
+        return cleaned_text
diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
index 190a08c1..bde5ab1d 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
@@ -5,6 +5,7 @@
     transform_discord_raw_messages,
 )
 from llama_index.core import Document
+from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor
 
 
 def discord_raw_to_documents(
@@ -29,6 +30,37 @@ def discord_raw_to_documents(
         list of messages converted to documents
     """
     raw_mongo_messages = fetch_raw_messages(guild_id, selected_channels, from_date)
-    messages_docuemnt = transform_discord_raw_messages(guild_id, raw_mongo_messages)
+    processed_messages = update_raw_messages(raw_data=raw_mongo_messages)
+    messages_docuemnt = transform_discord_raw_messages(guild_id, processed_messages)
 
     return messages_docuemnt
+
+
+def update_raw_messages(raw_data: list[dict]) -> list[dict]:
+    """
+    Update raw messages text by cleaning their data
+
+    Parameters
+    -----------
+    data : list[dict]
+        a list of raw data fetched from database
+        each dict hold a 'content'
+
+    Returns
+    ---------
+    cleaned_data : list[dict]
+        a list of dictionaries but with cleaned data
+    """
+    preprocessor = DiscordPreprocessor()
+
+    cleaned_data: list[dict] = []
+    for data in raw_data:
+        content = data.get("content")
+        if content:
+            cleaned_content = preprocessor.clean_text(content)
+            print(cleaned_content)
+            if cleaned_content:
+                data["content"] = cleaned_content
+                cleaned_data.append(data)
+
+    return cleaned_data
diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
index e982a2ee..0f262e6d 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
@@ -34,7 +34,7 @@ def fetch_raw_messages(
     client = MongoSingleton.get_instance().get_client()
     user_ids = get_real_users(guild_id)
 
-    min_word_limit = kwargs.get("min_word_limit", 8)
+    min_word_limit = kwargs.get("min_word_limit", 15)
 
     cursor = (
         client[guild_id]["rawinfos"]
@@ -45,7 +45,7 @@ def fetch_raw_messages(
                 "createdDate": {"$gte": from_date},
                 "isGeneratedByWebhook": False,
                 "channelId": {"$in": selected_channels},
-                "$where": f"this.content.length > {min_word_limit}",
+                "$expr": {"$gt": [{"$strLenCP": "$content"}, min_word_limit]},
             }
         )
         .sort("createdDate", 1)
diff --git a/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py b/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py
new file mode 100644
index 00000000..6c01081b
--- /dev/null
+++ b/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py
@@ -0,0 +1,79 @@
+import re
+
+from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor
+
+
+class DiscordPreprocessor(BasePreprocessor):
+    """
+    preprocess discord text messages
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def clean_texts(self, texts: list[str]) -> list[str]:
+        """
+        clean the given text
+
+        Parameters
+        ------------
+        texts : list[str]
+            a list of discord messages text
+
+        Returns
+        ---------
+        texts_cleaned : str
+            the cleaned text
+            (discord ids removed)
+        """
+        texts_cleaned: list[str] = []
+
+        for text in texts:
+            text_cleaned = self.clean_text(text=text)
+            texts_cleaned.append(text_cleaned)
+
+        return texts_cleaned
+
+    def clean_text(self, text: str) -> str:
+        """
+        clean the given text
+
+        Parameters
+        ------------
+        text : str
+            a discord message text
+
+        Returns
+        ---------
+        text_cleaned : str
+            the cleaned text
+            (discord ids removed)
+        """
+        text_cleaned = self.remove_ids(text=text)
+        text_cleaned = self.extract_main_content(text=text_cleaned)
+
+        return text_cleaned
+
+    def remove_ids(self, text: str) -> str:
+        """
+        remove the ids that are available in texts
+        user ids would be with the format of <@number>
+        and the role ids are in the format of <@&number>
+
+        Parameters
+        ------------
+        text : str
+            a discord message text
+
+        Returns
+        ---------
+        cleaned_text : str
+            the texts with removed <@number> and <@&number>
+        """
+        pattern = r"<@&?\d+>"
+
+        # Removing matches
+        cleaned_text = re.sub(pattern, "", text)
+
+        cleaned_text = " ".join(cleaned_text.split())
+        return cleaned_text
diff --git a/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py b/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py
new file mode 100644
index 00000000..e268089e
--- /dev/null
+++ b/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py
@@ -0,0 +1,62 @@
+import unittest
+from unittest.mock import patch
+import spacy
+
+from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor
+
+
+class TestDiscordPreprocessor(unittest.TestCase):
+    def setUp(self):
+        """Set up test fixtures before each test method."""
+        self.preprocessor = BasePreprocessor()
+
+    # Tests for extract_main_content method
+    @unittest.skipIf(
+        not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model"
+    )
+    def test_extract_main_content_basic(self):
+        """Test basic content extraction"""
+        input_text = "The quick brown fox jumps over the lazy dog!"
+        result = self.preprocessor.extract_main_content(input_text)
+        # Note: Expected result will contain lemmatized words without stop words
+        self.assertIn("quick brown fox jump lazy dog", result.lower())
+
+    @unittest.skipIf(
+        not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model"
+    )
+    def test_extract_main_content_with_numbers(self):
+        """Test content extraction with numbers"""
+        input_text = "I have 5 apples and 3 oranges."
+        result = self.preprocessor.extract_main_content(input_text)
+        self.assertIn("apple orange", result.lower())
+        self.assertNotIn("5", result)
+        self.assertNotIn("3", result)
+
+    @unittest.skipIf(
+        not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model"
+    )
+    def test_extract_main_content_with_urls(self):
+        """Test content extraction with URLs"""
+        input_text = "Check this link: https://example.com for more information"
+        result = self.preprocessor.extract_main_content(input_text)
+        self.assertNotIn("https://example.com", result)
+        self.assertIn("check link information", result.lower())
+
+    @unittest.skipIf(
+        not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model"
+    )
+    def test_extract_main_content_empty_string(self):
+        """Test content extraction with empty string"""
+        self.assertEqual(self.preprocessor.extract_main_content(""), "")
+
+    def test_extract_main_content_missing_model(self):
+        """Test handling of missing spacy model"""
+        with patch("spacy.load") as mock_load:
+            mock_load.side_effect = OSError("Model not found")
+
+            with self.assertRaises(OSError) as context:
+                self.preprocessor.extract_main_content("Test text")
+
+            self.assertIn(
+                "Model spacy `en_core_web_lg` is not installed!", str(context.exception)
+            )
diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py b/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py
new file mode 100644
index 00000000..34d0d4a1
--- /dev/null
+++ b/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py
@@ -0,0 +1,43 @@
+import unittest
+
+from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor
+
+
+class TestDiscordPreprocessor(unittest.TestCase):
+    def setUp(self):
+        """Set up test fixtures before each test method."""
+        self.preprocessor = DiscordPreprocessor()
+
+    # Tests for remove_ids method
+    def test_remove_user_ids(self):
+        """Test removal of user IDs from text"""
+        input_text = "Hello <@123456> how are you?"
+        expected = "Hello how are you?"
+        self.assertEqual(self.preprocessor.remove_ids(input_text), expected)
+
+    def test_remove_role_ids(self):
+        """Test removal of role IDs from text"""
+        input_text = "Welcome <@&789012> to the server!"
+        expected = "Welcome to the server!"
+        self.assertEqual(self.preprocessor.remove_ids(input_text), expected)
+
+    def test_remove_multiple_ids(self):
+        """Test removal of multiple IDs from text"""
+        input_text = "Hey <@123456> please notify <@&789012> and <@345678>"
+        expected = "Hey please notify and"
+        self.assertEqual(self.preprocessor.remove_ids(input_text), expected)
+
+    def test_text_without_ids(self):
+        """Test text that doesn't contain any IDs"""
+        input_text = "Regular message without any IDs"
+        self.assertEqual(self.preprocessor.remove_ids(input_text), input_text)
+
+    def test_empty_string(self):
+        """Test empty string input"""
+        self.assertEqual(self.preprocessor.remove_ids(""), "")
+
+    def test_only_whitespace(self):
+        """Test string with only whitespace"""
+        input_text = "   \t   \n   "
+        expected = ""
+        self.assertEqual(self.preprocessor.remove_ids(input_text), expected)
diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py b/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py
new file mode 100644
index 00000000..90ce9728
--- /dev/null
+++ b/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py
@@ -0,0 +1,141 @@
+import unittest
+from datetime import datetime
+import spacy
+
+from hivemind_etl_helpers.src.db.discord.discord_raw_message_to_document import (
+    update_raw_messages,
+)
+
+
+class TestUpdateRawMessages(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Load spacy model once for all tests"""
+        try:
+            cls.nlp = spacy.load("en_core_web_lg")
+        except OSError as exp:
+            raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp
+
+    def test_empty_list(self):
+        """Test function handles empty input list correctly"""
+        result = update_raw_messages([])
+        self.assertEqual(result, [], "Empty list should return empty list")
+
+    def test_with_content(self):
+        """Test function processes valid content correctly"""
+        test_data = [
+            {
+                "type": 0,
+                "author": "123456",
+                "content": "test_message <@123456> Hello",
+                "user_mentions": [],
+                "role_mentions": [],
+                "reactions": [],
+                "replied_user": None,
+                "createdDate": datetime.now(),
+                "messageId": "1234567",
+                "channelId": "89101112",
+                "channelName": "general",
+                "threadId": None,
+                "threadName": None,
+                "isGeneratedByWebhook": False,
+            }
+        ]
+
+        result = update_raw_messages(test_data)
+
+        self.assertEqual(len(result), 1, "Should return one processed message")
+        self.assertIn("content", result[0], "Processed message should contain content")
+        # The actual content will depend on spaCy's processing, but should not contain Discord IDs
+        self.assertNotIn(
+            "<@123456>", result[0]["content"], "Discord IDs should be removed"
+        )
+        self.assertNotEqual(result[0]["content"], "", "Content should not be empty")
+
+    def test_without_content(self):
+        """Test function handles messages without content field"""
+        test_data = [
+            {
+                "type": 0,
+                "author": "123456",
+                "user_mentions": [],
+                "role_mentions": [],
+            }
+        ]
+
+        result = update_raw_messages(test_data)
+        self.assertEqual(result, [], "Message without content should be filtered out")
+
+    def test_with_discord_ids(self):
+        """Test function removes Discord user and role IDs correctly"""
+        test_data = [
+            {
+                "content": "Hello <@123456> and <@&789012> how are you?",
+                "type": 0,
+            }
+        ]
+
+        result = update_raw_messages(test_data)
+        self.assertEqual(len(result), 1, "Should return one processed message")
+        # Check that Discord IDs are removed
+        self.assertNotIn("<@123456>", result[0]["content"], "User ID should be removed")
+        self.assertNotIn(
+            "<@&789012>", result[0]["content"], "Role ID should be removed"
+        )
+        # Check that meaningful content remains
+        self.assertIn(
+            "hello", result[0]["content"].lower(), "Regular words should be preserved"
+        )
+
+    def test_consecutive_calls(self):
+        """Test function handles consecutive calls correctly"""
+        test_data_1 = [{"content": "Apples are falling from trees!"}]
+        test_data_2 = [{"content": "second message"}]
+
+        result_1 = update_raw_messages(test_data_1)
+        result_2 = update_raw_messages(test_data_2)
+
+        self.assertEqual(len(result_1), 1, "First call should return one message")
+        self.assertEqual(len(result_2), 1, "Second call should return one message")
+        self.assertNotEqual(
+            result_1[0]["content"],
+            result_2[0]["content"],
+            "Different inputs should produce different outputs",
+        )
+
+    def test_multiple_messages(self):
+        """Test function processes multiple messages correctly"""
+        test_data = [
+            {"content": "first message about cats"},
+            {"content": "second message about dogs"},
+            {"content": "third message about birds"},
+        ]
+
+        result = update_raw_messages(test_data)
+        self.assertEqual(len(result), 3, "Should return three processed messages")
+
+        # Check that each message has been processed
+        for i, msg in enumerate(result):
+            self.assertIn("content", msg, f"Message {i} should have content")
+            self.assertNotEqual(
+                msg["content"], "", f"Message {i} content should not be empty"
+            )
+
+    def test_special_characters(self):
+        """Test function handles special characters correctly"""
+        test_data = [
+            {
+                "content": "Hello! This is a test... With some punctuation!?! @#$%",
+                "type": 0,
+            }
+        ]
+
+        result = update_raw_messages(test_data)
+        self.assertEqual(len(result), 1, "Should return one processed message")
+        # The processed content should contain meaningful words but not punctuation
+        processed_content = result[0]["content"].lower()
+        self.assertIn("hello", processed_content, "Common words should be preserved")
+        self.assertIn("test", processed_content, "Common words should be preserved")
+        self.assertNotIn(
+            "!?!", processed_content, "Multiple punctuation should be removed"
+        )

From 3788374e40235e05f1e8c8238953d9f6b554ccb5 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 12:43:18 +0330
Subject: [PATCH 06/23] fix: remove unnecessary print!

---
 .../src/db/discord/discord_raw_message_to_document.py           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
index bde5ab1d..0c5ddc45 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
@@ -58,7 +58,7 @@ def update_raw_messages(raw_data: list[dict]) -> list[dict]:
         content = data.get("content")
         if content:
             cleaned_content = preprocessor.clean_text(content)
-            print(cleaned_content)
+
             if cleaned_content:
                 data["content"] = cleaned_content
                 cleaned_data.append(data)

From b007eec7f299dafdf0d63bd2611dd32d2065491b Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 12:48:39 +0330
Subject: [PATCH 07/23] fix: Added missing spacy dependency!

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index dd69b10f..2f96c76a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,4 +22,5 @@ tc-wikipedia-lib==1.0.1
 llama-index-readers-file==0.1.22
 docx2txt==0.8
 tc-analyzer-lib==1.4.13
-pydantic==2.9.2
\ No newline at end of file
+pydantic==2.9.2
+spacy==3.7.5
\ No newline at end of file

From 33d0fb46b2e723ac23d07f4c65fd85d9e67cc8cb Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 13:36:00 +0330
Subject: [PATCH 08/23] feat: updating the messages content!

---
 .../tests/integration/test_discord_fetch_raw_messages.py    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
index f5abcd19..4a5a1c91 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_fetch_raw_messages.py
@@ -122,7 +122,7 @@ def test_fetch_raw_messages_fetch_all(self):
             data = {
                 "type": 0,
                 "author": users_id[i],
-                "content": f"test_message {np.random.randint(0, 10)}",
+                "content": f"{np.random.randint(0, 10)} Apples are falling from trees!",
                 "user_mentions": [],
                 "role_mentions": [],
                 "reactions": [],
@@ -247,7 +247,7 @@ def test_fetch_raw_messages_fetch_from_date(self):
             data = {
                 "type": 0,
                 "author": users_id[i],
-                "content": f"test_message {np.random.randint(0, 10)}",
+                "content": f"Apples falling from trees {np.random.randint(0, 10)}",
                 "user_mentions": [],
                 "role_mentions": [],
                 "reactions": [],
@@ -350,7 +350,7 @@ def test_fetch_raw_messages_fetch_limited_characters(self):
         data = {
             "type": 0,
             "author": users_id[1],
-            "content": "A sample text with more than 8 characters!",
+            "content": "A sample text with more than 15 characters!",
             "user_mentions": [],
             "role_mentions": [],
             "reactions": [],

From 32eaafb37c4096b81a190a7c9cedaf008e9b7c5c Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 13:40:32 +0330
Subject: [PATCH 09/23] fix: Added spacy model download in test environment!

---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index 88958223..a30ed742 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,5 +10,6 @@ FROM python:3.11-bullseye AS test
 WORKDIR /project
 COPY . .
 RUN pip install -r requirements.txt
+RUN python -m spacy download en_core_web_lg
 RUN chmod +x docker-entrypoint.sh
 CMD ["./docker-entrypoint.sh"]

From b2b64686828f23747d54d9d8b386dcfda3a47e96 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 13:49:01 +0330
Subject: [PATCH 10/23] fix: aligning with latest codes! we would remove dummy
 data from database, so we added a message content that holds some
 information.

---
 .../tests/integration/test_pg_vector_access_with_discord.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
index 4922fb5b..025d060d 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
@@ -124,7 +124,7 @@ def _create_and_save_doc(self, table: str, guild_id: str, dbname: str):
             {
                 "type": 0,
                 "author": "111",
-                "content": "test_message1",
+                "content": "Apples are falling from trees 1",
                 "user_mentions": [],
                 "role_mentions": [],
                 "reactions": [],

From 0a3a6dfdd8eded79cf398a13a4ee2f7f368ea3af Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 13:56:07 +0330
Subject: [PATCH 11/23] fix: align input output data!

---
 .../tests/integration/test_pg_vector_access_with_discord.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
index 025d060d..96ada0e4 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
@@ -185,7 +185,7 @@ def test_save_documents(self):
         text, _, metadata = data[0]
 
         # nickname was `None`, so it wasn't included in metadata
-        self.assertEqual(text, "test_message1")
+        self.assertEqual(text, "apple fall tree")
         self.assertEqual(metadata["channel"], documents[0].metadata["channel"])
         self.assertEqual(
             metadata["author_username"], documents[0].metadata["author_username"]

From 5b4057d0576889ee0749af1b15953d0cdedd1070 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 13:58:10 +0330
Subject: [PATCH 12/23] fix: Added more informative messages!

---
 .../integration/test_discord_prepare_document_from_db.py  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
index f92ce950..2c786ef2 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
@@ -99,7 +99,7 @@ def test_transform_two_data(self):
         data = {
             "type": 0,
             "author": "111",
-            "content": "test_message1",
+            "content": "Apples are falling from trees1",
             "user_mentions": [],
             "role_mentions": [],
             "reactions": [],
@@ -319,7 +319,7 @@ def test_transform_two_data(self):
         self.assertDictEqual(documents[3].metadata, expected_metadata_3)
 
         # Optionally, you can also check the text separately if needed
-        self.assertEqual(documents[0].text, "test_message1")
-        self.assertEqual(documents[1].text, "mentioning a person user3")
-        self.assertEqual(documents[2].text, "mentioning user3 user4 role1")
+        self.assertEqual(documents[0].text, "apple fall trees1")
+        self.assertEqual(documents[1].text, "mention person user3")
+        self.assertEqual(documents[2].text, "mention user3 user4 role1")
         self.assertEqual(documents[3].text, "test_message1 [URL0]")

From cd4e0c7ccd6f85dce8bcc2b5343927f54a94f71c Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 15:37:05 +0330
Subject: [PATCH 13/23] feat: removed the processors! the processor should move
 to the embedding part and we shouldn't change the actual content of the
 messages. They have been moved to hivemind_backend library.

---
 .../src/db/common/__init__.py                 |  0
 .../src/db/common/base_preprocessor.py        | 43 ----------
 .../discord_raw_message_to_document.py        | 34 +-------
 .../src/db/discord/preprocessor.py            | 79 -------------------
 .../test_discord_prepare_document_from_db.py  |  9 +--
 .../tests/unit/test_common_preprocessor.py    | 62 ---------------
 6 files changed, 5 insertions(+), 222 deletions(-)
 delete mode 100644 dags/hivemind_etl_helpers/src/db/common/__init__.py
 delete mode 100644 dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py
 delete mode 100644 dags/hivemind_etl_helpers/src/db/discord/preprocessor.py
 delete mode 100644 dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py

diff --git a/dags/hivemind_etl_helpers/src/db/common/__init__.py b/dags/hivemind_etl_helpers/src/db/common/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py b/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py
deleted file mode 100644
index 1723b776..00000000
--- a/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import spacy
-
-
-class BasePreprocessor:
-    def __init__(self) -> None:
-        pass
-
-    def extract_main_content(self, text: str) -> str:
-        """
-        extract main content of a message
-
-        Parameters
-        ------------
-        text : str
-            a discord message text
-
-        Returns
-        --------
-        cleaned_text : str
-
-        """
-        try:
-            nlp = spacy.load("en_core_web_lg")
-        except OSError as exp:
-            raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp
-
-        doc = nlp(text)
-
-        # Filter out punctuation, whitespace, and numerical values, then extract the lemma for each remaining token
-        main_content_tokens = [
-            token.lemma_
-            for token in doc
-            if not token.is_punct
-            and not token.is_space
-            and not token.is_stop
-            and not token.like_url
-            and not token.like_num
-            and token.is_ascii
-        ]
-
-        # Join the tokens to form the cleaned sentence
-        cleaned_text = " ".join(main_content_tokens)
-        return cleaned_text
diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
index 0c5ddc45..190a08c1 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
@@ -5,7 +5,6 @@
     transform_discord_raw_messages,
 )
 from llama_index.core import Document
-from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor
 
 
 def discord_raw_to_documents(
@@ -30,37 +29,6 @@ def discord_raw_to_documents(
         list of messages converted to documents
     """
     raw_mongo_messages = fetch_raw_messages(guild_id, selected_channels, from_date)
-    processed_messages = update_raw_messages(raw_data=raw_mongo_messages)
-    messages_docuemnt = transform_discord_raw_messages(guild_id, processed_messages)
+    messages_docuemnt = transform_discord_raw_messages(guild_id, raw_mongo_messages)
 
     return messages_docuemnt
-
-
-def update_raw_messages(raw_data: list[dict]) -> list[dict]:
-    """
-    Update raw messages text by cleaning their data
-
-    Parameters
-    -----------
-    data : list[dict]
-        a list of raw data fetched from database
-        each dict hold a 'content'
-
-    Returns
-    ---------
-    cleaned_data : list[dict]
-        a list of dictionaries but with cleaned data
-    """
-    preprocessor = DiscordPreprocessor()
-
-    cleaned_data: list[dict] = []
-    for data in raw_data:
-        content = data.get("content")
-        if content:
-            cleaned_content = preprocessor.clean_text(content)
-
-            if cleaned_content:
-                data["content"] = cleaned_content
-                cleaned_data.append(data)
-
-    return cleaned_data
diff --git a/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py b/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py
deleted file mode 100644
index 6c01081b..00000000
--- a/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import re
-
-from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor
-
-
-class DiscordPreprocessor(BasePreprocessor):
-    """
-    preprocess discord text messages
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def clean_texts(self, texts: list[str]) -> list[str]:
-        """
-        clean the given text
-
-        Parameters
-        ------------
-        texts : list[str]
-            a list of discord messages text
-
-        Returns
-        ---------
-        texts_cleaned : str
-            the cleaned text
-            (discord ids removed)
-        """
-        texts_cleaned: list[str] = []
-
-        for text in texts:
-            text_cleaned = self.clean_text(text=text)
-            texts_cleaned.append(text_cleaned)
-
-        return texts_cleaned
-
-    def clean_text(self, text: str) -> str:
-        """
-        clean the given text
-
-        Parameters
-        ------------
-        text : str
-            a discord message text
-
-        Returns
-        ---------
-        text_cleaned : str
-            the cleaned text
-            (discord ids removed)
-        """
-        text_cleaned = self.remove_ids(text=text)
-        text_cleaned = self.extract_main_content(text=text_cleaned)
-
-        return text_cleaned
-
-    def remove_ids(self, text: str) -> str:
-        """
-        remove the ids that are available in texts
-        user ids would be with the format of <@number>
-        and the role ids are in the format of <@&number>
-
-        Parameters
-        ------------
-        text : str
-            a discord message text
-
-        Returns
-        ---------
-        cleaned_text : str
-            the texts with removed <@number> and <@&number>
-        """
-        pattern = r"<@&?\d+>"
-
-        # Removing matches
-        cleaned_text = re.sub(pattern, "", text)
-
-        cleaned_text = " ".join(cleaned_text.split())
-        return cleaned_text
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
index 2c786ef2..9d6ac778 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
@@ -312,14 +312,13 @@ def test_transform_two_data(self):
             "thread": None,
             "url": "https://discord.com/channels/1234/111111/10000000003",
         }
-        print(documents[0].text)
+
         self.assertDictEqual(documents[0].metadata, expected_metadata_0)
         self.assertDictEqual(documents[1].metadata, expected_metadata_1)
         self.assertDictEqual(documents[2].metadata, expected_metadata_2)
         self.assertDictEqual(documents[3].metadata, expected_metadata_3)
 
-        # Optionally, you can also check the text separately if needed
-        self.assertEqual(documents[0].text, "apple fall trees1")
-        self.assertEqual(documents[1].text, "mention person user3")
-        self.assertEqual(documents[2].text, "mention user3 user4 role1")
+        self.assertEqual(documents[0].text, "test_message1")
+        self.assertEqual(documents[1].text, "mentioning a person user3")
+        self.assertEqual(documents[2].text, "mentioning user3 user4 role1")
         self.assertEqual(documents[3].text, "test_message1 [URL0]")
diff --git a/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py b/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py
deleted file mode 100644
index e268089e..00000000
--- a/dags/hivemind_etl_helpers/tests/unit/test_common_preprocessor.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import unittest
-from unittest.mock import patch
-import spacy
-
-from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor
-
-
-class TestDiscordPreprocessor(unittest.TestCase):
-    def setUp(self):
-        """Set up test fixtures before each test method."""
-        self.preprocessor = BasePreprocessor()
-
-    # Tests for extract_main_content method
-    @unittest.skipIf(
-        not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model"
-    )
-    def test_extract_main_content_basic(self):
-        """Test basic content extraction"""
-        input_text = "The quick brown fox jumps over the lazy dog!"
-        result = self.preprocessor.extract_main_content(input_text)
-        # Note: Expected result will contain lemmatized words without stop words
-        self.assertIn("quick brown fox jump lazy dog", result.lower())
-
-    @unittest.skipIf(
-        not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model"
-    )
-    def test_extract_main_content_with_numbers(self):
-        """Test content extraction with numbers"""
-        input_text = "I have 5 apples and 3 oranges."
-        result = self.preprocessor.extract_main_content(input_text)
-        self.assertIn("apple orange", result.lower())
-        self.assertNotIn("5", result)
-        self.assertNotIn("3", result)
-
-    @unittest.skipIf(
-        not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model"
-    )
-    def test_extract_main_content_with_urls(self):
-        """Test content extraction with URLs"""
-        input_text = "Check this link: https://example.com for more information"
-        result = self.preprocessor.extract_main_content(input_text)
-        self.assertNotIn("https://example.com", result)
-        self.assertIn("check link information", result.lower())
-
-    @unittest.skipIf(
-        not spacy.util.is_package("en_core_web_lg"), "requires en_core_web_lg model"
-    )
-    def test_extract_main_content_empty_string(self):
-        """Test content extraction with empty string"""
-        self.assertEqual(self.preprocessor.extract_main_content(""), "")
-
-    def test_extract_main_content_missing_model(self):
-        """Test handling of missing spacy model"""
-        with patch("spacy.load") as mock_load:
-            mock_load.side_effect = OSError("Model not found")
-
-            with self.assertRaises(OSError) as context:
-                self.preprocessor.extract_main_content("Test text")
-
-            self.assertIn(
-                "Model spacy `en_core_web_lg` is not installed!", str(context.exception)
-            )

From d0fb0086409b6b5cabf3b97d07614897a60b9978 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 15:43:17 +0330
Subject: [PATCH 14/23] fix: remove the unrequired processor!

---
 .../tests/unit/test_discord_preprocess.py     | 43 -------------------
 1 file changed, 43 deletions(-)
 delete mode 100644 dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py

diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py b/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py
deleted file mode 100644
index 34d0d4a1..00000000
--- a/dags/hivemind_etl_helpers/tests/unit/test_discord_preprocess.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import unittest
-
-from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor
-
-
-class TestDiscordPreprocessor(unittest.TestCase):
-    def setUp(self):
-        """Set up test fixtures before each test method."""
-        self.preprocessor = DiscordPreprocessor()
-
-    # Tests for remove_ids method
-    def test_remove_user_ids(self):
-        """Test removal of user IDs from text"""
-        input_text = "Hello <@123456> how are you?"
-        expected = "Hello how are you?"
-        self.assertEqual(self.preprocessor.remove_ids(input_text), expected)
-
-    def test_remove_role_ids(self):
-        """Test removal of role IDs from text"""
-        input_text = "Welcome <@&789012> to the server!"
-        expected = "Welcome to the server!"
-        self.assertEqual(self.preprocessor.remove_ids(input_text), expected)
-
-    def test_remove_multiple_ids(self):
-        """Test removal of multiple IDs from text"""
-        input_text = "Hey <@123456> please notify <@&789012> and <@345678>"
-        expected = "Hey please notify and"
-        self.assertEqual(self.preprocessor.remove_ids(input_text), expected)
-
-    def test_text_without_ids(self):
-        """Test text that doesn't contain any IDs"""
-        input_text = "Regular message without any IDs"
-        self.assertEqual(self.preprocessor.remove_ids(input_text), input_text)
-
-    def test_empty_string(self):
-        """Test empty string input"""
-        self.assertEqual(self.preprocessor.remove_ids(""), "")
-
-    def test_only_whitespace(self):
-        """Test string with only whitespace"""
-        input_text = "   \t   \n   "
-        expected = ""
-        self.assertEqual(self.preprocessor.remove_ids(input_text), expected)

From 7dfc963fccbc48e888c6e3c6587ee631f34cc210 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 15:56:30 +0330
Subject: [PATCH 15/23] feat: removed unused test cases! the test case didn't
 belong to any code.

---
 .../unit/test_discord_update_raw_messages.py  | 141 ------------------
 1 file changed, 141 deletions(-)
 delete mode 100644 dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py

diff --git a/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py b/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py
deleted file mode 100644
index 90ce9728..00000000
--- a/dags/hivemind_etl_helpers/tests/unit/test_discord_update_raw_messages.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import unittest
-from datetime import datetime
-import spacy
-
-from hivemind_etl_helpers.src.db.discord.discord_raw_message_to_document import (
-    update_raw_messages,
-)
-
-
-class TestUpdateRawMessages(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        """Load spacy model once for all tests"""
-        try:
-            cls.nlp = spacy.load("en_core_web_lg")
-        except OSError as exp:
-            raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp
-
-    def test_empty_list(self):
-        """Test function handles empty input list correctly"""
-        result = update_raw_messages([])
-        self.assertEqual(result, [], "Empty list should return empty list")
-
-    def test_with_content(self):
-        """Test function processes valid content correctly"""
-        test_data = [
-            {
-                "type": 0,
-                "author": "123456",
-                "content": "test_message <@123456> Hello",
-                "user_mentions": [],
-                "role_mentions": [],
-                "reactions": [],
-                "replied_user": None,
-                "createdDate": datetime.now(),
-                "messageId": "1234567",
-                "channelId": "89101112",
-                "channelName": "general",
-                "threadId": None,
-                "threadName": None,
-                "isGeneratedByWebhook": False,
-            }
-        ]
-
-        result = update_raw_messages(test_data)
-
-        self.assertEqual(len(result), 1, "Should return one processed message")
-        self.assertIn("content", result[0], "Processed message should contain content")
-        # The actual content will depend on spaCy's processing, but should not contain Discord IDs
-        self.assertNotIn(
-            "<@123456>", result[0]["content"], "Discord IDs should be removed"
-        )
-        self.assertNotEqual(result[0]["content"], "", "Content should not be empty")
-
-    def test_without_content(self):
-        """Test function handles messages without content field"""
-        test_data = [
-            {
-                "type": 0,
-                "author": "123456",
-                "user_mentions": [],
-                "role_mentions": [],
-            }
-        ]
-
-        result = update_raw_messages(test_data)
-        self.assertEqual(result, [], "Message without content should be filtered out")
-
-    def test_with_discord_ids(self):
-        """Test function removes Discord user and role IDs correctly"""
-        test_data = [
-            {
-                "content": "Hello <@123456> and <@&789012> how are you?",
-                "type": 0,
-            }
-        ]
-
-        result = update_raw_messages(test_data)
-        self.assertEqual(len(result), 1, "Should return one processed message")
-        # Check that Discord IDs are removed
-        self.assertNotIn("<@123456>", result[0]["content"], "User ID should be removed")
-        self.assertNotIn(
-            "<@&789012>", result[0]["content"], "Role ID should be removed"
-        )
-        # Check that meaningful content remains
-        self.assertIn(
-            "hello", result[0]["content"].lower(), "Regular words should be preserved"
-        )
-
-    def test_consecutive_calls(self):
-        """Test function handles consecutive calls correctly"""
-        test_data_1 = [{"content": "Apples are falling from trees!"}]
-        test_data_2 = [{"content": "second message"}]
-
-        result_1 = update_raw_messages(test_data_1)
-        result_2 = update_raw_messages(test_data_2)
-
-        self.assertEqual(len(result_1), 1, "First call should return one message")
-        self.assertEqual(len(result_2), 1, "Second call should return one message")
-        self.assertNotEqual(
-            result_1[0]["content"],
-            result_2[0]["content"],
-            "Different inputs should produce different outputs",
-        )
-
-    def test_multiple_messages(self):
-        """Test function processes multiple messages correctly"""
-        test_data = [
-            {"content": "first message about cats"},
-            {"content": "second message about dogs"},
-            {"content": "third message about birds"},
-        ]
-
-        result = update_raw_messages(test_data)
-        self.assertEqual(len(result), 3, "Should return three processed messages")
-
-        # Check that each message has been processed
-        for i, msg in enumerate(result):
-            self.assertIn("content", msg, f"Message {i} should have content")
-            self.assertNotEqual(
-                msg["content"], "", f"Message {i} content should not be empty"
-            )
-
-    def test_special_characters(self):
-        """Test function handles special characters correctly"""
-        test_data = [
-            {
-                "content": "Hello! This is a test... With some punctuation!?! @#$%",
-                "type": 0,
-            }
-        ]
-
-        result = update_raw_messages(test_data)
-        self.assertEqual(len(result), 1, "Should return one processed message")
-        # The processed content should contain meaningful words but not punctuation
-        processed_content = result[0]["content"].lower()
-        self.assertIn("hello", processed_content, "Common words should be preserved")
-        self.assertIn("test", processed_content, "Common words should be preserved")
-        self.assertNotIn(
-            "!?!", processed_content, "Multiple punctuation should be removed"
-        )

From 077ab8e1f03a730ad5ed2f466cef16cd5cd8c377 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 16:17:58 +0330
Subject: [PATCH 16/23] feat: updated data with not removing the urls from
 actual content!

---
 .../utils/transform_discord_raw_messges.py    | 26 +++++++++----------
 .../test_discord_prepare_document_from_db.py  |  2 +-
 .../integration/test_discord_prepare_llama.py |  4 +--
 .../test_pg_vector_access_with_discord.py     |  2 +-
 4 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
index cf04ddbe..0cee659f 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
@@ -14,9 +14,6 @@
 from hivemind_etl_helpers.src.db.discord.utils.prepare_raw_message_ids import (
     prepare_raw_message_ids,
 )
-from hivemind_etl_helpers.src.db.discord.utils.prepare_raw_message_urls import (
-    prepare_raw_message_urls,
-)
 from hivemind_etl_helpers.src.db.discord.utils.prepare_reactions_id import (
     prepare_raction_ids,
 )
@@ -160,7 +157,7 @@ def prepare_document(
         roles=dict(zip(role_ids, role_names)),
         users=dict(zip(mention_ids, mention_names)),
     )
-    content_url_updated, url_reference = prepare_raw_message_urls(content)
+    # content_url_updated, url_reference = prepare_raw_message_urls(content)
 
     # always has length 1
     assert len(author_name) == 1, "Either None or multiple authors!"
@@ -202,8 +199,8 @@ def prepare_document(
             msg_meta_data["reactors_global_name"] = reactors_gname
         if reactors_nickname != []:
             msg_meta_data["reactors_nicknames"] = reactors_nickname
-    if url_reference != {}:
-        msg_meta_data["url_reference"] = url_reference
+    # if url_reference != {}:
+    #     msg_meta_data["url_reference"] = url_reference
 
     if replier_name is not None:
         msg_meta_data["replier_username"] = replier_name[0]
@@ -214,18 +211,19 @@ def prepare_document(
     if role_names != []:
         msg_meta_data["role_mentions"] = role_names
 
-    if content_url_updated == "":
-        raise ValueError("Message with Empty content!")
+    # if content_url_updated == "":
+    #     raise ValueError("Message with Empty content!")
 
-    if check_no_content_only_links(content_url_updated):
+    # TODO: clean the text with preprocessor and check if there's anything availale in it
+    if check_no_content_only_links(content):
         raise ValueError("Message just did have urls")
 
     # removing null characters
-    content_url_updated = re.sub(r"[\x00-\x1F\x7F]", "", content_url_updated)
+    content = re.sub(r"[\x00-\x1F\x7F]", "", content)
 
     doc: Document
     if not exclude_metadata:
-        doc = Document(text=content_url_updated, metadata=msg_meta_data)
+        doc = Document(text=content, metadata=msg_meta_data)
         doc.excluded_embed_metadata_keys = [
             "channel",
             "date",
@@ -239,7 +237,7 @@ def prepare_document(
             "reactors_global_name",
             "reactors_nicknames",
             "thread",
-            "url_reference",
+            # "url_reference",
             "replier_username",
             "replier_global_name",
             "replier_nickname",
@@ -256,7 +254,7 @@ def prepare_document(
             "reactors_global_name",
             "reactors_nicknames",
             "thread",
-            "url_reference",
+            # "url_reference",
             "replier_username",
             "replier_global_name",
             "replier_nickname",
@@ -264,6 +262,6 @@ def prepare_document(
             "url",
         ]
     else:
-        doc = Document(text=content_url_updated)
+        doc = Document(text=content)
 
     return doc
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
index 9d6ac778..09e13050 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
@@ -99,7 +99,7 @@ def test_transform_two_data(self):
         data = {
             "type": 0,
             "author": "111",
-            "content": "Apples are falling from trees1",
+            "content": "test_message1",
             "user_mentions": [],
             "role_mentions": [],
             "reactions": [],
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py
index 3c5fa5af..8c324460 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py
@@ -205,7 +205,6 @@ def test_transform_two_data(self):
             "author_username": "user1",
             "author_global_name": "user1_GlobalName",
             "author_nickname": "user1_nickname",
-            "url_reference": {"[URL0]": "https://www.google.com"},
             "url": f"https://discord.com/channels/{guild_id}/1313133/1111111113",
             "thread": None,
         }
@@ -215,8 +214,7 @@ def test_transform_two_data(self):
         self.assertDictEqual(documents[2].metadata, expected_metadata_2)
         self.assertDictEqual(documents[3].metadata, expected_metadata_3)
 
-        # Optionally, you can also check the text separately if needed
         self.assertEqual(documents[0].text, "test_message1")
         self.assertEqual(documents[1].text, "mentioning a person user3")
         self.assertEqual(documents[2].text, "mentioning user3 user4 role1")
-        self.assertEqual(documents[3].text, "test_message1 [URL0]")
+        self.assertEqual(documents[3].text, "test_message1 https://www.google.com")
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
index 96ada0e4..b8855470 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_pg_vector_access_with_discord.py
@@ -185,7 +185,7 @@ def test_save_documents(self):
         text, _, metadata = data[0]
 
         # nickname was `None`, so it wasn't included in metadata
-        self.assertEqual(text, "apple fall tree")
+        self.assertEqual(text, "Apples are falling from trees 1")
         self.assertEqual(metadata["channel"], documents[0].metadata["channel"])
         self.assertEqual(
             metadata["author_username"], documents[0].metadata["author_username"]

From ed99d65c2c84b9302fde7f94eaa1b7783a961781 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 16:57:24 +0330
Subject: [PATCH 17/23] feat: using latest version of hivemind-backend lib!

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 2f96c76a..66ea62eb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ cohere==4.56
 neo4j>=5.14.1, <6.0.0
 python-dotenv>=1.0.0, <2.0.0
 urlextract==1.8.0
-tc-hivemind-backend==1.4.0
+tc-hivemind-backend==1.4.1
 traceloop-sdk>=0.15.2, <0.16.0
 beautifulsoup4==4.12.3
 llama-index-readers-notion==0.1.6

From 866088d4c5165b5cc4021db308fdc1832f57051c Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Mon, 9 Dec 2024 16:57:45 +0330
Subject: [PATCH 18/23] feat: checking if there's any valid date in the text!

---
 .../src/db/discord/utils/transform_discord_raw_messges.py   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
index 0cee659f..4688032e 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
@@ -18,6 +18,7 @@
     prepare_raction_ids,
 )
 from hivemind_etl_helpers.src.db.globals import DATE_FORMAT
+from tc_hivemind_backend.db.utils.preprocess_text import BasePreprocessor
 from llama_index.core import Document
 
 
@@ -214,9 +215,8 @@ def prepare_document(
     # if content_url_updated == "":
     #     raise ValueError("Message with Empty content!")
 
-    # TODO: clean the text with preprocessor and check if there's anything availale in it
-    if check_no_content_only_links(content):
-        raise ValueError("Message just did have urls")
+    if BasePreprocessor().extract_main_content(text=content):
+        raise ValueError("Message didn't hold any valuable information!")
 
     # removing null characters
     content = re.sub(r"[\x00-\x1F\x7F]", "", content)

From 96d7ab32fb86f67b617c7f3a6850cdd5365c2118 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Tue, 10 Dec 2024 10:08:37 +0330
Subject: [PATCH 19/23] feat: using smaller sized spacy model!

---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a30ed742..15303f18 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,12 +4,12 @@ COPY . .
 RUN chmod +x init.sh
 USER airflow
 RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt
-RUN python -m spacy download en_core_web_lg
+RUN python -m spacy download en_core_web_sm
 
 FROM python:3.11-bullseye AS test
 WORKDIR /project
 COPY . .
 RUN pip install -r requirements.txt
-RUN python -m spacy download en_core_web_lg
+RUN python -m spacy download en_core_web_sm
 RUN chmod +x docker-entrypoint.sh
 CMD ["./docker-entrypoint.sh"]

From 5634296d52658c67ac82675530db815dac927ca5 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Tue, 10 Dec 2024 11:26:44 +0330
Subject: [PATCH 20/23] fix: wrong if condition!

---
 .../src/db/discord/utils/transform_discord_raw_messges.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
index 4688032e..5fd57629 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
@@ -215,7 +215,7 @@ def prepare_document(
     # if content_url_updated == "":
     #     raise ValueError("Message with Empty content!")
 
-    if BasePreprocessor().extract_main_content(text=content):
+    if not BasePreprocessor().extract_main_content(text=content):
         raise ValueError("Message didn't hold any valuable information!")
 
     # removing null characters

From d3f8d7d851fc043a0abdfe7ec3971dc62ecbdf4e Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Tue, 10 Dec 2024 11:27:01 +0330
Subject: [PATCH 21/23] fix: aligning with latest codes!

---
 .../test_discord_prepare_document_from_db.py          | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
index 09e13050..87dbb4c4 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py
@@ -17,7 +17,7 @@ def setup_db(
         create_platform: bool = True,
         guild_id: str = "1234",
     ):
-        client = MongoSingleton.get_instance().client
+        client = MongoSingleton.get_instance().get_client()
 
         community_id = ObjectId("9f59dd4f38f3474accdc8f24")
         platform_id = ObjectId("063a2a74282db2c00fbc2428")
@@ -83,7 +83,7 @@ def setup_db(
             )
 
     def test_transform_two_data(self):
-        client = MongoSingleton.get_instance().client
+        client = MongoSingleton.get_instance().get_client()
 
         channels = ["111111", "22222"]
         guild_id = "1234"
@@ -99,7 +99,7 @@ def test_transform_two_data(self):
         data = {
             "type": 0,
             "author": "111",
-            "content": "test_message1",
+            "content": "test_message1 making it longer!",
             "user_mentions": [],
             "role_mentions": [],
             "reactions": [],
@@ -308,7 +308,6 @@ def test_transform_two_data(self):
             "date": datetime(2023, 5, 8).strftime("%Y-%m-%d %H:%M:%S"),
             "author_username": "user1",
             "author_global_name": "user1_GlobalName",
-            "url_reference": {"[URL0]": "https://www.google.com"},
             "thread": None,
             "url": "https://discord.com/channels/1234/111111/10000000003",
         }
@@ -318,7 +317,7 @@ def test_transform_two_data(self):
         self.assertDictEqual(documents[2].metadata, expected_metadata_2)
         self.assertDictEqual(documents[3].metadata, expected_metadata_3)
 
-        self.assertEqual(documents[0].text, "test_message1")
+        self.assertEqual(documents[0].text, "test_message1 making it longer!")
         self.assertEqual(documents[1].text, "mentioning a person user3")
         self.assertEqual(documents[2].text, "mentioning user3 user4 role1")
-        self.assertEqual(documents[3].text, "test_message1 [URL0]")
+        self.assertEqual(documents[3].text, "test_message1 https://www.google.com")

From ce0a9c230bf318fc0609c50f4d8accc42bc70ce8 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Tue, 10 Dec 2024 12:20:26 +0330
Subject: [PATCH 22/23] fix: using the fixed version of hivemind-backend lib!

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 66ea62eb..82ab0e11 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ cohere==4.56
 neo4j>=5.14.1, <6.0.0
 python-dotenv>=1.0.0, <2.0.0
 urlextract==1.8.0
-tc-hivemind-backend==1.4.1
+tc-hivemind-backend==1.4.2
 traceloop-sdk>=0.15.2, <0.16.0
 beautifulsoup4==4.12.3
 llama-index-readers-notion==0.1.6

From 67a3aae90356f78f2c0329fab4dd946b977004b6 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Tue, 10 Dec 2024 14:00:38 +0330
Subject: [PATCH 23/23] fix: isort linter issue!

---
 .../src/db/discord/utils/transform_discord_raw_messges.py      | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
index 5fd57629..5bd865ba 100644
--- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
+++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py
@@ -3,7 +3,6 @@
 from typing import Any
 
 from hivemind_etl_helpers.src.db.discord.utils.content_parser import (
-    check_no_content_only_links,
     remove_empty_str,
     remove_none_from_list,
 )
@@ -18,8 +17,8 @@
     prepare_raction_ids,
 )
 from hivemind_etl_helpers.src.db.globals import DATE_FORMAT
-from tc_hivemind_backend.db.utils.preprocess_text import BasePreprocessor
 from llama_index.core import Document
+from tc_hivemind_backend.db.utils.preprocess_text import BasePreprocessor
 
 
 def transform_discord_raw_messages(