TogetherCrew · amindadgar · Dec 10, 2024 · Dec 5, 2024 · Dec 5, 2024 · Dec 5, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -4,10 +4,12 @@ COPY . .
 RUN chmod +x init.sh
 USER airflow
 RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt
+RUN python -m spacy download en_core_web_lg
 
 FROM python:3.11-bullseye AS test
 WORKDIR /project
 COPY . .
 RUN pip install -r requirements.txt
+RUN python -m spacy download en_core_web_lg
 RUN chmod +x docker-entrypoint.sh
 CMD ["./docker-entrypoint.sh"]
diff --git a/dags/hivemind_etl_helpers/src/db/common/__init__.py b/dags/hivemind_etl_helpers/src/db/common/__init__.py
diff --git a/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py b/dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py
@@ -0,0 +1,43 @@
+import spacy
+
+
+class BasePreprocessor:
+    def __init__(self) -> None:
+        pass
+
+    def extract_main_content(self, text: str) -> str:
+        """
+        extract main content of a message
+
+        Parameters
+        ------------
+        text : str
+            a discord message text
+
+        Returns
+        --------
+        cleaned_text : str
+
+        """
+        try:
+            nlp = spacy.load("en_core_web_lg")
+        except OSError as exp:
+            raise OSError(f"Model spacy `en_core_web_lg` is not installed!") from exp
+
+        doc = nlp(text)
+
+        # Filter out punctuation, whitespace, and numerical values, then extract the lemma for each remaining token
+        main_content_tokens = [
+            token.lemma_
+            for token in doc
+            if not token.is_punct
+            and not token.is_space
+            and not token.is_stop
+            and not token.like_url
+            and not token.like_num
+            and token.is_ascii
+        ]
+
+        # Join the tokens to form the cleaned sentence
+        cleaned_text = " ".join(main_content_tokens)
+        return cleaned_text
diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py b/dags/hivemind_etl_helpers/src/db/discord/discord_raw_message_to_document.py
@@ -5,6 +5,7 @@
     transform_discord_raw_messages,
 )
 from llama_index.core import Document
+from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor
 
 
 def discord_raw_to_documents(
@@ -29,6 +30,37 @@ def discord_raw_to_documents(
         list of messages converted to documents
     """
     raw_mongo_messages = fetch_raw_messages(guild_id, selected_channels, from_date)
-    messages_docuemnt = transform_discord_raw_messages(guild_id, raw_mongo_messages)
+    processed_messages = update_raw_messages(raw_data=raw_mongo_messages)
+    messages_docuemnt = transform_discord_raw_messages(guild_id, processed_messages)
 
     return messages_docuemnt
+
+
+def update_raw_messages(raw_data: list[dict]) -> list[dict]:
+    """
+    Update raw messages text by cleaning their data
+
+    Parameters
+    -----------
+    data : list[dict]
+        a list of raw data fetched from database
+        each dict hold a 'content'
+
+    Returns
+    ---------
+    cleaned_data : list[dict]
+        a list of dictionaries but with cleaned data
+    """
+    preprocessor = DiscordPreprocessor()
+
+    cleaned_data: list[dict] = []
+    for data in raw_data:
+        content = data.get("content")
+        if content:
+            cleaned_content = preprocessor.clean_text(content)
+
+            if cleaned_content:
+                data["content"] = cleaned_content
+                cleaned_data.append(data)
+
+    return cleaned_data
diff --git a/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py b/dags/hivemind_etl_helpers/src/db/discord/fetch_raw_messages.py
@@ -7,6 +7,7 @@ def fetch_raw_messages(
     guild_id: str,
     selected_channels: list[str],
     from_date: datetime,
+    **kwargs,
 ) -> list[dict]:
     """
     fetch rawinfo messages from mongodb database
@@ -20,6 +21,10 @@ def fetch_raw_messages(
     from_date : datetime
         get the raw data from a specific date
         default is None, meaning get all the messages
+    kwargs : dict
+        min_word_limit : int
+            the message length (in characters) that the messages should exceed
+            default is 15 characters
 
     Returns
     --------
@@ -29,6 +34,8 @@ def fetch_raw_messages(
     client = MongoSingleton.get_instance().get_client()
     user_ids = get_real_users(guild_id)
 
+    min_word_limit = kwargs.get("min_word_limit", 15)
+
     cursor = (
         client[guild_id]["rawinfos"]
         .find(
@@ -38,6 +45,7 @@ def fetch_raw_messages(
                 "createdDate": {"$gte": from_date},
                 "isGeneratedByWebhook": False,
                 "channelId": {"$in": selected_channels},
+                "$expr": {"$gt": [{"$strLenCP": "$content"}, min_word_limit]},
             }
         )
         .sort("createdDate", 1)

diff --git a/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py b/dags/hivemind_etl_helpers/src/db/discord/preprocessor.py
@@ -0,0 +1,79 @@
+import re
+
+from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor
+
+
+class DiscordPreprocessor(BasePreprocessor):
+    """
+    preprocess discord text messages
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def clean_texts(self, texts: list[str]) -> list[str]:
+        """
+        clean the given text
+
+        Parameters
+        ------------
+        texts : list[str]
+            a list of discord messages text
+
+        Returns
+        ---------
+        texts_cleaned : str
+            the cleaned text
+            (discord ids removed)
+        """
+        texts_cleaned: list[str] = []
+
+        for text in texts:
+            text_cleaned = self.clean_text(text=text)
+            texts_cleaned.append(text_cleaned)
+
+        return texts_cleaned
+
+    def clean_text(self, text: str) -> str:
+        """
+        clean the given text
+
+        Parameters
+        ------------
+        text : str
+            a discord message text
+
+        Returns
+        ---------
+        text_cleaned : str
+            the cleaned text
+            (discord ids removed)
+        """
+        text_cleaned = self.remove_ids(text=text)
+        text_cleaned = self.extract_main_content(text=text_cleaned)
+
+        return text_cleaned
+
+    def remove_ids(self, text: str) -> str:
+        """
+        remove the ids that are available in texts
+        user ids would be with the format of <@number>
+        and the role ids are in the format of <@&number>
+
+        Parameters
+        ------------
+        text : str
+            a discord message text
+
+        Returns
+        ---------
+        cleaned_text : str
+            the texts with removed <@number> and <@&number>
+        """
+        pattern = r"<@&?\d+>"
+
+        # Removing matches
+        cleaned_text = re.sub(pattern, "", text)
+
+        cleaned_text = " ".join(cleaned_text.split())
+        return cleaned_text