Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: exclude embedding and llm metadata! #47

Merged
merged 14 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def fetch_raw_messages(guild_id: str, from_date: datetime | None = None) -> list
client[guild_id]["rawinfos"]
.find(
{
"type": {"$ne": 18},
"createdDate": {"$gte": from_date},
"isGeneratedByWebhook": False,
"channelId": {"$in": channels},
Expand All @@ -43,6 +44,7 @@ def fetch_raw_messages(guild_id: str, from_date: datetime | None = None) -> list
client[guild_id]["rawinfos"]
.find(
{
"type": {"$ne": 18},
"isGeneratedByWebhook": False,
"channelId": {"$in": channels},
"createdDate": {"$gte": from_date_modules},
Expand Down Expand Up @@ -98,6 +100,7 @@ def fetch_raw_msg_grouped(
pipeline.append(
{
"$match": {
"type": {"$ne": 18},
"createdDate": {
"$gte": from_date,
"$lt": datetime.now().replace(
Expand All @@ -113,6 +116,7 @@ def fetch_raw_msg_grouped(
pipeline.append(
{
"$match": {
"type": {"$ne": 18},
"createdDate": {
"$gte": from_date_modules,
"$lt": datetime.now().replace(
Expand Down
61 changes: 61 additions & 0 deletions dags/hivemind_etl_helpers/src/db/discord/utils/content_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re


def remove_empty_str(data: list[str]) -> list[str]:
    """
    a utility function to remove the empty string from a list

    The filtering mutates the given list in-place and also returns it,
    so callers relying on either the return value or the mutation keep working.

    Parameters
    -----------
    data : list[str]
        a list with string values

    Returns
    --------
    data : list[str]
        the same list object with every empty string removed
    """
    # Single O(n) pass via slice assignment instead of the quadratic
    # `while "" in data: data.remove("")` (each `in` scan + `remove` is O(n)).
    data[:] = [value for value in data if value != ""]
    return data


def check_no_content_only_links(content: str, link_pattern: str = r"\[URL\d\]") -> bool:
    """
    check whether the given message holds nothing but links (no written content)

    Parameters
    -----------
    content : str
        the message content
    link_pattern : str
        the pattern of link
        default pattern is for links like `[URL0]`, or `[URL1]`, etc

    Returns
    --------
    no_content : bool
        if `True` then there was no content but the links in the given string
    """
    # Strip every link placeholder, then look for any remaining latin letter;
    # digits/punctuation alone still count as "no content".
    stripped = re.compile(link_pattern).sub("", content)
    has_letters = re.search(r"[a-zA-Z]", stripped) is not None
    return not has_letters


def remove_none_from_list(data: list[str | None]) -> list[str]:
"""
remove the `None` values from a list

Parameters
-----------
data : list[str | None]
the list of data to process

Returns
--------
data_processed : list[str]
the data just removed the `None` values
"""
data_processed = [value for value in data if value is not None]
return data_processed
Original file line number Diff line number Diff line change
@@ -1,27 +1,4 @@
from urllib.parse import urlparse


def extract_urls(text: str) -> list[str]:
    """
    extract the urls within the text and just return the urls

    Parameters
    ------------
    text : str
        the raw text

    Returns
    ---------
    urls : list[str]
        the list of urls within the text
    """

    def _rebuild(parsed) -> str:
        # NOTE: query strings and fragments are dropped on purpose here —
        # only scheme, host and path are kept.
        return parsed.scheme + "://" + parsed.netloc + parsed.path

    # A whitespace-separated token counts as a url only when urlparse finds
    # both a scheme and a host in it.
    parsed_words = (urlparse(word) for word in text.split())
    return [_rebuild(parsed) for parsed in parsed_words if parsed.scheme and parsed.netloc]
from urlextract import URLExtract


def prepare_raw_message_urls(message: str) -> tuple[str, dict[str, str]]:
Expand All @@ -42,7 +19,7 @@ def prepare_raw_message_urls(message: str) -> tuple[str, dict[str, str]]:
the url reference dict that keys are reference name
and values are the actual url
"""
msg_urls = extract_urls(message)
msg_urls = URLExtract().find_urls(message)

references: dict[str, str] = {}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import logging
from typing import Any

from hivemind_etl_helpers.src.db.discord.utils.content_parser import (
check_no_content_only_links,
remove_empty_str,
remove_none_from_list,
)
from hivemind_etl_helpers.src.db.discord.utils.id_transform import convert_role_id
from hivemind_etl_helpers.src.db.discord.utils.merge_user_ids_fetch_names import (
merge_user_ids_and_fetch_names,
Expand Down Expand Up @@ -158,6 +163,8 @@ def prepare_document(
"channel": message["channelName"],
"date": message["createdDate"].strftime("%Y-%m-%d %H:%M:%S"),
"author_username": author_name[0],
# always including the thread_name, if `None`, then it was a channel message
"thread_name": message["threadName"],
}
if author_global_name[0] is not None:
msg_meta_data["author_global_name"] = author_global_name[0]
Expand Down Expand Up @@ -185,8 +192,6 @@ def prepare_document(
msg_meta_data["reactors_nicknames"] = reactors_nickname
if url_reference != {}:
msg_meta_data["url_reference"] = url_reference
if message["threadName"] is not None:
msg_meta_data["thread_name"] = message["threadName"]

if replier_name is not None:
msg_meta_data["replier_username"] = replier_name[0]
Expand All @@ -197,43 +202,49 @@ def prepare_document(
if role_names != []:
msg_meta_data["role_mentions"] = role_names

if content_url_updated == "":
raise ValueError("Message with Empty content!")

if check_no_content_only_links(content_url_updated):
raise ValueError("Message just did have urls")

doc: Document
if not exclude_metadata:
doc = Document(text=content_url_updated, metadata=msg_meta_data)
doc.excluded_embed_metadata_keys = [
"channel",
"date",
"author_username",
"author_global_name",
"author_nickname",
"mention_usernames",
"mention_global_names",
"mention_nicknames",
"reactors_username",
"reactors_global_name",
"reactors_nicknames",
"thread_name",
"url_reference",
"replier_username",
"replier_global_name",
"replier_nickname",
"role_mentions",
]
doc.excluded_llm_metadata_keys = [
"mention_usernames",
"mention_global_names",
"mention_nicknames",
"reactors_username",
"reactors_global_name",
"reactors_nicknames",
"thread_name",
"url_reference",
"replier_username",
"replier_global_name",
"replier_nickname",
"role_mentions",
]
else:
doc = Document(text=content_url_updated)

return doc


def remove_empty_str(data: list[str]):
    """
    a utility function to remove the empty string from a list

    The given list is mutated in-place and also returned.

    Parameters
    -----------
    data : list[str]
        a list with string values
    """
    # EAFP: keep removing the first empty string until `remove` raises,
    # which is equivalent to looping `while "" in data`.
    while True:
        try:
            data.remove("")
        except ValueError:
            break
    return data


def remove_none_from_list(data: list[str | None]) -> list[str]:
"""
remove the `None` values from a list

Parameters
-----------
data : list[str | None]
the list of data to process

Returns
--------
data_processed : list[str]
the data just removed the `None` values
"""
data_processed = [value for value in data if value is not None]
return data_processed
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def setup_db(
)

def test_fetch_channels(self):
guild_id = "12345"
guild_id = "1234"
channels = ["111111", "22222"]
self.setup_db(
create_modules=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 10, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 10, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 1, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down Expand Up @@ -265,6 +266,7 @@ def test_transform_two_data(self):
"date": datetime(2023, 5, 1).strftime("%Y-%m-%d %H:%M:%S"),
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"thread_name": None,
}

expected_metadata_1 = {
Expand All @@ -277,6 +279,7 @@ def test_transform_two_data(self):
"mention_nicknames": ["user3_nickname"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"thread_name": None,
}

expected_metadata_2 = {
Expand All @@ -299,8 +302,9 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"url_reference": {"[URL0]": "https://www.google.com"},
"thread_name": None,
}
print(documents[3].metadata)
print(documents[0].text)
self.assertDictEqual(documents[0].metadata, expected_metadata_0)
self.assertDictEqual(documents[1].metadata, expected_metadata_1)
self.assertDictEqual(documents[2].metadata, expected_metadata_2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 10, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"thread_name": None,
}

expected_metadata_1 = {
Expand All @@ -179,6 +180,7 @@ def test_transform_two_data(self):
"mention_global_names": ["user3_GlobalName", "user4_GlobalName"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"thread_name": None,
}

expected_metadata_2 = {
Expand All @@ -201,6 +203,7 @@ def test_transform_two_data(self):
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"url_reference": {"[URL0]": "https://www.google.com"},
"thread_name": None,
}

self.assertDictEqual(documents[0].metadata, expected_metadata_0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_normal_messag_single_url_https(self):
self.assertEqual(msg_updated, "Here you can have a look [URL0]")
self.assertEqual(url_reference, {"[URL0]": "https://google.com"})

def test_normal_messag_multiple_url(self):
def test_normal_message_multiple_url(self):
msg = "Here you can have a look https://google.com https://example.com"

msg_updated, url_reference = prepare_raw_message_urls(msg)
Expand All @@ -32,3 +32,14 @@ def test_normal_messag_multiple_url(self):
url_reference,
{"[URL0]": "https://google.com", "[URL1]": "https://example.com"},
)

def test_message_multiple_url_wrapped(self):
    # Fixed typo in method name: `wrappend` -> `wrapped`.
    # Urls wrapped in angle brackets (Discord's no-embed syntax) should
    # still be detected and replaced with `[URLn]` placeholders, while the
    # surrounding brackets stay in the message text.
    msg = "Here you can have a look <https://google.com> <https://example.com>"

    msg_updated, url_reference = prepare_raw_message_urls(msg)

    self.assertEqual(msg_updated, "Here you can have a look <[URL0]> <[URL1]>")
    self.assertEqual(
        url_reference,
        {"[URL0]": "https://google.com", "[URL1]": "https://example.com"},
    )
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 10, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def setup_mongo_information(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 1, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Loading
Loading