Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added word max limit! #335

Merged
merged 23 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ba4c527
feat: Added word max limit!
amindadgar Dec 5, 2024
3a018eb
fix: test case to align with the latest codes!
amindadgar Dec 5, 2024
cdcea54
fix: naming typo!
amindadgar Dec 5, 2024
70459fc
fix: test case assertions!
amindadgar Dec 5, 2024
9b2bead
feat: Added noise cancelling for discord messages!
amindadgar Dec 9, 2024
3788374
fix: remove unnecessary print!
amindadgar Dec 9, 2024
b007eec
fix: Added missing spacy dependency!
amindadgar Dec 9, 2024
33d0fb4
feat: updating the messages content!
amindadgar Dec 9, 2024
32eaafb
fix: Added spacy model download in test environment!
amindadgar Dec 9, 2024
b2b6468
fix: aligning with latest codes!
amindadgar Dec 9, 2024
0a3a6df
fix: align input output data!
amindadgar Dec 9, 2024
5b4057d
fix: Added more informative messages!
amindadgar Dec 9, 2024
cd4e0c7
feat: removed the processors!
amindadgar Dec 9, 2024
d0fb008
fix: remove the unrequired processor!
amindadgar Dec 9, 2024
7dfc963
feat: removed unused test cases!
amindadgar Dec 9, 2024
077ab8e
feat: updated data with not removing the urls from actual content!
amindadgar Dec 9, 2024
ed99d65
feat: using latest version of hivemind-backend lib!
amindadgar Dec 9, 2024
866088d
feat: checking if there's any valid date in the text!
amindadgar Dec 9, 2024
96d7ab3
feat: using smaller sized spacy model!
amindadgar Dec 10, 2024
5634296
fix: wrong if condition!
amindadgar Dec 10, 2024
d3f8d7d
fix: aligning with latest codes!
amindadgar Dec 10, 2024
ce0a9c2
fix: using the fixed version of hivemind-backend lib!
amindadgar Dec 10, 2024
67a3aae
fix: isort linter issue!
amindadgar Dec 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ COPY . .
RUN chmod +x init.sh
USER airflow
RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt
RUN python -m spacy download en_core_web_sm

FROM python:3.11-bullseye AS test
WORKDIR /project
COPY . .
RUN pip install -r requirements.txt
RUN python -m spacy download en_core_web_sm
RUN chmod +x docker-entrypoint.sh
CMD ["./docker-entrypoint.sh"]
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def fetch_raw_messages(
guild_id: str,
selected_channels: list[str],
from_date: datetime,
**kwargs,
) -> list[dict]:
"""
fetch rawinfo messages from mongodb database
Expand All @@ -20,6 +21,10 @@ def fetch_raw_messages(
from_date : datetime
get the raw data from a specific date
default is None, meaning get all the messages
kwargs : dict
min_word_limit : int
the minimum number of characters a message's content should exceed
(despite the name, the length is measured in characters, not words)
default is 15 characters

Returns
--------
Expand All @@ -29,6 +34,8 @@ def fetch_raw_messages(
client = MongoSingleton.get_instance().get_client()
user_ids = get_real_users(guild_id)

min_word_limit = kwargs.get("min_word_limit", 15)

cursor = (
client[guild_id]["rawinfos"]
.find(
Expand All @@ -38,6 +45,7 @@ def fetch_raw_messages(
"createdDate": {"$gte": from_date},
"isGeneratedByWebhook": False,
"channelId": {"$in": selected_channels},
"$expr": {"$gt": [{"$strLenCP": "$content"}, min_word_limit]},
}
)
.sort("createdDate", 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import Any

from hivemind_etl_helpers.src.db.discord.utils.content_parser import (
check_no_content_only_links,
remove_empty_str,
remove_none_from_list,
)
Expand All @@ -14,14 +13,12 @@
from hivemind_etl_helpers.src.db.discord.utils.prepare_raw_message_ids import (
prepare_raw_message_ids,
)
from hivemind_etl_helpers.src.db.discord.utils.prepare_raw_message_urls import (
prepare_raw_message_urls,
)
from hivemind_etl_helpers.src.db.discord.utils.prepare_reactions_id import (
prepare_raction_ids,
)
from hivemind_etl_helpers.src.db.globals import DATE_FORMAT
from llama_index.core import Document
from tc_hivemind_backend.db.utils.preprocess_text import BasePreprocessor


def transform_discord_raw_messages(
Expand Down Expand Up @@ -160,7 +157,7 @@ def prepare_document(
roles=dict(zip(role_ids, role_names)),
users=dict(zip(mention_ids, mention_names)),
)
content_url_updated, url_reference = prepare_raw_message_urls(content)
# content_url_updated, url_reference = prepare_raw_message_urls(content)
amindadgar marked this conversation as resolved.
Show resolved Hide resolved

# always has length 1
assert len(author_name) == 1, "Either None or multiple authors!"
Expand Down Expand Up @@ -202,8 +199,8 @@ def prepare_document(
msg_meta_data["reactors_global_name"] = reactors_gname
if reactors_nickname != []:
msg_meta_data["reactors_nicknames"] = reactors_nickname
if url_reference != {}:
msg_meta_data["url_reference"] = url_reference
# if url_reference != {}:
# msg_meta_data["url_reference"] = url_reference

if replier_name is not None:
msg_meta_data["replier_username"] = replier_name[0]
Expand All @@ -214,18 +211,18 @@ def prepare_document(
if role_names != []:
msg_meta_data["role_mentions"] = role_names

if content_url_updated == "":
raise ValueError("Message with Empty content!")
# if content_url_updated == "":
# raise ValueError("Message with Empty content!")

if check_no_content_only_links(content_url_updated):
raise ValueError("Message just did have urls")
if not BasePreprocessor().extract_main_content(text=content):
raise ValueError("Message didn't hold any valuable information!")

# removing ASCII control characters (null, tab, newline, DEL, etc.)
content_url_updated = re.sub(r"[\x00-\x1F\x7F]", "", content_url_updated)
content = re.sub(r"[\x00-\x1F\x7F]", "", content)

doc: Document
if not exclude_metadata:
doc = Document(text=content_url_updated, metadata=msg_meta_data)
doc = Document(text=content, metadata=msg_meta_data)
doc.excluded_embed_metadata_keys = [
"channel",
"date",
Expand All @@ -239,7 +236,7 @@ def prepare_document(
"reactors_global_name",
"reactors_nicknames",
"thread",
"url_reference",
# "url_reference",
"replier_username",
"replier_global_name",
"replier_nickname",
Expand All @@ -256,14 +253,14 @@ def prepare_document(
"reactors_global_name",
"reactors_nicknames",
"thread",
"url_reference",
# "url_reference",
"replier_username",
"replier_global_name",
"replier_nickname",
"role_mentions",
"url",
]
else:
doc = Document(text=content_url_updated)
doc = Document(text=content)

return doc
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def test_fetch_raw_messages_fetch_all(self):
data = {
"type": 0,
"author": users_id[i],
"content": f"test_message {np.random.randint(0, 10)}",
"content": f"{np.random.randint(0, 10)} Apples are falling from trees!",
"user_mentions": [],
"role_mentions": [],
"reactions": [],
Expand Down Expand Up @@ -247,7 +247,7 @@ def test_fetch_raw_messages_fetch_from_date(self):
data = {
"type": 0,
"author": users_id[i],
"content": f"test_message {np.random.randint(0, 10)}",
"content": f"Apples falling from trees {np.random.randint(0, 10)}",
"user_mentions": [],
"role_mentions": [],
"reactions": [],
Expand Down Expand Up @@ -286,3 +286,181 @@ def test_fetch_raw_messages_fetch_from_date(self):

# Check if the fetched messages are equal to the expected messages
self.assertCountEqual(messages, expected_messages)

def test_fetch_raw_messages_fetch_limited_characters(self):
    """
    Fetch raw messages relying on the default content-length filter.

    Two messages are stored: a two-character one ("AA") and a longer
    sentence. `fetch_raw_messages` is called without `min_word_limit`
    and exactly one message is expected back (presumably the longer one;
    only the count is asserted).
    """
    client = MongoSingleton.get_instance().client

    guild_id = "1234"
    channels = ["111111", "22222"]
    users_id = ["user1", "user2", "user3", "user4", "user5"]
    self.setup_db(
        channels=channels,
        guild_id=guild_id,
    )

    # Register the guild members; "user3" is the only one flagged as a bot
    for member in users_id:
        client[guild_id]["guildmembers"].insert_one(
            {
                "discordId": member,
                "username": f"username_{member}",
                "roles": None,
                "joinedAt": datetime(2023, 1, 1),
                "avatar": None,
                "isBot": member == "user3",
                "discriminator": "0",
                "permissions": None,
                "deletedAt": None,
                "globalName": None,
                "nickname": None,
            }
        )

    # Start from an empty messages collection
    client[guild_id].drop_collection("rawinfos")

    # (author, content, channel) for a very short and a longer message;
    # both share the same creation date
    message_specs = [
        (users_id[0], "AA", channels[0]),
        (
            users_id[1],
            "A sample text with more than 15 characters!",
            channels[1],
        ),
    ]
    raw_data = [
        {
            "type": 0,
            "author": author,
            "content": content,
            "user_mentions": [],
            "role_mentions": [],
            "reactions": [],
            "replied_user": None,
            "createdDate": datetime(2023, 10, 1),
            "messageId": str(np.random.randint(1000000, 9999999)),
            "channelId": channel,
            "channelName": f"general {channel}",
            "threadId": None,
            "threadName": None,
            "isGeneratedByWebhook": False,
        }
        for author, content, channel in message_specs
    ]

    client[guild_id]["rawinfos"].insert_many(raw_data)

    messages = fetch_raw_messages(
        guild_id,
        selected_channels=channels,
        from_date=datetime(2023, 9, 20),
    )
    # Only one of the two stored messages should pass the default filter
    self.assertEqual(len(messages), 1)

def test_fetch_raw_messages_fetch_limited_characters_specified(self):
    """
    Fetch raw messages with an explicitly supplied content-length filter.

    Two messages are stored: a two-character one ("AA") and a longer
    sentence. `fetch_raw_messages` is called with `min_word_limit=1`
    and both messages are expected back.
    """
    client = MongoSingleton.get_instance().client

    guild_id = "1234"
    channels = ["111111", "22222"]
    users_id = ["user1", "user2", "user3", "user4", "user5"]
    self.setup_db(
        channels=channels,
        guild_id=guild_id,
    )

    # Register the guild members; "user3" is the only one flagged as a bot
    for member in users_id:
        client[guild_id]["guildmembers"].insert_one(
            {
                "discordId": member,
                "username": f"username_{member}",
                "roles": None,
                "joinedAt": datetime(2023, 1, 1),
                "avatar": None,
                "isBot": member == "user3",
                "discriminator": "0",
                "permissions": None,
                "deletedAt": None,
                "globalName": None,
                "nickname": None,
            }
        )

    # Start from an empty messages collection
    client[guild_id].drop_collection("rawinfos")

    # (author, content, channel) for a very short and a longer message;
    # both share the same creation date
    message_specs = [
        (users_id[0], "AA", channels[0]),
        (
            users_id[1],
            "A sample text with more than 8 characters!",
            channels[1],
        ),
    ]
    raw_data = [
        {
            "type": 0,
            "author": author,
            "content": content,
            "user_mentions": [],
            "role_mentions": [],
            "reactions": [],
            "replied_user": None,
            "createdDate": datetime(2023, 10, 1),
            "messageId": str(np.random.randint(1000000, 9999999)),
            "channelId": channel,
            "channelName": f"general {channel}",
            "threadId": None,
            "threadName": None,
            "isGeneratedByWebhook": False,
        }
        for author, content, channel in message_specs
    ]

    client[guild_id]["rawinfos"].insert_many(raw_data)

    # Fetch everything since September 20, 2023 with a lowered limit
    messages = fetch_raw_messages(
        guild_id,
        selected_channels=channels,
        from_date=datetime(2023, 9, 20),
        min_word_limit=1,
    )
    # With the limit set to 1, both stored messages should be returned
    self.assertEqual(len(messages), 2)
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def setup_db(
create_platform: bool = True,
guild_id: str = "1234",
):
client = MongoSingleton.get_instance().client
client = MongoSingleton.get_instance().get_client()

community_id = ObjectId("9f59dd4f38f3474accdc8f24")
platform_id = ObjectId("063a2a74282db2c00fbc2428")
Expand Down Expand Up @@ -83,7 +83,7 @@ def setup_db(
)

def test_transform_two_data(self):
client = MongoSingleton.get_instance().client
client = MongoSingleton.get_instance().get_client()

channels = ["111111", "22222"]
guild_id = "1234"
Expand All @@ -99,7 +99,7 @@ def test_transform_two_data(self):
data = {
"type": 0,
"author": "111",
"content": "test_message1",
"content": "test_message1 making it longer!",
"user_mentions": [],
"role_mentions": [],
"reactions": [],
Expand Down Expand Up @@ -308,18 +308,16 @@ def test_transform_two_data(self):
"date": datetime(2023, 5, 8).strftime("%Y-%m-%d %H:%M:%S"),
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"url_reference": {"[URL0]": "https://www.google.com"},
"thread": None,
"url": "https://discord.com/channels/1234/111111/10000000003",
}
print(documents[0].text)

self.assertDictEqual(documents[0].metadata, expected_metadata_0)
self.assertDictEqual(documents[1].metadata, expected_metadata_1)
self.assertDictEqual(documents[2].metadata, expected_metadata_2)
self.assertDictEqual(documents[3].metadata, expected_metadata_3)

# Optionally, you can also check the text separately if needed
self.assertEqual(documents[0].text, "test_message1")
self.assertEqual(documents[0].text, "test_message1 making it longer!")
self.assertEqual(documents[1].text, "mentioning a person user3")
self.assertEqual(documents[2].text, "mentioning user3 user4 role1")
self.assertEqual(documents[3].text, "test_message1 [URL0]")
self.assertEqual(documents[3].text, "test_message1 https://www.google.com")
Loading
Loading