Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added word max limit! #335

Merged
merged 23 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ba4c527
feat: Added word max limit!
amindadgar Dec 5, 2024
3a018eb
fix: test case to align with the latest codes!
amindadgar Dec 5, 2024
cdcea54
fix: naming typo!
amindadgar Dec 5, 2024
70459fc
fix: test case assertions!
amindadgar Dec 5, 2024
9b2bead
feat: Added noise cancelling for discord messages!
amindadgar Dec 9, 2024
3788374
fix: remove unnecessary print!
amindadgar Dec 9, 2024
b007eec
fix: Added missing spacy dependency!
amindadgar Dec 9, 2024
33d0fb4
feat: updating the messages content!
amindadgar Dec 9, 2024
32eaafb
fix: Added spacy model download in test environment!
amindadgar Dec 9, 2024
b2b6468
fix: aligning with latest codes!
amindadgar Dec 9, 2024
0a3a6df
fix: align input output data!
amindadgar Dec 9, 2024
5b4057d
fix: Added more informative messages!
amindadgar Dec 9, 2024
cd4e0c7
feat: removed the processors!
amindadgar Dec 9, 2024
d0fb008
fix: remove the unrequired processor!
amindadgar Dec 9, 2024
7dfc963
feat: removed unused test cases!
amindadgar Dec 9, 2024
077ab8e
feat: updated data with not removing the urls from actual content!
amindadgar Dec 9, 2024
ed99d65
feat: using latest version of hivemind-backend lib!
amindadgar Dec 9, 2024
866088d
feat: checking if there's any valid date in the text!
amindadgar Dec 9, 2024
96d7ab3
feat: using smaller sized spacy model!
amindadgar Dec 10, 2024
5634296
fix: wrong if condition!
amindadgar Dec 10, 2024
d3f8d7d
fix: aligning with latest codes!
amindadgar Dec 10, 2024
ce0a9c2
fix: using the fixed version of hivemind-backend lib!
amindadgar Dec 10, 2024
67a3aae
fix: isort linter issue!
amindadgar Dec 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def fetch_raw_messages(
guild_id: str,
selected_channels: list[str],
from_date: datetime,
**kwargs,
) -> list[dict]:
"""
fetch rawinfo messages from mongodb database
Expand All @@ -20,6 +21,10 @@ def fetch_raw_messages(
from_date : datetime
get the raw data from a specific date
default is None, meaning get all the messages
kwargs : dict
min_word_limit : int
the minimum length that the messages should exceed
(the filter compares the character count of the message content)
default is 8 characters

Returns
--------
Expand All @@ -29,6 +34,8 @@ def fetch_raw_messages(
client = MongoSingleton.get_instance().get_client()
user_ids = get_real_users(guild_id)

limit_max_words = kwargs.get("min_word_limit", 8)
amindadgar marked this conversation as resolved.
Show resolved Hide resolved

cursor = (
client[guild_id]["rawinfos"]
.find(
Expand All @@ -38,6 +45,7 @@ def fetch_raw_messages(
"createdDate": {"$gte": from_date},
"isGeneratedByWebhook": False,
"channelId": {"$in": selected_channels},
"$where": f"this.content.length > {limit_max_words}",
}
)
.sort("createdDate", 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,3 +286,177 @@ def test_fetch_raw_messages_fetch_from_date(self):

# Check if the fetched messages are equal to the expected messages
self.assertCountEqual(messages, expected_messages)

def test_fetch_raw_messages_fetch_limited_characters(self):
    """
    fetch raw messages using the default minimum-length filter

    Two messages are stored: one with 2 characters and one with more than
    8 characters. No `min_word_limit` is passed, so the default limit of
    `fetch_raw_messages` applies and only the longer message is expected.
    """
    client = MongoSingleton.get_instance().client

    guild_id = "1234"
    channels = ["111111", "22222"]
    users_id = ["user1", "user2", "user3", "user4", "user5"]
    self.setup_db(
        channels=channels,
        guild_id=guild_id,
    )

    # user3 is the only bot; every other member is a regular user
    for user in users_id:
        client[guild_id]["guildmembers"].insert_one(
            {
                "discordId": user,
                "username": f"username_{user}",
                "roles": None,
                "joinedAt": datetime(2023, 1, 1),
                "avatar": None,
                "isBot": user == "user3",
                "discriminator": "0",
                "permissions": None,
                "deletedAt": None,
                "globalName": None,
                "nickname": None,
            }
        )

    # Dropping any previous data
    client[guild_id].drop_collection("rawinfos")

    # Insert one short and one long message, both created on the same date
    contents = ["AA", "A sample text with more than 8 characters!"]
    raw_data = [
        {
            "type": 0,
            "author": author,
            "content": content,
            "user_mentions": [],
            "role_mentions": [],
            "reactions": [],
            "replied_user": None,
            "createdDate": datetime(2023, 10, 1),
            "messageId": str(np.random.randint(1000000, 9999999)),
            "channelId": channel,
            "channelName": f"general {channel}",
            "threadId": None,
            "threadName": None,
            "isGeneratedByWebhook": False,
        }
        for author, content, channel in zip(users_id[:2], contents, channels)
    ]

    client[guild_id]["rawinfos"].insert_many(raw_data)

    # `from_date` is a required argument; use a date before both messages
    # so that only the length filter decides what is returned
    messages = fetch_raw_messages(
        guild_id,
        selected_channels=channels,
        from_date=datetime(2023, 9, 1),
    )
    # Only the message longer than the default limit should be fetched
    self.assertEqual(len(messages), 1)

amindadgar marked this conversation as resolved.
Show resolved Hide resolved
def test_fetch_raw_messages_fetch_limited_characters_specified(self):
    """
    fetch raw messages using an explicitly specified minimum-length filter

    Two messages are stored: one with 2 characters and one with more than
    8 characters. With `min_word_limit=1` both messages are longer than
    the limit, so both are expected to be returned.

    Note: this test previously shared its name with another test method,
    which would make one definition replace the other in a class body; the
    `_specified` suffix keeps both tests discoverable.
    """
    client = MongoSingleton.get_instance().client

    guild_id = "1234"
    channels = ["111111", "22222"]
    users_id = ["user1", "user2", "user3", "user4", "user5"]
    self.setup_db(
        channels=channels,
        guild_id=guild_id,
    )

    # user3 is the only bot; every other member is a regular user
    for user in users_id:
        client[guild_id]["guildmembers"].insert_one(
            {
                "discordId": user,
                "username": f"username_{user}",
                "roles": None,
                "joinedAt": datetime(2023, 1, 1),
                "avatar": None,
                "isBot": user == "user3",
                "discriminator": "0",
                "permissions": None,
                "deletedAt": None,
                "globalName": None,
                "nickname": None,
            }
        )

    # Dropping any previous data
    client[guild_id].drop_collection("rawinfos")

    # Insert one short and one long message, both created on the same date
    contents = ["AA", "A sample text with more than 8 characters!"]
    raw_data = [
        {
            "type": 0,
            "author": author,
            "content": content,
            "user_mentions": [],
            "role_mentions": [],
            "reactions": [],
            "replied_user": None,
            "createdDate": datetime(2023, 10, 1),
            "messageId": str(np.random.randint(1000000, 9999999)),
            "channelId": channel,
            "channelName": f"general {channel}",
            "threadId": None,
            "threadName": None,
            "isGeneratedByWebhook": False,
        }
        for author, content, channel in zip(users_id[:2], contents, channels)
    ]

    client[guild_id]["rawinfos"].insert_many(raw_data)

    # `from_date` is a required argument; use a date before both messages
    # so that only the length filter decides what is returned
    messages = fetch_raw_messages(
        guild_id,
        selected_channels=channels,
        from_date=datetime(2023, 9, 1),
        min_word_limit=1,
    )
    # Both messages are longer than the specified limit of 1
    self.assertEqual(len(messages), 2)
amindadgar marked this conversation as resolved.
Show resolved Hide resolved
Loading