Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added word max limit! #335

Merged
merged 23 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ba4c527
feat: Added word max limit!
amindadgar Dec 5, 2024
3a018eb
fix: test case to align with the latest codes!
amindadgar Dec 5, 2024
cdcea54
fix: naming typo!
amindadgar Dec 5, 2024
70459fc
fix: test case assertions!
amindadgar Dec 5, 2024
9b2bead
feat: Added noise cancelling for discord messages!
amindadgar Dec 9, 2024
3788374
fix: remove unnecessary print!
amindadgar Dec 9, 2024
b007eec
fix: Added missing spacy dependency!
amindadgar Dec 9, 2024
33d0fb4
feat: updating the messages content!
amindadgar Dec 9, 2024
32eaafb
fix: Added spacy model download in test environment!
amindadgar Dec 9, 2024
b2b6468
fix: aligning with latest codes!
amindadgar Dec 9, 2024
0a3a6df
fix: align input output data!
amindadgar Dec 9, 2024
5b4057d
fix: Added more informative messages!
amindadgar Dec 9, 2024
cd4e0c7
feat: removed the processors!
amindadgar Dec 9, 2024
d0fb008
fix: remove the unrequired processor!
amindadgar Dec 9, 2024
7dfc963
feat: removed unused test cases!
amindadgar Dec 9, 2024
077ab8e
feat: updated data with not removing the urls from actual content!
amindadgar Dec 9, 2024
ed99d65
feat: using latest version of hivemind-backend lib!
amindadgar Dec 9, 2024
866088d
feat: checking if there's any valid date in the text!
amindadgar Dec 9, 2024
96d7ab3
feat: using smaller sized spacy model!
amindadgar Dec 10, 2024
5634296
fix: wrong if condition!
amindadgar Dec 10, 2024
d3f8d7d
fix: aligning with latest codes!
amindadgar Dec 10, 2024
ce0a9c2
fix: using the fixed version of hivemind-backend lib!
amindadgar Dec 10, 2024
67a3aae
fix: isort linter issue!
amindadgar Dec 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ COPY . .
RUN chmod +x init.sh
USER airflow
RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt
RUN python -m spacy download en_core_web_lg
amindadgar marked this conversation as resolved.
Show resolved Hide resolved

# Test stage: copies the whole project into /project and installs its requirements
FROM python:3.11-bullseye AS test
WORKDIR /project
COPY . .
RUN pip install -r requirements.txt
# Fetch the `en_core_web_lg` spaCy model that BasePreprocessor loads at runtime
RUN python -m spacy download en_core_web_lg
# Make the entrypoint executable so CMD can invoke it directly
RUN chmod +x docker-entrypoint.sh
CMD ["./docker-entrypoint.sh"]
Empty file.
43 changes: 43 additions & 0 deletions dags/hivemind_etl_helpers/src/db/common/base_preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import spacy


class BasePreprocessor:
    """
    shared text-cleaning logic for the platform preprocessors

    The spaCy pipeline is loaded lazily on first use and cached on this
    class, so it is not reloaded for every message that gets cleaned.
    """

    _SPACY_MODEL_NAME = "en_core_web_lg"
    # cache for the loaded pipeline; filled by `_get_nlp` on first use and
    # shared by all instances and subclasses
    _nlp = None

    def __init__(self) -> None:
        pass

    @staticmethod
    def _get_nlp():
        """
        get the spaCy pipeline, loading it on the first call only

        Returns
        --------
        nlp : spacy.language.Language
            the loaded `en_core_web_lg` pipeline

        Raises
        -------
        OSError
            if the spaCy model is not installed
        """
        # Stored on BasePreprocessor itself (not on the subclass) so every
        # preprocessor shares one loaded model.
        if BasePreprocessor._nlp is None:
            try:
                BasePreprocessor._nlp = spacy.load(BasePreprocessor._SPACY_MODEL_NAME)
            except OSError as exp:
                raise OSError(
                    "Model spacy `en_core_web_lg` is not installed!"
                ) from exp
        return BasePreprocessor._nlp

    def extract_main_content(self, text: str) -> str:
        """
        extract main content of a message

        Parameters
        ------------
        text : str
            a discord message text

        Returns
        --------
        cleaned_text : str
            the lemmas of the kept tokens joined by single spaces
            (an empty string when no token is kept)

        Raises
        -------
        OSError
            if the spaCy model is not installed
        """
        nlp = self._get_nlp()
        doc = nlp(text)

        # Drop punctuation, whitespace, stop words, URL-like and number-like
        # tokens as well as non-ASCII tokens, then take the lemma of each
        # remaining token
        main_content_tokens = [
            token.lemma_
            for token in doc
            if not token.is_punct
            and not token.is_space
            and not token.is_stop
            and not token.like_url
            and not token.like_num
            and token.is_ascii
        ]

        # Join the tokens to form the cleaned sentence
        return " ".join(main_content_tokens)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
transform_discord_raw_messages,
)
from llama_index.core import Document
from hivemind_etl_helpers.src.db.discord.preprocessor import DiscordPreprocessor


def discord_raw_to_documents(
Expand All @@ -29,6 +30,37 @@ def discord_raw_to_documents(
list of messages converted to documents
"""
raw_mongo_messages = fetch_raw_messages(guild_id, selected_channels, from_date)
messages_docuemnt = transform_discord_raw_messages(guild_id, raw_mongo_messages)
processed_messages = update_raw_messages(raw_data=raw_mongo_messages)
messages_docuemnt = transform_discord_raw_messages(guild_id, processed_messages)

return messages_docuemnt


def update_raw_messages(raw_data: list[dict]) -> list[dict]:
    """
    Build cleaned copies of the raw messages

    Messages with a missing/empty 'content', or whose content becomes
    empty after cleaning, are left out of the result.

    Parameters
    -----------
    raw_data : list[dict]
        a list of raw data fetched from database
        each dict hold a 'content'

    Returns
    ---------
    cleaned_data : list[dict]
        a list of dictionaries but with cleaned data
        (shallow copies; the dictionaries in `raw_data` are not modified)
    """
    preprocessor = DiscordPreprocessor()

    cleaned_data: list[dict] = []
    for data in raw_data:
        content = data.get("content")
        if not content:
            continue

        cleaned_content = preprocessor.clean_text(content)
        if cleaned_content:
            # shallow copy so the caller's dict keeps its original content
            cleaned_data.append({**data, "content": cleaned_content})

    return cleaned_data
amindadgar marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def fetch_raw_messages(
guild_id: str,
selected_channels: list[str],
from_date: datetime,
**kwargs,
) -> list[dict]:
"""
fetch rawinfo messages from mongodb database
Expand All @@ -20,6 +21,10 @@ def fetch_raw_messages(
from_date : datetime
get the raw data from a specific date
default is None, meaning get all the messages
kwargs : dict
min_word_limit : int
messages must be longer than this many characters
(despite the name, the length of `content` is compared in characters)
default is 15 characters

Returns
--------
Expand All @@ -29,6 +34,8 @@ def fetch_raw_messages(
client = MongoSingleton.get_instance().get_client()
user_ids = get_real_users(guild_id)

min_word_limit = kwargs.get("min_word_limit", 15)

cursor = (
client[guild_id]["rawinfos"]
.find(
Expand All @@ -38,6 +45,7 @@ def fetch_raw_messages(
"createdDate": {"$gte": from_date},
"isGeneratedByWebhook": False,
"channelId": {"$in": selected_channels},
"$expr": {"$gt": [{"$strLenCP": "$content"}, min_word_limit]},
}
)
.sort("createdDate", 1)
Expand Down
79 changes: 79 additions & 0 deletions dags/hivemind_etl_helpers/src/db/discord/preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import re

from hivemind_etl_helpers.src.db.common.base_preprocessor import BasePreprocessor


class DiscordPreprocessor(BasePreprocessor):
    """
    preprocess discord text messages
    """

    # Matches user mentions (<@id> and the legacy nickname form <@!id>)
    # and role mentions (<@&id>); compiled once and reused for every message
    _ID_PATTERN = re.compile(r"<@[!&]?\d+>")

    def __init__(self) -> None:
        super().__init__()

    def clean_texts(self, texts: list[str]) -> list[str]:
        """
        clean each of the given texts

        Parameters
        ------------
        texts : list[str]
            a list of discord messages text

        Returns
        ---------
        texts_cleaned : list[str]
            the cleaned texts, in the same order as the input
            (discord ids removed)
        """
        return [self.clean_text(text=text) for text in texts]

    def clean_text(self, text: str) -> str:
        """
        clean the given text

        Parameters
        ------------
        text : str
            a discord message text

        Returns
        ---------
        text_cleaned : str
            the cleaned text
            (discord ids removed, then reduced to its main content)
        """
        # ids are stripped first so they never reach the content extraction
        text_without_ids = self.remove_ids(text=text)
        return self.extract_main_content(text=text_without_ids)

    def remove_ids(self, text: str) -> str:
        """
        remove the ids that are available in texts
        user ids would be with the format of <@number> or <@!number>
        and the role ids are in the format of <@&number>

        Parameters
        ------------
        text : str
            a discord message text

        Returns
        ---------
        cleaned_text : str
            the texts with removed <@number>, <@!number> and <@&number>
            and with whitespace runs collapsed into single spaces
        """
        # Removing matches
        without_ids = self._ID_PATTERN.sub("", text)

        # split/join collapses the gaps left behind by the removed ids
        return " ".join(without_ids.split())
Loading
Loading