Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: exclude embedding and llm metadata! #47

Merged
merged 14 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def fetch_raw_messages(guild_id: str, from_date: datetime | None = None) -> list
client[guild_id]["rawinfos"]
.find(
{
"type": {"$ne": 18},
"createdDate": {"$gte": from_date},
"isGeneratedByWebhook": False,
"channelId": {"$in": channels},
Expand All @@ -43,6 +44,7 @@ def fetch_raw_messages(guild_id: str, from_date: datetime | None = None) -> list
client[guild_id]["rawinfos"]
.find(
{
"type": {"$ne": 18},
"isGeneratedByWebhook": False,
"channelId": {"$in": channels},
"createdDate": {"$gte": from_date_modules},
Expand Down Expand Up @@ -98,6 +100,7 @@ def fetch_raw_msg_grouped(
pipeline.append(
{
"$match": {
"type": {"$ne": 18},
"createdDate": {
"$gte": from_date,
"$lt": datetime.now().replace(
Expand All @@ -113,6 +116,7 @@ def fetch_raw_msg_grouped(
pipeline.append(
{
"$match": {
"type": {"$ne": 18},
"createdDate": {
"$gte": from_date_modules,
"$lt": datetime.now().replace(
Expand Down
61 changes: 61 additions & 0 deletions dags/hivemind_etl_helpers/src/db/discord/utils/content_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re


def remove_empty_str(data: list[str]) -> list[str]:
    """
    a utility function to remove the empty string from a list

    The filtering mutates the given list in-place and also returns it,
    so callers relying on either the return value or the mutation keep working.

    Parameters
    -----------
    data : list[str]
        a list with string values

    Returns
    --------
    data : list[str]
        the same list object with every empty string removed
    """
    # Single O(n) pass via slice assignment instead of the quadratic
    # `while "" in data: data.remove("")` (each `in` scan + `remove` is O(n)).
    data[:] = [value for value in data if value != ""]
    return data


def check_no_content_only_links(content: str, link_pattern: str = r"\[URL\d\]") -> bool:
    """
    check whether the given message holds nothing but links (no written content)

    Parameters
    -----------
    content : str
        the message content
    link_pattern : str
        the pattern of link
        default pattern is for links like `[URL0]`, or `[URL1]`, etc

    Returns
    --------
    no_content : bool
        if `True` then there was no content but the links in the given string
    """
    # Strip every link placeholder, then look for any remaining latin letter;
    # digits/punctuation alone still count as "no content".
    stripped = re.compile(link_pattern).sub("", content)
    has_letters = re.search(r"[a-zA-Z]", stripped) is not None
    return not has_letters


def remove_none_from_list(data: list[str | None]) -> list[str]:
"""
remove the `None` values from a list

Parameters
-----------
data : list[str | None]
the list of data to process

Returns
--------
data_processed : list[str]
the data just removed the `None` values
"""
data_processed = [value for value in data if value is not None]
return data_processed
Original file line number Diff line number Diff line change
@@ -1,27 +1,4 @@
from urllib.parse import urlparse


def extract_urls(text: str) -> list[str]:
    """
    extract the urls within the text and just return the urls

    Parameters
    ------------
    text : str
        the raw text

    Returns
    ---------
    urls : list[str]
        the list of urls within the text
    """

    def _rebuild(parsed) -> str:
        # NOTE: query strings and fragments are dropped on purpose here —
        # only scheme, host and path are kept.
        return parsed.scheme + "://" + parsed.netloc + parsed.path

    # A whitespace-separated token counts as a url only when urlparse finds
    # both a scheme and a host in it.
    parsed_words = (urlparse(word) for word in text.split())
    return [_rebuild(parsed) for parsed in parsed_words if parsed.scheme and parsed.netloc]
from urlextract import URLExtract


def prepare_raw_message_urls(message: str) -> tuple[str, dict[str, str]]:
Expand All @@ -42,7 +19,7 @@ def prepare_raw_message_urls(message: str) -> tuple[str, dict[str, str]]:
the url reference dict that keys are reference name
and values are the actual url
"""
msg_urls = extract_urls(message)
msg_urls = URLExtract().find_urls(message)

references: dict[str, str] = {}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import logging
from typing import Any

from hivemind_etl_helpers.src.db.discord.utils.content_parser import (
check_no_content_only_links,
remove_empty_str,
remove_none_from_list,
)
from hivemind_etl_helpers.src.db.discord.utils.id_transform import convert_role_id
from hivemind_etl_helpers.src.db.discord.utils.merge_user_ids_fetch_names import (
merge_user_ids_and_fetch_names,
Expand Down Expand Up @@ -158,6 +163,8 @@ def prepare_document(
"channel": message["channelName"],
"date": message["createdDate"].strftime("%Y-%m-%d %H:%M:%S"),
"author_username": author_name[0],
# always including the thread_name, if `None`, then it was a channel message
"thread_name": message["threadName"],
}
if author_global_name[0] is not None:
msg_meta_data["author_global_name"] = author_global_name[0]
Expand Down Expand Up @@ -185,8 +192,6 @@ def prepare_document(
msg_meta_data["reactors_nicknames"] = reactors_nickname
if url_reference != {}:
msg_meta_data["url_reference"] = url_reference
if message["threadName"] is not None:
msg_meta_data["thread_name"] = message["threadName"]

if replier_name is not None:
msg_meta_data["replier_username"] = replier_name[0]
Expand All @@ -197,43 +202,49 @@ def prepare_document(
if role_names != []:
msg_meta_data["role_mentions"] = role_names

if content_url_updated == "":
raise ValueError("Message with Empty content!")

if check_no_content_only_links(content_url_updated):
raise ValueError("Message just did have urls")

doc: Document
if not exclude_metadata:
doc = Document(text=content_url_updated, metadata=msg_meta_data)
doc.excluded_embed_metadata_keys = [
"channel",
"date",
"author_username",
"author_global_name",
"author_nickname",
"mention_usernames",
"mention_global_names",
"mention_nicknames",
"reactors_username",
"reactors_global_name",
"reactors_nicknames",
"thread_name",
"url_reference",
"replier_username",
"replier_global_name",
"replier_nickname",
"role_mentions",
]
doc.excluded_llm_metadata_keys = [
"mention_usernames",
"mention_global_names",
"mention_nicknames",
"reactors_username",
"reactors_global_name",
"reactors_nicknames",
"thread_name",
"url_reference",
"replier_username",
"replier_global_name",
"replier_nickname",
"role_mentions",
]
else:
doc = Document(text=content_url_updated)

return doc


def remove_empty_str(data: list[str]):
    """
    a utility function to remove the empty string from a list

    The given list is mutated in-place and also returned.

    Parameters
    -----------
    data : list[str]
        a list with string values
    """
    # EAFP: keep removing the first empty string until `remove` raises,
    # which is equivalent to looping `while "" in data`.
    while True:
        try:
            data.remove("")
        except ValueError:
            break
    return data


def remove_none_from_list(data: list[str | None]) -> list[str]:
"""
remove the `None` values from a list

Parameters
-----------
data : list[str | None]
the list of data to process

Returns
--------
data_processed : list[str]
the data just removed the `None` values
"""
data_processed = [value for value in data if value is not None]
return data_processed
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def setup_db(
)

def test_fetch_channels(self):
guild_id = "12345"
guild_id = "1234"
channels = ["111111", "22222"]
self.setup_db(
create_modules=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 10, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 10, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 1, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down Expand Up @@ -265,6 +266,7 @@ def test_transform_two_data(self):
"date": datetime(2023, 5, 1).strftime("%Y-%m-%d %H:%M:%S"),
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"thread_name": None,
}

expected_metadata_1 = {
Expand All @@ -277,6 +279,7 @@ def test_transform_two_data(self):
"mention_nicknames": ["user3_nickname"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"thread_name": None,
}

expected_metadata_2 = {
Expand All @@ -299,8 +302,9 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"url_reference": {"[URL0]": "https://www.google.com"},
"thread_name": None,
}
print(documents[3].metadata)
print(documents[0].text)
self.assertDictEqual(documents[0].metadata, expected_metadata_0)
self.assertDictEqual(documents[1].metadata, expected_metadata_1)
self.assertDictEqual(documents[2].metadata, expected_metadata_2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 10, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"thread_name": None,
}

expected_metadata_1 = {
Expand All @@ -179,6 +180,7 @@ def test_transform_two_data(self):
"mention_global_names": ["user3_GlobalName", "user4_GlobalName"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"thread_name": None,
}

expected_metadata_2 = {
Expand All @@ -201,6 +203,7 @@ def test_transform_two_data(self):
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"url_reference": {"[URL0]": "https://www.google.com"},
"thread_name": None,
}

self.assertDictEqual(documents[0].metadata, expected_metadata_0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_normal_messag_single_url_https(self):
self.assertEqual(msg_updated, "Here you can have a look [URL0]")
self.assertEqual(url_reference, {"[URL0]": "https://google.com"})

def test_normal_messag_multiple_url(self):
def test_normal_message_multiple_url(self):
msg = "Here you can have a look https://google.com https://example.com"

msg_updated, url_reference = prepare_raw_message_urls(msg)
Expand All @@ -32,3 +32,14 @@ def test_normal_messag_multiple_url(self):
url_reference,
{"[URL0]": "https://google.com", "[URL1]": "https://example.com"},
)

def test_message_multiple_url_wrapped(self):
    # Fixed typo in method name: `wrappend` -> `wrapped`.
    # Urls wrapped in angle brackets (Discord's no-embed syntax) should
    # still be detected and replaced with `[URLn]` placeholders, while the
    # surrounding brackets stay in the message text.
    msg = "Here you can have a look <https://google.com> <https://example.com>"

    msg_updated, url_reference = prepare_raw_message_urls(msg)

    self.assertEqual(msg_updated, "Here you can have a look <[URL0]> <[URL1]>")
    self.assertEqual(
        url_reference,
        {"[URL0]": "https://google.com", "[URL1]": "https://example.com"},
    )
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def setup_db(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 10, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def setup_mongo_information(
"platforms": [
{
"platformId": platform_id,
"fromDate": datetime(2023, 1, 1),
"options": {
"channels": channels,
"roles": ["role_id"],
Expand Down
Loading
Loading