From 01bf57bd2475bd7dfc7c9a378b0fd0bebdb205fa Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 11:28:18 +0330 Subject: [PATCH 01/15] feat: updated github ETL, document metadata! updated `html_url` to be `url` --- .../src/db/github/extract/commit.py | 2 +- .../src/db/github/schema/commit.py | 8 ++++---- .../integration/test_github_etl_fetch_commits.py | 10 +++++----- .../integration/test_github_transformation.py | 4 ++-- .../tests/unit/test_github_transform_commits.py | 16 ++++++++-------- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/commit.py b/dags/hivemind_etl_helpers/src/db/github/extract/commit.py index d3334599..f04a21c7 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/commit.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/commit.py @@ -78,7 +78,7 @@ def _fetch_raw_commits( user_commiter.login AS committer_name, co.`commit.message` AS message, co.`commit.url` AS api_url, - co.`parents.0.html_url` AS html_url, + co.`parents.0.html_url` AS url, co.repository_id AS repository_id, repo.full_name AS repository_name, co.sha AS sha, diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/commit.py b/dags/hivemind_etl_helpers/src/db/github/schema/commit.py index 1b07bea6..132a44e5 100644 --- a/dags/hivemind_etl_helpers/src/db/github/schema/commit.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/commit.py @@ -8,7 +8,7 @@ def __init__( committer_name: str, message: str, api_url: str, - html_url: str, + url: str, repository_id: int, repository_name: str, sha: str, @@ -26,7 +26,7 @@ def __init__( self.committer_name = committer_name self.message = message self.api_url = api_url - self.html_url = html_url + self.url = url self.repository_id = repository_id self.repository_name = repository_name self.sha = sha @@ -42,7 +42,7 @@ def from_dict(cls, data: dict[str, str | int | None]) -> "GitHubCommit": committer_name=data["committer_name"], # type: ignore message=data["message"], # type: ignore api_url=data["api_url"], # type: ignore - html_url=data["html_url"], # type: ignore + url=data["url"], # type: ignore repository_id=data["repository_id"], # type: ignore repository_name=data["repository_name"], # type: ignore sha=data["sha"], # type: ignore @@ -58,7 +58,7 @@ def to_dict(self) -> dict[str, str | int | None]: "committer_name": self.committer_name, "message": self.message, "api_url": self.api_url, - "html_url": self.html_url, + "url": self.url, "repository_id": self.repository_id, "repository_name": self.repository_name, "sha": self.sha, diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py index f68ef1f2..d81aa7c4 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py @@ -62,7 +62,7 @@ def test_get_single_commit_single_repo_no_from_date(self): self.assertEqual(commits[0].committer_name, "author #1") self.assertEqual(commits[0].message, "Issue #1 is resolved!") self.assertEqual(commits[0].api_url, "https://api.sample_url_for_commit.html") - self.assertEqual(commits[0].html_url, "https://sample_url_for_commit.html") + self.assertEqual(commits[0].url, "https://sample_url_for_commit.html") self.assertEqual(commits[0].repository_id, 123) self.assertEqual(commits[0].repository_name, "Org/SampleRepo") self.assertEqual(commits[0].sha, "sha#1111") @@ -105,7 +105,7 @@ def test_get_single_commit_single_repo_no_from_date_no_commiter(self): self.assertEqual(commits[0].committer_name, None) self.assertEqual(commits[0].message, "Issue #1 is resolved!") self.assertEqual(commits[0].api_url, "https://api.sample_url_for_commit.html") - self.assertEqual(commits[0].html_url, "https://sample_url_for_commit.html") + self.assertEqual(commits[0].url, "https://sample_url_for_commit.html") self.assertEqual(commits[0].repository_id, 123) self.assertEqual(commits[0].repository_name, "Org/SampleRepo") self.assertEqual(commits[0].sha, "sha#1111") @@ -150,7 +150,7 @@ def test_get_single_commit_single_repo_with_from_date(self): self.assertEqual(commits[0].author_name, "author #2") self.assertEqual(commits[0].message, "Issue #1 is resolved!") self.assertEqual(commits[0].api_url, "https://api.sample_url_for_commit.html") - self.assertEqual(commits[0].html_url, "https://sample_url_for_commit.html") + self.assertEqual(commits[0].url, "https://sample_url_for_commit.html") self.assertEqual(commits[0].repository_id, 123) self.assertEqual(commits[0].repository_name, "Org/SampleRepo2") self.assertEqual(commits[0].sha, "sha#1111") @@ -228,7 +228,7 @@ def test_get_multiple_commit_multi_repo_with_from_date_filter(self): self.assertEqual(commits[0].committer_name, "author #1") self.assertEqual(commits[0].message, "Issue #1 is resolved!") self.assertEqual(commits[0].api_url, "https://api.sample_url_for_commit.html") - self.assertEqual(commits[0].html_url, "https://sample_url_for_commit.html") + self.assertEqual(commits[0].url, "https://sample_url_for_commit.html") self.assertEqual(commits[0].repository_id, 123) self.assertEqual(commits[0].repository_name, "Org/SampleRepo2") self.assertEqual(commits[0].sha, "sha#1111") @@ -272,7 +272,7 @@ def test_fetch_commit_no_related_pr(self): self.assertEqual(commits[0].committer_name, None) self.assertEqual(commits[0].message, "Issue #1 is resolved!") self.assertEqual(commits[0].api_url, "https://api.sample_url_for_commit.html") - self.assertEqual(commits[0].html_url, "https://sample_url_for_commit.html") + self.assertEqual(commits[0].url, "https://sample_url_for_commit.html") self.assertEqual(commits[0].repository_id, 123) self.assertEqual(commits[0].repository_name, "Org/SampleRepo") self.assertEqual(commits[0].sha, "sha#1111") diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_transformation.py b/dags/hivemind_etl_helpers/tests/integration/test_github_transformation.py index 1f200073..b7e80090 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_transformation.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_transformation.py @@ -83,7 +83,7 @@ def test_commit_transformation(self): sha="sha#1000000", created_at=datetime(2023, 11, 1).strftime("%Y-%m-%dT%H:%M:%SZ"), latest_saved_at=datetime(2023, 12, 1, 1).strftime("%Y-%m-%dT%H:%M:%SZ"), - html_url="https://github.com/repo/commit/1", + url="https://github.com/repo/commit/1", api_url="https://api.github.com/repo/commit/1", repository_id=123, repository_name="SampleRepo", @@ -104,7 +104,7 @@ def test_commit_transformation(self): "sha": "sha#1000000", "created_at": "2023-11-01 00:00:00", "latest_saved_at": "2023-12-01 01:00:00", - "html_url": "https://github.com/repo/commit/1", + "url": "https://github.com/repo/commit/1", "api_url": "https://api.github.com/repo/commit/1", "repository_id": 123, "repository_name": "SampleRepo", diff --git a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py index a41de6de..27150b70 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_github_transform_commits.py @@ -20,7 +20,7 @@ def test_github_single_document(self): sha="sha#1000000", created_at=datetime(2023, 11, 1).strftime("%Y-%m-%dT%H:%M:%SZ"), latest_saved_at=datetime(2023, 12, 1, 1).strftime("%Y-%m-%dT%H:%M:%SZ"), - html_url="https://github.com/repo/commit/1", + url="https://github.com/repo/commit/1", api_url="https://api.github.com/repo/commit/1", repository_id=123, repository_name="SampleRepo", @@ -42,7 +42,7 @@ def test_github_single_document(self): "sha": "sha#1000000", "created_at": "2023-11-01 00:00:00", "latest_saved_at": "2023-12-01 01:00:00", - "html_url": "https://github.com/repo/commit/1", + "url": "https://github.com/repo/commit/1", "api_url": "https://api.github.com/repo/commit/1", "repository_id": 123, "repository_name": "SampleRepo", @@ -61,7 +61,7 @@ def test_multiple_documents(self): sha="sha#1000000", created_at=datetime(2023, 11, 1).strftime("%Y-%m-%dT%H:%M:%SZ"), latest_saved_at=datetime(2023, 12, 1, 1).strftime("%Y-%m-%dT%H:%M:%SZ"), - html_url="https://github.com/repo/commit/1", + url="https://github.com/repo/commit/1", api_url="https://api.github.com/repo/commit/1", repository_id=123, repository_name="SampleRepo", @@ -75,7 +75,7 @@ def test_multiple_documents(self): sha="sha#1000001", created_at=datetime(2023, 11, 2).strftime("%Y-%m-%dT%H:%M:%SZ"), latest_saved_at=datetime(2023, 12, 2, 1).strftime("%Y-%m-%dT%H:%M:%SZ"), - html_url="https://github.com/repo/commit/2", + url="https://github.com/repo/commit/2", api_url="https://api.github.com/repo/commit/2", repository_id=123, repository_name="SampleRepo", @@ -88,7 +88,7 @@ def test_multiple_documents(self): sha="sha#1000002", created_at=datetime(2023, 11, 3).strftime("%Y-%m-%dT%H:%M:%SZ"), latest_saved_at=datetime(2023, 12, 3, 1).strftime("%Y-%m-%dT%H:%M:%SZ"), - html_url="https://github.com/repo/commit/3", + url="https://github.com/repo/commit/3", api_url="https://api.github.com/repo/commit/3", repository_id=126, repository_name="SampleRepo#6", @@ -114,7 +114,7 @@ def test_multiple_documents(self): "sha": "sha#1000000", "created_at": "2023-11-01 00:00:00", "latest_saved_at": "2023-12-01 01:00:00", - "html_url": "https://github.com/repo/commit/1", + "url": "https://github.com/repo/commit/1", "api_url": "https://api.github.com/repo/commit/1", "repository_id": 123, "repository_name": "SampleRepo", @@ -132,7 +132,7 @@ def test_multiple_documents(self): "sha": "sha#1000001", "created_at": "2023-11-02 00:00:00", "latest_saved_at": "2023-12-02 01:00:00", - "html_url": "https://github.com/repo/commit/2", + "url": "https://github.com/repo/commit/2", "api_url": "https://api.github.com/repo/commit/2", "repository_id": 123, "repository_name": "SampleRepo", @@ -150,7 +150,7 @@ def test_multiple_documents(self): "sha": "sha#1000002", "created_at": "2023-11-03 00:00:00", "latest_saved_at": "2023-12-03 01:00:00", - "html_url": "https://github.com/repo/commit/3", + "url": "https://github.com/repo/commit/3", "api_url": "https://api.github.com/repo/commit/3", "repository_id": 126, "repository_name": "SampleRepo#6", From 97684fdf8a5ff3b0cd4ea05c6d5a0979ce8ef6a8 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 12:47:24 +0330 Subject: [PATCH 02/15] fix: excluding github message url from LLM! --- dags/hivemind_etl_helpers/src/db/github/transform/commits.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/transform/commits.py b/dags/hivemind_etl_helpers/src/db/github/transform/commits.py index c52544f8..87743ed3 100644 --- a/dags/hivemind_etl_helpers/src/db/github/transform/commits.py +++ b/dags/hivemind_etl_helpers/src/db/github/transform/commits.py @@ -27,7 +27,7 @@ def transform_commits(data: list[GitHubCommit]) -> list[Document]: metadata=metadata, # all metadata to be excluded from embedding model excluded_embed_metadata_keys=list(metadata.keys()), - excluded_llm_metadata_keys=["sha", "api_url", "html_url", "verification"], + excluded_llm_metadata_keys=["sha", "api_url", "url", "verification"], ) transformed_commits.append(document) From 72348fd03d104d838f60c7c5f5715ad55a979e93 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 13:04:50 +0330 Subject: [PATCH 03/15] feat: Added discord message url in hivemind documents' metadata! --- .../discord/utils/transform_discord_raw_messges.py | 12 ++++++++++++ .../tests/integration/test_discord_prepare_llama.py | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index fbbd65b9..cf04ddbe 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -95,6 +95,10 @@ def prepare_document( reactions = message["reactions"] raw_content = message["content"] + message_id = message["messageId"] + channel_id = message["channelId"] + thread_id = message["threadId"] + reaction_ids = prepare_raction_ids(reactions) mention_names: list[str] @@ -161,10 +165,16 @@ def prepare_document( # always has length 1 assert len(author_name) == 1, "Either None or multiple authors!" + if thread_id is None: + url = f"https://discord.com/channels/{guild_id}/{channel_id}/{message_id}" + else: + url = f"https://discord.com/channels/{guild_id}/{thread_id}/{message_id}" + msg_meta_data = { "channel": message["channelName"], "date": message["createdDate"].strftime(DATE_FORMAT), "author_username": author_name[0], + "url": url, # always including the thread_name, if `None`, then it was a channel message "thread": message["threadName"], } @@ -234,6 +244,7 @@ def prepare_document( "replier_global_name", "replier_nickname", "role_mentions", + "url", ] doc.excluded_llm_metadata_keys = [ "author_nickname", @@ -250,6 +261,7 @@ def prepare_document( "replier_global_name", "replier_nickname", "role_mentions", + "url", ] else: doc = Document(text=content_url_updated) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py index d6710b53..3c5fa5af 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_llama.py @@ -168,6 +168,7 @@ def test_transform_two_data(self): "author_username": "user1", "author_global_name": "user1_GlobalName", "author_nickname": "user1_nickname", + "url": f"https://discord.com/channels/{guild_id}/1313130/1111111110", "thread": None, } @@ -180,6 +181,7 @@ def test_transform_two_data(self): "mention_global_names": ["user3_GlobalName", "user4_GlobalName"], "replier_username": "user4", "replier_global_name": "user4_GlobalName", + "url": f"https://discord.com/channels/{guild_id}/1313131/1111111111", "thread": None, } @@ -192,6 +194,7 @@ def test_transform_two_data(self): "mention_global_names": ["user3_GlobalName", "user4_GlobalName"], "replier_username": "user4", "replier_global_name": "user4_GlobalName", + "url": f"https://discord.com/channels/{guild_id}/88888/1111111112", "thread": "example_thread1", "role_mentions": ["role1"], } @@ -203,6 +206,7 @@ def test_transform_two_data(self): "author_global_name": "user1_GlobalName", "author_nickname": "user1_nickname", "url_reference": {"[URL0]": "https://www.google.com"}, + "url": f"https://discord.com/channels/{guild_id}/1313133/1111111113", "thread": None, } From 6329b86ed1c97d88c8bdbd530f87186a7250bcc1 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 14:33:08 +0330 Subject: [PATCH 04/15] fix: updated test cases to include url in metadata! --- .../test_discord_prepare_document_from_db.py | 14 +++++++++----- .../test_github_etl_fetch_raw_commits.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py index e72a2ca4..93a5f13f 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py @@ -105,7 +105,7 @@ def test_transform_two_data(self): "reactions": [], "replied_user": None, "createdDate": datetime(2023, 5, 1), - "messageId": str(np.random.randint(1000000, 9999999)), + "messageId": "10000000000", "channelId": channels[0], "channelName": "channel1", "threadId": None, @@ -123,7 +123,7 @@ def test_transform_two_data(self): "reactions": [], "replied_user": "114", "createdDate": datetime(2023, 5, 2), - "messageId": str(np.random.randint(1000000, 9999999)), + "messageId": "10000000001", "channelId": channels[1], "channelName": "channel2", "threadId": None, @@ -141,7 +141,7 @@ def test_transform_two_data(self): "reactions": [], "replied_user": "114", "createdDate": datetime(2023, 5, 2), - "messageId": str(np.random.randint(1000000, 9999999)), + "messageId": "10000000002", "channelId": channels[1], "channelName": "channel2", "threadId": "88888", @@ -159,7 +159,7 @@ def test_transform_two_data(self): "reactions": [], "replied_user": None, "createdDate": datetime(2023, 5, 8), - "messageId": str(np.random.randint(1000000, 9999999)), + "messageId": "10000000003", "channelId": channels[0], "channelName": "channel1", "threadId": None, @@ -179,7 +179,7 @@ def test_transform_two_data(self): "reactions": [], "replied_user": None, "createdDate": datetime(2023, 5, 8), - "messageId": str(np.random.randint(1000000, 9999999)), + "messageId": "10000000004", "channelId": "734738382", "channelName": "channel1", "threadId": None, @@ -271,6 +271,7 @@ def test_transform_two_data(self): "author_username": "user1", "author_global_name": "user1_GlobalName", "thread": None, + "url": "https://discord.com/channels/1234/111111/10000000000", } expected_metadata_1 = { @@ -284,6 +285,7 @@ def test_transform_two_data(self): "replier_username": "user4", "replier_global_name": "user4_GlobalName", "thread": None, + "url": "https://discord.com/channels/1234/22222/10000000001", } expected_metadata_2 = { @@ -298,6 +300,7 @@ def test_transform_two_data(self): "replier_global_name": "user4_GlobalName", "thread": "example_thread1", "role_mentions": ["role1"], + "url": "https://discord.com/channels/1234/22222/10000000002", } expected_metadata_3 = { @@ -307,6 +310,7 @@ def test_transform_two_data(self): "author_global_name": "user1_GlobalName", "url_reference": {"[URL0]": "https://www.google.com"}, "thread": None, + "url": "https://discord.com/channels/1234/111111/10000000003", } print(documents[0].text) self.assertDictEqual(documents[0].metadata, expected_metadata_0) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py index be0f628d..3b991a8d 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_commits.py @@ -62,7 +62,7 @@ def test_get_single_commit_single_repo_no_from_date(self): self.assertEqual( commits[0]["api_url"], "https://api.sample_url_for_commit.html" ) - self.assertEqual(commits[0]["html_url"], "https://sample_url_for_commit.html") + self.assertEqual(commits[0]["url"], "https://sample_url_for_commit.html") self.assertEqual(commits[0]["repository_id"], 123) self.assertEqual(commits[0]["repository_name"], "Org/SampleRepo") self.assertEqual(commits[0]["sha"], "sha#1111") @@ -106,7 +106,7 @@ def test_get_single_commit_single_repo_with_from_date(self): self.assertEqual( commits[0]["api_url"], "https://api.sample_url_for_commit.html" ) - self.assertEqual(commits[0]["html_url"], "https://sample_url_for_commit.html") + self.assertEqual(commits[0]["url"], "https://sample_url_for_commit.html") self.assertEqual(commits[0]["repository_id"], 123) self.assertEqual(commits[0]["repository_name"], "Org/SampleRepo2") self.assertEqual(commits[0]["sha"], "sha#1111") From 737cf2f2995b371c87bf8eb6a0d1a80bfaedb0a4 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 14:56:10 +0330 Subject: [PATCH 05/15] feat: Added telegram message url to metadata! --- .../src/db/telegram/transform/messages.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py b/dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py index fe2423de..7370b093 100644 --- a/dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py +++ b/dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py @@ -22,6 +22,9 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]: a list of llama-index documents to be embedded & loaded into db """ transformed_docs: list[Document] = [] + + # within links the "-100" of chat_id is removed + chat_id = str(self.chat_id).removeprefix("-100") for message in messages: document = Document( @@ -35,6 +38,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]: "replies": message.repliers, "reactors": message.reactors, "chat_name": self.chat_name, + "url": f"https://t.me/c/{chat_id}/{message.message_id}", }, excluded_embed_metadata_keys=[ "author", @@ -44,6 +48,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]: "replies", "reactors", "chat_name", + "url", ], excluded_llm_metadata_keys=[ "createdAt", @@ -52,6 +57,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]: "replies", "reactors", "chat_name", + "url", ], ) transformed_docs.append(document) From 7c1f32aa003758fb3537974631fa77988ce1fe87 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 15:09:59 +0330 Subject: [PATCH 06/15] fix: wrong link! --- .../tests/integration/test_discord_prepare_document_from_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py index 93a5f13f..f92ce950 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discord_prepare_document_from_db.py @@ -300,7 +300,7 @@ def test_transform_two_data(self): "replier_global_name": "user4_GlobalName", "thread": "example_thread1", "role_mentions": ["role1"], - "url": "https://discord.com/channels/1234/22222/10000000002", + "url": "https://discord.com/channels/1234/88888/10000000002", } expected_metadata_3 = { From b598cefc93918abfd007ca4b358c48cbd4c1e5f8 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 15:21:28 +0330 Subject: [PATCH 07/15] feat: Adding discourse message link to documents! --- .../src/db/discourse/fetch_raw_posts.py | 2 ++ .../utils/transform_raw_to_documents.py | 9 ++++++++- .../integration/test_discourse_fetch_posts.py | 17 +++++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py b/dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py index 30b0017c..be579e8f 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py @@ -55,8 +55,10 @@ def fetch_raw_posts( author.username AS author_username, author.name AS author_name, t.title AS topic, + t.id AS topic_id, p.id AS postId, $forum_endpoint AS forum_endpoint, + p.postNumber as post_number, p.raw AS raw, p.createdAt AS createdAt, p.updatedAt AS updatedAt, diff --git a/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py b/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py index b9e4042a..56188eaf 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py @@ -33,12 +33,18 @@ def transform_raw_to_documents( doc: Document if not exclude_metadata: + forum_endpoint = post["forum_endpoint"] + topic_id = post["topic_id"] + post_number = post["post_number"] + + link = f"https://{forum_endpoint}/t/{topic_id}/{post_number}" + doc = Document( text=post["raw"], metadata={ "author_name": post["author_name"], "author_username": post["author_username"], - "forum_endpoint": post["forum_endpoint"], + "forum_endpoint": forum_endpoint, "createdAt": post["createdAt"], "updatedAt": post["updatedAt"], "postId": post["postId"], @@ -49,6 +55,7 @@ def transform_raw_to_documents( "liker_names": post["liker_names"], "replier_usernames": post["replier_usernames"], "replier_names": post["replier_names"], + "link": link, }, ) else: diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py b/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py index c304ac71..4cabcc66 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py @@ -55,7 +55,8 @@ def test_fetch_some_data_without_from_date(self): p.topicId = 1, p.id = 100, p.createdAt = '2022-01-01T00:00:00.000Z', - p.updatedAt = '2022-01-01T01:00:00.000Z' + p.updatedAt = '2022-01-01T01:00:00.000Z', + p.postNumber: 1.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -83,7 +84,8 @@ def test_fetch_some_data_without_from_date(self): p.topicId = 2, p.id = 101, p.createdAt = '2022-01-01T00:01:00.000Z', - p.updatedAt = '2022-01-01T01:01:00.000Z' + p.updatedAt = '2022-01-01T01:01:00.000Z', + p.postNumber: 2.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -119,6 +121,7 @@ def test_fetch_some_data_without_from_date(self): if data["author_username"] == "user#1": self.assertEqual(data["author_name"], "user1") self.assertEqual(data["topic"], "topic#1") + self.assertEqual(data["topic_id"], 1) self.assertEqual(data["createdAt"], "2022-01-01T00:00:00.000Z") self.assertEqual(data["updatedAt"], "2022-01-01T01:00:00.000Z") self.assertEqual(data["authorTrustLevel"], 4) @@ -130,9 +133,11 @@ def test_fetch_some_data_without_from_date(self): self.assertEqual(data["replier_usernames"], ["user#2"]) self.assertEqual(data["replier_names"], ["user2"]) self.assertEqual(data["forum_endpoint"], "wwwdwadeswdpoi123") + self.assertEqual(data["post_number"], 1.0) elif data["author_username"] == "user#2": self.assertEqual(data["author_name"], "user2") self.assertEqual(data["topic"], "topic#2") + self.assertEqual(data["topic_id"], 2) self.assertEqual(data["createdAt"], "2022-01-01T00:01:00.000Z") self.assertEqual(data["updatedAt"], "2022-01-01T01:01:00.000Z") self.assertEqual(data["raw"], "texttexttext of post 2") @@ -144,6 +149,7 @@ def test_fetch_some_data_without_from_date(self): self.assertEqual(data["replier_usernames"], []) self.assertEqual(data["replier_names"], []) self.assertEqual(data["forum_endpoint"], "wwwdwadeswdpoi123") + self.assertEqual(data["post_number"], 2.0) else: raise IndexError("It shouldn't get here!") @@ -166,7 +172,8 @@ def test_fetch_some_data_with_from_date(self): p.topicId = 1, p.id = 100, p.createdAt = '2022-01-01T00:00:00.000Z', - p.updatedAt = '2022-01-01T01:00:00.000Z' + p.updatedAt = '2022-01-01T01:00:00.000Z', + p.postNumber: 1.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -194,7 +201,8 @@ def test_fetch_some_data_with_from_date(self): p.topicId = 2, p.id = 101, p.createdAt = '2022-05-01T00:01:00.000Z', - p.updatedAt = '2022-05-01T01:01:00.000Z' + p.updatedAt = '2022-05-01T01:01:00.000Z', + p.postNumber: 2.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -230,6 +238,7 @@ def test_fetch_some_data_with_from_date(self): if data["author_username"] == "user#2": self.assertEqual(data["author_name"], "user2") self.assertEqual(data["topic"], "topic#2") + self.assertEqual(data["post_number"], 2.0) self.assertEqual(data["createdAt"], "2022-05-01T00:01:00.000Z") self.assertEqual(data["updatedAt"], "2022-05-01T01:01:00.000Z") self.assertEqual(data["raw"], "texttexttext of post 2") From c22cdef054f6ccc5b12fed0484d1ba8bd021ae0b Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 15:21:58 +0330 Subject: [PATCH 08/15] fix: black linter issue! --- dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py b/dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py index 7370b093..e8e0f25d 100644 --- a/dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py +++ b/dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py @@ -22,7 +22,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]: a list of llama-index documents to be embedded & loaded into db """ transformed_docs: list[Document] = [] - + # within links the "-100" of chat_id is removed chat_id = str(self.chat_id).removeprefix("-100") From ceecdc0d0b34e8b2316a52bc9371ef14e5654998 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 15:50:45 +0330 Subject: [PATCH 09/15] fix: cypher queries! --- .../tests/integration/test_discourse_fetch_posts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py b/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py index 4cabcc66..bb75d18b 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py @@ -56,7 +56,7 @@ def test_fetch_some_data_without_from_date(self): p.id = 100, p.createdAt = '2022-01-01T00:00:00.000Z', p.updatedAt = '2022-01-01T01:00:00.000Z', - p.postNumber: 1.0 + p.postNumber = 1.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -85,7 +85,7 @@ def test_fetch_some_data_without_from_date(self): p.id = 101, p.createdAt = '2022-01-01T00:01:00.000Z', p.updatedAt = '2022-01-01T01:01:00.000Z', - p.postNumber: 2.0 + p.postNumber = 2.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -173,7 +173,7 @@ def test_fetch_some_data_with_from_date(self): p.id = 100, p.createdAt = '2022-01-01T00:00:00.000Z', p.updatedAt = '2022-01-01T01:00:00.000Z', - p.postNumber: 1.0 + p.postNumber = 1.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -202,7 +202,7 @@ def test_fetch_some_data_with_from_date(self): p.id = 101, p.createdAt = '2022-05-01T00:01:00.000Z', p.updatedAt = '2022-05-01T01:01:00.000Z', - p.postNumber: 2.0 + p.postNumber = 2.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET From 828158c94383356364ec30fddcea92f2a268a7ae Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 16:53:34 +0330 Subject: [PATCH 10/15] feat: Added google file urls! --- .../src/db/gdrive/gdrive_loader.py | 20 ++- .../tests/integration/test_gdrive_loader.py | 116 ++++++++++++++++++ 2 files changed, 135 insertions(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py b/dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py index a181388a..9f813d5e 100644 --- a/dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py +++ b/dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py @@ -49,7 +49,9 @@ def load_data( documents.extend(self._load_from_files(file_ids)) if not documents: raise ValueError("One input at least must be given!") - return documents + + transformed_documents = self._transform_google_documents(documents) + return transformed_documents def _load_from_folders(self, folder_ids: List[str]): folders_data = [] @@ -108,3 +110,19 @@ def _load_google_drive_creds(self) -> tuple[str, str]: raise ValueError("`GOOGLE_CLIENT_SECRET` not found from env variables!") return client_id, client_secret + + def _transform_google_documents(self, documents: list[Document]) -> list[Document]: + """ + transform google extracted documents by inserting their metadata a url + """ + # copying + transformed_docs: list[Document] = list(documents) + + for doc in transformed_docs: + file_id: str | None = doc.metadata.get("file id") + if file_id is None: + doc.metadata["url"] = None + else: + doc.metadata["url"] = f"https://drive.google.com/file/d/{file_id}/view" + + return transformed_docs diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py index 6e2f8818..3db2a8c3 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py @@ -255,3 +255,119 @@ def test__load_by_drive_id(self, mock_load_data): for i in range(4): self.assertEqual(result[i].id_, mock_data[i % 2].id_) self.assertEqual(len(result), 4) + + def test_transform_single_document(self): + loader = GoogleDriveLoader(refresh_token=self.refresh_token) + + documents = [ + Document( + doc_id=1, + text="test", + metadata={ + "file id": "file_1", + "author": "author_1", + "file name": "file_name_1", + "mime type": "mime", + "created at": "date", + "modified at": "modified", + }, + ) + ] + transformed_docs = loader._transform_google_documents(documents=documents) + + self.assertEqual(len(transformed_docs), 1) + self.assertEqual( + transformed_docs[0].metadata, + { + "file id": "file_1", + "author": "author_1", + "file name": "file_name_1", + "mime type": "mime", + "created at": "date", + "modified at": "modified", + "url": f"https://drive.google.com/file/d/file_1/view", + }, + ) + + def test_transform_multiple_document(self): + loader = GoogleDriveLoader(refresh_token=self.refresh_token) + + documents = [ + Document( + doc_id=1, + text="test", + metadata={ + "file id": "file_1", + "author": "author_1", + "file name": "file_name_1", + "mime type": "mime", + "created at": "date", + "modified at": "modified", + }, + ), + Document( + doc_id=2, + text="test", + metadata={ + "file id": "file_2", + "author": "author_2", + "file name": "file_name_2", + "mime type": "mime", + "created at": "date", + "modified at": "modified", + }, + ), + Document( + doc_id=3, + text="test", + metadata={ + "file id": "file_3", + "author": "author_3", + "file name": "file_name_3", + "mime type": "mime", + "created at": "date", + "modified at": "modified", + }, + ), + ] + transformed_docs = loader._transform_google_documents(documents=documents) + + self.assertEqual(len(transformed_docs), 3) + self.assertEqual( + transformed_docs[0].metadata, + { + "file id": "file_1", + "author": "author_1", + "file name": "file_name_1", + "mime type": "mime", + "created at": "date", + "modified at": "modified", + "url": f"https://drive.google.com/file/d/file_1/view", + }, + ) + + self.assertEqual( + transformed_docs[1].metadata, + { + "file id": "file_2", + "author": "author_2", + "file name": "file_name_2", + "mime type": "mime", + "created at": "date", + "modified at": "modified", + "url": f"https://drive.google.com/file/d/file_2/view", + }, + ) + + self.assertEqual( + transformed_docs[2].metadata, + { + "file id": "file_3", + "author": "author_3", + "file name": "file_name_3", + "mime type": "mime", + "created at": "date", + "modified at": "modified", + "url": f"https://drive.google.com/file/d/file_3/view", + }, + ) From 3e8f740b19fe7a3168b16f212fbdca18e6d3b6e4 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 16:58:18 +0330 Subject: [PATCH 11/15] fix: deep copy of documents! --- dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py b/dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py index 9f813d5e..9ad91180 100644 --- a/dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py +++ b/dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py @@ -1,3 +1,4 @@ +import copy import logging import os from typing import List, Optional @@ -116,7 +117,7 @@ def _transform_google_documents(self, documents: list[Document]) -> list[Documen transform google extracted documents by inserting their metadata a url """ # copying - transformed_docs: list[Document] = list(documents) + transformed_docs: list[Document] = copy.deepcopy(documents) for doc in transformed_docs: file_id: str | None = doc.metadata.get("file id") From a4325e1706fff253ec97b7b0d0ad6914f1f1552a Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 17:10:55 +0330 Subject: [PATCH 12/15] fix: aligning with the latest updates! --- .../tests/integration/test_gdrive_loader.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py index 3db2a8c3..24ef784c 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py @@ -118,8 +118,33 @@ def test_load_by_file_id(self, mock_load_data): loader = GoogleDriveLoader(self.refresh_token) result = loader.load_data(file_ids=file_ids) + expected_results = [ + Document( + id_="file_id_1.docx", + metadata={ + "file_name": "qwertU10p2.docx", + "file id": "qwertU10p2", + "author": "Jacku", + "file name": "Option", + "url": "https://drive.google.com/file/d/qwertU10p2/view", + }, + relationships={}, + text="Option 1: Keep it super casual", + ), + Document( + id_="file_id_2.docx", + metadata={ + "file_name": "qwertU10p3.docx", + "file id": "qwertU10p3", + "author": "Jacku", + "file name": "Option", + "url": "https://drive.google.com/file/d/qwertU10p3/view", + }, + text="Option 1: Keep it super casual", + ), + ] self.assertEqual(len(result), 2) - self.assertEqual(result, mock_data) + self.assertEqual(result, expected_results) @patch.object(GoogleDriveReader, "load_data") def test_load_from_folders_exception(self, mock_reader): From d261e67366d7a6c986fce548af4109a8d3f7fb1a Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 27 Nov 2024 17:14:38 +0330 Subject: [PATCH 13/15] fix: lint issues! --- .../tests/integration/test_gdrive_loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py index 24ef784c..7f99b489 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_gdrive_loader.py @@ -310,7 +310,7 @@ def test_transform_single_document(self): "mime type": "mime", "created at": "date", "modified at": "modified", - "url": f"https://drive.google.com/file/d/file_1/view", + "url": "https://drive.google.com/file/d/file_1/view", }, ) @@ -367,7 +367,7 @@ def test_transform_multiple_document(self): "mime type": "mime", "created at": "date", "modified at": "modified", - "url": f"https://drive.google.com/file/d/file_1/view", + "url": "https://drive.google.com/file/d/file_1/view", }, ) @@ -380,7 +380,7 @@ def test_transform_multiple_document(self): "mime type": "mime", "created at": "date", "modified at": "modified", - "url": f"https://drive.google.com/file/d/file_2/view", + "url": "https://drive.google.com/file/d/file_2/view", }, ) @@ -393,6 +393,6 @@ def test_transform_multiple_document(self): "mime type": "mime", "created at": "date", "modified at": "modified", - "url": f"https://drive.google.com/file/d/file_3/view", + "url": "https://drive.google.com/file/d/file_3/view", }, ) From 6a3316c5d52f5fadefbb25f894c2a104fe1468e5 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 28 Nov 2024 10:04:50 +0330 Subject: [PATCH 14/15] feat: Adding url to notion extracted pages metadata! --- dags/hivemind_etl_helpers/notion_etl.py | 37 +++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/dags/hivemind_etl_helpers/notion_etl.py b/dags/hivemind_etl_helpers/notion_etl.py index 0b1ca673..057d9472 100644 --- a/dags/hivemind_etl_helpers/notion_etl.py +++ b/dags/hivemind_etl_helpers/notion_etl.py @@ -1,5 +1,7 @@ +import copy import logging +from llama_index.core import Document from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline @@ -56,7 +58,8 @@ def process( documents = self.notion_extractor.extract( page_ids=page_ids, database_ids=database_ids ) - self.ingestion_pipeline.run_pipeline(docs=documents) + transformed_docs = self._transform_documents(documents=documents) + self.ingestion_pipeline.run_pipeline(docs=transformed_docs) def process_page(self, page_id: str) -> None: """ @@ -71,7 +74,8 @@ def process_page(self, page_id: str) -> None: f"Processing page_id: {page_id}, of community id: {self.community_id}" ) documents = self.notion_extractor.extract_from_pages(page_ids=[page_id]) - self.ingestion_pipeline.run_pipeline(docs=documents) + transformed_docs = self._transform_documents(documents=documents) + self.ingestion_pipeline.run_pipeline(docs=transformed_docs) def process_database(self, database_id: str) -> None: """ @@ -86,4 +90,31 @@ def process_database(self, database_id: str) -> None: f"Processing database id: {database_id}, of community id: {self.community_id}" ) documents = self.notion_extractor.extract_from_database(database_id=database_id) - self.ingestion_pipeline.run_pipeline(docs=documents) + transformed_docs = self._transform_documents(documents=documents) + self.ingestion_pipeline.run_pipeline(docs=transformed_docs) + + def _transform_documents(self, documents: list[Document]) -> list[Document]: + """ + transform notion extracted documents by inserting their metadata a url + + Parameters + ------------ + documents : list[Document] + a list of notion extracted pages + + Returns + --------- + documents : list[Document] + a list of documents each inlcuded with url in its metadata + """ + # Copying + transformed_docs: list[Document] = copy.deepcopy(documents) + + for doc in transformed_docs: + page_id: str | None = doc.metadata.get("page_id") + if page_id is None: + doc.metadata["url"] = None + else: + doc.metadata["url"] = f"https://www.notion.so/{page_id}" + + return transformed_docs From 7d2191e1b47f4138f409f5431c0b1b4bd3ab527e Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 28 Nov 2024 10:25:40 +0330 Subject: [PATCH 15/15] fix: isort linter issue! --- dags/hivemind_etl_helpers/notion_etl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/notion_etl.py b/dags/hivemind_etl_helpers/notion_etl.py index 057d9472..8cf93211 100644 --- a/dags/hivemind_etl_helpers/notion_etl.py +++ b/dags/hivemind_etl_helpers/notion_etl.py @@ -1,8 +1,8 @@ import copy import logging -from llama_index.core import Document from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor +from llama_index.core import Document from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline