From 0eae515d04360baeaf04217f0a8b0d2d0b061829 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 6 Aug 2024 15:02:57 +0330 Subject: [PATCH 1/3] feat: Added text and link to transformed data! --- dags/analyzer_helper/discourse/extract_raw_data.py | 2 ++ dags/analyzer_helper/discourse/transform_raw_data.py | 8 +++++++- dags/discourse_analyzer_etl.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/dags/analyzer_helper/discourse/extract_raw_data.py b/dags/analyzer_helper/discourse/extract_raw_data.py index c37d6dc1..ee428e44 100644 --- a/dags/analyzer_helper/discourse/extract_raw_data.py +++ b/dags/analyzer_helper/discourse/extract_raw_data.py @@ -59,6 +59,8 @@ def fetch_post_details( OPTIONAL MATCH (post)-[:REPLIED_TO]->(repliedPost:DiscoursePost) OPTIONAL MATCH (repliedPost)<-[:POSTED]-(repliedAuthor:DiscourseUser) RETURN + post.raw as text, + post.postNumber as post_number, post.id AS post_id, author.id AS author_id, post.createdAt AS created_at, diff --git a/dags/analyzer_helper/discourse/transform_raw_data.py b/dags/analyzer_helper/discourse/transform_raw_data.py index e77cfe77..4af23e8e 100644 --- a/dags/analyzer_helper/discourse/transform_raw_data.py +++ b/dags/analyzer_helper/discourse/transform_raw_data.py @@ -4,7 +4,8 @@ class TransformRawInfo: - def __init__(self): + def __init__(self, forum_endpoint: str): + self.forum_endpoint = forum_endpoint self.converter = DateTimeFormatConverter() def create_data_entry( @@ -14,6 +15,10 @@ def create_data_entry( "category_id": raw_data.get("category_id"), "topic_id": raw_data.get("topic_id"), "bot_activity": False, + "link": ( + f"https://{self.forum_endpoint}/t/" + + raw_data.get("topic_id") + "/" + raw_data.get("post_number") + ) } result = { @@ -22,6 +27,7 @@ def create_data_entry( if interaction_type == "reply" else raw_data.get("author_id") ), + "text": raw_data["text"], "date": self.converter.from_iso_format(raw_data.get("created_at")), "source_id": str(raw_data["post_id"]), "metadata": 
metadata, diff --git a/dags/discourse_analyzer_etl.py b/dags/discourse_analyzer_etl.py index 4acf7a44..cc055a77 100644 --- a/dags/discourse_analyzer_etl.py +++ b/dags/discourse_analyzer_etl.py @@ -107,7 +107,7 @@ def discourse_etl_raw_data( forum_endpoint=forum_endpoint, platform_id=platform_id ) extracted_data = extractor.extract(period=period, recompute=recompute) - transformer = TransformRawInfo() + transformer = TransformRawInfo(forum_endpoint=forum_endpoint) transformed_data = transformer.transform( raw_data=extracted_data, ) From ee9897e763013920244f2cc9d9f0f32c4ada8c9b Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 6 Aug 2024 16:01:56 +0330 Subject: [PATCH 2/3] wip: updating test cases with new structure! --- .../discourse/transform_raw_data.py | 18 ++++++--- .../test_discourse_extract_raw_data.py | 36 +++++++++++++++++- .../unit/test_discourse_transform_raw_data.py | 38 ++++++++++++++++++- 3 files changed, 84 insertions(+), 8 deletions(-) diff --git a/dags/analyzer_helper/discourse/transform_raw_data.py b/dags/analyzer_helper/discourse/transform_raw_data.py index 4af23e8e..e015c1d3 100644 --- a/dags/analyzer_helper/discourse/transform_raw_data.py +++ b/dags/analyzer_helper/discourse/transform_raw_data.py @@ -11,16 +11,24 @@ def __init__(self, forum_endpoint: str): def create_data_entry( self, raw_data: dict, interaction_type: str = None, interaction_user: int = None ) -> dict: + topic_id = raw_data.get("topic_id") + post_number = raw_data.get("post_number") metadata = { "category_id": raw_data.get("category_id"), - "topic_id": raw_data.get("topic_id"), + "topic_id": topic_id, "bot_activity": False, - "link": ( - f"https://{self.forum_endpoint}/t/" + - raw_data.get("topic_id") + "/" + raw_data.get("post_number") - ) } + # Adding the message link to metadata + if topic_id and post_number: + metadata = { + **metadata, # previous ones + "link": ( + f"https://{self.forum_endpoint}/t/" + + f"{int(topic_id)}/{int(post_number)}" + ) + } + result = { 
"author_id": str( interaction_user diff --git a/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py b/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py index 04fd57c1..ae3dd0cc 100644 --- a/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py +++ b/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py @@ -27,8 +27,28 @@ def setUpClass(cls): CREATE (f:DiscourseForum {endpoint: $endpoint, uuid: 'forum-uuid'}), (u1:DiscourseUser {id: 'user1', name: 'User One'}), (u2:DiscourseUser {id: 'user2', name: 'User Two'}), - (p1:DiscoursePost {id: '1', content: 'Post 1', createdAt: '2023-01-01T00:00:00Z', topicId: 'topic-uuid', forumUuid: 'forum-uuid'}), - (p2:DiscoursePost {id: '2', content: 'Post 2', createdAt: '2023-01-02T00:00:00Z', topicId: 'topic-uuid', forumUuid: 'forum-uuid'}), + (p1:DiscoursePost + { + id: '1', + content: 'Post 1', + createdAt: '2023-01-01T00:00:00Z', + topicId: 'topic-uuid', + forumUuid: 'forum-uuid', + raw: "Sample Text 1", + postNumber: 1.0 + } + ), + (p2:DiscoursePost + { + id: '2', + content: 'Post 2', + createdAt: '2023-01-02T00:00:00Z', + topicId: 'topic-uuid', + forumUuid: 'forum-uuid', + raw: "Sample Text 2", + postNumber: 2.0 + } + ), (t:DiscourseTopic {id: 'topic-uuid', forumUuid: 'forum-uuid'}), (c:DiscourseCategory {id: 'category1', name: 'Category 1'}), (p1)<-[:HAS_POST]-(t), @@ -60,6 +80,8 @@ def test_fetch_post_details(self): "replied_post_id": "2", "replied_post_user_id": "user2", "topic_id": "topic-uuid", + "post_number": 1.0, + "text": "Sample Text 1", }, { "post_id": "2", @@ -69,6 +91,8 @@ def test_fetch_post_details(self): "replied_post_id": None, "replied_post_user_id": None, "topic_id": "topic-uuid", + "post_number": 2.0, + "text": "Sample Text 2", }, ] self.assertEqual(len(result), 2) @@ -127,6 +151,8 @@ def test_extract_without_recompute_no_latest_activity(self): "replied_post_user_id": "user2", "topic_id": "topic-uuid", "category_id": 
"category1", + "post_number": 1.0, + "text": "Sample Text 1", }, { "post_id": "2", @@ -137,6 +163,8 @@ def test_extract_without_recompute_no_latest_activity(self): "replied_post_user_id": None, "topic_id": "topic-uuid", "category_id": "category1", + "post_number": 2.0, + "text": "Sample Text 2", }, ] self.assertEqual(len(result), 2) @@ -185,6 +213,8 @@ def test_extract_without_recompute_latest_activity_before_period(self): "replied_post_user_id": "user2", "topic_id": "topic-uuid", "category_id": "category1", + "post_number": 1.0, + "text": "Sample Text 1", }, { "post_id": "2", @@ -195,6 +225,8 @@ def test_extract_without_recompute_latest_activity_before_period(self): "replied_post_user_id": None, "topic_id": "topic-uuid", "category_id": "category1", + "post_number": 2.0, + "text": "Sample Text 2", }, ] self.assertEqual(len(result), 2) diff --git a/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py b/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py index 5bc89887..84e944fb 100644 --- a/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py +++ b/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py @@ -7,8 +7,9 @@ class TestTransformRawInfo(unittest.TestCase): def setUp(self): """Initialize the TransformRawInfo instance before each test.""" - self.transformer = TransformRawInfo() self.platform_id = "test_platform" + self.forum_endpoint = "sample.endpoint.gov" + self.transformer = TransformRawInfo(forum_endpoint=self.forum_endpoint) def test_create_data_entry_no_interaction(self): """Test data entry creation with no specific interaction type.""" @@ -16,6 +17,8 @@ def test_create_data_entry_no_interaction(self): "post_id": 6262, "author_id": 6168, "created_at": "2023-09-11T21:41:43.553Z", + "text": "some content", + "post_number": 1.0, "category_id": 500, "topic_id": 6134, "reactions": [], @@ -25,6 +28,11 @@ def test_create_data_entry_no_interaction(self): self.assertEqual(result["author_id"], 
str(raw_data["author_id"])) self.assertIsInstance(result["date"], datetime.datetime) self.assertFalse(result["metadata"]["bot_activity"]) + self.assertEqual( + result["metadata"]["link"], + f"https://{self.forum_endpoint}/t/6134/1", + ) + self.assertEqual(result["text"], "some content") self.assertEqual(len(result["interactions"]), 0) self.assertEqual(result["source_id"], str(raw_data["post_id"])) self.assertEqual(result["metadata"]["category_id"], raw_data["category_id"]) @@ -43,6 +51,8 @@ def test_create_data_entry_with_reaction(self): "topic_id": 6134, "reactions": [6263], "replied_post_id": None, + "text": "some content #2", + "post_number": 1.0, } result = self.transformer.create_data_entry( raw_data, interaction_type="reaction", interaction_user=6263 @@ -53,6 +63,7 @@ def test_create_data_entry_with_reaction(self): "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "interactions": [ { "name": "reaction", @@ -64,6 +75,7 @@ def test_create_data_entry_with_reaction(self): "category_id": 500, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "source_id": "6261", } @@ -80,6 +92,8 @@ def test_transform_data_with_replied_user(self): "replied_post_id": 6512, "replied_post_user_id": 4444, "topic_id": 6134, + "text": "some content #2", + "post_number": 1.0, } ] @@ -89,11 +103,13 @@ def test_transform_data_with_replied_user(self): "date": datetime.datetime( 2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "source_id": "6262", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [ { @@ -147,6 +163,8 @@ def test_transform_data_with_reactions(self): "replied_post_id": None, "replied_post_user_id": None, "topic_id": 6134, + "text": "some content #2", + "post_number": 1.0, } ] expected_result = [ @@ -155,11 +173,13 @@ def 
test_transform_data_with_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "source_id": "6261", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [ { @@ -181,11 +201,13 @@ def test_transform_data_with_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "interactions": [ { "name": "reaction", "type": "emitter", "users_engaged_id": ["6168"], + "link": f"https://{self.forum_endpoint}/t/6134/1", } ], "metadata": { @@ -233,6 +255,8 @@ def test_transform_data_replied_and_reactions(self): "replied_post_id": 6512, "replied_post_user_id": 4444, "topic_id": 6134, + "text": "some content #1", + "post_number": 1.0, }, { "post_id": 6261, @@ -243,6 +267,8 @@ def test_transform_data_replied_and_reactions(self): "replied_post_id": None, "replied_post_user_id": None, "topic_id": 6134, + "text": "some content #2", + "post_number": 2.0, }, ] @@ -252,11 +278,13 @@ def test_transform_data_replied_and_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #1", "source_id": "6262", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [ { @@ -277,11 +305,13 @@ def test_transform_data_replied_and_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #1", "source_id": "6262", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [], "interactions": [ @@ -297,11 +327,13 @@ def test_transform_data_replied_and_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, 
tzinfo=datetime.timezone.utc ), + "text": "some content #2", "source_id": "6261", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/2", }, "actions": [ { @@ -320,6 +352,7 @@ def test_transform_data_replied_and_reactions(self): { "actions": [], "author_id": "1", + "text": "some content #2", "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), @@ -334,12 +367,14 @@ def test_transform_data_replied_and_reactions(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/2", }, "source_id": "6261", }, { "actions": [], "author_id": "2", + "text": "some content #2", "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), @@ -354,6 +389,7 @@ def test_transform_data_replied_and_reactions(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/2", }, "source_id": "6261", }, From 7b87f2ada30415529a466d14fe705373bb6e8af8 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 6 Aug 2024 16:26:56 +0330 Subject: [PATCH 3/3] fix: missing test cases to update! + fixed black linter issues too. 
--- dags/analyzer_helper/discourse/transform_raw_data.py | 6 +++--- .../tests/unit/test_discourse_transform_raw_data.py | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dags/analyzer_helper/discourse/transform_raw_data.py b/dags/analyzer_helper/discourse/transform_raw_data.py index e015c1d3..242c5a88 100644 --- a/dags/analyzer_helper/discourse/transform_raw_data.py +++ b/dags/analyzer_helper/discourse/transform_raw_data.py @@ -24,9 +24,9 @@ def create_data_entry( metadata = { **metadata, # previous ones "link": ( - f"https://{self.forum_endpoint}/t/" + - f"{int(topic_id)}/{int(post_number)}" - ) + f"https://{self.forum_endpoint}/t/" + + f"{int(topic_id)}/{int(post_number)}" + ), } result = { diff --git a/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py b/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py index 84e944fb..9536ba57 100644 --- a/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py +++ b/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py @@ -127,6 +127,7 @@ def test_transform_data_with_replied_user(self): }, { "author_id": "4444", + "text": "some content #2", "date": datetime.datetime( 2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc ), @@ -135,6 +136,7 @@ def test_transform_data_with_replied_user(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [], "interactions": [ @@ -207,19 +209,20 @@ def test_transform_data_with_reactions(self): "name": "reaction", "type": "emitter", "users_engaged_id": ["6168"], - "link": f"https://{self.forum_endpoint}/t/6134/1", } ], "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "source_id": "6261", }, { "actions": [], "author_id": "2", + "text": "some content #2", "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc 
), @@ -234,6 +237,7 @@ def test_transform_data_with_reactions(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "source_id": "6261", },