diff --git a/dags/analyzer_helper/discourse/extract_raw_data.py b/dags/analyzer_helper/discourse/extract_raw_data.py index c37d6dc1..ee428e44 100644 --- a/dags/analyzer_helper/discourse/extract_raw_data.py +++ b/dags/analyzer_helper/discourse/extract_raw_data.py @@ -59,6 +59,8 @@ def fetch_post_details( OPTIONAL MATCH (post)-[:REPLIED_TO]->(repliedPost:DiscoursePost) OPTIONAL MATCH (repliedPost)<-[:POSTED]-(repliedAuthor:DiscourseUser) RETURN + post.raw as text, + post.postNumber as post_number, post.id AS post_id, author.id AS author_id, post.createdAt AS created_at, diff --git a/dags/analyzer_helper/discourse/transform_raw_data.py b/dags/analyzer_helper/discourse/transform_raw_data.py index e77cfe77..242c5a88 100644 --- a/dags/analyzer_helper/discourse/transform_raw_data.py +++ b/dags/analyzer_helper/discourse/transform_raw_data.py @@ -4,24 +4,38 @@ class TransformRawInfo: - def __init__(self): + def __init__(self, forum_endpoint: str): + self.forum_endpoint = forum_endpoint self.converter = DateTimeFormatConverter() def create_data_entry( self, raw_data: dict, interaction_type: str = None, interaction_user: int = None ) -> dict: + topic_id = raw_data.get("topic_id") + post_number = raw_data.get("post_number") metadata = { "category_id": raw_data.get("category_id"), - "topic_id": raw_data.get("topic_id"), + "topic_id": topic_id, "bot_activity": False, } + # Adding the message link to metadata + if topic_id and post_number: + metadata = { + **metadata, # previous ones + "link": ( + f"https://{self.forum_endpoint}/t/" + + f"{int(topic_id)}/{int(post_number)}" + ), + } + result = { "author_id": str( interaction_user if interaction_type == "reply" else raw_data.get("author_id") ), + "text": raw_data["text"], "date": self.converter.from_iso_format(raw_data.get("created_at")), "source_id": str(raw_data["post_id"]), "metadata": metadata, diff --git a/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py b/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py index 04fd57c1..ae3dd0cc 100644 --- a/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py +++ b/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py @@ -27,8 +27,28 @@ def setUpClass(cls): CREATE (f:DiscourseForum {endpoint: $endpoint, uuid: 'forum-uuid'}), (u1:DiscourseUser {id: 'user1', name: 'User One'}), (u2:DiscourseUser {id: 'user2', name: 'User Two'}), - (p1:DiscoursePost {id: '1', content: 'Post 1', createdAt: '2023-01-01T00:00:00Z', topicId: 'topic-uuid', forumUuid: 'forum-uuid'}), - (p2:DiscoursePost {id: '2', content: 'Post 2', createdAt: '2023-01-02T00:00:00Z', topicId: 'topic-uuid', forumUuid: 'forum-uuid'}), + (p1:DiscoursePost + { + id: '1', + content: 'Post 1', + createdAt: '2023-01-01T00:00:00Z', + topicId: 'topic-uuid', + forumUuid: 'forum-uuid', + raw: "Sample Text 1", + postNumber: 1.0 + } + ), + (p2:DiscoursePost + { + id: '2', + content: 'Post 2', + createdAt: '2023-01-02T00:00:00Z', + topicId: 'topic-uuid', + forumUuid: 'forum-uuid', + raw: "Sample Text 2", + postNumber: 2.0 + } + ), (t:DiscourseTopic {id: 'topic-uuid', forumUuid: 'forum-uuid'}), (c:DiscourseCategory {id: 'category1', name: 'Category 1'}), (p1)<-[:HAS_POST]-(t), @@ -60,6 +80,8 @@ def test_fetch_post_details(self): "replied_post_id": "2", "replied_post_user_id": "user2", "topic_id": "topic-uuid", + "post_number": 1.0, + "text": "Sample Text 1", }, { "post_id": "2", @@ -69,6 +91,8 @@ def test_fetch_post_details(self): "replied_post_id": None, "replied_post_user_id": None, "topic_id": "topic-uuid", + "post_number": 2.0, + "text": "Sample Text 2", }, ] self.assertEqual(len(result), 2) @@ -127,6 +151,8 @@ def test_extract_without_recompute_no_latest_activity(self): "replied_post_user_id": "user2", "topic_id": "topic-uuid", "category_id": "category1", + "post_number": 1.0, + "text": "Sample Text 1", }, { "post_id": "2", @@ -137,6 +163,8 @@ def test_extract_without_recompute_no_latest_activity(self): "replied_post_user_id": None, "topic_id": "topic-uuid", "category_id": "category1", + "post_number": 2.0, + "text": "Sample Text 2", }, ] self.assertEqual(len(result), 2) @@ -185,6 +213,8 @@ def test_extract_without_recompute_latest_activity_before_period(self): "replied_post_user_id": "user2", "topic_id": "topic-uuid", "category_id": "category1", + "post_number": 1.0, + "text": "Sample Text 1", }, { "post_id": "2", @@ -195,6 +225,8 @@ def test_extract_without_recompute_latest_activity_before_period(self): "replied_post_user_id": None, "topic_id": "topic-uuid", "category_id": "category1", + "post_number": 2.0, + "text": "Sample Text 2", }, ] self.assertEqual(len(result), 2) diff --git a/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py b/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py index 5bc89887..9536ba57 100644 --- a/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py +++ b/dags/analyzer_helper/tests/unit/test_discourse_transform_raw_data.py @@ -7,8 +7,9 @@ class TestTransformRawInfo(unittest.TestCase): def setUp(self): """Initialize the TransformRawInfo instance before each test.""" - self.transformer = TransformRawInfo() self.platform_id = "test_platform" + self.forum_endpoint = "sample.endpoint.gov" + self.transformer = TransformRawInfo(forum_endpoint=self.forum_endpoint) def test_create_data_entry_no_interaction(self): """Test data entry creation with no specific interaction type.""" @@ -16,6 +17,8 @@ def test_create_data_entry_no_interaction(self): "post_id": 6262, "author_id": 6168, "created_at": "2023-09-11T21:41:43.553Z", + "text": "some content", + "post_number": 1.0, "category_id": 500, "topic_id": 6134, "reactions": [], @@ -25,6 +28,11 @@ def test_create_data_entry_no_interaction(self): self.assertEqual(result["author_id"], str(raw_data["author_id"])) self.assertIsInstance(result["date"], datetime.datetime) self.assertFalse(result["metadata"]["bot_activity"]) + self.assertEqual( + result["metadata"]["link"], + f"https://{self.forum_endpoint}/t/6134/1", + ) + self.assertEqual(result["text"], "some content") self.assertEqual(len(result["interactions"]), 0) self.assertEqual(result["source_id"], str(raw_data["post_id"])) self.assertEqual(result["metadata"]["category_id"], raw_data["category_id"]) @@ -43,6 +51,8 @@ def test_create_data_entry_with_reaction(self): "topic_id": 6134, "reactions": [6263], "replied_post_id": None, + "text": "some content #2", + "post_number": 1.0, } result = self.transformer.create_data_entry( raw_data, interaction_type="reaction", interaction_user=6263 @@ -53,6 +63,7 @@ def test_create_data_entry_with_reaction(self): "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "interactions": [ { "name": "reaction", @@ -64,6 +75,7 @@ def test_create_data_entry_with_reaction(self): "category_id": 500, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "source_id": "6261", } @@ -80,6 +92,8 @@ def test_transform_data_with_replied_user(self): "replied_post_id": 6512, "replied_post_user_id": 4444, "topic_id": 6134, + "text": "some content #2", + "post_number": 1.0, } ] @@ -89,11 +103,13 @@ def test_transform_data_with_replied_user(self): "date": datetime.datetime( 2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "source_id": "6262", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [ { @@ -111,6 +127,7 @@ def test_transform_data_with_replied_user(self): }, { "author_id": "4444", + "text": "some content #2", "date": datetime.datetime( 2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc ), @@ -119,6 +136,7 @@ def test_transform_data_with_replied_user(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [], "interactions": [ @@ -147,6 +165,8 @@ def test_transform_data_with_reactions(self): "replied_post_id": None, "replied_post_user_id": None, "topic_id": 6134, + "text": "some content #2", + "post_number": 1.0, } ] expected_result = [ @@ -155,11 +175,13 @@ def test_transform_data_with_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "source_id": "6261", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [ { @@ -181,6 +203,7 @@ def test_transform_data_with_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "interactions": [ { "name": "reaction", @@ -192,12 +215,14 @@ def test_transform_data_with_reactions(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "source_id": "6261", }, { "actions": [], "author_id": "2", + "text": "some content #2", "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), @@ -212,6 +237,7 @@ def test_transform_data_with_reactions(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "source_id": "6261", }, @@ -233,6 +259,8 @@ def test_transform_data_replied_and_reactions(self): "replied_post_id": 6512, "replied_post_user_id": 4444, "topic_id": 6134, + "text": "some content #1", + "post_number": 1.0, }, { "post_id": 6261, @@ -243,6 +271,8 @@ def test_transform_data_replied_and_reactions(self): "replied_post_id": None, "replied_post_user_id": None, "topic_id": 6134, + "text": "some content #2", + "post_number": 2.0, }, ] @@ -252,11 +282,13 @@ def test_transform_data_replied_and_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #1", "source_id": "6262", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [ { @@ -277,11 +309,13 @@ def test_transform_data_replied_and_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #1", "source_id": "6262", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/1", }, "actions": [], "interactions": [ @@ -297,11 +331,13 @@ def test_transform_data_replied_and_reactions(self): "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), + "text": "some content #2", "source_id": "6261", "metadata": { "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/2", }, "actions": [ { @@ -320,6 +356,7 @@ def test_transform_data_replied_and_reactions(self): { "actions": [], "author_id": "1", + "text": "some content #2", "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), @@ -334,12 +371,14 @@ def test_transform_data_replied_and_reactions(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/2", }, "source_id": "6261", }, { "actions": [], "author_id": "2", + "text": "some content #2", "date": datetime.datetime( 2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc ), @@ -354,6 +393,7 @@ def test_transform_data_replied_and_reactions(self): "category_id": None, "topic_id": 6134, "bot_activity": False, + "link": f"https://{self.forum_endpoint}/t/6134/2", }, "source_id": "6261", }, diff --git a/dags/discourse_analyzer_etl.py b/dags/discourse_analyzer_etl.py index 8e65e068..47266ecb 100644 --- a/dags/discourse_analyzer_etl.py +++ b/dags/discourse_analyzer_etl.py @@ -107,7 +107,7 @@ def discourse_etl_raw_data( forum_endpoint=forum_endpoint, platform_id=platform_id ) extracted_data = extractor.extract(period=period, recompute=recompute) - transformer = TransformRawInfo() + transformer = TransformRawInfo(forum_endpoint=forum_endpoint) transformed_data = transformer.transform( raw_data=extracted_data, )