diff --git a/dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py b/dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py index 30b0017c..be579e8f 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py @@ -55,8 +55,10 @@ def fetch_raw_posts( author.username AS author_username, author.name AS author_name, t.title AS topic, + t.id AS topic_id, p.id AS postId, $forum_endpoint AS forum_endpoint, + p.postNumber as post_number, p.raw AS raw, p.createdAt AS createdAt, p.updatedAt AS updatedAt, diff --git a/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py b/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py index b9e4042a..56188eaf 100644 --- a/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py +++ b/dags/hivemind_etl_helpers/src/db/discourse/utils/transform_raw_to_documents.py @@ -33,12 +33,18 @@ def transform_raw_to_documents( doc: Document if not exclude_metadata: + forum_endpoint = post["forum_endpoint"] + topic_id = post["topic_id"] + post_number = post["post_number"] + + link = f"https://{forum_endpoint}/t/{topic_id}/{post_number}" + doc = Document( text=post["raw"], metadata={ "author_name": post["author_name"], "author_username": post["author_username"], - "forum_endpoint": post["forum_endpoint"], + "forum_endpoint": forum_endpoint, "createdAt": post["createdAt"], "updatedAt": post["updatedAt"], "postId": post["postId"], @@ -49,6 +55,7 @@ def transform_raw_to_documents( "liker_names": post["liker_names"], "replier_usernames": post["replier_usernames"], "replier_names": post["replier_names"], + "link": link, }, ) else: diff --git a/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py b/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py index c304ac71..4cabcc66 100644 --- 
a/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_discourse_fetch_posts.py @@ -55,7 +55,8 @@ def test_fetch_some_data_without_from_date(self): p.topicId = 1, p.id = 100, p.createdAt = '2022-01-01T00:00:00.000Z', - p.updatedAt = '2022-01-01T01:00:00.000Z' + p.updatedAt = '2022-01-01T01:00:00.000Z', + p.postNumber = 1.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -83,7 +84,8 @@ def test_fetch_some_data_without_from_date(self): p.topicId = 2, p.id = 101, p.createdAt = '2022-01-01T00:01:00.000Z', - p.updatedAt = '2022-01-01T01:01:00.000Z' + p.updatedAt = '2022-01-01T01:01:00.000Z', + p.postNumber = 2.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -119,6 +121,7 @@ def test_fetch_some_data_without_from_date(self): if data["author_username"] == "user#1": self.assertEqual(data["author_name"], "user1") self.assertEqual(data["topic"], "topic#1") + self.assertEqual(data["topic_id"], 1) self.assertEqual(data["createdAt"], "2022-01-01T00:00:00.000Z") self.assertEqual(data["updatedAt"], "2022-01-01T01:00:00.000Z") self.assertEqual(data["authorTrustLevel"], 4) @@ -130,9 +133,11 @@ def test_fetch_some_data_without_from_date(self): self.assertEqual(data["replier_usernames"], ["user#2"]) self.assertEqual(data["replier_names"], ["user2"]) self.assertEqual(data["forum_endpoint"], "wwwdwadeswdpoi123") + self.assertEqual(data["post_number"], 1.0) elif data["author_username"] == "user#2": self.assertEqual(data["author_name"], "user2") self.assertEqual(data["topic"], "topic#2") + self.assertEqual(data["topic_id"], 2) self.assertEqual(data["createdAt"], "2022-01-01T00:01:00.000Z") self.assertEqual(data["updatedAt"], "2022-01-01T01:01:00.000Z") self.assertEqual(data["raw"], "texttexttext of post 2") @@ -144,6 +149,7 @@ def test_fetch_some_data_without_from_date(self): self.assertEqual(data["replier_usernames"], []) self.assertEqual(data["replier_names"], []) 
self.assertEqual(data["forum_endpoint"], "wwwdwadeswdpoi123") + self.assertEqual(data["post_number"], 2.0) else: raise IndexError("It shouldn't get here!") @@ -166,7 +172,8 @@ def test_fetch_some_data_with_from_date(self): p.topicId = 1, p.id = 100, p.createdAt = '2022-01-01T00:00:00.000Z', - p.updatedAt = '2022-01-01T01:00:00.000Z' + p.updatedAt = '2022-01-01T01:00:00.000Z', + p.postNumber = 1.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -194,7 +201,8 @@ def test_fetch_some_data_with_from_date(self): p.topicId = 2, p.id = 101, p.createdAt = '2022-05-01T00:01:00.000Z', - p.updatedAt = '2022-05-01T01:01:00.000Z' + p.updatedAt = '2022-05-01T01:01:00.000Z', + p.postNumber = 2.0 WITH p CREATE (a:DiscourseUser) -[:POSTED]->(p) SET @@ -230,6 +238,7 @@ def test_fetch_some_data_with_from_date(self): if data["author_username"] == "user#2": self.assertEqual(data["author_name"], "user2") self.assertEqual(data["topic"], "topic#2") + self.assertEqual(data["post_number"], 2.0) self.assertEqual(data["createdAt"], "2022-05-01T00:01:00.000Z") self.assertEqual(data["updatedAt"], "2022-05-01T01:01:00.000Z") self.assertEqual(data["raw"], "texttexttext of post 2")