Skip to content

Commit

Permalink
Merge pull request #237 from TogetherCrew/feat/235-add-text-link-discourse-etl
Browse files Browse the repository at this point in the history

feat: Added text and link to transformed data!
  • Loading branch information
amindadgar authored Aug 6, 2024
2 parents 149f3c3 + 7b87f2a commit c46b73c
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 6 deletions.
2 changes: 2 additions & 0 deletions dags/analyzer_helper/discourse/extract_raw_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def fetch_post_details(
OPTIONAL MATCH (post)-[:REPLIED_TO]->(repliedPost:DiscoursePost)
OPTIONAL MATCH (repliedPost)<-[:POSTED]-(repliedAuthor:DiscourseUser)
RETURN
post.raw as text,
post.postNumber as post_number,
post.id AS post_id,
author.id AS author_id,
post.createdAt AS created_at,
Expand Down
18 changes: 16 additions & 2 deletions dags/analyzer_helper/discourse/transform_raw_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,38 @@


class TransformRawInfo:
def __init__(self):
def __init__(self, forum_endpoint: str):
self.forum_endpoint = forum_endpoint
self.converter = DateTimeFormatConverter()

def create_data_entry(
self, raw_data: dict, interaction_type: str = None, interaction_user: int = None
) -> dict:
topic_id = raw_data.get("topic_id")
post_number = raw_data.get("post_number")
metadata = {
"category_id": raw_data.get("category_id"),
"topic_id": raw_data.get("topic_id"),
"topic_id": topic_id,
"bot_activity": False,
}

# Adding the message link to metadata
if topic_id and post_number:
metadata = {
**metadata, # previous ones
"link": (
f"https://{self.forum_endpoint}/t/"
+ f"{int(topic_id)}/{int(post_number)}"
),
}

result = {
"author_id": str(
interaction_user
if interaction_type == "reply"
else raw_data.get("author_id")
),
"text": raw_data["text"],
"date": self.converter.from_iso_format(raw_data.get("created_at")),
"source_id": str(raw_data["post_id"]),
"metadata": metadata,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,28 @@ def setUpClass(cls):
CREATE (f:DiscourseForum {endpoint: $endpoint, uuid: 'forum-uuid'}),
(u1:DiscourseUser {id: 'user1', name: 'User One'}),
(u2:DiscourseUser {id: 'user2', name: 'User Two'}),
(p1:DiscoursePost {id: '1', content: 'Post 1', createdAt: '2023-01-01T00:00:00Z', topicId: 'topic-uuid', forumUuid: 'forum-uuid'}),
(p2:DiscoursePost {id: '2', content: 'Post 2', createdAt: '2023-01-02T00:00:00Z', topicId: 'topic-uuid', forumUuid: 'forum-uuid'}),
(p1:DiscoursePost
{
id: '1',
content: 'Post 1',
createdAt: '2023-01-01T00:00:00Z',
topicId: 'topic-uuid',
forumUuid: 'forum-uuid',
raw: "Sample Text 1",
postNumber: 1.0
}
),
(p2:DiscoursePost
{
id: '2',
content: 'Post 2',
createdAt: '2023-01-02T00:00:00Z',
topicId: 'topic-uuid',
forumUuid: 'forum-uuid',
raw: "Sample Text 2",
postNumber: 2.0
}
),
(t:DiscourseTopic {id: 'topic-uuid', forumUuid: 'forum-uuid'}),
(c:DiscourseCategory {id: 'category1', name: 'Category 1'}),
(p1)<-[:HAS_POST]-(t),
Expand Down Expand Up @@ -60,6 +80,8 @@ def test_fetch_post_details(self):
"replied_post_id": "2",
"replied_post_user_id": "user2",
"topic_id": "topic-uuid",
"post_number": 1.0,
"text": "Sample Text 1",
},
{
"post_id": "2",
Expand All @@ -69,6 +91,8 @@ def test_fetch_post_details(self):
"replied_post_id": None,
"replied_post_user_id": None,
"topic_id": "topic-uuid",
"post_number": 2.0,
"text": "Sample Text 2",
},
]
self.assertEqual(len(result), 2)
Expand Down Expand Up @@ -127,6 +151,8 @@ def test_extract_without_recompute_no_latest_activity(self):
"replied_post_user_id": "user2",
"topic_id": "topic-uuid",
"category_id": "category1",
"post_number": 1.0,
"text": "Sample Text 1",
},
{
"post_id": "2",
Expand All @@ -137,6 +163,8 @@ def test_extract_without_recompute_no_latest_activity(self):
"replied_post_user_id": None,
"topic_id": "topic-uuid",
"category_id": "category1",
"post_number": 2.0,
"text": "Sample Text 2",
},
]
self.assertEqual(len(result), 2)
Expand Down Expand Up @@ -185,6 +213,8 @@ def test_extract_without_recompute_latest_activity_before_period(self):
"replied_post_user_id": "user2",
"topic_id": "topic-uuid",
"category_id": "category1",
"post_number": 1.0,
"text": "Sample Text 1",
},
{
"post_id": "2",
Expand All @@ -195,6 +225,8 @@ def test_extract_without_recompute_latest_activity_before_period(self):
"replied_post_user_id": None,
"topic_id": "topic-uuid",
"category_id": "category1",
"post_number": 2.0,
"text": "Sample Text 2",
},
]
self.assertEqual(len(result), 2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,18 @@
class TestTransformRawInfo(unittest.TestCase):
def setUp(self):
"""Initialize the TransformRawInfo instance before each test."""
self.transformer = TransformRawInfo()
self.platform_id = "test_platform"
self.forum_endpoint = "sample.endpoint.gov"
self.transformer = TransformRawInfo(forum_endpoint=self.forum_endpoint)

def test_create_data_entry_no_interaction(self):
"""Test data entry creation with no specific interaction type."""
raw_data = {
"post_id": 6262,
"author_id": 6168,
"created_at": "2023-09-11T21:41:43.553Z",
"text": "some content",
"post_number": 1.0,
"category_id": 500,
"topic_id": 6134,
"reactions": [],
Expand All @@ -25,6 +28,11 @@ def test_create_data_entry_no_interaction(self):
self.assertEqual(result["author_id"], str(raw_data["author_id"]))
self.assertIsInstance(result["date"], datetime.datetime)
self.assertFalse(result["metadata"]["bot_activity"])
self.assertEqual(
result["metadata"]["link"],
f"https://{self.forum_endpoint}/t/6134/1",
)
self.assertEqual(result["text"], "some content")
self.assertEqual(len(result["interactions"]), 0)
self.assertEqual(result["source_id"], str(raw_data["post_id"]))
self.assertEqual(result["metadata"]["category_id"], raw_data["category_id"])
Expand All @@ -43,6 +51,8 @@ def test_create_data_entry_with_reaction(self):
"topic_id": 6134,
"reactions": [6263],
"replied_post_id": None,
"text": "some content #2",
"post_number": 1.0,
}
result = self.transformer.create_data_entry(
raw_data, interaction_type="reaction", interaction_user=6263
Expand All @@ -53,6 +63,7 @@ def test_create_data_entry_with_reaction(self):
"date": datetime.datetime(
2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc
),
"text": "some content #2",
"interactions": [
{
"name": "reaction",
Expand All @@ -64,6 +75,7 @@ def test_create_data_entry_with_reaction(self):
"category_id": 500,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/1",
},
"source_id": "6261",
}
Expand All @@ -80,6 +92,8 @@ def test_transform_data_with_replied_user(self):
"replied_post_id": 6512,
"replied_post_user_id": 4444,
"topic_id": 6134,
"text": "some content #2",
"post_number": 1.0,
}
]

Expand All @@ -89,11 +103,13 @@ def test_transform_data_with_replied_user(self):
"date": datetime.datetime(
2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc
),
"text": "some content #2",
"source_id": "6262",
"metadata": {
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/1",
},
"actions": [
{
Expand All @@ -111,6 +127,7 @@ def test_transform_data_with_replied_user(self):
},
{
"author_id": "4444",
"text": "some content #2",
"date": datetime.datetime(
2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc
),
Expand All @@ -119,6 +136,7 @@ def test_transform_data_with_replied_user(self):
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/1",
},
"actions": [],
"interactions": [
Expand Down Expand Up @@ -147,6 +165,8 @@ def test_transform_data_with_reactions(self):
"replied_post_id": None,
"replied_post_user_id": None,
"topic_id": 6134,
"text": "some content #2",
"post_number": 1.0,
}
]
expected_result = [
Expand All @@ -155,11 +175,13 @@ def test_transform_data_with_reactions(self):
"date": datetime.datetime(
2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc
),
"text": "some content #2",
"source_id": "6261",
"metadata": {
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/1",
},
"actions": [
{
Expand All @@ -181,6 +203,7 @@ def test_transform_data_with_reactions(self):
"date": datetime.datetime(
2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc
),
"text": "some content #2",
"interactions": [
{
"name": "reaction",
Expand All @@ -192,12 +215,14 @@ def test_transform_data_with_reactions(self):
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/1",
},
"source_id": "6261",
},
{
"actions": [],
"author_id": "2",
"text": "some content #2",
"date": datetime.datetime(
2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc
),
Expand All @@ -212,6 +237,7 @@ def test_transform_data_with_reactions(self):
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/1",
},
"source_id": "6261",
},
Expand All @@ -233,6 +259,8 @@ def test_transform_data_replied_and_reactions(self):
"replied_post_id": 6512,
"replied_post_user_id": 4444,
"topic_id": 6134,
"text": "some content #1",
"post_number": 1.0,
},
{
"post_id": 6261,
Expand All @@ -243,6 +271,8 @@ def test_transform_data_replied_and_reactions(self):
"replied_post_id": None,
"replied_post_user_id": None,
"topic_id": 6134,
"text": "some content #2",
"post_number": 2.0,
},
]

Expand All @@ -252,11 +282,13 @@ def test_transform_data_replied_and_reactions(self):
"date": datetime.datetime(
2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc
),
"text": "some content #1",
"source_id": "6262",
"metadata": {
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/1",
},
"actions": [
{
Expand All @@ -277,11 +309,13 @@ def test_transform_data_replied_and_reactions(self):
"date": datetime.datetime(
2023, 9, 11, 21, 41, 43, 553000, tzinfo=datetime.timezone.utc
),
"text": "some content #1",
"source_id": "6262",
"metadata": {
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/1",
},
"actions": [],
"interactions": [
Expand All @@ -297,11 +331,13 @@ def test_transform_data_replied_and_reactions(self):
"date": datetime.datetime(
2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc
),
"text": "some content #2",
"source_id": "6261",
"metadata": {
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/2",
},
"actions": [
{
Expand All @@ -320,6 +356,7 @@ def test_transform_data_replied_and_reactions(self):
{
"actions": [],
"author_id": "1",
"text": "some content #2",
"date": datetime.datetime(
2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc
),
Expand All @@ -334,12 +371,14 @@ def test_transform_data_replied_and_reactions(self):
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/2",
},
"source_id": "6261",
},
{
"actions": [],
"author_id": "2",
"text": "some content #2",
"date": datetime.datetime(
2023, 9, 11, 21, 42, 43, 553000, tzinfo=datetime.timezone.utc
),
Expand All @@ -354,6 +393,7 @@ def test_transform_data_replied_and_reactions(self):
"category_id": None,
"topic_id": 6134,
"bot_activity": False,
"link": f"https://{self.forum_endpoint}/t/6134/2",
},
"source_id": "6261",
},
Expand Down
2 changes: 1 addition & 1 deletion dags/discourse_analyzer_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def discourse_etl_raw_data(
forum_endpoint=forum_endpoint, platform_id=platform_id
)
extracted_data = extractor.extract(period=period, recompute=recompute)
transformer = TransformRawInfo()
transformer = TransformRawInfo(forum_endpoint=forum_endpoint)
transformed_data = transformer.transform(
raw_data=extracted_data,
)
Expand Down

0 comments on commit c46b73c

Please sign in to comment.