From dd5101b5f40ecc6f0893fd65c95afeadce4d71f4 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 21 Oct 2024 12:24:53 +0330 Subject: [PATCH] fix: from_date None handling! and more fine-grained test case for it. --- .../src/db/telegram/extract/messages.py | 12 +- .../integration/test_extract_messages.py | 133 +++++++++++++----- 2 files changed, 107 insertions(+), 38 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/telegram/extract/messages.py b/dags/hivemind_etl_helpers/src/db/telegram/extract/messages.py index f1da3820..65d5c14c 100644 --- a/dags/hivemind_etl_helpers/src/db/telegram/extract/messages.py +++ b/dags/hivemind_etl_helpers/src/db/telegram/extract/messages.py @@ -26,9 +26,12 @@ def extract(self, from_date: datetime | None = None) -> list[TelegramMessagesMod """ query = "MATCH (c:TGChat {id: $chat_id})<-[:SENT_IN]-(message:TGMessage)" + # initialize where_clause: str | None = None - from_date_timestamp = int(from_date.timestamp() * 1000) + from_date_timestamp: int | None = None + if from_date: + from_date_timestamp = int(from_date.timestamp() * 1000) where_clause = f""" AND message.date >= $from_date_timestamp """ @@ -64,11 +67,16 @@ def extract(self, from_date: datetime | None = None) -> list[TelegramMessagesMod COLLECT(DISTINCT reacted_user.username) AS reactors ORDER BY message_created_at DESC """ + + parameters = {"chat_id": self.chat_id} + if from_date_timestamp: + parameters["from_date_timestamp"] = from_date_timestamp + tg_messages = [] with self._connection.neo4j_driver.session() as session: result = session.run( query, - {"chat_id": self.chat_id, "from_date_timestamp": from_date_timestamp}, + parameters=parameters, ) messages = result.data() tg_messages = [TelegramMessagesModel(**message) for message in messages] diff --git a/dags/hivemind_etl_helpers/tests/integration/test_extract_messages.py b/dags/hivemind_etl_helpers/tests/integration/test_extract_messages.py index 42334000..0ea07b12 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_extract_messages.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_extract_messages.py @@ -10,7 +10,7 @@ class TestExtractTelegramMessages(TestCase): def setUp(self) -> None: load_dotenv() self.chat_id = "1234567890" - self.extractor = ExtractMessages() + self.extractor = ExtractMessages(self.chat_id) self._delete_everything() def tearDown(self) -> None: @@ -102,6 +102,101 @@ def test_extract_single_data_with_from_date(self): self.assertEqual(data, []) def test_extract_multiple_data(self): + with self.extractor._connection.neo4j_driver.session() as session: + session.run( + """ + CREATE (c:TGChat {id: $chat_id}), + (u1:TGUser {id: '927814807.0', username: 'User One'}), + (u2:TGUser {id: '203678862.0', username: 'User Two'}), + (m1:TGMessage { + id: '3.0', + text: '🎉️️️️️️ Welcome to the TC Ingestion Pipeline', + date: $created_at1, + updated_at: $created_at1 + } + ), + (m4:TGMessage { + id: '3.0', + text: '🎉️️️️️️ Welcome to the TC Ingestion Pipeline. EDITED MSG', + date: $created_at4, + updated_at: $created_at4 + } + ), + (m2:TGMessage { + id: '4.0', + text: 'Hi', + date: $created_at2, + updated_at: $created_at2 + } + ), + (m3:TGMessage { + id: '5.0', + text: 'Reply🫡', + date: $created_at3, + updated_at: $created_at3 + } + ), + (m1)-[:SENT_IN]->(c), + (m2)-[:SENT_IN]->(c), + (m3)-[:SENT_IN]->(c), + (m4)-[:SENT_IN]->(c), + (u1)-[:CREATED_MESSAGE]->(m1), + (u2)-[:CREATED_MESSAGE]->(m2), + (u2)-[:CREATED_MESSAGE]->(m3), + (m1)-[:EDITED]->(m4), + (m3)-[:REPLIED]->(m1), + (u2)-[:REACTED_TO {new_reaction: '[{"type":"emoji","emoji":"🍓"}]', date: $reaction_date}]->(m1) + """, + { + "chat_id": self.chat_id, + "created_at1": 1672531200.0, # Sunday, January 1, 2023 12:00:00 AM + "created_at4": 1672531205.0, # Sunday, January 1, 2023 12:00:05 AM + "created_at2": 1672617600.0, # Monday, January 2, 2023 12:00:00 AM + "created_at3": 1672704000.0, # Tuesday, January 3, 2023 12:00:00 AM + "reaction_date": 1672790400.0, # Wednesday, January 4, 2023 12:00:00 AM + }, + ) + data = self.extractor.extract() + print("data", data) + + expected_data = [ + TelegramMessagesModel( + message_id=3, + message_text="🎉️️️️️️ Welcome to the TC Ingestion Pipeline. EDITED MSG", + author_username="User One", + message_created_at=1672531200.0, + message_edited_at=1672531205.0, + mentions=[], + repliers=["User Two"], + reactors=["User Two"], + ), + TelegramMessagesModel( + message_id=4, + message_text="Hi", + author_username="User Two", + message_created_at=1672617600.0, + message_edited_at=1672617600.0, + mentions=[], + repliers=[], + reactors=[], + ), + TelegramMessagesModel( + message_id=5, + message_text="Reply🫡", + author_username="User Two", + message_created_at=1672704000.0, + message_edited_at=1672704000.0, + mentions=[], + repliers=[], + reactors=[], + ), + ] + + self.assertEqual(len(data), 3) + for d in data: + self.assertIn(d, expected_data) + + def test_extract_multiple_data_with_from_date(self): with self.extractor._connection.neo4j_driver.session() as session: session.run( """ @@ -158,38 +253,4 @@ def test_extract_multiple_data(self): ) data = self.extractor.extract(from_date=datetime(2024, 1, 1)) - self.assertEqual( - data, - [ - TelegramMessagesModel( - message_id=3, - message_text="🎉️️️️️️ Welcome to the TC Ingestion Pipeline. EDITED MSG", - author_username="User One", - message_created_at=1672531200, - message_edited_at=1672531200, - mentions=[], - repliers=[], - reactors=["User Two"], - ), - TelegramMessagesModel( - message_id=4, - message_text="Hi", - author_username="User Two", - message_created_at=1672531205, - message_edited_at=1672531205, - mentions=[], - repliers=[], - reactors=[], - ), - TelegramMessagesModel( - message_id=5, - message_text="Reply🫡", - author_username="User Two", - message_created_at=1672704000, - message_edited_at=1672704000, - mentions=[], - repliers=[], - reactors=[], - ), - ], - ) + self.assertEqual(data, [])