-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #303 from TogetherCrew/feat/302-telegram-raw-vectorize
Feat/302 telegram raw vectorize
- Loading branch information
Showing
15 changed files
with
939 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
2 changes: 2 additions & 0 deletions
2
dags/hivemind_etl_helpers/src/db/telegram/extract/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .messages import ExtractMessages | ||
from .tc_chats import TelegramChats |
82 changes: 82 additions & 0 deletions
82
dags/hivemind_etl_helpers/src/db/telegram/extract/messages.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from datetime import datetime | ||
|
||
from hivemind_etl_helpers.src.db.telegram.schema import TelegramMessagesModel | ||
from tc_neo4j_lib import Neo4jOps | ||
|
||
|
||
class ExtractMessages:
    """Read the text messages of a single Telegram chat from Neo4j."""

    def __init__(self, chat_id: str) -> None:
        """
        Parameters
        -----------
        chat_id : str
            id of the `TGChat` node whose messages should be extracted
        """
        self.chat_id = chat_id
        self._connection = Neo4jOps.get_instance()

    def extract(self, from_date: datetime | None = None) -> list[TelegramMessagesModel]:
        """
        extract messages related to the given `chat_id`

        Only messages with a non-null `text` are considered. Versions of a
        message sharing the same `id` are collapsed into one row: the version
        with the smallest `updated_at` provides the message node (creation
        date and relations), the version with the largest `updated_at`
        provides the text and the edit time.

        Parameters
        -----------
        from_date : datetime | None
            load from a specific date (compared against `message.date`
            as epoch milliseconds)
            if not given, load all data

        Returns
        ---------
        tg_messages : list[TelegramMessagesModel]
            the telegram messages, newest `message_created_at` first
        """
        parameters: dict[str, str | int] = {"chat_id": self.chat_id}
        date_filter = ""

        # The filter text and its bound parameter are decided by the same
        # `is not None` check so they can never get out of sync. A truthiness
        # check on the timestamp would skip binding the parameter for a
        # `from_date` at the Unix epoch (timestamp 0) while the query still
        # references `$from_date_timestamp`.
        if from_date is not None:
            parameters["from_date_timestamp"] = int(from_date.timestamp() * 1000)
            date_filter = "AND message.date >= $from_date_timestamp"

        query = f"""
            MATCH (c:TGChat {{id: $chat_id}})<-[:SENT_IN]-(message:TGMessage)
            WHERE message.text IS NOT NULL
            {date_filter}
            WITH
                message.id AS message_id,
                MAX(message.updated_at) AS latest_msg_time,
                MIN(message.updated_at) AS first_msg_time
            MATCH (first_message:TGMessage {{id: message_id, updated_at: first_msg_time}})
            MATCH (last_edit:TGMessage {{id: message_id, updated_at: latest_msg_time}})
            WITH
                first_message AS message,
                last_edit.updated_at AS edited_at,
                last_edit.text AS message_text
            OPTIONAL MATCH (author:TGUser)-[created_rel:CREATED_MESSAGE]->(message)
            OPTIONAL MATCH (reacted_user:TGUser)-[react_rel:REACTED_TO]->(message)
            OPTIONAL MATCH (reply_msg:TGMessage)-[:REPLIED]->(message)
            OPTIONAL MATCH (replied_user:TGUser)-[:CREATED_MESSAGE]->(reply_msg)
            OPTIONAL MATCH (message)-[:MENTIONED]->(mentioned_user:TGUser)
            RETURN
                message.id AS message_id,
                message_text,
                author.username AS author_username,
                message.date AS message_created_at,
                edited_at AS message_edited_at,
                COLLECT(DISTINCT mentioned_user.username) AS mentions,
                COLLECT(DISTINCT replied_user.username) AS repliers,
                COLLECT(DISTINCT reacted_user.username) AS reactors
            ORDER BY message_created_at DESC
        """

        with self._connection.neo4j_driver.session() as session:
            result = session.run(
                query,
                parameters=parameters,
            )
            # materialise the rows while the session is still open
            rows = result.data()

        # each row's keys are the RETURN aliases above, which are the
        # field names of TelegramMessagesModel
        return [TelegramMessagesModel(**row) for row in rows]
33 changes: 33 additions & 0 deletions
33
dags/hivemind_etl_helpers/src/db/telegram/extract/tc_chats.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import logging | ||
|
||
from tc_neo4j_lib import Neo4jOps | ||
|
||
|
||
class TelegramChats:
    """List the Telegram chats (`TGChat` nodes) stored in Neo4j."""

    def __init__(self) -> None:
        self._connection = Neo4jOps.get_instance()

    def extract_chats(self) -> list[tuple[str, str]]:
        """
        extract the chat id and chat names

        This is best-effort: any exception raised while querying is logged
        and an empty list is returned instead of propagating the error.

        Returns
        ---------
        chat_info : list[tuple[str, str]]
            a list of Telegram chat id and chat name
            (both values are passed through `str()`)
        """
        driver = self._connection.neo4j_driver

        # annotated to match the declared return type
        chat_info: list[tuple[str, str]] = []
        try:
            with driver.session() as session:
                records = session.run(
                    "MATCH (c:TGChat) RETURN c.id as chat_id, c.title as name"
                )
                # consume the records while the session is still open
                chat_info = [
                    (str(record["chat_id"]), str(record["name"])) for record in records
                ]
        except Exception as exp:
            logging.error(f"Exception during extracting chat ids. exp: {exp}")

        return chat_info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from pydantic import BaseModel | ||
|
||
|
||
class TelegramMessagesModel(BaseModel):
    """
    Represents a Telegram message with its associated metadata.

    The field names are the `RETURN` aliases of the Cypher query in
    `ExtractMessages.extract`, which builds instances directly from the
    returned rows via `TelegramMessagesModel(**row)`.
    """

    # `message.id` of the message node
    message_id: int
    # text of the message version with the latest `updated_at`
    message_text: str
    # `username` of the user that created the message
    # NOTE(review): the author comes from an OPTIONAL MATCH in the extraction
    # query, so this can presumably be null there - confirm `str` is intended
    author_username: str
    # `message.date` of the message (presumably epoch milliseconds, since the
    # extraction filter compares it to a millisecond timestamp - TODO confirm)
    message_created_at: float
    # largest `updated_at` among the versions of the message
    message_edited_at: float
    # distinct usernames mentioned in the message
    mentions: list[str]
    # distinct usernames of the authors of replies to the message
    repliers: list[str]
    # distinct usernames of users that reacted to the message
    reactors: list[str]
1 change: 1 addition & 0 deletions
1
dags/hivemind_etl_helpers/src/db/telegram/transform/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .messages import TransformMessages |
59 changes: 59 additions & 0 deletions
59
dags/hivemind_etl_helpers/src/db/telegram/transform/messages.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from hivemind_etl_helpers.src.db.telegram.schema import TelegramMessagesModel | ||
from llama_index.core import Document | ||
|
||
|
||
class TransformMessages:
    """Convert extracted Telegram messages into llama-index documents."""

    def __init__(self, chat_id: str, chat_name: str) -> None:
        """
        Parameters
        ----------
        chat_id : str
            id of the chat the messages belong to
        chat_name : str
            name of the chat; stored in every document's metadata
        """
        self.chat_id = chat_id
        self.chat_name = chat_name

    def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
        """
        transform the given telegram messages to llama-index documents

        Parameters
        ----------
        messages : list[TelegramMessagesModel]
            the extracted telegram messages

        Returns
        ---------
        transformed_docs : list[llama_index.core.Document]
            a list of llama-index documents to be embedded & loaded into db
            (one per message, in the same order)
        """
        return [
            Document(
                text=message.message_text,
                # message ids are ints while document ids are strings; convert
                # explicitly instead of relying on the model layer to coerce
                doc_id=str(message.message_id),
                metadata={
                    "author": message.author_username,
                    "createdAt": message.message_created_at,
                    "updatedAt": message.message_edited_at,
                    "mentions": message.mentions,
                    "replies": message.repliers,
                    "reactors": message.reactors,
                    "chat_name": self.chat_name,
                },
                # no metadata is embedded: only the message text is
                excluded_embed_metadata_keys=[
                    "author",
                    "createdAt",
                    "updatedAt",
                    "mentions",
                    "replies",
                    "reactors",
                    "chat_name",
                ],
                # "author" is the only metadata key not excluded for the LLM
                excluded_llm_metadata_keys=[
                    "createdAt",
                    "updatedAt",
                    "mentions",
                    "replies",
                    "reactors",
                    "chat_name",
                ],
            )
            for message in messages
        ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from hivemind_etl_helpers.src.db.telegram.utils import TelegramPlatform | ||
|
||
|
||
class TelegramUtils(TelegramPlatform):
    """Thin subclass of `TelegramPlatform` that adds no behaviour of its own."""

    def __init__(self, chat_id: str, chat_name: str) -> None:
        # forward both arguments unchanged to the base-class initializer
        super().__init__(chat_id, chat_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .platform import TelegramPlatform |
Oops, something went wrong.