Feat add hivemind etl scripts #15

Merged
merged 49 commits into from
Dec 21, 2023
Commits
8919742
feat: Adding the hivemind ETL scripts!
amindadgar Dec 13, 2023
0ee1e41
update: removing some parts that were for debugging!
amindadgar Dec 13, 2023
c7455de
update: removing phoenix llm monitoring tool for now!
amindadgar Dec 14, 2023
92f48b9
update: comment phoenix dags!
amindadgar Dec 14, 2023
8619ff3
update: Adding None values in case of channel, and day summaries!
amindadgar Dec 14, 2023
06fbf7c
Update: Discord summarization query!
amindadgar Dec 14, 2023
2f21fd9
fix: typo in help command of discourse_vectorstore_etl!
amindadgar Dec 14, 2023
eb08b06
update: Adding a condition to discourse data fetching!
amindadgar Dec 14, 2023
e3a6294
Update: Increased chunk size to 512!
amindadgar Dec 14, 2023
996d279
feat: Added the discord summary boundary case!
amindadgar Dec 14, 2023
c2f44c3
update: code cleaning with black!
amindadgar Dec 14, 2023
4926d60
fix: Updated roles id finding in text content!
amindadgar Dec 14, 2023
620d20e
feat: Updated the discord-vector-store interval!
amindadgar Dec 14, 2023
d1beb5e
feat: Adding discourse summarizer codes!
amindadgar Dec 14, 2023
690690c
udpate: moved the tests to its right directory!
amindadgar Dec 14, 2023
080a485
update: fixing the airflow image version to 2.7.3!
amindadgar Dec 14, 2023
bc26f1a
fix: each post always have 1 category!
amindadgar Dec 14, 2023
4e57b07
update: Added more test cases for discourse summary!
amindadgar Dec 14, 2023
d47e407
feat: Completing the discourse summary!
amindadgar Dec 18, 2023
ec5efa3
feat: commenting the debug parts and code cleaning!
amindadgar Dec 18, 2023
9657cc5
feat: For now excluding all metadata for discord summaries!
amindadgar Dec 19, 2023
7faf40c
feat: excluding all metadata in summaries!
amindadgar Dec 19, 2023
6045dfd
update: remove credentials printing!
amindadgar Dec 19, 2023
1a56e53
feat: Added logging to the iteration count of summaries!
amindadgar Dec 19, 2023
6006d49
feat: Added logs to summary preparation!
amindadgar Dec 19, 2023
b963c22
Merge branch 'main' into feat-add-hivemind-etl-discourse-summary
amindadgar Dec 19, 2023
142f0c4
update: removing duplicate codes!
amindadgar Dec 19, 2023
252eded
fix: linter issues based on super-linter rules!
amindadgar Dec 19, 2023
0ebdd2f
fix: more linter issues!
amindadgar Dec 19, 2023
b37aec6
fix: more linter issues!
amindadgar Dec 19, 2023
88338b2
fix: linter issues and the requiremnets.txt issue!
amindadgar Dec 19, 2023
d695bb2
feat: Added init files so pytest can find the tests!
amindadgar Dec 19, 2023
3ce2033
fix: pylint linter issue!
amindadgar Dec 19, 2023
200a401
trying more!
amindadgar Dec 19, 2023
7602d90
feat: added textlinter ignore for requirements.txt file!
amindadgar Dec 19, 2023
7b4cb79
trying more!
amindadgar Dec 19, 2023
761cf27
Merge branch 'main' into feat-add-hivemind-etl-discourse-summary
amindadgar Dec 19, 2023
f6a0d99
update: test cases with the latest code updates!
amindadgar Dec 20, 2023
5c55642
feat: Added new services to docker-compose!
amindadgar Dec 20, 2023
838ce68
fix: roles have different structure in text!
amindadgar Dec 20, 2023
4ff253e
update: test cases with latest code updates!
amindadgar Dec 20, 2023
242a43b
fix: docker-compose.test.yaml creds!
amindadgar Dec 20, 2023
2f27d52
trying to fix the textlinter error!
amindadgar Dec 20, 2023
c787ae3
update: removing the pypdf package for now!
amindadgar Dec 20, 2023
fafe587
Merge pull request #18 from TogetherCrew/feat-add-hivemind-etl-discou…
amindadgar Dec 20, 2023
c96d92d
feat: Added the embedding_dim and chunk_size as env variables!
amindadgar Dec 20, 2023
457c97d
fix: linter errors based on super-linter rules!
amindadgar Dec 20, 2023
d83d7d9
feat: Added the new env variables to the docker-compose!
amindadgar Dec 20, 2023
b27cac8
feat: reading embed dim from .env!
amindadgar Dec 20, 2023
5 changes: 4 additions & 1 deletion .gitignore
@@ -168,4 +168,7 @@ cython_debug/
.org

# Logs
-logs
+logs
+
+credentials_oauth.json
+credentials.json
97 changes: 97 additions & 0 deletions dags/hivemind_etl.py
@@ -0,0 +1,97 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Example DAG demonstrating the usage of dynamic task mapping."""
from __future__ import annotations

import logging
from datetime import datetime, timedelta

import phoenix as px
from airflow import DAG
from airflow.decorators import task

from hivemind_etl_helpers.discord_mongo_summary_etl import process_discord_summaries
from hivemind_etl_helpers.discord_mongo_vector_store_etl import (
    process_discord_guild_mongo,
)
from hivemind_etl_helpers.src.utils.mongo import MongoSingleton


def setup_phoenix():
    _ = px.launch_app()
    logging.info(f"Phoenix Session URL: {px.active_session().url}")


with DAG(dag_id="phoenix_startup", start_date=datetime(2022, 3, 4)) as dag:
    dag.on_startup.append(setup_phoenix)


with DAG(
    dag_id="discord_vector_store_update",
    start_date=datetime(2022, 11, 10, 12),
    schedule_interval=timedelta(minutes=60),
    catchup=False,
) as dag:

    @task
    def get_all_discord_communities() -> list[str]:
        """
        Getting all communities having discord from database
        """
        mongo = MongoSingleton.get_instance()
        communities = (
            mongo.client["Core"]["platforms"]
            .find({"name": "discord"})
            .distinct("community")
        )
        return communities

    @task
    def start_discord_vectorstore(community_id: str):
        logging.info(f"Working on community, {community_id}")
        process_discord_guild_mongo(community_id=community_id)
        logging.info(f"Community {community_id} Job finished!")

    communities = get_all_discord_communities()
    # `start_discord_vectorstore` will be mapped over the list,
    # running once per community id
    start_discord_vectorstore.expand(community_id=communities)

with DAG(dag_id="discord_summary_vector_store", start_date=datetime(2023, 1, 1)) as dag:

    @task
    def get_all_discord_communities() -> list[str]:
        """
        Getting all communities having discord from database
        """
        mongo = MongoSingleton.get_instance()
        communities = (
            mongo.client["Core"]["platforms"]
            .find({"name": "discord"})
            .distinct("community")
        )
        return communities

    @task
    def start_discord_summary_vectorstore(community_id: str):
        logging.info(f"Working on community, {community_id}")
        process_discord_summaries(community_id=community_id, verbose=False)
        logging.info(f"Community {community_id} Job finished!")

    communities = get_all_discord_communities()
    # dynamic task mapping: `expand` runs the task once per community id
    start_discord_summary_vectorstore.expand(community_id=communities)
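
Both DAGs discover the target communities with the same MongoDB lookup. The hedged sketch below restates that lookup with plain pymongo so it can be sanity-checked outside Airflow; the `Core.platforms` collection layout is taken from the DAG code above, while the connection URI is a placeholder (the DAGs obtain their client through `MongoSingleton` instead).

```python
# Standalone check of the community lookup used by both DAGs (illustrative only).
from pymongo import MongoClient

# placeholder URI; not how the DAGs connect in production
client = MongoClient("mongodb://localhost:27017")

# every platform document with name "discord" contributes its distinct community id
community_ids = (
    client["Core"]["platforms"]
    .find({"name": "discord"})
    .distinct("community")
)
print(f"{len(community_ids)} Discord communities found:", community_ids)
```
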
13 changes: 13 additions & 0 deletions dags/hivemind_etl_helpers/README.md
@@ -0,0 +1,13 @@
# Hivemind ETL

In this repository we write the data ETL scripts for the hivemind bot.

## How to

For now, the scripts focus on Discord data stored in MongoDB and will store the embeddings in Postgres. To start the script for the MongoDB Discord data:

```bash
python discord_mongo_etl.py [guild_id]
```

Note: please replace [guild_id] with your guild id.
122 changes: 122 additions & 0 deletions dags/hivemind_etl_helpers/discord_mongo_summary_etl.py
@@ -0,0 +1,122 @@
import argparse
import logging

from llama_index.response_synthesizers import get_response_synthesizer

from hivemind_etl_helpers.src.db.discord.discord_summary import DiscordSummary
from hivemind_etl_helpers.src.db.discord.find_guild_id import (
    find_guild_id_by_community_id,
)
from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
from hivemind_etl_helpers.src.utils.cohere_embedding import CohereEmbedding
from hivemind_etl_helpers.src.utils.pg_db_utils import setup_db
from hivemind_etl_helpers.src.utils.pg_vector_access import PGVectorAccess


def process_discord_summaries(community_id: str, verbose: bool = False) -> None:
    """
    Prepare the discord data by grouping it into thread, channel and day summaries
    and save the summaries within the database.
    Note: This will always process the data until 1 day ago.

    Parameters
    ------------
    community_id : str
        the community id to process its guild data
    verbose : bool
        whether to print out the summarization process
        default is `False`
    """
    guild_id = find_guild_id_by_community_id(community_id)
    logging.info(f"COMMUNITYID: {community_id}, GUILDID: {guild_id}")
    table_name = "discord_summary"
    dbname = f"community_{community_id}"

    latest_date_query = f"""
        SELECT (metadata_->> 'date')::timestamp
        AS latest_date
        FROM data_{table_name}
        ORDER BY (metadata_->>'date')::timestamp DESC
        LIMIT 1;
    """
    from_date = setup_db(
        community_id=community_id, dbname=dbname, latest_date_query=latest_date_query
    )

    discord_summary = DiscordSummary(
        response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"),
        verbose=verbose,
    )

    (
        thread_summaries_documents,
        channel_summary_documents,
        daily_summary_documents,
    ) = discord_summary.prepare_summaries(
        guild_id=guild_id,
        from_date=from_date,
        summarization_query="Please give me a summary using the data you have!",
    )

    logging.info("Getting the summaries embedding and saving within database!")

    node_parser = configure_node_parser(chunk_size=256)
    pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname)

    embed_model = CohereEmbedding()
    embed_dim = 1024

    # saving thread summaries
    pg_vector.save_documents_in_batches(
        community_id=community_id,
        documents=thread_summaries_documents,
        batch_size=100,
        node_parser=node_parser,
        max_request_per_minute=None,
        embed_model=embed_model,
        embed_dim=embed_dim,
        request_per_minute=10000,
    )

    # saving daily summaries
    pg_vector.save_documents_in_batches(
        community_id=community_id,
        documents=daily_summary_documents,
        batch_size=100,
        node_parser=node_parser,
        max_request_per_minute=None,
        embed_model=embed_model,
        embed_dim=embed_dim,
        request_per_minute=10000,
    )

    # saving channel summaries
    pg_vector.save_documents_in_batches(
        community_id=community_id,
        documents=channel_summary_documents,
        batch_size=100,
        node_parser=node_parser,
        max_request_per_minute=None,
        embed_model=embed_model,
        embed_dim=embed_dim,
        request_per_minute=10000,
    )


if __name__ == "__main__":
    logging.basicConfig()
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "community_id", type=str, help="the Community that the guild is related to"
    )
    args = parser.parse_args()
    process_discord_summaries(community_id=args.community_id)
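
As a quick way to verify what the incremental logic above will resume from, the following hedged sketch runs the same `latest_date_query` by hand with psycopg2. The `data_discord_summary` table name and the JSON `metadata_` column follow the query in the script; the connection parameters are placeholders, not values from this PR.

```python
# Inspect the latest stored summary date by hand (illustrative; connection values are placeholders).
import psycopg2

conn = psycopg2.connect(
    dbname="community_<community_id>",  # placeholder database name
    user="postgres",
    password="<password>",
    host="localhost",
)
with conn, conn.cursor() as cur:
    # same query the ETL uses to decide where to resume from
    cur.execute(
        """
        SELECT (metadata_->>'date')::timestamp AS latest_date
        FROM data_discord_summary
        ORDER BY (metadata_->>'date')::timestamp DESC
        LIMIT 1;
        """
    )
    row = cur.fetchone()
    print("latest stored summary date:", row[0] if row else None)
```
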
77 changes: 77 additions & 0 deletions dags/hivemind_etl_helpers/discord_mongo_vector_store_etl.py
@@ -0,0 +1,77 @@
import argparse
import logging
from datetime import timedelta

from hivemind_etl_helpers.src.db.discord.discord_raw_message_to_document import (
    discord_raw_to_docuemnts,
)
from hivemind_etl_helpers.src.db.discord.find_guild_id import (
    find_guild_id_by_community_id,
)
from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
from hivemind_etl_helpers.src.utils.cohere_embedding import CohereEmbedding
from hivemind_etl_helpers.src.utils.pg_db_utils import setup_db
from hivemind_etl_helpers.src.utils.pg_vector_access import PGVectorAccess


def process_discord_guild_mongo(community_id: str) -> None:
    """
    Process the discord guild messages from mongodb
    and save the processed data within postgres.

    Parameters
    -----------
    community_id : str
        the community id to create or use its database
    """
    guild_id = find_guild_id_by_community_id(community_id)
    logging.info(f"COMMUNITYID: {community_id}, GUILDID: {guild_id}")
    table_name = "discord"
    dbname = f"community_{community_id}"

    latest_date_query = f"""
        SELECT (metadata_->> 'date')::timestamp
        AS latest_date
        FROM data_{table_name}
        ORDER BY (metadata_->>'date')::timestamp DESC
        LIMIT 1;
    """
    from_date = setup_db(
        community_id=community_id, dbname=dbname, latest_date_query=latest_date_query
    )

    # because the timestamp stored in postgres does not keep milliseconds
    # we might get duplicate messages,
    # so add just one second to the latest date
    if from_date is not None:
        from_date += timedelta(seconds=1)

    documents = discord_raw_to_docuemnts(guild_id=guild_id, from_date=from_date)
    node_parser = configure_node_parser(chunk_size=256)
    pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname)

    embed_model = CohereEmbedding()
    embed_dim = 1024

    pg_vector.save_documents_in_batches(
        community_id=community_id,
        documents=documents,
        batch_size=100,
        node_parser=node_parser,
        max_request_per_minute=None,
        embed_model=embed_model,
        embed_dim=embed_dim,
        request_per_minute=10000,
        # max_request_per_day=REQUEST_PER_DAY,
    )


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "community_id", type=str, help="the Community that the guild is related to"
    )
    args = parser.parse_args()

    process_discord_guild_mongo(community_id=args.community_id)
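
The one-second bump applied to `from_date` above deserves a concrete illustration. The hedged sketch below (not part of the PR, with made-up timestamps) shows why resuming from a second-precision latest date would re-fetch the last already-stored message, and how adding one second avoids that at the cost of possibly skipping other messages within that same second.

```python
# Illustration of the `from_date += timedelta(seconds=1)` adjustment (not from the PR).
from datetime import datetime, timedelta

# latest date as read back from the vector store: sub-second part lost
stored_latest = datetime(2023, 12, 20, 10, 15, 42)
# the original message behind that stored row had millisecond precision
last_stored_message_time = datetime(2023, 12, 20, 10, 15, 42, 831000)

# naive resume point: the already-stored message would be fetched again
assert last_stored_message_time >= stored_latest

# bumped resume point: the already-stored message is excluded from the next run
from_date = stored_latest + timedelta(seconds=1)
assert last_stored_message_time < from_date
```
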