diff --git a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py index 395fa4b4..be7083f7 100644 --- a/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py +++ b/dags/hivemind_etl_helpers/discord_mongo_summary_etl.py @@ -1,5 +1,6 @@ import argparse import logging +from datetime import timedelta from hivemind_etl_helpers.src.db.discord.discord_summary import DiscordSummary from hivemind_etl_helpers.src.db.discord.find_guild_id import ( @@ -47,11 +48,15 @@ def process_discord_summaries(community_id: str, verbose: bool = False) -> None: from_date = setup_db( community_id=community_id, dbname=dbname, latest_date_query=latest_date_query ) - # deleting any in-complete saved summaries (meaning for threads or channels) - deletion_query = f""" - DELETE FROM data_{table_name} - WHERE (metadata_ ->> 'date')::timestamp > '{from_date.strftime("%Y-%m-%d")}'; - """ + if from_date is not None: + # deleting any in-complete saved summaries (meaning for threads or channels) + deletion_query = f""" + DELETE FROM data_{table_name} + WHERE (metadata_ ->> 'date')::timestamp > '{from_date.strftime("%Y-%m-%d")}'; + """ + from_date += timedelta(days=1) + else: + deletion_query = "" discord_summary = DiscordSummary( response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"), diff --git a/dags/hivemind_etl_helpers/discourse_summary_etl.py b/dags/hivemind_etl_helpers/discourse_summary_etl.py index a04bc634..dfd6d729 100644 --- a/dags/hivemind_etl_helpers/discourse_summary_etl.py +++ b/dags/hivemind_etl_helpers/discourse_summary_etl.py @@ -93,12 +93,15 @@ def process_forum( community_id=community_id, dbname=dbname, latest_date_query=latest_date_query ) - # deleting any in-complete saved summaries - deletion_query = f""" - DELETE FROM data_{table_name} - WHERE (metadata_ ->> 'forum_endpoint') = '{forum_endpoint}' - AND (metadata_ ->> 'date')::timestamp > '{from_date.strftime("%Y-%m-%d")}'; - """ + if from_date is not None: + # deleting any in-complete saved summaries + deletion_query = f""" + DELETE FROM data_{table_name} + WHERE (metadata_ ->> 'forum_endpoint') = '{forum_endpoint}' + AND (metadata_ ->> 'date')::timestamp > '{from_date.strftime("%Y-%m-%d")}'; + """ + else: + deletion_query = "" # increasing 1 day since we've saved the summaries of the last day # e.g.: we would have the summaries of date 2023.12.15 diff --git a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py index 7fa3297f..a540aae2 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py +++ b/dags/hivemind_etl_helpers/src/db/discord/discord_summary.py @@ -1,3 +1,4 @@ +import logging from datetime import datetime from hivemind_etl_helpers.src.db.discord.summary.prepare_grouped_data import ( @@ -72,18 +73,24 @@ def prepare_summaries( list of daily summaries converted to llama_index documents """ raw_data_grouped = prepare_grouped_data(guild_id, from_date) - thread_summaries = self.prepare_thread_summaries( - guild_id, raw_data_grouped, summarization_prefix + " discord thread" - ) - (channel_summaries, thread_summary_documenets) = self.prepare_channel_summaries( - thread_summaries, - summarization_prefix + " selection of discord thread summaries", - ) - (daily_summaries, channel_summary_documenets) = self.prepare_daily_summaries( - channel_summaries, - summarization_prefix + " selection of discord channel summaries", - ) - daily_summary_documents = transform_daily_summary_to_document(daily_summaries) + if raw_data_grouped != {}: + thread_summaries = self.prepare_thread_summaries( + guild_id, raw_data_grouped, summarization_prefix + " discord thread" + ) + (channel_summaries, thread_summary_documenets) = self.prepare_channel_summaries( + thread_summaries, + summarization_prefix + " selection of discord thread summaries", + ) + (daily_summaries, channel_summary_documenets) = self.prepare_daily_summaries( + channel_summaries, + summarization_prefix + " selection of discord channel summaries", + ) + daily_summary_documents = transform_daily_summary_to_document(daily_summaries) + else: + logging.info(f"No data received after the data: {from_date}") + thread_summary_documenets = [] + channel_summary_documenets = [] + daily_summary_documents = [] return ( thread_summary_documenets,