Skip to content

Commit

Permalink
fix: deleting the incomplete data (if was saved)!
Browse files Browse the repository at this point in the history
+ In case of some thread/channel sumamries saved we were saving duplicate data. now it is fixed by deleting the incomplete ones.
+ fix the deletion query discord_mongo_summary_etl.py in case of from_date equal to None.
  • Loading branch information
amindadgar committed Jan 1, 2024
1 parent 3f8d60b commit d438dc7
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 23 deletions.
15 changes: 10 additions & 5 deletions dags/hivemind_etl_helpers/discord_mongo_summary_etl.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import logging
from datetime import timedelta

from hivemind_etl_helpers.src.db.discord.discord_summary import DiscordSummary
from hivemind_etl_helpers.src.db.discord.find_guild_id import (
Expand Down Expand Up @@ -47,11 +48,15 @@ def process_discord_summaries(community_id: str, verbose: bool = False) -> None:
from_date = setup_db(
community_id=community_id, dbname=dbname, latest_date_query=latest_date_query
)
# deleting any in-complete saved summaries (meaning for threads or channels)
deletion_query = f"""
DELETE FROM data_{table_name}
WHERE (metadata_ ->> 'date')::timestamp > '{from_date.strftime("%Y-%m-%d")}';
"""
if from_date is not None:
# deleting any in-complete saved summaries (meaning for threads or channels)
deletion_query = f"""
DELETE FROM data_{table_name}
WHERE (metadata_ ->> 'date')::timestamp > '{from_date.strftime("%Y-%m-%d")}';
"""
from_date += timedelta(days=1)
else:
deletion_query = ""

discord_summary = DiscordSummary(
response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"),
Expand Down
15 changes: 9 additions & 6 deletions dags/hivemind_etl_helpers/discourse_summary_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,15 @@ def process_forum(
community_id=community_id, dbname=dbname, latest_date_query=latest_date_query
)

# deleting any in-complete saved summaries
deletion_query = f"""
DELETE FROM data_{table_name}
WHERE (metadata_ ->> 'forum_endpoint') = '{forum_endpoint}'
AND (metadata_ ->> 'date')::timestamp > '{from_date.strftime("%Y-%m-%d")}';
"""
if from_date is not None:
# deleting any in-complete saved summaries
deletion_query = f"""
DELETE FROM data_{table_name}
WHERE (metadata_ ->> 'forum_endpoint') = '{forum_endpoint}'
AND (metadata_ ->> 'date')::timestamp > '{from_date.strftime("%Y-%m-%d")}';
"""
else:
deletion_query = ""

# increasing 1 day since we've saved the summaries of the last day
# e.g.: we would have the summaries of date 2023.12.15
Expand Down
31 changes: 19 additions & 12 deletions dags/hivemind_etl_helpers/src/db/discord/discord_summary.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from datetime import datetime

from hivemind_etl_helpers.src.db.discord.summary.prepare_grouped_data import (
Expand Down Expand Up @@ -72,18 +73,24 @@ def prepare_summaries(
list of daily summaries converted to llama_index documents
"""
raw_data_grouped = prepare_grouped_data(guild_id, from_date)
thread_summaries = self.prepare_thread_summaries(
guild_id, raw_data_grouped, summarization_prefix + " discord thread"
)
(channel_summaries, thread_summary_documenets) = self.prepare_channel_summaries(
thread_summaries,
summarization_prefix + " selection of discord thread summaries",
)
(daily_summaries, channel_summary_documenets) = self.prepare_daily_summaries(
channel_summaries,
summarization_prefix + " selection of discord channel summaries",
)
daily_summary_documents = transform_daily_summary_to_document(daily_summaries)
if raw_data_grouped != {}:
thread_summaries = self.prepare_thread_summaries(
guild_id, raw_data_grouped, summarization_prefix + " discord thread"
)
(channel_summaries, thread_summary_documenets) = self.prepare_channel_summaries(
thread_summaries,
summarization_prefix + " selection of discord thread summaries",
)
(daily_summaries, channel_summary_documenets) = self.prepare_daily_summaries(
channel_summaries,
summarization_prefix + " selection of discord channel summaries",
)
daily_summary_documents = transform_daily_summary_to_document(daily_summaries)
else:
logging.info(f"No data received after the data: {from_date}")
thread_summary_documenets = []
channel_summary_documenets = []
daily_summary_documents = []

return (
thread_summary_documenets,
Expand Down

0 comments on commit d438dc7

Please sign in to comment.