diff --git a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py index f6194145..7a10f7a7 100644 --- a/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py +++ b/dags/hivemind_etl_helpers/src/db/discord/utils/transform_discord_raw_messges.py @@ -1,3 +1,4 @@ +import re import logging from typing import Any @@ -158,7 +159,7 @@ def prepare_document( content_url_updated, url_reference = prepare_raw_message_urls(content) # always has length 1 - assert len(author_name) == 1 + assert len(author_name) == 1, "Either None or multiple authors!" msg_meta_data = { "channel": message["channelName"], @@ -209,6 +210,9 @@ def prepare_document( if check_no_content_only_links(content_url_updated): raise ValueError("Message just did have urls") + # removing null characters + content_url_updated = re.sub(r'[\x00-\x1F\x7F]', '', content_url_updated) + doc: Document if not exclude_metadata: doc = Document(text=content_url_updated, metadata=msg_meta_data)