Skip to content

Commit

Permalink
Merge pull request #295 from TogetherCrew/fix/294-hivemind-discord-re…
Browse files Browse the repository at this point in the history
…move-null-characters

fix: discord hivemind ETL added removal of null characters!
  • Loading branch information
amindadgar authored Sep 26, 2024
2 parents 71d8e76 + 16df757 commit e5bf6be
Showing 1 changed file with 5 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import re
from typing import Any

from hivemind_etl_helpers.src.db.discord.utils.content_parser import (
Expand Down Expand Up @@ -158,7 +159,7 @@ def prepare_document(
content_url_updated, url_reference = prepare_raw_message_urls(content)

# always has length 1
assert len(author_name) == 1
assert len(author_name) == 1, "Either None or multiple authors!"

msg_meta_data = {
"channel": message["channelName"],
Expand Down Expand Up @@ -209,6 +210,9 @@ def prepare_document(
if check_no_content_only_links(content_url_updated):
raise ValueError("Message just did have urls")

# removing all ASCII control characters (0x00-0x1F and 0x7F): null bytes, but also tabs, newlines and carriage returns
content_url_updated = re.sub(r"[\x00-\x1F\x7F]", "", content_url_updated)

doc: Document
if not exclude_metadata:
doc = Document(text=content_url_updated, metadata=msg_meta_data)
Expand Down

0 comments on commit e5bf6be

Please sign in to comment.