# Number of emails shortened by truncate_long_emails(); main() prints it in
# the end-of-run summary as "# truncated emails = ...".
TRUNCATED_EMAILS_COUNTER = 0

LONG_EMAIL_THRESHOLD = 500  # words


def truncate_long_emails(email_body):
    """Cap an email body at LONG_EMAIL_THRESHOLD whitespace-separated words.

    filter_message() applies this to the main email before the short-email
    length check, so over-long emails are shortened rather than discarded.

    Args:
        email_body: The email text to inspect.

    Returns:
        ``email_body`` unchanged when it has at most LONG_EMAIL_THRESHOLD
        words. Otherwise the first LONG_EMAIL_THRESHOLD words joined by
        single spaces -- note that the original line breaks and spacing are
        not preserved in a truncated body.

    Side effects:
        Increments the module-level TRUNCATED_EMAILS_COUNTER once for every
        body that is truncated.
    """
    # Without this declaration the augmented assignment below makes the name
    # local to the function and raises UnboundLocalError on first use.
    global TRUNCATED_EMAILS_COUNTER

    # Split once and reuse the result, so the length check and the cut are
    # guaranteed to agree on what counts as a word.
    words = email_body.split()
    if len(words) <= LONG_EMAIL_THRESHOLD:
        return email_body

    TRUNCATED_EMAILS_COUNTER += 1
    return " ".join(words[:LONG_EMAIL_THRESHOLD])