From 0f057188c6eda11cdc7e169b1332d01328622409 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 1 Aug 2024 09:47:08 -0700 Subject: [PATCH 1/3] Improve pdfminer embedded image extraction in `pdf` partitioning (#3456) ### Summary This PR addresses an issue in `pdfminer` library's embedded image extraction process. Previously, some extracted "images" were incorrect, including embedded text elements, resulting in oversized bounding boxes. This update refines the extraction process to focus on actual images with more accurate, smaller bounding boxes. ### Testing PDF: [test_pdfminer_text_extraction.pdf](https://github.com/user-attachments/files/16448213/test_pdfminer_text_extraction.pdf) ``` elements = partition_pdf( filename="test_pdfminer_text_extraction", strategy=strategy, languages=["chi_sim"], analysis=True, ) ``` **Results** - this `PR` ![page1_layout_pdfminer](https://github.com/user-attachments/assets/098e0a1f-fdad-4627-a881-cbafd71ce5a0) ![page1_layout_final](https://github.com/user-attachments/assets/6dc89180-36ac-424a-99de-63810ebf8958) - `main` branch ![page1_layout_pdfminer](https://github.com/user-attachments/assets/8228995a-2ef1-4b76-9758-b8015c224e6d) ![page1_layout_final](https://github.com/user-attachments/assets/68d43d7b-7270-4f58-8360-dc76bd0df78f) --- CHANGELOG.md | 4 +- unstructured/__version__.py | 2 +- .../pdf_image/pdfminer_processing.py | 48 +++++++++++-------- .../partition/pdf_image/pdfminer_utils.py | 44 +++++------------ 4 files changed, 42 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c33bfc91a..e2b8877b92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.15.1-dev7 +## 0.15.1-dev8 ### Enhancements +* **Improve `pdfminer` embedded `image` extraction to exclude text elements and produce more accurate bounding boxes.** This results in cleaner, more precise element extraction in `pdf` partitioning. + ### Features * **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo. diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d05dee38d4..31c20c9568 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev7" # pragma: no cover +__version__ = "0.15.1-dev8" # pragma: no cover diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 83e4bab67f..4759d4b610 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -5,7 +5,7 @@ from unstructured.documents.elements import ElementType from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( - get_images_from_pdf_element, + extract_image_objects, open_pdfminer_pages_generator, rect_to_bbox, ) @@ -51,32 +51,26 @@ def process_data_with_pdfminer( for page, page_layout in open_pdfminer_pages_generator(file): height = page_layout.height - layout: List["TextRegion"] = [] + layout: list["TextRegion"] = [] for obj in page_layout: x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) if hasattr(obj, "get_text"): _text = obj.get_text() - element_class = EmbeddedTextRegion # type: ignore + text_region = _create_text_region( + x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion + ) + if text_region.bbox is not None and text_region.bbox.area > 0: + layout.append(text_region) else: - embedded_images = get_images_from_pdf_element(obj) - if len(embedded_images) > 0: - _text = None - element_class = ImageTextRegion # type: ignore - else: - continue - - text_region = element_class.from_coords( - x1 * coef, - y1 * coef, - x2 * coef, - y2 * coef, - text=_text, - source=Source.PDFMINER, - ) - - if text_region.bbox is not None and text_region.bbox.area > 0: - layout.append(text_region) + inner_image_objects = extract_image_objects(obj) + for img_obj in inner_image_objects: + new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height) + text_region = _create_text_region( + new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion + ) + if text_region.bbox is not None and text_region.bbox.area > 0: + layout.append(text_region) # NOTE(christine): always do the basic sort first for deterministic order across # python versions. @@ -90,6 +84,18 @@ def process_data_with_pdfminer( return layouts +def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class): + """Creates a text region of the specified class with scaled coordinates.""" + return region_class.from_coords( + x1 * coef, + y1 * coef, + x2 * coef, + y2 * coef, + text=text, + source=source, + ) + + @requires_dependencies("unstructured_inference") def merge_inferred_with_extracted_layout( inferred_document_layout: "DocumentLayout", diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index c35a4dedd3..fce84de0bd 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,8 +1,8 @@ import tempfile -from typing import Any, BinaryIO, List, Tuple +from typing import BinaryIO, List, Tuple from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTContainer, LTImage +from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PSSyntaxError @@ -20,39 +20,17 @@ def init_pdfminer(): return device, interpreter -def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]: - """ - Recursively extracts LTImage objects from a PDF layout element. - - This function takes a PDF layout element (could be LTImage or LTContainer) and recursively - extracts all LTImage objects contained within it. - - Parameters: - - layout_object (Any): The PDF layout element to extract images from. +def extract_image_objects(parent_object: LTItem) -> List[LTImage]: + """Recursively extracts image objects from a given parent object in a PDF document.""" + objects = [] - Returns: - - List[LTImage]: A list of LTImage objects extracted from the layout object. - - Note: - - This function recursively traverses through the layout_object to find and accumulate all - LTImage objects. - - If the input layout_object is an LTImage, it will be included in the returned list. - - If the input layout_object is an LTContainer, the function will recursively search its - children for LTImage objects. - - If the input layout_object is neither LTImage nor LTContainer, an empty list will be - returned. - """ + if isinstance(parent_object, LTImage): + objects.append(parent_object) + elif isinstance(parent_object, LTContainer): + for child in parent_object: + objects.extend(extract_image_objects(child)) - # recursively locate Image objects in layout_object - if isinstance(layout_object, LTImage): - return [layout_object] - if isinstance(layout_object, LTContainer): - img_list: List[LTImage] = [] - for child in layout_object: - img_list = img_list + get_images_from_pdf_element(child) - return img_list - else: - return [] + return objects def rect_to_bbox( From 147514f6b50bc5d6d3b1ddab38c970141d538e04 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Thu, 1 Aug 2024 15:24:17 -0400 Subject: [PATCH 2/3] feat: msg and email metadata (#3444) Update partition_eml and partition_msg to capture cc, bcc, and message id fields. Docs PR: https://github.com/Unstructured-IO/docs/pull/135/files Testing ``` from unstructured.partition.email import partition_email from test_unstructured.unit_utils import example_doc_path elements = partition_email(filename=example_doc_path("eml/fake-email-header.eml"), include_headers=True) print(elements) elements[0].metadata.to_dict() ``` Note to reviewers: Tests in `test_unstructured/partition/test_email.py` were refactored and rearranged to group similar tests together, so it will be easiest to review those changes commit by commit. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: Coniferish --- CHANGELOG.md | 2 +- example-docs/eml/fake-email-header.eml | 4 +- example-docs/fake-email-with-cc-and-bcc.msg | Bin 0 -> 13824 bytes test_unstructured/partition/test_email.py | 551 +++++++++--------- test_unstructured/partition/test_msg.py | 20 + .../outlook/21be155fb0c95885.eml.json | 1 + .../outlook/497eba8c81c801c6.eml.json | 1 + .../outlook/4a16a411f162ebbb.eml.json | 1 + .../EmailMessage/02sHu00001efErPIAU.eml.json | 1 + .../EmailMessage/02sHu00001efErQIAU.eml.json | 1 + unstructured/documents/elements.py | 12 + unstructured/documents/email_elements.py | 2 +- unstructured/partition/email.py | 133 +++-- unstructured/partition/msg.py | 12 + 14 files changed, 416 insertions(+), 325 deletions(-) create mode 100644 example-docs/fake-email-with-cc-and-bcc.msg diff --git a/CHANGELOG.md b/CHANGELOG.md index e2b8877b92..754cb6af4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,8 @@ ### Features +* **Update partition_eml and partition_msg to capture cc, bcc, and message_id fields** Cc, bcc, and message_id information is captured in element metadata for both msg and email partitioning and `Recipient` elements are generated for cc and bcc when `include_headers=True` for email partitioning. * **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo. - * **Add `pdf_hi_res_max_pages` argument for partitioning, which allows rejecting PDF files that exceed this page number limit, when the `high_res` strategy is chosen.** By default, it will allow parsing PDF files with an unlimited number of pages. ### Fixes diff --git a/example-docs/eml/fake-email-header.eml b/example-docs/eml/fake-email-header.eml index b8d188ff5f..cdaadd9228 100644 --- a/example-docs/eml/fake-email-header.eml +++ b/example-docs/eml/fake-email-header.eml @@ -1,13 +1,15 @@ Received: from ABCDEFG-000.ABC.guide (00.0.0.00) by ABCDEFG-000.ABC.guide ([ba23::58b5:2236:45g2:88h2]) with Unstructured TTTT Server (version=ABC0_0, cipher=ABC_ABCDE_ABC_NOPE_ABC_000_ABC_ABC000) id 00.0.000.0 via Techbox - Transport; Wed, 20 Feb 2023 10:03:18 +1200 + Transport; Wed, 20 Feb 2023 10:03:18 +1200 MIME-Version: 1.0 Date: Fri, 16 Dec 2022 17:04:16 -0500 +Bcc: Hello Message-ID: Subject: Test Email From: Matthew Robinson To: Matthew Robinson +Cc: Fake Email , test@unstructured.io Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" --00000000000095c9b205eff92630 diff --git a/example-docs/fake-email-with-cc-and-bcc.msg b/example-docs/fake-email-with-cc-and-bcc.msg new file mode 100644 index 0000000000000000000000000000000000000000..68f59f01198bda33c3f1b7550c440c0de0644ad1 GIT binary patch literal 13824 zcmeHNTW=f36&{j~?~M4E)so}rM%hwyi$n!|AfS@cN@7fr3YWgc z5THI3DDu)@P!w(Q1KK}Oy zKg7GDHj!GzpN0IQTEjc%(PeP2f@d4iMp;EUfsx$`OS9>Pq3QX%bJMB0xv9B3O9S!b z$er2IshQaL{A}w+ry=(ydSIzr>O(L6q3%-=eJiRB^lU)IRZ@*0Z6u7L_a#ubpQ6tG zK255!ZfOWTxT;KVu0C~tI@D=3juxp?SF_zJtsXKo}))9kDSJBm-Xm6Y8^GpsF4CTwX^2wkEjC}d0GTpm_=#}yxr;p zUBdy*{*o^NON?u-3AD<-GIv&&cOjKjH}RZC-)Ln?JubaSB~(8+d(+BZsk~)m3zx?Xz3T^b6oELx#M1;K`Il4-3${$^QsFpv+Uyi#slPXpGih*8S;K zqmW@j%hV6A>PGB^Bt1y=B6kv!bc3f4sXn9^i?%2Au$YpCuB(>6LougETFz>QD@{DL|zIdaFN!fBE$Mh2Is`KSJlyVXW%A!RHdCmSE{=%*X1;vr6ke8xAw$ z7&_?tsPk`4=Ag(f|I_HX9!c&Y^|fkgxv$jf)TfO85Bp3#mw&o_^O!$IvywE{jXb>5 z@}60wID1ep^?QOCs0GYKwq8-VU&zCsioVreT|9Nqz2;vieH)*)a?rq(3D%o_|IAu z+o@RgRyJRVSqlX_>)2L)qgX22u}mRfw1McpMhgY2vT0j6@OOE6)1}QK`(9IT%ha=; zgNQQ1-PO+}O&k40)Ys>X{49^CZ!}%wSLa#&h#3gs&?!wX1HhHHPt!TK zSdFL?aq?f;{W^R3-(Q4A)X9BxICMtSIDYH;5!KO4{>PoibpImipYM|X{hZrhoY8m9 z*dV)6_-&)iAbE}U4`!xov@X=vOtbn)cT|Ua)ePq__u-scUk;Z^a*irNb^p;lo?aYT zo?A*UE{~^|7gH0lsd-5UJf4VpS^I&p+i>W=+76GQoOTut?P+@3qhYJsjtoRpg*0va zBC5igHhvgY5ltt(3dRmEd+nKag1*s@XJUUr_BAyd2vWIlY%&KmpUT*G@Uc^yk-MIYPrG&ud!cAHSGQQQtHU~w7;9dVzx#B+$;gS?2yz2uJ&e~kDt;wy-+BC;;O30(Bj2ivWuuX#K_ z^WJabeI4=Vh&K@5LL`5;_hvedbOMoUQV$~cfPIMlhy#eG{h_+F$>Vzj`J;7Z6Lsl& zzG>uhlWy{`wW_8~9^Yx?FMDNpeJ{cyPO!W=xAeUyZ`U3N>h~+^g%Cv7fphtp4%eeT zcUzhCYiyT>g0O4$z>E5ckH>xggqvv%M()ir-8eeY2=a)x(2X~1% zed6DMJ@E#2Mc=&g);x|JzWu@ttaKEfa}M8rdS2v%~E{YikZ=x;Sb=ugA||p0x3Ri<&Sg%1b_T-_}xw5%lw~v5-I*q z!21co5t`%6{h#|GDL&6Et>+)gPyNf}=X@f?=NZ4X{}DeW9)IowrTCY`^7E^qZ2sjQ zREp0#i`Mo}+2#Jv{jU_Cy1#+Q;BSA#=lU;`pL=g9z6D+xzWW7Qvi@;9C&j-4UY-dU zn#u3lzf69fIi&bk!8=WG1V4Owo%xf<&-0EH|8?*_AUHyEeEIyxIZcZH26%b@%+L(K zC&2n+e*5LN+Wg5ooL_n@e)z8C%l99&C#n1~@Ye6YwsGny>0g+XaT4Nh8XV}w{1T%1 zIVA4w|2d~i@!tk-+&ipL{@SmJ8l$eLYgJn`Hi;&581l|P*Fc%OTdzp;^3#7Dd)`T$ zq^0W*=X9z3gW%;Jl%Y93=YQGy$2nb!e-pgh1V`}0H#4tv{^gu5#UBQ5tNG8xm#;ro z0Q_6vZMFY!@r@Q*pMN-~OXYtDym3T^=K7cWKj(BQ{%!EmUKohq4*w@MgWGEJ=esEW zjmJX#;Qr5BFE<$f=lvMhm7u{-|3Uqq_jXeG$H3ce|L2+}#UBUn$A}Dm@*B&S`akb0 zrT8iEw%UIM_5bSu@afI%_J7{fO69*JmS67w)Tb2xE_k_KGdvIfp91dD{NJ2j6n?qr z-hWbd^KDEECVcqWe^CGDy}VTaGvIBv|8q|##h(RlyZxVgIw}4;=eDJU+(|(2r2#|c%P^LF9F~9Kaxic)$cD-`Rf6B4CGOS*Kzbo-s2xf{`2-nZ=758?k(1OEdaB&NRr literal 0 HcmV?d00001 diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 1b3f7cad9d..d39576c241 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -1,18 +1,28 @@ +"""Test suite for `unstructured.partition.email` module.""" + +from __future__ import annotations + import datetime import email import os import pathlib import tempfile +from email import policy +from email.message import EmailMessage +from typing import cast import pytest +from pytest_mock import MockFixture from test_unstructured.unit_utils import ( + LogCaptureFixture, assert_round_trips_through_JSON, example_doc_path, parse_optional_datetime, ) from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ( + Element, ElementMetadata, Image, ListItem, @@ -35,10 +45,6 @@ ) from unstructured.partition.text import partition_text -FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve() -EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml") - - EXPECTED_OUTPUT = [ NarrativeText(text="This is a test email to use for unit tests."), Title(text="Important points:"), @@ -73,13 +79,16 @@ ), MetaData(name="MIME-Version", text="1.0"), MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"), + Recipient(name="Hello", text="hello@unstructured.io"), MetaData( name="Message-ID", - text="", + text="CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com", ), Subject(text="Test Email"), Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"), Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"), + Recipient(name="Fake Email", text="fake-email@unstructured.io"), + Recipient(name="test", text="test@unstructured.io"), MetaData( name="Content-Type", text='multipart/alternative; boundary="00000000000095c9b205eff92630"', @@ -91,7 +100,7 @@ MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"), MetaData( name="Message-ID", - text="", + text="CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com", ), Subject(text="Test Email"), Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"), @@ -110,24 +119,17 @@ def test_partition_email_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - elements = partition_email(filename=filename) + elements = partition_email(filename=example_doc_path("eml/fake-email.eml")) + assert len(elements) > 0 assert elements == EXPECTED_OUTPUT for element in elements: assert element.metadata.filename == "fake-email.eml" -def test_partition_email_from_filename_with_metadata_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - elements = partition_email(filename=filename, metadata_filename="test") - assert len(elements) > 0 - assert all(element.metadata.filename == "test" for element in elements) - - def test_partition_email_from_filename_malformed_encoding(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-malformed-encoding.eml") - elements = partition_email(filename=filename) + elements = partition_email(filename=example_doc_path("eml/fake-email-malformed-encoding.eml")) + assert len(elements) > 0 assert elements == EXPECTED_OUTPUT @@ -148,9 +150,11 @@ def test_partition_email_from_filename_malformed_encoding(): ("email-replace-mime-encodings-error-5.eml", None), ], ) -def test_partition_email_from_filename_default_encoding(filename, expected_output): - filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename) - elements = partition_email(filename=filename_path) +def test_partition_email_from_filename_default_encoding( + filename: str, expected_output: Element | None +): + elements = partition_email(example_doc_path("eml/" + filename)) + assert len(elements) > 0 if expected_output: assert elements == expected_output @@ -159,9 +163,9 @@ def test_partition_email_from_filename_default_encoding(filename, expected_outpu def test_partition_email_from_file(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - with open(filename) as f: + with open(example_doc_path("eml/fake-email.eml"), "rb") as f: elements = partition_email(file=f) + assert len(elements) > 0 assert elements == EXPECTED_OUTPUT for element in elements: @@ -184,10 +188,10 @@ def test_partition_email_from_file(): ("email-replace-mime-encodings-error-5.eml", None), ], ) -def test_partition_email_from_file_default_encoding(filename, expected_output): - filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename) - with open(filename_path) as f: +def test_partition_email_from_file_default_encoding(filename: str, expected_output: Element | None): + with open(example_doc_path("eml/" + filename), "rb") as f: elements = partition_email(file=f) + assert len(elements) > 0 if expected_output: assert elements == expected_output @@ -196,9 +200,9 @@ def test_partition_email_from_file_default_encoding(filename, expected_output): def test_partition_email_from_file_rb(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - with open(filename, "rb") as f: + with open(example_doc_path("eml/fake-email.eml"), "rb") as f: elements = partition_email(file=f) + assert len(elements) > 0 assert elements == EXPECTED_OUTPUT for element in elements: @@ -220,10 +224,12 @@ def test_partition_email_from_file_rb(): ("email-replace-mime-encodings-error-5.eml", None), ], ) -def test_partition_email_from_file_rb_default_encoding(filename, expected_output): - filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename) - with open(filename_path, "rb") as f: +def test_partition_email_from_file_rb_default_encoding( + filename: str, expected_output: Element | None +): + with open(example_doc_path("eml/" + filename), "rb") as f: elements = partition_email(file=f) + assert len(elements) > 0 if expected_output: assert elements == expected_output @@ -243,9 +249,9 @@ def test_partition_email_from_spooled_temp_file(): def test_partition_email_from_text_file(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") - with open(filename) as f: + with open(example_doc_path("eml/fake-email.txt"), "rb") as f: elements = partition_email(file=f, content_source="text/plain") + assert len(elements) > 0 assert elements == EXPECTED_OUTPUT for element in elements: @@ -253,13 +259,9 @@ def test_partition_email_from_text_file(): def test_partition_email_from_text_file_with_headers(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") - with open(filename) as f: - elements = partition_email( - file=f, - content_source="text/plain", - include_headers=True, - ) + with open(example_doc_path("eml/fake-email.txt"), "rb") as f: + elements = partition_email(file=f, content_source="text/plain", include_headers=True) + assert len(elements) > 0 assert elements == ALL_EXPECTED_OUTPUT for element in elements: @@ -267,27 +269,23 @@ def test_partition_email_from_text_file_with_headers(): def test_partition_email_from_text_file_max(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") - with open(filename) as f: - elements = partition_email( - file=f, - content_source="text/plain", - max_partition=20, - ) + with open(example_doc_path("eml/fake-email.txt"), "rb") as f: + elements = partition_email(file=f, content_source="text/plain", max_partition=20) + assert len(elements) == 6 def test_partition_email_from_text_file_raises_value_error(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") - with pytest.raises(ValueError), open(filename) as f: + with pytest.raises(ValueError), open(example_doc_path("eml/fake-email.txt"), "rb") as f: partition_email(file=f, content_source="text/plain", min_partition=1000) def test_partition_email_from_text(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - with open(filename) as f: + with open(example_doc_path("eml/fake-email.eml")) as f: text = f.read() + elements = partition_email(text=text) + assert len(elements) > 0 assert elements == EXPECTED_OUTPUT for element in elements: @@ -299,8 +297,10 @@ def test_partition_email_from_text_work_with_empty_string(): def test_partition_email_from_filename_with_embedded_image(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-image-embedded.eml") - elements = partition_email(filename=filename, content_source="text/plain") + elements = partition_email( + example_doc_path("eml/fake-email-image-embedded.eml"), content_source="text/plain" + ) + assert len(elements) > 0 assert elements == IMAGE_EXPECTED_OUTPUT for element in elements: @@ -308,48 +308,22 @@ def test_partition_email_from_filename_with_embedded_image(): def test_partition_email_from_file_with_header(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml") - with open(filename) as f: - msg = email.message_from_file(f) - elements = partition_email_header(msg) - assert len(elements) > 0 - assert elements == RECEIVED_HEADER_OUTPUT - for element in elements: - assert element.metadata.filename is None - + with open(example_doc_path("eml/fake-email-header.eml")) as f: + msg = email.message_from_file(f, policy=policy.default) -def test_partition_email_from_filename_has_metadata(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - elements = partition_email(filename=filename) - parent_id = elements[0].metadata.parent_id + msg = cast(EmailMessage, msg) + elements = partition_email_header(msg) assert len(elements) > 0 - assert ( - elements[0].metadata.to_dict() - == ElementMetadata( - coordinates=None, - filename=filename, - last_modified="2022-12-16T17:04:16-05:00", - page_number=None, - url=None, - sent_from=["Matthew Robinson "], - sent_to=["NotMatthew "], - subject="Test Email", - filetype="message/rfc822", - parent_id=parent_id, - languages=["eng"], - ).to_dict() - ) - expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00") - assert parse_optional_datetime(elements[0].metadata.last_modified) == expected_dt - for element in elements: - assert element.metadata.filename == "fake-email.eml" + assert elements == RECEIVED_HEADER_OUTPUT + all(element.metadata.filename is None for element in elements) def test_extract_email_text_matches_html(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml") - elements_from_text = partition_email(filename=filename, content_source="text/plain") - elements_from_html = partition_email(filename=filename, content_source="text/html") + filename = example_doc_path("eml/fake-email-attachment.eml") + elements_from_text = partition_email(filename, content_source="text/plain") + elements_from_html = partition_email(filename, content_source="text/html") + assert len(elements_from_text) == len(elements_from_html) # NOTE(robinson) - checking each individually is necessary because the text/html returns # HTMLTitle, HTMLNarrativeText, etc @@ -359,49 +333,30 @@ def test_extract_email_text_matches_html(): def test_extract_base64_email_text_matches_html(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-b64.eml") - elements_from_text = partition_email(filename=filename, content_source="text/plain") - elements_from_html = partition_email(filename=filename, content_source="text/html") + filename = example_doc_path("eml/fake-email-b64.eml") + elements_from_text = partition_email(filename, content_source="text/plain") + elements_from_html = partition_email(filename, content_source="text/html") + assert len(elements_from_text) == len(elements_from_html) for i, element in enumerate(elements_from_text): assert element == elements_from_text[i] assert element.metadata.filename == "fake-email-b64.eml" -def test_extract_attachment_info(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml") - with open(filename) as f: - msg = email.message_from_file(f) - attachment_info = extract_attachment_info(msg) - assert len(attachment_info) > 0 - assert attachment_info == ATTACH_EXPECTED_OUTPUT - - -def test_partition_email_raises_with_none_specified(): - with pytest.raises(ValueError): - partition_email() - - -def test_partition_email_raises_with_too_many_specified(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - with open(filename) as f: - text = f.read() - with pytest.raises(ValueError): - partition_email(filename=filename, text=text) - - -def test_partition_email_raises_with_invalid_content_type(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - with pytest.raises(ValueError): - partition_email(filename=filename, content_source="application/json") - - def test_partition_email_processes_fake_email_with_header(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml") - elements = partition_email(filename=filename) + elements = partition_email(example_doc_path("eml/fake-email-header.eml")) + assert len(elements) > 0 - for element in elements: - assert element.metadata.filename == "fake-email-header.eml" + assert all(element.metadata.filename == "fake-email-header.eml" for element in elements) + assert all( + element.metadata.bcc_recipient == ["Hello "] for element in elements + ) + assert all( + element.metadata.cc_recipient + == ["Fake Email ", "test@unstructured.io"] + for element in elements + ) + assert all(element.metadata.email_message_id is not None for element in elements) @pytest.mark.parametrize( @@ -413,22 +368,124 @@ def test_partition_email_processes_fake_email_with_header(): ("Thursday 5/3/2023 02:32:49", None), ], ) -def test_convert_to_iso_8601(time, expected): +def test_convert_to_iso_8601(time: str, expected: str | None): iso_time = convert_to_iso_8601(time) + assert iso_time == expected -def test_partition_email_still_works_with_no_content(caplog): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml") - elements = partition_email(filename=filename) +def test_partition_email_still_works_with_no_content(caplog: LogCaptureFixture): + elements = partition_email(example_doc_path("eml/email-no-html-content-1.eml")) + assert len(elements) == 1 assert elements[0].text.startswith("Hey there") assert "text/html was not found. Falling back to text/plain" in caplog.text +def test_partition_email_with_json(): + elements = partition_email(example_doc_path("eml/fake-email.eml")) + assert_round_trips_through_JSON(elements) + + +def test_partition_email_with_pgp_encrypted_message(caplog: LogCaptureFixture): + elements = partition_email(example_doc_path("eml/fake-encrypted.eml")) + + assert elements == [] + assert "WARNING" in caplog.text + assert "Encrypted email detected" in caplog.text + + +def test_partition_email_inline_content_disposition(): + elements = partition_email( + example_doc_path("eml/email-inline-content-disposition.eml"), + process_attachments=True, + attachment_partitioner=partition_text, + ) + + assert isinstance(elements[0], Text) + assert isinstance(elements[1], Text) + + +def test_add_chunking_strategy_on_partition_email(): + chunk_elements = partition_email( + example_doc_path("eml/fake-email.txt"), chunking_strategy="by_title" + ) + elements = partition_email(example_doc_path("eml/fake-email.txt")) + chunks = chunk_by_title(elements) + + assert chunk_elements != elements + assert chunk_elements == chunks + + +# -- raise error behaviors ----------------------------------------------------------------------- + + +def test_partition_msg_raises_with_no_partitioner(): + with pytest.raises(ValueError): + partition_email(example_doc_path("eml/fake-email-attachment.eml"), process_attachments=True) + + +def test_partition_email_raises_with_none_specified(): + with pytest.raises(ValueError): + partition_email() + + +def test_partition_email_raises_with_too_many_specified(): + with open(example_doc_path("eml/fake-email.eml")) as f: + text = f.read() + + with pytest.raises(ValueError): + partition_email(example_doc_path("eml/fake-email.eml"), text=text) + + +def test_partition_email_raises_with_invalid_content_type(): + with pytest.raises(ValueError): + partition_email(example_doc_path("eml/fake-email.eml"), content_source="application/json") + + +# -- metadata behaviors -------------------------------------------------------------------------- + + +def test_partition_email_from_filename_with_metadata_filename(): + elements = partition_email(example_doc_path("eml/fake-email.eml"), metadata_filename="test") + + assert len(elements) > 0 + assert all(element.metadata.filename == "test" for element in elements) + + +def test_partition_email_from_filename_has_metadata(): + elements = partition_email(example_doc_path("eml/fake-email.eml")) + parent_id = elements[0].metadata.parent_id + + assert len(elements) > 0 + assert ( + elements[0].metadata.to_dict() + == ElementMetadata( + coordinates=None, + filename=example_doc_path("eml/fake-email.eml"), + last_modified="2022-12-16T17:04:16-05:00", + page_number=None, + url=None, + sent_from=["Matthew Robinson "], + sent_to=["NotMatthew "], + subject="Test Email", + filetype="message/rfc822", + parent_id=parent_id, + languages=["eng"], + email_message_id="CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com", + ).to_dict() + ) + expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00") + assert parse_optional_datetime(elements[0].metadata.last_modified) == expected_dt + for element in elements: + assert element.metadata.filename == "fake-email.eml" + + def test_partition_email_from_filename_exclude_metadata(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml") - elements = partition_email(filename=filename, include_metadata=False) + elements = partition_email( + example_doc_path("eml/fake-email-header.eml"), include_metadata=False + ) + assert parse_optional_datetime(elements[0].metadata.last_modified) is None assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None @@ -436,13 +493,9 @@ def test_partition_email_from_filename_exclude_metadata(): def test_partition_email_from_text_file_exclude_metadata(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") - with open(filename) as f: - elements = partition_email( - file=f, - content_source="text/plain", - include_metadata=False, - ) + with open(example_doc_path("eml/fake-email.txt"), "rb") as f: + elements = partition_email(file=f, content_source="text/plain", include_metadata=False) + assert parse_optional_datetime(elements[0].metadata.last_modified) is None assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None @@ -450,25 +503,83 @@ def test_partition_email_from_text_file_exclude_metadata(): def test_partition_email_from_file_exclude_metadata(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - with open(filename) as f: + with open(example_doc_path("eml/fake-email.eml"), "rb") as f: elements = partition_email(file=f, include_metadata=False) + assert parse_optional_datetime(elements[0].metadata.last_modified) is None assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None assert elements[0].metadata.filename is None -def test_partition_email_can_process_attachments( - tmpdir, - filename="example-docs/eml/fake-email-attachment.eml", -): +def test_partition_email_metadata_date_from_header(mocker: MockFixture): + mocker.patch("unstructured.partition.email.get_last_modified_date", return_value=None) + mocker.patch("unstructured.partition.email.get_last_modified_date_from_file", return_value=None) + + elements = partition_email(example_doc_path("eml/fake-email-attachment.eml")) + + assert elements[0].metadata.last_modified == "2022-12-23T12:08:48-06:00" + + +def test_partition_email_from_file_custom_metadata_date(): + with open(example_doc_path("eml/fake-email-attachment.eml"), "rb") as f: + elements = partition_email(file=f, metadata_last_modified="2020-07-05T09:24:28") + + assert elements[0].metadata.last_modified == "2020-07-05T09:24:28" + + +def test_partition_email_custom_metadata_date(): + elements = partition_email( + example_doc_path("eml/fake-email-attachment.eml"), + metadata_last_modified="2020-07-05T09:24:28", + ) + + assert elements[0].metadata.last_modified == "2020-07-05T09:24:28" + + +def test_partition_eml_add_signature_to_metadata(): + elements = partition_email(example_doc_path("eml/signed-doc.p7s")) + + assert len(elements) == 1 + assert elements[0].text == "This is a test" + assert elements[0].metadata.signature == "\n" + + +# -- attachment behaviors ------------------------------------------------------------------------ + + +def test_extract_attachment_info(): + with open(example_doc_path("eml/fake-email-attachment.eml")) as f: + msg = email.message_from_file(f, policy=policy.default) + msg = cast(EmailMessage, msg) + attachment_info = extract_attachment_info(msg) + + assert len(attachment_info) > 0 + assert attachment_info == ATTACH_EXPECTED_OUTPUT + + +def test_partition_email_odd_attachment_filename(): + elements = partition_email( + example_doc_path("eml/email-equals-attachment-filename.eml"), + process_attachments=True, + attachment_partitioner=partition_text, + ) + + assert elements[1].metadata.filename == "odd=file=name.txt" + + +def test_partition_email_can_process_attachments(tmp_path: pathlib.Path): + output_dir = tmp_path / "output" + output_dir.mkdir() + filename = example_doc_path("eml/fake-email-attachment.eml") with open(filename) as f: - msg = email.message_from_file(f) - extract_attachment_info(msg, output_dir=tmpdir.dirname) + msg = email.message_from_file(f, policy=policy.default) + msg = cast(EmailMessage, msg) + extract_attachment_info(msg, output_dir=str(output_dir)) + attachment_filename = os.path.join( - tmpdir.dirname, - ATTACH_EXPECTED_OUTPUT[0]["filename"], + output_dir, + str(ATTACH_EXPECTED_OUTPUT[0]["filename"]), ) mocked_last_modification_date = "0000-00-05T09:24:28" @@ -495,25 +606,27 @@ def test_partition_email_can_process_attachments( elements[-1].metadata.parent_id = None assert elements[0].text.startswith("Hello!") - for element in elements[:-1]: assert element.metadata.filename == "fake-email-attachment.eml" assert element.metadata.subject == "Fake email with attachment" - assert elements[-1].text == "Hey this is a fake attachment!" assert elements[-1].metadata == expected_metadata -def test_partition_email_can_process_min_max_with_attachments( - tmpdir, - filename="example-docs/eml/fake-email-attachment.eml", -): +def test_partition_email_can_process_min_max_with_attachments(tmp_path: pathlib.Path): + output_dir = tmp_path / "output" + output_dir.mkdir() + filename = example_doc_path("eml/fake-email-attachment.eml") with open(filename) as f: - msg = email.message_from_file(f) - extract_attachment_info(msg, output_dir=tmpdir.dirname) - attachment_filename = os.path.join( - tmpdir.dirname, - ATTACH_EXPECTED_OUTPUT[0]["filename"], + msg = email.message_from_file(f, policy=policy.default) + msg = cast(EmailMessage, msg) + extract_attachment_info(msg, output_dir=str(output_dir)) + + attachment_filename = str( + os.path.join( + output_dir, + str(ATTACH_EXPECTED_OUTPUT[0]["filename"]), + ) ) attachment_elements = partition_text( @@ -540,135 +653,29 @@ def test_partition_email_can_process_min_max_with_attachments( assert len(element.text) >= 6 -def test_partition_msg_raises_with_no_partitioner( - filename="example-docs/eml/fake-email-attachment.eml", -): - with pytest.raises(ValueError): - partition_email(filename=filename, process_attachments=True) - - -def test_partition_email_metadata_date_from_header( - mocker, - filename="example-docs/eml/fake-email-attachment.eml", -): - expected_last_modification_date = "2022-12-23T12:08:48-06:00" - - mocker.patch( - "unstructured.partition.email.get_last_modified_date", - return_value=None, - ) - mocker.patch( - "unstructured.partition.email.get_last_modified_date_from_file", - return_value=None, - ) - - elements = partition_email(filename=filename) - - assert elements[0].metadata.last_modified == expected_last_modification_date - - -def test_partition_email_from_file_custom_metadata_date( - filename="example-docs/eml/fake-email-attachment.eml", -): - expected_last_modification_date = "2020-07-05T09:24:28" - - with open(filename) as f: - elements = partition_email( - file=f, - metadata_last_modified=expected_last_modification_date, - ) - - assert elements[0].metadata.last_modified == expected_last_modification_date - - -def test_partition_email_custom_metadata_date( - filename="example-docs/eml/fake-email-attachment.eml", -): - expected_last_modification_date = "2020-07-05T09:24:28" - - elements = partition_email( - filename=filename, - metadata_last_modified=expected_last_modification_date, - ) - - assert elements[0].metadata.last_modified == expected_last_modification_date - - -def test_partition_email_inline_content_disposition( - filename="example-docs/eml/email-inline-content-disposition.eml", -): - elements = partition_email( - filename=filename, - process_attachments=True, - attachment_partitioner=partition_text, - ) - - assert isinstance(elements[0], Text) - assert isinstance(elements[1], Text) - +# -- language behaviors -------------------------------------------------------------------------- -def test_partition_email_odd_attachment_filename( - filename="example-docs/eml/email-equals-attachment-filename.eml", -): - elements = partition_email( - filename=filename, - process_attachments=True, - attachment_partitioner=partition_text, - ) - - assert elements[1].metadata.filename == "odd=file=name.txt" - -def test_partition_email_with_json(): +def test_partition_email_element_metadata_has_languages(): elements = partition_email(example_doc_path("eml/fake-email.eml")) - assert_round_trips_through_JSON(elements) - - -def test_partition_email_with_pgp_encrypted_message( - caplog, - filename="example-docs/eml/fake-encrypted.eml", -): - elements = partition_email(filename=filename) - - assert elements == [] - assert "WARNING" in caplog.text - assert "Encrypted email detected" in caplog.text - -def test_add_chunking_strategy_on_partition_email( - filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt"), -): - elements = partition_email(filename=filename) - chunk_elements = partition_email(filename, chunking_strategy="by_title") - chunks = chunk_by_title(elements) - assert chunk_elements != elements - assert chunk_elements == chunks - - -def test_partition_email_element_metadata_has_languages(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - elements = partition_email(filename=filename) assert elements[0].metadata.languages == ["eng"] def test_partition_email_respects_languages_arg(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") - elements = partition_email(filename=filename, languages=["deu"]) + elements = partition_email(example_doc_path("eml/fake-email.eml"), languages=["deu"]) + assert all(element.metadata.languages == ["deu"] for element in elements) def test_partition_eml_respects_detect_language_per_element(): - filename = "example-docs/language-docs/eng_spa_mult.eml" - elements = partition_email(filename=filename, detect_language_per_element=True) + elements = partition_email( + example_doc_path("language-docs/eng_spa_mult.eml"), + detect_language_per_element=True, + ) # languages other than English and Spanish are detected by this partitioner, # so this test is slightly different from the other partition tests langs = {e.metadata.languages[0] for e in elements if e.metadata.languages is not None} + assert "eng" in langs assert "spa" in langs - - -def test_partition_eml_add_signature_to_metadata(): - elements = partition_email(filename="example-docs/eml/signed-doc.p7s") - assert len(elements) == 1 - assert elements[0].text == "This is a test" - assert elements[0].metadata.signature == "\n" diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index 02dd5044c0..0686b0ad55 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -379,6 +379,26 @@ def it_adds_email_specific_fields_to_the_msg_element_metadata(self, opts_args: d assert opts.msg_metadata.sent_to == ["mrobinson@unstructured.io"] assert opts.msg_metadata.subject == "Test Email" + def it_captures_cc_and_bcc_element_metadata(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert opts.msg_metadata.cc_recipient == ["steve@unstructured.io"] + assert opts.msg_metadata.bcc_recipient == ["hello@unstructured.io"] + assert opts.msg_metadata.sent_to == [ + "john-ctr@unstructured.io", + "steve@unstructured.io", + "hello@unstructured.io", + ] + + def it_captures_email_message_id_element_metadata(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert ( + opts.msg_metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com" + ) + # -- .partition_attachments ------------------ @pytest.mark.parametrize("partition_attachments", [True, False]) diff --git a/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json index d17cbdc167..62f8a3ebe8 100644 --- a/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json +++ b/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json @@ -12,6 +12,7 @@ "url": "https://outlook.office365.com/owa/?ItemID=AAMkAGE2MmEwNzFlLWVjYzAtNDNhZS04ZGM1LTFjYmMzZDhiMmI0MABGAAAAAADc1MfJYetSQ6QZntYrI9k4BwDZYn%2FlfnvLSqIcW%2FYsN8ebAAATaI%2BsAADZYn%2FlfnvLSqIcW%2FYsN8ebAAATaJ9PAAA%3D&exvsurl=1&viewmodel=ReadMessageItem", "version": "CQAAABYAAADZYn/lfnvLSqIcW/YsN8ebAAATYGBM" }, + "email_message_id": "CAOvAh-6yWG99vvaoQ5niLgGTgpwe90LGiNPLvx7bAY3ZFyq54w@mail.gmail.com", "filename": "21be155fb0c95885.eml", "filetype": "message/rfc822", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json index d73cabba5b..56d73c1da9 100644 --- a/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json +++ b/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json @@ -12,6 +12,7 @@ "url": "https://outlook.office365.com/owa/?ItemID=AAMkAGE2MmEwNzFlLWVjYzAtNDNhZS04ZGM1LTFjYmMzZDhiMmI0MABGAAAAAADc1MfJYetSQ6QZntYrI9k4BwDZYn%2FlfnvLSqIcW%2FYsN8ebAAATzq5tAADZYn%2FlfnvLSqIcW%2FYsN8ebAAAZT8XfAAA%3D&exvsurl=1&viewmodel=ReadMessageItem", "version": "CQAAABYAAADZYn/lfnvLSqIcW/YsN8ebAAAZRQ8Y" }, + "email_message_id": "CAL=c59DZsEqq49DgVLQy=6v_WnxmkGfznjOoaGqqJb6VK-Mu=g@mail.gmail.com", "filename": "497eba8c81c801c6.eml", "filetype": "message/rfc822", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json index 6d29f079e3..7596c5df23 100644 --- a/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json +++ b/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json @@ -12,6 +12,7 @@ "url": "https://outlook.office365.com/owa/?ItemID=AAMkAGE2MmEwNzFlLWVjYzAtNDNhZS04ZGM1LTFjYmMzZDhiMmI0MABGAAAAAADc1MfJYetSQ6QZntYrI9k4BwDZYn%2FlfnvLSqIcW%2FYsN8ebAAATzq5sAADZYn%2FlfnvLSqIcW%2FYsN8ebAAATzrolAAA%3D&exvsurl=1&viewmodel=ReadMessageItem", "version": "CQAAABYAAADZYn/lfnvLSqIcW/YsN8ebAAATxicu" }, + "email_message_id": "CAOvAh-7KVeFHwtX20KVL=S4WgpWN91YzK11td4_W0Pv3cJ4jLQ@mail.gmail.com", "filename": "4a16a411f162ebbb.eml", "filetype": "message/rfc822", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json index 65f645ccfb..5c1f06ae6a 100644 --- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json +++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json @@ -4,6 +4,7 @@ "element_id": "4196fe41da19e8657761ecffcafd3d2f", "text": "Jane. This is a test of sending you an email from Salesforce! _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/", "metadata": { + "email_message_id": "KhIK4000000000000000000000000000000000000000000000RZP1T400CmuP1P5wTm2m679gi-mnIg@sfdc.net", "sent_from": [ "devops+salesforce-connector@unstructured.io" ], diff --git a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json index e03bc00d94..cc1ab18fd6 100644 --- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json +++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json @@ -4,6 +4,7 @@ "element_id": "6f168cd430b41fc0d66a3691ef3caa0f", "text": "Hey Sean. Testing email parsing here. Type: email Just testing the email system _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/", "metadata": { + "email_message_id": "CuWky000000000000000000000000000000000000000000000RZP1VO00MaLK8OmEQm2Bw-c3ek6uNg@sfdc.net", "sent_from": [ "devops+salesforce-connector@unstructured.io" ], diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 7d300b3f68..c1b29ee6d9 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -206,6 +206,9 @@ class ElementMetadata: regex_metadata: Optional[dict[str, list[RegexMetadata]]] # -- e-mail specific metadata fields -- + bcc_recipient: Optional[list[str]] + cc_recipient: Optional[list[str]] + email_message_id: Optional[str] sent_from: Optional[list[str]] sent_to: Optional[list[str]] subject: Optional[str] @@ -224,7 +227,9 @@ class ElementMetadata: def __init__( self, attached_to_filename: Optional[str] = None, + bcc_recipient: Optional[list[str]] = None, category_depth: Optional[int] = None, + cc_recipient: Optional[list[str]] = None, coordinates: Optional[CoordinatesMetadata] = None, data_source: Optional[DataSourceMetadata] = None, detection_class_prob: Optional[float] = None, @@ -244,6 +249,7 @@ def __init__( link_texts: Optional[list[str]] = None, link_urls: Optional[list[str]] = None, links: Optional[list[Link]] = None, + email_message_id: Optional[str] = None, orig_elements: Optional[list[Element]] = None, page_name: Optional[str] = None, page_number: Optional[int] = None, @@ -258,7 +264,9 @@ def __init__( url: Optional[str] = None, ) -> None: self.attached_to_filename = attached_to_filename + self.bcc_recipient = bcc_recipient self.category_depth = category_depth + self.cc_recipient = cc_recipient self.coordinates = coordinates self.data_source = data_source self.detection_class_prob = detection_class_prob @@ -286,6 +294,7 @@ def __init__( self.link_urls = link_urls self.link_start_indexes = link_start_indexes self.links = links + self.email_message_id = email_message_id self.orig_elements = orig_elements self.page_name = page_name self.page_number = page_number @@ -481,6 +490,8 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: """ return { "attached_to_filename": cls.FIRST, + "cc_recipient": cls.FIRST, + "bcc_recipient": cls.FIRST, "category_depth": cls.DROP, "coordinates": cls.DROP, "data_source": cls.FIRST, @@ -502,6 +513,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "link_urls": cls.LIST_CONCATENATE, "link_start_indexes": cls.DROP, "links": cls.DROP, # -- deprecated field -- + "email_message_id": cls.FIRST, "max_characters": cls.DROP, # -- unused, remove from ElementMetadata -- "orig_elements": cls.DROP, # -- not expected, added by chunking, not before -- "page_name": cls.FIRST, diff --git a/unstructured/documents/email_elements.py b/unstructured/documents/email_elements.py index d0ec73cd00..e38bbe435f 100644 --- a/unstructured/documents/email_elements.py +++ b/unstructured/documents/email_elements.py @@ -40,7 +40,7 @@ def has_datestamp(self): def __str__(self): return f"{self.name}: {self.text}" - def __eq__(self, other): + def __eq__(self, other) -> bool: if self.has_datestamp(): return ( self.name == other.name diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index bd53b26e31..3370d7534a 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -5,16 +5,17 @@ import email import os import re -from email.message import Message +from email import policy +from email.headerregistry import AddressHeader +from email.message import EmailMessage from functools import partial from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import IO, Any, Callable, Final, Optional +from typing import IO, Any, Callable, Final, Optional, Type, cast from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings from unstructured.cleaners.extract import ( extract_datetimetz, - extract_email_address, extract_ip_address, extract_ip_address_name, extract_mapi_id, @@ -82,40 +83,59 @@ def _parse_received_data(data: str) -> list[Element]: return elements -def _parse_email_address(data: str) -> tuple[str, str]: - email_address = extract_email_address(data) +def _strip_angle_brackets(data: str) -> str: + """Remove angle brackets from the beginning and end of the string if they exist. - PATTERN = r"<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>" - name = re.split(PATTERN, data.lower())[0].title().strip() + Returns: + str: The string with surrounding angle brackets removed. - return name, email_address[0] + Example: + >>> _strip_angle_brackets("") + 'example' + >>> _strip_angle_brackets("test>") + 'another>test' + >>> _strip_angle_brackets("<>") + '' + """ + return re.sub(r"^<|>$", "", data) + + +def partition_email_header(msg: EmailMessage) -> list[Element]: + def append_address_header_elements(header: AddressHeader, element_type: Type[Element]): + for addr in header.addresses: + elements.append( + element_type( + name=addr.display_name or addr.username, text=addr.addr_spec # type: ignore + ) + ) -def partition_email_header(msg: Message) -> list[Element]: elements: list[Element] = [] - for item in msg.raw_items(): - if item[0] == "To": - text = _parse_email_address(item[1]) - elements.append(Recipient(name=text[0], text=text[1])) - elif item[0] == "From": - text = _parse_email_address(item[1]) - elements.append(Sender(name=text[0], text=text[1])) - elif item[0] == "Subject": - elements.append(Subject(text=item[1])) - elif item[0] == "Received": - elements += _parse_received_data(item[1]) + + for msg_field, msg_value in msg.items(): + if msg_field in {"To", "Bcc", "Cc"}: + append_address_header_elements(msg_value, Recipient) + elif msg_field == "From": + append_address_header_elements(msg_value, Sender) + elif msg_field == "Subject": + elements.append(Subject(text=msg_value)) + elif msg_field == "Received": + elements += _parse_received_data(msg_value) + elif msg_field == "Message-ID": + elements.append(MetaData(name=msg_field, text=_strip_angle_brackets(str(msg_value)))) else: - elements.append(MetaData(name=item[0], text=item[1])) + elements.append(MetaData(name=msg_field, text=msg_value)) return elements -def find_signature(msg: Message) -> Optional[str]: +def find_signature(msg: EmailMessage) -> Optional[str]: """Extracts the signature from an email message, if it's available.""" - payload = msg.get_payload() + payload: Any = msg.get_payload() if not isinstance(payload, list): return None + payload = cast(list[EmailMessage], payload) for item in payload: if item.get_content_type().endswith("signature"): return item.get_payload() @@ -124,7 +144,7 @@ def find_signature(msg: Message) -> Optional[str]: def build_email_metadata( - msg: Message, + msg: EmailMessage, filename: Optional[str], metadata_last_modified: Optional[str] = None, last_modification_date: Optional[str] = None, @@ -134,21 +154,26 @@ def build_email_metadata( header_dict = dict(msg.raw_items()) email_date = header_dict.get("Date") + + def parse_recipients(header_value: Optional[str]) -> Optional[list[str]]: + if header_value is not None: + return [recipient.strip() for recipient in header_value.split(",")] + return None + if email_date is not None: email_date = convert_to_iso_8601(email_date) - sent_from = header_dict.get("From") - if sent_from is not None: - sent_from = [sender.strip() for sender in sent_from.split(",")] - - sent_to = header_dict.get("To") - if sent_to is not None: - sent_to = [recipient.strip() for recipient in sent_to.split(",")] + email_message_id = header_dict.get("Message-ID") + if email_message_id: + email_message_id = _strip_angle_brackets(email_message_id) element_metadata = ElementMetadata( - sent_to=sent_to, - sent_from=sent_from, - subject=header_dict.get("Subject"), + bcc_recipient=parse_recipients(header_dict.get("Bcc")), + cc_recipient=parse_recipients(header_dict.get("Cc")), + email_message_id=email_message_id, + sent_to=parse_recipients(header_dict.get("To")), + sent_from=parse_recipients(header_dict.get("From")), + subject=msg.get("Subject"), signature=signature, last_modified=metadata_last_modified or email_date or last_modification_date, filename=filename, @@ -174,17 +199,17 @@ def convert_to_iso_8601(time: str) -> Optional[str]: def extract_attachment_info( - message: Message, + message: EmailMessage, output_dir: Optional[str] = None, ) -> list[dict[str, str]]: - list_attachments = [] + list_attachments: list[Any] = [] for part in message.walk(): if "content-disposition" in part: cdisp = part["content-disposition"].split(";") cdisp = [clean_extra_whitespace(item) for item in cdisp] - attachment_info = {} + attachment_info: dict[str, Any] = {} for item in cdisp: if item.lower() in ("attachment", "inline"): continue @@ -204,7 +229,7 @@ def extract_attachment_info( with open(filename, "wb") as f: # Note(harrell) mypy wants to just us `w` when opening the file but this # causes an error since the payloads are bytes not str - f.write(attachment["payload"]) # type: ignore + f.write(attachment["payload"]) else: with NamedTemporaryFile( mode="wb", @@ -212,18 +237,18 @@ def extract_attachment_info( delete=False, ) as f: list_attachments[idx]["filename"] = os.path.basename(f.name) - f.write(attachment["payload"]) # type: ignore + f.write(attachment["payload"]) return list_attachments -def has_embedded_image(element): +def has_embedded_image(element: Element): PATTERN = re.compile(r"\[image: .+\]") return PATTERN.search(element.text) def find_embedded_image( - element: NarrativeText | Title, indices: re.Match + element: NarrativeText | Title, indices: re.Match[str] ) -> tuple[Element, Element]: start, end = indices.start(), indices.end() @@ -235,13 +260,13 @@ def find_embedded_image( def parse_email( filename: Optional[str] = None, file: Optional[IO[bytes]] = None -) -> tuple[Optional[str], Message]: +) -> tuple[Optional[str], EmailMessage]: if filename is not None: with open(filename, "rb") as f: - msg = email.message_from_binary_file(f) + msg = email.message_from_binary_file(f, policy=policy.default) elif file is not None: f_bytes = convert_to_bytes(file) - msg = email.message_from_bytes(f_bytes) + msg = email.message_from_bytes(f_bytes, policy=policy.default) else: raise ValueError("Either 'filename' or 'file' must be provided.") @@ -253,7 +278,7 @@ def parse_email( break formatted_encoding = format_encoding_str(encoding) if encoding else None - + msg = cast(EmailMessage, msg) return formatted_encoding, msg @@ -342,19 +367,22 @@ def partition_email( filename=filename, encoding=encoding, ) - msg = email.message_from_string(file_text) + msg = email.message_from_string(file_text, policy=policy.default) elif file is not None: extracted_encoding, msg = parse_email(file=file) if extracted_encoding: detected_encoding = extracted_encoding else: detected_encoding, file_text = read_txt_file(file=file, encoding=encoding) - msg = email.message_from_string(file_text) + msg = email.message_from_string(file_text, policy=policy.default) elif text is not None: _text: str = str(text) - msg = email.message_from_string(_text) + msg = email.message_from_string(_text, policy=policy.default) + else: + return [] if not encoding: encoding = detected_encoding + msg = cast(EmailMessage, msg) is_encrypted = False content_map: dict[str, str] = {} @@ -385,6 +413,7 @@ def partition_email( else: content_map[content_type] = part.get_payload() + content = None if content_source in content_map: content = content_map.get(content_source) # NOTE(robinson) - If the chosen content source is not available and there is @@ -435,7 +464,7 @@ def partition_email( element.apply(_replace_mime_encodings) except (UnicodeDecodeError, UnicodeError): # If decoding fails, try decoding through common encodings - common_encodings = [] + common_encodings: list[str] = [] for x in COMMON_ENCODINGS: _x = format_encoding_str(x) if _x != encoding: @@ -451,7 +480,6 @@ def partition_email( break except (UnicodeDecodeError, UnicodeError): continue - elif content_source == "text/plain": elements = partition_text( text=content, @@ -462,6 +490,11 @@ def partition_email( include_metadata=False, # metadata is overwritten later, so no need to compute it here detection_origin="email", ) + else: + raise ValueError( + f"Invalid content source: {content_source}. " + f"Valid content sources are: {VALID_CONTENT_SOURCES}", + ) for idx, element in enumerate(elements): indices = has_embedded_image(element) diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 838064963a..4c9daa89c9 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -2,6 +2,7 @@ import copy import os +import re import tempfile from typing import IO, Any, Iterator, Optional @@ -188,6 +189,14 @@ def _msg_metadata(self) -> ElementMetadata: email_date = sent_date.isoformat() if (sent_date := msg.sent_date) else None sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None sent_to = [r.email_address for r in msg.recipients] or None + bcc_recipient = ( + [c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None + ) + cc_recipient = ( + [c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None + ) + if email_message_id := msg.message_headers.get("Message-Id"): + email_message_id = re.sub(r"^<|>$", "", email_message_id) # Strip angle brackets element_metadata = ElementMetadata( filename=self.metadata_file_path, @@ -195,6 +204,9 @@ def _msg_metadata(self) -> ElementMetadata: sent_from=sent_from, sent_to=sent_to, subject=msg.subject or None, + bcc_recipient=bcc_recipient, + cc_recipient=cc_recipient, + email_message_id=email_message_id, ) element_metadata.detection_origin = "msg" From b749b891a7d8508ec835277653613260fb2976ed Mon Sep 17 00:00:00 2001 From: Maciej Kurzawa <68014263+mackurzawa@users.noreply.github.com> Date: Fri, 2 Aug 2024 16:25:08 +0200 Subject: [PATCH 3/3] fix: disabled checking max pages for images (#3473) Added fix related to https://github.com/Unstructured-IO/unstructured/pull/3431, which disables checking max pages for images --- CHANGELOG.md | 2 +- test_unstructured/partition/pdf_image/test_pdf.py | 5 +++++ unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 9 +++++---- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 754cb6af4e..e5206f2be3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.1-dev8 +## 0.15.1-dev9 ### Enhancements diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 63f12d921a..1dc6036053 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1372,14 +1372,18 @@ def test_analysis_artifacts_saved(): ("pdf/layout-parser-paper-with-empty-pages.pdf", 3, True), ("pdf/reliance.pdf", 3, False), ("pdf/reliance.pdf", 2, True), + ("img/DA-1p.jpg", None, False), + ("img/DA-1p.jpg", 2, False), ], ) def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_error): + is_image = not Path(filename).suffix.endswith("pdf") if not expected_error: pdf.partition_pdf_or_image( filename=example_doc_path(filename), strategy=PartitionStrategy.HI_RES, pdf_hi_res_max_pages=pdf_hi_res_max_pages, + is_image=is_image, ) else: @@ -1388,4 +1392,5 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_ filename=example_doc_path(filename), strategy=PartitionStrategy.HI_RES, pdf_hi_res_max_pages=pdf_hi_res_max_pages, + is_image=is_image, ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 31c20c9568..f4bdb64eb8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev8" # pragma: no cover +__version__ = "0.15.1-dev9" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index cf49b2066f..91b04beabc 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -518,7 +518,7 @@ def _get_pdf_page_number( elif filename: number_of_pages = PdfReader(filename).get_num_pages() else: - ValueError("Either 'file' or 'filename' must be provided.") + raise ValueError("Either 'file' or 'filename' must be provided.") return number_of_pages @@ -575,9 +575,10 @@ def _partition_pdf_or_image_local( process_file_with_pdfminer, ) - check_pdf_hi_res_max_pages_exceeded( - filename=filename, file=file, pdf_hi_res_max_pages=pdf_hi_res_max_pages - ) + if not is_image: + check_pdf_hi_res_max_pages_exceeded( + filename=filename, file=file, pdf_hi_res_max_pages=pdf_hi_res_max_pages + ) hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model() if pdf_image_dpi is None: