From 656a563ad9b8085f35c1ecf025f891adb179bbeb Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Thu, 24 Oct 2024 18:41:41 +0200 Subject: [PATCH 01/16] Minimal working example --- .../example_with_inline_fields.html | 18 +++ .../test_ontology_to_unstructured_parsing.py | 109 +++++++++++++- .../example_with_inline_fields.json | 46 ++++++ ...t_html_to_unstructured_and_back_parsing.py | 41 ++++++ .../partition/html/transformations.py | 139 ++++++++++++++++-- 5 files changed, 338 insertions(+), 15 deletions(-) create mode 100644 test_unstructured/documents/html_files/example_with_inline_fields.html create mode 100644 test_unstructured/documents/unstructured_json_output/example_with_inline_fields.json diff --git a/test_unstructured/documents/html_files/example_with_inline_fields.html b/test_unstructured/documents/html_files/example_with_inline_fields.html new file mode 100644 index 0000000000..3e55b5817c --- /dev/null +++ b/test_unstructured/documents/html_files/example_with_inline_fields.html @@ -0,0 +1,18 @@ + +
+
+

+ Table of Contents +

+
+ 68 Prince Street Palmdale, CA 93550 +
+ + www.google.com + + + More text + +
+
+ diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py index 94d0870f5b..93afad78b3 100644 --- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py +++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py @@ -4,7 +4,15 @@ from unstructured.chunking.basic import chunk_elements from unstructured.chunking.title import chunk_by_title -from unstructured.documents.ontology import Column, Document, Page, Paragraph +from unstructured.documents.ontology import ( + Column, + Document, + Hyperlink, + Image, + Page, + Paragraph, + Table, +) from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder from unstructured.partition.html import partition_html from unstructured.partition.html.transformations import ( @@ -171,6 +179,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): ("html_file_path", "json_file_path"), [ ("html_files/example.html", "unstructured_json_output/example.json"), + ( + "html_files/example_with_inline_fields.html", + "unstructured_json_output/example_with_inline_fields.json", + ), ], ) def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path): @@ -185,3 +197,98 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p for i in range(len(expected_json_elements)): assert expected_json_elements[i] == expected_json_elements[i] + + +def test_inline_elements_are_squeezed(): + ontology = Document( + children=[ + Page( + children=[ + Hyperlink(text="Hyperlink1"), + Hyperlink(text="Hyperlink2"), + Hyperlink(text="Hyperlink3"), + ], + ) + ] + ) + unstructured_elements = ontology_to_unstructured_elements(ontology) + assert len(unstructured_elements) == 2 + + page, text1 = unstructured_elements + assert text1.text == "Hyperlink1 Hyperlink2 Hyperlink3" + + +def test_text_elements_are_not_squeezed(): + ontology = Document( + children=[ + Page( + children=[ + Paragraph(text="Paragraph1"), + Paragraph(text="Paragraph2"), + ], + ) + ] + ) + unstructured_elements = ontology_to_unstructured_elements(ontology) + assert len(unstructured_elements) == 3 + + page, text1, text2 = unstructured_elements + assert text1.text == "Paragraph1" + assert text2.text == "Paragraph2" + + +def test_inline_elements_are_squeezed_when_image(): + ontology = Document( + children=[ + Page( + children=[ + Paragraph(text="Paragraph1"), + Hyperlink(text="Hyperlink1"), + Image(text="Image1"), + Hyperlink(text="Hyperlink2"), + Hyperlink(text="Hyperlink3"), + Paragraph(text="Paragraph2"), + Paragraph(text="Paragraph3"), + ], + ) + ] + ) + unstructured_elements = ontology_to_unstructured_elements(ontology) + assert len(unstructured_elements) == 5 + + page, text1, image, text2, text3 = unstructured_elements + assert text1.text == "Paragraph1 Hyperlink1" + assert text2.text == "Hyperlink2 Hyperlink3 Paragraph2" + assert text3.text == "Paragraph3" + + assert '" + }, + "text": "", + "type": "UncategorizedText" + }, + { + "element_id": "45b3d0053468484ba1c7b53998115412", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "3a6b156a81764e17be128264241f8136", + "text_as_html": "
" + }, + "text": "", + "type": "UncategorizedText" + }, + { + "element_id": "6cd3c1ba79654abb9c86162b6d1dae46", + "metadata": { + "category_depth": 2, + "page_number": 1, + "parent_id": "45b3d0053468484ba1c7b53998115412", + "text_as_html": "

Table of Contents

68 Prince Street Palmdale, CA 93550
www.google.com " + }, + "text": "Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com", + "type": "NarrativeText" + }, + { + "element_id": "cb0d6675109241428778c7b996e0b21c", + "metadata": { + "category_depth": 2, + "page_number": 1, + "parent_id": "45b3d0053468484ba1c7b53998115412", + "text_as_html": "More text " + }, + "text": "More text", + "type": "UncategorizedText" + } +] \ No newline at end of file diff --git a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py index 8da0e3b832..06281c712e 100644 --- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py +++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py @@ -487,3 +487,44 @@ def test_ordered_list(): ) ] _assert_elements_equal(unstructured_elements, expected_elements) + + +def test_squeezed_elements_are_parsed_back(): + # language=HTML + html_as_str = _wrap_in_body_and_page( + """ +

+ Table of Contents +

+
+ 68 Prince Street Palmdale, CA 93550 +
+ + www.google.com + + """ + ) + + unstructured_elements, parsed_ontology = _parse_to_unstructured_elements_and_back_to_html( + html_as_str + ) + expected_html = indent_html(html_as_str, html_parser="html.parser") + parsed_html = indent_html(parsed_ontology.to_html(), html_parser="html.parser") + + assert expected_html == parsed_html + expected_elements = _page_elements + [ + NarrativeText( + text="Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com", + element_id="2", + detection_origin="vlm_partitioner", + metadata=ElementMetadata( + text_as_html='

Table of Contents

' + '
' + "68 Prince Street Palmdale, CA 93550 " + "
" + 'www.google.com ', + parent_id="1", + ), + ) + ] + _assert_elements_equal(unstructured_elements, expected_elements) diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index aa1c53555c..b36289aead 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -2,6 +2,7 @@ import html from collections import OrderedDict +from itertools import chain from typing import Sequence, Type from bs4 import BeautifulSoup, Tag @@ -19,11 +20,19 @@ ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME, ) from unstructured.documents.ontology import ( + Bibliography, + Citation, Document, ElementTypeEnum, + Footnote, + FootnoteReference, + Glossary, + Hyperlink, + NarrativeText, OntologyElement, Page, Paragraph, + Quote, UncategorizedText, ) @@ -79,15 +88,18 @@ def ontology_to_unstructured_elements( ), ) ] - + children = [] for child in ontology_element.children: - elements_to_return += ontology_to_unstructured_elements( + child = ontology_to_unstructured_elements( child, parent_id=ontology_element.id, page_number=page_number, depth=0 if isinstance(ontology_element, Document) else depth + 1, filename=filename, ) + children += child + squeezed_children = squeeze_inline_elements(children) + elements_to_return += squeezed_children else: unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[ ontology_element.__class__.__name__ @@ -116,6 +128,102 @@ def ontology_to_unstructured_elements( return elements_to_return +def squeeze_inline_elements(elements: list[Element]) -> list[Element]: + result_elements = [] + + current_element = None + for next_element in elements: + if current_element is None: + current_element = next_element + continue + + if can_unstructured_elements_be_merged(current_element, next_element): + current_element.text += " " + next_element.text # append here + current_element.metadata.text_as_html += ( + " " + next_element.metadata.text_as_html + ) # append here + else: + result_elements.append(current_element) + current_element = next_element + + if current_element is not None: + result_elements.append(current_element) + + return result_elements + + +def can_unstructured_elements_be_merged(current_element: Element, next_element: Element) -> bool: + """ + Elements can be merged when (remember they can be already after some merging): + - Neither of them has children (only first HTML tag in text_as_html has to considered; + if they were merged already they didn't have children) + - We do not want to merge two text elements, only inline element with text so there + is no two text elements + - and all other elements are inline elements + """ + current_html_tags = BeautifulSoup( + current_element.metadata.text_as_html, "html.parser" + ).find_all() + next_html_tags = BeautifulSoup(next_element.metadata.text_as_html, "html.parser").find_all() + + ontology_elements = [ + parse_html_to_ontology_element(html_tag) + for html_tag in chain(current_html_tags, next_html_tags) + ] + + has_text_flag = False + for ontology_element in ontology_elements: + if ontology_element.children: + return False + + if is_text_element(ontology_element) and not has_text_flag: + has_text_flag = True + continue + + if is_text_element(ontology_element) and has_text_flag: + return False + + if not (is_inline_element(ontology_element) or is_text_element(ontology_element)): + return False + + return True + + +def is_text_element(ontology_element: OntologyElement) -> bool: + text_classes = [ + NarrativeText, + Quote, + Paragraph, + Footnote, + FootnoteReference, + Citation, + Bibliography, + Glossary, + ] + text_categories = [ElementTypeEnum.metadata] + + if any(isinstance(ontology_element, class_) for class_ in text_classes): + return True + + if any(ontology_element.elementType == category for category in text_categories): + return True + + return False + + +def is_inline_element(ontology_element: OntologyElement) -> bool: + inline_classes = [Hyperlink] + inline_categories = [ElementTypeEnum.specialized_text, ElementTypeEnum.annotation] + + if any(isinstance(ontology_element, class_) for class_ in inline_classes): + return True + + if any(ontology_element.elementType == category for category in inline_categories): + return True + + return False + + def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) -> OntologyElement: """ Converts a sequence of unstructured Element objects to an OntologyElement object. @@ -140,18 +248,21 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) ) for element in unstructured_elements: - html_as_tag = BeautifulSoup(element.metadata.text_as_html, "html.parser").find() - ontology_element = parse_html_to_ontology_element(html_as_tag) - # Note: Each HTML of non-terminal Element doesn't have children in HTML - # So we just add Ontology Element with tag and class, later children are appended by - # parent_id. - # For terminal Elements entire HTML is added to text_as_html, thus it allows us to - # recreate the entire HTML structure - - id_to_element_mapping[ontology_element.id] = ontology_element - - if element.metadata.parent_id and element.metadata.parent_id in id_to_element_mapping: - id_to_element_mapping[element.metadata.parent_id].children.append(ontology_element) + html_as_tags = BeautifulSoup(element.metadata.text_as_html, "html.parser").find_all( + recursive=False + ) + for html_as_tag in html_as_tags: + ontology_element = parse_html_to_ontology_element(html_as_tag) + # Note: Each HTML of non-terminal Element doesn't have children in HTML + # So we just add Ontology Element with tag and class, later children are appended by + # parent_id. + # For terminal Elements entire HTML is added to text_as_html, thus it allows us to + # recreate the entire HTML structure + + id_to_element_mapping[ontology_element.id] = ontology_element + + if element.metadata.parent_id and element.metadata.parent_id in id_to_element_mapping: + id_to_element_mapping[element.metadata.parent_id].children.append(ontology_element) root_id, root_element = id_to_element_mapping.popitem(last=False) return root_element From 1c435cde3074c973599b9f035e65ffbc3ddd3413 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Thu, 24 Oct 2024 18:43:52 +0200 Subject: [PATCH 02/16] Update changelog --- CHANGELOG.md | 1 + unstructured/__version__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8b38ebbf7..8d07df3b8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * **Bump `unstructured-inference` to 0.7.39** and upgrade other dependencies * **Round coordinates** Round coordinates when computing bounding box overlaps in `pdfminer_processing.py` to nearest machine precision. This can help reduce underterministic behavior from machine precision that affects which bounding boxes to combine. * **Request retry parameters in `partition_via_api` function.** Expose retry-mechanism related parameters in the `partition_via_api` function to allow users to configure the retry behavior of the API requests. +* **Elements created from V2 HTML are less granular** Added merging of adjacent text elements and inline html tags in the HTML partitioner to reduce the number of elements created from V2 HTML. ### Features diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 0358235928..4ed3db7923 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.1-dev8" # pragma: no cover +__version__ = "0.16.1-dev9" # pragma: no cover From 0d8dd4dfee27ee93bed15a3b53b605b6337edabf Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Thu, 24 Oct 2024 19:08:14 +0200 Subject: [PATCH 03/16] Add docstring and merge only on the same level in tree --- .../test_ontology_to_unstructured_parsing.py | 35 ++++++++++++ .../partition/html/transformations.py | 56 ++++++++++++++----- 2 files changed, 76 insertions(+), 15 deletions(-) diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py index 93afad78b3..a176e1d8b7 100644 --- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py +++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py @@ -11,6 +11,7 @@ Image, Page, Paragraph, + Section, Table, ) from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder @@ -292,3 +293,37 @@ def test_inline_elements_are_squeezed_when_table(): assert text2.text == "Paragraph2" assert table1.text == "Table1" assert text3.text == "Paragraph2 Hyperlink2 Hyperlink3" + + +def test_inline_elements_are_on_many_depths(): + ontology = Document( + children=[ + Page( + children=[ + Hyperlink(text="Hyperlink1"), + Paragraph(text="Paragraph1"), + Section( + children=[ + Section( + children=[ + Hyperlink(text="Hyperlink2"), + Hyperlink(text="Hyperlink3"), + ] + ), + Paragraph(text="Paragraph2"), + Hyperlink(text="Hyperlink4"), + ] + ), + ], + ) + ] + ) + unstructured_elements = ontology_to_unstructured_elements(ontology) + + assert len(unstructured_elements) == 6 + + page, text1, section1, section2, text2, text3 = unstructured_elements + + assert text1.text == "Hyperlink1 Paragraph1" + assert text2.text == "Hyperlink2 Hyperlink3" + assert text3.text == "Paragraph2 Hyperlink4" diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index b36289aead..7b6bf10a4b 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -98,8 +98,9 @@ def ontology_to_unstructured_elements( filename=filename, ) children += child - squeezed_children = squeeze_inline_elements(children) - elements_to_return += squeezed_children + + combined_children = combine_inline_elements(children) + elements_to_return += combined_children else: unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[ ontology_element.__class__.__name__ @@ -128,7 +129,25 @@ def ontology_to_unstructured_elements( return elements_to_return -def squeeze_inline_elements(elements: list[Element]) -> list[Element]: +def combine_inline_elements(elements: list[Element]) -> list[Element]: + """ + Combines consecutive inline elements into a single element. Inline elements + can be also combined with text elements. + + Combined elements contains multiple HTML tags together eg. + { + 'text': "Text from element 1 Text from element 2", + 'metadata': { + 'text_as_html': "

Text from element 1

Text from element 2" + } + } + + Args: + elements (list[Element]): A list of elements to be combined. + + Returns: + list[Element]: A list of combined elements. + """ result_elements = [] current_element = None @@ -138,10 +157,8 @@ def squeeze_inline_elements(elements: list[Element]) -> list[Element]: continue if can_unstructured_elements_be_merged(current_element, next_element): - current_element.text += " " + next_element.text # append here - current_element.metadata.text_as_html += ( - " " + next_element.metadata.text_as_html - ) # append here + current_element.text += " " + next_element.text + current_element.metadata.text_as_html += " " + next_element.metadata.text_as_html else: result_elements.append(current_element) current_element = next_element @@ -154,17 +171,22 @@ def squeeze_inline_elements(elements: list[Element]) -> list[Element]: def can_unstructured_elements_be_merged(current_element: Element, next_element: Element) -> bool: """ - Elements can be merged when (remember they can be already after some merging): - - Neither of them has children (only first HTML tag in text_as_html has to considered; - if they were merged already they didn't have children) - - We do not want to merge two text elements, only inline element with text so there - is no two text elements - - and all other elements are inline elements + Elements can be merged when: + - They are on the same level in the HTML tree + - Neither of them has children + - All elements are inline elements or maximum one text element is present: + - We do not want to merge two seperated paragraphs + - But we want to merge inline text with paragraph and vice versa """ + if current_element.metadata.category_depth != next_element.metadata.category_depth: + return False + current_html_tags = BeautifulSoup( current_element.metadata.text_as_html, "html.parser" - ).find_all() - next_html_tags = BeautifulSoup(next_element.metadata.text_as_html, "html.parser").find_all() + ).find_all(recursive=False) + next_html_tags = BeautifulSoup(next_element.metadata.text_as_html, "html.parser").find_all( + recursive=False + ) ontology_elements = [ parse_html_to_ontology_element(html_tag) @@ -190,6 +212,8 @@ def can_unstructured_elements_be_merged(current_element: Element, next_element: def is_text_element(ontology_element: OntologyElement) -> bool: + """Categories or classes that we want to combine with inline text""" + text_classes = [ NarrativeText, Quote, @@ -212,6 +236,8 @@ def is_text_element(ontology_element: OntologyElement) -> bool: def is_inline_element(ontology_element: OntologyElement) -> bool: + """Categories or classes that we want to combine with text elements""" + inline_classes = [Hyperlink] inline_categories = [ElementTypeEnum.specialized_text, ElementTypeEnum.annotation] From b5d1cc7b63d9bc96dfb090e65b05b4032d00b076 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Thu, 24 Oct 2024 20:10:49 +0200 Subject: [PATCH 04/16] Merge multiple text paragraphs on the same level in tree --- .../test_ontology_to_unstructured_parsing.py | 23 ++++++++----------- .../partition/html/transformations.py | 12 +--------- 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py index a176e1d8b7..785bb71ea5 100644 --- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py +++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py @@ -219,7 +219,7 @@ def test_inline_elements_are_squeezed(): assert text1.text == "Hyperlink1 Hyperlink2 Hyperlink3" -def test_text_elements_are_not_squeezed(): +def test_text_elements_are_squeezed(): ontology = Document( children=[ Page( @@ -231,11 +231,10 @@ def test_text_elements_are_not_squeezed(): ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) - assert len(unstructured_elements) == 3 + assert len(unstructured_elements) == 2 - page, text1, text2 = unstructured_elements - assert text1.text == "Paragraph1" - assert text2.text == "Paragraph2" + page, text1 = unstructured_elements + assert text1.text == "Paragraph1 Paragraph2" def test_inline_elements_are_squeezed_when_image(): @@ -255,12 +254,11 @@ def test_inline_elements_are_squeezed_when_image(): ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) - assert len(unstructured_elements) == 5 + assert len(unstructured_elements) == 4 - page, text1, image, text2, text3 = unstructured_elements + page, text1, image, text2 = unstructured_elements assert text1.text == "Paragraph1 Hyperlink1" - assert text2.text == "Hyperlink2 Hyperlink3 Paragraph2" - assert text3.text == "Paragraph3" + assert text2.text == "Hyperlink2 Hyperlink3 Paragraph2 Paragraph3" assert ' Date: Fri, 25 Oct 2024 11:36:03 +0200 Subject: [PATCH 05/16] Verification of inline text merged --- ...t_html_to_unstructured_and_back_parsing.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py index 06281c712e..84c9c91bfb 100644 --- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py +++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py @@ -8,9 +8,11 @@ Text, Title, ) +from unstructured.documents.ontology import Address, Paragraph from unstructured.partition.html.html_utils import indent_html from unstructured.partition.html.transformations import ( ontology_to_unstructured_elements, + parse_html_to_ontology, parse_html_to_ontology_element, unstructured_elements_to_ontology, ) @@ -528,3 +530,31 @@ def test_squeezed_elements_are_parsed_back(): ) ] _assert_elements_equal(unstructured_elements, expected_elements) + + +def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs(): + # language=HTML + base_html = """ +
+ About the same +
+ 1356 Hornor Avenue Oklahoma +
+ Some text +
+ """ + # Such HTML is transformed into Page: [Pargraph, Address, Paragraph] + # We would like it to be parsed to UnstructuredElements as [Page, NarrativeText] + + ontology = parse_html_to_ontology(base_html) + + p1, address, p2 = ontology.children + assert isinstance(p1, Paragraph) + assert isinstance(address, Address) + assert isinstance(p2, Paragraph) + + unstructured_elements = ontology_to_unstructured_elements(ontology) + + assert len(unstructured_elements) == 2 + assert isinstance(unstructured_elements[0], Text) + assert isinstance(unstructured_elements[1], NarrativeText) From cee1ad373787d84712316000e6c7df5d2cb4a684 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Fri, 25 Oct 2024 11:46:18 +0200 Subject: [PATCH 06/16] Add full example doc with current elements (not squeezed) --- .../html_files/example_full_doc.html | 1802 +++++++++++++++++ .../test_ontology_to_unstructured_parsing.py | 1 + .../example_full_doc.json | 1245 ++++++++++++ .../partition/html/transformations.py | 4 +- 4 files changed, 3050 insertions(+), 2 deletions(-) create mode 100644 test_unstructured/documents/html_files/example_full_doc.html create mode 100644 test_unstructured/documents/unstructured_json_output/example_full_doc.json diff --git a/test_unstructured/documents/html_files/example_full_doc.html b/test_unstructured/documents/html_files/example_full_doc.html new file mode 100644 index 0000000000..de5fb0b838 --- /dev/null +++ b/test_unstructured/documents/html_files/example_full_doc.html @@ -0,0 +1,1802 @@ + +
+
+ + Table of Contents + + + + https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + +
+

+ ENTERPRISE PRODUCTS PARTNERS L.P. +

+

+ NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS +

+

+ Note 6. Intangible Assets and Goodwill +

+

+ Identifiable Intangible Assets +

+

+ The following table summarizes our intangible assets by business segment at the dates indicated: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + June 30, 2023 + + December 31, 2022 +
+ + Gross Value + + Accumulated Amortization + + Carrying Value + + Gross Value + + Accumulated Amortization + + Carrying Value +
+ NGL Pipelines & Services: + + + + + + +
+ Customer relationship intangibles + + + $ + +

+ 449 +

+
+ + $ + +

+ (257) +

+
+ + $ + +

+ 192 +

+
+ + $ + +

+ 449 +

+
+ + $ + +

+ (249) +

+
+ + $ + +

+ 200 +

+
+ Contract-based intangibles + + 751 + + (95) + + 656 + + 749 + + (84) + + 665 +
+ Segment total + + 1,200 + + (352) + + 848 + + 1,198 + + (333) + + 865 +
+ Crude Oil Pipelines & Services: + + + + + + +
+ Customer relationship intangibles + + 2,195 + + (477) + + 1,718 + + 2,195 + + (431) + + 1,764 +
+ Contract-based intangibles + + 283 + + (273) + + 10 + + 283 + + (271) + + 12 +
+ Segment total + + 2,478 + + (750) + + 1,728 + + 2,478 + + (702) + + 1,776 +
+ Natural Gas Pipelines & Services: + + + + + + +
+ Customer relationship intangibles + + 1,350 + + (607) + + 743 + + 1,350 + + (588) + + 762 +
+ Contract-based intangibles + + 639 + + (201) + + 438 + + 639 + + (195) + + 444 +
+ Segment total + + 1,989 + + (808) + + 1,181 + + 1,989 + + (783) + + 1,206 +
+ Petrochemical & Refined Products Services: + + + + + + +
+ Customer relationship intangibles + + 181 + + (83) + + 98 + + 181 + + (80) + + 101 +
+ Contract-based intangibles + + 45 + + (29) + + 16 + + 45 + + (28) + + 17 +
+ Segment total + + 226 + + (112) + + 114 + + 226 + + (108) + + 118 +
+ Total intangible assets + + + $ + +

+ 5,893 +

+
+ + $ + +

+ (2,022) +

+
+ + $ + +

+ 3,871 +

+
+ + $ + +

+ 5,891 +

+
+ + $ + +

+ (1,926) +

+
+ + $ + +

+ 3,965 +

+
+

+ The following table presents the amortization expense of our intangible assets by business segment for the periods indicated: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + For the Three Months Ended June 30, + + For the Six Months Ended June 30, +
+ + 2023 + + 2022 + + 2023 + + 2022 +
+ NGL Pipelines & Services + + + $ + +

+ 10 +

+
+ + $ + +

+ 9 +

+
+ + $ + +

+ 19 +

+
+ + $ + +

+ 17 +

+
+ Crude Oil Pipelines & Services + + 25 + + 21 + + 48 + + 41 +
+ Natural Gas Pipelines & Services + + 13 + + 14 + + 25 + + 25 +
+ Petrochemical & Refined Products Services + + 2 + + 1 + + 4 + + 3 +
+ Total + + + $ + +

+ 50 +

+
+ + $ + +

+ 45 +

+
+ + $ + +

+ 96 +

+
+ + $ + +

+ 86 +

+
+

+ The following table presents our forecast of amortization expense associated with existing intangible assets for the periods indicated: +

+ + + + + + + + + + + + + + + + + + + + + +
+ + Remainder of 2023 + + 2024 + + 2025 + + 2026 + + 2027 +
+ + $ + + + 107 + + 222 + + 230 + + 237 + + 235 +
+

+ Goodwill +

+

+ Goodwill represents the excess of the purchase price of an acquired business over the amounts assigned to assets acquired and liabilities assumed in the transaction. There has been no change in our goodwill amounts since those reported in our 2022 Form 10-K. +

+ + 13 + + +
+
+
+ + 11/7/23, 2:38 PM + + + sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + +
+

+ ENTERPRISE PRODUCTS PARTNERS L.P. +

+

+ NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS +

+

+ Note 7. Debt Obligations +

+

+ The following table presents our consolidated debt obligations (arranged by company and maturity date) at the dates indicated: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + June 30, 2023 + + December 31, 2022 +
+ EPO senior debt obligations: + + +
+ Commercial Paper Notes, variable-rates + + + $ + +

+ 355 +

+
+ + $ + +

+ 495 +

+
+ Senior Notes HH, 3.35% fixed-rate, due March 2023 + + - + + 1,250 +
+ Senior Notes JJ, 3.90% fixed-rate, due February 2024 + + 850 + + 850 +
+

+ March 2023 $1.5 Billion 364-Day Revolving Credit Agreement, variable-rate, due March 2024 +

+ + (1) + +
+ - + + - +
+ Senior Notes MM, 3.75% fixed-rate, due February 2025 + + 1,150 + + 1,150 +
+ Senior Notes FFF, 5.05% fixed-rate, due January 2026 + + 750 + + - +
+ Senior Notes PP, 3.70% fixed-rate, due February 2026 + + 875 + + 875 +
+ Senior Notes SS, 3.95% fixed-rate, due February 2027 + + 575 + + 575 +
+

+ March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement, variable-rate, due March 2028 +

+ + (2) + +
+ - + + - +
+ Senior Notes WW, 4.15% fixed-rate, due October 2028 + + 1,000 + + 1,000 +
+ Senior Notes YY, 3.125% fixed-rate, due July 2029 + + 1,250 + + 1,250 +
+ Senior Notes AAA, 2.80% fixed-rate, due January 2030 + + 1,250 + + 1,250 +
+ Senior Notes GGG, 5.35% fixed-rate, due January 2033 + + 1,000 + + - +
+ Senior Notes D, 6.875% fixed-rate, due March 2033 + + 500 + + 500 +
+ Senior Notes H, 6.65% fixed-rate, due October 2034 + + 350 + + 350 +
+ Senior Notes J, 5.75% fixed-rate, due March 2035 + + 250 + + 250 +
+ Senior Notes W, 7.95% fixed-rate, due April 2038 + + 400 + + 400 +
+ Senior Notes R, 6.125% fixed-rate, due October 2039 + + 600 + + 600 +
+ Senior Notes Z, 6.45% fixed-rate, due September 2040 + + 600 + + 600 +
+ Senior Notes BB, 5.95% fixed-rate, due February 2041 + + 750 + + 750 +
+ Senior Notes DD, 5.70% fixed-rate, due February 2042 + + 600 + + 600 +
+ Senior Notes EE, 4.85% fixed-rate, due August 2042 + + 750 + + 750 +
+ Senior Notes GG, 4.45% fixed-rate, due February 2043 + + 1,100 + + 1,100 +
+ Senior Notes II, 4.85% fixed-rate, due March 2044 + + 1,400 + + 1,400 +
+ Senior Notes KK, 5.10% fixed-rate, due February 2045 + + 1,150 + + 1,150 +
+ Senior Notes QQ, 4.09% fixed-rate, due May 2046 + + 975 + + 975 +
+ Senior Notes UU, 4.25% fixed-rate, due February 2048 + + 1,250 + + 1,250 +
+ Senior Notes XX, 4.80% fixed-rate, due February 2049 + + 1,250 + + 1,250 +
+ Senior Notes ZZ, 4.20% fixed-rate, due January 2051 + + 1,250 + + 1,250 +
+ Senior Notes BB, 3.70% fixed-rate, due January 2051 + + 1,000 + + 1,000 +
+ Senior Notes DDD, 3.30% fixed-rate, due February 2052 + + 1,000 + + 1,000 +
+ Senior Notes EEE, 3.00% fixed-rate, due February 2053 + + 1,000 + + 1,000 +
+ Senior Notes NN, 4.95% fixed-rate, due October 2054 + + 400 + + 400 +
+ Senior Notes CCC, 3.95% fixed-rate, due January 2060 + + 1,000 + + 1,000 +
+ Total principal amount of senior debt obligations + + 26,630 + + 26,270 +
+

+ EPO Junior Subordinated Notes C, variable-rate, due June 2067 +

+ + (3)(7) + +
+ 232 + + 232 +
+

+ EPO Junior Subordinated Notes D, variable-rate, due August 2077 +

+ + (4)(7) + +
+ 350 + + 350 +
+

+ EPO Junior Subordinated Notes E, fixed/variable-rate, due August 2077 +

+ + (5)(7) + +
+ 1,000 + + 1,000 +
+

+ EPO Junior Subordinated Notes F, fixed/variable-rate, due February 2078 +

+ + (6)(7) + +
+ 700 + + 700 +
+

+ TEPPCO Junior Subordinated Notes, variable-rate, due June 2067 +

+ + (3)(7) + +
+ 14 + + 14 +
+ Total principal amount of senior and junior debt obligations + + 28,926 + + 28,566 +
+ Other, non-principal amounts + + +
+ Less current maturities of debt + + (279) + + (271) +
+ Total long-term debt + + (1,204) + + (1,744) +
+ + + $ + +

+ 27,443 +

+
+ + $ + +

+ 26,551 +

+
+
+ (1) Under the terms of the agreement, EPO may borrow up to $1.5 billion (which may be increased by up to $200 million to $1.7 billion at EPO’s election provided certain conditions are met). +
+
+ (2) Under the terms of the agreement, EPO may borrow up to $2.7 billion (which may be increased by up to $500 million to $3.2 billion at EPO’s election provided certain conditions are met). +
+
+ (3) Variable rate is reset quarterly and based on 3-month London Interbank Offered Rate (“LIBOR”) plus 2.778%. +
+
+ (4) Variable rate is reset quarterly and based on 3-month LIBOR plus 2.986%. +
+
+ (5) Fixed rate of 5.250% through August 15, 2027; thereafter, a variable rate reset quarterly and based on 3-month LIBOR plus 3.033%. +
+
+ (6) Fixed rate of 3.575% through February 14, 2028; thereafter, a variable rate reset quarterly and based on 3-month LIBOR plus 2.57%. +
+
+ (7) +
+ + https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + + + 19/93 + +
+
+
+ + + sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + +
+
+ + (7) + +

+ See discussion below in “Variable Interest Rates” regarding the LIBOR replacement and LIBOR replacement rate. +

+
+

+ References to “TEPPCO” mean TEPPCO Partners, L.P. prior to its merger with one of our wholly owned subsidiaries in October 2009. +

+ + 14 + + +
+
+
+ + Table of Contents + + + 11/7/23, 2:38 PM + + + https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + +
+

+ ENTERPRISE PRODUCTS PARTNERS L.P. +

+

+ NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS +

+

+ Variable Interest Rates +

+

+ The following table presents the range of interest rates and weighted-average interest rates paid on our consolidated variable-rate debt during the six months ended June 30, 2023: +

+ + + + + + + + + + + + + + + + + + + + + + + + +
+ Range of Interest Rates Paid + + Weighted-Average Interest Rate Paid +
+ Commercial Paper Notes + + 4.59% to 5.34% + + 5.17% +
+ EPO Junior Subordinated Notes C and TEPPCO Junior Subordinated Notes + + 7.54% to 8.27% + + 7.76% +
+ EPO Junior Subordinated Notes D + + 7.63% to 8.30% + + 7.91% +
+

+ Amounts borrowed under EPO’s March 2023 $1.5 Billion 364-Day Revolving Credit Agreement and March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement bear interest, at EPO’s election, equal to: (i) the Secured Overnight Financing Rate (“SOFR”), plus an additional variable spread; or (ii) an alternate base rate, which is the greatest of (a) the Prime Rate in effect on such day, (b) the Federal Funds Effective Rate in effect on such day plus 0.5%, or (c) Adjusted Term SOFR, for an interest period of one month in effect on such day plus 1%, and a variable spread. The applicable spreads are determined based on EPO’s debt ratings. +

+

+ In July 2017, the Financial Conduct Authority in the U.K. announced a desire to phase out LIBOR as a benchmark by the end of June 2023. In December 2022, the Board of Governors of the Federal Reserve System approved a final rule to implement the Adjustable Interest Rate (LIBOR) Act, which established benchmark replacements for certain contracts that reference various tenors of LIBOR and do not provide an alternative rate or would result in a rate that is expressed in terms of the last known value of LIBOR (typically referred to as a “frozen LIBOR” provision). The final rule became effective during the first quarter of 2023. As a result of the LIBOR Act, our Junior Subordinated Notes C and D and the TEPPCO Junior Subordinated Notes, which were subject to a variable rate (as defined by the applicable agreement) based on three-month LIBOR (in each case, a “LIBOR Rate”) through June 30, 2023, replaced the applicable LIBOR Rate with a variable rate based on the three-month CME Term SOFR (“SOFR Rate”) as administered by the CME Group Benchmark Administration, Ltd. plus a 0.26161% tenor spread adjustment beginning on July 1, 2023. Additionally, our Junior Subordinated Notes D and F, which would have been subject to a variable rate (as defined by the applicable agreement) based on three-month LIBOR beginning in August 2027 and February 2028, respectively, will replace the applicable LIBOR Rate with the three-month SOFR rate plus a 0.26161% tenor spread adjustment. The foregoing tenor spread adjustment will be in addition to the applicable spread under the terms of each series of Junior Subordinated Notes. We do not expect the transition from LIBOR to have a material financial impact on us. +

+

+ Scheduled Maturities of Debt +

+

+ The following table presents the scheduled maturities of principal amounts of EPO’s consolidated debt obligations at June 30, 2023 for the next five years, and in total thereafter: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Scheduled Maturities of Debt +
+ Total + + Remainder of 2023 + + 2024 + + 2025 + + 2026 + + 2027 + + Thereafter +
+ Commercial Paper Notes + + $ 355 + + $ 355 + + $ — + + $ — + + $ — + + $ — + + $ — +
+ Senior Notes + + $ 26,275 + + $ — + + $ 850 + + $ 1,150 + + $ 1,625 + + $ 575 + + $ 22,075 +
+ Junior Subordinated Notes + + $ 2,296 + + $ — + + $ — + + $ — + + $ — + + $ — + + $ 2,296 +
+ Total + + $ 28,926 + + $ 355 + + $ 850 + + $ 1,150 + + $ 1,625 + + $ 575 + + $ 24,371 +
+ + 15 + + + https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + +
+
+
+ + Table of Contents + + + sec.gov/Archives/edgar/data/1061219/000106121923000017/ + +
+

+ ENTERPRISE PRODUCTS PARTNERS L.P. +

+

+ NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS +

+
+

+ March 2023 $1.5 Billion 364-Day Revolving Credit Agreement +

+

+ In March 2023, EPO entered into a new 364-Day Revolving Credit Agreement (the “March 2023 $1.5 Billion 364-Day Revolving Credit Agreement”) that replaced its September 2022 364-Day Revolving Credit Agreement. There were no principal amounts outstanding under the September 2022 364-Day Revolving Credit Agreement when it was replaced by the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement. As of June 30, 2023, there were no principal amounts outstanding under the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement. +

+

+ Under the terms of the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement, EPO may borrow up to $1.5 billion (which may be increased by up to $200 million to $1.7 billion at EPO’s election, provided certain conditions are met) at a variable interest rate for a term of up to 364 days, subject to the terms and conditions set forth therein. The March 2023 $1.5 Billion 364-Day Revolving Credit Agreement matures in March 2024. To the extent that principal amounts are outstanding at the maturity date, EPO may elect to have the entire principal balance then outstanding continued as non-revolving term loans for a period of one additional year, payable in March 2025. Borrowings under the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement may be used for working capital, capital expenditures, acquisitions and general company purposes. +

+

+ The March 2023 $1.5 Billion 364-Day Revolving Credit Agreement contains customary representations, warranties, covenants (affirmative and negative) and events of default, the occurrence of which would permit the lenders to accelerate the maturity date of any amounts borrowed under this credit agreement. The March 2023 $1.5 Billion 364-Day Revolving Credit Agreement also restricts EPO’s ability to pay cash distributions to the Partnership, if an event of default (as defined in the credit agreement) has occurred and is continuing at the time such distribution is scheduled to be paid or would result therefrom. +

+

+ EPO’s obligations under the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement are not secured by any collateral; however, they are guaranteed by the Partnership. +

+
+
+

+ March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement +

+

+ In March 2023, EPO entered into a new revolving credit agreement that matures in March 2028 (the “March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement”). The March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement replaced EPO’s prior multi-year revolving credit agreement that was scheduled to mature in September 2026. There were no principal amounts outstanding under the prior multi-year revolving credit agreement when it was replaced by the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement. As of June 30, 2023, there were no principal amounts outstanding under the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement. +

+

+ Under the terms of the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement, EPO may borrow up to $2.7 billion (which may be increased by up to $500 million to $3.2 billion at EPO’s election, provided certain conditions are met) at a variable interest rate for a term of five years, subject to the terms and conditions set forth therein. The March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement matures in March 2028, although the maturity date may be extended at EPO’s request (up to two requests) for a one-year extension of the maturity date by delivering a request prior to the maturity date and with the consent of required lenders as set forth under the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement. Borrowings under the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement may be used for working capital, capital expenditures, acquisitions and general company purposes. +

+

+ The March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement contains customary representations, warranties, covenants (affirmative and negative) and events of default, the occurrence of which would permit the lenders to accelerate the maturity date of any amounts borrowed under this credit agreement. The March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement also restricts EPO’s ability to pay cash distributions to the Partnership, if an event of default (as defined in the credit agreement) has occurred and is continuing at the time such distribution is scheduled to be paid or would result therefrom. +

+

+ EPO’s obligations under the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement are not secured by any collateral; however, they are guaranteed by the Partnership. +

+
+
+ + 16 + + + https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + +
+
+
+
+ + Table of Contents + + + sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + +
+

+ ENTERPRISE PRODUCTS PARTNERS L.P. +

+

+ NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS +

+

+ Issuance of $1.75 Billion of Senior Notes in January 2023 +

+

+ In January 2023, EPO issued $1.75 billion aggregate principal amount of senior notes comprised of (i) $750 million principal amount of senior notes due January 2026 (“Senior Notes FFF”) and (ii) $1.0 billion principal amount of senior notes due January 2033 (“Senior Notes GGG”). Net proceeds from this offering were used by EPO for general company purposes, including for growth capital investments, and the repayment of debt (including the repayment of all of our $1.25 billion principal amount of 3.35% Senior Notes HH at their maturity in March 2023 and amounts outstanding under our commercial paper program). +

+

+ Senior Notes FFF were issued at 99.893% of their principal amount and have a fixed-rate interest rate of 5.05% per year. Senior Notes GGG were issued at 99.803% of their principal amount and have a fixed-rate interest rate of 5.35% per year. The Partnership guaranteed these senior notes through an unconditional guarantee on an unsecured and unsubordinated basis. +

+

+ Letters of Credit +

+

+ At June 30, 2023, EPO had $110 million of letters of credit outstanding primarily related to our commodity hedging activities. +

+

+ Lender Financial Covenants +

+

+ We were in compliance with the financial covenants of our consolidated debt agreements at June 30, 2023. +

+

+ Parent-Subsidiary Guarantor Relationships +

+

+ The Partnership acts as guarantor of the consolidated debt obligations of EPO, with the exception of the remaining debt obligations of TEPPCO. If EPO were to default on any of its guaranteed debt, the Partnership would be responsible for full and unconditional repayment of such obligations. +

+

+ Note 8. Capital Accounts +

+

+ Common Limited Partner Interests +

+

+ The following table summarizes changes in the number of our common units outstanding since December 31, 2022: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Common units outstanding at December 31, 2022 + +
+ Common unit repurchases under 2019 Buyback Program + + (682,589) +
+ Common units issued in connection with the vesting of phantom unit awards, net + + 4,364,301 +
+ Other + + 20,892 +
+ Common units outstanding at March 31, 2023 + + 2,174,508,951 +
+ Common unit repurchases under 2019 Buyback Program + + (2,910,121) +
+ Common units issued in connection with the vesting of phantom unit awards, net + + 153,502 +
+ Common units outstanding at June 30, 2023 + + 2,171,752,332 +
+

+ Registration Statements +

+

+ We have a universal shelf registration statement on file with the SEC which allows the Partnership and EPO (each on a standalone basis) to issue an unlimited amount of equity and debt securities, respectively. +

+

+ In addition, the Partnership has a registration statement on file with the SEC covering the issuance of up to $2.5 billion of its common units in amounts, at prices and on terms based on market conditions and other factors at the time of such offerings (referred to as the Partnership’s at-the-market (“ATM”) program). The Partnership did not issue any common units under its ATM program during the six months ended June 30, 2023. The Partnership’s capacity to issue additional common units under the ATM program remains at $2.5 billion as of June 30, 2023. +

+

+ We may issue additional equity and debt securities to assist us in meeting our future liquidity requirements, including those related to capital investments. +

+
+ + 17 + + + https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm + + + 23/93 + +
+
+ diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py index 785bb71ea5..451f1e9d73 100644 --- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py +++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py @@ -180,6 +180,7 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): ("html_file_path", "json_file_path"), [ ("html_files/example.html", "unstructured_json_output/example.json"), + ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), ( "html_files/example_with_inline_fields.html", "unstructured_json_output/example_with_inline_fields.json", diff --git a/test_unstructured/documents/unstructured_json_output/example_full_doc.json b/test_unstructured/documents/unstructured_json_output/example_full_doc.json new file mode 100644 index 0000000000..90350a4ff2 --- /dev/null +++ b/test_unstructured/documents/unstructured_json_output/example_full_doc.json @@ -0,0 +1,1245 @@ +[ + { + "element_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "metadata": { + "category_depth": 0, + "page_number": 1, + "parent_id": "a77ae2bba17845d6bcce44f6aebadfb5", + "text_as_html": "
" + }, + "text": "", + "type": "UncategorizedText" + }, + { + "element_id": "d0a9edd181f542f0ba695489f14c4b75", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "
" + }, + "text": "", + "type": "UncategorizedText" + }, + { + "element_id": "2a8866a868414163afee2ef24574fc9b", + "metadata": { + "category_depth": 2, + "page_number": 1, + "parent_id": "d0a9edd181f542f0ba695489f14c4b75", + "text_as_html": "Table of Contents " + }, + "text": "Table of Contents", + "type": "UncategorizedText" + }, + { + "element_id": "338375ec733a47f2a9544a74cfb4b11c", + "metadata": { + "category_depth": 2, + "page_number": 1, + "parent_id": "d0a9edd181f542f0ba695489f14c4b75", + "text_as_html": "" + }, + "text": "11/7/23, 2:38 PM", + "type": "UncategorizedText" + }, + { + "element_id": "770e80f0a903490892ee9ff1f1f76718", + "metadata": { + "category_depth": 2, + "page_number": 1, + "parent_id": "d0a9edd181f542f0ba695489f14c4b75", + "text_as_html": "https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm " + }, + "text": "https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm", + "type": "UncategorizedText" + }, + { + "element_id": "2bca4006451a405c87ebaf6eb9ff7bd9", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

ENTERPRISE PRODUCTS PARTNERS L.P.

" + }, + "text": "ENTERPRISE PRODUCTS PARTNERS L.P.", + "type": "Title" + }, + { + "element_id": "8da7d91b8f094acfb4caef69d96d17b9", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS

" + }, + "text": "NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS", + "type": "Title" + }, + { + "element_id": "1a8af2164abc4fed820445c7d7a1652e", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

Note 6. Intangible Assets and Goodwill

" + }, + "text": "Note 6. Intangible Assets and Goodwill", + "type": "Title" + }, + { + "element_id": "c9dcb08578704efeb997f7d3dd659a61", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

Identifiable Intangible Assets

" + }, + "text": "Identifiable Intangible Assets", + "type": "Title" + }, + { + "element_id": "2189d06e7f1f4d73b93c3f1845486b52", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

The following table summarizes our intangible assets by business segment at the dates indicated:

" + }, + "text": "The following table summarizes our intangible assets by business segment at the dates indicated:", + "type": "NarrativeText" + }, + { + "element_id": "d0f9bd2adefa42e18357960d582588bd", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "
June 30, 2023 December 31, 2022
Gross Value Accumulated Amortization Carrying Value Gross Value Accumulated Amortization Carrying Value
NGL Pipelines & Services:
Customer relationship intangibles $

449

$

(257)

$

192

$

449

$

(249)

$

200

Contract-based intangibles 751 (95) 656 749 (84) 665
Segment total 1,200 (352) 848 1,198 (333) 865
Crude Oil Pipelines & Services:
Customer relationship intangibles 2,195 (477) 1,718 2,195 (431) 1,764
Contract-based intangibles 283 (273) 10 283 (271) 12
Segment total 2,478 (750) 1,728 2,478 (702) 1,776
Natural Gas Pipelines & Services:
Customer relationship intangibles 1,350 (607) 743 1,350 (588) 762
Contract-based intangibles 639 (201) 438 639 (195) 444
Segment total 1,989 (808) 1,181 1,989 (783) 1,206
Petrochemical & Refined Products Services:
Customer relationship intangibles 181 (83) 98 181 (80) 101
Contract-based intangibles 45 (29) 16 45 (28) 17
Segment total 226 (112) 114 226 (108) 118
Total intangible assets $

5,893

$

(2,022)

$

3,871

$

5,891

$

(1,926)

$

3,965

" + }, + "text": "June 30, 2023 December 31, 2022 Gross Value Accumulated Amortization Carrying Value Gross Value Accumulated Amortization Carrying Value NGL Pipelines & Services: Customer relationship intangibles $ 449 $ (257) $ 192 $ 449 $ (249) $ 200 Contract-based intangibles 751 (95) 656 749 (84) 665 Segment total 1,200 (352) 848 1,198 (333) 865 Crude Oil Pipelines & Services: Customer relationship intangibles 2,195 (477) 1,718 2,195 (431) 1,764 Contract-based intangibles 283 (273) 10 283 (271) 12 Segment total 2,478 (750) 1,728 2,478 (702) 1,776 Natural Gas Pipelines & Services: Customer relationship intangibles 1,350 (607) 743 1,350 (588) 762 Contract-based intangibles 639 (201) 438 639 (195) 444 Segment total 1,989 (808) 1,181 1,989 (783) 1,206 Petrochemical & Refined Products Services: Customer relationship intangibles 181 (83) 98 181 (80) 101 Contract-based intangibles 45 (29) 16 45 (28) 17 Segment total 226 (112) 114 226 (108) 118 Total intangible assets $ 5,893 $ (2,022) $ 3,871 $ 5,891 $ (1,926) $ 3,965", + "type": "Table" + }, + { + "element_id": "48d7d6313bc141c6945f7f5eee588db8", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

The following table presents the amortization expense of our intangible assets by business segment for the periods indicated:

" + }, + "text": "The following table presents the amortization expense of our intangible assets by business segment for the periods indicated:", + "type": "NarrativeText" + }, + { + "element_id": "d3dbbac8b8834b109421da2cbeda1399", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "
For the Three Months Ended June 30, For the Six Months Ended June 30,
2023 2022 2023 2022
NGL Pipelines & Services $

10

$

9

$

19

$

17

Crude Oil Pipelines & Services 25 21 48 41
Natural Gas Pipelines & Services 13 14 25 25
Petrochemical & Refined Products Services 2 1 4 3
Total $

50

$

45

$

96

$

86

" + }, + "text": "For the Three Months Ended June 30, For the Six Months Ended June 30, 2023 2022 2023 2022 NGL Pipelines & Services $ 10 $ 9 $ 19 $ 17 Crude Oil Pipelines & Services 25 21 48 41 Natural Gas Pipelines & Services 13 14 25 25 Petrochemical & Refined Products Services 2 1 4 3 Total $ 50 $ 45 $ 96 $ 86", + "type": "Table" + }, + { + "element_id": "42c2678b8b5d4977a27e8ba6a8b2224f", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

The following table presents our forecast of amortization expense associated with existing intangible assets for the periods indicated:

" + }, + "text": "The following table presents our forecast of amortization expense associated with existing intangible assets for the periods indicated:", + "type": "NarrativeText" + }, + { + "element_id": "fb8c1e14c5ca44caa7359a4ace457701", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "
Remainder of 2023 2024 2025 2026 2027
$ 107 222 230 237 235
" + }, + "text": "Remainder of 2023 2024 2025 2026 2027 $ 107 222 230 237 235", + "type": "Table" + }, + { + "element_id": "a5c180f735a846929e7d496ac5d49603", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

Goodwill

" + }, + "text": "Goodwill", + "type": "Title" + }, + { + "element_id": "f1d9b3bb0fc04cae8e419a394f8c0e45", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "

Goodwill represents the excess of the purchase price of an acquired business over the amounts assigned to assets acquired and liabilities assumed in the transaction. There has been no change in our goodwill amounts since those reported in our 2022 Form 10-K.

" + }, + "text": "Goodwill represents the excess of the purchase price of an acquired business over the amounts assigned to assets acquired and liabilities assumed in the transaction. There has been no change in our goodwill amounts since those reported in our 2022 Form 10-K.", + "type": "NarrativeText" + }, + { + "element_id": "9cc42bfd543943fe97a179576d1f0f09", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "13 " + }, + "text": "13", + "type": "PageNumber" + }, + { + "element_id": "e29d700b4fd046dd82dcd5e9a2e1f5ab", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", + "text_as_html": "