diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b32409325..b9e042effa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## 0.16.3-dev1 + +### Enhancements + +### Features + +### Fixes + +* **V2 elements without first parent ID can be parsed** `unstructured_elements_to_ontology` now generates a unique document ID when the first element has no `parent_id`. + + ## 0.16.2 ### Enhancements diff --git a/test_unstructured/partition/html/test_unstructured_elements_to_ontology_parsing.py b/test_unstructured/partition/html/test_unstructured_elements_to_ontology_parsing.py new file mode 100644 index 0000000000..d6bd4fa562 --- /dev/null +++ b/test_unstructured/partition/html/test_unstructured_elements_to_ontology_parsing.py @@ -0,0 +1,33 @@ +from unstructured.documents.elements import ElementMetadata, NarrativeText, Text +from unstructured.documents.ontology import Document, Page, Paragraph +from unstructured.partition.html.transformations import unstructured_elements_to_ontology + + +def test_when_first_elements_does_not_have_id(): + unstructured_elements = [ + Text( + element_id="1", + text="", + metadata=ElementMetadata(text_as_html='
'), + ), + NarrativeText( + element_id="2", + text="Example text", + metadata=ElementMetadata( + text_as_html='

Example text

', parent_id="1" + ), + ), + ] + ontology = unstructured_elements_to_ontology(unstructured_elements) + + assert isinstance(ontology, Document) + + assert len(ontology.children) == 1 + page = ontology.children[0] + + assert isinstance(page, Page) + assert len(page.children) == 1 + paragraph = page.children[0] + + assert isinstance(paragraph, Paragraph) + assert paragraph.text == "Example text" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d750d3b2a7..3eb8a850e8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.2" # pragma: no cover +__version__ = "0.16.3-dev1" # pragma: no cover diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index 853a0a5224..152edc8922 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -67,7 +67,11 @@ def __init__(self, **kwargs): if self.html_tag_name == "": self.html_tag_name = self.allowed_tags[0] if "id" not in self.additional_attributes: - self.additional_attributes["id"] = str(uuid.uuid4()).replace("-", "") + self.additional_attributes["id"] = self.generate_unique_id() + + @staticmethod + def generate_unique_id() -> str: + return str(uuid.uuid4()).replace("-", "") def to_html(self, add_children=True) -> str: additional_attrs = copy(self.additional_attributes) diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index aa1c53555c..421585a2d5 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -135,6 +135,11 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) id_to_element_mapping = OrderedDict() document_element_id = unstructured_elements[0].metadata.parent_id + + if document_element_id is None: + document_element_id = OntologyElement.generate_unique_id() + unstructured_elements[0].metadata.parent_id = document_element_id + 
id_to_element_mapping[document_element_id] = Document( additional_attributes={"id": document_element_id} )