Skip to content

Commit

Permalink
Fix when parent id is None for first element in v2 notation (#3752)
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy authored Oct 25, 2024
1 parent 9835fe4 commit 2417f8e
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 2 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
## 0.16.3-dev1

### Enhancements

### Features

### Fixes

* **V2 elements can be parsed when the first element has no parent ID**


## 0.16.2

### Enhancements
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text
from unstructured.documents.ontology import Document, Page, Paragraph
from unstructured.partition.html.transformations import unstructured_elements_to_ontology


def test_when_first_elements_does_not_have_id():
    """Convert elements whose first item is given no ``parent_id``.

    The page element is built without a ``parent_id`` while the paragraph
    points at the page through ``parent_id="1"``. The conversion is expected
    to return a ``Document`` holding one ``Page`` that holds one ``Paragraph``.
    """
    # First element: no parent_id is passed to its metadata on purpose.
    page_element = Text(
        element_id="1",
        text="",
        metadata=ElementMetadata(text_as_html='<div class="Page" id="1"/>'),
    )
    # Second element: explicitly parented to the page above.
    paragraph_element = NarrativeText(
        element_id="2",
        text="Example text",
        metadata=ElementMetadata(
            text_as_html='<p class="Paragraph" id="2"> Example text </p>',
            parent_id="1",
        ),
    )

    document = unstructured_elements_to_ontology([page_element, paragraph_element])

    # Root of the ontology tree.
    assert isinstance(document, Document)
    assert len(document.children) == 1

    # Single page directly under the document.
    page = document.children[0]
    assert isinstance(page, Page)
    assert len(page.children) == 1

    # Single paragraph under the page, carrying the original text.
    paragraph = page.children[0]
    assert isinstance(paragraph, Paragraph)
    assert paragraph.text == "Example text"
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.2" # pragma: no cover
__version__ = "0.16.3-dev1" # pragma: no cover
6 changes: 5 additions & 1 deletion unstructured/documents/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,11 @@ def __init__(self, **kwargs):
if self.html_tag_name == "":
self.html_tag_name = self.allowed_tags[0]
if "id" not in self.additional_attributes:
self.additional_attributes["id"] = str(uuid.uuid4()).replace("-", "")
self.additional_attributes["id"] = self.generate_unique_id()

@staticmethod
def generate_unique_id() -> str:
return str(uuid.uuid4()).replace("-", "")

def to_html(self, add_children=True) -> str:
additional_attrs = copy(self.additional_attributes)
Expand Down
5 changes: 5 additions & 0 deletions unstructured/partition/html/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
id_to_element_mapping = OrderedDict()

document_element_id = unstructured_elements[0].metadata.parent_id

if document_element_id is None:
document_element_id = OntologyElement.generate_unique_id()
unstructured_elements[0].metadata.parent_id = document_element_id

id_to_element_mapping[document_element_id] = Document(
additional_attributes={"id": document_element_id}
)
Expand Down

0 comments on commit 2417f8e

Please sign in to comment.