diff --git a/.codecov.yml b/.codecov.yml
index 8e5944c..ca544c8 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -3,4 +3,4 @@ coverage:
patch:
default:
# Note: also update Taskfile.yml when changing the target coverage.
- target: 60%
+ target: 90%
diff --git a/Taskfile.yml b/Taskfile.yml
index 923ee80..00d259f 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -34,14 +34,14 @@ tasks:
cmds:
# Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
# Note: also update .codecov.yml when changing the target coverage.
- - poetry run pytest -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/
+ - poetry run pytest -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/
unit-watch:
desc: Run unit tests and check code coverage immediately when files change.
cmds:
# Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
# Note: also update .codecov.yml when changing the target coverage.
- - poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/
+ - poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/
###########################
diff --git a/sec_parser/processing_engine/html_tag.py b/sec_parser/processing_engine/html_tag.py
index e044d3d..a17444d 100644
--- a/sec_parser/processing_engine/html_tag.py
+++ b/sec_parser/processing_engine/html_tag.py
@@ -6,7 +6,6 @@
from sec_parser.exceptions import SecParserValueError
from sec_parser.utils.bs4_.contains_tag import contains_tag
-from sec_parser.utils.bs4_.get_first_deepest_tag import get_first_deepest_tag
from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics
@@ -93,23 +92,6 @@ def is_unary_tree(self) -> bool:
self._is_unary_tree = is_unary_tree(self._bs4)
return self._is_unary_tree
- def get_first_deepest_tag(self) -> HtmlTag | None:
- """
- `get_first_deepest_tag` returns the first deepest tag within the current tag.
-
- For example, if we have the following HTML structure:
-
- and we pass the 'div' tag to this function, it will return the 'p' tag,
- which is the first deepest tag within the 'html' tag.
- """
- result: HtmlTag | None = None
- if self._first_deepest_tag is NotSet:
- tag = get_first_deepest_tag(self._bs4)
- if tag is not None:
- result = HtmlTag(tag)
- self._first_deepest_tag = result
- return result
-
def get_text_styles_metrics(self) -> dict[tuple[str, str], float]:
"""
Compute the percentage distribution of various CSS styles within the text
diff --git a/sec_parser/processing_steps/text_parsing_step.py b/sec_parser/processing_steps/text_parsing_step.py
index 5e76ed8..066b203 100644
--- a/sec_parser/processing_steps/text_parsing_step.py
+++ b/sec_parser/processing_steps/text_parsing_step.py
@@ -34,13 +34,6 @@ def __init__(
)
self._unique_markers_by_order: list[str] = []
- def _found_marker(self, symbol: str) -> None:
- if symbol not in self._unique_markers_by_order:
- # Ordered set:
- self._unique_markers_by_order = list(
- dict.fromkeys([*self._unique_markers_by_order, symbol]).keys(),
- )
-
def _process_element(
self,
element: AbstractSemanticElement,
diff --git a/sec_parser/processing_steps/title_parsing_step.py b/sec_parser/processing_steps/title_parsing_step.py
index 2feb518..34cd603 100644
--- a/sec_parser/processing_steps/title_parsing_step.py
+++ b/sec_parser/processing_steps/title_parsing_step.py
@@ -20,10 +20,16 @@
class TitleParsingStep(AbstractElementwiseProcessingStep):
"""
- TitleParsingStep class for transforming elements into TitleElement instances.
+ TitleParsingStep converts elements into TitleElement instances by
+ scanning a list of semantic elements and replacing suitable candidates.
- This step scans through a list of semantic elements and changes it,
- primarily by replacing suitable candidates with TitleElement instances.
+ The "_unique_styles_by_order" tuple:
+ ====================================
+ - Represents an ordered set of unique styles found in the document.
+ - Preserves the order of insertion, which determines the hierarchical
+ level of each style.
+ - Assumes that earlier "highlight" styles correspond to higher level paragraph
+ or section headings.
"""
def __init__(
@@ -36,17 +42,11 @@ def __init__(
types_to_exclude=types_to_exclude,
)
- # _unique_styles_by_order track unique styles in the document.
- # Stored in a tuple as an ordered set, preserving insertion order.
- # This order is used to determine a style's level.
- # It is based on the observation that "highlight" styles that appear first
- # typically mark higher level paragraph/section headings.
- # _unique_styles_by_order is effectively used as an ordered set:
self._unique_styles_by_order: tuple[TextStyle, ...] = ()
def _add_unique_style(self, style: TextStyle) -> None:
+ """Add a new unique style if not already present."""
if style not in self._unique_styles_by_order:
- # _styles is effectively updated as an ordered set:
self._unique_styles_by_order = tuple(
dict.fromkeys([*self._unique_styles_by_order, style]).keys(),
)
@@ -56,8 +56,12 @@ def _process_element(
element: AbstractSemanticElement,
_: ElementwiseProcessingContext,
) -> AbstractSemanticElement:
+ """Process each element and convert to TitleElement if necessary."""
if not isinstance(element, HighlightedTextElement):
return element
+
+ # Ensure the style is tracked
self._add_unique_style(element.style)
+
level = self._unique_styles_by_order.index(element.style)
return TitleElement.convert_from(element, level=level)
diff --git a/sec_parser/processing_steps/title_plugin.py b/sec_parser/processing_steps/title_plugin.py
deleted file mode 100644
index 2feb518..0000000
--- a/sec_parser/processing_steps/title_plugin.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from sec_parser.processing_steps.abstract_elementwise_processing_step import (
- AbstractElementwiseProcessingStep,
- ElementwiseProcessingContext,
-)
-from sec_parser.semantic_elements.highlighted_text_element import (
- HighlightedTextElement,
- TextStyle,
-)
-from sec_parser.semantic_elements.semantic_elements import TitleElement
-
-if TYPE_CHECKING: # pragma: no cover
- from sec_parser.semantic_elements.abstract_semantic_element import (
- AbstractSemanticElement,
- )
-
-
-class TitleParsingStep(AbstractElementwiseProcessingStep):
- """
- TitleParsingStep class for transforming elements into TitleElement instances.
-
- This step scans through a list of semantic elements and changes it,
- primarily by replacing suitable candidates with TitleElement instances.
- """
-
- def __init__(
- self,
- types_to_process: set[type[AbstractSemanticElement]] | None = None,
- types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
- ) -> None:
- super().__init__(
- types_to_process=types_to_process,
- types_to_exclude=types_to_exclude,
- )
-
- # _unique_styles_by_order track unique styles in the document.
- # Stored in a tuple as an ordered set, preserving insertion order.
- # This order is used to determine a style's level.
- # It is based on the observation that "highlight" styles that appear first
- # typically mark higher level paragraph/section headings.
- # _unique_styles_by_order is effectively used as an ordered set:
- self._unique_styles_by_order: tuple[TextStyle, ...] = ()
-
- def _add_unique_style(self, style: TextStyle) -> None:
- if style not in self._unique_styles_by_order:
- # _styles is effectively updated as an ordered set:
- self._unique_styles_by_order = tuple(
- dict.fromkeys([*self._unique_styles_by_order, style]).keys(),
- )
-
- def _process_element(
- self,
- element: AbstractSemanticElement,
- _: ElementwiseProcessingContext,
- ) -> AbstractSemanticElement:
- if not isinstance(element, HighlightedTextElement):
- return element
- self._add_unique_style(element.style)
- level = self._unique_styles_by_order.index(element.style)
- return TitleElement.convert_from(element, level=level)
diff --git a/sec_parser/semantic_elements/abstract_semantic_element.py b/sec_parser/semantic_elements/abstract_semantic_element.py
index 3f70473..8045e71 100644
--- a/sec_parser/semantic_elements/abstract_semantic_element.py
+++ b/sec_parser/semantic_elements/abstract_semantic_element.py
@@ -37,33 +37,6 @@ def convert_from(
"""Convert the semantic element into another semantic element type."""
return cls(source.html_tag)
- @classmethod
- def get_direct_abstract_semantic_subclass(
- cls,
- ) -> type[AbstractSemanticElement]:
- """
- Given a class, find the class that is one step below
- AbstractSemanticElement in its inheritance hierarchy.
- """
- if not issubclass(cls, AbstractSemanticElement):
- msg = "Argument must be a subclass of AbstractSemanticElement."
- raise TypeError(msg)
-
- root_child = None
- for ancestor in cls.mro():
- if ancestor is AbstractSemanticElement:
- break
- root_child = ancestor
-
- if root_child is None:
- msg = "Could not find a root child class for the given class."
- raise ValueError(msg)
-
- return root_child
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}<{self.html_tag.name}>"
-
class AbstractLevelElement(AbstractSemanticElement, ABC):
"""
@@ -72,7 +45,7 @@ class AbstractLevelElement(AbstractSemanticElement, ABC):
a main section title might be at level 1, a subsection at level 2, etc.
"""
- MIN_LEVEL = 1
+ MIN_LEVEL = 0
def __init__(
self,
diff --git a/sec_parser/semantic_tree/abstract_nesting_rule.py b/sec_parser/semantic_tree/abstract_nesting_rule.py
index 4c1c3bc..23ee8b0 100644
--- a/sec_parser/semantic_tree/abstract_nesting_rule.py
+++ b/sec_parser/semantic_tree/abstract_nesting_rule.py
@@ -12,4 +12,4 @@ def should_be_nested_under(
parent: AbstractSemanticElement,
child: AbstractSemanticElement,
) -> bool:
- pass
+ raise NotImplementedError # pragma: no cover
diff --git a/sec_parser/semantic_tree/semantic_tree.py b/sec_parser/semantic_tree/semantic_tree.py
index 7f7d390..3a8f6cf 100644
--- a/sec_parser/semantic_tree/semantic_tree.py
+++ b/sec_parser/semantic_tree/semantic_tree.py
@@ -54,23 +54,22 @@ def render(
new_prefix = "│ " if not is_last else " "
level = ""
- lvl = getattr(node, "level", "")
+ lvl = getattr(node.semantic_element, "level", "")
if lvl:
level = f"[L{lvl}]"
- class_name = f"{element.__class__.__name__}{level}:"
- title = element.html_tag.get_text()
- if len(title) > max_line_length:
- title = f"{title[:max_line_length]}..."
+ if pretty:
+ level = f"\033[1;92m{level}\033[0m"
+ class_name = f"{element.__class__.__name__}{level}"
+ contents = element.html_tag.get_text().strip()
+ if len(contents) > max_line_length:
+ contents = f"{contents[:max_line_length]}..."
if pretty:
class_name = f"\033[1;34m{class_name}\033[0m"
- title = f"\033[1;32m{title}\033[0m"
# Fix the alignment for root elements
- line = (
- f"{_prefix}{indent}{class_name} {title}"
- if not _is_root
- else f"{class_name} {title}"
- )
+ line = f"{_prefix}{indent}{class_name}" if not _is_root else f"{class_name}"
+ if contents:
+ line = f"{line}: {contents}"
tree_strings.append(line)
# Recursive call: Always set _is_root to False for non-root elements
diff --git a/sec_parser/utils/__init__.py b/sec_parser/utils/__init__.py
index 2688e8e..aefafd9 100644
--- a/sec_parser/utils/__init__.py
+++ b/sec_parser/utils/__init__.py
@@ -6,9 +6,11 @@
from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
from sec_parser.utils.env_var_helpers import ValueNotSetError, get_value_or_env_var
+from sec_parser.utils.py_utils import get_direct_subclass_of_base_class
__all__ = [
"ValueNotSetError",
"get_value_or_env_var",
+ "get_direct_subclass_of_base_class",
"is_unary_tree",
]
diff --git a/sec_parser/utils/bs4_/text_styles_metrics.py b/sec_parser/utils/bs4_/text_styles_metrics.py
index d7d3a54..1a43e6a 100644
--- a/sec_parser/utils/bs4_/text_styles_metrics.py
+++ b/sec_parser/utils/bs4_/text_styles_metrics.py
@@ -61,7 +61,9 @@ def _compute_effective_style(tag: Tag) -> dict[str, str]:
while found_tag:
if "style" in found_tag.attrs:
found_styles = found_tag["style"]
- if isinstance(found_styles, list):
+ if isinstance(found_styles, list): # pragma: no cover
+ # this should never happen, can't even construct a
+ # scenario where this would occur
msg = "Expected a string, got a list"
raise ValueError(msg)
styles = found_styles.split(";")
diff --git a/sec_parser/utils/py_utils.py b/sec_parser/utils/py_utils.py
new file mode 100644
index 0000000..fde96a4
--- /dev/null
+++ b/sec_parser/utils/py_utils.py
@@ -0,0 +1,23 @@
+def get_direct_subclass_of_base_class(cls: type, base_class: type) -> type:
+ """
+ Given a class, find the class that is one step below
+ the specified base_class in its inheritance hierarchy.
+ """
+ if not issubclass(cls, base_class):
+ msg = f"Argument must be a subclass of {base_class.__name__}."
+ raise TypeError(msg)
+
+ root_child = None
+ for ancestor in cls.mro():
+ if ancestor is base_class:
+ break
+ root_child = ancestor
+
+ if root_child is None:
+ msg = (
+ f"Could not find a root child class for "
+ f"the given class below {base_class.__name__}."
+ )
+ raise ValueError(msg)
+
+ return root_child
diff --git a/tests/unit/processing_engine/test_sec_parser.py b/tests/unit/processing_engine/test_sec_parser.py
index 2d18e6b..36238e0 100644
--- a/tests/unit/processing_engine/test_sec_parser.py
+++ b/tests/unit/processing_engine/test_sec_parser.py
@@ -18,7 +18,7 @@
),
],
)
-def test_sec_parser(html_str, expected_elements):
+def test_smoke_test(html_str, expected_elements):
# Arrange
sec_parser = SecParser()
diff --git a/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py b/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py
index 024d7e7..6e60dc1 100644
--- a/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py
+++ b/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py
@@ -1,5 +1,7 @@
from __future__ import annotations
+from unittest.mock import Mock
+
import bs4
import pytest
@@ -49,8 +51,8 @@ def test_process_skip_due_to_types_to_process():
# Arrange
types_to_process: set[type[AbstractSemanticElement]] = {MockSemanticElement}
step = ProcessingStep(types_to_process=types_to_process)
- element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
- element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
+ element1 = MockSemanticElement(Mock())
+ element2 = AnotherMockSemanticElement(Mock())
input_elements = [element1, element2]
# Act
@@ -66,8 +68,8 @@ def test_process_skip_due_to_types_to_exclude():
# Arrange
types_to_exclude: set[type[AbstractSemanticElement]] = {MockSemanticElement}
step = ProcessingStep(types_to_exclude=types_to_exclude)
- element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
- element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
+ element1 = MockSemanticElement(Mock())
+ element2 = AnotherMockSemanticElement(Mock())
input_elements = [element1, element2]
# Act
@@ -94,8 +96,8 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude():
types_to_process=types_to_process,
types_to_exclude=types_to_exclude,
)
- element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
- element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
+ element1 = MockSemanticElement(Mock())
+ element2 = AnotherMockSemanticElement(Mock())
input_elements = [element1, element2]
# Act
@@ -104,3 +106,4 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude():
# Assert
assert step.seen_elements == [element1]
assert processed_elements == input_elements
+ assert processed_elements == input_elements
diff --git a/tests/unit/processing_steps/test_abstract_processing_step.py b/tests/unit/processing_steps/test_abstract_processing_step.py
index 958b6ff..cadcddf 100644
--- a/tests/unit/processing_steps/test_abstract_processing_step.py
+++ b/tests/unit/processing_steps/test_abstract_processing_step.py
@@ -1,7 +1,8 @@
+from unittest.mock import Mock
+
import bs4
import pytest
-from sec_parser.processing_engine.html_tag import HtmlTag
from sec_parser.processing_steps.abstract_processing_step import (
AbstractProcessingStep,
AlreadyProcessedError,
@@ -24,17 +25,16 @@ def _process(
def test_process_already_processed_raises_error():
- # Arrange: Create a list of dummy elements and a dummy processing step
- elements: list[AbstractSemanticElement] = [
- DummyElement(html_tag=HtmlTag(bs4.Tag(name="p"))) for _ in range(5)
- ]
+ # Arrange
+ elements: list[AbstractSemanticElement] = [DummyElement(Mock()) for _ in range(5)]
step = DummyProcessingStep()
- # Act: Call process once (this should not raise an error)
+ # Act
step.process(elements)
- # Assert: Calling process a second time should raise an error
+ # Assert
with pytest.raises(
- AlreadyProcessedError, match="This Step instance has already processed"
+ AlreadyProcessedError,
+ match="This Step instance has already processed a document",
):
step.process(elements)
diff --git a/tests/unit/processing_steps/test_highlighted_title_parsing_step.py b/tests/unit/processing_steps/test_highlighted_title_parsing_step.py
index 8e6c8b8..77299f7 100644
--- a/tests/unit/processing_steps/test_highlighted_title_parsing_step.py
+++ b/tests/unit/processing_steps/test_highlighted_title_parsing_step.py
@@ -33,6 +33,25 @@
{"type": UndeterminedElement, "tag": "div"},
],
),
+ (
+ """
+
+
+ foo
+
+
+ bar
+
+
+
+ baz
+
+ """,
+ [
+ {"type": TitleElement, "tag": "div"},
+ {"type": UndeterminedElement, "tag": "span"},
+ ],
+ ),
],
)
def test_title_step(html_str, expected_elements):
diff --git a/tests/unit/processing_steps/test_table_parsing_step.py b/tests/unit/processing_steps/test_table_parsing_step.py
new file mode 100644
index 0000000..b319d13
--- /dev/null
+++ b/tests/unit/processing_steps/test_table_parsing_step.py
@@ -0,0 +1,39 @@
+import pytest
+
+from sec_parser.processing_steps.table_parsing_step import TableParsingStep
+from sec_parser.semantic_elements.semantic_elements import ImageElement, TableElement
+from tests.unit._utils import assert_elements
+from tests.unit.processing_steps._utils import parse_initial_semantic_elements
+
+
+@pytest.mark.parametrize(
+ ("html_str", "expected_elements"),
+ [
+ (
+ """
+
+ """,
+ [
+ {"type": TableElement, "tag": "div"},
+ ],
+ ),
+ ],
+)
+def test_table_parsing_step(html_str, expected_elements):
+ """
+ Check that TableParsingStep can successfully transform the list of
+ semantic elements returned by `parse_initial_semantic_elements`.
+ """
+ # Arrange
+ elements = parse_initial_semantic_elements(html_str)
+ step = TableParsingStep()
+
+ # Act
+ processed_elements = step.process(elements)
+
+ # Assert
+ assert_elements(processed_elements, expected_elements)
diff --git a/tests/unit/processing_steps/test_title_parsing_step.py b/tests/unit/processing_steps/test_title_parsing_step.py
new file mode 100644
index 0000000..9e61998
--- /dev/null
+++ b/tests/unit/processing_steps/test_title_parsing_step.py
@@ -0,0 +1,61 @@
+from unittest.mock import Mock
+
+import bs4
+import pytest
+
+from sec_parser.processing_engine.html_tag import HtmlTag
+from sec_parser.processing_steps.table_parsing_step import TableParsingStep
+from sec_parser.processing_steps.title_parsing_step import TitleParsingStep
+from sec_parser.semantic_elements.highlighted_text_element import (
+ HighlightedTextElement,
+ TextStyle,
+)
+from sec_parser.semantic_elements.semantic_elements import TableElement, TitleElement
+from tests.unit._utils import assert_elements
+from tests.unit.processing_steps._utils import parse_initial_semantic_elements
+
+
+def html_tag(tag_name: str, text: str = "Hello World") -> HtmlTag:
+ tag = bs4.Tag(name=tag_name)
+ tag.string = text
+ return HtmlTag(tag)
+
+
+bold = TextStyle(
+ bold_with_font_weight=True,
+ italic=False,
+)
+italic = TextStyle(
+ bold_with_font_weight=False,
+ italic=True,
+)
+
+
+@pytest.mark.parametrize(
+ ("elements", "expected_elements"),
+ [
+ (
+ [
+ HighlightedTextElement(html_tag("p"), style=italic),
+ HighlightedTextElement(html_tag("p"), style=bold),
+ HighlightedTextElement(html_tag("p"), style=bold),
+ HighlightedTextElement(html_tag("p"), style=italic),
+ ],
+ [
+ {"type": TitleElement, "tag": "p", "fields": {"level": 0}},
+ {"type": TitleElement, "tag": "p", "fields": {"level": 1}},
+ {"type": TitleElement, "tag": "p", "fields": {"level": 1}},
+ {"type": TitleElement, "tag": "p", "fields": {"level": 0}},
+ ],
+ ),
+ ],
+)
+def test_title_parsing_step(elements, expected_elements):
+ # Arrange
+ step = TitleParsingStep()
+
+ # Act
+ processed_elements = step.process(elements)
+
+ # Assert
+ assert_elements(processed_elements, expected_elements)
diff --git a/tests/unit/semantic_elements/test_abstract_level_element.py b/tests/unit/semantic_elements/test_abstract_level_element.py
new file mode 100644
index 0000000..4ff5338
--- /dev/null
+++ b/tests/unit/semantic_elements/test_abstract_level_element.py
@@ -0,0 +1,21 @@
+from unittest.mock import Mock
+
+import pytest
+
+from sec_parser.semantic_elements.abstract_semantic_element import (
+ AbstractLevelElement,
+ InvalidLevelError,
+)
+
+
+class DummyElement(AbstractLevelElement):
+ pass
+
+
+def test_invalid_level_raises():
+ # Arrange
+ invalid_level = -1
+
+ # Act & Assert
+ with pytest.raises(InvalidLevelError):
+ DummyElement(Mock(), level=invalid_level)
diff --git a/tests/unit/semantic_elements/test_composite_semantic_element.py b/tests/unit/semantic_elements/test_composite_semantic_element.py
new file mode 100644
index 0000000..dbde447
--- /dev/null
+++ b/tests/unit/semantic_elements/test_composite_semantic_element.py
@@ -0,0 +1,83 @@
+from unittest.mock import Mock
+
+import pytest
+
+from sec_parser.semantic_elements.abstract_semantic_element import (
+ AbstractSemanticElement,
+)
+from sec_parser.semantic_elements.composite_semantic_element import (
+ CompositeSemanticElement,
+)
+
+# Create a mock for HtmlTag
+MockHtmlTag = Mock()
+
+
+@pytest.fixture
+def default_inner_elements():
+ return [Mock(spec=AbstractSemanticElement) for _ in range(3)]
+
+
+def test_composite_semantic_element_initialization_valid_inner_elements(
+ default_inner_elements,
+):
+ """
+ Test if a CompositeSemanticElement object can be successfully initialized
+ with valid inner elements.
+ """
+ # Arrange
+ mock_html_tag = MockHtmlTag()
+
+ # Act
+ element = CompositeSemanticElement(mock_html_tag, default_inner_elements)
+
+ # Assert
+ assert element.inner_elements == default_inner_elements
+ assert element.html_tag == mock_html_tag
+
+
+def test_composite_semantic_element_initialization_none_inner_elements():
+ """
+ Test if initializing CompositeSemanticElement with inner_elements
+ as None raises a ValueError.
+ """
+ # Arrange
+ mock_html_tag = MockHtmlTag()
+
+ # Act & Assert
+ with pytest.raises(ValueError, match="inner_elements cannot be None."):
+ CompositeSemanticElement(mock_html_tag, None)
+
+
+def test_composite_semantic_element_initialization_empty_inner_elements():
+ """
+ Test if initializing CompositeSemanticElement with an empty list
+ for inner_elements raises a ValueError.
+ """
+ # Arrange
+ mock_html_tag = MockHtmlTag()
+
+ # Act & Assert
+ with pytest.raises(ValueError, match="inner_elements cannot be empty."):
+ CompositeSemanticElement(mock_html_tag, [])
+
+
+def test_convert_from_source_valid_inner_elements(default_inner_elements):
+ """
+ Test if convert_from method can successfully convert from a source object
+ with valid inner elements.
+ """
+ # Arrange
+ mock_html_tag = MockHtmlTag()
+ source = Mock(spec=AbstractSemanticElement)
+ source.html_tag = mock_html_tag
+
+ # Act
+ converted = CompositeSemanticElement.convert_from(
+ source, inner_elements=default_inner_elements
+ )
+
+ # Assert
+ assert isinstance(converted, CompositeSemanticElement)
+ assert converted.inner_elements == default_inner_elements
+ assert converted.html_tag == mock_html_tag
diff --git a/tests/unit/semantic_elements/test_highlighted_text_element.py b/tests/unit/semantic_elements/test_highlighted_text_element.py
new file mode 100644
index 0000000..2b9ed03
--- /dev/null
+++ b/tests/unit/semantic_elements/test_highlighted_text_element.py
@@ -0,0 +1,17 @@
+from unittest.mock import Mock
+
+import pytest
+
+from sec_parser.semantic_elements.highlighted_text_element import HighlightedTextElement
+
+
+def test_highlighted_text_element_initialization():
+ # Arrange
+ mock_html_tag = Mock()
+
+ # Act & Assert
+ with pytest.raises(
+ ValueError,
+ match="styles must be specified for HighlightedElement",
+ ):
+ HighlightedTextElement(mock_html_tag, style=None)
diff --git a/tests/unit/semantic_tree/test_semantic_tree_render.py b/tests/unit/semantic_tree/test_semantic_tree_render.py
new file mode 100644
index 0000000..9d6bd93
--- /dev/null
+++ b/tests/unit/semantic_tree/test_semantic_tree_render.py
@@ -0,0 +1,95 @@
+# test_semantic_tree_render.py
+
+from typing import Callable
+from unittest.mock import Mock, patch
+
+import bs4
+import pytest
+
+from sec_parser.processing_engine.html_tag import HtmlTag
+from sec_parser.semantic_elements.abstract_semantic_element import (
+ AbstractLevelElement,
+ AbstractSemanticElement,
+)
+from sec_parser.semantic_elements.semantic_elements import IrrelevantElement
+from sec_parser.semantic_tree.semantic_tree import SemanticTree
+from sec_parser.semantic_tree.tree_node import TreeNode
+
+
+class Element(AbstractSemanticElement):
+ pass
+
+
+class ElementWithLevel(AbstractLevelElement):
+ pass
+
+
+class IgnoredElement(AbstractSemanticElement):
+ pass
+
+
+def test_basic_render():
+ # Arrange
+ tree = get_tree()
+
+ # Act
+ result = tree.render(pretty=False)
+
+ # Assert
+ assert (
+ result
+ == "Element\n├── ElementWithLevel[L2]: Child 1 of node1\n├── Element: Child 2 of node1, with its own child\n│ └── Element: Grandchild of node1 (Child of node5)\n└── Element: Another child for node1\nElement: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA..."
+ )
+
+
+def test_render_with_pretty_option():
+ # Arrange
+ tree = get_tree()
+
+ # Act
+ result = tree.render(pretty=True)
+
+ # Assert
+ assert (
+ result
+ == "\x1b[1;34mElement\x1b[0m\n├── \x1b[1;34mElementWithLevel\x1b[1;92m[L2]\x1b[0m\x1b[0m: Child 1 of node1\n├── \x1b[1;34mElement\x1b[0m: Child 2 of node1, with its own child\n│ └── \x1b[1;34mElement\x1b[0m: Grandchild of node1 (Child of node5)\n└── \x1b[1;34mElement\x1b[0m: Another child for node1\n\x1b[1;34mElement\x1b[0m: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA..."
+ )
+
+
+def new_node(name, text, cls: Callable = Element):
+ tag = bs4.Tag(name=name)
+ tag.string = text
+ return TreeNode(cls(HtmlTag(tag)))
+
+
+def get_tree():
+ node1 = new_node("p", "") # A regular node
+ node2 = new_node("p", "A" * 51) # A node with title longer than max_line_length
+ node3 = new_node(
+ "p",
+ "This is an ignored type",
+ cls=IrrelevantElement,
+ )
+ node4 = new_node(
+ "p",
+ "Child 1 of node1",
+ cls=lambda k: ElementWithLevel(k, level=2),
+ )
+ node5 = new_node("p", "Child 2 of node1, with its own child")
+ node6 = new_node("p", "Grandchild of node1 (Child of node5)")
+ node7 = new_node("p", "Another child for node1")
+
+ # Building the relationships
+ node1.add_child(node4)
+ node1.add_child(node5)
+ node5.add_child(node6)
+ node1.add_child(node7)
+
+ tree = SemanticTree(
+ [
+ node1,
+ node2,
+ node3,
+ ],
+ )
+ return tree
diff --git a/tests/unit/semantic_tree/test_tree_builder.py b/tests/unit/semantic_tree/test_tree_builder.py
index 7dd8b6d..874fffe 100644
--- a/tests/unit/semantic_tree/test_tree_builder.py
+++ b/tests/unit/semantic_tree/test_tree_builder.py
@@ -1,11 +1,22 @@
import bs4
-from sec_parser import AbstractSemanticElement, HtmlTag, TreeBuilder
+from sec_parser import AbstractSemanticElement, TreeBuilder
+from sec_parser.processing_engine.html_tag import HtmlTag
from sec_parser.semantic_elements.abstract_semantic_element import AbstractLevelElement
from sec_parser.semantic_tree.nesting_rules import (
AbstractNestingRule,
+ AlwaysNestAsChildRule,
+ AlwaysNestAsParentRule,
NestSameTypeDependingOnLevelRule,
)
+from sec_parser.semantic_tree.semantic_tree import SemanticTree
+from sec_parser.semantic_tree.tree_node import TreeNode
+
+
+def html_tag(tag_name: str, text: str) -> HtmlTag:
+ tag = bs4.Tag(name=tag_name)
+ tag.string = text
+ return HtmlTag(tag)
class BaseElement(AbstractSemanticElement):
@@ -20,28 +31,30 @@ class ChildElement(AbstractSemanticElement):
pass
-class LeveledElement(AbstractLevelElement):
+class IgnoredParent(AbstractSemanticElement):
pass
-class ParentChildNestingRule(AbstractNestingRule):
- def _should_be_nested_under(
- self,
- parent: AbstractSemanticElement,
- child: AbstractSemanticElement,
- ) -> bool:
- return isinstance(parent, ParentElement) and isinstance(child, ChildElement)
+class IgnoredChild(AbstractSemanticElement):
+ pass
+
+
+class LeveledElement(AbstractLevelElement):
+ pass
def test_nesting_of_leveled_elements():
# Arrange
mock_elements = [
- LeveledElement(HtmlTag(bs4.Tag(name="p")), level=1),
- LeveledElement(HtmlTag(bs4.Tag(name="p")), level=2),
- LeveledElement(HtmlTag(bs4.Tag(name="p")), level=2),
+ LeveledElement(html_tag("tag1", "text1"), level=1),
+ LeveledElement(html_tag("tag2", "text2"), level=2),
+ LeveledElement(html_tag("tag3", "text3"), level=2),
]
- rules = [NestSameTypeDependingOnLevelRule()]
- tree_builder = TreeBuilder(get_rules=lambda: rules)
+
+ def get_rules() -> list[AbstractNestingRule]:
+ return [NestSameTypeDependingOnLevelRule()]
+
+ tree_builder = TreeBuilder(get_rules)
# Act
tree = tree_builder.build(mock_elements)
@@ -56,23 +69,156 @@ def test_nesting_of_leveled_elements():
assert child.semantic_element.level == 2
-def test_nesting_of_parent_and_child():
+def test_always_nest_as_parent():
     # Arrange
     mock_elements = [
-        ParentElement(HtmlTag(bs4.Tag(name="p"))),
-        ChildElement(HtmlTag(bs4.Tag(name="p"))),
+        ChildElement(html_tag("tag7", "text7")),
+        ParentElement(html_tag("tag6", "text6")),
+        ChildElement(html_tag("tag8", "text8")),
+        ParentElement(html_tag("tag17", "text17")),
+        ChildElement(html_tag("tag18", "text18")),
     ]
-    def rules():
-        return [ParentChildNestingRule()]
+    def get_rules() -> list[AbstractNestingRule]:
+        return [AlwaysNestAsParentRule(ParentElement)]
-    tree_builder = TreeBuilder(get_rules=rules)
+    tree_builder = TreeBuilder(get_rules)
     # Act
     tree = tree_builder.build(mock_elements)
     # Assert
-    assert len(tree.root_nodes) == 1
+    # The leading ChildElement has no ParentElement before it and stays a
+    # root; each ParentElement is a root holding the ChildElement after it.
+    assert len(tree.root_nodes) == 3
+    assert isinstance(tree.root_nodes[0].semantic_element, ChildElement)
+    assert isinstance(tree.root_nodes[1].semantic_element, ParentElement)
+    assert isinstance(tree.root_nodes[2].semantic_element, ParentElement)
+    assert len(tree.root_nodes[1].children) == 1
+    assert isinstance(tree.root_nodes[1].children[0].semantic_element, ChildElement)
+    assert len(tree.root_nodes[2].children) == 1
+    # Check the second parent's own child, not the first parent's again.
+    assert isinstance(tree.root_nodes[2].children[0].semantic_element, ChildElement)
+
+
+def test_always_nest_as_child():
+    # Arrange
+    mock_elements = [
+        ChildElement(html_tag("tag7", "text7")),
+        ParentElement(html_tag("tag6", "text6")),
+        ChildElement(html_tag("tag8", "text8")),
+        ParentElement(html_tag("tag17", "text17")),
+        ChildElement(html_tag("tag18", "text18")),
+    ]
+
+    def get_rules() -> list[AbstractNestingRule]:
+        return [AlwaysNestAsChildRule(ChildElement)]
+
+    tree_builder = TreeBuilder(get_rules)
+
+    # Act
+    tree = tree_builder.build(mock_elements)
+
+    # Assert
+    # The leading ChildElement has nothing before it to nest under and stays
+    # a root; each later ChildElement nests under the ParentElement before it.
+    assert len(tree.root_nodes) == 3
+    assert isinstance(tree.root_nodes[0].semantic_element, ChildElement)
+    assert isinstance(tree.root_nodes[1].semantic_element, ParentElement)
+    assert isinstance(tree.root_nodes[2].semantic_element, ParentElement)
+    assert len(tree.root_nodes[1].children) == 1
+    assert isinstance(tree.root_nodes[1].children[0].semantic_element, ChildElement)
+    assert len(tree.root_nodes[2].children) == 1
+    # Check the second parent's own child, not the first parent's again.
+    assert isinstance(tree.root_nodes[2].children[0].semantic_element, ChildElement)
+
+
+def test_smoke_test():
+    # Arrange: a single element, so the expected tree is one childless root.
+    element = BaseElement(html_tag("p", "Hello, world!"))
+    expected_tree = SemanticTree([TreeNode(element)])
+    builder = TreeBuilder()
+
+    # Act
+    actual_tree = builder.build([element])
+
+    # Assert
+    assert actual_tree.render() == expected_tree.render()
+
+
+def test_exclude_ignored_parent():
+    # Arrange: the first ChildElement directly follows an IgnoredParent,
+    # which the rule is told to exclude as a parent.
+    elements = [
+        IgnoredParent(html_tag("tag2", "text2")),
+        ChildElement(html_tag("tag1", "text1")),
+        ParentElement(html_tag("tag3", "text3")),
+        ChildElement(html_tag("tag1", "text1")),
+    ]
+
+    def rules_factory() -> list[AbstractNestingRule]:
+        exclusion_rule = AlwaysNestAsChildRule(
+            ChildElement,
+            exclude_parents={IgnoredParent},
+        )
+        return [exclusion_rule]
+
+    builder = TreeBuilder(rules_factory)
+
+    # Act
+    tree = builder.build(elements)
+
+    # Assert: the child after IgnoredParent remains a root of its own.
+    expected_root_types = (IgnoredParent, ChildElement, ParentElement)
+    assert len(tree.root_nodes) == 3
+    for node, expected_type in zip(tree.root_nodes, expected_root_types):
+        assert isinstance(node.semantic_element, expected_type)
+
+
+def test_exclude_ignored_child():
+    # Arrange: an IgnoredChild sits right after the first ParentElement and
+    # is listed in the rule's exclude_children set.
+    elements = [
+        ParentElement(html_tag("tag2", "text2")),
+        IgnoredChild(html_tag("tag1", "text1")),
+        ParentElement(html_tag("tag3", "text3")),
+        ChildElement(html_tag("tag1", "text1")),
+    ]
+
+    def rules_factory() -> list[AbstractNestingRule]:
+        exclusion_rule = AlwaysNestAsChildRule(
+            ChildElement,
+            exclude_children={IgnoredChild},
+        )
+        return [exclusion_rule]
+
+    builder = TreeBuilder(rules_factory)
+
+    # Act
+    tree = builder.build(elements)
+
+    # Assert: IgnoredChild is kept at the root level.
+    expected_root_types = (ParentElement, IgnoredChild, ParentElement)
+    assert len(tree.root_nodes) == 3
+    for node, expected_type in zip(tree.root_nodes, expected_root_types):
+        assert isinstance(node.semantic_element, expected_type)
+
+
+def test_exclude_both_ignored_parent_and_child():
+    # Arrange: one rule carrying both exclusion sets, fed a sequence that
+    # contains an IgnoredChild and a ChildElement right after an IgnoredParent.
+    elements = [
+        ParentElement(html_tag("tag2", "text2")),
+        ChildElement(html_tag("tag1", "text1")),
+        ParentElement(html_tag("tag3", "text3")),
+        IgnoredChild(html_tag("tag1", "text1")),
+        IgnoredParent(html_tag("tag2", "text2")),
+        ChildElement(html_tag("tag1", "text1")),
+    ]
+
+    def rules_factory() -> list[AbstractNestingRule]:
+        return [
+            AlwaysNestAsChildRule(
+                ChildElement,
+                exclude_parents={IgnoredParent},
+                exclude_children={IgnoredChild},
+            ),
+        ]
+
+    builder = TreeBuilder(rules_factory)
+
+    # Act
+    tree = builder.build(elements)
+
+    # Assert: five roots remain, in source order.
+    assert len(tree.root_nodes) == 5
     assert isinstance(tree.root_nodes[0].semantic_element, ParentElement)
-    assert len(tree.root_nodes[0].children) == 1
-    assert isinstance(tree.root_nodes[0].children[0].semantic_element, ChildElement)
+    assert isinstance(tree.root_nodes[1].semantic_element, ParentElement)
+    assert isinstance(tree.root_nodes[2].semantic_element, IgnoredChild)
+    assert isinstance(tree.root_nodes[3].semantic_element, IgnoredParent)
+    assert isinstance(tree.root_nodes[4].semantic_element, ChildElement)
diff --git a/tests/unit/semantic_tree/test_tree_node.py b/tests/unit/semantic_tree/test_tree_node.py
new file mode 100644
index 0000000..a6fb89d
--- /dev/null
+++ b/tests/unit/semantic_tree/test_tree_node.py
@@ -0,0 +1,115 @@
+from unittest.mock import Mock
+
+import pytest
+
+from sec_parser.semantic_elements.abstract_semantic_element import (
+ AbstractSemanticElement,
+)
+from sec_parser.semantic_tree.tree_node import TreeNode
+
+
+@pytest.fixture
+def mock_element() -> Mock:
+    return Mock(spec=AbstractSemanticElement)  # stand-in element shared by the TreeNode tests
+
+
+def test_add_child(mock_element):
+    # Arrange: two unrelated nodes sharing the same mocked element.
+    root = TreeNode(mock_element)
+    leaf = TreeNode(mock_element)
+
+    # Act
+    root.add_child(leaf)
+
+    # Assert: the link is visible from both ends.
+    assert leaf.parent == root
+    assert root.has_child(leaf)
+
+
+def test_add_children(mock_element):
+    # Arrange
+    root = TreeNode(mock_element)
+    leaves = [TreeNode(mock_element) for _ in range(3)]
+
+    # Act
+    root.add_children(leaves)
+
+    # Assert: every node handed over in the batch is linked to the parent,
+    # and the link is visible from both ends.
+    assert all(root.has_child(leaf) for leaf in leaves)
+    assert all(leaf.parent == root for leaf in leaves)
+
+
+def test_remove_child(mock_element):
+    # Arrange: the child is attached through the constructor's parent argument.
+    root = TreeNode(mock_element)
+    leaf = TreeNode(mock_element, parent=root)
+
+    # Act
+    root.remove_child(leaf)
+
+    # Assert: the link is gone from both ends.
+    assert leaf.parent is None
+    assert not root.has_child(leaf)
+
+
+def test_set_parent(mock_element):
+    # Arrange
+    orphan = TreeNode(mock_element)
+    adopter = TreeNode(mock_element)
+
+    # Act: attach by assigning to .parent instead of calling add_child().
+    orphan.parent = adopter
+
+    # Assert
+    assert adopter.has_child(orphan)
+    assert orphan.parent == adopter
+
+
+def test_remove_from_existing_parent_when_new_parent_assigned(mock_element):
+    # Arrange: the node starts out attached to old_parent.
+    old_parent = TreeNode(mock_element)
+    moved = TreeNode(mock_element, parent=old_parent)
+    adopter = TreeNode(mock_element)
+
+    # Act
+    moved.parent = adopter
+
+    # Assert: the node now belongs to the new parent only.
+    assert not old_parent.has_child(moved)
+    assert adopter.has_child(moved)
+    assert moved.parent == adopter
+
+
+def test_remove_parent(mock_element):
+    # Arrange
+    root = TreeNode(mock_element)
+    leaf = TreeNode(mock_element, parent=root)
+
+    # Act: detach by clearing the child's parent reference.
+    leaf.parent = None
+
+    # Assert
+    assert not root.has_child(leaf)
+    assert leaf.parent is None
+
+
+def test_repr(mock_element):
+    # Arrange
+    # `parent` gets no children; `node_with_children` receives three at
+    # construction time. Neither node is given a parent of its own.
+    parent = TreeNode(mock_element)
+    children = [TreeNode(mock_element) for _ in range(3)]
+    node_with_children = TreeNode(mock_element, children=children)
+
+    # Act
+    repr_parent = repr(parent)
+    repr_with_children = repr(node_with_children)
+
+    # Assert
+    # The repr reports the parent and the child count, so the two strings
+    # must differ only in the children figure.
+    expected_parent = "TreeNode(parent=None, children=0)"
+    expected_with_children = f"TreeNode(parent=None, children={len(children)})"
+    assert repr_parent == expected_parent
+    assert repr_with_children == expected_with_children
diff --git a/tests/unit/utils/bs4_/test_text_styles_metrics.py b/tests/unit/utils/bs4_/test_text_styles_metrics.py
index 2ea9ff9..efd2c1b 100644
--- a/tests/unit/utils/bs4_/test_text_styles_metrics.py
+++ b/tests/unit/utils/bs4_/test_text_styles_metrics.py
@@ -1,8 +1,7 @@
+import pytest
from bs4 import BeautifulSoup
-from sec_parser.utils.bs4_.text_styles_metrics import (
- compute_text_styles_metrics,
-)
+from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics
# Test: Normal case with multiple styles
diff --git a/tests/unit/utils/env_var_helpers/test_get_value_or_env_var.py b/tests/unit/utils/env_var_helpers/test_get_value_or_env_var.py
new file mode 100644
index 0000000..8f258f5
--- /dev/null
+++ b/tests/unit/utils/env_var_helpers/test_get_value_or_env_var.py
@@ -0,0 +1,68 @@
+from unittest.mock import patch
+
+import pytest
+
+from sec_parser.utils.env_var_helpers import ValueNotSetError, get_value_or_env_var
+
+
+def test_get_value_with_value_provided():
+    # Arrange: an explicit value is supplied alongside the variable name.
+    explicit = "test_value"
+    variable_name = "UNUSED_ENV_VAR"
+
+    # Act
+    resolved = get_value_or_env_var(explicit, variable_name)
+
+    # Assert: the explicit value comes back unchanged.
+    assert resolved == explicit
+
+
+@patch.dict("os.environ", {"TEST_ENV_VAR": "env_test_value"})
+def test_get_value_with_env_var_set():
+    # Arrange: no explicit value; the variable is set by the patch above.
+    variable_name = "TEST_ENV_VAR"
+
+    # Act
+    resolved = get_value_or_env_var(None, variable_name)
+
+    # Assert
+    assert resolved == "env_test_value"
+
+
+@patch.dict("os.environ", {}, clear=True)
+def test_get_value_with_default_provided():
+    # Arrange: the environment is emptied, so only the default can apply.
+    variable_name = "TEST_ENV_VAR"
+    fallback = "default_test_value"
+
+    # Act
+    resolved = get_value_or_env_var(None, variable_name, fallback)
+
+    # Assert
+    assert resolved == fallback
+
+
+@patch.dict("os.environ", {}, clear=True)
+def test_get_value_raises_exception():
+    # Arrange: empty environment, no explicit value and no default.
+    env_var = "TEST_ENV_VAR"
+    expected_message = (
+        f"No value provided and the environment variable '{env_var}' is also not set."
+    )
+
+    # Act & Assert
+    with pytest.raises(ValueNotSetError, match=expected_message):
+        get_value_or_env_var(None, env_var)
+
+
+def test_get_value_with_empty_string():
+    # Arrange: a blank value is expected to count as "no value provided".
+    value = " "
+    env_var = "UNUSED_ENV_VAR"
+    expected_message = (
+        f"No value provided and the environment variable '{env_var}' is also not set."
+    )
+
+    # Act & Assert
+    with pytest.raises(ValueNotSetError, match=expected_message):
+        get_value_or_env_var(value, env_var)
diff --git a/tests/unit/utils/py_utils/test_get_direct_subclass_of_base.py b/tests/unit/utils/py_utils/test_get_direct_subclass_of_base.py
new file mode 100644
index 0000000..9c99234
--- /dev/null
+++ b/tests/unit/utils/py_utils/test_get_direct_subclass_of_base.py
@@ -0,0 +1,51 @@
+import pytest
+
+from sec_parser.utils.py_utils import get_direct_subclass_of_base_class
+
+
+class Parent:
+    """Base class of the three-level hierarchy used by these tests."""
+
+
+class Child(Parent):
+    """Direct subclass of Parent."""
+
+
+class Grandchild(Child):
+    """Indirect subclass of Parent, reached through Child."""
+
+
+@pytest.mark.parametrize(
+    ("base", "cls", "expected"),
+    [
+        (Parent, Child, Child),  # already a direct subclass
+        (Parent, Grandchild, Child),  # resolved through the intermediate class
+    ],
+)
+def test_get_direct(base, cls, expected):
+    # Arrange
+    # Nothing to set up: all inputs arrive through the parameters.
+
+    # Act
+    resolved = get_direct_subclass_of_base_class(cls, base)
+
+    # Assert
+    assert resolved == expected
+
+
+def test_invalid_input_type():
+    # Arrange: a class that does not inherit from Parent at all.
+    class Unrelated:
+        """Deliberately outside the Parent hierarchy."""
+
+    # Act & Assert
+    with pytest.raises(TypeError):
+        get_direct_subclass_of_base_class(Unrelated, Parent)
+
+
+def test_no_direct_subclass():
+    # Arrange: the base class is passed as its own candidate.
+    expected_message = r"Could not find a root child class for the given class below Parent."
+    # Act & Assert
+    with pytest.raises(ValueError, match=expected_message):
+        get_direct_subclass_of_base_class(Parent, Parent)