diff --git a/.codecov.yml b/.codecov.yml index 8e5944c..ca544c8 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -3,4 +3,4 @@ coverage: patch: default: # Note: also update Taskfile.yml when changing the target coverage. - target: 60% + target: 90% diff --git a/Taskfile.yml b/Taskfile.yml index 923ee80..00d259f 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -34,14 +34,14 @@ tasks: cmds: # Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters # Note: also update .codecov.yml when changing the target coverage. - - poetry run pytest -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/ + - poetry run pytest -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/ unit-watch: desc: Run unit tests and check code coverage immediately when files change. cmds: # Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters # Note: also update .codecov.yml when changing the target coverage. - - poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/ + - poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/ ########################### diff --git a/sec_parser/processing_engine/html_tag.py b/sec_parser/processing_engine/html_tag.py index e044d3d..a17444d 100644 --- a/sec_parser/processing_engine/html_tag.py +++ b/sec_parser/processing_engine/html_tag.py @@ -6,7 +6,6 @@ from sec_parser.exceptions import SecParserValueError from sec_parser.utils.bs4_.contains_tag import contains_tag -from sec_parser.utils.bs4_.get_first_deepest_tag import get_first_deepest_tag from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics @@ -93,23 +92,6 @@ def is_unary_tree(self) -> bool: self._is_unary_tree = is_unary_tree(self._bs4) return self._is_unary_tree - def get_first_deepest_tag(self) -> HtmlTag | None: - """ - `get_first_deepest_tag` returns the first deepest tag within the current tag. - - For example, if we have the following HTML structure: -

Test

Another Test
- and we pass the 'div' tag to this function, it will return the 'p' tag, - which is the first deepest tag within the 'html' tag. - """ - result: HtmlTag | None = None - if self._first_deepest_tag is NotSet: - tag = get_first_deepest_tag(self._bs4) - if tag is not None: - result = HtmlTag(tag) - self._first_deepest_tag = result - return result - def get_text_styles_metrics(self) -> dict[tuple[str, str], float]: """ Compute the percentage distribution of various CSS styles within the text diff --git a/sec_parser/processing_steps/text_parsing_step.py b/sec_parser/processing_steps/text_parsing_step.py index 5e76ed8..066b203 100644 --- a/sec_parser/processing_steps/text_parsing_step.py +++ b/sec_parser/processing_steps/text_parsing_step.py @@ -34,13 +34,6 @@ def __init__( ) self._unique_markers_by_order: list[str] = [] - def _found_marker(self, symbol: str) -> None: - if symbol not in self._unique_markers_by_order: - # Ordered set: - self._unique_markers_by_order = list( - dict.fromkeys([*self._unique_markers_by_order, symbol]).keys(), - ) - def _process_element( self, element: AbstractSemanticElement, diff --git a/sec_parser/processing_steps/title_parsing_step.py b/sec_parser/processing_steps/title_parsing_step.py index 2feb518..34cd603 100644 --- a/sec_parser/processing_steps/title_parsing_step.py +++ b/sec_parser/processing_steps/title_parsing_step.py @@ -20,10 +20,16 @@ class TitleParsingStep(AbstractElementwiseProcessingStep): """ - TitleParsingStep class for transforming elements into TitleElement instances. + TitleParsingStep elements into TitleElement instances by scanning a list + of semantic elements and replacing suitable candidates. - This step scans through a list of semantic elements and changes it, - primarily by replacing suitable candidates with TitleElement instances. + The "_unique_styles_by_order" tuple: + ==================================== + - Represents an ordered set of unique styles found in the document. + - Preserves the order of insertion, which determines the hierarchical + level of each style. + - Assumes that earlier "highlight" styles correspond to higher level paragraph + or section headings. """ def __init__( @@ -36,17 +42,11 @@ def __init__( types_to_exclude=types_to_exclude, ) - # _unique_styles_by_order track unique styles in the document. - # Stored in a tuple as an ordered set, preserving insertion order. - # This order is used to determine a style's level. - # It is based on the observation that "highlight" styles that appear first - # typically mark higher level paragraph/section headings. - # _unique_styles_by_order is effectively used as an ordered set: self._unique_styles_by_order: tuple[TextStyle, ...] = () def _add_unique_style(self, style: TextStyle) -> None: + """Add a new unique style if not already present.""" if style not in self._unique_styles_by_order: - # _styles is effectively updated as an ordered set: self._unique_styles_by_order = tuple( dict.fromkeys([*self._unique_styles_by_order, style]).keys(), ) @@ -56,8 +56,12 @@ def _process_element( element: AbstractSemanticElement, _: ElementwiseProcessingContext, ) -> AbstractSemanticElement: + """Process each element and convert to TitleElement if necessary.""" if not isinstance(element, HighlightedTextElement): return element + + # Ensure the style is tracked self._add_unique_style(element.style) + level = self._unique_styles_by_order.index(element.style) return TitleElement.convert_from(element, level=level) diff --git a/sec_parser/processing_steps/title_plugin.py b/sec_parser/processing_steps/title_plugin.py deleted file mode 100644 index 2feb518..0000000 --- a/sec_parser/processing_steps/title_plugin.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from sec_parser.processing_steps.abstract_elementwise_processing_step import ( - AbstractElementwiseProcessingStep, - ElementwiseProcessingContext, -) -from sec_parser.semantic_elements.highlighted_text_element import ( - HighlightedTextElement, - TextStyle, -) -from sec_parser.semantic_elements.semantic_elements import TitleElement - -if TYPE_CHECKING: # pragma: no cover - from sec_parser.semantic_elements.abstract_semantic_element import ( - AbstractSemanticElement, - ) - - -class TitleParsingStep(AbstractElementwiseProcessingStep): - """ - TitleParsingStep class for transforming elements into TitleElement instances. - - This step scans through a list of semantic elements and changes it, - primarily by replacing suitable candidates with TitleElement instances. - """ - - def __init__( - self, - types_to_process: set[type[AbstractSemanticElement]] | None = None, - types_to_exclude: set[type[AbstractSemanticElement]] | None = None, - ) -> None: - super().__init__( - types_to_process=types_to_process, - types_to_exclude=types_to_exclude, - ) - - # _unique_styles_by_order track unique styles in the document. - # Stored in a tuple as an ordered set, preserving insertion order. - # This order is used to determine a style's level. - # It is based on the observation that "highlight" styles that appear first - # typically mark higher level paragraph/section headings. - # _unique_styles_by_order is effectively used as an ordered set: - self._unique_styles_by_order: tuple[TextStyle, ...] = () - - def _add_unique_style(self, style: TextStyle) -> None: - if style not in self._unique_styles_by_order: - # _styles is effectively updated as an ordered set: - self._unique_styles_by_order = tuple( - dict.fromkeys([*self._unique_styles_by_order, style]).keys(), - ) - - def _process_element( - self, - element: AbstractSemanticElement, - _: ElementwiseProcessingContext, - ) -> AbstractSemanticElement: - if not isinstance(element, HighlightedTextElement): - return element - self._add_unique_style(element.style) - level = self._unique_styles_by_order.index(element.style) - return TitleElement.convert_from(element, level=level) diff --git a/sec_parser/semantic_elements/abstract_semantic_element.py b/sec_parser/semantic_elements/abstract_semantic_element.py index 3f70473..8045e71 100644 --- a/sec_parser/semantic_elements/abstract_semantic_element.py +++ b/sec_parser/semantic_elements/abstract_semantic_element.py @@ -37,33 +37,6 @@ def convert_from( """Convert the semantic element into another semantic element type.""" return cls(source.html_tag) - @classmethod - def get_direct_abstract_semantic_subclass( - cls, - ) -> type[AbstractSemanticElement]: - """ - Given a class, find the class that is one step below - AbstractSemanticElement in its inheritance hierarchy. - """ - if not issubclass(cls, AbstractSemanticElement): - msg = "Argument must be a subclass of AbstractSemanticElement." - raise TypeError(msg) - - root_child = None - for ancestor in cls.mro(): - if ancestor is AbstractSemanticElement: - break - root_child = ancestor - - if root_child is None: - msg = "Could not find a root child class for the given class." - raise ValueError(msg) - - return root_child - - def __repr__(self) -> str: - return f"{self.__class__.__name__}<{self.html_tag.name}>" - class AbstractLevelElement(AbstractSemanticElement, ABC): """ @@ -72,7 +45,7 @@ class AbstractLevelElement(AbstractSemanticElement, ABC): a main section title might be at level 1, a subsection at level 2, etc. """ - MIN_LEVEL = 1 + MIN_LEVEL = 0 def __init__( self, diff --git a/sec_parser/semantic_tree/abstract_nesting_rule.py b/sec_parser/semantic_tree/abstract_nesting_rule.py index 4c1c3bc..23ee8b0 100644 --- a/sec_parser/semantic_tree/abstract_nesting_rule.py +++ b/sec_parser/semantic_tree/abstract_nesting_rule.py @@ -12,4 +12,4 @@ def should_be_nested_under( parent: AbstractSemanticElement, child: AbstractSemanticElement, ) -> bool: - pass + raise NotImplementedError # pragma: no cover diff --git a/sec_parser/semantic_tree/semantic_tree.py b/sec_parser/semantic_tree/semantic_tree.py index 7f7d390..3a8f6cf 100644 --- a/sec_parser/semantic_tree/semantic_tree.py +++ b/sec_parser/semantic_tree/semantic_tree.py @@ -54,23 +54,22 @@ def render( new_prefix = "│ " if not is_last else " " level = "" - lvl = getattr(node, "level", "") + lvl = getattr(node.semantic_element, "level", "") if lvl: level = f"[L{lvl}]" - class_name = f"{element.__class__.__name__}{level}:" - title = element.html_tag.get_text() - if len(title) > max_line_length: - title = f"{title[:max_line_length]}..." + if pretty: + level = f"\033[1;92m{level}\033[0m" + class_name = f"{element.__class__.__name__}{level}" + contents = element.html_tag.get_text().strip() + if len(contents) > max_line_length: + contents = f"{contents[:max_line_length]}..." if pretty: class_name = f"\033[1;34m{class_name}\033[0m" - title = f"\033[1;32m{title}\033[0m" # Fix the alignment for root elements - line = ( - f"{_prefix}{indent}{class_name} {title}" - if not _is_root - else f"{class_name} {title}" - ) + line = f"{_prefix}{indent}{class_name}" if not _is_root else f"{class_name}" + if contents: + line = f"{line}: {contents}" tree_strings.append(line) # Recursive call: Always set _is_root to False for non-root elements diff --git a/sec_parser/utils/__init__.py b/sec_parser/utils/__init__.py index 2688e8e..aefafd9 100644 --- a/sec_parser/utils/__init__.py +++ b/sec_parser/utils/__init__.py @@ -6,9 +6,11 @@ from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree from sec_parser.utils.env_var_helpers import ValueNotSetError, get_value_or_env_var +from sec_parser.utils.py_utils import get_direct_subclass_of_base_class __all__ = [ "ValueNotSetError", "get_value_or_env_var", + "get_direct_subclass_of_base_class", "is_unary_tree", ] diff --git a/sec_parser/utils/bs4_/text_styles_metrics.py b/sec_parser/utils/bs4_/text_styles_metrics.py index d7d3a54..1a43e6a 100644 --- a/sec_parser/utils/bs4_/text_styles_metrics.py +++ b/sec_parser/utils/bs4_/text_styles_metrics.py @@ -61,7 +61,9 @@ def _compute_effective_style(tag: Tag) -> dict[str, str]: while found_tag: if "style" in found_tag.attrs: found_styles = found_tag["style"] - if isinstance(found_styles, list): + if isinstance(found_styles, list): # pragma: no cover + # this should never happen, can't even construct a + # scenario where this would occur msg = "Expected a string, got a list" raise ValueError(msg) styles = found_styles.split(";") diff --git a/sec_parser/utils/py_utils.py b/sec_parser/utils/py_utils.py new file mode 100644 index 0000000..fde96a4 --- /dev/null +++ b/sec_parser/utils/py_utils.py @@ -0,0 +1,23 @@ +def get_direct_subclass_of_base_class(cls: type, base_class: type) -> type: + """ + Given a class, find the class that is one step below + the specified base_class in its inheritance hierarchy. + """ + if not issubclass(cls, base_class): + msg = f"Argument must be a subclass of {base_class.__name__}." + raise TypeError(msg) + + root_child = None + for ancestor in cls.mro(): + if ancestor is base_class: + break + root_child = ancestor + + if root_child is None: + msg = ( + f"Could not find a root child class for " + f"the given class below {base_class.__name__}." + ) + raise ValueError(msg) + + return root_child diff --git a/tests/unit/processing_engine/test_sec_parser.py b/tests/unit/processing_engine/test_sec_parser.py index 2d18e6b..36238e0 100644 --- a/tests/unit/processing_engine/test_sec_parser.py +++ b/tests/unit/processing_engine/test_sec_parser.py @@ -18,7 +18,7 @@ ), ], ) -def test_sec_parser(html_str, expected_elements): +def test_smoke_test(html_str, expected_elements): # Arrange sec_parser = SecParser() diff --git a/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py b/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py index 024d7e7..6e60dc1 100644 --- a/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py +++ b/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py @@ -1,5 +1,7 @@ from __future__ import annotations +from unittest.mock import Mock + import bs4 import pytest @@ -49,8 +51,8 @@ def test_process_skip_due_to_types_to_process(): # Arrange types_to_process: set[type[AbstractSemanticElement]] = {MockSemanticElement} step = ProcessingStep(types_to_process=types_to_process) - element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p"))) - element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p"))) + element1 = MockSemanticElement(Mock()) + element2 = AnotherMockSemanticElement(Mock()) input_elements = [element1, element2] # Act @@ -66,8 +68,8 @@ def test_process_skip_due_to_types_to_exclude(): # Arrange types_to_exclude: set[type[AbstractSemanticElement]] = {MockSemanticElement} step = ProcessingStep(types_to_exclude=types_to_exclude) - element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p"))) - element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p"))) + element1 = MockSemanticElement(Mock()) + element2 = AnotherMockSemanticElement(Mock()) input_elements = [element1, element2] # Act @@ -94,8 +96,8 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude(): types_to_process=types_to_process, types_to_exclude=types_to_exclude, ) - element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p"))) - element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p"))) + element1 = MockSemanticElement(Mock()) + element2 = AnotherMockSemanticElement(Mock()) input_elements = [element1, element2] # Act @@ -104,3 +106,4 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude(): # Assert assert step.seen_elements == [element1] assert processed_elements == input_elements + assert processed_elements == input_elements diff --git a/tests/unit/processing_steps/test_abstract_processing_step.py b/tests/unit/processing_steps/test_abstract_processing_step.py index 958b6ff..cadcddf 100644 --- a/tests/unit/processing_steps/test_abstract_processing_step.py +++ b/tests/unit/processing_steps/test_abstract_processing_step.py @@ -1,7 +1,8 @@ +from unittest.mock import Mock + import bs4 import pytest -from sec_parser.processing_engine.html_tag import HtmlTag from sec_parser.processing_steps.abstract_processing_step import ( AbstractProcessingStep, AlreadyProcessedError, @@ -24,17 +25,16 @@ def _process( def test_process_already_processed_raises_error(): - # Arrange: Create a list of dummy elements and a dummy processing step - elements: list[AbstractSemanticElement] = [ - DummyElement(html_tag=HtmlTag(bs4.Tag(name="p"))) for _ in range(5) - ] + # Arrange + elements: list[AbstractSemanticElement] = [DummyElement(Mock()) for _ in range(5)] step = DummyProcessingStep() - # Act: Call process once (this should not raise an error) + # Act step.process(elements) - # Assert: Calling process a second time should raise an error + # Assert with pytest.raises( - AlreadyProcessedError, match="This Step instance has already processed" + AlreadyProcessedError, + match="This Step instance has already processed a document", ): step.process(elements) diff --git a/tests/unit/processing_steps/test_highlighted_title_parsing_step.py b/tests/unit/processing_steps/test_highlighted_title_parsing_step.py index 8e6c8b8..77299f7 100644 --- a/tests/unit/processing_steps/test_highlighted_title_parsing_step.py +++ b/tests/unit/processing_steps/test_highlighted_title_parsing_step.py @@ -33,6 +33,25 @@ {"type": UndeterminedElement, "tag": "div"}, ], ), + ( + """ +
+ + foo + + + bar + +
+ + baz + + """, + [ + {"type": TitleElement, "tag": "div"}, + {"type": UndeterminedElement, "tag": "span"}, + ], + ), ], ) def test_title_step(html_str, expected_elements): diff --git a/tests/unit/processing_steps/test_table_parsing_step.py b/tests/unit/processing_steps/test_table_parsing_step.py new file mode 100644 index 0000000..b319d13 --- /dev/null +++ b/tests/unit/processing_steps/test_table_parsing_step.py @@ -0,0 +1,39 @@ +import pytest + +from sec_parser.processing_steps.table_parsing_step import TableParsingStep +from sec_parser.semantic_elements.semantic_elements import ImageElement, TableElement +from tests.unit._utils import assert_elements +from tests.unit.processing_steps._utils import parse_initial_semantic_elements + + +@pytest.mark.parametrize( + ("html_str", "expected_elements"), + [ + ( + """ +
+
+
+
+
+ """, + [ + {"type": TableElement, "tag": "div"}, + ], + ), + ], +) +def test_table_parsing_step(html_str, expected_elements): + """ + test_table_parsing_step test checks that the TableParsingStep can successfully + transform a list of semantic elements returned by `parse_initial_semantic_elements`. + """ + # Arrange + elements = parse_initial_semantic_elements(html_str) + step = TableParsingStep() + + # Act + processed_elements = step.process(elements) + + # Assert + assert_elements(processed_elements, expected_elements) diff --git a/tests/unit/processing_steps/test_title_parsing_step.py b/tests/unit/processing_steps/test_title_parsing_step.py new file mode 100644 index 0000000..9e61998 --- /dev/null +++ b/tests/unit/processing_steps/test_title_parsing_step.py @@ -0,0 +1,61 @@ +from unittest.mock import Mock + +import bs4 +import pytest + +from sec_parser.processing_engine.html_tag import HtmlTag +from sec_parser.processing_steps.table_parsing_step import TableParsingStep +from sec_parser.processing_steps.title_parsing_step import TitleParsingStep +from sec_parser.semantic_elements.highlighted_text_element import ( + HighlightedTextElement, + TextStyle, +) +from sec_parser.semantic_elements.semantic_elements import TableElement, TitleElement +from tests.unit._utils import assert_elements +from tests.unit.processing_steps._utils import parse_initial_semantic_elements + + +def html_tag(tag_name: str, text: str = "Hello World") -> HtmlTag: + tag = bs4.Tag(name=tag_name) + tag.string = text + return HtmlTag(tag) + + +bold = TextStyle( + bold_with_font_weight=True, + italic=False, +) +italic = TextStyle( + bold_with_font_weight=False, + italic=True, +) + + +@pytest.mark.parametrize( + ("elements", "expected_elements"), + [ + ( + [ + HighlightedTextElement(html_tag("p"), style=italic), + HighlightedTextElement(html_tag("p"), style=bold), + HighlightedTextElement(html_tag("p"), style=bold), + HighlightedTextElement(html_tag("p"), style=italic), + ], + [ + {"type": TitleElement, "tag": "p", "fields": {"level": 0}}, + {"type": TitleElement, "tag": "p", "fields": {"level": 1}}, + {"type": TitleElement, "tag": "p", "fields": {"level": 1}}, + {"type": TitleElement, "tag": "p", "fields": {"level": 0}}, + ], + ), + ], +) +def test_title_parsing_step(elements, expected_elements): + # Arrange + step = TitleParsingStep() + + # Act + processed_elements = step.process(elements) + + # Assert + assert_elements(processed_elements, expected_elements) diff --git a/tests/unit/semantic_elements/test_abstract_level_element.py b/tests/unit/semantic_elements/test_abstract_level_element.py new file mode 100644 index 0000000..4ff5338 --- /dev/null +++ b/tests/unit/semantic_elements/test_abstract_level_element.py @@ -0,0 +1,21 @@ +from unittest.mock import Mock + +import pytest + +from sec_parser.semantic_elements.abstract_semantic_element import ( + AbstractLevelElement, + InvalidLevelError, +) + + +class DummyElement(AbstractLevelElement): + pass + + +def test_invalid_level_raises(): + # Arrange + invalid_level = -1 + + # Act & Assert + with pytest.raises(InvalidLevelError): + DummyElement(Mock(), level=invalid_level) diff --git a/tests/unit/semantic_elements/test_composite_semantic_element.py b/tests/unit/semantic_elements/test_composite_semantic_element.py new file mode 100644 index 0000000..dbde447 --- /dev/null +++ b/tests/unit/semantic_elements/test_composite_semantic_element.py @@ -0,0 +1,83 @@ +from unittest.mock import Mock + +import pytest + +from sec_parser.semantic_elements.abstract_semantic_element import ( + AbstractSemanticElement, +) +from sec_parser.semantic_elements.composite_semantic_element import ( + CompositeSemanticElement, +) + +# Create a mock for HtmlTag +MockHtmlTag = Mock() + + +@pytest.fixture +def default_inner_elements(): + return [Mock(spec=AbstractSemanticElement) for _ in range(3)] + + +def test_composite_semantic_element_initialization_valid_inner_elements( + default_inner_elements, +): + """ + Test if a CompositeSemanticElement object can be successfully initialized + with valid inner elements. + """ + # Arrange + mock_html_tag = MockHtmlTag() + + # Act + element = CompositeSemanticElement(mock_html_tag, default_inner_elements) + + # Assert + assert element.inner_elements == default_inner_elements + assert element.html_tag == mock_html_tag + + +def test_composite_semantic_element_initialization_none_inner_elements(): + """ + Test if initializing CompositeSemanticElement with inner_elements + as None raises a ValueError. + """ + # Arrange + mock_html_tag = MockHtmlTag() + + # Act & Assert + with pytest.raises(ValueError, match="inner_elements cannot be None."): + CompositeSemanticElement(mock_html_tag, None) + + +def test_composite_semantic_element_initialization_empty_inner_elements(): + """ + Test if initializing CompositeSemanticElement with an empty list + for inner_elements raises a ValueError. + """ + # Arrange + mock_html_tag = MockHtmlTag() + + # Act & Assert + with pytest.raises(ValueError, match="inner_elements cannot be empty."): + CompositeSemanticElement(mock_html_tag, []) + + +def test_convert_from_source_valid_inner_elements(default_inner_elements): + """ + Test if convert_from method can successfully convert from a source object + with valid inner elements. + """ + # Arrange + mock_html_tag = MockHtmlTag() + source = Mock(spec=AbstractSemanticElement) + source.html_tag = mock_html_tag + + # Act + converted = CompositeSemanticElement.convert_from( + source, inner_elements=default_inner_elements + ) + + # Assert + assert isinstance(converted, CompositeSemanticElement) + assert converted.inner_elements == default_inner_elements + assert converted.html_tag == mock_html_tag diff --git a/tests/unit/semantic_elements/test_highlighted_text_element.py b/tests/unit/semantic_elements/test_highlighted_text_element.py new file mode 100644 index 0000000..2b9ed03 --- /dev/null +++ b/tests/unit/semantic_elements/test_highlighted_text_element.py @@ -0,0 +1,17 @@ +from unittest.mock import Mock + +import pytest + +from sec_parser.semantic_elements.highlighted_text_element import HighlightedTextElement + + +def test_highlighted_text_element_initialization(): + # Arrange + mock_html_tag = Mock() + + # Act & Assert + with pytest.raises( + ValueError, + match="styles must be specified for HighlightedElement", + ): + HighlightedTextElement(mock_html_tag, style=None) diff --git a/tests/unit/semantic_tree/test_semantic_tree_render.py b/tests/unit/semantic_tree/test_semantic_tree_render.py new file mode 100644 index 0000000..9d6bd93 --- /dev/null +++ b/tests/unit/semantic_tree/test_semantic_tree_render.py @@ -0,0 +1,95 @@ +# test_semantic_tree.py + +from typing import Callable +from unittest.mock import Mock, patch + +import bs4 +import pytest + +from sec_parser.processing_engine.html_tag import HtmlTag +from sec_parser.semantic_elements.abstract_semantic_element import ( + AbstractLevelElement, + AbstractSemanticElement, +) +from sec_parser.semantic_elements.semantic_elements import IrrelevantElement +from sec_parser.semantic_tree.semantic_tree import SemanticTree +from sec_parser.semantic_tree.tree_node import TreeNode + + +class Element(AbstractSemanticElement): + pass + + +class ElementWithLevel(AbstractLevelElement): + pass + + +class IgnoredElement(AbstractSemanticElement): + pass + + +def test_basic_render(): + # Arrange + tree = get_tree() + + # Act + result = tree.render(pretty=False) + + # Assert + assert ( + result + == "Element\n├── ElementWithLevel[L2]: Child 1 of node1\n├── Element: Child 2 of node1, with its own child\n│ └── Element: Grandchild of node1 (Child of node5)\n└── Element: Another child for node1\nElement: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA..." + ) + + +def test_render_with_pretty_option(): + # Arrange + tree = get_tree() + + # Act + result = tree.render(pretty=True) + + # Assert + assert ( + result + == "\x1b[1;34mElement\x1b[0m\n├── \x1b[1;34mElementWithLevel\x1b[1;92m[L2]\x1b[0m\x1b[0m: Child 1 of node1\n├── \x1b[1;34mElement\x1b[0m: Child 2 of node1, with its own child\n│ └── \x1b[1;34mElement\x1b[0m: Grandchild of node1 (Child of node5)\n└── \x1b[1;34mElement\x1b[0m: Another child for node1\n\x1b[1;34mElement\x1b[0m: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA..." + ) + + +def new_node(name, text, cls: Callable = Element): + tag = bs4.Tag(name=name) + tag.string = text + return TreeNode(cls(HtmlTag(tag))) + + +def get_tree(): + node1 = new_node("p", "") # A regular node + node2 = new_node("p", "A" * 51) # A node with title longer than max_line_length + node3 = new_node( + "p", + "This is an ignored type", + cls=IrrelevantElement, + ) + node4 = new_node( + "p", + "Child 1 of node1", + cls=lambda k: ElementWithLevel(k, level=2), + ) + node5 = new_node("p", "Child 2 of node1, with its own child") + node6 = new_node("p", "Grandchild of node1 (Child of node5)") + node7 = new_node("p", "Another child for node1") + + # Building the relationships + node1.add_child(node4) + node1.add_child(node5) + node5.add_child(node6) + node1.add_child(node7) + + tree = SemanticTree( + [ + node1, + node2, + node3, + ], + ) + return tree diff --git a/tests/unit/semantic_tree/test_tree_builder.py b/tests/unit/semantic_tree/test_tree_builder.py index 7dd8b6d..874fffe 100644 --- a/tests/unit/semantic_tree/test_tree_builder.py +++ b/tests/unit/semantic_tree/test_tree_builder.py @@ -1,11 +1,22 @@ import bs4 -from sec_parser import AbstractSemanticElement, HtmlTag, TreeBuilder +from sec_parser import AbstractSemanticElement, TreeBuilder +from sec_parser.processing_engine.html_tag import HtmlTag from sec_parser.semantic_elements.abstract_semantic_element import AbstractLevelElement from sec_parser.semantic_tree.nesting_rules import ( AbstractNestingRule, + AlwaysNestAsChildRule, + AlwaysNestAsParentRule, NestSameTypeDependingOnLevelRule, ) +from sec_parser.semantic_tree.semantic_tree import SemanticTree +from sec_parser.semantic_tree.tree_node import TreeNode + + +def html_tag(tag_name: str, text: str) -> HtmlTag: + tag = bs4.Tag(name=tag_name) + tag.string = text + return HtmlTag(tag) class BaseElement(AbstractSemanticElement): @@ -20,28 +31,30 @@ class ChildElement(AbstractSemanticElement): pass -class LeveledElement(AbstractLevelElement): +class IgnoredParent(AbstractSemanticElement): pass -class ParentChildNestingRule(AbstractNestingRule): - def _should_be_nested_under( - self, - parent: AbstractSemanticElement, - child: AbstractSemanticElement, - ) -> bool: - return isinstance(parent, ParentElement) and isinstance(child, ChildElement) +class IgnoredChild(AbstractSemanticElement): + pass + + +class LeveledElement(AbstractLevelElement): + pass def test_nesting_of_leveled_elements(): # Arrange mock_elements = [ - LeveledElement(HtmlTag(bs4.Tag(name="p")), level=1), - LeveledElement(HtmlTag(bs4.Tag(name="p")), level=2), - LeveledElement(HtmlTag(bs4.Tag(name="p")), level=2), + LeveledElement(html_tag("tag1", "text1"), level=1), + LeveledElement(html_tag("tag2", "text2"), level=2), + LeveledElement(html_tag("tag3", "text3"), level=2), ] - rules = [NestSameTypeDependingOnLevelRule()] - tree_builder = TreeBuilder(get_rules=lambda: rules) + + def get_rules() -> list[AbstractNestingRule]: + return [NestSameTypeDependingOnLevelRule()] + + tree_builder = TreeBuilder(get_rules) # Act tree = tree_builder.build(mock_elements) @@ -56,23 +69,156 @@ def test_nesting_of_leveled_elements(): assert child.semantic_element.level == 2 -def test_nesting_of_parent_and_child(): +def test_always_nest_as_parent(): # Arrange mock_elements = [ - ParentElement(HtmlTag(bs4.Tag(name="p"))), - ChildElement(HtmlTag(bs4.Tag(name="p"))), + ChildElement(html_tag("tag7", "text7")), + ParentElement(html_tag("tag6", "text6")), + ChildElement(html_tag("tag8", "text8")), + ParentElement(html_tag("tag17", "text17")), + ChildElement(html_tag("tag18", "text18")), ] - def rules(): - return [ParentChildNestingRule()] + def get_rules() -> list[AbstractNestingRule]: + return [AlwaysNestAsParentRule(ParentElement)] - tree_builder = TreeBuilder(get_rules=rules) + tree_builder = TreeBuilder(get_rules) # Act tree = tree_builder.build(mock_elements) # Assert - assert len(tree.root_nodes) == 1 + assert len(tree.root_nodes) == 3 + assert isinstance(tree.root_nodes[0].semantic_element, ChildElement) + assert isinstance(tree.root_nodes[1].semantic_element, ParentElement) + assert isinstance(tree.root_nodes[2].semantic_element, ParentElement) + assert len(tree.root_nodes[1].children) == 1 + assert isinstance(tree.root_nodes[1].children[0].semantic_element, ChildElement) + assert len(tree.root_nodes[2].children) == 1 + assert isinstance(tree.root_nodes[1].children[0].semantic_element, ChildElement) + + +def test_always_nest_as_child(): + # Arrange + mock_elements = [ + ChildElement(html_tag("tag7", "text7")), + ParentElement(html_tag("tag6", "text6")), + ChildElement(html_tag("tag8", "text8")), + ParentElement(html_tag("tag17", "text17")), + ChildElement(html_tag("tag18", "text18")), + ] + + def get_rules() -> list[AbstractNestingRule]: + return [AlwaysNestAsChildRule(ChildElement)] + + tree_builder = TreeBuilder(get_rules) + + # Act + tree = tree_builder.build(mock_elements) + + # Assert + assert len(tree.root_nodes) == 3 + assert isinstance(tree.root_nodes[0].semantic_element, ChildElement) + assert isinstance(tree.root_nodes[1].semantic_element, ParentElement) + assert isinstance(tree.root_nodes[2].semantic_element, ParentElement) + assert len(tree.root_nodes[1].children) == 1 + assert isinstance(tree.root_nodes[1].children[0].semantic_element, ChildElement) + assert len(tree.root_nodes[2].children) == 1 + assert isinstance(tree.root_nodes[1].children[0].semantic_element, ChildElement) + + +def test_smoke_test(): + # Arrange + tag = bs4.Tag(name="p") + tag.string = "Hello, world!" + element = BaseElement(HtmlTag(tag)) + expected_tree = SemanticTree([TreeNode(element)]) + tree_builder = TreeBuilder() + + # Act + actual_tree = tree_builder.build([element]) + + # Assert + assert actual_tree.render() == expected_tree.render() + + +def test_exclude_ignored_parent(): + # Arrange + mock_elements = [ + IgnoredParent(html_tag("tag2", "text2")), + ChildElement(html_tag("tag1", "text1")), + ParentElement(html_tag("tag3", "text3")), + ChildElement(html_tag("tag1", "text1")), + ] + + def get_rules(): + return [AlwaysNestAsChildRule(ChildElement, exclude_parents={IgnoredParent})] + + tree_builder = TreeBuilder(get_rules) + + # Act + tree = tree_builder.build(mock_elements) + + # Assert + assert len(tree.root_nodes) == 3 + assert isinstance(tree.root_nodes[0].semantic_element, IgnoredParent) + assert isinstance(tree.root_nodes[1].semantic_element, ChildElement) + assert isinstance(tree.root_nodes[2].semantic_element, ParentElement) + + +def test_exclude_ignored_child(): + # Arrange + mock_elements = [ + ParentElement(html_tag("tag2", "text2")), + IgnoredChild(html_tag("tag1", "text1")), + ParentElement(html_tag("tag3", "text3")), + ChildElement(html_tag("tag1", "text1")), + ] + + def get_rules(): + return [AlwaysNestAsChildRule(ChildElement, exclude_children={IgnoredChild})] + + tree_builder = TreeBuilder(get_rules) + + # Act + tree = tree_builder.build(mock_elements) + + # Assert + assert len(tree.root_nodes) == 3 + assert isinstance(tree.root_nodes[0].semantic_element, ParentElement) + assert isinstance(tree.root_nodes[1].semantic_element, IgnoredChild) + assert isinstance(tree.root_nodes[2].semantic_element, ParentElement) + + +def test_exclude_both_ignored_parent_and_child(): + # Arrange + mock_elements = [ + ParentElement(html_tag("tag2", "text2")), + ChildElement(html_tag("tag1", "text1")), + ParentElement(html_tag("tag3", "text3")), + IgnoredChild(html_tag("tag1", "text1")), + IgnoredParent(html_tag("tag2", "text2")), + ChildElement(html_tag("tag1", "text1")), + ] + + def get_rules(): + return [ + AlwaysNestAsChildRule( + ChildElement, + exclude_parents={IgnoredParent}, + exclude_children={IgnoredChild}, + ), + ] + + tree_builder = TreeBuilder(get_rules) + + # Act + tree = tree_builder.build(mock_elements) + + # Assert + assert len(tree.root_nodes) == 5 assert isinstance(tree.root_nodes[0].semantic_element, ParentElement) - assert len(tree.root_nodes[0].children) == 1 - assert isinstance(tree.root_nodes[0].children[0].semantic_element, ChildElement) + assert isinstance(tree.root_nodes[1].semantic_element, ParentElement) + assert isinstance(tree.root_nodes[2].semantic_element, IgnoredChild) + assert isinstance(tree.root_nodes[3].semantic_element, IgnoredParent) + assert isinstance(tree.root_nodes[4].semantic_element, ChildElement) diff --git a/tests/unit/semantic_tree/test_tree_node.py b/tests/unit/semantic_tree/test_tree_node.py new file mode 100644 index 0000000..a6fb89d --- /dev/null +++ b/tests/unit/semantic_tree/test_tree_node.py @@ -0,0 +1,115 @@ +from unittest.mock import Mock + +import pytest + +from sec_parser.semantic_elements.abstract_semantic_element import ( + AbstractSemanticElement, +) +from sec_parser.semantic_tree.tree_node import TreeNode + + +@pytest.fixture +def mock_element(): + return Mock(spec=AbstractSemanticElement) + + +def test_add_child(mock_element): + # Arrange + parent = TreeNode(mock_element) + child = TreeNode(mock_element) + + # Act + parent.add_child(child) + + # Assert + assert parent.has_child(child) + assert child.parent == parent + + +def test_add_children(mock_element): + # Arrange + parent = TreeNode(mock_element) + children = [TreeNode(mock_element) for _ in range(3)] + + # Act + parent.add_children(children) + + # Assert + for child in children: + assert parent.has_child(child) + assert child.parent == parent + + +def test_remove_child(mock_element): + # Arrange + parent = TreeNode(mock_element) + child = TreeNode(mock_element, parent=parent) + + # Act + parent.remove_child(child) + + # Assert + assert not parent.has_child(child) + assert child.parent is None + + +def test_set_parent(mock_element): + # Arrange + node = TreeNode(mock_element) + new_parent = TreeNode(mock_element) + + # Act + node.parent = new_parent + + # Assert + assert node.parent == new_parent + assert new_parent.has_child(node) + + +def test_remove_from_existing_parent_when_new_parent_assigned(mock_element): + # Arrange + initial_parent = TreeNode(mock_element) + node = TreeNode(mock_element, parent=initial_parent) + new_parent = TreeNode(mock_element) + + # Act + node.parent = new_parent + + # Assert + assert node.parent == new_parent + assert new_parent.has_child(node) + assert not initial_parent.has_child(node) + + +def test_remove_parent(mock_element): + # Arrange + parent = TreeNode(mock_element) + child = TreeNode(mock_element, parent=parent) + + # Act + child.parent = None + + # Assert + assert child.parent is None + assert not parent.has_child(child) + + +def test_repr(mock_element): + # Arrange + parent = TreeNode(mock_element) + children = [TreeNode(mock_element) for _ in range(3)] + node_with_children = TreeNode(mock_element, children=children) + + # Act + repr_parent = repr(parent) + repr_with_children = repr(node_with_children) + + # Assert + assert repr_parent == "TreeNode(parent=None, children=0)" + assert repr_with_children == f"TreeNode(parent=None, children={len(children)})" + repr_parent = repr(parent) + repr_with_children = repr(node_with_children) + + # Assert + assert repr_parent == "TreeNode(parent=None, children=0)" + assert repr_with_children == f"TreeNode(parent=None, children={len(children)})" diff --git a/tests/unit/utils/bs4_/test_text_styles_metrics.py b/tests/unit/utils/bs4_/test_text_styles_metrics.py index 2ea9ff9..efd2c1b 100644 --- a/tests/unit/utils/bs4_/test_text_styles_metrics.py +++ b/tests/unit/utils/bs4_/test_text_styles_metrics.py @@ -1,8 +1,7 @@ +import pytest from bs4 import BeautifulSoup -from sec_parser.utils.bs4_.text_styles_metrics import ( - compute_text_styles_metrics, -) +from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics # Test: Normal case with multiple styles diff --git a/tests/unit/utils/env_var_helpers/test_get_value_or_env_var.py b/tests/unit/utils/env_var_helpers/test_get_value_or_env_var.py new file mode 100644 index 0000000..8f258f5 --- /dev/null +++ b/tests/unit/utils/env_var_helpers/test_get_value_or_env_var.py @@ -0,0 +1,68 @@ +from unittest.mock import patch + +import pytest + +from sec_parser.utils.env_var_helpers import ValueNotSetError, get_value_or_env_var + + +def test_get_value_with_value_provided(): + # Arrange + value = "test_value" + env_var = "UNUSED_ENV_VAR" + + # Act + result = get_value_or_env_var(value, env_var) + + # Assert + assert result == value + + +@patch.dict("os.environ", {"TEST_ENV_VAR": "env_test_value"}) +def test_get_value_with_env_var_set(): + # Arrange + env_var = "TEST_ENV_VAR" + + # Act + result = get_value_or_env_var(None, env_var) + + # Assert + assert result == "env_test_value" + + +@patch.dict("os.environ", {}, clear=True) +def test_get_value_with_default_provided(): + # Arrange + env_var = "TEST_ENV_VAR" + default = "default_test_value" + + # Act + result = get_value_or_env_var(None, env_var, default) + + # Assert + assert result == default + + +@patch.dict("os.environ", {}, clear=True) +def test_get_value_raises_exception(): + # Arrange + env_var = "TEST_ENV_VAR" + + # Act & Assert + with pytest.raises( + ValueNotSetError, + match=f"No value provided and the environment variable '{env_var}' is also not set.", + ): + get_value_or_env_var(None, env_var) + + +def test_get_value_with_empty_string(): + # Arrange + value = " " + env_var = "UNUSED_ENV_VAR" + + # Act & Assert + with pytest.raises( + ValueNotSetError, + match=f"No value provided and the environment variable '{env_var}' is also not set.", + ): + get_value_or_env_var(value, env_var) diff --git a/tests/unit/utils/py_utils/test_get_direct_subclass_of_base.py b/tests/unit/utils/py_utils/test_get_direct_subclass_of_base.py new file mode 100644 index 0000000..9c99234 --- /dev/null +++ b/tests/unit/utils/py_utils/test_get_direct_subclass_of_base.py @@ -0,0 +1,51 @@ +import pytest + +from sec_parser.utils.py_utils import get_direct_subclass_of_base_class + + +class Parent: + pass + + +class Child(Parent): + pass + + +class Grandchild(Child): + pass + + +@pytest.mark.parametrize( + "base, cls, expected", + [ + (Parent, Child, Child), + (Parent, Grandchild, Child), + ], +) +def test_get_direct(base, cls, expected): + # Arrange + pass + + # Act + actual = get_direct_subclass_of_base_class(cls, base) + + # Assert + assert actual == expected + + +def test_invalid_input_type(): + # Arrange + class Unrelated: + pass + + # Act & Assert + with pytest.raises(TypeError): + get_direct_subclass_of_base_class(Unrelated, Parent) + + +def test_no_direct_subclass(): + with pytest.raises( + ValueError, + match=r"Could not find a root child class for the given class below Parent.", + ): + get_direct_subclass_of_base_class(Parent, Parent)