test: cover the codebase with unit tests

alphanome-ai · Oct 12, 2023 · ef00ae7 · ef00ae7
1 parent f78a74e
commit ef00ae7
Show file tree

Hide file tree

Showing 27 changed files with 815 additions and 183 deletions.
diff --git a/.codecov.yml b/.codecov.yml
@@ -3,4 +3,4 @@ coverage:
     patch:
       default:
         # Note: also update Taskfile.yml when changing the target coverage.
-        target: 60%
+        target: 90%
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -34,14 +34,14 @@ tasks:
     cmds:
       # Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
       # Note: also update .codecov.yml when changing the target coverage.
-      - poetry run pytest -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/
+      - poetry run pytest -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/
 
   unit-watch:
     desc: Run unit tests and check code coverage immediately when files change.
     cmds:
       # Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
       # Note: also update .codecov.yml when changing the target coverage.
-      - poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/
+      - poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/
 
 
 ###########################

diff --git a/sec_parser/processing_engine/html_tag.py b/sec_parser/processing_engine/html_tag.py
@@ -6,7 +6,6 @@
 
 from sec_parser.exceptions import SecParserValueError
 from sec_parser.utils.bs4_.contains_tag import contains_tag
-from sec_parser.utils.bs4_.get_first_deepest_tag import get_first_deepest_tag
 from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
 from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics
 
@@ -93,23 +92,6 @@ def is_unary_tree(self) -> bool:
             self._is_unary_tree = is_unary_tree(self._bs4)
         return self._is_unary_tree
 
-    def get_first_deepest_tag(self) -> HtmlTag | None:
-        """
-        `get_first_deepest_tag` returns the first deepest tag within the current tag.
-
-        For example, if we have the following HTML structure:
-        <div><p>Test</p><span>Another Test</span></div>
-        and we pass the 'div' tag to this function, it will return the 'p' tag,
-        which is the first deepest tag within the 'html' tag.
-        """
-        result: HtmlTag | None = None
-        if self._first_deepest_tag is NotSet:
-            tag = get_first_deepest_tag(self._bs4)
-            if tag is not None:
-                result = HtmlTag(tag)
-                self._first_deepest_tag = result
-        return result
-
     def get_text_styles_metrics(self) -> dict[tuple[str, str], float]:
         """
         Compute the percentage distribution of various CSS styles within the text

diff --git a/sec_parser/processing_steps/text_parsing_step.py b/sec_parser/processing_steps/text_parsing_step.py
@@ -34,13 +34,6 @@ def __init__(
         )
         self._unique_markers_by_order: list[str] = []
 
-    def _found_marker(self, symbol: str) -> None:
-        if symbol not in self._unique_markers_by_order:
-            # Ordered set:
-            self._unique_markers_by_order = list(
-                dict.fromkeys([*self._unique_markers_by_order, symbol]).keys(),
-            )
-
     def _process_element(
         self,
         element: AbstractSemanticElement,

diff --git a/sec_parser/processing_steps/title_parsing_step.py b/sec_parser/processing_steps/title_parsing_step.py
@@ -20,10 +20,16 @@
 
 class TitleParsingStep(AbstractElementwiseProcessingStep):
     """
-    TitleParsingStep class for transforming elements into TitleElement instances.
+    TitleParsingStep transforms elements into TitleElement instances by
+    scanning a list of semantic elements and replacing suitable candidates.
 
-    This step scans through a list of semantic elements and changes it,
-    primarily by replacing suitable candidates with TitleElement instances.
+    The "_unique_styles_by_order" tuple:
+    ====================================
+    - Represents an ordered set of unique styles found in the document.
+    - Preserves the order of insertion, which determines the hierarchical
+      level of each style.
+    - Assumes that earlier "highlight" styles correspond to higher level paragraph
+      or section headings.
     """
 
     def __init__(
@@ -36,17 +42,11 @@ def __init__(
             types_to_exclude=types_to_exclude,
         )
 
-        # _unique_styles_by_order track unique styles in the document.
-        # Stored in a tuple as an ordered set, preserving insertion order.
-        # This order is used to determine a style's level.
-        # It is based on the observation that "highlight" styles that appear first
-        # typically mark higher level paragraph/section headings.
-        # _unique_styles_by_order is effectively used as an ordered set:
         self._unique_styles_by_order: tuple[TextStyle, ...] = ()
 
     def _add_unique_style(self, style: TextStyle) -> None:
+        """Add a new unique style if not already present."""
         if style not in self._unique_styles_by_order:
-            # _styles is effectively updated as an ordered set:
             self._unique_styles_by_order = tuple(
                 dict.fromkeys([*self._unique_styles_by_order, style]).keys(),
             )
@@ -56,8 +56,12 @@ def _process_element(
         element: AbstractSemanticElement,
         _: ElementwiseProcessingContext,
     ) -> AbstractSemanticElement:
+        """Process each element and convert to TitleElement if necessary."""
         if not isinstance(element, HighlightedTextElement):
             return element
+
+        # Ensure the style is tracked
         self._add_unique_style(element.style)
+
         level = self._unique_styles_by_order.index(element.style)
         return TitleElement.convert_from(element, level=level)
diff --git a/sec_parser/processing_steps/title_plugin.py b/sec_parser/processing_steps/title_plugin.py
diff --git a/sec_parser/semantic_elements/abstract_semantic_element.py b/sec_parser/semantic_elements/abstract_semantic_element.py
@@ -37,33 +37,6 @@ def convert_from(
         """Convert the semantic element into another semantic element type."""
         return cls(source.html_tag)
 
-    @classmethod
-    def get_direct_abstract_semantic_subclass(
-        cls,
-    ) -> type[AbstractSemanticElement]:
-        """
-        Given a class, find the class that is one step below
-        AbstractSemanticElement in its inheritance hierarchy.
-        """
-        if not issubclass(cls, AbstractSemanticElement):
-            msg = "Argument must be a subclass of AbstractSemanticElement."
-            raise TypeError(msg)
-
-        root_child = None
-        for ancestor in cls.mro():
-            if ancestor is AbstractSemanticElement:
-                break
-            root_child = ancestor
-
-        if root_child is None:
-            msg = "Could not find a root child class for the given class."
-            raise ValueError(msg)
-
-        return root_child
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}<{self.html_tag.name}>"
-
 
 class AbstractLevelElement(AbstractSemanticElement, ABC):
     """
@@ -72,7 +45,7 @@ class AbstractLevelElement(AbstractSemanticElement, ABC):
     a main section title might be at level 1, a subsection at level 2, etc.
     """
 
-    MIN_LEVEL = 1
+    MIN_LEVEL = 0
 
     def __init__(
         self,

diff --git a/sec_parser/semantic_tree/abstract_nesting_rule.py b/sec_parser/semantic_tree/abstract_nesting_rule.py
@@ -12,4 +12,4 @@ def should_be_nested_under(
         parent: AbstractSemanticElement,
         child: AbstractSemanticElement,
     ) -> bool:
-        pass
+        raise NotImplementedError  # pragma: no cover
diff --git a/sec_parser/semantic_tree/semantic_tree.py b/sec_parser/semantic_tree/semantic_tree.py
@@ -54,23 +54,22 @@ def render(
             new_prefix = "│   " if not is_last else "    "
 
             level = ""
-            lvl = getattr(node, "level", "")
+            lvl = getattr(node.semantic_element, "level", "")
             if lvl:
                 level = f"[L{lvl}]"
-            class_name = f"{element.__class__.__name__}{level}:"
-            title = element.html_tag.get_text()
-            if len(title) > max_line_length:
-                title = f"{title[:max_line_length]}..."
+                if pretty:
+                    level = f"\033[1;92m{level}\033[0m"
+            class_name = f"{element.__class__.__name__}{level}"
+            contents = element.html_tag.get_text().strip()
+            if len(contents) > max_line_length:
+                contents = f"{contents[:max_line_length]}..."
             if pretty:
                 class_name = f"\033[1;34m{class_name}\033[0m"
-                title = f"\033[1;32m{title}\033[0m"
 
             # Fix the alignment for root elements
-            line = (
-                f"{_prefix}{indent}{class_name} {title}"
-                if not _is_root
-                else f"{class_name} {title}"
-            )
+            line = f"{_prefix}{indent}{class_name}" if not _is_root else f"{class_name}"
+            if contents:
+                line = f"{line}: {contents}"
             tree_strings.append(line)
 
             # Recursive call: Always set _is_root to False for non-root elements

diff --git a/sec_parser/utils/__init__.py b/sec_parser/utils/__init__.py
@@ -6,9 +6,11 @@
 
 from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
 from sec_parser.utils.env_var_helpers import ValueNotSetError, get_value_or_env_var
+from sec_parser.utils.py_utils import get_direct_subclass_of_base_class
 
 __all__ = [
     "ValueNotSetError",
     "get_value_or_env_var",
+    "get_direct_subclass_of_base_class",
     "is_unary_tree",
 ]
diff --git a/sec_parser/utils/bs4_/text_styles_metrics.py b/sec_parser/utils/bs4_/text_styles_metrics.py
@@ -61,7 +61,9 @@ def _compute_effective_style(tag: Tag) -> dict[str, str]:
     while found_tag:
         if "style" in found_tag.attrs:
             found_styles = found_tag["style"]
-            if isinstance(found_styles, list):
+            if isinstance(found_styles, list):  # pragma: no cover
+                # Defensive guard: bs4 presumably only yields a list for
+                # multi-valued attributes (e.g. "class"), not for "style".
                 msg = "Expected a string, got a list"
                 raise ValueError(msg)
             styles = found_styles.split(";")

diff --git a/sec_parser/utils/py_utils.py b/sec_parser/utils/py_utils.py
@@ -0,0 +1,23 @@
+def get_direct_subclass_of_base_class(cls: type, base_class: type) -> type:
+    """
+    Given a class, find the class that is one step below
+    the specified base_class in its inheritance hierarchy.
+    """
+    if not issubclass(cls, base_class):
+        msg = f"Argument must be a subclass of {base_class.__name__}."
+        raise TypeError(msg)
+
+    root_child = None
+    for ancestor in cls.mro():
+        if ancestor is base_class:
+            break
+        root_child = ancestor
+
+    if root_child is None:
+        msg = (
+            f"Could not find a root child class for "
+            f"the given class below {base_class.__name__}."
+        )
+        raise ValueError(msg)
+
+    return root_child
diff --git a/tests/unit/processing_engine/test_sec_parser.py b/tests/unit/processing_engine/test_sec_parser.py
@@ -18,7 +18,7 @@
         ),
     ],
 )
-def test_sec_parser(html_str, expected_elements):
+def test_smoke_test(html_str, expected_elements):
     # Arrange
     sec_parser = SecParser()
 

diff --git a/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py b/tests/unit/processing_steps/test_abstract_elementwise_processing_step.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from unittest.mock import Mock
+
 import bs4
 import pytest
 
@@ -49,8 +51,8 @@ def test_process_skip_due_to_types_to_process():
     # Arrange
     types_to_process: set[type[AbstractSemanticElement]] = {MockSemanticElement}
     step = ProcessingStep(types_to_process=types_to_process)
-    element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
-    element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
+    element1 = MockSemanticElement(Mock())
+    element2 = AnotherMockSemanticElement(Mock())
     input_elements = [element1, element2]
 
     # Act
@@ -66,8 +68,8 @@ def test_process_skip_due_to_types_to_exclude():
     # Arrange
     types_to_exclude: set[type[AbstractSemanticElement]] = {MockSemanticElement}
     step = ProcessingStep(types_to_exclude=types_to_exclude)
-    element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
-    element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
+    element1 = MockSemanticElement(Mock())
+    element2 = AnotherMockSemanticElement(Mock())
     input_elements = [element1, element2]
 
     # Act
@@ -94,8 +96,8 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude():
         types_to_process=types_to_process,
         types_to_exclude=types_to_exclude,
     )
-    element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
-    element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
+    element1 = MockSemanticElement(Mock())
+    element2 = AnotherMockSemanticElement(Mock())
     input_elements = [element1, element2]
 
     # Act
@@ -104,3 +106,4 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude():
     # Assert
     assert step.seen_elements == [element1]
     assert processed_elements == input_elements
+    assert processed_elements == input_elements