Skip to content

Commit

Permalink
test: cover the codebase with unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Elijas committed Oct 12, 2023
1 parent f78a74e commit ef00ae7
Show file tree
Hide file tree
Showing 27 changed files with 815 additions and 183 deletions.
2 changes: 1 addition & 1 deletion .codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ coverage:
patch:
default:
# Note: also update Taskfile.yml when changing the target coverage.
target: 60%
target: 90%
4 changes: 2 additions & 2 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ tasks:
cmds:
# Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
# Note: also update .codecov.yml when changing the target coverage.
- poetry run pytest -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/
- poetry run pytest -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/

unit-watch:
desc: Run unit tests and check code coverage immediately when files change.
cmds:
# Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
# Note: also update .codecov.yml when changing the target coverage.
- poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/
- poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/


###########################
Expand Down
18 changes: 0 additions & 18 deletions sec_parser/processing_engine/html_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from sec_parser.exceptions import SecParserValueError
from sec_parser.utils.bs4_.contains_tag import contains_tag
from sec_parser.utils.bs4_.get_first_deepest_tag import get_first_deepest_tag
from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics

Expand Down Expand Up @@ -93,23 +92,6 @@ def is_unary_tree(self) -> bool:
self._is_unary_tree = is_unary_tree(self._bs4)
return self._is_unary_tree

def get_first_deepest_tag(self) -> HtmlTag | None:
    """
    Return the first deepest tag within the current tag, or None when the
    underlying `get_first_deepest_tag` helper finds no such tag.

    For example, given the following HTML structure:
        <div><p>Test</p><span>Another Test</span></div>
    calling this method on the 'div' tag returns the 'p' tag,
    which is the first deepest tag within the 'div' tag.

    The result (including None) is computed once, stored in
    `_first_deepest_tag`, and returned from that cache on later calls.
    """
    if self._first_deepest_tag is NotSet:
        tag = get_first_deepest_tag(self._bs4)
        # Cache None as well, so the tree is not searched again.
        self._first_deepest_tag = HtmlTag(tag) if tag is not None else None
    # Always return the cached value; returning a fresh local here would
    # yield None on every call after the first one.
    return self._first_deepest_tag

def get_text_styles_metrics(self) -> dict[tuple[str, str], float]:
"""
Compute the percentage distribution of various CSS styles within the text
Expand Down
7 changes: 0 additions & 7 deletions sec_parser/processing_steps/text_parsing_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,6 @@ def __init__(
)
self._unique_markers_by_order: list[str] = []

def _found_marker(self, symbol: str) -> None:
if symbol not in self._unique_markers_by_order:
# Ordered set:
self._unique_markers_by_order = list(
dict.fromkeys([*self._unique_markers_by_order, symbol]).keys(),
)

def _process_element(
self,
element: AbstractSemanticElement,
Expand Down
24 changes: 14 additions & 10 deletions sec_parser/processing_steps/title_parsing_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@

class TitleParsingStep(AbstractElementwiseProcessingStep):
"""
TitleParsingStep class for transforming elements into TitleElement instances.
TitleParsingStep converts elements into TitleElement instances by scanning a list
of semantic elements and replacing suitable candidates.
This step scans through a list of semantic elements and changes it,
primarily by replacing suitable candidates with TitleElement instances.
The "_unique_styles_by_order" tuple:
====================================
- Represents an ordered set of unique styles found in the document.
- Preserves the order of insertion, which determines the hierarchical
level of each style.
- Assumes that earlier "highlight" styles correspond to higher level paragraph
or section headings.
"""

def __init__(
Expand All @@ -36,17 +42,11 @@ def __init__(
types_to_exclude=types_to_exclude,
)

# _unique_styles_by_order track unique styles in the document.
# Stored in a tuple as an ordered set, preserving insertion order.
# This order is used to determine a style's level.
# It is based on the observation that "highlight" styles that appear first
# typically mark higher level paragraph/section headings.
# _unique_styles_by_order is effectively used as an ordered set:
self._unique_styles_by_order: tuple[TextStyle, ...] = ()

def _add_unique_style(self, style: TextStyle) -> None:
"""Add a new unique style if not already present."""
if style not in self._unique_styles_by_order:
# _styles is effectively updated as an ordered set:
self._unique_styles_by_order = tuple(
dict.fromkeys([*self._unique_styles_by_order, style]).keys(),
)
Expand All @@ -56,8 +56,12 @@ def _process_element(
element: AbstractSemanticElement,
_: ElementwiseProcessingContext,
) -> AbstractSemanticElement:
"""Process each element and convert to TitleElement if necessary."""
if not isinstance(element, HighlightedTextElement):
return element

# Ensure the style is tracked
self._add_unique_style(element.style)

level = self._unique_styles_by_order.index(element.style)
return TitleElement.convert_from(element, level=level)
63 changes: 0 additions & 63 deletions sec_parser/processing_steps/title_plugin.py

This file was deleted.

29 changes: 1 addition & 28 deletions sec_parser/semantic_elements/abstract_semantic_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,33 +37,6 @@ def convert_from(
"""Convert the semantic element into another semantic element type."""
return cls(source.html_tag)

@classmethod
def get_direct_abstract_semantic_subclass(
    cls,
) -> type[AbstractSemanticElement]:
    """
    Find the class that is one step below AbstractSemanticElement in the
    inheritance hierarchy of `cls`.

    Ancestors in the MRO that do not derive from AbstractSemanticElement
    (for example, mixins listed before it) are ignored.

    Raises:
        TypeError: if `cls` is not a subclass of AbstractSemanticElement.
        ValueError: if no class below AbstractSemanticElement is found,
            e.g. when `cls` is AbstractSemanticElement itself.
    """
    if not issubclass(cls, AbstractSemanticElement):
        msg = "Argument must be a subclass of AbstractSemanticElement."
        raise TypeError(msg)

    root_child = None
    for ancestor in cls.mro():
        if ancestor is AbstractSemanticElement:
            break
        # An unrelated mixin may directly precede the base in the MRO;
        # only classes that actually derive from the base qualify.
        if issubclass(ancestor, AbstractSemanticElement):
            root_child = ancestor

    if root_child is None:
        msg = "Could not find a root child class for the given class."
        raise ValueError(msg)

    return root_child

def __repr__(self) -> str:
return f"{self.__class__.__name__}<{self.html_tag.name}>"


class AbstractLevelElement(AbstractSemanticElement, ABC):
"""
Expand All @@ -72,7 +45,7 @@ class AbstractLevelElement(AbstractSemanticElement, ABC):
a main section title might be at level 1, a subsection at level 2, etc.
"""

MIN_LEVEL = 1
MIN_LEVEL = 0

def __init__(
self,
Expand Down
2 changes: 1 addition & 1 deletion sec_parser/semantic_tree/abstract_nesting_rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ def should_be_nested_under(
parent: AbstractSemanticElement,
child: AbstractSemanticElement,
) -> bool:
pass
raise NotImplementedError # pragma: no cover
21 changes: 10 additions & 11 deletions sec_parser/semantic_tree/semantic_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,23 +54,22 @@ def render(
new_prefix = "│ " if not is_last else " "

level = ""
lvl = getattr(node, "level", "")
lvl = getattr(node.semantic_element, "level", "")
if lvl:
level = f"[L{lvl}]"
class_name = f"{element.__class__.__name__}{level}:"
title = element.html_tag.get_text()
if len(title) > max_line_length:
title = f"{title[:max_line_length]}..."
if pretty:
level = f"\033[1;92m{level}\033[0m"
class_name = f"{element.__class__.__name__}{level}"
contents = element.html_tag.get_text().strip()
if len(contents) > max_line_length:
contents = f"{contents[:max_line_length]}..."
if pretty:
class_name = f"\033[1;34m{class_name}\033[0m"
title = f"\033[1;32m{title}\033[0m"

# Fix the alignment for root elements
line = (
f"{_prefix}{indent}{class_name} {title}"
if not _is_root
else f"{class_name} {title}"
)
line = f"{_prefix}{indent}{class_name}" if not _is_root else f"{class_name}"
if contents:
line = f"{line}: {contents}"
tree_strings.append(line)

# Recursive call: Always set _is_root to False for non-root elements
Expand Down
2 changes: 2 additions & 0 deletions sec_parser/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@

from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
from sec_parser.utils.env_var_helpers import ValueNotSetError, get_value_or_env_var
from sec_parser.utils.py_utils import get_direct_subclass_of_base_class

__all__ = [
"ValueNotSetError",
"get_value_or_env_var",
"get_direct_subclass_of_base_class",
"is_unary_tree",
]
4 changes: 3 additions & 1 deletion sec_parser/utils/bs4_/text_styles_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def _compute_effective_style(tag: Tag) -> dict[str, str]:
while found_tag:
if "style" in found_tag.attrs:
found_styles = found_tag["style"]
if isinstance(found_styles, list):
if isinstance(found_styles, list): # pragma: no cover
# this should never happen, can't even construct a
# scenario where this would occur
msg = "Expected a string, got a list"
raise ValueError(msg)
styles = found_styles.split(";")
Expand Down
23 changes: 23 additions & 0 deletions sec_parser/utils/py_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
def get_direct_subclass_of_base_class(cls: type, base_class: type) -> type:
    """
    Find the class that is one step below `base_class` in the
    inheritance hierarchy of `cls`.

    The MRO of `cls` is walked up to `base_class`; the last ancestor seen
    before it that is itself a subclass of `base_class` is returned.
    Ancestors that do not derive from `base_class` (for example, mixins
    listed before the base) are ignored.

    Raises:
        TypeError: if `cls` is not a subclass of `base_class`.
        ValueError: if no class below `base_class` is found,
            e.g. when `cls` is `base_class` itself.
    """
    if not issubclass(cls, base_class):
        msg = f"Argument must be a subclass of {base_class.__name__}."
        raise TypeError(msg)

    root_child = None
    for ancestor in cls.mro():
        if ancestor is base_class:
            break
        # An unrelated mixin may directly precede the base in the MRO
        # (class X(Mixin, Base)); only real descendants of the base qualify.
        if issubclass(ancestor, base_class):
            root_child = ancestor

    if root_child is None:
        msg = (
            f"Could not find a root child class for "
            f"the given class below {base_class.__name__}."
        )
        raise ValueError(msg)

    return root_child
2 changes: 1 addition & 1 deletion tests/unit/processing_engine/test_sec_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
),
],
)
def test_sec_parser(html_str, expected_elements):
def test_smoke_test(html_str, expected_elements):
# Arrange
sec_parser = SecParser()

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from unittest.mock import Mock

import bs4
import pytest

Expand Down Expand Up @@ -49,8 +51,8 @@ def test_process_skip_due_to_types_to_process():
# Arrange
types_to_process: set[type[AbstractSemanticElement]] = {MockSemanticElement}
step = ProcessingStep(types_to_process=types_to_process)
element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
element1 = MockSemanticElement(Mock())
element2 = AnotherMockSemanticElement(Mock())
input_elements = [element1, element2]

# Act
Expand All @@ -66,8 +68,8 @@ def test_process_skip_due_to_types_to_exclude():
# Arrange
types_to_exclude: set[type[AbstractSemanticElement]] = {MockSemanticElement}
step = ProcessingStep(types_to_exclude=types_to_exclude)
element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
element1 = MockSemanticElement(Mock())
element2 = AnotherMockSemanticElement(Mock())
input_elements = [element1, element2]

# Act
Expand All @@ -94,8 +96,8 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude():
types_to_process=types_to_process,
types_to_exclude=types_to_exclude,
)
element1 = MockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
element2 = AnotherMockSemanticElement(html_tag=HtmlTag(bs4.Tag(name="p")))
element1 = MockSemanticElement(Mock())
element2 = AnotherMockSemanticElement(Mock())
input_elements = [element1, element2]

# Act
Expand All @@ -104,3 +106,4 @@ def test_process_skip_due_to_both_types_to_process_and_types_to_exclude():
# Assert
assert step.seen_elements == [element1]
assert processed_elements == input_elements
assert processed_elements == input_elements
Loading

0 comments on commit ef00ae7

Please sign in to comment.