Skip to content

Commit

Permalink
feat: add to_dict() methods and remove e2e tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Elijas committed Oct 12, 2023
1 parent 23adc89 commit 83663d8
Show file tree
Hide file tree
Showing 17 changed files with 553 additions and 504 deletions.
13 changes: 0 additions & 13 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ tasks:
cmds:
- task unit
- task lint-fix
- task e2e-smoke

p:
desc: Shortcut for `task pre-commit`.
Expand All @@ -23,7 +22,6 @@ tasks:
cmds:
- task unit
- task lint
- task e2e

######################
### Run Unit Tests ###
Expand Down Expand Up @@ -64,18 +62,7 @@ tasks:
### Run End-to-end Tests ###
############################

e2e:
desc: Run repeated end-to-end tests.
cmds:
# The CLI_ARGS variable allows passing custom arguments to the end-to-end test after "--".
# For instance, to run the test with specific parameters, use the command:
# task e2e -- --tests-per-core=5 --cores=2 --limit_documents=2
- poetry run python -m tests.e2e.sec_parser {{.CLI_ARGS}}

e2e-smoke:
desc: Run a single end-to-end test for a single document. This is a useful way to verify if the parser is functioning (a so-called "smoke" test).
cmds:
- poetry run python -m tests.e2e.sec_parser --tests-per-core=1 --cores=1 --limit_documents=1

#######################
### Developer Tools ###
Expand Down
410 changes: 304 additions & 106 deletions docs/rtd_requirements.txt

Large diffs are not rendered by default.

119 changes: 118 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ lxml = "^4.9.3"
httpx = "^0.24.1"
cssutils = "^2.7.1"
python-dotenv = "^1.0.0"
xxhash = "^3.4.1"


[tool.poetry.group.dev.dependencies]
Expand Down
35 changes: 35 additions & 0 deletions sec_parser/processing_engine/html_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
import warnings

import bs4
import xxhash
from frozendict import frozendict

from sec_parser.exceptions import SecParserValueError
from sec_parser.utils.bs4_.contains_tag import contains_tag
from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics

TEXT_PREVIEW_LENGTH = 40


class HtmlTag:
"""
Expand Down Expand Up @@ -41,6 +45,37 @@ def __init__(
self._is_unary_tree: bool | None = None
self._first_deepest_tag: HtmlTag | None | NotSetType = NotSet
self._text_styles_metrics: dict[tuple[str, str], float] | None = None
self._frozen_dict: frozendict | None = None
self._source_code: str | None = None

def get_source_code(self) -> str:
    """
    Return the HTML source of this tag as a string.

    The string is produced by calling str() on the wrapped bs4 object the
    first time it is requested, and is cached on the instance afterwards.
    """
    cached = self._source_code
    if cached is not None:
        return cached
    cached = str(self._bs4)
    self._source_code = cached
    return cached

def _generate_preview(self, text: str) -> str:
    """
    Build a short single-line preview of the given text.

    Newlines are replaced with spaces and surrounding whitespace is stripped.
    Text no longer than TEXT_PREVIEW_LENGTH is returned as is; longer text is
    reduced to its leading and trailing parts, joined by a "...[N]..." marker
    where N is the number of characters left out.
    """
    flattened = text.replace("\n", " ").strip()
    if len(flattened) <= TEXT_PREVIEW_LENGTH:
        return flattened
    omitted = len(flattened) - TEXT_PREVIEW_LENGTH
    head = flattened[: TEXT_PREVIEW_LENGTH // 2]
    tail = flattened[-TEXT_PREVIEW_LENGTH // 2 :]
    return head + f"...[{omitted}]..." + tail

def to_dict(self) -> frozendict:
    """
    Return an immutable summary of this HTML tag.

    The summary holds the tag name, shortened previews of the tag's text and
    of its HTML source, and an xxh32 hex digest of the full HTML source.
    It is built on the first call and cached on the instance afterwards.
    """
    if self._frozen_dict is not None:
        return self._frozen_dict
    summary = {
        "tag_name": self._bs4.name,
        "text_preview": self._generate_preview(self.get_text()),
        "html_preview": self._generate_preview(self.get_source_code()),
        "html_hash": xxhash.xxh32(self.get_source_code()).hexdigest(),
    }
    self._frozen_dict = frozendict(summary)
    return self._frozen_dict

def get_text(self) -> str:
"""
Expand Down
16 changes: 14 additions & 2 deletions sec_parser/semantic_elements/abstract_semantic_element.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from abc import ABC
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

from sec_parser.exceptions import SecParserValueError

Expand Down Expand Up @@ -37,8 +37,14 @@ def convert_from(
"""Convert the semantic element into another semantic element type."""
return cls(source.html_tag)

def to_dict(self) -> dict[str, Any]:
    """
    Return a dictionary describing this semantic element.

    The result contains the element's class name under "cls_name",
    followed by every entry of the underlying HTML tag's to_dict().
    """
    summary: dict[str, Any] = {"cls_name": self.__class__.__name__}
    summary.update(self.html_tag.to_dict())
    return summary

class AbstractLevelElement(AbstractSemanticElement, ABC):

class AbstractLevelElement(AbstractSemanticElement):
"""
The AbstractLevelElement class provides a level attribute to semantic elements.
It represents hierarchical levels in the document structure. For instance,
Expand Down Expand Up @@ -69,6 +75,12 @@ def convert_from(
) -> AbstractLevelElement:
return cls(source.html_tag, level=level)

def to_dict(self) -> dict[str, Any]:
    """
    Return the parent class's dictionary with this element's level added.

    The level is stored under the "level" key.
    """
    # Copy so the returned dictionary is always a new object.
    summary = dict(super().to_dict())
    summary["level"] = self.level
    return summary


class InvalidLevelError(SecParserValueError):
pass
8 changes: 7 additions & 1 deletion sec_parser/semantic_elements/composite_semantic_element.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

from sec_parser.semantic_elements.abstract_semantic_element import (
AbstractSemanticElement,
Expand Down Expand Up @@ -65,3 +65,9 @@ def convert_from(
html_tag=source.html_tag,
inner_elements=inner_elements,
)

def to_dict(self) -> dict[str, Any]:
    """
    Return the parent class's dictionary with the inner element count added.

    Only the number of inner elements is stored (under "inner_elements"),
    not the elements themselves.
    """
    # Copy so the returned dictionary is always a new object.
    summary = dict(super().to_dict())
    summary["inner_elements"] = len(self.inner_elements)
    return summary
8 changes: 7 additions & 1 deletion sec_parser/semantic_elements/highlighted_text_element.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from dataclasses import asdict, dataclass
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

from sec_parser.semantic_elements.abstract_semantic_element import (
AbstractSemanticElement,
Expand Down Expand Up @@ -42,6 +42,12 @@ def convert_from(
style=style,
)

def to_dict(self) -> dict[str, Any]:
    """
    Return the parent class's dictionary with the text style added.

    The style is converted with dataclasses.asdict() and stored under
    the "text_style" key.
    """
    # Copy so the returned dictionary is always a new object.
    summary = dict(super().to_dict())
    summary["text_style"] = asdict(self.style)
    return summary


@dataclass(frozen=True)
class TextStyle:
Expand Down
Empty file removed tests/e2e/__init__.py
Empty file.
86 changes: 0 additions & 86 deletions tests/e2e/_metrics.py

This file was deleted.

Loading

0 comments on commit 83663d8

Please sign in to comment.