test: cover abstract_elementwise_processing_step

alphanome-ai · Oct 11, 2023 · 1fc0b15 · 1fc0b15
1 parent 45f343d
commit 1fc0b15
Show file tree

Hide file tree

Showing 27 changed files with 526 additions and 176 deletions.
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -41,12 +41,12 @@ tasks:
     cmds:
       # Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
       # Note: also update .codecov.yml when changing the target coverage.
-      - poetry run ptw -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/
+      - poetry run ptw -- -- -s --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=60 tests/unit/
 
 
-###############################
+###########################
 ### Code Quality Checks ###
-###############################
+###########################
 
   lint:
     desc: Lint the code without auto-fixing issues.

diff --git a/docs/source/notebooks/comprehensive_developer_guide.ipynb b/docs/source/notebooks/comprehensive_developer_guide.ipynb
@@ -70,11 +70,17 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "### → We're using BeautifulSoup\n",
-                "Many SEC EDGAR filings are available in HTML document format. To ease the process of reading the documents, we will be using the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) (\"bs4\") library to parse an HTML document into a tree-like structure of HTML Tags (`bs4.Tag`).\n",
-                "\n",
-                "### → `HtmlTag` wraps `bs4.Tag`\n",
-                "Instead of interacting directly with `bs4.Tag`, the SEC EDGAR HTML Parser uses `HtmlTag`, a wrapper around `bs4.Tag`.\n"
+                "## We're using BeautifulSoup\n",
+                "Many SEC EDGAR filings are available in HTML document format. To ease the process of reading the documents, we will be using the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) (\"bs4\") library to parse an HTML document into a tree-like structure of HTML Tags (`bs4.Tag`)."
+            ]
+        },
+        {
+            "attachments": {},
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## `HtmlTag` is a wrapper of `bs4.Tag`\n",
+                "Instead of interacting directly with `bs4.Tag`, the SEC EDGAR HTML Parser uses `HtmlTag`, a wrapper around `bs4.Tag`."
             ]
         },
         {
@@ -122,7 +128,7 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "### → What is a Semantic Element?"
+                "## What is a Semantic Element?"
             ]
         },
         {
@@ -165,7 +171,7 @@
         },
         {
             "cell_type": "code",
-            "execution_count": 10,
+            "execution_count": 4,
             "metadata": {},
             "outputs": [
                 {
@@ -222,7 +228,7 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "### → `SemanticElement` wraps `HtmlTag`"
+                "## `SemanticElement` is a wrapper of `HtmlTag`"
             ]
         },
         {
@@ -238,7 +244,7 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "### → The Parsing Process"
+                "## The Parsing Process"
             ]
         },
         {
@@ -290,7 +296,7 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "Example 1: Using the default parsing pipeline:"
+                "**Example 1:** Using the default parsing pipeline:"
             ]
         },
         {
@@ -310,7 +316,7 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "Example 2: This is a trivial example to demonstrate how a parser without processing steps will just return \n",
+                "**Example 2:** This is a trivial example to demonstrate how a parser without processing steps will just return \n",
                 "the \"starting state\", which is each of the `HtmlTag` objects wrapped in an `UndeterminedElement` object."
             ]
         },
@@ -340,7 +346,7 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "Example 3: Advanced customization of the pipeline. Suppose `TableParsingStep` is a bottleneck for performance. In that case, you can easily remove it from the pipeline, or swap it out for a custom or inherited alternative. You can even write your own processing steps to have a completely custom parsing pipeline."
+                "**Example 3:** Advanced customization of the pipeline. Suppose `TableParsingStep` is a bottleneck for performance. In that case, you can easily remove it from the pipeline, or swap it out for a custom or inherited alternative. You can even write your own processing steps to have a completely custom parsing pipeline."
             ]
         },
         {
@@ -366,28 +372,142 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "<!-- ### → TODO: What if multiple semantic elements are in the same HTML tag? -->"
+                "## What if multiple Semantic Elements are in the same HTML tag?"
             ]
         },
         {
             "attachments": {},
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "<!-- ### → TODO: Plugin design -->"
+                "If multiple Semantic Elements are in the same HTML tag, a processing step splits them into separate elements and then wraps them in a `CompositeSemanticElement`."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 9,
+            "metadata": {},
+            "outputs": [
+                {
+                    "name": "stdout",
+                    "output_type": "stream",
+                    "text": [
+                        "\n",
+                        "    CompositeSemanticElement acts as a container that can encapsulate other\n",
+                        "    semantic elements.\n",
+                        "\n",
+                        "    This is used for handling special cases where a single HTML root\n",
+                        "    tag wraps multiple semantic elements. This maintains structural integrity\n",
+                        "    and allows for seamless reconstitution of the original HTML document.\n",
+                        "\n",
+                        "    Why is this useful:\n",
+                        "    ===================\n",
+                        "    1. Some semantic elements, like XBRL tags (<ix>), may wrap multiple semantic\n",
+                        "    elements. The container ensures that these relationships are not broken\n",
+                        "    during parsing.\n",
+                        "    2. Enables the parser to fully reconstruct the original HTML document, which\n",
+                        "    opens up possibilities for features like semantic segmentation visualization\n",
+                        "    (e.g. recreate the original document but put semi-transparent colored boxes\n",
+                        "    on top, based on semantic meaning), serialization of parsed documents into\n",
+                        "    an augmented HTML, and debugging by comparing to the original document.\n",
+                        "    \n"
+                    ]
+                }
+            ],
+            "source": [
+                "from sec_parser.semantic_elements import CompositeSemanticElement\n",
+                "\n",
+                "print(CompositeSemanticElement.__doc__)"
             ]
         },
         {
             "attachments": {},
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "### → Introduction to the SemanticTree"
+                "**Example 1:** This is an oversimplified example of a splitter processing step:"
             ]
         },
         {
             "cell_type": "code",
-            "execution_count": 9,
+            "execution_count": 10,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "from sec_parser.processing_steps import AbstractProcessingStep\n",
+                "from sec_parser.semantic_elements import AbstractSemanticElement\n",
+                "\n",
+                "\n",
+                "class SplitterProcessingStep(AbstractProcessingStep):\n",
+                "    def process(\n",
+                "        self, elements: list[AbstractSemanticElement]\n",
+                "    ) -> list[AbstractSemanticElement]:\n",
+                "        new_elements = []"
+            ]
+        },
+        {
+            "attachments": {},
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## `ProcessingStep` design"
+            ]
+        },
+        {
+            "attachments": {},
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Each `ProcessingStep` instance is created from scratch for each parsed document. This means that each `ProcessingStep` instance can have its own state, and can be used to store information about the document being parsed. This is useful for processing steps that need to keep track of information across multiple `HtmlTag` objects."
+            ]
+        },
+        {
+            "attachments": {},
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "**Example 1:** Counting the number of images in a document"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 11,
+            "metadata": {},
+            "outputs": [
+                {
+                    "data": {
+                        "text/plain": [
+                            "[UndeterminedElement<img>, UndeterminedElement<img>, UndeterminedElement<img>]"
+                        ]
+                    },
+                    "execution_count": 11,
+                    "metadata": {},
+                    "output_type": "execute_result"
+                }
+            ],
+            "source": [
+                "from sec_parser.processing_steps import AbstractProcessingStep\n",
+                "\n",
+                "\n",
+                "class CounterProcessingStep(AbstractProcessingStep):\n",
+                "    pass\n",
+                "\n",
+                "\n",
+                "parser = SecParser(get_steps=lambda: [])\n",
+                "parser.parse(\"<img><img><img>\")"
+            ]
+        },
+        {
+            "attachments": {},
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## Introduction to the SemanticTree"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 12,
             "metadata": {},
             "outputs": [
                 {

diff --git a/sec_parser/processing_engine/sec_parser.py b/sec_parser/processing_engine/sec_parser.py
@@ -26,6 +26,8 @@
     from sec_parser.processing_steps.abstract_processing_step import (
         AbstractProcessingStep,
     )
+
+    # pragma: no cover
     from sec_parser.semantic_elements.abstract_semantic_element import (
         AbstractSemanticElement,
     )
@@ -79,7 +81,7 @@ def parse(self, html: str) -> list[AbstractSemanticElement]:
         root_tags = self._html_tag_parser.parse(html)
 
         elements: list[AbstractSemanticElement] = [
-            UndeterminedElement(tag, inner_elements=[]) for tag in root_tags
+            UndeterminedElement(tag) for tag in root_tags
         ]
 
         for step in steps:

diff --git a/sec_parser/processing_steps/abstract_elementwise_processing_step.py b/sec_parser/processing_steps/abstract_elementwise_processing_step.py
@@ -4,11 +4,13 @@
 from dataclasses import dataclass
 from typing import Callable
 
-from sec_parser.exceptions import SecParserValueError
 from sec_parser.processing_steps.abstract_processing_step import AbstractProcessingStep
 from sec_parser.semantic_elements.abstract_semantic_element import (
     AbstractSemanticElement,
 )
+from sec_parser.semantic_elements.composite_semantic_element import (
+    CompositeSemanticElement,
+)
 
 ElementTransformer = Callable[[AbstractSemanticElement], AbstractSemanticElement]
 
@@ -26,7 +28,7 @@ class ElementwiseProcessingContext:
     is_root_element: bool
 
 
-class AbstractElementwiseProcessStep(AbstractProcessingStep):
+class AbstractElementwiseProcessingStep(AbstractProcessingStep):
     """
     `AbstractElementwiseProcessingStep` class is used to iterate over
     all Semantic Elements with or without applying transformations.
@@ -41,9 +43,6 @@ def __init__(
         super().__init__()
         self._types_to_process = types_to_process or set()
         self._types_to_exclude = types_to_exclude or set()
-        if self._types_to_process & self._types_to_exclude:
-            msg = "Processed types and excluded types should not overlap."
-            raise SecParserValueError(msg)
 
     def _process(
         self,
@@ -55,24 +54,27 @@ def _process(
             is_root_element=True,
         )
 
-        for i, input_element in enumerate(elements):
+        for i, e in enumerate(elements):
+            # avoids lint error "`element` overwritten by assignment target"
+            element = e
+
             if self._types_to_process and not any(
-                isinstance(input_element, t) for t in self._types_to_process
+                isinstance(element, t) for t in self._types_to_process
             ):
                 continue
-            if any(isinstance(input_element, t) for t in self._types_to_exclude):
+            if any(isinstance(element, t) for t in self._types_to_exclude):
                 continue
 
-            element = self._process_element(input_element, context)
-
-            if element.inner_elements:
+            if isinstance(element, CompositeSemanticElement):
                 child_context = ElementwiseProcessingContext(
                     is_root_element=False,
                 )
                 element.inner_elements = self._process(
                     element.inner_elements,
                     _context=child_context,
                 )
+            else:
+                element = self._process_element(element, context)
 
             elements[i] = element
 

diff --git a/sec_parser/processing_steps/footnote_and_bulletpoint_parsing_step.py b/sec_parser/processing_steps/footnote_and_bulletpoint_parsing_step.py
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING
 
 from sec_parser.processing_steps.abstract_elementwise_processing_step import (
-    AbstractElementwiseProcessStep,
+    AbstractElementwiseProcessingStep,
     ElementwiseProcessingContext,
 )
 from sec_parser.semantic_elements.semantic_elements import (
@@ -17,7 +17,7 @@
     )
 
 
-class FootnoteAndBulletpointParsingStep(AbstractElementwiseProcessStep):
+class FootnoteAndBulletpointParsingStep(AbstractElementwiseProcessingStep):
     """
     FootnoteAndBulletpointParsingStep class for transforming elements into
     BulletpointTextElement and FootnoteTextElement instances.
@@ -74,7 +74,6 @@ def _process_element(
             level = 1 + self._unique_markers_by_order.index(marker)
             return BulletpointTextElement(
                 element.html_tag,
-                element.inner_elements,
                 level=level,
             )
 

diff --git a/sec_parser/processing_steps/highlighted_text_parsing_step.py b/sec_parser/processing_steps/highlighted_text_parsing_step.py
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING
 
 from sec_parser.processing_steps.abstract_elementwise_processing_step import (
-    AbstractElementwiseProcessStep,
+    AbstractElementwiseProcessingStep,
     ElementwiseProcessingContext,
 )
 from sec_parser.semantic_elements.highlighted_text_element import (
@@ -17,7 +17,7 @@
     )
 
 
-class HighlightedTextParsingStep(AbstractElementwiseProcessStep):
+class HighlightedTextParsingStep(AbstractElementwiseProcessingStep):
     """
     HighlightedTextParsingStep class for transforming elements into HighlightedText instances.
 

diff --git a/sec_parser/processing_steps/image_parsing_step.py b/sec_parser/processing_steps/image_parsing_step.py
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING
 
 from sec_parser.processing_steps.abstract_elementwise_processing_step import (
-    AbstractElementwiseProcessStep,
+    AbstractElementwiseProcessingStep,
     ElementwiseProcessingContext,
 )
 from sec_parser.semantic_elements.semantic_elements import ImageElement
@@ -14,7 +14,7 @@
     )
 
 
-class ImageParsingStep(AbstractElementwiseProcessStep):
+class ImageParsingStep(AbstractElementwiseProcessingStep):
     """
     ImageParsingStep class for transforming elements into ImageElement instances.