Merge pull request #213 from xavctn/edges

Line detection + Surya OCR
xavctn · Sep 1, 2024 · 8eb9ca9 · 8eb9ca9
2 parents cd944cf + 023aeec
commit 8eb9ca9
Show file tree

Hide file tree

Showing 109 changed files with 2,143 additions and 1,723 deletions.
diff --git a/.github/workflows/test_workflow.yml b/.github/workflows/test_workflow.yml
@@ -22,3 +22,5 @@ jobs:
       run: make venv
     - name: Perform tests
       run: make test
+      env:
+        NUMBA_DISABLE_JIT: 1
diff --git a/.gitignore b/.gitignore
@@ -9,4 +9,5 @@ build
 
 certs
 venv
-profiling*
+profiling*
+examples/testing
diff --git a/README.md b/README.md
@@ -26,6 +26,7 @@ The library can be installed via pip:
 > <code>pip install img2table</code>: Standard installation, supporting Tesseract<br>
 > <code>pip install img2table[paddle]</code>: For usage with Paddle OCR<br>
 > <code>pip install img2table[easyocr]</code>: For usage with EasyOCR<br>
+> <code>pip install img2table[surya]</code>: For usage with Surya OCR<br>
 > <code>pip install img2table[gcp]</code>: For usage with Google Vision OCR<br>
 > <code>pip install img2table[aws]</code>: For usage with AWS Textract OCR<br>
 > <code>pip install img2table[azure]</code>: For usage with Azure Cognitive Services OCR
@@ -34,7 +35,7 @@ The library can be installed via pip:
 
 * Table identification for images and PDF files, including bounding boxes at the table cell level
 * Handling of complex table structures such as merged cells
-* Handling of implicit rows - see [example](/examples/Implicit_rows.ipynb)
+* Handling of implicit content - see [example](/examples/Implicit.ipynb)
 * Table content extraction by providing support for OCR services / tools
 * Extracted tables are returned as a simple object, including a Pandas DataFrame representation
 * Export extracted tables to an Excel file, preserving their original structure
@@ -257,6 +258,30 @@ ocr = DocTR(detect_language=False,
 </details>
 
 
+<details>
+<summary>Surya OCR<a name="surya"></a></summary>
+<br>
+
+<b><i>Only available for <code>python >= 3.10</code></i></b><br>
+<a href="https://github.com/VikParuchuri/surya">Surya</a> is an open-source OCR based on Deep Learning models.<br>
+At first use, relevant models will be downloaded.
+
+```python
+from img2table.ocr import SuryaOCR
+
+ocr = SuryaOCR(langs=["en"])
+```
+
+> <h4>Parameters</h4>
+><dl>
+>    <dt>langs : list, optional, default <code>["en"]</code></dt>
+>    <dd style="font-style: italic;">List of languages used by Surya OCR for text extraction</dd>
+></dl>
+
+<br>
+</details>
+
+
 <details>
 <summary>Google Vision<a name="vision"></a></summary>
 <br>
@@ -353,6 +378,7 @@ doc = Image(src)
 # Table extraction
 extracted_tables = doc.extract_tables(ocr=ocr,
                                       implicit_rows=False,
+                                      implicit_columns=False,
                                       borderless_tables=False,
                                       min_confidence=50)
 ```
@@ -361,7 +387,9 @@ extracted_tables = doc.extract_tables(ocr=ocr,
 >    <dt>ocr : OCRInstance, optional, default <code>None</code></dt>
 >    <dd style="font-style: italic;">OCR instance used to parse document text. If None, cells content will not be extracted</dd>
 >    <dt>implicit_rows : bool, optional, default <code>False</code></dt>
->    <dd style="font-style: italic;">Boolean indicating if implicit rows should be identified - check related <a href="/examples/Implicit_rows.ipynb" target="_self">example</a></dd>
+>    <dd style="font-style: italic;">Boolean indicating if implicit rows should be identified - check related <a href="/examples/Implicit.ipynb" target="_self">example</a></dd>
+>    <dt>implicit_columns : bool, optional, default <code>False</code></dt>
+>    <dd style="font-style: italic;">Boolean indicating if implicit columns should be identified - check related <a href="/examples/Implicit.ipynb" target="_self">example</a></dd>
 >    <dt>borderless_tables : bool, optional, default <code>False</code></dt>
 >    <dd style="font-style: italic;">Boolean indicating if <a href="/examples/borderless.ipynb" target="_self">borderless tables</a> are extracted <b>on top of</b> bordered tables.</dd>
 >    <dt>min_confidence : int, optional, default <code>50</code></dt>
@@ -440,6 +468,7 @@ doc = Image(src)
 doc.to_xlsx(dest=dest,
             ocr=ocr,
             implicit_rows=False,
+            implicit_columns=False,
             borderless_tables=False,
             min_confidence=50)
 ```
@@ -450,7 +479,9 @@ doc.to_xlsx(dest=dest,
 >    <dt>ocr : OCRInstance, optional, default <code>None</code></dt>
 >    <dd style="font-style: italic;">OCR instance used to parse document text. If None, cells content will not be extracted</dd>
 >    <dt>implicit_rows : bool, optional, default <code>False</code></dt>
->    <dd style="font-style: italic;">Boolean indicating if implicit rows should be identified - check related <a href="/examples/Implicit_rows.ipynb" target="_self">example</a></dd>
+>    <dd style="font-style: italic;">Boolean indicating if implicit rows should be identified - check related <a href="/examples/Implicit.ipynb" target="_self">example</a></dd>
+>    <dt>implicit_columns : bool, optional, default <code>False</code></dt>
+>    <dd style="font-style: italic;">Boolean indicating if implicit columns should be identified - check related <a href="/examples/Implicit.ipynb" target="_self">example</a></dd>
 >    <dt>borderless_tables : bool, optional, default <code>False</code></dt>
 >    <dd style="font-style: italic;">Boolean indicating if <a href="/examples/borderless.ipynb" target="_self">borderless tables</a> are extracted. It requires to provide an OCR to the method in order to be performed - <b>feature in alpha version</b></dd>
 >    <dt>min_confidence : int, optional, default <code>50</code></dt>
@@ -472,8 +503,8 @@ Several Jupyter notebooks with examples are available :
 <a href="/examples/borderless.ipynb" target="_self">Borderless tables</a>: specific examples dedicated to the extraction of borderless tables
 </li>
 <li>
-<a href="/examples/Implicit_rows.ipynb" target="_self">Implicit rows</a>: illustrated effect 
-of the parameter <code>implicit_rows</code> of the <code>extract_tables</code> method
+<a href="/examples/Implicit.ipynb" target="_self">Implicit content</a>: illustrated effect 
+of the parameters <code>implicit_rows</code>/<code>implicit_columns</code> of the <code>extract_tables</code> method
 </li>
 </ul>
 
@@ -490,7 +521,6 @@ Effectiveness can not be guaranteed on other type of documents.
 </li>
 <li>
 Table detection using only OpenCV processing can have some limitations. If the library fails to detect tables, 
-you may check CNN based solutions like <a href="https://github.com/DevashishPrasad/CascadeTabNet">CascadeTabNet</a> or 
-the <a href="https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/ppstructure/docs/quickstart_en.md#224-table-recognition">PaddleOCR implementation</a>.
+you may check CNN-based solutions.
 </li>
 </ul>
diff --git a/examples/Basic_usage.ipynb b/examples/Basic_usage.ipynb
diff --git a/examples/Implicit_rows.ipynb → examples/Implicit.ipynb b/examples/Implicit_rows.ipynb → examples/Implicit.ipynb
@@ -18,7 +18,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "time: 594 ms (started: 2023-12-16 17:34:34 +01:00)\n"
+      "time: 750 ms (started: 2024-07-29 18:58:41 +02:00)\n"
      ]
     }
    ],
@@ -39,7 +39,9 @@
    "source": [
     "The <code>implicit_rows</code> parameter is used to split existing rows into smaller ones if:\n",
     "1. The row contains multi-line cells\n",
-    "2. Vertical separation between elements of the cell is large enough"
+    "2. Vertical separation between elements of the cell is large enough\n",
+    "\n",
+    "The same principle is applied at the column level using the <code>implicit_columns</code> parameter."
    ]
   },
   {
@@ -72,7 +74,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "time: 32 ms (started: 2023-12-16 17:34:36 +01:00)\n"
+      "time: 31 ms (started: 2024-07-29 18:58:43 +02:00)\n"
      ]
     }
    ],
@@ -98,7 +100,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "time: 94 ms (started: 2023-12-16 17:34:38 +01:00)\n"
+      "time: 94 ms (started: 2024-07-29 18:58:45 +02:00)\n"
      ]
     }
    ],
@@ -128,17 +130,17 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "time: 594 ms (started: 2023-12-16 17:34:41 +01:00)\n"
+      "time: 1.34 s (started: 2024-07-29 18:58:46 +02:00)\n"
      ]
     }
    ],
    "source": [
     "# Extract tables without implicit rows\n",
-    "extracted_tables = img.extract_tables(ocr=ocr, implicit_rows=False)\n",
+    "extracted_tables = img.extract_tables(ocr=ocr, implicit_rows=False, implicit_columns=False)\n",
     "table = extracted_tables.pop()\n",
     "\n",
     "# Extract tables with implicit rows\n",
-    "extracted_tables_implicit = img.extract_tables(ocr=ocr, implicit_rows=True)\n",
+    "extracted_tables_implicit = img.extract_tables(ocr=ocr, implicit_rows=True, implicit_columns=False)\n",
     "table_implicit_rows = extracted_tables_implicit.pop()"
    ]
   },
@@ -154,7 +156,7 @@
        "<h3 style=\"text-align: center\">Regular table</h3>\n",
        "                   <p style=\"text-align: center\">\n",
        "                       <b>Title:</b> No title detected<br>\n",
-       "                       <b>Bounding box:</b> x1=64, y1=12, x2=562, y2=314\n",
+       "                       <b>Bounding box:</b> x1=64, y1=13, x2=562, y2=315\n",
        "                   </p>\n",
        "                   <div align=\"center\"><table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
@@ -200,7 +202,7 @@
        "<h3 style=\"text-align: center\">Table with implicit rows</h3>\n",
        "                   <p style=\"text-align: center\">\n",
        "                       <b>Title:</b> No title detected<br>\n",
-       "                       <b>Bounding box:</b> x1=64, y1=12, x2=562, y2=314\n",
+       "                       <b>Bounding box:</b> x1=64, y1=13, x2=562, y2=315\n",
        "                   </p>\n",
        "                   <div align=\"center\"><table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
@@ -254,7 +256,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "time: 0 ns (started: 2023-12-16 17:34:46 +01:00)\n"
+      "time: 16 ms (started: 2024-07-29 18:58:51 +02:00)\n"
      ]
     }
    ],
@@ -280,7 +282,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.8.18"
   }
  },
  "nbformat": 4,