Skip to content

Commit

Permalink
[feat] add back parse with output to JSON and CSV + example notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
jojortz committed Aug 27, 2024
1 parent 909ff3b commit b2d1d11
Show file tree
Hide file tree
Showing 6 changed files with 756 additions and 1 deletion.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,5 @@ Are you an AI engineer who need to ACCURATELY extract both the text and its layo
### [Extract a Table from an Image into Markdown Format](https://github.com/CambioML/any-parser/blob/main/examples/extract_table_from_image_to_markdown.ipynb)
Are you a financial analyst who needs to extract ACCURATE numbers from a table in an image or a PDF? Check out this notebook (3-min read)!
### [Extract all tables from PDF into HTML, JSON, and CSV Format](https://github.com/CambioML/any-parser/blob/main/examples/extract_tables_from_pdf.ipynb)
Are you an AI engineer who needs to ACCURATELY extract tables from a PDF? Check out this notebook demo (3-min read)!
42 changes: 41 additions & 1 deletion any_parser/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import time
import requests
from datetime import datetime, timedelta
from any_parser.postprocessors import run_convert

import requests

CAMBIO_UPLOAD_URL = "https://jnrsqrod4j.execute-api.us-west-2.amazonaws.com/v1/upload"
CAMBIO_REQUEST_URL = "https://jnrsqrod4j.execute-api.us-west-2.amazonaws.com/v1/request"
Expand Down Expand Up @@ -44,6 +45,20 @@ def extract(self, file_path):
result = self._request_file_extraction(user_id, file_id)
return result

def parse(self, file_path, parse_type="table", output_format="HTML", prompt="", mode="advanced"):
    """Upload a file, run table extraction, and convert the result.

    Args:
        file_path: Path to the local file to upload for parsing.
        parse_type: Kind of content to parse; only "table" is supported
            (case-insensitive).
        output_format: Target format, one of "HTML", "JSON", or "CSV"
            (case-insensitive).
        prompt: Currently unused; reserved for future use.
        mode: Currently unused; reserved for future use.

    Returns:
        The extraction result post-processed by ``run_convert``:
        unchanged for "HTML", converted per table for "JSON"/"CSV".

    Raises:
        ValueError: If ``parse_type`` or ``output_format`` is unsupported.
    """
    # Normalize case so callers may pass e.g. "table" or "Table".
    if parse_type.upper() != "TABLE":
        raise ValueError("Invalid parse_type. Currently, only 'table' is supported.")

    output_format = output_format.upper()
    if output_format not in ("HTML", "JSON", "CSV"):
        raise ValueError("Invalid output_format. Expected 'HTML', 'JSON', or 'CSV'.")

    user_id, file_id = self._request_and_upload_by_apiKey(file_path)
    result = self._request_info_extraction(user_id, file_id)
    return run_convert(result, output_format)


def _error_handler(self, response):
if response.status_code == 403:
raise Exception("Invalid API Key")
Expand Down Expand Up @@ -97,3 +112,28 @@ def _request_file_extraction(self, user_id, file_id):
return query_response.json()

self._error_handler(response)

def _request_info_extraction(self, user_id, file_id):
    """Submit an info_extraction job for an uploaded file and fetch its result.

    Posts the job request to the request endpoint; on HTTP 200, polls the
    job result via ``query_result`` and returns the decoded JSON payload.
    Non-200 responses are delegated to ``_error_handler``.
    """
    job_request = {
        "files": [{"sourceType": "s3", "fileId": file_id}],
        "jobType": "info_extraction",
    }
    response = requests.post(
        self._requesturl, headers=self._request_header, json=job_request
    )

    # Guard clause: hand any non-success status to the shared error handler.
    if response.status_code != 200:
        self._error_handler(response)
        return

    result_query = {
        "userId": user_id,
        "jobId": response.json().get("jobId"),
        "queryType": "job_result",
    }
    return self.query_result(result_query).json()

53 changes: 53 additions & 0 deletions any_parser/postprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from bs4 import BeautifulSoup
import io
import csv

def run_convert(result, output_format):
    """Post-process a list of HTML table strings into *output_format*.

    "JSON" and "CSV" convert each table individually; any other format
    (e.g. "HTML") returns *result* unchanged.
    """
    if output_format == "JSON":
        return [_html_table_string_to_json(table_html) for table_html in result]
    if output_format == "CSV":
        return [_html_table_to_csv(table_html) for table_html in result]
    return result

def _html_table_string_to_json(html_string: str):
    """Convert the first HTML ``<table>`` in *html_string* to a list of dicts.

    The first row supplies the keys (from its ``<th>`` or ``<td>`` cells);
    each subsequent row becomes one dict mapping those keys to the row's
    ``<td>`` text. Cells beyond the header count are dropped by ``zip``.

    Args:
        html_string: HTML markup containing at least one ``<table>``.

    Returns:
        A list of per-row dicts; empty if the table has no rows.

    Raises:
        ValueError: If no ``<table>`` element is present.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    table = soup.find('table')

    if not table:
        raise ValueError('No table found in the provided HTML string.')

    rows = table.find_all('tr')
    # Guard against an empty <table>: rows[0] below would raise IndexError.
    if not rows:
        return []

    headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]

    result = []
    for row in rows[1:]:
        cells = row.find_all('td')
        result.append(
            {header: cell.get_text(strip=True) for header, cell in zip(headers, cells)}
        )

    return result

def _html_table_to_csv(html_string: str) -> str:
    """Render the first HTML ``<table>`` in *html_string* as a CSV string.

    Every ``<tr>`` becomes one CSV row; both ``<th>`` and ``<td>`` cells
    contribute their stripped text. Raises ValueError when no table exists.
    """
    parsed = BeautifulSoup(html_string, 'html.parser')
    table = parsed.find('table')

    if not table:
        raise ValueError('No table found in the provided HTML string.')

    buffer = io.StringIO()
    writer = csv.writer(buffer, quoting=csv.QUOTE_MINIMAL)
    writer.writerows(
        [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
        for tr in table.find_all('tr')
    )
    return buffer.getvalue()
659 changes: 659 additions & 0 deletions examples/extract_tables_from_pdf.ipynb

Large diffs are not rendered by default.

Binary file not shown.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ readme = "README.md"
python = ">=3.8,<3.13"
requests = "^2.25.0"
python-dotenv = "^1.0.0"
beautifulsoup4 = "^4.12.3"

[build-system]
requires = ["poetry-core"]
Expand Down

0 comments on commit b2d1d11

Please sign in to comment.