Skip to content

Commit

Permalink
[feat] add back parse with output to JSON and CSV + example notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
jojortz committed Aug 27, 2024
1 parent 909ff3b commit b2d1d11
Show file tree
Hide file tree
Showing 6 changed files with 756 additions and 1 deletion.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,5 @@ Are you an AI engineer who need to ACCURATELY extract both the text and its layo
### [Extract a Table from an Image into Markdown Format](https://github.com/CambioML/any-parser/blob/main/examples/extract_table_from_image_to_markdown.ipynb)
Are you a financial analyst who needs to extract ACCURATE numbers from a table in an image or a PDF? Check out this notebook (3-min read)!
### [Extract all tables from PDF into HTML, JSON, and CSV Format](https://github.com/CambioML/any-parser/blob/main/examples/extract_tables_from_pdf.ipynb)
Are you an AI engineer who needs to ACCURATELY extract tables from a PDF? Check out this notebook demo (3-min read)!
42 changes: 41 additions & 1 deletion any_parser/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import time
import requests
from datetime import datetime, timedelta
from any_parser.postprocessors import run_convert

import requests

CAMBIO_UPLOAD_URL = "https://jnrsqrod4j.execute-api.us-west-2.amazonaws.com/v1/upload"
CAMBIO_REQUEST_URL = "https://jnrsqrod4j.execute-api.us-west-2.amazonaws.com/v1/request"
Expand Down Expand Up @@ -44,6 +45,20 @@ def extract(self, file_path):
result = self._request_file_extraction(user_id, file_id)
return result

def parse(self, file_path, parse_type="table", output_format="HTML", prompt="", mode="advanced"):
    """Upload a file, run table extraction, and convert the result.

    Args:
        file_path: Path to the local file to upload for parsing.
        parse_type: Kind of content to parse; only "table" is supported
            (case-insensitive).
        output_format: Target format, one of "HTML", "JSON", or "CSV"
            (case-insensitive).
        prompt: Currently unused; reserved for future use.
        mode: Currently unused; reserved for future use.

    Returns:
        The extraction result post-processed by ``run_convert``:
        unchanged for "HTML", converted per table for "JSON"/"CSV".

    Raises:
        ValueError: If ``parse_type`` or ``output_format`` is unsupported.
    """
    # Normalize case so callers may pass e.g. "table" or "Table".
    if parse_type.upper() != "TABLE":
        raise ValueError("Invalid parse_type. Currently, only 'table' is supported.")

    output_format = output_format.upper()
    if output_format not in ("HTML", "JSON", "CSV"):
        raise ValueError("Invalid output_format. Expected 'HTML', 'JSON', or 'CSV'.")

    user_id, file_id = self._request_and_upload_by_apiKey(file_path)
    result = self._request_info_extraction(user_id, file_id)
    return run_convert(result, output_format)


def _error_handler(self, response):
if response.status_code == 403:
raise Exception("Invalid API Key")
Expand Down Expand Up @@ -97,3 +112,28 @@ def _request_file_extraction(self, user_id, file_id):
return query_response.json()

self._error_handler(response)

def _request_info_extraction(self, user_id, file_id):
    """Submit an info_extraction job for an uploaded file and fetch its result.

    Posts the job request to the request endpoint; on HTTP 200, polls the
    job result via ``query_result`` and returns the decoded JSON payload.
    Non-200 responses are delegated to ``_error_handler``.
    """
    job_request = {
        "files": [{"sourceType": "s3", "fileId": file_id}],
        "jobType": "info_extraction",
    }
    response = requests.post(
        self._requesturl, headers=self._request_header, json=job_request
    )

    # Guard clause: hand any non-success status to the shared error handler.
    if response.status_code != 200:
        self._error_handler(response)
        return

    result_query = {
        "userId": user_id,
        "jobId": response.json().get("jobId"),
        "queryType": "job_result",
    }
    return self.query_result(result_query).json()

53 changes: 53 additions & 0 deletions any_parser/postprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from bs4 import BeautifulSoup
import io
import csv

def run_convert(result, output_format):
    """Post-process a list of HTML table strings into *output_format*.

    "JSON" and "CSV" convert each table individually; any other format
    (e.g. "HTML") returns *result* unchanged.
    """
    if output_format == "JSON":
        return [_html_table_string_to_json(table_html) for table_html in result]
    if output_format == "CSV":
        return [_html_table_to_csv(table_html) for table_html in result]
    return result

def _html_table_string_to_json(html_string: str):
    """Convert the first HTML ``<table>`` in *html_string* to a list of dicts.

    The first row supplies the keys (from its ``<th>`` or ``<td>`` cells);
    each subsequent row becomes one dict mapping those keys to the row's
    ``<td>`` text. Cells beyond the header count are dropped by ``zip``.

    Args:
        html_string: HTML markup containing at least one ``<table>``.

    Returns:
        A list of per-row dicts; empty if the table has no rows.

    Raises:
        ValueError: If no ``<table>`` element is present.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    table = soup.find('table')

    if not table:
        raise ValueError('No table found in the provided HTML string.')

    rows = table.find_all('tr')
    # Guard against an empty <table>: rows[0] below would raise IndexError.
    if not rows:
        return []

    headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]

    result = []
    for row in rows[1:]:
        cells = row.find_all('td')
        result.append(
            {header: cell.get_text(strip=True) for header, cell in zip(headers, cells)}
        )

    return result

def _html_table_to_csv(html_string: str) -> str:
    """Render the first HTML ``<table>`` in *html_string* as a CSV string.

    Every ``<tr>`` becomes one CSV row; both ``<th>`` and ``<td>`` cells
    contribute their stripped text. Raises ValueError when no table exists.
    """
    parsed = BeautifulSoup(html_string, 'html.parser')
    table = parsed.find('table')

    if not table:
        raise ValueError('No table found in the provided HTML string.')

    buffer = io.StringIO()
    writer = csv.writer(buffer, quoting=csv.QUOTE_MINIMAL)
    writer.writerows(
        [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
        for tr in table.find_all('tr')
    )
    return buffer.getvalue()
659 changes: 659 additions & 0 deletions examples/extract_tables_from_pdf.ipynb

Large diffs are not rendered by default.

Binary file not shown.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ readme = "README.md"
python = ">=3.8,<3.13"
requests = "^2.25.0"
python-dotenv = "^1.0.0"
beautifulsoup4 = "^4.12.3"

[build-system]
requires = ["poetry-core"]
Expand Down

0 comments on commit b2d1d11

Please sign in to comment.