Skip to content

Commit

Permalink
reformat with black
Browse files Browse the repository at this point in the history
  • Loading branch information
jojortz committed Aug 27, 2024
1 parent b2d1d11 commit 772d242
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 16 deletions.
19 changes: 14 additions & 5 deletions any_parser/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,30 @@ def extract(self, file_path):
result = self._request_file_extraction(user_id, file_id)
return result

def parse(self, file_path, parse_type="table", output_format="HTML", prompt="", mode="advanced"):
def parse(
self,
file_path,
parse_type="table",
output_format="HTML",
prompt="",
mode="advanced",
):
parse_type = parse_type.upper()
if parse_type not in ["TABLE"]:
raise ValueError("Invalid parse_type. Currently, only 'table' is supported.")
raise ValueError(
"Invalid parse_type. Currently, only 'table' is supported."
)

output_format = output_format.upper()
if output_format not in ["HTML", "JSON", "CSV"]:
raise ValueError("Invalid output_format. Expected 'HTML', 'JSON', or 'CSV'.")
raise ValueError(
"Invalid output_format. Expected 'HTML', 'JSON', or 'CSV'."
)

user_id, file_id = self._request_and_upload_by_apiKey(file_path)
result = self._request_info_extraction(user_id, file_id)
return run_convert(result, output_format)


def _error_handler(self, response):
if response.status_code == 403:
raise Exception("Invalid API Key")
Expand Down Expand Up @@ -136,4 +146,3 @@ def _request_info_extraction(self, user_id, file_id):
return query_response.json()

self._error_handler(response)

25 changes: 14 additions & 11 deletions any_parser/postprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import io
import csv


def run_convert(result, output_format):
if output_format == "JSON":
converter = _html_table_string_to_json
Expand All @@ -12,20 +13,21 @@ def run_convert(result, output_format):

return [converter(table) for table in result]


def _html_table_string_to_json(html_string: str):
soup = BeautifulSoup(html_string, 'html.parser')
table = soup.find('table')
soup = BeautifulSoup(html_string, "html.parser")
table = soup.find("table")

if not table:
raise ValueError('No table found in the provided HTML string.')
raise ValueError("No table found in the provided HTML string.")

rows = table.find_all('tr')
headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]
rows = table.find_all("tr")
headers = [cell.get_text(strip=True) for cell in rows[0].find_all(["th", "td"])]

result = []

for row in rows[1:]:
cells = row.find_all('td')
cells = row.find_all("td")
row_object = {}

for header, cell in zip(headers, cells):
Expand All @@ -35,19 +37,20 @@ def _html_table_string_to_json(html_string: str):

return result


def _html_table_to_csv(html_string: str) -> str:
    """Convert the first ``<table>`` in *html_string* to CSV text.

    Each ``<tr>`` becomes one CSV row; both header (``<th>``) and data
    (``<td>``) cells are emitted, with surrounding whitespace stripped
    from the cell text. Quoting is handled by ``csv.writer`` with
    ``QUOTE_MINIMAL``, so cell text containing commas stays intact.

    Args:
        html_string: An HTML fragment or document containing a table.

    Returns:
        The table serialized as a CSV string.

    Raises:
        ValueError: If the HTML contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html_string, "html.parser")
    table = soup.find("table")

    if not table:
        raise ValueError("No table found in the provided HTML string.")

    rows = table.find_all("tr")
    # Write into an in-memory buffer so csv handles quoting/escaping.
    output = io.StringIO()
    csv_writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)

    for row in rows:
        # Include both th and td so the header row is preserved.
        cells = row.find_all(["th", "td"])
        csv_writer.writerow([cell.get_text(strip=True) for cell in cells])

    return output.getvalue()

0 comments on commit 772d242

Please sign in to comment.