Skip to content

Commit

Permalink
move postprocess to cdk with jobParam and polish notebook
Browse files: browse the repository at this point in the history
  • Loading branch information
jojortz committed Aug 28, 2024
1 parent 27f208e commit a4df149
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 450 deletions.
20 changes: 8 additions & 12 deletions any_parser/base.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,6 @@

import requests

from any_parser.postprocessors import run_convert

CAMBIO_UPLOAD_URL = "https://jnrsqrod4j.execute-api.us-west-2.amazonaws.com/v1/upload"
CAMBIO_REQUEST_URL = "https://jnrsqrod4j.execute-api.us-west-2.amazonaws.com/v1/request"
Expand Down Expand Up @@ -46,29 +45,25 @@ def extract(self, file_path):
result = self._request_file_extraction(user_id, file_id)
return result

def parse(
def extract_table(
self,
file_path,
parse_type="table",
output_format="HTML",
prompt="",
mode="advanced",
):
parse_type = parse_type.upper()
if parse_type not in ["TABLE"]:
raise ValueError(
"Invalid parse_type. Currently, only 'table' is supported."
)

output_format = output_format.upper()
if output_format not in ["HTML", "JSON", "CSV"]:
raise ValueError(
"Invalid output_format. Expected 'HTML', 'JSON', or 'CSV'."
)

user_id, file_id = self._request_and_upload_by_apiKey(file_path)
result = self._request_info_extraction(user_id, file_id)
return run_convert(result, output_format)
job_params = {
"output_format": output_format,
}
result = self._request_info_extraction(user_id, file_id, job_params)
return result

def _error_handler(self, response):
if response.status_code == 403:
Expand Down Expand Up @@ -124,11 +119,12 @@ def _request_file_extraction(self, user_id, file_id):

self._error_handler(response)

def _request_info_extraction(self, user_id, file_id):
def _request_info_extraction(self, user_id, file_id, job_params=None):

payload = {
"files": [{"sourceType": "s3", "fileId": file_id}],
"jobType": "info_extraction",
"jobParams": job_params,
}
response = requests.post(
self._requesturl, headers=self._request_header, json=payload
Expand Down
57 changes: 0 additions & 57 deletions any_parser/postprocessors.py

This file was deleted.

Loading

0 comments on commit a4df149

Please sign in to comment.