From 522040500647ef47a11019614f39caffbfb09295 Mon Sep 17 00:00:00 2001 From: SeisSerenata Date: Tue, 10 Dec 2024 17:47:06 +0000 Subject: [PATCH] chore: address issues in code review --- README.md | 2 ++ any_parser/any_parser.py | 14 ++++++++------ any_parser/batch_parser.py | 17 ++++++++++++++++- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f5d4c14..145d0cf 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,8 @@ markdown = ap.batches.retrieve(request_id) ``` > ⚠️ **Note:** Batch extraction is currently in beta testing. Processing time may take up to 12 hours to complete. +> +> ⚠️ **Important:** API keys generated from cambioml.com do not automatically have batch processing permissions. Please contact info@cambioml.com to request batch processing access for your API key. ## :scroll: Examples Check out these examples to see how you can utilize **AnyParser** to extract text, numbers, and symbols in fewer than 10 lines of code! diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index 3960b14..97bc6e0 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -21,10 +21,7 @@ from any_parser.utils import validate_file_inputs PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com" -# TODO: Update this to the correct batch endpoint -PUBLIC_BATCH_BASE_URL = ( - "http://AnyPar-ApiCo-cuKOBXasmUF1-1986145995.us-west-2.elb.amazonaws.com" -) +PUBLIC_BATCH_BASE_URL = "http://batch-api.cambio-ai.com" TIMEOUT = 60 @@ -123,7 +120,12 @@ class AnyParser: extracting information from different types of files. """ - def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None: + def __init__( + self, + api_key: str, + base_url: str = PUBLIC_SHARED_BASE_URL, + batch_url: str = PUBLIC_BATCH_BASE_URL, + ) -> None: """Initialize AnyParser with API credentials. Args: @@ -138,7 +140,7 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None ) self._sync_extract_pii = ExtractPIISyncParser(api_key, base_url) self._sync_extract_tables = ExtractTablesSyncParser(api_key, base_url) - self.batches = BatchParser(api_key, PUBLIC_BATCH_BASE_URL) + self.batches = BatchParser(api_key, batch_url) @handle_file_processing def parse( diff --git a/any_parser/batch_parser.py b/any_parser/batch_parser.py index 7713184..5d79238 100644 --- a/any_parser/batch_parser.py +++ b/any_parser/batch_parser.py @@ -1,5 +1,6 @@ """Batch parser implementation.""" +import os from typing import List, Optional import requests @@ -11,17 +12,29 @@ class UploadResponse(BaseModel): + """ + Response from the batch upload endpoint. + """ + fileName: str requestId: str requestStatus: str class UsageResponse(BaseModel): + """ + Response from the batch usage endpoint. + """ + pageLimit: int pageRemaining: int class FileStatusResponse(BaseModel): + """ + Response from the batch file status endpoint. + """ + fileName: str fileType: str requestId: str @@ -51,6 +64,9 @@ def create(self, file_path: str) -> UploadResponse: Returns: FileUploadResponse object containing upload details """ + if not os.path.isfile(file_path): + raise FileNotFoundError(f"The file path '{file_path}' does not exist.") + with open(file_path, "rb") as f: files = {"file": f} response = requests.post( @@ -59,7 +75,6 @@ def create(self, file_path: str) -> UploadResponse: files=files, timeout=TIMEOUT, ) - print(response.json()) if response.status_code != 200: raise Exception(f"Upload failed: {response.text}")