From cb9f821c97e1a5105de1327734e0b3c95cff3ef9 Mon Sep 17 00:00:00 2001 From: SeisSerenata Date: Thu, 5 Dec 2024 15:37:37 +0000 Subject: [PATCH 1/2] feat: add batch api --- README.md | 13 +++++ any_parser/any_parser.py | 6 ++ any_parser/batch_parser.py | 113 +++++++++++++++++++++++++++++++++++++ tests/test_batch_api.py | 37 ++++++++++++ 4 files changed, 169 insertions(+) create mode 100644 any_parser/batch_parser.py create mode 100644 tests/test_batch_api.py diff --git a/README.md b/README.md index 7d547e6..f5d4c14 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,19 @@ file_id = ap.async_parse(file_path="./data/test.pdf") markdown = ap.async_fetch(file_id=file_id) ``` +### 5. Run Batch Extraction (Beta) +For batch extraction, send the file to begin processing and fetch results later: +```python +# Send the file to begin batch extraction +response = ap.batches.create(file_path="./data/test.pdf") +request_id = response.requestId + +# Fetch the extracted content using the request ID +markdown = ap.batches.retrieve(request_id) +``` + +> ⚠️ **Note:** Batch extraction is currently in beta testing. Processing time may take up to 12 hours to complete. + ## :scroll: Examples Check out these examples to see how you can utilize **AnyParser** to extract text, numbers, and symbols in fewer than 10 lines of code! 
diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index ac52c92..3960b14 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -9,6 +9,7 @@ import requests from any_parser.async_parser import AsyncParser +from any_parser.batch_parser import BatchParser from any_parser.constants import ProcessType from any_parser.sync_parser import ( ExtractKeyValueSyncParser, @@ -20,6 +21,10 @@ from any_parser.utils import validate_file_inputs PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com" +# TODO: Update this to the correct batch endpoint +PUBLIC_BATCH_BASE_URL = ( + "http://AnyPar-ApiCo-cuKOBXasmUF1-1986145995.us-west-2.elb.amazonaws.com" +) TIMEOUT = 60 @@ -133,6 +138,7 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None ) self._sync_extract_pii = ExtractPIISyncParser(api_key, base_url) self._sync_extract_tables = ExtractTablesSyncParser(api_key, base_url) + self.batches = BatchParser(api_key, PUBLIC_BATCH_BASE_URL) @handle_file_processing def parse( diff --git a/any_parser/batch_parser.py b/any_parser/batch_parser.py new file mode 100644 index 0000000..7713184 --- /dev/null +++ b/any_parser/batch_parser.py @@ -0,0 +1,113 @@ +"""Batch parser implementation.""" + +from typing import List, Optional + +import requests +from pydantic import BaseModel, Field + +from any_parser.base_parser import BaseParser + +TIMEOUT = 60 + + +class UploadResponse(BaseModel): + fileName: str + requestId: str + requestStatus: str + + +class UsageResponse(BaseModel): + pageLimit: int + pageRemaining: int + + +class FileStatusResponse(BaseModel): + fileName: str + fileType: str + requestId: str + requestStatus: str + uploadTime: str + completionTime: Optional[str] = None + result: Optional[List[str]] = Field(default_factory=list) + error: Optional[List[str]] = Field(default_factory=list) + + +class BatchParser(BaseParser): + def __init__(self, api_key: str, base_url: str) -> None: + super().__init__(api_key, base_url) + 
self._file_upload_url = f"{self._base_url}/files/" + self._processing_status_url = f"{self._base_url}/files/" + "{request_id}" + self._usage_url = f"{self._base_url}/users/current/usage" + + # remove "Content-Type" from headers + self._headers.pop("Content-Type") + + def create(self, file_path: str) -> UploadResponse: + """Upload a single file for batch processing. + + Args: + file_path: Path to the file to upload + + Returns: + UploadResponse object containing upload details + """ + with open(file_path, "rb") as f: + files = {"file": f} + response = requests.post( + self._file_upload_url, + headers=self._headers, + files=files, + timeout=TIMEOUT, + ) + print(response.json()) + + if response.status_code != 200: + raise Exception(f"Upload failed: {response.text}") + + data = response.json() + return UploadResponse( + fileName=data["fileName"], + requestId=data["requestId"], + requestStatus=data["requestStatus"], + ) + + def retrieve(self, request_id: str) -> FileStatusResponse: + """Get the processing status of a file. + + Args: + request_id: The ID of the file processing request + + Returns: + FileStatusResponse object containing status details + """ + response = requests.get( + self._processing_status_url.format(request_id=request_id), + headers=self._headers, + timeout=TIMEOUT, + ) + + if response.status_code != 200: + raise Exception(f"Status check failed: {response.text}") + + data = response.json() + return FileStatusResponse(**data) + + def get_usage(self) -> UsageResponse: + """Get current usage information. 
+ + Returns: + UsageResponse object containing usage details + """ + response = requests.get( + self._usage_url, + headers=self._headers, + timeout=TIMEOUT, + ) + + if response.status_code != 200: + raise Exception(f"Usage check failed: {response.text}") + + data = response.json() + return UsageResponse( + pageLimit=data["pageLimit"], pageRemaining=data["pageRemaining"] + ) diff --git a/tests/test_batch_api.py b/tests/test_batch_api.py new file mode 100644 index 0000000..9d2f766 --- /dev/null +++ b/tests/test_batch_api.py @@ -0,0 +1,37 @@ +"""Testing Batch API Extraction""" + +import os +import sys +import unittest + +from dotenv import load_dotenv + +sys.path.append(".") +load_dotenv(override=True) +from any_parser import AnyParser # noqa: E402 + + +class TestAnyParserBatchAPI(unittest.TestCase): + """Testing Any Parser Batch API""" + + def setUp(self): + self.api_key = os.environ.get("CAMBIO_API_KEY") + if not self.api_key: + raise ValueError("CAMBIO_API_KEY is not set") + self.ap = AnyParser(self.api_key) + + def test_batch_api_create(self): + """Batch API Create""" + working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" + + response = self.ap.batches.create(working_file) + + self.assertIsNotNone(response) + self.assertEqual(response.requestStatus, "UPLOADED") + + request_id = response.requestId + status = self.ap.batches.retrieve(request_id) + self.assertEqual(status.requestStatus, "UPLOADED") + + quota = self.ap.batches.get_usage() + self.assertGreaterEqual(quota.pageRemaining, 0) From 190843b2eb5390889274b8a8a576f8068ad4a1bf Mon Sep 17 00:00:00 2001 From: Rachel Hu Date: Sun, 8 Dec 2024 21:28:46 +0800 Subject: [PATCH 2/2] Resolve github action missing pydantic for build failure. 
--- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7043d6d..f23084e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" python = ">=3.9,<3.13" requests = "^2.25.0" python-dotenv = "^1.0.0" +pydantic = "^2.10.3" [tool.poetry.group.dev.dependencies] Levenshtein = [