-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #74 from CambioML/1219
feat: Add input folder support for batch api
- Loading branch information
Showing
5 changed files
with
185 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,6 +76,27 @@ request_id = response.requestId | |
markdown = ap.batches.retrieve(request_id) | ||
``` | ||
|
||
Batch API for folder input: | ||
```python | ||
# Send the folder to begin batch extraction | ||
WORKING_FOLDER = "./sample_data" | ||
# This will generate a jsonl with filename and requestID | ||
response = ap.batches.create(WORKING_FOLDER) | ||
``` | ||
|
||
Each response in the JSONL file contains: | ||
- The filename | ||
- A unique request ID | ||
- Additional processing metadata | ||
You can later use these request IDs to retrieve the extracted content for each file: | ||
|
||
```python | ||
# Fetch the extracted content using the request ID from the jsonl file | ||
markdown = ap.batches.retrieve(request_id) | ||
``` | ||
For complete, runnable examples of the batch API, refer to | ||
[examples/parse_batch_upload.py](examples/parse_batch_upload.py) and [examples/parse_batch_fetch.py](examples/parse_batch_fetch.py) | ||
|
||
> ⚠️ **Note:** Batch extraction is currently in beta testing. Processing may take up to 12 hours to complete. | ||
> | ||
> ⚠️ **Important:** API keys generated from cambioml.com do not automatically have batch processing permissions. Please contact [email protected] to request batch processing access for your API key. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
"""Test batch API folder fetch response""" | ||
|
||
import json | ||
import logging | ||
import os | ||
from concurrent.futures import ThreadPoolExecutor, as_completed | ||
|
||
from dotenv import load_dotenv | ||
|
||
from any_parser import AnyParser | ||
|
||
# Emit INFO-level (and above) log records from this example script
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read settings from a .env file; override=True is passed so .env values
# take precedence over variables already present in the environment
load_dotenv(override=True)

# Size of the thread pool used when retrieving results
MAX_WORKER = 10

# Build the parser client, failing fast when no API key is configured
api_key = os.getenv("CAMBIO_API_KEY")
if not api_key:
    raise ValueError("CAMBIO_API_KEY is not set")
ap = AnyParser(api_key)
|
||
# Load the upload records, one JSON object per line.
# Change to your real output json from parse_batch_upload.py
response_file = "./sample_data_20241219190049.jsonl"
with open(response_file, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing empty line left by an editor) so that
    # json.loads is never handed an empty string, which raises JSONDecodeError.
    responses = [json.loads(line) for line in f if line.strip()]
|
||
|
||
def process_response(response):
    """Retrieve the batch result for one upload record and merge it in.

    Args:
        response: A dict loaded from the upload JSONL file; must contain a
            "requestId" key. It is updated in place.

    Returns:
        The same dict. On a truthy retrieval it gains "result",
        "requestStatus" and "completionTime"; if anything raises during
        retrieval it gains an "error" entry instead.
    """
    request_id = response["requestId"]
    try:
        retrieved = ap.batches.retrieve(request_id)
        if retrieved:
            # Keep only the first result entry, or an empty string when
            # the result list is empty/missing.
            first_result = retrieved.result[0] if retrieved.result else ""
            response["result"] = [first_result]
            response["requestStatus"] = "COMPLETED"
            response["completionTime"] = retrieved.completionTime
    except Exception as e:
        # Best-effort: record the failure on the record rather than aborting
        # the whole batch.
        logger.error(f"Error processing {request_id}: {str(e)}")
        response["error"] = [str(e)]
    return response
|
||
|
||
# Retrieve results concurrently. executor.map yields results in the same
# order as `responses`, so the rewritten file keeps the original record order
# instead of depending on which request happens to finish first.
with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:
    updated_responses = list(executor.map(process_response, responses))

# Overwrite the input file with the updated records, one JSON object per line
with open(response_file, "w") as f:
    f.writelines(json.dumps(response) + "\n" for response in updated_responses)

print(f"Updated all responses in {response_file} with markdown content")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
"""Batch API Folder Processing Upload Example""" | ||
|
||
import json | ||
import os | ||
from datetime import datetime | ||
|
||
from dotenv import load_dotenv | ||
|
||
from any_parser import AnyParser | ||
|
||
# Read settings from a .env file; override=True is passed so .env values
# take precedence over variables already present in the environment
load_dotenv(override=True)

# Build the parser client, failing fast when no API key is configured
api_key = os.getenv("CAMBIO_API_KEY")
if not api_key:
    raise ValueError("CAMBIO_API_KEY is not set")
ap = AnyParser(api_key)
|
||
# Submit the folder for batch processing
WORKING_FOLDER = "./sample_data"
responses = ap.batches.create(WORKING_FOLDER)

# Timestamped output name (local time, second resolution)
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
output_file = f"./sample_data_{timestamp}.jsonl"

# Write each response as one JSON object per line
with open(output_file, "w") as f:
    f.writelines(json.dumps(response.model_dump()) + "\n" for response in responses)

print(f"Upload responses saved to: {output_file}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters