-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #77 from CambioML/1219
Add notebook example for Batch API
- Loading branch information
Showing
5 changed files
with
363 additions
and
98 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -167,4 +167,5 @@ cython_debug/ | |
|
||
# data/ | ||
*.xlsx | ||
*.csv | ||
*.csv | ||
*.jsonl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,360 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Anyparser Batch API Example" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Install the libraries (ipython is used for displaying markdown in this demo)\n", | ||
"# !pip3 install --upgrade ipython\n", | ||
"# !pip3 install --upgrade any-parser" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Step1: Batch API Folder Processing Upload" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import json\n", | ||
"import os\n", | ||
"from datetime import datetime\n", | ||
"\n", | ||
"from dotenv import load_dotenv\n", | ||
"\n", | ||
"from any_parser import AnyParser" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load environment variables\n", | ||
"load_dotenv(override=True)\n", | ||
"\n", | ||
"# Get API key and create parser\n", | ||
"api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", | ||
"if not api_key:\n", | ||
" raise ValueError(\"CAMBIO_API_KEY is not set\")\n", | ||
"ap = AnyParser(api_key)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Create Batch Request" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Upload responses saved to: ./sample_data_20250103003352.jsonl\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Upload folder for batch processing\n", | ||
"WORKING_FOLDER = \"./sample_data\"\n", | ||
"responses = ap.batches.create(WORKING_FOLDER)\n", | ||
"\n", | ||
"# Save responses to JSONL file with timestamp\n", | ||
"timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", | ||
"output_file = f\"./sample_data_{timestamp}.jsonl\"\n", | ||
"\n", | ||
"with open(output_file, \"w\") as f:\n", | ||
" for response in responses:\n", | ||
" f.write(json.dumps(response.model_dump()) + \"\\n\")\n", | ||
"\n", | ||
"print(f\"Upload responses saved to: {output_file}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Check the first element status in the jsonl using the requestId" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Checking status for file: Earnings-Presentation-Q2-2024.pdf\n", | ||
"Content not yet available\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Get first response from the JSONL file\n", | ||
"with open(output_file, \"r\") as f:\n", | ||
" first_response = json.loads(f.readline())\n", | ||
"\n", | ||
"request_id = first_response[\"requestId\"]\n", | ||
"print(f\"Checking status for file: {first_response['fileName']}\")\n", | ||
"\n", | ||
"# Retrieve status using request ID\n", | ||
"markdown = ap.batches.retrieve(request_id)\n", | ||
"if markdown and markdown.result:\n", | ||
" print(\"Content retrieved successfully\")\n", | ||
"else:\n", | ||
" print(\"Content not yet available\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete." | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"After 2 hours, you can check the content of the first file in the folder again" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Content retrieved successfully\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Retrieve status using request ID\n", | ||
"markdown = ap.batches.retrieve(request_id)\n", | ||
"if markdown and markdown.result:\n", | ||
" print(\"Content retrieved successfully\")\n", | ||
"else:\n", | ||
" print(\"Content not yet available\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Step2: Batch API folder fetch response\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 16, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import json\n", | ||
"import logging\n", | ||
"import os\n", | ||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n", | ||
"\n", | ||
"from dotenv import load_dotenv\n", | ||
"\n", | ||
"from any_parser import AnyParser\n", | ||
"\n", | ||
"# Configure logging\n", | ||
"logging.basicConfig(level=logging.INFO)\n", | ||
"logger = logging.getLogger(__name__)\n", | ||
"\n", | ||
"# Load environment variables\n", | ||
"load_dotenv(override=True)\n", | ||
"\n", | ||
"MAX_WORKER = 10" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Get API key and create parser\n", | ||
"api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", | ||
"if not api_key:\n", | ||
" raise ValueError(\"CAMBIO_API_KEY is not set\")\n", | ||
"ap = AnyParser(api_key)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Read responses from JSONL file" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Change to your real output json from parse_batch_upload.py\n", | ||
"response_file = \"./sample_data_20250102103047.jsonl\"\n", | ||
"with open(response_file, \"r\") as f:\n", | ||
" responses = [json.loads(line) for line in f]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Process responses concurrently\n", | ||
"def process_response(response):\n", | ||
" \"\"\"Process a single response by retrieving markdown content\"\"\"\n", | ||
" request_id = response[\"requestId\"]\n", | ||
" try:\n", | ||
" markdown = ap.batches.retrieve(request_id)\n", | ||
" if markdown and markdown.result:\n", | ||
" response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n", | ||
" response[\"requestStatus\"] = \"COMPLETED\"\n", | ||
" response[\"completionTime\"] = markdown.completionTime\n", | ||
" except Exception as e:\n", | ||
" logger.error(f\"Error processing {request_id}: {str(e)}\")\n", | ||
" response[\"error\"] = [str(e)]\n", | ||
" return response" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Process responses concurrently\n", | ||
"with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n", | ||
" future_to_response = {\n", | ||
" executor.submit(process_response, response): response\n", | ||
" for response in responses\n", | ||
" }\n", | ||
"\n", | ||
" updated_responses = []\n", | ||
" for future in as_completed(future_to_response):\n", | ||
" updated_response = future.result()\n", | ||
" updated_responses.append(updated_response)\n", | ||
"\n", | ||
"# Write all updated responses back to file\n", | ||
"with open(response_file, \"w\") as f:\n", | ||
" for response in updated_responses:\n", | ||
" f.write(json.dumps(response) + \"\\n\")\n", | ||
"\n", | ||
"print(f\"Updated all responses in {response_file} with markdown content\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Print out the first row from the updated file" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"First row from updated file:\n", | ||
"{\n", | ||
" \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n", | ||
" \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n", | ||
" \"requestStatus\": \"COMPLETED\",\n", | ||
" \"result\": [\n", | ||
" \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n", | ||
" ],\n", | ||
" \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n", | ||
"}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Read and print first row from the updated file\n", | ||
"with open(response_file, \"r\") as f:\n", | ||
" first_row = json.loads(f.readline())\n", | ||
" print(\"First row from updated file:\")\n", | ||
" print(json.dumps(first_row, indent=2))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## End of the notebook\n", | ||
"\n", | ||
"Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", | ||
"\n", | ||
"<a href=\"https://www.cambioml.com/\" title=\"Title\">\n", | ||
" <img src=\"./sample_data/cambioml_logo_large.png\" style=\"height: 100px; display: block; margin-left: auto; margin-right: auto;\"/>\n", | ||
"</a>" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "any-parse", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.15" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.