Skip to content

Commit

Permalink
Merge pull request #77 from CambioML/1219
Browse files Browse the repository at this point in the history
Add notebook example for Batch API
  • Loading branch information
lingjiekong authored Jan 2, 2025
2 parents efe1558 + 79f38cd commit ca4ac5f
Show file tree
Hide file tree
Showing 5 changed files with 363 additions and 98 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,4 +167,5 @@ cython_debug/

# data/
*.xlsx
*.csv
*.csv
*.jsonl
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ Each response in the JSONL file contains:
- The filename
- A unique request ID
- Additional processing metadata

You can later use these request IDs to retrieve the extracted content for each file:

```python
Expand Down
360 changes: 360 additions & 0 deletions examples/parse_batch_api.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,360 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Anyparser Batch API Example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install the libraries (ipython is used for displaying markdown in this demo)\n",
"# !pip3 install --upgrade ipython\n",
"# !pip3 install --upgrade any-parser"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step1: Batch API Folder Processing Upload"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"from datetime import datetime\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"from any_parser import AnyParser"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables\n",
"load_dotenv(override=True)\n",
"\n",
"# Get API key and create parser\n",
"api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
"if not api_key:\n",
" raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
"ap = AnyParser(api_key)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create Batch Request"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Upload responses saved to: ./sample_data_20250103003352.jsonl\n"
]
}
],
"source": [
"# Upload folder for batch processing\n",
"WORKING_FOLDER = \"./sample_data\"\n",
"responses = ap.batches.create(WORKING_FOLDER)\n",
"\n",
"# Save responses to JSONL file with timestamp\n",
"timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
"output_file = f\"./sample_data_{timestamp}.jsonl\"\n",
"\n",
"with open(output_file, \"w\") as f:\n",
" for response in responses:\n",
" f.write(json.dumps(response.model_dump()) + \"\\n\")\n",
"\n",
"print(f\"Upload responses saved to: {output_file}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check the first element status in the jsonl using the requestId"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking status for file: Earnings-Presentation-Q2-2024.pdf\n",
"Content not yet available\n"
]
}
],
"source": [
"# Get first response from the JSONL file\n",
"with open(output_file, \"r\") as f:\n",
" first_response = json.loads(f.readline())\n",
"\n",
"request_id = first_response[\"requestId\"]\n",
"print(f\"Checking status for file: {first_response['fileName']}\")\n",
"\n",
"# Retrieve status using request ID\n",
"markdown = ap.batches.retrieve(request_id)\n",
"if markdown and markdown.result:\n",
" print(\"Content retrieved successfully\")\n",
"else:\n",
" print(\"Content not yet available\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After 2 hours, you can check the content of the first file in the folder again"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Content retrieved successfully\n"
]
}
],
"source": [
"# Retrieve status using request ID\n",
"markdown = ap.batches.retrieve(request_id)\n",
"if markdown and markdown.result:\n",
" print(\"Content retrieved successfully\")\n",
"else:\n",
" print(\"Content not yet available\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step2: Batch API folder fetch response\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import logging\n",
"import os\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"from any_parser import AnyParser\n",
"\n",
"# Configure logging\n",
"logging.basicConfig(level=logging.INFO)\n",
"logger = logging.getLogger(__name__)\n",
"\n",
"# Load environment variables\n",
"load_dotenv(override=True)\n",
"\n",
"MAX_WORKER = 10"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Get API key and create parser\n",
"api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
"if not api_key:\n",
" raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
"ap = AnyParser(api_key)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read responses from JSONL file"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Change to your real output json from parse_batch_upload.py\n",
"response_file = \"./sample_data_20250102103047.jsonl\"\n",
"with open(response_file, \"r\") as f:\n",
" responses = [json.loads(line) for line in f]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Process responses concurrently\n",
"def process_response(response):\n",
" \"\"\"Process a single response by retrieving markdown content\"\"\"\n",
" request_id = response[\"requestId\"]\n",
" try:\n",
" markdown = ap.batches.retrieve(request_id)\n",
" if markdown and markdown.result:\n",
" response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n",
" response[\"requestStatus\"] = \"COMPLETED\"\n",
" response[\"completionTime\"] = markdown.completionTime\n",
" except Exception as e:\n",
" logger.error(f\"Error processing {request_id}: {str(e)}\")\n",
" response[\"error\"] = [str(e)]\n",
" return response"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n"
]
}
],
"source": [
"# Process responses concurrently\n",
"with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n",
" future_to_response = {\n",
" executor.submit(process_response, response): response\n",
" for response in responses\n",
" }\n",
"\n",
" updated_responses = []\n",
" for future in as_completed(future_to_response):\n",
" updated_response = future.result()\n",
" updated_responses.append(updated_response)\n",
"\n",
"# Write all updated responses back to file\n",
"with open(response_file, \"w\") as f:\n",
" for response in updated_responses:\n",
" f.write(json.dumps(response) + \"\\n\")\n",
"\n",
"print(f\"Updated all responses in {response_file} with markdown content\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Print out the first row from the updated file"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"First row from updated file:\n",
"{\n",
" \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n",
" \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n",
" \"requestStatus\": \"COMPLETED\",\n",
" \"result\": [\n",
" \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n",
" ],\n",
" \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n",
"}\n"
]
}
],
"source": [
"# Read and print first row from the updated file\n",
"with open(response_file, \"r\") as f:\n",
" first_row = json.loads(f.readline())\n",
" print(\"First row from updated file:\")\n",
" print(json.dumps(first_row, indent=2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## End of the notebook\n",
"\n",
"Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n",
"\n",
"<a href=\"https://www.cambioml.com/\" title=\"Title\">\n",
" <img src=\"./sample_data/cambioml_logo_large.png\" style=\"height: 100px; display: block; margin-left: auto; margin-right: auto;\"/>\n",
"</a>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "any-parse",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit ca4ac5f

Please sign in to comment.