Merge pull request #77 from CambioML/1219

Add notebook example for Batch API
CambioML · Jan 2, 2025 · ca4ac5f · ca4ac5f
2 parents efe1558 + 79f38cd
commit ca4ac5f
Show file tree

Hide file tree

Showing 5 changed files with 363 additions and 98 deletions.
diff --git a/.gitignore b/.gitignore
@@ -167,4 +167,5 @@ cython_debug/
 
 # data/
 *.xlsx
-*.csv
+*.csv
+*.jsonl
diff --git a/README.md b/README.md
@@ -88,6 +88,7 @@ Each response in the JSONL file contains:
 - The filename
 - A unique request ID
 - Additional processing metadata
+
 You can later use these request IDs to retrieve the extracted content for each file:
 
 ```python

diff --git a/examples/parse_batch_api.ipynb b/examples/parse_batch_api.ipynb
@@ -0,0 +1,360 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Anyparser Batch API Example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
+    "# !pip3 install --upgrade ipython\n",
+    "# !pip3 install --upgrade any-parser"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step1: Batch API Folder Processing Upload"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from datetime import datetime\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "from any_parser import AnyParser"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load environment variables\n",
+    "load_dotenv(override=True)\n",
+    "\n",
+    "# Get API key and create parser\n",
+    "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
+    "if not api_key:\n",
+    "    raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
+    "ap = AnyParser(api_key)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create Batch Request"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Upload responses saved to: ./sample_data_20250103003352.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Upload folder for batch processing\n",
+    "WORKING_FOLDER = \"./sample_data\"\n",
+    "responses = ap.batches.create(WORKING_FOLDER)\n",
+    "\n",
+    "# Save responses to JSONL file with timestamp\n",
+    "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
+    "output_file = f\"./sample_data_{timestamp}.jsonl\"\n",
+    "\n",
+    "with open(output_file, \"w\") as f:\n",
+    "    for response in responses:\n",
+    "        f.write(json.dumps(response.model_dump()) + \"\\n\")\n",
+    "\n",
+    "print(f\"Upload responses saved to: {output_file}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check the first element status in the jsonl using the requestId"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking status for file: Earnings-Presentation-Q2-2024.pdf\n",
+      "Content not yet available\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get first response from the JSONL file\n",
+    "with open(output_file, \"r\") as f:\n",
+    "    first_response = json.loads(f.readline())\n",
+    "\n",
+    "request_id = first_response[\"requestId\"]\n",
+    "print(f\"Checking status for file: {first_response['fileName']}\")\n",
+    "\n",
+    "# Retrieve status using request ID\n",
+    "markdown = ap.batches.retrieve(request_id)\n",
+    "if markdown and markdown.result:\n",
+    "    print(\"Content retrieved successfully\")\n",
+    "else:\n",
+    "    print(\"Content not yet available\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After 2 hours, you can check the content of the first file in the folder again"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Content retrieved successfully\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Retrieve status using request ID\n",
+    "markdown = ap.batches.retrieve(request_id)\n",
+    "if markdown and markdown.result:\n",
+    "    print(\"Content retrieved successfully\")\n",
+    "else:\n",
+    "    print(\"Content not yet available\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step2: Batch API folder fetch response\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import logging\n",
+    "import os\n",
+    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "from any_parser import AnyParser\n",
+    "\n",
+    "# Configure logging\n",
+    "logging.basicConfig(level=logging.INFO)\n",
+    "logger = logging.getLogger(__name__)\n",
+    "\n",
+    "# Load environment variables\n",
+    "load_dotenv(override=True)\n",
+    "\n",
+    "MAX_WORKER = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get API key and create parser\n",
+    "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
+    "if not api_key:\n",
+    "    raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
+    "ap = AnyParser(api_key)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Read responses from JSONL file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change to your real output json from parse_batch_upload.py\n",
+    "response_file = \"./sample_data_20250102103047.jsonl\"\n",
+    "with open(response_file, \"r\") as f:\n",
+    "    responses = [json.loads(line) for line in f]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Process responses concurrently\n",
+    "def process_response(response):\n",
+    "    \"\"\"Process a single response by retrieving markdown content\"\"\"\n",
+    "    request_id = response[\"requestId\"]\n",
+    "    try:\n",
+    "        markdown = ap.batches.retrieve(request_id)\n",
+    "        if markdown and markdown.result:\n",
+    "            response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n",
+    "            response[\"requestStatus\"] = \"COMPLETED\"\n",
+    "            response[\"completionTime\"] = markdown.completionTime\n",
+    "    except Exception as e:\n",
+    "        logger.error(f\"Error processing {request_id}: {str(e)}\")\n",
+    "        response[\"error\"] = [str(e)]\n",
+    "    return response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Process responses concurrently\n",
+    "with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n",
+    "    future_to_response = {\n",
+    "        executor.submit(process_response, response): response\n",
+    "        for response in responses\n",
+    "    }\n",
+    "\n",
+    "    updated_responses = []\n",
+    "    for future in as_completed(future_to_response):\n",
+    "        updated_response = future.result()\n",
+    "        updated_responses.append(updated_response)\n",
+    "\n",
+    "# Write all updated responses back to file\n",
+    "with open(response_file, \"w\") as f:\n",
+    "    for response in updated_responses:\n",
+    "        f.write(json.dumps(response) + \"\\n\")\n",
+    "\n",
+    "print(f\"Updated all responses in {response_file} with markdown content\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Print out the first row from the updated file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First row from updated file:\n",
+      "{\n",
+      "  \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n",
+      "  \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n",
+      "  \"requestStatus\": \"COMPLETED\",\n",
+      "  \"result\": [\n",
+      "    \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n",
+      "  ],\n",
+      "  \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Read and print first row from the updated file\n",
+    "with open(response_file, \"r\") as f:\n",
+    "    first_row = json.loads(f.readline())\n",
+    "    print(\"First row from updated file:\")\n",
+    "    print(json.dumps(first_row, indent=2))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## End of the notebook\n",
+    "\n",
+    "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n",
+    "\n",
+    "<a href=\"https://www.cambioml.com/\" title=\"Title\">\n",
+    "    <img src=\"./sample_data/cambioml_logo_large.png\" style=\"height: 100px; display: block; margin-left: auto; margin-right: auto;\"/>\n",
+    "</a>"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "any-parse",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -167,4 +167,5 @@ cython_debug/ @@
     # data/
     *.xlsx
-    *.csv
+    *.csv
+    *.jsonl