From a70dc0cfc56d709b756a32e03dbffd29dfd0af8c Mon Sep 17 00:00:00 2001 From: Charles Yuan Date: Thu, 2 Jan 2025 13:58:17 +0800 Subject: [PATCH 1/4] add notebook --- .gitignore | 3 +- README.md | 1 + examples/parse_batch_fetch.ipynb | 206 ++++++++++++++++++++++++++++++ examples/parse_batch_fetch.py | 4 +- examples/parse_batch_upload.ipynb | 205 +++++++++++++++++++++++++++++ 5 files changed, 416 insertions(+), 3 deletions(-) create mode 100644 examples/parse_batch_fetch.ipynb create mode 100644 examples/parse_batch_upload.ipynb diff --git a/.gitignore b/.gitignore index 3ced46f..9da19ae 100644 --- a/.gitignore +++ b/.gitignore @@ -167,4 +167,5 @@ cython_debug/ # data/ *.xlsx -*.csv \ No newline at end of file +*.csv +*.jsonl \ No newline at end of file diff --git a/README.md b/README.md index 3131c25..0cd4e83 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ Each response in the JSONL file contains: - The filename - A unique request ID - Additional processing metadata + You can later use these request IDs to retrieve the extracted content for each file: ```python diff --git a/examples/parse_batch_fetch.ipynb b/examples/parse_batch_fetch.ipynb new file mode 100644 index 0000000..bfe0b35 --- /dev/null +++ b/examples/parse_batch_fetch.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Batch API folder fetch response Example\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import logging\n", + "import os\n", + "from concurrent.futures import ThreadPoolExecutor, as_completed\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "from any_parser import AnyParser\n", + "\n", + "# Configure logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "# Load environment variables\n", + "load_dotenv(override=True)\n", + "\n", + "MAX_WORKER = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Get API key and create parser\n", + "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", + "if not api_key:\n", + " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", + "ap = AnyParser(api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read responses from JSONL file" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Change to your real output json from parse_batch_upload.py\n", + "response_file = \"./sample_data_20250102103047.jsonl\"\n", + "with open(response_file, \"r\") as f:\n", + " responses = [json.loads(line) for line in f]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Process responses concurrently\n", + "def process_response(response):\n", + " \"\"\"Process a single response by retrieving markdown content\"\"\"\n", + " request_id = response[\"requestId\"]\n", + " try:\n", + " markdown = ap.batches.retrieve(request_id)\n", + " if markdown:\n", + " response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n", + " response[\"requestStatus\"] = \"COMPLETED\"\n", + " response[\"completionTime\"] = markdown.completionTime\n", + " except Exception as e:\n", + " logger.error(f\"Error processing {request_id}: {str(e)}\")\n", + " response[\"error\"] = [str(e)]\n", + " return response" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n" + ] + } + ], + "source": [ + "# Process responses concurrently\n", + "with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n", + " future_to_response = {\n", + " executor.submit(process_response, response): response\n", + " for response in responses\n", + " }\n", + "\n", + " updated_responses = []\n", + " for future in as_completed(future_to_response):\n", + " updated_response = future.result()\n", + " updated_responses.append(updated_response)\n", + "\n", + "# Write all updated responses back to file\n", + "with open(response_file, \"w\") as f:\n", + " for response in updated_responses:\n", + " f.write(json.dumps(response) + \"\\n\")\n", + "\n", + "print(f\"Updated all responses in {response_file} with markdown content\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print out the first row from the updated file" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First row from updated file:\n", + "{\n", + " \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n", + " \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n", + " \"requestStatus\": \"COMPLETED\",\n", + " \"result\": [\n", + " \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n", + " ],\n", + " \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n", + "}\n" + ] + } + ], + "source": [ + "# Read and print first row from the updated file\n", + "with open(response_file, \"r\") as f:\n", + " first_row = json.loads(f.readline())\n", + " print(\"First row from updated file:\")\n", + " print(json.dumps(first_row, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "any-parse", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/parse_batch_fetch.py b/examples/parse_batch_fetch.py index 4704c64..0825009 100644 --- a/examples/parse_batch_fetch.py +++ b/examples/parse_batch_fetch.py @@ -26,7 +26,7 @@ # Read responses from JSONL file # Change to your real output json from parse_batch_upload.py -response_file = "./sample_data_20241219190049.jsonl" +response_file = "./sample_data_20250102103047.jsonl" with open(response_file, "r") as f: responses = [json.loads(line) for line in f] @@ -36,7 +36,7 @@ def process_response(response): request_id = response["requestId"] try: markdown = ap.batches.retrieve(request_id) - if markdown: + if markdown: # TODO: add status check here response["result"] = [markdown.result[0] if markdown.result else ""] response["requestStatus"] = "COMPLETED" response["completionTime"] = markdown.completionTime diff --git a/examples/parse_batch_upload.ipynb b/examples/parse_batch_upload.ipynb new file mode 100644 index 0000000..6e29234 --- /dev/null +++ b/examples/parse_batch_upload.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Batch API Folder Processing Upload Example" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from datetime import datetime\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "from any_parser import AnyParser" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables\n", + "load_dotenv(override=True)\n", + "\n", + "# Get API key and create parser\n", + "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", + "if not api_key:\n", + " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", + "ap = AnyParser(api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Batch Request" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upload responses saved to: ./sample_data_20250102134950.jsonl\n" + ] + } + ], + "source": [ + "# Upload folder for batch processing\n", + "WORKING_FOLDER = \"./sample_data\"\n", + "responses = ap.batches.create(WORKING_FOLDER)\n", + "\n", + "# Save responses to JSONL file with timestamp\n", + "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", + "output_file = f\"./sample_data_{timestamp}.jsonl\"\n", + "\n", + "with open(output_file, \"w\") as f:\n", + " for response in responses:\n", + " f.write(json.dumps(response.model_dump()) + \"\\n\")\n", + "\n", + "print(f\"Upload responses saved to: {output_file}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the first element status in the jsonl using the requestId" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking status for file: test3.pdf\n", + "Content not yet available\n" + ] + } + ], + "source": [ + "# Get first response from the JSONL file\n", + "with open(output_file, \"r\") as f:\n", + " first_response = json.loads(f.readline())\n", + "\n", + "request_id = first_response[\"requestId\"]\n", + "print(f\"Checking status for file: {first_response['fileName']}\")\n", + "\n", + "# Retrieve status using request ID\n", + "markdown = ap.batches.retrieve(request_id)\n", + "if markdown and markdown.result:\n", + " print(\"Content retrieved successfully\")\n", + "else:\n", + " print(\"Content not yet available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After 2 hours, you can check the content of the first file in the folder again" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content retrieved successfully\n" + ] + } + ], + "source": [ + "# Retrieve status using request ID\n", + "markdown = ap.batches.retrieve(request_id)\n", + "if markdown and markdown.result:\n", + " print(\"Content retrieved successfully\")\n", + "else:\n", + " print(\"Content not yet available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the job is completed, refer to examples/parse_batch_fetch.ipynb to fetch all responses in the jsonl file:\n", + "\n", + "https://github.com/CambioML/any-parser/blob/main/examples/parse_batch_fetch.ipynb\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0fd710282f385ca64a5dc5c8ca033d7f2d820896 Mon Sep 17 00:00:00 2001 From: Charles Yuan Date: Thu, 2 Jan 2025 14:04:56 +0800 Subject: [PATCH 2/4] fix fetch status check --- examples/parse_batch_fetch.ipynb | 2 +- examples/parse_batch_fetch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/parse_batch_fetch.ipynb b/examples/parse_batch_fetch.ipynb index bfe0b35..a649334 100644 --- a/examples/parse_batch_fetch.ipynb +++ b/examples/parse_batch_fetch.ipynb @@ -87,7 +87,7 @@ " request_id = response[\"requestId\"]\n", " try:\n", " markdown = ap.batches.retrieve(request_id)\n", - " if markdown:\n", + " if markdown and markdown.result:\n", " response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n", " response[\"requestStatus\"] = \"COMPLETED\"\n", " response[\"completionTime\"] = markdown.completionTime\n", diff --git a/examples/parse_batch_fetch.py b/examples/parse_batch_fetch.py index 0825009..7ec20fd 100644 --- a/examples/parse_batch_fetch.py +++ b/examples/parse_batch_fetch.py @@ -36,7 +36,7 @@ def process_response(response): request_id = response["requestId"] try: markdown = ap.batches.retrieve(request_id) - if markdown: # TODO: add status check here + if markdown and markdown.result: response["result"] = [markdown.result[0] if markdown.result else ""] response["requestStatus"] = "COMPLETED" response["completionTime"] = markdown.completionTime From dabc7719cf4d3d605e0d12050f46d272150a2faa Mon Sep 17 00:00:00 2001 From: Charles Yuan Date: Fri, 3 Jan 2025 00:21:35 +0800 Subject: [PATCH 3/4] delete dup --- examples/parse_batch_fetch.py | 65 ---------------------------------- examples/parse_batch_upload.py | 32 ----------------- 2 files changed, 97 deletions(-) delete mode 100644 examples/parse_batch_fetch.py delete mode 100644 examples/parse_batch_upload.py diff --git a/examples/parse_batch_fetch.py b/examples/parse_batch_fetch.py deleted file mode 100644 index 7ec20fd..0000000 --- a/examples/parse_batch_fetch.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Test batch API folder fetch response""" - -import json -import logging -import os -from concurrent.futures import ThreadPoolExecutor, as_completed - -from dotenv import load_dotenv - -from any_parser import AnyParser - -# Configure logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# Load environment variables -load_dotenv(override=True) - -MAX_WORKER = 10 - -# Get API key and create parser -api_key = os.environ.get("CAMBIO_API_KEY") -if not api_key: - raise ValueError("CAMBIO_API_KEY is not set") -ap = AnyParser(api_key) - -# Read responses from JSONL file -# Change to your real output json from parse_batch_upload.py -response_file = "./sample_data_20250102103047.jsonl" -with open(response_file, "r") as f: - responses = [json.loads(line) for line in f] - - -def process_response(response): - """Process a single response by retrieving markdown content""" - request_id = response["requestId"] - try: - markdown = ap.batches.retrieve(request_id) - if markdown and markdown.result: - response["result"] = [markdown.result[0] if markdown.result else ""] - response["requestStatus"] = "COMPLETED" - response["completionTime"] = markdown.completionTime - except Exception as e: - logger.error(f"Error processing {request_id}: {str(e)}") - response["error"] = [str(e)] - return response - - -# Process responses concurrently -with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor: - future_to_response = { - executor.submit(process_response, response): response for response in responses - } - - updated_responses = [] - for future in as_completed(future_to_response): - updated_response = future.result() - updated_responses.append(updated_response) - -# Write all updated responses back to file -with open(response_file, "w") as f: - for response in updated_responses: - f.write(json.dumps(response) + "\n") - -print(f"Updated all responses in {response_file} with markdown content") diff --git a/examples/parse_batch_upload.py b/examples/parse_batch_upload.py deleted file mode 100644 index d9f4cc4..0000000 --- a/examples/parse_batch_upload.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Batch API Folder Processing Upload Example""" - -import json -import os -from datetime import datetime - -from dotenv import load_dotenv - -from any_parser import AnyParser - -# Load environment variables -load_dotenv(override=True) - -# Get API key and create parser -api_key = os.environ.get("CAMBIO_API_KEY") -if not api_key: - raise ValueError("CAMBIO_API_KEY is not set") -ap = AnyParser(api_key) - -# Upload folder for batch processing -WORKING_FOLDER = "./sample_data" -responses = ap.batches.create(WORKING_FOLDER) - -# Save responses to JSONL file with timestamp -timestamp = datetime.now().strftime("%Y%m%d%H%M%S") -output_file = f"./sample_data_{timestamp}.jsonl" - -with open(output_file, "w") as f: - for response in responses: - f.write(json.dumps(response.model_dump()) + "\n") - -print(f"Upload responses saved to: {output_file}") From 79f38cd08ad0df8ae86dd3170e7d651d1e185216 Mon Sep 17 00:00:00 2001 From: Charles Yuan Date: Fri, 3 Jan 2025 00:37:36 +0800 Subject: [PATCH 4/4] combine two notebooks --- ...atch_fetch.ipynb => parse_batch_api.ipynb} | 158 +++++++++++++- examples/parse_batch_upload.ipynb | 205 ------------------ 2 files changed, 156 insertions(+), 207 deletions(-) rename examples/{parse_batch_fetch.ipynb => parse_batch_api.ipynb} (59%) delete mode 100644 examples/parse_batch_upload.ipynb diff --git a/examples/parse_batch_fetch.ipynb b/examples/parse_batch_api.ipynb similarity index 59% rename from examples/parse_batch_fetch.ipynb rename to examples/parse_batch_api.ipynb index a649334..e5a83f7 100644 --- a/examples/parse_batch_fetch.ipynb +++ b/examples/parse_batch_api.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Batch API folder fetch response Example\n" + "# Anyparser Batch API Example" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -18,6 +18,160 @@ "# !pip3 install --upgrade any-parser" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step1: Batch API Folder Processing Upload" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from datetime import datetime\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "from any_parser import AnyParser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables\n", + "load_dotenv(override=True)\n", + "\n", + "# Get API key and create parser\n", + "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", + "if not api_key:\n", + " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", + "ap = AnyParser(api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Batch Request" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upload responses saved to: ./sample_data_20250103003352.jsonl\n" + ] + } + ], + "source": [ + "# Upload folder for batch processing\n", + "WORKING_FOLDER = \"./sample_data\"\n", + "responses = ap.batches.create(WORKING_FOLDER)\n", + "\n", + "# Save responses to JSONL file with timestamp\n", + "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", + "output_file = f\"./sample_data_{timestamp}.jsonl\"\n", + "\n", + "with open(output_file, \"w\") as f:\n", + " for response in responses:\n", + " f.write(json.dumps(response.model_dump()) + \"\\n\")\n", + "\n", + "print(f\"Upload responses saved to: {output_file}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the first element status in the jsonl using the requestId" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking status for file: Earnings-Presentation-Q2-2024.pdf\n", + "Content not yet available\n" + ] + } + ], + "source": [ + "# Get first response from the JSONL file\n", + "with open(output_file, \"r\") as f:\n", + " first_response = json.loads(f.readline())\n", + "\n", + "request_id = first_response[\"requestId\"]\n", + "print(f\"Checking status for file: {first_response['fileName']}\")\n", + "\n", + "# Retrieve status using request ID\n", + "markdown = ap.batches.retrieve(request_id)\n", + "if markdown and markdown.result:\n", + " print(\"Content retrieved successfully\")\n", + "else:\n", + " print(\"Content not yet available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After 2 hours, you can check the content of the first file in the folder again" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content retrieved successfully\n" + ] + } + ], + "source": [ + "# Retrieve status using request ID\n", + "markdown = ap.batches.retrieve(request_id)\n", + "if markdown and markdown.result:\n", + " print(\"Content retrieved successfully\")\n", + "else:\n", + " print(\"Content not yet available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step2: Batch API folder fetch response\n" + ] + }, { "cell_type": "code", "execution_count": 16, diff --git a/examples/parse_batch_upload.ipynb b/examples/parse_batch_upload.ipynb deleted file mode 100644 index 6e29234..0000000 --- a/examples/parse_batch_upload.ipynb +++ /dev/null @@ -1,205 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Batch API Folder Processing Upload Example" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Install the libraries (ipython is used for displaying markdown in this demo)\n", - "# !pip3 install --upgrade ipython\n", - "# !pip3 install --upgrade any-parser" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import os\n", - "from datetime import datetime\n", - "\n", - "from dotenv import load_dotenv\n", - "\n", - "from any_parser import AnyParser" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Load environment variables\n", - "load_dotenv(override=True)\n", - "\n", - "# Get API key and create parser\n", - "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", - "if not api_key:\n", - " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", - "ap = AnyParser(api_key)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create Batch Request" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Upload responses saved to: ./sample_data_20250102134950.jsonl\n" - ] - } - ], - "source": [ - "# Upload folder for batch processing\n", - "WORKING_FOLDER = \"./sample_data\"\n", - "responses = ap.batches.create(WORKING_FOLDER)\n", - "\n", - "# Save responses to JSONL file with timestamp\n", - "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", - "output_file = f\"./sample_data_{timestamp}.jsonl\"\n", - "\n", - "with open(output_file, \"w\") as f:\n", - " for response in responses:\n", - " f.write(json.dumps(response.model_dump()) + \"\\n\")\n", - "\n", - "print(f\"Upload responses saved to: {output_file}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check the first element status in the jsonl using the requestId" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checking status for file: test3.pdf\n", - "Content not yet available\n" - ] - } - ], - "source": [ - "# Get first response from the JSONL file\n", - "with open(output_file, \"r\") as f:\n", - " first_response = json.loads(f.readline())\n", - "\n", - "request_id = first_response[\"requestId\"]\n", - "print(f\"Checking status for file: {first_response['fileName']}\")\n", - "\n", - "# Retrieve status using request ID\n", - "markdown = ap.batches.retrieve(request_id)\n", - "if markdown and markdown.result:\n", - " print(\"Content retrieved successfully\")\n", - "else:\n", - " print(\"Content not yet available\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After 2 hours, you can check the content of the first file in the folder again" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Content retrieved successfully\n" - ] - } - ], - "source": [ - "# Retrieve status using request ID\n", - "markdown = ap.batches.retrieve(request_id)\n", - "if markdown and markdown.result:\n", - " print(\"Content retrieved successfully\")\n", - "else:\n", - " print(\"Content not yet available\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After the job is completed, refer to examples/parse_batch_fetch.ipynb to fetch all responses in the jsonl file:\n", - "\n", - "https://github.com/CambioML/any-parser/blob/main/examples/parse_batch_fetch.ipynb\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## End of the notebook\n", - "\n", - "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", - "\n", - "\n", - " \n", - "" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}