diff --git a/examples/prompt_to_extract_table_from_pdf_to_json.ipynb b/examples/prompt_to_extract_table_from_pdf_to_json.ipynb
index 347735b..f2e2609 100644
--- a/examples/prompt_to_extract_table_from_pdf_to_json.ipynb
+++ b/examples/prompt_to_extract_table_from_pdf_to_json.ipynb
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -55,7 +55,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@@ -76,7 +76,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
@@ -99,43 +99,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'result': [{\"Employee's social security number\": '758-58-5787',\n",
- " 'Employer identification number (EIN)': '78-8778788',\n",
- " \"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n",
- " 'Control number': '9',\n",
- " \"Employee's first name and initial\": 'Jesan',\n",
- " 'Last name': 'Rahaman',\n",
- " \"State, Employer's state ID number\": 'AL',\n",
- " 'Wages, tips, etc.': '80000.00',\n",
- " 'Federal income tax withheld': '10368.00',\n",
- " 'Social security tax withheld': '4960.00',\n",
- " 'Medicare wages and tips': '80000.00',\n",
- " 'Medicare tax withheld': '1160.00'}],\n",
- " 'log': {'instruction': \"Return table in a JSON format with each box's key and value.\",\n",
- " 'source': '',\n",
- " 'usage': {'input_tokens': 1752, 'output_tokens': 226},\n",
- " 'source_log': None},\n",
- " 'page_num': 0}]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "qa_result"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
@@ -184,12 +148,12 @@
"
Jesan | \n",
" \n",
" \n",
- " Last name | \n",
+ " Employee's last name | \n",
" Rahaman | \n",
"
\n",
" \n",
" State, Employer's state ID number | \n",
- " AL | \n",
+ " AL 877878878 | \n",
"
\n",
" \n",
" Wages, tips, etc. | \n",
@@ -211,6 +175,14 @@
" Medicare tax withheld | \n",
" 1160.00 | \n",
"
\n",
+ " \n",
+ " State wages, tips, etc. | \n",
+ " 80000.00 | \n",
+ "
\n",
+ " \n",
+ " State income tax | \n",
+ " 3835.00 | \n",
+ "
\n",
" \n",
"\n",
""
@@ -222,22 +194,24 @@
"Employer's name, address, and ZIP code DesignNext\\nKatham Dorbosto, Kashiani, Gopalga...\n",
"Control number 9\n",
"Employee's first name and initial Jesan\n",
- "Last name Rahaman\n",
- "State, Employer's state ID number AL\n",
+ "Employee's last name Rahaman\n",
+ "State, Employer's state ID number AL 877878878\n",
"Wages, tips, etc. 80000.00\n",
"Federal income tax withheld 10368.00\n",
"Social security tax withheld 4960.00\n",
"Medicare wages and tips 80000.00\n",
- "Medicare tax withheld 1160.00"
+ "Medicare tax withheld 1160.00\n",
+ "State wages, tips, etc. 80000.00\n",
+ "State income tax 3835.00"
]
},
- "execution_count": 8,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "data = qa_result[0]['result']\n",
+ "data = qa_result[0]\n",
"keys = [list(item.keys()) for item in data][0]\n",
"values = [list(item.values()) for item in data][0]\n",
"\n",
@@ -277,7 +251,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.13"
+ "version": "3.10.14"
}
},
"nbformat": 4,
diff --git a/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb b/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb
index 915eaae..592ba0f 100644
--- a/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb
+++ b/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -55,7 +55,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -76,7 +76,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -99,52 +99,51 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[{\"a Employee's social security number\": '758-58-5787',\n",
- " 'b Employer identification number (EIN)': '78-8778788',\n",
- " \"c Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n",
- " 'd Control number': '9',\n",
- " \"e Employee's first name and initial\": 'Jesan',\n",
- " \"e Employee's last name\": 'Rahaman',\n",
- " \"f Employee's address and ZIP code\": 'AL 877878878',\n",
- " '1 Wages, tips, other compensation': '80000.00',\n",
- " '2 Federal income tax withheld': '10368.00',\n",
- " '3 Social security wages': '80000.00',\n",
- " '4 Social security tax withheld': '4960.00',\n",
- " '5 Medicare wages and tips': '80000.00',\n",
- " '6 Medicare tax withheld': '1160.00',\n",
- " '7 Social security tips': 'NA',\n",
- " '8 Allocated tips': 'NA',\n",
- " '10 Dependent care benefits': 'NA',\n",
- " '11 Nonqualified plans': 'NA',\n",
- " '13 Statutory Retroment employee plan': 'NA',\n",
- " '13 Third-party sick pay': 'NA',\n",
- " '14 Other': 'NA',\n",
- " \"15 State Employer's state ID number\": 'AL',\n",
- " '16 State wages, tips, etc.': '80000.00',\n",
- " '17 State income tax': '3835.00',\n",
- " '18 Local wages, tips, etc.': 'NA',\n",
- " '19 Local income tax': 'NA',\n",
- " '20 Locality name': 'NA'}]"
+ "[[{\"a Employee's social security number\": '758-58-5787',\n",
+ " 'b Employer identification number (EIN)': '78-8778788',\n",
+ " \"c Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n",
+ " 'd Control number': '9',\n",
+ " \"e Employee's first name and initial\": 'Jesan',\n",
+ " 'e Last name': 'Rahaman',\n",
+ " \"f Employee's address and ZIP code\": 'AL\\n877878878',\n",
+ " '1 Wages, tips, other compensation': '80000.00',\n",
+ " '2 Federal income tax withheld': '10368.00',\n",
+ " '3 Social security wages': '80000.00',\n",
+ " '4 Social security tax withheld': '4960.00',\n",
+ " '5 Medicare wages and tips': '80000.00',\n",
+ " '6 Medicare tax withheld': '1160.00',\n",
+ " '7 Social security tips': 'NA',\n",
+ " '8 Allocated tips': 'NA',\n",
+ " '10 Dependent care benefits': 'NA',\n",
+ " '11 Nonqualified plans': 'NA',\n",
+ " '13 Statutory Retroment employee Third-party sick pay plan': 'NA',\n",
+ " '14 Other': 'NA',\n",
+ " '15 State': 'AL',\n",
+ " '16 State wages, tips, etc.': '80000.00',\n",
+ " '17 State income tax': '3835.00',\n",
+ " '18 Local wages, tips, etc.': 'NA',\n",
+ " '19 Local income tax': 'NA',\n",
+ " '20 Locality name': 'NA'}]]"
]
},
- "execution_count": 11,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "qa_result[0]['result']"
+ "qa_result"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
@@ -193,12 +192,12 @@
" Jesan | \n",
" \n",
" \n",
- " e Employee's last name | \n",
+ " e Last name | \n",
" Rahaman | \n",
"
\n",
" \n",
" f Employee's address and ZIP code | \n",
- " AL 877878878 | \n",
+ " AL\\n877878878 | \n",
"
\n",
" \n",
" 1 Wages, tips, other compensation | \n",
@@ -241,11 +240,7 @@
" NA | \n",
"
\n",
" \n",
- " 13 Statutory Retroment employee plan | \n",
- " NA | \n",
- "
\n",
- " \n",
- " 13 Third-party sick pay | \n",
+ " 13 Statutory Retroment employee Third-party sick pay plan | \n",
" NA | \n",
"
\n",
" \n",
@@ -253,7 +248,7 @@
" NA | \n",
"
\n",
" \n",
- " 15 State Employer's state ID number | \n",
+ " 15 State | \n",
" AL | \n",
"
\n",
" \n",
@@ -281,42 +276,41 @@
""
],
"text/plain": [
- " Value\n",
- "a Employee's social security number 758-58-5787\n",
- "b Employer identification number (EIN) 78-8778788\n",
- "c Employer's name, address, and ZIP code DesignNext\\nKatham Dorbosto, Kashiani, Gopalga...\n",
- "d Control number 9\n",
- "e Employee's first name and initial Jesan\n",
- "e Employee's last name Rahaman\n",
- "f Employee's address and ZIP code AL 877878878\n",
- "1 Wages, tips, other compensation 80000.00\n",
- "2 Federal income tax withheld 10368.00\n",
- "3 Social security wages 80000.00\n",
- "4 Social security tax withheld 4960.00\n",
- "5 Medicare wages and tips 80000.00\n",
- "6 Medicare tax withheld 1160.00\n",
- "7 Social security tips NA\n",
- "8 Allocated tips NA\n",
- "10 Dependent care benefits NA\n",
- "11 Nonqualified plans NA\n",
- "13 Statutory Retroment employee plan NA\n",
- "13 Third-party sick pay NA\n",
- "14 Other NA\n",
- "15 State Employer's state ID number AL\n",
- "16 State wages, tips, etc. 80000.00\n",
- "17 State income tax 3835.00\n",
- "18 Local wages, tips, etc. NA\n",
- "19 Local income tax NA\n",
- "20 Locality name NA"
+ " Value\n",
+ "a Employee's social security number 758-58-5787\n",
+ "b Employer identification number (EIN) 78-8778788\n",
+ "c Employer's name, address, and ZIP code DesignNext\\nKatham Dorbosto, Kashiani, Gopalga...\n",
+ "d Control number 9\n",
+ "e Employee's first name and initial Jesan\n",
+ "e Last name Rahaman\n",
+ "f Employee's address and ZIP code AL\\n877878878\n",
+ "1 Wages, tips, other compensation 80000.00\n",
+ "2 Federal income tax withheld 10368.00\n",
+ "3 Social security wages 80000.00\n",
+ "4 Social security tax withheld 4960.00\n",
+ "5 Medicare wages and tips 80000.00\n",
+ "6 Medicare tax withheld 1160.00\n",
+ "7 Social security tips NA\n",
+ "8 Allocated tips NA\n",
+ "10 Dependent care benefits NA\n",
+ "11 Nonqualified plans NA\n",
+ "13 Statutory Retroment employee Third-party sic... NA\n",
+ "14 Other NA\n",
+ "15 State AL\n",
+ "16 State wages, tips, etc. 80000.00\n",
+ "17 State income tax 3835.00\n",
+ "18 Local wages, tips, etc. NA\n",
+ "19 Local income tax NA\n",
+ "20 Locality name NA"
]
},
- "execution_count": 10,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "data = qa_result[0]['result']\n",
+ "data = qa_result[0]\n",
"keys = [list(item.keys()) for item in data][0]\n",
"values = [list(item.values()) for item in data][0]\n",
"\n",
diff --git a/examples/prompt_to_extract_table_from_png_to_json_advanced_mode.ipynb b/examples/prompt_to_extract_table_from_png_to_json_advanced_mode.ipynb
new file mode 100644
index 0000000..bfa84fd
--- /dev/null
+++ b/examples/prompt_to_extract_table_from_png_to_json_advanced_mode.ipynb
@@ -0,0 +1,268 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prompt to Extract Key-values into JSON from png using advanced mode\n",
+ "\n",
+ "Below is an example of using OpenParser to extract key-values from a PNG into JSON format. (Note: the model is still in beta and is NOT robust enough to generate the same output on every run. Please bear with it!)\n",
+ "\n",
+ "### 1. Load the libraries\n",
+ "\n",
+ "If you have not installed `open_parser`, uncomment the lines below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip3 install python-dotenv\n",
+ "# !pip3 install --upgrade open_parser"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import pandas as pd\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "from open_parser import OpenParser\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Set up your OpenParser API key\n",
+ "\n",
+ "To set up your `CAMBIO_API_KEY` API key, you will:\n",
+ "\n",
+ "1. create a `.env` file in your root folder;\n",
+ "2. add the following line to your `.env` file:\n",
+ " ```\n",
+ " CAMBIO_API_KEY=17b************************\n",
+ " ```\n",
+ "\n",
+ "Then run the below line to load your API key."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "load_dotenv(override=True)\n",
+ "example_apikey = os.getenv(\"CAMBIO_API_KEY\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Load sample data and Run OpenParser\n",
+ "\n",
+ "OpenParser supports both images and PDFs. First, let's load sample data to test OpenParser's capabilities.\n",
+ "\n",
+ "Now we can run OpenParser on our sample data and then display the result as a pandas DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Upload response: 204\n",
+ "Extraction success.\n"
+ ]
+ }
+ ],
+ "source": [
+ "example_local_file = \"./sample_data/test3.png\"\n",
+ "example_prompt = \"Return table in a JSON format with each box's key and value.\"\n",
+ "\n",
+ "op = OpenParser(example_apikey)\n",
+ "# mode can be \"basic\" or \"advanced\"\n",
+ "qa_result = op.parse(example_local_file, example_prompt, mode=\"advanced\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Item 1 | \n",
+ " Item 2 | \n",
+ " Item 3 | \n",
+ " Item 4 | \n",
+ " Item 5 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Office Commercial products and cloud services revenue growth (y/y) | \n",
+ " 7% / 13% | \n",
+ " 7% / 14% | \n",
+ " 13%/17% | \n",
+ " 12% / 14% | \n",
+ " 15% / 14% | \n",
+ "
\n",
+ " \n",
+ " Office Consumer products and cloud services revenue growth (y/y) | \n",
+ " 7% / 11% | \n",
+ " (2)% / 3% | \n",
+ " 1% / 4% | \n",
+ " 3% / 6% | \n",
+ " 3% / 4% | \n",
+ "
\n",
+ " \n",
+ " Office 365 Commercial seat growth (y/y) | \n",
+ " 14% | \n",
+ " 12% | \n",
+ " 11% | \n",
+ " 11% | \n",
+ " 10% | \n",
+ "
\n",
+ " \n",
+ " Microsoft 365 Consumer subscribers (in millions) | \n",
+ " 65.1 | \n",
+ " 67.7 | \n",
+ " 70.8 | \n",
+ " 74.9 | \n",
+ " 76.7 | \n",
+ "
\n",
+ " \n",
+ " Dynamics products and cloud services revenue growth (y/y) | \n",
+ " 15% / 22% | \n",
+ " 13% / 20% | \n",
+ " 17% / 21% | \n",
+ " 19% / 21% | \n",
+ " 22% / 21% | \n",
+ "
\n",
+ " \n",
+ " LinkedIn revenue growth (y/y) | \n",
+ " 17% / 21% | \n",
+ " 10% / 14% | \n",
+ " 8% / 11% | \n",
+ " 6% / 8% | \n",
+ " 8% | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Item 1 Item 2 \\\n",
+ "Office Commercial products and cloud services r... 7% / 13% 7% / 14% \n",
+ "Office Consumer products and cloud services rev... 7% / 11% (2)% / 3% \n",
+ "Office 365 Commercial seat growth (y/y) 14% 12% \n",
+ "Microsoft 365 Consumer subscribers (in millions) 65.1 67.7 \n",
+ "Dynamics products and cloud services revenue gr... 15% / 22% 13% / 20% \n",
+ "LinkedIn revenue growth (y/y) 17% / 21% 10% / 14% \n",
+ "\n",
+ " Item 3 Item 4 \\\n",
+ "Office Commercial products and cloud services r... 13%/17% 12% / 14% \n",
+ "Office Consumer products and cloud services rev... 1% / 4% 3% / 6% \n",
+ "Office 365 Commercial seat growth (y/y) 11% 11% \n",
+ "Microsoft 365 Consumer subscribers (in millions) 70.8 74.9 \n",
+ "Dynamics products and cloud services revenue gr... 17% / 21% 19% / 21% \n",
+ "LinkedIn revenue growth (y/y) 8% / 11% 6% / 8% \n",
+ "\n",
+ " Item 5 \n",
+ "Office Commercial products and cloud services r... 15% / 14% \n",
+ "Office Consumer products and cloud services rev... 3% / 4% \n",
+ "Office 365 Commercial seat growth (y/y) 10% \n",
+ "Microsoft 365 Consumer subscribers (in millions) 76.7 \n",
+ "Dynamics products and cloud services revenue gr... 22% / 21% \n",
+ "LinkedIn revenue growth (y/y) 8% "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Assuming qa_result is the JSON data you provided\n",
+ "data = qa_result[0]\n",
+ "\n",
+ "# Flatten the list of dictionaries into a DataFrame\n",
+ "df = pd.DataFrame(data)\n",
+ "\n",
+ "# Transpose the DataFrame so keys become the row index\n",
+ "df_transposed = df.T\n",
+ "\n",
+ "# Optionally, you can rename the columns to reflect the item number or any specific identification\n",
+ "df_transposed.columns = [f'Item {i+1}' for i in range(len(df_transposed.columns))]\n",
+ "\n",
+ "df_transposed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## End of the notebook\n",
+ "\n",
+ "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n",
+ "\n",
+ "\n",
+ " \n",
+ ""
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "open-parser",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/open_parser/__init__.py b/open_parser/__init__.py
index 7690287..8b6e08d 100644
--- a/open_parser/__init__.py
+++ b/open_parser/__init__.py
@@ -2,4 +2,4 @@
__all__ = ["OpenParser"]
-__version__ = "0.0.4"
+__version__ = "0.0.5"
diff --git a/open_parser/base.py b/open_parser/base.py
index 01e2b94..0f1a658 100644
--- a/open_parser/base.py
+++ b/open_parser/base.py
@@ -31,7 +31,7 @@ def extract(self, file_path):
def parse(self, file_path, prompt, mode="advanced"):
user_id, job_id, s3_key = self._request_and_upload_by_apiKey(file_path)
result = self._request_info_extraction(user_id, job_id, s3_key, mode, prompt)
- return result["results"]
+ return result
def _error_handler(self, response):
if response.status_code == 403:
diff --git a/pyproject.toml b/pyproject.toml
index ce23bd4..9ed35d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "open-parser"
-version = "0.0.4"
+version = "0.0.5"
description = "Open parser for all."
authors = ["CambioML "]
maintainers = ["Rachel Hu "]