diff --git a/examples/prompt_to_extract_table_from_pdf_to_json.ipynb b/examples/prompt_to_extract_table_from_pdf_to_json.ipynb
index a0e3ef5..347735b 100644
--- a/examples/prompt_to_extract_table_from_pdf_to_json.ipynb
+++ b/examples/prompt_to_extract_table_from_pdf_to_json.ipynb
@@ -25,24 +25,9 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 4,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_67864/3281231558.py:2: DeprecationWarning: \n",
- "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
- "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
- "but was not found to be installed on your system.\n",
- "If this would cause problems for you,\n",
- "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
- " \n",
- " import pandas as pd\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
@@ -70,7 +55,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -91,7 +76,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -108,33 +93,38 @@
"example_prompt = \"Return table in a JSON format with each box's key and value.\"\n",
"\n",
"op = OpenParser(example_apikey)\n",
- "qa_result = op.parse(example_local_file, example_prompt)\n"
+ "# mode can be \"basic\" or \"advanced\"\n",
+ "qa_result = op.parse(example_local_file, example_prompt, mode=\"basic\")\n"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[{'result': [{\"Employee's social security number\": '758-58-5787'},\n",
- " {'Employer identification number (EIN)': '78-8778788'},\n",
- " {\"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133'},\n",
- " {'Control number': '9'},\n",
- " {\"Employee's first name and initial\": 'Jesan'},\n",
- " {'Last name': 'Rahaman'},\n",
- " {\"State, Employer's state ID number\": 'AL,877878878'},\n",
- " {'State wages, tips, etc.': '80000.00'},\n",
- " {'Federal income tax withheld': '3835.00'}],\n",
+ "[{'result': [{\"Employee's social security number\": '758-58-5787',\n",
+ " 'Employer identification number (EIN)': '78-8778788',\n",
+ " \"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n",
+ " 'Control number': '9',\n",
+ " \"Employee's first name and initial\": 'Jesan',\n",
+ " 'Last name': 'Rahaman',\n",
+ " \"State, Employer's state ID number\": 'AL',\n",
+ " 'Wages, tips, etc.': '80000.00',\n",
+ " 'Federal income tax withheld': '10368.00',\n",
+ " 'Social security tax withheld': '4960.00',\n",
+ " 'Medicare wages and tips': '80000.00',\n",
+ " 'Medicare tax withheld': '1160.00'}],\n",
" 'log': {'instruction': \"Return table in a JSON format with each box's key and value.\",\n",
" 'source': '',\n",
- " 'usage': {'input_tokens': 1750, 'output_tokens': 232}},\n",
+ " 'usage': {'input_tokens': 1752, 'output_tokens': 226},\n",
+ " 'source_log': None},\n",
" 'page_num': 0}]"
]
},
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -145,7 +135,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -199,15 +189,27 @@
" \n",
"
\n",
" State, Employer's state ID number | \n",
- " AL,877878878 | \n",
+ " AL | \n",
"
\n",
" \n",
- " State wages, tips, etc. | \n",
+ " Wages, tips, etc. | \n",
" 80000.00 | \n",
"
\n",
" \n",
" Federal income tax withheld | \n",
- " 3835.00 | \n",
+ " 10368.00 | \n",
+ "
\n",
+ " \n",
+ " Social security tax withheld | \n",
+ " 4960.00 | \n",
+ "
\n",
+ " \n",
+ " Medicare wages and tips | \n",
+ " 80000.00 | \n",
+ "
\n",
+ " \n",
+ " Medicare tax withheld | \n",
+ " 1160.00 | \n",
"
\n",
" \n",
"\n",
@@ -221,20 +223,23 @@
"Control number 9\n",
"Employee's first name and initial Jesan\n",
"Last name Rahaman\n",
- "State, Employer's state ID number AL,877878878\n",
- "State wages, tips, etc. 80000.00\n",
- "Federal income tax withheld 3835.00"
+ "State, Employer's state ID number AL\n",
+ "Wages, tips, etc. 80000.00\n",
+ "Federal income tax withheld 10368.00\n",
+ "Social security tax withheld 4960.00\n",
+ "Medicare wages and tips 80000.00\n",
+ "Medicare tax withheld 1160.00"
]
},
- "execution_count": 11,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = qa_result[0]['result']\n",
- "keys = [list(item.keys())[0] for item in data]\n",
- "values = [list(item.values())[0] for item in data]\n",
+ "keys = [list(item.keys()) for item in data][0]\n",
+ "values = [list(item.values()) for item in data][0]\n",
"\n",
"# Create a DataFrame\n",
"df = pd.DataFrame(values, index=keys, columns=['Value'])\n",
diff --git a/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb b/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb
new file mode 100644
index 0000000..915eaae
--- /dev/null
+++ b/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb
@@ -0,0 +1,364 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prompt to Extract Key-values into JSON from W2 (PDF) using advanced mode\n",
+ "\n",
+ "Below is an example of using OpenParser to extract key-values from a W2 PDF into JSON format. (Note: the model is still in beta and is NOT yet robust enough to generate the same output on every run. Please bear with it!)\n",
+ "\n",
+ "### 1. Load the libraries\n",
+ "\n",
+ "If you have not installed `open_parser` yet, uncomment the lines below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip3 install python-dotenv\n",
+ "# !pip3 install --upgrade open_parser"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import pandas as pd\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "from open_parser import OpenParser\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Set up your OpenParser API key\n",
+ "\n",
+ "To set up your `CAMBIO_API_KEY` API key, you will:\n",
+ "\n",
+ "1. create a `.env` file in your root folder;\n",
+ "2. add the following one line to your `.env` file:\n",
+ " ```\n",
+ " CAMBIO_API_KEY=17b************************\n",
+ " ```\n",
+ "\n",
+ "Then run the below line to load your API key."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "load_dotenv(override=True)\n",
+ "example_apikey = os.getenv(\"CAMBIO_API_KEY\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Load sample data and Run OpenParser\n",
+ "\n",
+ "OpenParser supports both images and PDFs. First, let's load some sample data to test OpenParser's capabilities.\n",
+ "\n",
+ "Now we can run OpenParser on our sample data and then display it in the Markdown format."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Upload response: 204\n",
+ "Extraction success.\n"
+ ]
+ }
+ ],
+ "source": [
+ "example_local_file = \"./sample_data/test1.pdf\"\n",
+ "example_prompt = \"Return table in a JSON format with each box's key and value.\"\n",
+ "\n",
+ "op = OpenParser(example_apikey)\n",
+ "# mode can be \"basic\" or \"advanced\"\n",
+ "qa_result = op.parse(example_local_file, example_prompt, mode=\"advanced\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{\"a Employee's social security number\": '758-58-5787',\n",
+ " 'b Employer identification number (EIN)': '78-8778788',\n",
+ " \"c Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n",
+ " 'd Control number': '9',\n",
+ " \"e Employee's first name and initial\": 'Jesan',\n",
+ " \"e Employee's last name\": 'Rahaman',\n",
+ " \"f Employee's address and ZIP code\": 'AL 877878878',\n",
+ " '1 Wages, tips, other compensation': '80000.00',\n",
+ " '2 Federal income tax withheld': '10368.00',\n",
+ " '3 Social security wages': '80000.00',\n",
+ " '4 Social security tax withheld': '4960.00',\n",
+ " '5 Medicare wages and tips': '80000.00',\n",
+ " '6 Medicare tax withheld': '1160.00',\n",
+ " '7 Social security tips': 'NA',\n",
+ " '8 Allocated tips': 'NA',\n",
+ " '10 Dependent care benefits': 'NA',\n",
+ " '11 Nonqualified plans': 'NA',\n",
+ " '13 Statutory Retroment employee plan': 'NA',\n",
+ " '13 Third-party sick pay': 'NA',\n",
+ " '14 Other': 'NA',\n",
+ " \"15 State Employer's state ID number\": 'AL',\n",
+ " '16 State wages, tips, etc.': '80000.00',\n",
+ " '17 State income tax': '3835.00',\n",
+ " '18 Local wages, tips, etc.': 'NA',\n",
+ " '19 Local income tax': 'NA',\n",
+ " '20 Locality name': 'NA'}]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "qa_result[0]['result']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " a Employee's social security number | \n",
+ " 758-58-5787 | \n",
+ "
\n",
+ " \n",
+ " b Employer identification number (EIN) | \n",
+ " 78-8778788 | \n",
+ "
\n",
+ " \n",
+ " c Employer's name, address, and ZIP code | \n",
+ " DesignNext\\nKatham Dorbosto, Kashiani, Gopalga... | \n",
+ "
\n",
+ " \n",
+ " d Control number | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " e Employee's first name and initial | \n",
+ " Jesan | \n",
+ "
\n",
+ " \n",
+ " e Employee's last name | \n",
+ " Rahaman | \n",
+ "
\n",
+ " \n",
+ " f Employee's address and ZIP code | \n",
+ " AL 877878878 | \n",
+ "
\n",
+ " \n",
+ " 1 Wages, tips, other compensation | \n",
+ " 80000.00 | \n",
+ "
\n",
+ " \n",
+ " 2 Federal income tax withheld | \n",
+ " 10368.00 | \n",
+ "
\n",
+ " \n",
+ " 3 Social security wages | \n",
+ " 80000.00 | \n",
+ "
\n",
+ " \n",
+ " 4 Social security tax withheld | \n",
+ " 4960.00 | \n",
+ "
\n",
+ " \n",
+ " 5 Medicare wages and tips | \n",
+ " 80000.00 | \n",
+ "
\n",
+ " \n",
+ " 6 Medicare tax withheld | \n",
+ " 1160.00 | \n",
+ "
\n",
+ " \n",
+ " 7 Social security tips | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 8 Allocated tips | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 10 Dependent care benefits | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 11 Nonqualified plans | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 13 Statutory Retroment employee plan | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 13 Third-party sick pay | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 14 Other | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 15 State Employer's state ID number | \n",
+ " AL | \n",
+ "
\n",
+ " \n",
+ " 16 State wages, tips, etc. | \n",
+ " 80000.00 | \n",
+ "
\n",
+ " \n",
+ " 17 State income tax | \n",
+ " 3835.00 | \n",
+ "
\n",
+ " \n",
+ " 18 Local wages, tips, etc. | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 19 Local income tax | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ " 20 Locality name | \n",
+ " NA | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Value\n",
+ "a Employee's social security number 758-58-5787\n",
+ "b Employer identification number (EIN) 78-8778788\n",
+ "c Employer's name, address, and ZIP code DesignNext\\nKatham Dorbosto, Kashiani, Gopalga...\n",
+ "d Control number 9\n",
+ "e Employee's first name and initial Jesan\n",
+ "e Employee's last name Rahaman\n",
+ "f Employee's address and ZIP code AL 877878878\n",
+ "1 Wages, tips, other compensation 80000.00\n",
+ "2 Federal income tax withheld 10368.00\n",
+ "3 Social security wages 80000.00\n",
+ "4 Social security tax withheld 4960.00\n",
+ "5 Medicare wages and tips 80000.00\n",
+ "6 Medicare tax withheld 1160.00\n",
+ "7 Social security tips NA\n",
+ "8 Allocated tips NA\n",
+ "10 Dependent care benefits NA\n",
+ "11 Nonqualified plans NA\n",
+ "13 Statutory Retroment employee plan NA\n",
+ "13 Third-party sick pay NA\n",
+ "14 Other NA\n",
+ "15 State Employer's state ID number AL\n",
+ "16 State wages, tips, etc. 80000.00\n",
+ "17 State income tax 3835.00\n",
+ "18 Local wages, tips, etc. NA\n",
+ "19 Local income tax NA\n",
+ "20 Locality name NA"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = qa_result[0]['result']\n",
+ "keys = [list(item.keys()) for item in data][0]\n",
+ "values = [list(item.values()) for item in data][0]\n",
+ "\n",
+ "# Create a DataFrame\n",
+ "df = pd.DataFrame(values, index=keys, columns=['Value'])\n",
+ "\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## End of the notebook\n",
+ "\n",
+ "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n",
+ "\n",
+ "\n",
+ " \n",
+ ""
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "open-parser",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/test3.png b/examples/test3.png
new file mode 100644
index 0000000..6fc943d
Binary files /dev/null and b/examples/test3.png differ
diff --git a/examples/test_example.py b/examples/test_example.py
index 04712f5..8a06e41 100755
--- a/examples/test_example.py
+++ b/examples/test_example.py
@@ -14,7 +14,7 @@
example_apikey = os.getenv("CAMBIO_API_KEY")
- example_local_file = "./test2.pdf"
+ example_local_file = "./sample_data/test2.pdf"
op = OpenParser(example_apikey)
@@ -25,6 +25,6 @@
print("information extraction test:")
example_prompt = "Return table under Investor Metrics in JSON format with year as the key and the column as subkeys."
- qa_result = op.parse(example_local_file, example_prompt)
+ qa_result = op.parse(example_local_file, example_prompt, mode="basic")
print(type(qa_result))
print(qa_result)
diff --git a/open_parser/base.py b/open_parser/base.py
index bd18cb6..b22a235 100644
--- a/open_parser/base.py
+++ b/open_parser/base.py
@@ -28,9 +28,9 @@ def extract(self, file_path):
result = self._request_file_extraction(user_id, job_id, s3_key)
return result["file_content"]
- def parse(self, file_path, prompt):
+ def parse(self, file_path, prompt, mode="advanced"):
user_id, job_id, s3_key = self._request_and_upload_by_apiKey(file_path, prompt)
- result = self._request_info_extraction(user_id, job_id, s3_key)
+ result = self._request_info_extraction(user_id, job_id, s3_key, mode)
return result["results"]
def _error_handler(self, response):
@@ -77,11 +77,14 @@ def _request_file_extraction(self, user_id, job_id, s3_key):
self._error_handler(response)
- def _request_info_extraction(self, user_id, job_id, s3_key):
+ def _request_info_extraction(self, user_id, job_id, s3_key, mode):
+ if mode not in ["advanced", "basic"]:
+ raise ValueError("Invalid mode. Choose either 'advanced' or 'basic'.")
payload = {
"userId": user_id,
"jobId": job_id,
"fileKey": s3_key,
+ "extract": True if mode == "advanced" else False,
}
response = requests.post(
self._parseurl, headers=self._request_header, json=payload