diff --git a/examples/prompt_to_extract_table_from_pdf_to_json.ipynb b/examples/prompt_to_extract_table_from_pdf_to_json.ipynb index a0e3ef5..347735b 100644 --- a/examples/prompt_to_extract_table_from_pdf_to_json.ipynb +++ b/examples/prompt_to_extract_table_from_pdf_to_json.ipynb @@ -25,24 +25,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_67864/3281231558.py:2: DeprecationWarning: \n", - "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", - "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", - "but was not found to be installed on your system.\n", - "If this would cause problems for you,\n", - "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", - " \n", - " import pandas as pd\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import pandas as pd\n", @@ -70,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -91,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -108,33 +93,38 @@ "example_prompt = \"Return table in a JSON format with each box's key and value.\"\n", "\n", "op = OpenParser(example_apikey)\n", - "qa_result = op.parse(example_local_file, example_prompt)\n" + "# mode can be \"basic\" or \"advanced\"\n", + "qa_result = op.parse(example_local_file, example_prompt, mode=\"basic\")\n" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[{'result': [{\"Employee's social security number\": '758-58-5787'},\n", - " {'Employer identification number (EIN)': '78-8778788'},\n", - " 
{\"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133'},\n", - " {'Control number': '9'},\n", - " {\"Employee's first name and initial\": 'Jesan'},\n", - " {'Last name': 'Rahaman'},\n", - " {\"State, Employer's state ID number\": 'AL,877878878'},\n", - " {'State wages, tips, etc.': '80000.00'},\n", - " {'Federal income tax withheld': '3835.00'}],\n", + "[{'result': [{\"Employee's social security number\": '758-58-5787',\n", + " 'Employer identification number (EIN)': '78-8778788',\n", + " \"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n", + " 'Control number': '9',\n", + " \"Employee's first name and initial\": 'Jesan',\n", + " 'Last name': 'Rahaman',\n", + " \"State, Employer's state ID number\": 'AL',\n", + " 'Wages, tips, etc.': '80000.00',\n", + " 'Federal income tax withheld': '10368.00',\n", + " 'Social security tax withheld': '4960.00',\n", + " 'Medicare wages and tips': '80000.00',\n", + " 'Medicare tax withheld': '1160.00'}],\n", " 'log': {'instruction': \"Return table in a JSON format with each box's key and value.\",\n", " 'source': '',\n", - " 'usage': {'input_tokens': 1750, 'output_tokens': 232}},\n", + " 'usage': {'input_tokens': 1752, 'output_tokens': 226},\n", + " 'source_log': None},\n", " 'page_num': 0}]" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -145,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -199,15 +189,27 @@ " \n", " \n", " State, Employer's state ID number\n", - " AL,877878878\n", + " AL\n", " \n", " \n", - " State wages, tips, etc.\n", + " Wages, tips, etc.\n", " 80000.00\n", " \n", " \n", " Federal income tax withheld\n", - " 3835.00\n", + " 10368.00\n", + " \n", + " \n", + " Social security tax withheld\n", + " 4960.00\n", + " \n", + " \n", + " Medicare wages and 
tips\n", + " 80000.00\n", + " \n", + " \n", + " Medicare tax withheld\n", + " 1160.00\n", + " \n", + " \n", + "\n", @@ -221,20 +223,23 @@ "Control number 9\n", "Employee's first name and initial Jesan\n", "Last name Rahaman\n", - "State, Employer's state ID number AL,877878878\n", - "State wages, tips, etc. 80000.00\n", - "Federal income tax withheld 3835.00" + "State, Employer's state ID number AL\n", + "Wages, tips, etc. 80000.00\n", + "Federal income tax withheld 10368.00\n", + "Social security tax withheld 4960.00\n", + "Medicare wages and tips 80000.00\n", + "Medicare tax withheld 1160.00" ] }, - "execution_count": 11, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = qa_result[0]['result']\n", - "keys = [list(item.keys())[0] for item in data]\n", - "values = [list(item.values())[0] for item in data]\n", + "keys = [list(item.keys()) for item in data][0]\n", + "values = [list(item.values()) for item in data][0]\n", "\n", "# Create a DataFrame\n", "df = pd.DataFrame(values, index=keys, columns=['Value'])\n", diff --git a/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb b/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb new file mode 100644 index 0000000..915eaae --- /dev/null +++ b/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb @@ -0,0 +1,364 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prompt to Extract Key-values into JSON from W2 (PDF) using advanced mode\n", + "\n", + "Below is an example of using OpenParser to extract key-values from a W2 PDF into JSON format. (Note: the model is still in beta and is NOT yet robust enough to generate the same output on every run. Please bear with it!)\n", + "\n", + "### 1. Load the libraries\n", + "\n", + "If you have not installed `open_parser` yet, uncomment the lines below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip3 install python-dotenv\n", + "# !pip3 install --upgrade open_parser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "from dotenv import load_dotenv\n", + "from open_parser import OpenParser\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Set up your OpenParser API key\n", + "\n", + "To set up your `CAMBIO_API_KEY` API key, you will:\n", + "\n", + "1. create a `.env` file in your root folder;\n", + "2. add the following line to your `.env` file:\n", + " ```\n", + " CAMBIO_API_KEY=17b************************\n", + " ```\n", + "\n", + "Then run the line below to load your API key." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "example_apikey = os.getenv(\"CAMBIO_API_KEY\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Load sample data and Run OpenParser\n", + "\n", + "OpenParser supports both image and PDF. First, let's load some sample data to test OpenParser's capabilities.\n", + "\n", + "Now we can run OpenParser on our sample data and then display it in the Markdown format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upload response: 204\n", + "Extraction success.\n" + ] + } + ], + "source": [ + "example_local_file = \"./sample_data/test1.pdf\"\n", + "example_prompt = \"Return table in a JSON format with each box's key and value.\"\n", + "\n", + "op = OpenParser(example_apikey)\n", + "# mode can be \"basic\" or \"advanced\"\n", + "qa_result = op.parse(example_local_file, example_prompt, mode=\"advanced\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{\"a Employee's social security number\": '758-58-5787',\n", + " 'b Employer identification number (EIN)': '78-8778788',\n", + " \"c Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n", + " 'd Control number': '9',\n", + " \"e Employee's first name and initial\": 'Jesan',\n", + " \"e Employee's last name\": 'Rahaman',\n", + " \"f Employee's address and ZIP code\": 'AL 877878878',\n", + " '1 Wages, tips, other compensation': '80000.00',\n", + " '2 Federal income tax withheld': '10368.00',\n", + " '3 Social security wages': '80000.00',\n", + " '4 Social security tax withheld': '4960.00',\n", + " '5 Medicare wages and tips': '80000.00',\n", + " '6 Medicare tax withheld': '1160.00',\n", + " '7 Social security tips': 'NA',\n", + " '8 Allocated tips': 'NA',\n", + " '10 Dependent care benefits': 'NA',\n", + " '11 Nonqualified plans': 'NA',\n", + " '13 Statutory Retroment employee plan': 'NA',\n", + " '13 Third-party sick pay': 'NA',\n", + " '14 Other': 'NA',\n", + " \"15 State Employer's state ID number\": 'AL',\n", + " '16 State wages, tips, etc.': '80000.00',\n", + " '17 State income tax': '3835.00',\n", + " '18 Local wages, tips, etc.': 'NA',\n", + " '19 Local income tax': 'NA',\n", + " '20 Locality name': 'NA'}]" + ] + }, + 
"execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qa_result[0]['result']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Value
a Employee's social security number758-58-5787
b Employer identification number (EIN)78-8778788
c Employer's name, address, and ZIP codeDesignNext\\nKatham Dorbosto, Kashiani, Gopalga...
d Control number9
e Employee's first name and initialJesan
e Employee's last nameRahaman
f Employee's address and ZIP codeAL 877878878
1 Wages, tips, other compensation80000.00
2 Federal income tax withheld10368.00
3 Social security wages80000.00
4 Social security tax withheld4960.00
5 Medicare wages and tips80000.00
6 Medicare tax withheld1160.00
7 Social security tipsNA
8 Allocated tipsNA
10 Dependent care benefitsNA
11 Nonqualified plansNA
13 Statutory Retroment employee planNA
13 Third-party sick payNA
14 OtherNA
15 State Employer's state ID numberAL
16 State wages, tips, etc.80000.00
17 State income tax3835.00
18 Local wages, tips, etc.NA
19 Local income taxNA
20 Locality nameNA
\n", + "
" + ], + "text/plain": [ + " Value\n", + "a Employee's social security number 758-58-5787\n", + "b Employer identification number (EIN) 78-8778788\n", + "c Employer's name, address, and ZIP code DesignNext\\nKatham Dorbosto, Kashiani, Gopalga...\n", + "d Control number 9\n", + "e Employee's first name and initial Jesan\n", + "e Employee's last name Rahaman\n", + "f Employee's address and ZIP code AL 877878878\n", + "1 Wages, tips, other compensation 80000.00\n", + "2 Federal income tax withheld 10368.00\n", + "3 Social security wages 80000.00\n", + "4 Social security tax withheld 4960.00\n", + "5 Medicare wages and tips 80000.00\n", + "6 Medicare tax withheld 1160.00\n", + "7 Social security tips NA\n", + "8 Allocated tips NA\n", + "10 Dependent care benefits NA\n", + "11 Nonqualified plans NA\n", + "13 Statutory Retroment employee plan NA\n", + "13 Third-party sick pay NA\n", + "14 Other NA\n", + "15 State Employer's state ID number AL\n", + "16 State wages, tips, etc. 80000.00\n", + "17 State income tax 3835.00\n", + "18 Local wages, tips, etc. 
NA\n", + "19 Local income tax NA\n", + "20 Locality name NA" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = qa_result[0]['result']\n", + "keys = [list(item.keys()) for item in data][0]\n", + "values = [list(item.values()) for item in data][0]\n", + "\n", + "# Create a DataFrame\n", + "df = pd.DataFrame(values, index=keys, columns=['Value'])\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "open-parser", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/test3.png b/examples/test3.png new file mode 100644 index 0000000..6fc943d Binary files /dev/null and b/examples/test3.png differ diff --git a/examples/test_example.py b/examples/test_example.py index 04712f5..8a06e41 100755 --- a/examples/test_example.py +++ b/examples/test_example.py @@ -14,7 +14,7 @@ example_apikey = os.getenv("CAMBIO_API_KEY") - example_local_file = "./test2.pdf" + example_local_file = "./sample_data/test2.pdf" op = OpenParser(example_apikey) @@ -25,6 +25,6 @@ print("information extraction test:") example_prompt = "Return table under Investor Metrics in JSON format with year as the key and the column as subkeys." 
- qa_result = op.parse(example_local_file, example_prompt) + qa_result = op.parse(example_local_file, example_prompt, mode="basic") print(type(qa_result)) print(qa_result) diff --git a/open_parser/base.py b/open_parser/base.py index bd18cb6..b22a235 100644 --- a/open_parser/base.py +++ b/open_parser/base.py @@ -28,9 +28,9 @@ def extract(self, file_path): result = self._request_file_extraction(user_id, job_id, s3_key) return result["file_content"] - def parse(self, file_path, prompt): + def parse(self, file_path, prompt, mode="advanced"): user_id, job_id, s3_key = self._request_and_upload_by_apiKey(file_path, prompt) - result = self._request_info_extraction(user_id, job_id, s3_key) + result = self._request_info_extraction(user_id, job_id, s3_key, mode) return result["results"] def _error_handler(self, response): @@ -77,11 +77,14 @@ def _request_file_extraction(self, user_id, job_id, s3_key): self._error_handler(response) - def _request_info_extraction(self, user_id, job_id, s3_key): + def _request_info_extraction(self, user_id, job_id, s3_key, mode): + if mode not in ["advanced", "basic"]: + raise ValueError("Invalid mode. Choose either 'advanced' or 'basic'.") payload = { "userId": user_id, "jobId": job_id, "fileKey": s3_key, + "extract": True if mode == "advanced" else False, } response = requests.post( self._parseurl, headers=self._request_header, json=payload