From 4ec49d6d4a65615d5512b6ab5236f62e0faec38f Mon Sep 17 00:00:00 2001 From: Rachel Hu Date: Thu, 11 Apr 2024 00:13:37 -0700 Subject: [PATCH 1/2] Update notebook with latset interface change from CDK information extract job change. --- ...pt_to_extract_table_from_pdf_to_json.ipynb | 72 ++--- ...table_from_pdf_to_json_advanced_mode.ipynb | 136 +++++---- ...table_from_png_to_json_advanced_mode.ipynb | 268 ++++++++++++++++++ open_parser/base.py | 2 +- 4 files changed, 357 insertions(+), 121 deletions(-) create mode 100644 examples/prompt_to_extract_table_from_png_to_json_advanced_mode.ipynb diff --git a/examples/prompt_to_extract_table_from_pdf_to_json.ipynb b/examples/prompt_to_extract_table_from_pdf_to_json.ipynb index 347735b..f2e2609 100644 --- a/examples/prompt_to_extract_table_from_pdf_to_json.ipynb +++ b/examples/prompt_to_extract_table_from_pdf_to_json.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -99,43 +99,7 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'result': [{\"Employee's social security number\": '758-58-5787',\n", - " 'Employer identification number (EIN)': '78-8778788',\n", - " \"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n", - " 'Control number': '9',\n", - " \"Employee's first name and initial\": 'Jesan',\n", - " 'Last name': 'Rahaman',\n", - " \"State, Employer's state ID number\": 'AL',\n", - " 'Wages, tips, etc.': '80000.00',\n", - " 'Federal income tax withheld': '10368.00',\n", - " 'Social security tax withheld': '4960.00',\n", - " 'Medicare wages and tips': '80000.00',\n", - " 'Medicare tax withheld': '1160.00'}],\n", - " 'log': {'instruction': \"Return table in a JSON format with each box's key and value.\",\n", - " 'source': '',\n", - " 'usage': {'input_tokens': 1752, 'output_tokens': 226},\n", - " 'source_log': None},\n", - " 'page_num': 0}]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "qa_result" - ] - }, - { - "cell_type": "code", - "execution_count": 8, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -184,12 +148,12 @@ " Jesan\n", " \n", " \n", - " Last name\n", + " Employee's last name\n", " Rahaman\n", " \n", " \n", " State, Employer's state ID number\n", - " AL\n", + " AL 877878878\n", " \n", " \n", " Wages, tips, etc.\n", @@ -211,6 +175,14 @@ " Medicare tax withheld\n", " 1160.00\n", " \n", + " \n", + " State wages, tips, etc.\n", + " 80000.00\n", + " \n", + " \n", + " State income tax\n", + " 3835.00\n", + " \n", " \n", "\n", "" @@ -222,22 +194,24 @@ "Employer's name, address, and ZIP code DesignNext\\nKatham Dorbosto, Kashiani, Gopalga...\n", "Control number 9\n", "Employee's first name and initial Jesan\n", - "Last name Rahaman\n", - "State, Employer's state ID number AL\n", + "Employee's last name Rahaman\n", + "State, Employer's state ID number AL 877878878\n", "Wages, tips, etc. 80000.00\n", "Federal income tax withheld 10368.00\n", "Social security tax withheld 4960.00\n", "Medicare wages and tips 80000.00\n", - "Medicare tax withheld 1160.00" + "Medicare tax withheld 1160.00\n", + "State wages, tips, etc. 80000.00\n", + "State income tax 3835.00" ] }, - "execution_count": 8, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data = qa_result[0]['result']\n", + "data = qa_result[0]\n", "keys = [list(item.keys()) for item in data][0]\n", "values = [list(item.values()) for item in data][0]\n", "\n", @@ -277,7 +251,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb b/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb index 915eaae..592ba0f 100644 --- a/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb +++ b/examples/prompt_to_extract_table_from_pdf_to_json_advanced_mode.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -99,52 +99,51 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[{\"a Employee's social security number\": '758-58-5787',\n", - " 'b Employer identification number (EIN)': '78-8778788',\n", - " \"c Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n", - " 'd Control number': '9',\n", - " \"e Employee's first name and initial\": 'Jesan',\n", - " \"e Employee's last name\": 'Rahaman',\n", - " \"f Employee's address and ZIP code\": 'AL 877878878',\n", - " '1 Wages, tips, other compensation': '80000.00',\n", - " '2 Federal income tax withheld': '10368.00',\n", - " '3 Social security wages': '80000.00',\n", - " '4 Social security tax withheld': '4960.00',\n", - " '5 Medicare wages and tips': '80000.00',\n", - " '6 Medicare tax withheld': '1160.00',\n", - " '7 Social security tips': 'NA',\n", - " '8 Allocated tips': 'NA',\n", - " '10 Dependent care benefits': 'NA',\n", - " '11 Nonqualified plans': 'NA',\n", - " '13 Statutory Retroment employee plan': 'NA',\n", - " '13 Third-party sick pay': 'NA',\n", - " '14 Other': 'NA',\n", - " \"15 State Employer's state ID number\": 'AL',\n", - " '16 State wages, tips, etc.': '80000.00',\n", - " '17 State income tax': '3835.00',\n", - " '18 Local wages, tips, etc.': 'NA',\n", - " '19 Local income tax': 'NA',\n", - " '20 Locality name': 'NA'}]" + "[[{\"a Employee's social security number\": '758-58-5787',\n", + " 'b Employer identification number (EIN)': '78-8778788',\n", + " \"c Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n", + " 'd Control number': '9',\n", + " \"e Employee's first name and initial\": 'Jesan',\n", + " 'e Last name': 'Rahaman',\n", + " \"f Employee's address and ZIP code\": 'AL\\n877878878',\n", + " '1 Wages, tips, other compensation': '80000.00',\n", + " '2 Federal income tax withheld': '10368.00',\n", + " '3 Social security wages': '80000.00',\n", + " '4 Social security tax withheld': '4960.00',\n", + " '5 Medicare wages and tips': '80000.00',\n", + " '6 Medicare tax withheld': '1160.00',\n", + " '7 Social security tips': 'NA',\n", + " '8 Allocated tips': 'NA',\n", + " '10 Dependent care benefits': 'NA',\n", + " '11 Nonqualified plans': 'NA',\n", + " '13 Statutory Retroment employee Third-party sick pay plan': 'NA',\n", + " '14 Other': 'NA',\n", + " '15 State': 'AL',\n", + " '16 State wages, tips, etc.': '80000.00',\n", + " '17 State income tax': '3835.00',\n", + " '18 Local wages, tips, etc.': 'NA',\n", + " '19 Local income tax': 'NA',\n", + " '20 Locality name': 'NA'}]]" ] }, - "execution_count": 11, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "qa_result[0]['result']" + "qa_result" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -193,12 +192,12 @@ " Jesan\n", " \n", " \n", - " e Employee's last name\n", + " e Last name\n", " Rahaman\n", " \n", " \n", " f Employee's address and ZIP code\n", - " AL 877878878\n", + " AL\\n877878878\n", " \n", " \n", " 1 Wages, tips, other compensation\n", @@ -241,11 +240,7 @@ " NA\n", " \n", " \n", - " 13 Statutory Retroment employee plan\n", - " NA\n", - " \n", - " \n", - " 13 Third-party sick pay\n", + " 13 Statutory Retroment employee Third-party sick pay plan\n", " NA\n", " \n", " \n", @@ -253,7 +248,7 @@ " NA\n", " \n", " \n", - " 15 State Employer's state ID number\n", + " 15 State\n", " AL\n", " \n", " \n", @@ -281,42 +276,41 @@ "" ], "text/plain": [ - " Value\n", - "a Employee's social security number 758-58-5787\n", - "b Employer identification number (EIN) 78-8778788\n", - "c Employer's name, address, and ZIP code DesignNext\\nKatham Dorbosto, Kashiani, Gopalga...\n", - "d Control number 9\n", - "e Employee's first name and initial Jesan\n", - "e Employee's last name Rahaman\n", - "f Employee's address and ZIP code AL 877878878\n", - "1 Wages, tips, other compensation 80000.00\n", - "2 Federal income tax withheld 10368.00\n", - "3 Social security wages 80000.00\n", - "4 Social security tax withheld 4960.00\n", - "5 Medicare wages and tips 80000.00\n", - "6 Medicare tax withheld 1160.00\n", - "7 Social security tips NA\n", - "8 Allocated tips NA\n", - "10 Dependent care benefits NA\n", - "11 Nonqualified plans NA\n", - "13 Statutory Retroment employee plan NA\n", - "13 Third-party sick pay NA\n", - "14 Other NA\n", - "15 State Employer's state ID number AL\n", - "16 State wages, tips, etc. 80000.00\n", - "17 State income tax 3835.00\n", - "18 Local wages, tips, etc. NA\n", - "19 Local income tax NA\n", - "20 Locality name NA" + " Value\n", + "a Employee's social security number 758-58-5787\n", + "b Employer identification number (EIN) 78-8778788\n", + "c Employer's name, address, and ZIP code DesignNext\\nKatham Dorbosto, Kashiani, Gopalga...\n", + "d Control number 9\n", + "e Employee's first name and initial Jesan\n", + "e Last name Rahaman\n", + "f Employee's address and ZIP code AL\\n877878878\n", + "1 Wages, tips, other compensation 80000.00\n", + "2 Federal income tax withheld 10368.00\n", + "3 Social security wages 80000.00\n", + "4 Social security tax withheld 4960.00\n", + "5 Medicare wages and tips 80000.00\n", + "6 Medicare tax withheld 1160.00\n", + "7 Social security tips NA\n", + "8 Allocated tips NA\n", + "10 Dependent care benefits NA\n", + "11 Nonqualified plans NA\n", + "13 Statutory Retroment employee Third-party sic... NA\n", + "14 Other NA\n", + "15 State AL\n", + "16 State wages, tips, etc. 80000.00\n", + "17 State income tax 3835.00\n", + "18 Local wages, tips, etc. NA\n", + "19 Local income tax NA\n", + "20 Locality name NA" ] }, - "execution_count": 10, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data = qa_result[0]['result']\n", + "data = qa_result[0]\n", "keys = [list(item.keys()) for item in data][0]\n", "values = [list(item.values()) for item in data][0]\n", "\n", diff --git a/examples/prompt_to_extract_table_from_png_to_json_advanced_mode.ipynb b/examples/prompt_to_extract_table_from_png_to_json_advanced_mode.ipynb new file mode 100644 index 0000000..bfa84fd --- /dev/null +++ b/examples/prompt_to_extract_table_from_png_to_json_advanced_mode.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prompt to Extract Key-values into JSON from png using advanced mode\n", + "\n", + "Below it's an example of using OpenParser to extract key-values from a png into JSON format. (Note: the model is still in beta and is NOT robust enough to generate the same output. Please bear with it!)\n", + "\n", + "### 1. Load the libraries\n", + "\n", + "If you have install `open_parser`, uncomment the below line." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip3 install python-dotenv\n", + "# !pip3 install --upgrade open_parser" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "from dotenv import load_dotenv\n", + "from open_parser import OpenParser\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Set up your OpenParser API key\n", + "\n", + "To set up your `CAMBIO_API_KEY` API key, you will:\n", + "\n", + "1. create a `.env` file in your root folder;\n", + "2. add the following one line to your `.env file:\n", + " ```\n", + " CAMBIO_API_KEY=17b************************\n", + " ```\n", + "\n", + "Then run the below line to load your API key." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "example_apikey = os.getenv(\"CAMBIO_API_KEY\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Load sample data and Run OpenParser\n", + "\n", + "OpenParser supports both image and PDF. First let's load a sample data to test OpenParser's capabilities.\n", + "\n", + "Now we can run OpenParser on our sample data and then display it in the Markdown format." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upload response: 204\n", + "Extraction success.\n" + ] + } + ], + "source": [ + "example_local_file = \"./sample_data/test3.png\"\n", + "example_prompt = \"Return table in a JSON format with each box's key and value.\"\n", + "\n", + "op = OpenParser(example_apikey)\n", + "# mode can be \"basic\" or \"advanced\"\n", + "qa_result = op.parse(example_local_file, example_prompt, mode=\"advanced\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Item 1Item 2Item 3Item 4Item 5
Office Commercial products and cloud services revenue growth (y/y)7% / 13%7% / 14%13%/17%12% / 14%15% / 14%
Office Consumer products and cloud services revenue growth (y/y)7% / 11%(2)% / 3%1% / 4%3% / 6%3% / 4%
Office 365 Commercial seat growth (y/y)14%12%11%11%10%
Microsoft 365 Consumer subscribers (in millions)65.167.770.874.976.7
Dynamics products and cloud services revenue growth (y/y)15% / 22%13% / 20%17% / 21%19% / 21%22% / 21%
LinkedIn revenue growth (y/y)17% / 21%10% / 14%8% / 11%6% / 8%8%
\n", + "
" + ], + "text/plain": [ + " Item 1 Item 2 \\\n", + "Office Commercial products and cloud services r... 7% / 13% 7% / 14% \n", + "Office Consumer products and cloud services rev... 7% / 11% (2)% / 3% \n", + "Office 365 Commercial seat growth (y/y) 14% 12% \n", + "Microsoft 365 Consumer subscribers (in millions) 65.1 67.7 \n", + "Dynamics products and cloud services revenue gr... 15% / 22% 13% / 20% \n", + "LinkedIn revenue growth (y/y) 17% / 21% 10% / 14% \n", + "\n", + " Item 3 Item 4 \\\n", + "Office Commercial products and cloud services r... 13%/17% 12% / 14% \n", + "Office Consumer products and cloud services rev... 1% / 4% 3% / 6% \n", + "Office 365 Commercial seat growth (y/y) 11% 11% \n", + "Microsoft 365 Consumer subscribers (in millions) 70.8 74.9 \n", + "Dynamics products and cloud services revenue gr... 17% / 21% 19% / 21% \n", + "LinkedIn revenue growth (y/y) 8% / 11% 6% / 8% \n", + "\n", + " Item 5 \n", + "Office Commercial products and cloud services r... 15% / 14% \n", + "Office Consumer products and cloud services rev... 3% / 4% \n", + "Office 365 Commercial seat growth (y/y) 10% \n", + "Microsoft 365 Consumer subscribers (in millions) 76.7 \n", + "Dynamics products and cloud services revenue gr... 22% / 21% \n", + "LinkedIn revenue growth (y/y) 8% " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Assuming qa_result is the JSON data you provided\n", + "data = qa_result[0]\n", + "\n", + "# Flatten the list of dictionaries into a DataFrame\n", + "df = pd.DataFrame(data)\n", + "\n", + "# Transpose the DataFrame so keys become the row index\n", + "df_transposed = df.T\n", + "\n", + "# Optionally, you can rename the columns to reflect the item number or any specific identification\n", + "df_transposed.columns = [f'Item {i+1}' for i in range(len(df_transposed.columns))]\n", + "\n", + "df_transposed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "open-parser", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/open_parser/base.py b/open_parser/base.py index 01e2b94..0f1a658 100644 --- a/open_parser/base.py +++ b/open_parser/base.py @@ -31,7 +31,7 @@ def extract(self, file_path): def parse(self, file_path, prompt, mode="advanced"): user_id, job_id, s3_key = self._request_and_upload_by_apiKey(file_path) result = self._request_info_extraction(user_id, job_id, s3_key, mode, prompt) - return result["results"] + return result def _error_handler(self, response): if response.status_code == 403: From b6fd97e1e94a893b3bcc8afb52101edbc9daf0f7 Mon Sep 17 00:00:00 2001 From: Rachel Hu Date: Thu, 11 Apr 2024 00:16:55 -0700 Subject: [PATCH 2/2] Bump up version to 0.0.5. --- open_parser/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/open_parser/__init__.py b/open_parser/__init__.py index 7690287..8b6e08d 100644 --- a/open_parser/__init__.py +++ b/open_parser/__init__.py @@ -2,4 +2,4 @@ __all__ = ["OpenParser"] -__version__ = "0.0.4" +__version__ = "0.0.5" diff --git a/pyproject.toml b/pyproject.toml index ce23bd4..9ed35d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "open-parser" -version = "0.0.4" +version = "0.0.5" description = "Open parser for all." authors = ["CambioML "] maintainers = ["Rachel Hu "]