Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update extract with both basic and advanced mode #9

Merged
merged 5 commits into from
Apr 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 47 additions & 42 deletions examples/prompt_to_extract_table_from_pdf_to_json.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_67864/3281231558.py:2: DeprecationWarning: \n",
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
"but was not found to be installed on your system.\n",
"If this would cause problems for you,\n",
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
" \n",
" import pandas as pd\n"
]
}
],
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
Expand Down Expand Up @@ -70,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -91,7 +76,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -108,33 +93,38 @@
"example_prompt = \"Return table in a JSON format with each box's key and value.\"\n",
"\n",
"op = OpenParser(example_apikey)\n",
"qa_result = op.parse(example_local_file, example_prompt)\n"
"# mode can be \"basic\" or \"advanced\"\n",
"qa_result = op.parse(example_local_file, example_prompt, mode=\"basic\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'result': [{\"Employee's social security number\": '758-58-5787'},\n",
" {'Employer identification number (EIN)': '78-8778788'},\n",
" {\"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133'},\n",
" {'Control number': '9'},\n",
" {\"Employee's first name and initial\": 'Jesan'},\n",
" {'Last name': 'Rahaman'},\n",
" {\"State, Employer's state ID number\": 'AL,877878878'},\n",
" {'State wages, tips, etc.': '80000.00'},\n",
" {'Federal income tax withheld': '3835.00'}],\n",
"[{'result': [{\"Employee's social security number\": '758-58-5787',\n",
" 'Employer identification number (EIN)': '78-8778788',\n",
" \"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n",
" 'Control number': '9',\n",
" \"Employee's first name and initial\": 'Jesan',\n",
" 'Last name': 'Rahaman',\n",
" \"State, Employer's state ID number\": 'AL',\n",
" 'Wages, tips, etc.': '80000.00',\n",
" 'Federal income tax withheld': '10368.00',\n",
" 'Social security tax withheld': '4960.00',\n",
" 'Medicare wages and tips': '80000.00',\n",
" 'Medicare tax withheld': '1160.00'}],\n",
" 'log': {'instruction': \"Return table in a JSON format with each box's key and value.\",\n",
" 'source': '',\n",
" 'usage': {'input_tokens': 1750, 'output_tokens': 232}},\n",
" 'usage': {'input_tokens': 1752, 'output_tokens': 226},\n",
" 'source_log': None},\n",
" 'page_num': 0}]"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -145,7 +135,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -199,15 +189,27 @@
" </tr>\n",
" <tr>\n",
" <th>State, Employer's state ID number</th>\n",
" <td>AL,877878878</td>\n",
" <td>AL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>State wages, tips, etc.</th>\n",
" <th>Wages, tips, etc.</th>\n",
" <td>80000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Federal income tax withheld</th>\n",
" <td>3835.00</td>\n",
" <td>10368.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Social security tax withheld</th>\n",
" <td>4960.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Medicare wages and tips</th>\n",
" <td>80000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Medicare tax withheld</th>\n",
" <td>1160.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -221,20 +223,23 @@
"Control number 9\n",
"Employee's first name and initial Jesan\n",
"Last name Rahaman\n",
"State, Employer's state ID number AL,877878878\n",
"State wages, tips, etc. 80000.00\n",
"Federal income tax withheld 3835.00"
"State, Employer's state ID number AL\n",
"Wages, tips, etc. 80000.00\n",
"Federal income tax withheld 10368.00\n",
"Social security tax withheld 4960.00\n",
"Medicare wages and tips 80000.00\n",
"Medicare tax withheld 1160.00"
]
},
"execution_count": 11,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = qa_result[0]['result']\n",
"keys = [list(item.keys())[0] for item in data]\n",
"values = [list(item.values())[0] for item in data]\n",
"keys = [list(item.keys()) for item in data][0]\n",
"values = [list(item.values()) for item in data][0]\n",
"\n",
"# Create a DataFrame\n",
"df = pd.DataFrame(values, index=keys, columns=['Value'])\n",
Expand Down
Loading
Loading