Skip to content

Commit

Permalink
Merge pull request #9 from CambioML/dev
Browse files Browse the repository at this point in the history
Update extract with both basic and advanced mode
  • Loading branch information
Cambio ML authored Apr 7, 2024
2 parents 3bf2b1e + d844d90 commit e82835b
Show file tree
Hide file tree
Showing 5 changed files with 419 additions and 47 deletions.
89 changes: 47 additions & 42 deletions examples/prompt_to_extract_table_from_pdf_to_json.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_67864/3281231558.py:2: DeprecationWarning: \n",
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
"but was not found to be installed on your system.\n",
"If this would cause problems for you,\n",
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
" \n",
" import pandas as pd\n"
]
}
],
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
Expand Down Expand Up @@ -70,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -91,7 +76,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -108,33 +93,38 @@
"example_prompt = \"Return table in a JSON format with each box's key and value.\"\n",
"\n",
"op = OpenParser(example_apikey)\n",
"qa_result = op.parse(example_local_file, example_prompt)\n"
"# mode can be \"basic\" or \"advanced\"\n",
"qa_result = op.parse(example_local_file, example_prompt, mode=\"basic\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'result': [{\"Employee's social security number\": '758-58-5787'},\n",
" {'Employer identification number (EIN)': '78-8778788'},\n",
" {\"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133'},\n",
" {'Control number': '9'},\n",
" {\"Employee's first name and initial\": 'Jesan'},\n",
" {'Last name': 'Rahaman'},\n",
" {\"State, Employer's state ID number\": 'AL,877878878'},\n",
" {'State wages, tips, etc.': '80000.00'},\n",
" {'Federal income tax withheld': '3835.00'}],\n",
"[{'result': [{\"Employee's social security number\": '758-58-5787',\n",
" 'Employer identification number (EIN)': '78-8778788',\n",
" \"Employer's name, address, and ZIP code\": 'DesignNext\\nKatham Dorbosto, Kashiani, Gopalganj\\nGopalganj, AK 8133',\n",
" 'Control number': '9',\n",
" \"Employee's first name and initial\": 'Jesan',\n",
" 'Last name': 'Rahaman',\n",
" \"State, Employer's state ID number\": 'AL',\n",
" 'Wages, tips, etc.': '80000.00',\n",
" 'Federal income tax withheld': '10368.00',\n",
" 'Social security tax withheld': '4960.00',\n",
" 'Medicare wages and tips': '80000.00',\n",
" 'Medicare tax withheld': '1160.00'}],\n",
" 'log': {'instruction': \"Return table in a JSON format with each box's key and value.\",\n",
" 'source': '',\n",
" 'usage': {'input_tokens': 1750, 'output_tokens': 232}},\n",
" 'usage': {'input_tokens': 1752, 'output_tokens': 226},\n",
" 'source_log': None},\n",
" 'page_num': 0}]"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -145,7 +135,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -199,15 +189,27 @@
" </tr>\n",
" <tr>\n",
" <th>State, Employer's state ID number</th>\n",
" <td>AL,877878878</td>\n",
" <td>AL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>State wages, tips, etc.</th>\n",
" <th>Wages, tips, etc.</th>\n",
" <td>80000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Federal income tax withheld</th>\n",
" <td>3835.00</td>\n",
" <td>10368.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Social security tax withheld</th>\n",
" <td>4960.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Medicare wages and tips</th>\n",
" <td>80000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Medicare tax withheld</th>\n",
" <td>1160.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -221,20 +223,23 @@
"Control number 9\n",
"Employee's first name and initial Jesan\n",
"Last name Rahaman\n",
"State, Employer's state ID number AL,877878878\n",
"State wages, tips, etc. 80000.00\n",
"Federal income tax withheld 3835.00"
"State, Employer's state ID number AL\n",
"Wages, tips, etc. 80000.00\n",
"Federal income tax withheld 10368.00\n",
"Social security tax withheld 4960.00\n",
"Medicare wages and tips 80000.00\n",
"Medicare tax withheld 1160.00"
]
},
"execution_count": 11,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = qa_result[0]['result']\n",
"keys = [list(item.keys())[0] for item in data]\n",
"values = [list(item.values())[0] for item in data]\n",
"keys = [list(item.keys()) for item in data][0]\n",
"values = [list(item.values()) for item in data][0]\n",
"\n",
"# Create a DataFrame\n",
"df = pd.DataFrame(values, index=keys, columns=['Value'])\n",
Expand Down
Loading

0 comments on commit e82835b

Please sign in to comment.