From df35343dfb868196747e66de68f31ea25b2a1770 Mon Sep 17 00:00:00 2001 From: Rachel Hu Date: Tue, 7 May 2024 14:37:29 -0700 Subject: [PATCH] move notebook under example folder --- ...ing_Revenue_Report_(March_2024)_page35.pdf | Bin ...ing_Revenue_Report_(March_2024)_page43.pdf | Bin ...ming_Revenue_Report_(March_2024)_page8.pdf | Bin examples/pdf_to_html_to_excel.ipynb | 376 ++++++++++++++++++ 4 files changed, 376 insertions(+) rename {demo => examples/nevada_gaming_revenue_reports}/Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf (100%) rename {demo => examples/nevada_gaming_revenue_reports}/Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf (100%) rename {demo => examples/nevada_gaming_revenue_reports}/Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf (100%) create mode 100644 examples/pdf_to_html_to_excel.ipynb diff --git a/demo/Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf b/examples/nevada_gaming_revenue_reports/Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf similarity index 100% rename from demo/Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf rename to examples/nevada_gaming_revenue_reports/Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf diff --git a/demo/Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf b/examples/nevada_gaming_revenue_reports/Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf similarity index 100% rename from demo/Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf rename to examples/nevada_gaming_revenue_reports/Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf diff --git a/demo/Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf b/examples/nevada_gaming_revenue_reports/Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf similarity index 100% rename from demo/Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf rename to examples/nevada_gaming_revenue_reports/Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf diff --git a/examples/pdf_to_html_to_excel.ipynb b/examples/pdf_to_html_to_excel.ipynb new 
file mode 100644 index 0000000..7d9faf9 --- /dev/null +++ b/examples/pdf_to_html_to_excel.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extract a Table from PDF into Excel\n", + "\n", + "Below is an example of using AnyParser to extract a complicated table from a financial report (PDF) into an Excel spreadsheet. The sample data is from https://gaming.nv.gov/about/gaming-revenue/information/\n", + "\n", + "\n", + "## 1. Load the libraries\n", + "\n", + "If you have not installed `any_parser` and the other dependencies yet, uncomment the lines below." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip3 install python-dotenv\n", + "# !pip3 install --upgrade any-parser\n", + "# !pip3 install openpyxl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Import and update path" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_92921/3792744553.py:2: DeprecationWarning: \n", + "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", + "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", + "but was not found to be installed on your system.\n", + "If this would cause problems for you,\n", + "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", + " \n", + " import pandas as pd\n" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "from dotenv import load_dotenv\n", + "from any_parser import AnyParser\n", + "from IPython.display import HTML, display" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.
import os
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO


def html_to_excel(html_string, output_folder, output_filename):
    """Write every ``<table>`` found in an HTML string to one Excel workbook.

    Each table is converted to a DataFrame with ``pd.read_html`` and written
    to its own sheet, named ``Table_1``, ``Table_2``, ... in the order the
    tables are returned by ``BeautifulSoup.find_all``. The path of the saved
    workbook is printed when the write completes.

    Args:
        html_string: HTML markup expected to contain at least one ``<table>``.
        output_folder: Directory to write the workbook into. It is created
            (including parents) when missing. An empty string means the
            current working directory.
        output_filename: File name of the workbook, e.g. ``"report.xlsx"``.

    Raises:
        ValueError: If ``html_string`` contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    tables = soup.find_all('table')

    # Fail early with a clear message: a workbook without any sheet cannot be
    # saved, and checking here avoids creating the output folder for nothing.
    if not tables:
        raise ValueError("No <table> element found in the provided HTML.")

    # exist_ok=True replaces the racy "check, then create" sequence; the guard
    # skips os.makedirs(""), which raises for an empty (current-dir) folder.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Parse each table on its own so every <table> maps to exactly one sheet.
    sheets = {
        f"Table_{position}": pd.read_html(StringIO(str(table)))[0]
        for position, table in enumerate(tables, start=1)
    }

    output_file = os.path.join(output_folder, output_filename)
    with pd.ExcelWriter(output_file) as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Excel file saved to {output_file}")
Summary Current Month- March-2024 Three Months - 1/1/2024 to 3/31/2024 Twelve Months - 4/1/2023 to3/31/2024
Number of Reporting Licensees - 22 Number of ReportingLicensees - 23 Number of ReportingLicensees -24
Unit Description # Of Loc*# Of UnitsWin** Amount % Chg Win Percent# Of Loc*Avg UnitsWin** Amount% Chg Win Percent# Of Loc*Avg UnitsWin** Amount% Chg Win Percent
Table, Counter and Card Games
Twenty One 13 212 6,746 (9.94) 14.44 13 207 17,527 (12.25) 14.31 13 205 65,917 (1.80) 15.09
Craps 12 36 2,771 (3.34) 13.24 13 35 7,538 (14.01) 13.48 13 35 29,633 (10.80) 13.68
Roulette 12 37 2,268 4.14 20.70 12 36 5,985 3.30 20.65 12 36 21,031 1.09 19.75
Ultim Texas Hold'em 6 10 932 15.76 27.40 6 10 2,533 12.78 27.55 7 9 8,611 25.64 25.39
3-Card Poker 13 17 1,068 (15.43) 27.19 13 17 2,950 (16.27) 27.67 13 17 10,689 (12.46) 26.77
Mini-Baccarat 4 10 125 (79.47) 2.08 4 10 1,329 (54.94) 7.14 4 11 7,866 (38.08) 8.05
Keno 3 3 114 (27.92) 23.39 3 3 368 (23.18) 22.70 3 3 1,721 (14.16) 26.67
Let It Ride 7 8 368 13.77 23.97 7 8 862 (14.96) 20.82 7 7 3,672 (16.19) 23.17
Pai Gow Poker 8 10 35 (92.74) 1.45 8 10 1,123 (21.99) 15.71 9 10 5,280 19.80 18,91
Race Book (1) 10 10 39 (21.56) 16.04 11 10 110 (8.99) 16.45 12 9 529 (15.59) 8.56
Sports Pool (2) 13 13 10,330 (15.80) 5.17 13 13 37,703 31.82 6.31 13 13 119,348 5.20 5.31
Other 31 1,342 (12.47) 24.55 31 4,013 (12.35) 27.17 30 16,861 2,36 29.26
Total 20 397 26,137 (12.94) 8.65 20 390 82,041 3.22 9.41 20 385 291,158 (1.05) 8.84
Slot Machines
1 Cent 15 3,594 16,156 (29.61) 10.15 15 3,734 49,169 (27.00) 11.26 17 4,039 215,449 (17.77) 11.02
5 Cent 11 89 298 (6.80) 8.39 11 87 792 (12.84) 7.83 11 90 3,142 (6.65) 7.45
25 Cent 13 522 2,127 (26.73) 6.67 13 538 6,602 (23.91) 7.32 13 555 27,931 (14.69) 7.23
1 Dollar 13 639 3,674 (32.45) 5.78 13 658 11,940 (19.00) 6.64 13 671 47,908 (14.14) 6.48
5 Dollars 11 56 429 (41.91) 6.32 11 55 1,788 (18.58) 8.97 12 57 6,805 (17.65) 7.72
25 Dollars 4 7 43 (695.28) 10.56 4 7 162 25.57 11.55 4 6 552 49.73 8.85
100 Dollars 3 5 92 (73.91) 8.24 3 5 337 (43.21) 9,01 3 5 1,375 (21.01) 7.86
Multi Denomination 16 5,170 26,827 11.16 6.84 17 5,011 84,117 18.79 7.97 17 4,640 301,461 26.24 7.66
Other 52 416 (28.22) 53 1,220 (24.92) 61 5,436 (11.07)
Total 16 10,134 50,061 (12.80) 7.55 17 10,148 156,128 (6.53) 8.63 18 10,124 610,059 0.14 8.44
Total Gaming 76,199 (12.85) 238,168 (3.39) 901,217 (0.25)
(1) Race Book Parimutuel3 3 39 (19.97) 16.55 4 4 111 (8.34) 16.64 5 5 513 (12.78) 16.03
Sports Mobile 5 9,970 (10.82) 5.62 6 32,930 23.38 6.22 6 104,473 5.65 5.07
(2) Sports Football 9 9 (957) (2.80) (1,168.50) 13 13 8,527 204.02 7.74 13 13 36,108 14.16 5.46
Sports Basketball 13 13 8,097 (17.42) 5.61 13 13 19,155 7.14 5.45 13 13 37,142 3.37 5.56
Sports Baseball 13 13 184 (46.24) 1.87 13 13 (175) (181.64) (1.67) 13 13 19,839 (11.04) 4.23
Sports Parlay Cards 3 3 1 (93.65) 24.34 10 10 35 (81.99) 41.68 12 12 308 (78.52) 31.55
Sports Hockey 13 13 639 (47.25) 4.48 13 13 3,035 (0.92) 7.98 13 13 4,700 (36.73) 3.92
Other 13 2,367 25.73 7.51 13 7,127 60.19 8.17 13 21,251 44.36 6.48
# --- 5.1 sample: March_2024 page 8 ---------------------------------------
# Send the single-page PDF to AnyParser and render the first returned
# element as HTML inside the notebook.
sample_input_folder = "nevada_gaming_revenue_reports"
sample_page8 = "Nevada_Gaming_Revenue_Report_(March_2024)_page8"
sample_page8_result = op.parse(f"./{sample_input_folder}/{sample_page8}.pdf")

display(HTML(sample_page8_result[0]))

# Export the tables contained in that HTML to "<output_folder>/<name>.xlsx".
output_folder = 'output'
sample_page8_output = f"{sample_page8}.xlsx"
html_to_excel(sample_page8_result[0], output_folder, sample_page8_output)
Summary Current Month -March-2024 Three Months - 1/1/2024to 3/31/2024 Twelve Months - 4/1/2023to 3/31/2024
Number of Reporting Licensees - 5 Number of Reporting Licensees - 5 Number of Reporting Licensees - 5
Unit Description# Of Loc*# Of UnitsWin** Amount % Chg Win Percent# Of Loc*Avg UnitsWin** Amount % Chg Win Percent# Of Loc*Avg UnitsWin** Amount % Chg Win Percent
# --- 5.2 sample: March_2024 page 35 --------------------------------------
# Reuses `sample_input_folder` defined by the page 8 cell above.
sample_page35 = "Nevada_Gaming_Revenue_Report_(March_2024)_page35"
sample_page35_result = op.parse(f"./{sample_input_folder}/{sample_page35}.pdf")

display(HTML(sample_page35_result[0]))

# Export the tables contained in that HTML to "<output_folder>/<name>.xlsx".
output_folder = 'output'
sample_page35_output = f"{sample_page35}.xlsx"
html_to_excel(sample_page35_result[0], output_folder, sample_page35_output)
Summary Current Month- March-2024 Three Months - 1/1/2024 to 3/31/2024 Twelve Months - 4/1/2023 to 3/31/2024
Number of Reporting Licensees - 13 Number of ReportingLicensees - 13 Number of ReportingLicensees - 13
Unit Description # Of Loc*# Of UnitsWin** Amount % Chg Win Percent# Of Loc*Avg UnitsWin** Amount% Chg Win Percent# Of Loc*Avg UnitsWin** Amount% Chg Win Percent
Table, Counter and CardGames
Other 2 6 (35.98) 38.17 2 19 51.18 37.22 2 41 (25.01) 26.35
Slot Machines
1 Cent 7 224 440 (20.66) 7.51 7 226 1,303 (17.26) 7.76 7 233 5,617 (10.36) 7.63
25 Cent 4 6 8 51.28 10.82 4 6 29 47.38 13.79 4 6 134 25.40 13.48
1 Dollar 4 9 5 (64.87) 2.76 4 9 30 (24.33) 4.47 4 9 161 4.16 5.11
Multi Denomination 12 499 916 7.88 7.06 12 499 2,549 3.84 6.79 12 488 10,675 7.20 6.79
Other 0 (34.95) (1) (47.76) (6) (29.00)
Total 12 738 1,369 (3.79) 7.17 12 740 3,909 (4.32) 7.08 12 736 16,581 0.63 7.06
Total Gaming 1,375 (4.00) 3,928 (4.16) 16,622 0.55
# --- 5.3 sample: March_2024 page 43 --------------------------------------
# Reuses `sample_input_folder` defined by the page 8 cell above.
sample_page43 = "Nevada_Gaming_Revenue_Report_(March_2024)_page43"
sample_page43_result = op.parse(f"./{sample_input_folder}/{sample_page43}.pdf")

display(HTML(sample_page43_result[0]))

# Export the tables contained in that HTML to "<output_folder>/<name>.xlsx".
output_folder = 'output'
sample_page43_output = f"{sample_page43}.xlsx"
html_to_excel(sample_page43_result[0], output_folder, sample_page43_output)