forked from camelot-dev/camelot
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🧑🎓 add parser comparison notebook ✨
- Loading branch information
Showing
3 changed files
with
272 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,263 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Parser comparison\n", | ||
"\n", | ||
"This notebook lets you visualize side-by-side how each parser analyzes a document, and compare the resulting tables.\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Setup pypdf_table_extraction" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"os.getcwd()\n", | ||
"# Install from source\n", | ||
"!git clone -b main https://github.com/py-pdf/pypdf_table_extraction.git src\n", | ||
"%cd src\n", | ||
"\n", | ||
"\n", | ||
"!pip install -e .\n", | ||
"\n", | ||
"# Optionally you can Install ghostscript as the imageconversion backend.\n", | ||
"# uncomment the following lines\n", | ||
"# !apt-get install -y ghostscript\n", | ||
"# !pip install ghostscript" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Bootstrap and common imports\n", | ||
"import sys, time\n", | ||
"sys.path.insert(0, os.path.abspath('')) # Prefer the local version of pypdf_table_extraction if available\n", | ||
"import pypdf_table_extraction\n", | ||
"\n", | ||
"print(f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Select a PDF file to review\n", | ||
"\n", | ||
"You can modify the section below to point to a pdf or your choice to visualize the results. By default, it points to one of the test .pdfs included with pypdf_table_extraction.\n", | ||
"This is seeded with the unit test files for convenience." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"kwargs = {}\n", | ||
"data = None\n", | ||
"# pdf_file = \"vertical_header.pdf\" # test_network_vertical_header\n", | ||
"# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"all\"} # test_network_vertical_headerpages\n", | ||
"# pdf_file, kwargs = \"background_lines_1.pdf\", {\"process_background\": True} # {\"process_background\": True} # test_lattice_process_background\n", | ||
"\n", | ||
"# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_network_flag_size\n", | ||
"# pdf_file, kwargs = \"superscript.pdf\", {\"flag_size\": True} # , data_stream_flag_size # test_network_flag_size\n", | ||
"# pdf_file = \"health.pdf\" # test_network\n", | ||
"# pdf_file = \"clockwise_table_2.pdf\"\n", | ||
"# pdf_file = \"clockwise_table_1.pdf\"\n", | ||
"# pdf_file = \"foo.pdf\"\n", | ||
"# pdf_file, kwargs = \"saturation_threshold.pdf\", {\"process_color_background\": False, \"process_background\": True, \"saturation_threshold\": 5, \"threshold_blocksize\": 25} # \"process_background\": True,\n", | ||
"\n", | ||
"# pdf_file = \"birdisland.pdf\"\n", | ||
"# pdf_file, kwargs = \"diesel_engines.pdf\", {\"pages\": \"4-5\"} # containing multiple pages 2-4 = hybrid error same for 3-4,2-3\n", | ||
"\n", | ||
"# pdf_file, kwargs = \"column_span_1.pdf\", {\"copy_text\": \"h\"}\n", | ||
"# pdf_file = \"tabula/12s0324.pdf\" # interesting because contains two separate tables\n", | ||
"# pdf_file, kwargs = \"tabula/12s0324.pdf\", {\"strip_text\": \" ,\\n\"} # interesting because contains two separate tables\n", | ||
"# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_network_table_regions\n", | ||
"# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_areas\": [\"320,500,573,335\"]} # test_network_table_areas\n", | ||
"# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"} # data_stream_strip_text\n", | ||
"# pdf_file = \"detect_vertical_false.pdf\" #\n", | ||
"# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text # data_stream_split_text\n", | ||
"# pdf_file, kwargs= \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, } # , data_stream_split_text # data_stream_split_text\n", | ||
"\n", | ||
"# pdf_file = \"clockwise_table_2.pdf\" # test_network_table_rotated / test_stream_table_rotated\n", | ||
"# pdf_file, kwargs = \"clockwise_table_2.pdf\", {\"edge_tol\": 10} # configurable vgap header search not working\n", | ||
"# edge_tol 0 gives an error\n", | ||
"pdf_file = \"vertical_header.pdf\"\n", | ||
"\n", | ||
"# pdf_file = \"twotables_2.pdf\"\n", | ||
"# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n", | ||
"# pdf_file = \"multiple_tables.pdf\" # fixes issue 132\n", | ||
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n", | ||
"# pdf_file, kwargs = \"edge_tol.pdf\", {\"edge_tol\": 500} # , data_stream_edge_tol\n", | ||
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n", | ||
"\n", | ||
"# pdf_file = \"tabula/arabic.pdf\"\n", | ||
"# pdf_file = \"tabula/indictb1h_14.pdf\" # interesting mixed type table\n", | ||
"# pdf_file = \"tabula/m27.pdf\" # one table spanning multiple pages\n", | ||
"# pdf_file = \"tabula/mednine.pdf\" # one table spanning multiple pages\n", | ||
"\n", | ||
"# pdf_file = \"tabula/spreadsheet_no_bounding_frame.pdf\n", | ||
"# pdf_file, kwargs = \"diesel_engines.pdf\", {\"pages\": \"4-5\"} # containing multiple pages\n", | ||
"\n", | ||
"# pdf_file, kwargs = \"tabula/schools.pdf\", {\"pages\": \"all\"} # network parser hangs on contour plot\n", | ||
"\n", | ||
"filename = os.path.join(\n", | ||
" os.path.dirname(os.path.abspath('.')),\n", | ||
" \"src/tests/files\",\n", | ||
" pdf_file\n", | ||
")\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"FLAVORS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n", | ||
"tables_parsed = {}\n", | ||
"parses = {}\n", | ||
"max_tables = 0\n", | ||
"for idx, flavor in enumerate(FLAVORS):\n", | ||
" timer_before_parse = time.perf_counter()\n", | ||
" error, tables = None, []\n", | ||
" try:\n", | ||
" tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n", | ||
" except ValueError as value_error:\n", | ||
" error = f\"Invalid argument for parser {flavor}: {value_error}\"\n", | ||
" print(error)\n", | ||
" timer_after_parse = time.perf_counter()\n", | ||
" max_tables = max(max_tables, len(tables))\n", | ||
"\n", | ||
" parses[flavor] = {\n", | ||
" \"tables\": tables,\n", | ||
" \"time\": timer_after_parse - timer_before_parse,\n", | ||
" \"error\": error\n", | ||
" }\n", | ||
"\n", | ||
" print(f\"##### {flavor} ####\")\n", | ||
" print(f\"Found {len(tables)} table(s):\")\n", | ||
" for idx, table in enumerate(tables):\n", | ||
" flavors_matching = []\n", | ||
" for previous_flavor, previous_tables in tables_parsed.items():\n", | ||
" for prev_idx, previous_table in enumerate(previous_tables):\n", | ||
" if previous_table.df.equals(table.df):\n", | ||
" flavors_matching.append(\n", | ||
" f\"{previous_flavor} table {prev_idx}\")\n", | ||
" print(f\"## Table {idx} ##\")\n", | ||
" if flavors_matching:\n", | ||
" print(f\"Same as {', '.join(flavors_matching)}.\")\n", | ||
" else:\n", | ||
" display(table.df)\n", | ||
" print(\"\")\n", | ||
" tables_parsed[flavor] = tables\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Show tables layout within original document" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"# Set up plotting options\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"%matplotlib inline\n", | ||
"PLOT_HEIGHT = 12\n", | ||
"\n", | ||
"row_count = max(max_tables, 1)\n", | ||
"plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(FLAVORS), PLOT_HEIGHT * row_count]\n", | ||
"fig, axes = plt.subplots(row_count, len(FLAVORS))\n", | ||
"plt.subplots_adjust(wspace=0, hspace=0) # Reduce margins to maximize the display zone\n", | ||
"\n", | ||
"fig.suptitle('Side-by-side flavor comparison', fontsize=24, fontweight='bold')\n", | ||
"for idx, flavor in enumerate(FLAVORS):\n", | ||
" parse = parses[flavor]\n", | ||
" tables = parse[\"tables\"]\n", | ||
" top_ax = axes.flat[idx]\n", | ||
" title = f\"{flavor}\\n\" \\\n", | ||
" f\"Detected {len(tables)} table(s) in {parse['time']:.2f}s\"\n", | ||
" if parse['error']:\n", | ||
" title = title + f\"\\nError parsing: {parse['error']}\"\n", | ||
" top_ax.set_title(title, fontsize=12, fontweight='bold')\n", | ||
" for table_idx, table in enumerate(tables):\n", | ||
" if max_tables > 1:\n", | ||
" ax = axes[table_idx][idx]\n", | ||
" else:\n", | ||
" ax = axes[idx]\n", | ||
" # Check if the table has data before attempting to plot it\n", | ||
" if table.shape[0] > 0 and table.shape[1] > 0: # Check if table has rows and columns\n", | ||
" fig = camelot.plot(table, kind='grid', ax=ax)\n", | ||
" ax.text(\n", | ||
" 0.5, -0.1,\n", | ||
" \"{flavor} table {table_idx} - {rows}x{cols}\".format(\n", | ||
" flavor=flavor,\n", | ||
" table_idx=table_idx,\n", | ||
" rows=table.shape[0],\n", | ||
" cols=table.shape[1],\n", | ||
" ),\n", | ||
" size=14, ha=\"center\",\n", | ||
" transform=ax.transAxes\n", | ||
" )\n", | ||
" else:\n", | ||
" print(f\"Skipping plotting for empty table {table_idx} in {flavor}\") # Inform user about the skipped table\n", | ||
" timer_after_plot = time.perf_counter()\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"file_extension": ".py", | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.5" | ||
}, | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"npconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": 3 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |