diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 79443043..97ac4053 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,7 +28,7 @@ jobs: - { python: "3.10", os: "macos-latest", session: "tests" } - { python: "3.10", os: "ubuntu-latest", session: "typeguard" } - { python: "3.10", os: "ubuntu-latest", session: "xdoctest" } - - { python: "3.8", os: "ubuntu-latest", session: "docs-build" } + - { python: "3.10", os: "ubuntu-latest", session: "docs-build" } env: NOXSESSION: ${{ matrix.session }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 387e935f..00000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,117 +0,0 @@ -# Contributor Guide - -Thank you for your interest in improving this project. -This project is open-source under the [MIT license] and -welcomes contributions in the form of bug reports, feature requests, and pull requests. - -Kenneth Reitz has also written an [essay](https://www.kennethreitz.org/essays/2013/01/27/be-cordial-or-be-on-your-way) on this topic, which you should read. - -Here is a list of important resources for contributors: - -- [Source Code] -- [Documentation] -- [Issue Tracker] -- [Code of Conduct] - -[mit license]: https://opensource.org/licenses/MIT -[source code]: https://github.com/camelot-dev/camelot -[documentation]: https://camelot-py.readthedocs.io/ -[issue tracker]: https://github.com/camelot-dev/camelot/issues - -## How to report a bug - -Report bugs on the [Issue Tracker]. - -When filing an issue, make sure to answer these questions: - -- Which operating system and Python version are you using? -- Which version of this project are you using? -- What did you do? -- What did you expect to see? -- What did you see instead? - -The best way to get your bug fixed is to provide a test case, -and/or steps to reproduce the issue. - -## How to request a feature - -Request features on the [Issue Tracker]. 
- -## How to set up your development environment - -You need Python 3.8+ and the following tools: - -- [Poetry] -- [Nox] -- [nox-poetry] - -Install the package with development requirements: - -```console -$ poetry install -``` - -You can now run an interactive Python session, -or the command-line interface: - -```console -$ poetry run python -$ poetry run camelot -``` - -[poetry]: https://python-poetry.org/ -[nox]: https://nox.thea.codes/ -[nox-poetry]: https://nox-poetry.readthedocs.io/ - -## How to test the project - -Run the full test suite: - -```console -$ nox -``` - -List the available Nox sessions: - -```console -$ nox --list-sessions -``` - -You can also run a specific Nox session. -For example, invoke the unit test suite like this: - -```console -$ nox --session=tests -``` - -Unit tests are located in the _tests_ directory, -and are written using the [pytest] testing framework. - -[pytest]: https://pytest.readthedocs.io/ - -## How to submit changes - -Open a [pull request] to submit changes to this project. - -Your pull request needs to meet the following guidelines for acceptance: - -- The Nox test suite must pass without errors and warnings. -- Include unit tests. This project maintains 100% code coverage. -- If your changes add functionality, update the documentation accordingly. - -Feel free to submit early, though—we can always iterate on this. - -To run linting and code formatting checks before committing your change, you can install pre-commit as a Git hook by running the following command: - -```console -$ nox --session=pre-commit -- install -``` - -It is recommended to open an issue before starting work on anything. -This will allow a chance to talk it over with the owners and validate your approach. 
- -[pull request]: https://github.com/camelot-dev/camelot/pulls - - - -[code of conduct]: CODE_OF_CONDUCT.md diff --git a/LICENSE b/LICENSE index 2435efae..8c606c58 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License -Copyright (c) 2019-2021 Camelot Developers +Copyright (c) 2024 pypdf_table_extraction Developers +Copyright (c) 2019-2023 Camelot Developers Copyright (c) 2018-2019 Peeply Private Ltd (Singapore) Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/README.md b/README.md index a496e6d2..6835ecd9 100644 --- a/README.md +++ b/README.md @@ -2,20 +2,17 @@

-# Camelot: PDF Table Extraction for Humans +# pypdf_table_extraction (Camelot): PDF Table Extraction for Humans -[![tests](https://github.com/camelot-dev/camelot/actions/workflows/tests.yml/badge.svg)](https://github.com/camelot-dev/camelot/actions/workflows/tests.yml) [![Documentation Status](https://readthedocs.org/projects/camelot-py/badge/?version=master)](https://camelot-py.readthedocs.io/en/master/) -[![codecov.io](https://codecov.io/github/camelot-dev/camelot/badge.svg?branch=master&service=github)](https://codecov.io/github/camelot-dev/camelot?branch=master) -[![image](https://img.shields.io/pypi/v/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![image](https://img.shields.io/pypi/l/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![image](https://img.shields.io/pypi/pyversions/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![Gitter chat](https://badges.gitter.im/camelot-dev/Lobby.png)](https://gitter.im/camelot-dev/Lobby) -[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) +[![tests](https://github.com/py-pdf/pypdf_table_extraction/actions/workflows/tests.yml/badge.svg)](https://github.com/py-pdf/pypdf_table_extraction/actions/workflows/tests.yml) [![Documentation Status](https://readthedocs.org/projects/pypdf-table-extraction/badge/?version=latest)](https://pypdf-table-extraction.readthedocs.io/en/latest/) +[![codecov.io](https://codecov.io/github/py-pdf/pypdf_table_extraction/badge.svg?branch=main&service=github)](https://codecov.io/github/py-pdf/pypdf_table_extraction/?branch=main) +[![image](https://img.shields.io/pypi/v/pypdf-table-extraction.svg)](https://pypi.org/project/pypdf-table-extraction/) [![image](https://img.shields.io/pypi/l/pypdf-table-extraction.svg)](https://pypi.org/project/pypdf-table-extraction/) [![image](https://img.shields.io/pypi/pyversions/pypdf-table-extraction.svg)](https://pypi.org/project/pypdf-table-extraction/) -**Camelot** is a Python library 
that can help you extract tables from PDFs! - -**Note:** You can also check out [Excalibur](https://github.com/camelot-dev/excalibur), the web interface to Camelot! +**pypdf_table_extraction** (formerly known as [Camelot](https://github.com/camelot-dev/camelot)) is a Python library that can help you extract tables from PDFs! --- -**Here's how you can extract tables from PDFs.** You can check out the PDF used in this example [here](https://github.com/camelot-dev/camelot/blob/master/docs/_static/pdf/foo.pdf). +**Here's how you can extract tables from PDFs.** You can check out the PDF used in this example [here](https://github.com/py-pdf/pypdf_table_extraction/blob/main/docs/_static/pdf/foo.pdf). ```python3 >>> import camelot @@ -45,53 +42,50 @@ | 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% | | 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% | -Camelot also comes packaged with a [command-line interface](https://camelot-py.readthedocs.io/en/master/user/cli.html)! +pypdf_table_extraction also comes packaged with a [command-line interface](https://pypdf-table-extraction.readthedocs.io/en/latest/user/cli.html)! -Refer to the [QuickStart Guide](https://github.com/py-pdf/pypdf_table_extraction/blob/main/docs/user/quickstart.rst#quickstart) to quickly get started with Camelot, extract tables from PDFs and explore some basic options. +Refer to the [QuickStart Guide](https://github.com/py-pdf/pypdf_table_extraction/blob/main/docs/user/quickstart.rst#quickstart) to quickly get started with pypdf_table_extraction, extract tables from PDFs and explore some basic options. -**Note:** Camelot only works with text-based PDFs and not scanned documents. (As Tabula [explains](https://github.com/tabulapdf/tabula#why-tabula), "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) +**Note:** pypdf_table_extraction only works with text-based PDFs and not scanned documents. 
(As Tabula [explains](https://github.com/tabulapdf/tabula#why-tabula), "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) -You can check out some frequently asked questions [here](https://camelot-py.readthedocs.io/en/master/user/faq.html). +You can check out some frequently asked questions [here](https://pypdf-table-extraction.readthedocs.io/en/latest/user/faq.html). -## Why Camelot? +## Why pypdf_table_extraction? -- **Configurability**: Camelot gives you control over the table extraction process with [tweakable settings](https://camelot-py.readthedocs.io/en/master/user/advanced.html). +- **Configurability**: pypdf_table_extraction gives you control over the table extraction process with [tweakable settings](https://pypdf-table-extraction.readthedocs.io/en/latest/user/advanced.html). - **Metrics**: You can discard bad tables based on metrics like accuracy and whitespace, without having to manually look at each table. - **Output**: Each table is extracted into a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873). You can also export tables to multiple formats, which include CSV, JSON, Excel, HTML, Markdown, and Sqlite. -See [comparison with similar libraries and tools](https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools). - -## Support the development - -If Camelot has helped you, please consider supporting its development with a one-time or monthly donation [on OpenCollective](https://opencollective.com/camelot). +See [comparison with similar libraries and tools](https://github.com/py-pdf/pypdf_table_extraction/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools). 
## Installation ### Using conda -The easiest way to install Camelot is with [conda](https://conda.io/docs/), which is a package manager and environment management system for the [Anaconda](http://docs.continuum.io/anaconda/) distribution. +The easiest way to install pypdf_table_extraction is with [conda](https://conda.io/docs/), which is a package manager and environment management system for the [Anaconda](http://docs.continuum.io/anaconda/) distribution. + ```bash -conda install -c conda-forge camelot-py +conda install -c conda-forge pypdf-table-extraction ``` ### Using pip -After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/bionic/python/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can also just use pip to install Camelot: +After [installing the dependencies](https://pypdf-table-extraction.readthedocs.io/en/latest/user/install-deps.html) ([tk](https://packages.ubuntu.com/bionic/python/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can also just use pip to install pypdf_table_extraction: ```bash -pip install camelot-py[base] +pip install pypdf-table-extraction[base] ``` ### From the source code -After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip), clone the repo using: +After [installing the dependencies](https://pypdf-table-extraction.readthedocs.io/en/latest/user/install.html#using-pip), clone the repo using: ```bash -git clone https://www.github.com/camelot-dev/camelot +git clone https://github.com/py-pdf/pypdf_table_extraction.git ``` -and install Camelot using pip: +and install using pip: ``` cd camelot @@ -100,7 +94,7 @@ pip install ".[base]" ## Documentation -The documentation is available at [http://camelot-py.readthedocs.io/](http://camelot-py.readthedocs.io/). 
+The documentation is available at [http://pypdf-table-extraction.readthedocs.io/](http://pypdf-table-extraction.readthedocs.io/). ## Wrappers @@ -112,12 +106,12 @@ The documentation is available at [http://camelot-py.readthedocs.io/](http://cam ## Contributing -The [Contributor's Guide](https://camelot-py.readthedocs.io/en/master/dev/contributing.html) has detailed information about contributing issues, documentation, code, and tests. +The [Contributor's Guide](https://pypdf-table-extraction.readthedocs.io/en/latest/dev/contributing.html) has detailed information about contributing issues, documentation, code, and tests. ## Versioning -Camelot uses [Semantic Versioning](https://semver.org/). For the available versions, see the tags on this repository. For the changelog, you can check out [HISTORY.md](https://github.com/camelot-dev/camelot/blob/master/HISTORY.md). +pypdf_table_extraction uses [Semantic Versioning](https://semver.org/). For the available versions, see the tags on this repository. For the changelog, you can check out the [releases](https://github.com/py-pdf/pypdf_table_extraction/releases) page. ## License -This project is licensed under the MIT License, see the [LICENSE](https://github.com/camelot-dev/camelot/blob/master/LICENSE) file for details. +This project is licensed under the MIT License, see the [LICENSE](https://github.com/py-pdf/pypdf_table_extraction/blob/main/LICENSE) file for details. diff --git a/docs/_templates/hacks.html b/docs/_templates/hacks.html deleted file mode 100644 index 960e1725..00000000 --- a/docs/_templates/hacks.html +++ /dev/null @@ -1,30 +0,0 @@ - diff --git a/docs/_templates/sidebarintro.html b/docs/_templates/sidebarintro.html deleted file mode 100644 index 470437c6..00000000 --- a/docs/_templates/sidebarintro.html +++ /dev/null @@ -1,24 +0,0 @@ - -

- -

- -

Useful Links

- diff --git a/docs/_templates/sidebarlogo.html b/docs/_templates/sidebarlogo.html deleted file mode 100644 index 12aa2992..00000000 --- a/docs/_templates/sidebarlogo.html +++ /dev/null @@ -1,15 +0,0 @@ - -

- -

diff --git a/docs/conf.py b/docs/conf.py index 368e0187..a9851dae 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,6 +1,4 @@ # -# camelot documentation build configuration file, created by -# sphinx-quickstart on Tue Jul 19 13:44:18 2016. # # This file is execfile()d with the current directory set to its # containing dir. @@ -23,8 +21,8 @@ # # sys.path.insert(0, os.path.abspath('..')) -# Insert Camelot's path into the system. -sys.path.insert(0, os.path.abspath("..")) +# Insert pypdf_table_extraction's path into the system. +sys.path.insert(0, os.path.abspath("../camelot")) sys.path.insert(0, os.path.abspath("_themes")) @@ -32,7 +30,6 @@ # If your documentation needs a minimal Sphinx version, state it here. # -# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -40,10 +37,10 @@ extensions = [ "sphinx.ext.autodoc", "sphinx.ext.napoleon", - "sphinx.ext.intersphinx", "sphinx.ext.todo", "sphinx.ext.viewcode", "sphinx_click", + "sphinx_copybutton", "myst_parser", ] @@ -64,9 +61,9 @@ master_doc = "index" # General information about the project. -project = "Camelot" -copyright = "2021, Camelot Developers" -author = "Vinayak Mehta" +project = "pypdf_table_extraction" +copyright = "2024, pypdf_table_extraction Developers" +author = "pypdf_table_extraction Developers" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -82,7 +79,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -127,23 +124,24 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. 
todo_include_todos = True +copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_is_regexp = True # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = "alabaster" +html_theme = "sphinx_book_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = { - "show_powered_by": False, - "github_user": "camelot-dev", - "github_repo": "camelot", - "github_banner": True, - "show_related": False, - "note_bg": "#FFF59C", + "repository_url": "https://github.com/py-pdf/pypdf_table_extraction", + "repository_branch": "main", + "path_to_docs": "/docs", + "use_repository_button": True, + # "launch_buttons": "dict to notebooks to launch", } # Add any paths that contain custom themes here, relative to this directory. @@ -161,7 +159,7 @@ # The name of an image file (relative to this directory) to place at the top # of the sidebar. # -# html_logo = None +html_logo = "_static/pypdf-table-extraction.png" # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 @@ -190,23 +188,7 @@ html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -html_sidebars = { - "index": [ - "sidebarintro.html", - "relations.html", - "sourcelink.html", - "searchbox.html", - "hacks.html", - ], - "**": [ - "sidebarlogo.html", - "localtoc.html", - "relations.html", - "sourcelink.html", - "searchbox.html", - "hacks.html", - ], -} +#html_sidebars = { } # Additional templates that should be rendered to pages, maps page names to # template names. @@ -262,7 +244,7 @@ # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. 
-htmlhelp_basename = "Camelotdoc" +htmlhelp_basename = "pypdf_table_extraction-doc" # -- Options for LaTeX output --------------------------------------------- @@ -285,7 +267,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, "Camelot.tex", "Camelot Documentation", "Vinayak Mehta", "manual"), + (master_doc, "pypdf-table-extraction.tex", "pypdf-table-extraction Documentation", "Vinayak Mehta", "manual"), ] # The name of an image file (relative to this directory) to place at the top of @@ -325,7 +307,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "Camelot", "Camelot Documentation", [author], 1)] +man_pages = [(master_doc, "pypdf_table_extraction", "pypdf_table_extraction Documentation", [author], 1)] # If true, show URL addresses after external links. # @@ -340,11 +322,11 @@ texinfo_documents = [ ( master_doc, - "Camelot", - "Camelot Documentation", + "pypdf_table_extraction", + "pypdf_table_extraction Documentation", author, - "Camelot", - "One line description of project.", + "pypdf_table_extraction", + "PDF Table Extraction for Humans.", "Miscellaneous", ), ] @@ -365,9 +347,3 @@ # # texinfo_no_detailmenu = False - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = { - "https://docs.python.org/2": None, - "http://pandas.pydata.org/pandas-docs/stable": None, -} diff --git a/docs/dev/codeofconduct.rst b/docs/dev/codeofconduct.rst new file mode 100644 index 00000000..6a817134 --- /dev/null +++ b/docs/dev/codeofconduct.rst @@ -0,0 +1,7 @@ +.. _code_of_conduct: + +Code Of Conduct +=============== + +.. 
include:: ../../CODE_OF_CONDUCT.md + :parser: myst_parser.sphinx_ diff --git a/docs/dev/contributing.rst b/docs/dev/contributing.rst index fa8778c4..3edcf00b 100644 --- a/docs/dev/contributing.rst +++ b/docs/dev/contributing.rst @@ -1,13 +1,18 @@ .. _contributing: + Contributor's Guide =================== -If you're reading this, you're probably looking to contributing to Camelot. *Time is the only real currency*, and the fact that you're considering spending some here is *very* generous of you. Thank you very much! +If you're reading this, you're probably looking to contribute to pypdf_table_extraction. *Time is the only real currency*, and the fact that you're considering spending some here is *very* generous of you. Thank you very much! -This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to `Vinayak Mehta`_, the author and maintainer. +This document will help you get started with contributing documentation, code, testing and filing issues. -.. _Vinayak Mehta: https://vinayak.io +- This Documentation +- `Source Code <https://github.com/py-pdf/pypdf_table_extraction>`__ +- `Issue + Tracker <https://github.com/py-pdf/pypdf_table_extraction/issues>`__ +- :doc:`Code Of Conduct <codeofconduct>`. Code Of Conduct --------------- @@ -20,28 +25,108 @@ Kenneth Reitz has also written an `essay`_ on this topic, which you should read. .. _essay: https://kennethreitz.org/essays/2013/01/27/be-cordial-or-be-on-your-way -As the `Requests Code Of Conduct`_ states, **all contributions are welcome**, as long as everyone involved is treated with respect. +For more info read our full :doc:`Code Of Conduct <codeofconduct>`. + +How to report a bug +------------------- + +Report bugs on the `Issue +Tracker <https://github.com/py-pdf/pypdf_table_extraction/issues>`__. + +When filing an issue, make sure to answer these questions: + +- What did you do? +- What did you expect to see? +- What did you see instead? +- A link to the PDF document that you were trying to extract tables from. +- The complete traceback. +- Which operating system and Python version are you using? 
+- Which version of this project are you using? +- Which version of the dependencies are you using? + +You can use the following code snippet to find this information:: + + import platform; print(platform.platform()) + import sys; print('Python', sys.version) + import numpy; print('NumPy', numpy.__version__) + import cv2; print('OpenCV', cv2.__version__) + import camelot; print('Camelot', camelot.__version__) + +- Steps to reproduce the bug, using code snippets. See `Creating and highlighting code blocks`_. + +.. _Creating and highlighting code blocks: https://help.github.com/articles/creating-and-highlighting-code-blocks/ -.. _Requests Code Of Conduct: http://docs.python-requests.org/en/master/dev/contributing/#be-cordial + +Questions +^^^^^^^^^ + +Please don't use GitHub issues for support questions. A better place for them would be `Stack Overflow`_. Make sure you tag them using the ``pypdf_table_extraction`` tag. + +.. _Stack Overflow: http://stackoverflow.com + + +How to request a feature +------------------------ + +Request features on the `Issue +Tracker `__. Your first contribution ----------------------- -A great way to start contributing to Camelot is to pick an issue tagged with the `help wanted`_ or the `good first issue`_ tags. If you're unable to find a good first issue, feel free to contact the maintainer. +A great way to start contributing to pypdf_table_extraction is to pick an issue tagged with the `help wanted`_ or the `good first issue`_ tags. If you're unable to find a good first issue, feel free to contact the maintainer. -.. _help wanted: https://github.com/camelot-dev/camelot/labels/help%20wanted -.. _good first issue: https://github.com/camelot-dev/camelot/labels/good%20first%20issue +.. _help wanted: https://github.com/py-pdf/pypdf_table_extraction/labels/help%20wanted +.. 
_good first issue: https://github.com/py-pdf/pypdf_table_extraction/labels/good%20first%20issue Setting up a development environment ------------------------------------ -To install the dependencies needed for development, you can use pip:: +You need Python 3.8+ and the following tools: + +- `Poetry <https://python-poetry.org/>`__ +- `Nox <https://nox.thea.codes/>`__ +- `nox-poetry <https://nox-poetry.readthedocs.io/>`__ + +Install the package with development requirements: + +.. code-block:: console + + $ poetry install + +You can now run an interactive Python session, or the command-line +interface: + +.. code-block:: console + + $ poetry run python + $ poetry run pypdf-table-extraction + +How to test the project +----------------------- + +Run the full test suite: - $ pip install "camelot-py[dev]" +.. code-block:: console -Alternatively, you can clone the project repository, and install using pip:: + $ nox + +List the available Nox sessions: + +.. code-block:: console + + $ nox --list-sessions + +You can also run a specific Nox session. For example, invoke the unit +test suite like this: + +.. code-block:: console + + $ nox --session=tests + +Unit tests are located in the *tests* directory, and are written using +the `pytest <https://pytest.readthedocs.io/>`__ testing framework. - $ pip install ".[dev]" Pull Requests ------------- @@ -49,34 +134,44 @@ Pull Requests Submit a pull request ^^^^^^^^^^^^^^^^^^^^^ -The preferred workflow for contributing to Camelot is to fork the `project repository`_ on GitHub, clone, develop on a branch and then finally submit a pull request. Here are the steps: +The preferred workflow for contributing to pypdf_table_extraction is to fork the `project repository`_ on GitHub, clone, develop on a branch and then finally submit a pull request. Here are the steps: + +.. _project repository: https://github.com/py-pdf/pypdf_table_extraction/ -.. _project repository: https://github.com/camelot-dev/camelot 1. Fork the project repository. Click on the ‘Fork’ button near the top of the page. This creates a copy of the code under your account on the GitHub. -2. 
Clone your fork of Camelot from your GitHub account:: +2. Clone your fork of pypdf_table_extraction from your GitHub account + +.. code-block:: console - $ git clone https://www.github.com/[username]/camelot + $ git clone https://www.github.com/[username]/pypdf_table_extraction -3. Create a branch to hold your changes:: +3. Create a branch to hold your changes + +.. code-block:: console $ git checkout -b my-feature -Always branch out from ``master`` to work on your contribution. It's good practice to never work on the ``master`` branch! +Always branch out from ``main`` to work on your contribution. It's good practice to never work on the ``main`` branch! .. note:: ``git stash`` is a great way to save the work that you haven't committed yet, to move between branches. -4. Work on your contribution. Add changed files using ``git add`` and then ``git commit`` them:: +4. Work on your contribution. Add changed files using ``git add`` and then ``git commit`` them + +.. code-block:: console $ git add modified_files $ git commit -5. Finally, push them to your GitHub fork:: +5. Finally, push them to your GitHub fork + +.. code-block:: console $ git push -u origin my-feature -Now it's time to go to the your fork of Camelot and create a pull request! You can `follow these instructions`_ to do the same. +Now it's time to go to your fork of pypdf_table_extraction and create a `pull +request <https://github.com/py-pdf/pypdf_table_extraction/pulls>`__! You can `follow these instructions`_ to do the same. .. _follow these instructions: https://help.github.com/articles/creating-a-pull-request-from-a-fork/ @@ -89,32 +184,36 @@ We recommend that your pull request complies with the following guidelines: .. _pep8: http://pep8.org -- In case your pull request contains function docstrings, make sure you follow the `numpydoc`_ format. All function docstrings in Camelot follow this format. Following the format will make sure that the API documentation is generated flawlessly. 
+ +- In case your pull request contains function docstrings, make sure you follow the `numpydoc`_ format. All function docstrings in pypdf_table_extraction follow this format. Following the format will make sure that the API documentation is generated flawlessly. .. _numpydoc: https://numpydoc.readthedocs.io/en/latest/format.html -- Make sure your commit messages follow `the seven rules of a great git commit message`_: - - Separate subject from body with a blank line - - Limit the subject line to 50 characters - - Capitalize the subject line - - Do not end the subject line with a period - - Use the imperative mood in the subject line - - Wrap the body at 72 characters - - Use the body to explain what and why vs. how -.. _the seven rules of a great git commit message: https://chris.beams.io/posts/git-commit/ -- Please prefix your title of your pull request with [MRG] (Ready for Merge), if the contribution is complete and ready for a detailed review. An incomplete pull request's title should be prefixed with [WIP] (to indicate a work in progress), and changed to [MRG] when it's complete. A good `task list`_ in the PR description will ensure that other people get a fair idea of what it proposes to do, which will also increase collaboration. +- Please create a draft pull request if it is a work in progress. An incomplete pull request's title could be prefixed with [WIP] (to indicate a work in progress). Change the status of your pull request if the contribution is complete and ready for a detailed review. A good `task list`_ in the PR description will ensure that other people get a fair idea of what it proposes to do, which will also increase collaboration. .. _task list: https://blog.github.com/2013-01-09-task-lists-in-gfm-issues-pulls-comments/ -- If contributing new functionality, make sure that you add a unit test for it, while making sure that all previous tests pass. Camelot uses `pytest`_ for testing. 
Tests can be run using: +- If contributing new functionality, make sure that you add a unit test for it, while making sure that all previous tests pass. + + +.. note:: It is recommended to open an issue before starting work on anything. This will allow a chance to talk it over with the contributors and validate your approach. + +To run linting and code formatting checks before committing your change, +you can install pre-commit as a Git hook by running the following +command: + +.. code-block:: console -.. _pytest: https://docs.pytest.org/en/latest/ + $ nox --session=pre-commit -- install -:: +Your pull request needs to meet the following guidelines for acceptance: - $ python setup.py test +- The Nox test suite must pass without errors and warnings. +- Include unit tests. This project maintains 100% code coverage. +- If your changes add functionality, update the documentation + accordingly. Writing Documentation --------------------- @@ -129,37 +228,28 @@ The function docstrings are written using the `numpydoc`_ extension for Sphinx. .. _Sphinx: http://www.sphinx-doc.org/en/master/ .. _numpydoc: https://numpydoc.readthedocs.io/en/latest/format.html -Filing Issues -------------- - -We use `GitHub issues`_ to keep track of all issues and pull requests. Before opening an issue (which asks a question or reports a bug), please use GitHub search to look for existing issues (both open and closed) that may be similar. - -.. _GitHub issues: https://github.com/camelot-dev/camelot/issues - -Questions -^^^^^^^^^ - -Please don't use GitHub issues for support questions. A better place for them would be `Stack Overflow`_. Make sure you tag them using the ``python-camelot`` tag. - -.. _Stack Overflow: http://stackoverflow.com -Bug Reports -^^^^^^^^^^^ +How to make a release +--------------------- -In bug reports, make sure you include: +.. 
note:: *You need to be a project maintainer to make a release.* -- Your operating system type and Python version number, along with the version numbers of NumPy, OpenCV and Camelot. You can use the following code snippet to find this information:: +Before making a release, go through the following checklist: - import platform; print(platform.platform()) - import sys; print('Python', sys.version) - import numpy; print('NumPy', numpy.__version__) - import cv2; print('OpenCV', cv2.__version__) - import camelot; print('Camelot', camelot.__version__) +- All pull requests for the release have been merged. +- The default branch passes all checks. -- The complete traceback. Just adding the exception message or a part of the traceback won't help us fix your issue sooner. +Releases are made by publishing a GitHub Release. +A draft release is being maintained based on merged pull requests. +To publish the release, follow these steps: -- Steps to reproduce the bug, using code snippets. See `Creating and highlighting code blocks`_. +1. Click **Edit** next to the draft release. +2. Enter a tag with the new version. +3. Enter the release title, also the new version. +4. Edit the release description, if required. +5. Click **Publish Release**. -.. _Creating and highlighting code blocks: https://help.github.com/articles/creating-and-highlighting-code-blocks/ +After publishing the release, the following automated steps are triggered: -- A link to the PDF document that you were trying to extract tables from, telling us what you expected the code to do and what actually happened. +- The Git tag is applied to the repository. +- `Read the Docs <https://readthedocs.org/>`__ builds a new stable version of the documentation. diff --git a/docs/index.rst b/docs/index.rst index 55d67a8f..0087c7bb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,46 +1,34 @@ -.. Camelot documentation master file, created by - sphinx-quickstart on Tue Jul 19 13:44:18 2016. +.. 
documentation master file, created by + sphinx-quickstart You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Camelot: PDF Table Extraction for Humans -======================================== +pypdf_table_extraction (Camelot): PDF Table Extraction for Humans +================================================================= -Release v\ |version|. (:ref:`Installation `) -.. image:: https://travis-ci.org/camelot-dev/camelot.svg?branch=master - :target: https://travis-ci.org/camelot-dev/camelot +Release v\ |version|. (:ref:`Installation `) -.. image:: https://readthedocs.org/projects/camelot-py/badge/?version=master - :target: https://camelot-py.readthedocs.io/en/master/ +.. image:: https://readthedocs.org/projects/pypdf-table-extraction/badge/?version=latest + :target: https://pypdf-table-extraction.readthedocs.io/ :alt: Documentation Status -.. image:: https://codecov.io/github/camelot-dev/camelot/badge.svg?branch=master&service=github - :target: https://codecov.io/github/camelot-dev/camelot?branch=master - -.. image:: https://img.shields.io/pypi/v/camelot-py.svg - :target: https://pypi.org/project/camelot-py/ - -.. image:: https://img.shields.io/pypi/l/camelot-py.svg - :target: https://pypi.org/project/camelot-py/ - -.. image:: https://img.shields.io/pypi/pyversions/camelot-py.svg - :target: https://pypi.org/project/camelot-py/ +.. image:: https://codecov.io/github/py-pdf/pypdf_table_extraction/badge.svg?branch=main&service=github + :target: https://codecov.io/github/py-pdf/pypdf_table_extraction/?branch=main -.. image:: https://badges.gitter.im/camelot-dev/Lobby.png - :target: https://gitter.im/camelot-dev/Lobby +.. image:: https://img.shields.io/pypi/v/pypdf-table-extraction.svg + :target: https://pypi.org/project/pypdf-table-extraction/ -.. image:: https://img.shields.io/badge/code%20style-black-000000.svg - :target: https://github.com/ambv/black +.. 
image:: https://img.shields.io/pypi/l/pypdf-table-extraction.svg + :target: https://pypi.org/project/pypdf-table-extraction/ -.. image:: https://img.shields.io/badge/continous%20quality-deepsource-lightgrey - :target: https://deepsource.io/gh/camelot-dev/camelot/?ref=repository-badge +.. image:: https://img.shields.io/pypi/pyversions/pypdf-table-extraction.svg + :target: https://pypi.org/project/pypdf-table-extraction/ -**Camelot** is a Python library that can help you extract tables from PDFs! -.. note:: You can also check out `Excalibur`_, the web interface to Camelot! +**pypdf_table_extraction** (formerly known as `Camelot`_) is a Python library that can help you extract tables from PDFs! -.. _Excalibur: https://github.com/camelot-dev/excalibur +.. _Camelot: https://github.com/camelot-dev/camelot ---- @@ -48,7 +36,7 @@ Release v\ |version|. (:ref:`Installation `) .. _here: _static/pdf/foo.pdf -:: +.. code-block:: pycon >>> import camelot >>> tables = camelot.read_pdf('foo.pdf') @@ -70,18 +58,18 @@ Release v\ |version|. (:ref:`Installation `) .. csv-table:: :file: _static/csv/foo.csv -Camelot also comes packaged with a :ref:`command-line interface `! +pypdf_table_extraction also comes packaged with a :ref:`command-line interface `! -.. note:: Camelot only works with text-based PDFs and not scanned documents. (As Tabula `explains`_, "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) +.. note:: pypdf_table_extraction only works with text-based PDFs and not scanned documents. (As Tabula `explains`_, "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) You can check out some frequently asked questions :ref:`here `. .. _explains: https://github.com/tabulapdf/tabula#why-tabula -Why Camelot? ------------- +Why pypdf_table_extraction? 
+--------------------------- -- **Configurability**: Camelot gives you control over the table extraction process with :ref:`tweakable settings `. +- **Configurability**: pypdf_table_extraction gives you control over the table extraction process with :ref:`tweakable settings `. - **Metrics**: You can discard bad tables based on metrics like accuracy and whitespace, without having to manually look at each table. - **Output**: Each table is extracted into a **pandas DataFrame**, which seamlessly integrates into `ETL and data analysis workflows`_. You can also export tables to multiple formats, which include CSV, JSON, Excel, HTML, Markdown, and Sqlite. @@ -91,17 +79,11 @@ See `comparison with similar libraries and tools`_. .. _comparison with similar libraries and tools: https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools -Support the development ------------------------ - -If Camelot has helped you, please consider supporting its development with a one-time or monthly donation `on OpenCollective`_! - -.. _on OpenCollective: https://opencollective.com/camelot The User Guide -------------- -This part of the documentation begins with some background information about why Camelot was created, takes you through some implementation details, and then focuses on step-by-step instructions for getting the most out of Camelot. +This part of the documentation begins with some background information about why pypdf_table_extraction was created, takes you through some implementation details, and then focuses on step-by-step instructions for getting the most out of pypdf_table_extraction. .. 
toctree:: :maxdepth: 2 @@ -115,6 +97,7 @@ This part of the documentation begins with some background information about why user/faq user/cli + The API Documentation/Guide --------------------------- @@ -134,3 +117,4 @@ If you want to contribute to the project, this part of the documentation is for :maxdepth: 2 dev/contributing + Changelog diff --git a/docs/requirements.txt b/docs/requirements.txt index 9017c882..ac7f9449 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,11 @@ -furo==2023.9.10 -sphinx==7.2.6 +sphinx==7.3.7 +sphinx-book-theme>=1.0.1 sphinx-click==5.0.1 -myst_parser==2.0.0 +myst_parser==4.0.0 ghostscript==0.7 opencv-python==4.8.1.78 matplotlib==3.8.0 +accessible-pygments==0.0.5 +pydata-sphinx-theme==0.15.4 +sphinx-copybutton==0.5.2 +sphinx-prompt==1.8.0 diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index b9a1ea19..d63af36e 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -13,30 +13,32 @@ To detect line segments, :ref:`Lattice ` needs the lines that make the .. figure:: ../_static/png/background_lines.png :scale: 50% :alt: A table with lines in background - :align: left + :align: center Source: `PDF <../_static/pdf/background_lines.pdf>`__ To process background lines, you can pass ``process_background=True``. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('background_lines.pdf', process_background=True) >>> tables[1].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -back background_lines.pdf .. csv-table:: :file: ../_static/csv/background_lines.csv + :class: full-width Visual debugging ---------------- -.. note:: Visual debugging using ``plot()`` requires `matplotlib `_ which is an optional dependency. You can install it using ``$ pip install camelot-py[plot]``. +.. note:: Visual debugging using ``plot()`` requires `matplotlib `_ which is an optional dependency. 
You can install it using ``$ pip install pypdf-table-extraction[plot]``. You can use the :class:`plot() ` method to generate a `matplotlib `_ plot of various elements that were detected on the PDF page while processing it. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters. @@ -53,7 +55,7 @@ You can specify the type of element you want to plot using the ``kind`` keyword Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('foo.pdf') >>> tables @@ -64,13 +66,14 @@ text Let's plot all the text present on the table's PDF page. -:: +.. code-block:: pycon >>> camelot.plot(tables[0], kind='text').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -plot text foo.pdf @@ -79,7 +82,7 @@ Let's plot all the text present on the table's PDF page. :width: 1366 :scale: 50% :alt: A plot of all text on a PDF page - :align: left + :align: center This, as we shall later see, is very helpful with :ref:`Stream ` for noting table areas and column separators, in case Stream does not guess them correctly. @@ -90,13 +93,14 @@ table Let's plot the table (to see if it was detected correctly or not). This plot type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.) -:: +.. code-block:: pycon >>> camelot.plot(tables[0], kind='grid').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -plot grid foo.pdf @@ -105,7 +109,7 @@ Let's plot the table (to see if it was detected correctly or not). 
This plot typ :width: 1366 :scale: 50% :alt: A plot of all tables on a PDF page - :align: left + :align: center The table is perfect! @@ -114,13 +118,14 @@ contour Now, let's plot all table boundaries present on the table's PDF page. -:: +.. code-block:: pycon >>> camelot.plot(tables[0], kind='contour').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -plot contour foo.pdf @@ -129,20 +134,21 @@ Now, let's plot all table boundaries present on the table's PDF page. :width: 1366 :scale: 50% :alt: A plot of all contours on a PDF page - :align: left + :align: center line ^^^^ Cool, let's plot all line segments present on the table's PDF page. -:: +.. code-block:: pycon >>> camelot.plot(tables[0], kind='line').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -plot line foo.pdf @@ -151,20 +157,21 @@ Cool, let's plot all line segments present on the table's PDF page. :width: 1366 :scale: 50% :alt: A plot of all lines on a PDF page - :align: left + :align: center joint ^^^^^ Finally, let's plot all line intersections present on the table's PDF page. -:: +.. code-block:: pycon >>> camelot.plot(tables[0], kind='joint').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -plot joint foo.pdf @@ -173,20 +180,21 @@ Finally, let's plot all line intersections present on the table's PDF page. :width: 1366 :scale: 50% :alt: A plot of all line intersections on a PDF page - :align: left + :align: center textedge ^^^^^^^^ -You can also visualize the textedges found on a page by specifying ``kind='textedge'``. To know more about what a "textedge" is, you can see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis `_. +You can also visualize the textedges found on a page by specifying ``kind='textedge'``. 
To know more about what a "textedge" is, you can see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis `_: -:: +.. code-block:: pycon >>> camelot.plot(tables[0], kind='textedge').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot stream -plot textedge foo.pdf @@ -195,46 +203,51 @@ You can also visualize the textedges found on a page by specifying ``kind='texte :width: 1366 :scale: 50% :alt: A plot of relevant textedges on a PDF page - :align: left + :align: center Specify table areas ------------------- In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table. -Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() `, using the ``table_areas`` keyword argument. +Table areas that you want pypdf_table_extraction to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() `, using the ``table_areas`` keyword argument. -:: +.. code-block:: pycon + :class: full-width >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337']) >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot stream -T 316,499,566,337 table_areas.pdf .. csv-table:: :file: ../_static/csv/table_areas.csv + :class: full-width + .. note:: ``table_areas`` accepts strings of the form x1,y1,x2,y2 where (x1, y1) -> top-left and (x2, y2) -> bottom-right in PDF coordinate space. In PDF coordinate space, the bottom-left corner of the page is the origin, with coordinates (0, 0). 
Specify table regions --------------------- -However there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] `__, where the table might not lie at the exact coordinates every time but in an approximate region. +However there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] `__, where the table might not lie at the exact coordinates every time but in an approximate region. -You can use the ``table_regions`` keyword argument to :meth:`read_pdf() ` to solve for such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables. +You can use the ``table_regions`` keyword argument to :meth:`read_pdf() ` to solve for such cases. When ``table_regions`` is specified, pypdf_table_extraction will only analyze the specified regions to look for tables. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270']) >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -R 170,370,560,270 table_regions.pdf @@ -244,7 +257,7 @@ You can use the ``table_regions`` keyword argument to :meth:`read_pdf() `__, where the text is very close to each other, it is possible that Camelot may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by plotting the text on the page. +In cases like `these <../_static/pdf/column_separators.pdf>`__, where the text is very close to each other, it is possible that pypdf_table_extraction may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by plotting the text on the page. You can pass the column separators as a list of comma-separated strings to :meth:`read_pdf() `, using the ``columns`` keyword argument. 
@@ -254,18 +267,21 @@ For example, if you have specified two table areas, ``table_areas=['12,54,43,23' Let's get back to the *x* coordinates we got from plotting the text that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out! -:: +.. code-block:: pycon + :class: full-width >>> tables = camelot.read_pdf('column_separators.pdf', flavor='stream', columns=['72,95,209,327,442,529,566,606,683']) >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot stream -C 72,95,209,327,442,529,566,606,683 column_separators.pdf .. csv-table:: + :class: full-width "...","...","...","...","...","...","...","...","...","..." "LICENSE","","","","PREMISE","","","","","" @@ -279,18 +295,21 @@ Split text along separators To deal with cases like the output from the previous section, you can pass ``split_text=True`` to :meth:`read_pdf() `, which will split any strings that lie in different cells but have been assigned to a single cell (as a result of being merged together by `PDFMiner `_). -:: +.. code-block:: pycon + :class: full-width >>> tables = camelot.read_pdf('column_separators.pdf', flavor='stream', columns=['72,95,209,327,442,529,566,606,683'], split_text=True) >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot -split stream -C 72,95,209,327,442,529,566,606,683 column_separators.pdf .. csv-table:: + :class: full-width "...","...","...","...","...","...","...","...","...","..." "LICENSE","","","","PREMISE","","","","","" @@ -304,7 +323,7 @@ There might be cases where you want to differentiate between the text and supers .. figure:: ../_static/png/superscript.png :alt: A PDF with superscripts - :align: left + :align: center In this case, the text that `other tools`_ return, will be ``24.912``. This is relatively harmless when that decimal point is involved. 
But when it isn't there, you'll be left wondering why the results of your data analysis are 10x bigger! @@ -312,14 +331,15 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc .. _other tools: https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('superscript.pdf', flavor='stream', flag_size=True) >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot -flag stream superscript.pdf @@ -334,16 +354,17 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc Strip characters from text -------------------------- -You can strip unwanted characters like spaces, dots and newlines from a string using the ``strip_text`` keyword argument. Take a look at `this PDF `_ as an example, the text at the start of each row contains a lot of unwanted spaces, dots and newlines. +You can strip unwanted characters like spaces, dots and newlines from a string using the ``strip_text`` keyword argument. Take a look at `this PDF `_ as an example, the text at the start of each row contains a lot of unwanted spaces, dots and newlines. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('12s0324.pdf', flavor='stream', strip_text=' .\n') >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot -strip ' .\n' stream 12s0324.pdf @@ -360,20 +381,21 @@ You can strip unwanted characters like spaces, dots and newlines from a string u Improve guessed table areas --------------------------- -While using :ref:`Stream `, automatic table detection can fail for PDFs like `this one `_. That's because the text is relatively far apart vertically, which can lead to shorter textedges being calculated. 
+While using :ref:`Stream `, automatic table detection can fail for PDFs like `this one `_. That's because the text is relatively far apart vertically, which can lead to shorter textedges being calculated. -.. note:: To know more about how textedges are calculated to guess table areas, you can see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis `_. +.. note:: To know more about how textedges are calculated to guess table areas, you can see pages 20, 35 and 40 of Anssi Nurminen's `master's thesis `_. Let's see the table area that is detected by default. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream') >>> camelot.plot(tables[0], kind='contour').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot stream -plot contour edge.pdf @@ -382,18 +404,19 @@ Let's see the table area that is detected by default. :width: 1366 :scale: 50% :alt: Table area with default edge_tol - :align: left + :align: center To improve the detected area, you can increase the ``edge_tol`` (default: 50) value to counter the effect of text being placed relatively far apart vertically. Larger ``edge_tol`` will lead to longer textedges being detected, leading to an improved guess of the table area. Let's use a value of 500. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream', edge_tol=500) >>> camelot.plot(tables[0], kind='contour').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot stream -e 500 -plot contour edge.pdf @@ -402,7 +425,7 @@ To improve the detected area, you can increase the ``edge_tol`` (default: 50) va :width: 1366 :scale: 50% :alt: Table area with default edge_tol - :align: left + :align: center As you can see, the guessed table area has improved! 
@@ -411,7 +434,7 @@ Improve guessed table rows You can pass ``row_tol=<+int>`` to group the rows closer together, as shown below. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream') >>> tables[0].df @@ -425,14 +448,15 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo "01","Aguascalientes","001","Aguascalientes","","0096","Agua Azul" "01","Aguascalientes","001","Aguascalientes","","0100","Rancho Alegre" -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_tol=10) >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot stream -r 10 group_rows.pdf @@ -457,39 +481,40 @@ Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating t .. figure:: ../_static/png/short_lines.png :alt: A PDF table with short lines - :align: left + :align: center Let's plot the table for this PDF. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('short_lines.pdf') >>> camelot.plot(tables[0], kind='grid').show() .. figure:: ../_static/png/short_lines_1.png :alt: A plot of the PDF table with short lines - :align: left + :align: center Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_scale=40``, and plot the table again. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40) >>> camelot.plot(tables[0], kind='grid').show() .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -scale 40 -plot grid short_lines.pdf .. figure:: ../_static/png/short_lines_2.png :alt: An improved plot of the PDF table with short lines - :align: left + :align: center -Voila! Camelot can now see those lines. Let's get our table. +Voila! pypdf_table_extraction can now see those lines. Let's get our table. -:: +.. 
code-block:: pycon >>> tables[0].df @@ -518,9 +543,9 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example .. figure:: ../_static/png/short_lines.png :alt: A PDF table with short lines - :align: left + :align: center -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['']) >>> tables[0].df @@ -541,14 +566,15 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example No surprises there — it did remain in place (observe the strings "2400" and "All the available individuals"). Let's pass ``shift_text=['r', 'b']`` to set the *gravity* to right-bottom and move the text in that direction. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b']) >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -scale 40 -shift r -shift b short_lines.pdf @@ -575,12 +601,14 @@ You can copy text in spanning cells when using :ref:`Lattice `, in eith Let's try it out on this `PDF <../_static/pdf/copy_text.pdf>`__. First, let's check out the output table to see if we need to use any other configuration parameters. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('copy_text.pdf') >>> tables[0].df .. csv-table:: + :class: full-width + "Sl. No.","Name of State/UT","Name of District","Disease/ Illness","No. of Cases","No. of Deaths","Date of start of outbreak","Date of reporting","Current Status","..." "1","Kerala","Kollam","i. Food Poisoning","19","0","31/12/13","03/01/14","Under control","..." @@ -592,18 +620,20 @@ Let's try it out on this `PDF <../_static/pdf/copy_text.pdf>`__. First, let's ch We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in the vertical direction. This can save you some time by not having to add this step in your cleaning script! -:: +.. 
code-block:: pycon >>> tables = camelot.read_pdf('copy_text.pdf', copy_text=['v']) >>> tables[0].df .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot lattice -copy v copy_text.pdf .. csv-table:: + :class: full-width "Sl. No.","Name of State/UT","Name of District","Disease/ Illness","No. of Cases","No. of Deaths","Date of start of outbreak","Date of reporting","Current Status","..." "1","Kerala","Kollam","i. Food Poisoning","19","0","31/12/13","03/01/14","Under control","..." @@ -616,11 +646,11 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in Tweak layout generation ----------------------- -Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 `_ and `#215 `_), PDFMiner can group characters that should belong to the same sentence into separate sentences. +pypdf_table_extraction is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 `_ and `#215 `_), PDFMiner can group characters that should belong to the same sentence into separate sentences. To deal with such cases, you can tweak PDFMiner's `LAParams kwargs `_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() `. To know more about the parameters you can tweak, you can check out `PDFMiner docs `_. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': False}) @@ -629,14 +659,18 @@ To deal with such cases, you can tweak PDFMiner's `LAParams kwargs ` flavor, Camelot uses ``ghostscript`` to convert PDF pages to images for line recognition. If you face installation issues with ``ghostscript``, you can use an alternate image conversion backend called ``poppler``. 
You can specify which image conversion backend you want to use with:: +When using the :ref:`Lattice ` flavor, pypdf_table_extraction uses ``ghostscript`` to convert PDF pages to images for line recognition. If you face installation issues with ``ghostscript``, you can use an alternate image conversion backend called ``poppler``. You can specify which image conversion backend you want to use with + +.. code-block:: pycon >>> tables = camelot.read_pdf(filename, backend="ghostscript") # default >>> tables = camelot.read_pdf(filename, backend="poppler") .. note:: ``ghostscript`` will be replaced by ``poppler`` as the default image conversion backend in ``v0.12.0``. -If you face issues with both ``ghostscript`` and ``poppler``, you can supply your own image conversion backend:: +If you face issues with both ``ghostscript`` and ``poppler``, you can supply your own image conversion backend + +.. code-block:: pycon >>> class ConversionBackend(object): >>> def convert(pdf_path, png_path): diff --git a/docs/user/cli.rst b/docs/user/cli.rst index 2dc65bd7..fa415971 100644 --- a/docs/user/cli.rst +++ b/docs/user/cli.rst @@ -3,36 +3,11 @@ Command-Line Interface ====================== -Camelot comes with a command-line interface. +pypdf_table_extraction comes with a command-line interface. -You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below. Furthermore, you can print the help for each command by typing ``camelot --help``. Try it out! +You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below. +Furthermore, you can print the help for each command by typing ``camelot --help``. Try it out! -:: - - Usage: camelot [OPTIONS] COMMAND [ARGS]... - - Camelot: PDF Table Extraction for Humans - - Options: - --version Show the version and exit. - -q, --quiet TEXT Suppress logs and warnings. - -p, --pages TEXT Comma-separated page numbers. 
Example: 1,3,4 - or 1,4-end. - -pw, --password TEXT Password for decryption. - -o, --output TEXT Output file path. - -f, --format [csv|json|excel|html] - Output file format. - -z, --zip Create ZIP archive. - -split, --split_text Split text that spans across multiple cells. - -flag, --flag_size Flag text based on font size. Useful to - detect super/subscripts. - -strip, --strip_text Characters that should be stripped from a - string before assigning it to a cell. - -M, --margins ... - PDFMiner char_margin, line_margin and - word_margin. - --help Show this message and exit. - - Commands: - lattice Use lines between text to parse the table. - stream Use spaces between text to parse the table. +.. click:: camelot.cli:cli + :prog: camelot + :nested: short diff --git a/docs/user/faq.rst b/docs/user/faq.rst index 71babbda..27127c11 100644 --- a/docs/user/faq.rst +++ b/docs/user/faq.rst @@ -3,12 +3,12 @@ Frequently Asked Questions ========================== -This part of the documentation answers some common questions. To add questions, please open an issue `here `_. +This part of the documentation answers some common questions. To add questions, please open an issue `here `_. -Does Camelot work with image-based PDFs? ----------------------------------------- +Does pypdf_table_extraction work with image-based PDFs? +------------------------------------------------------- -**No**, Camelot only works with text-based PDFs and not scanned documents. (As Tabula `explains `_, "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) +**No**, pypdf_table_extraction only works with text-based PDFs and not scanned documents. (As Tabula `explains `_, "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) How to reduce memory usage for long PDFs? 
----------------------------------------- @@ -19,7 +19,7 @@ A simple workaround is to divide the extraction into chunks, and save extracted For more details, check out this code snippet from `@anakin87 `_: -:: +.. code-block:: python import camelot @@ -58,7 +58,9 @@ For more details, check out this code snippet from `@anakin87 ` flavor, you can supply your own :ref:`image conversion backend ` by creating a class with a ``convert`` method as follows:: +When using the :ref:`Lattice ` flavor, you can supply your own :ref:`image conversion backend ` by creating a class with a ``convert`` method as follows: + +.. code-block:: python >>> class ConversionBackend(object): >>> def convert(pdf_path, png_path): diff --git a/docs/user/how-it-works.rst b/docs/user/how-it-works.rst index 27bd97f8..efaa4cd6 100644 --- a/docs/user/how-it-works.rst +++ b/docs/user/how-it-works.rst @@ -3,9 +3,9 @@ How It Works ============ -This part of the documentation includes a high-level explanation of how Camelot extracts tables from PDF files. +This part of the documentation includes a high-level explanation of how pypdf_table_extraction extracts tables from PDF files. -You can choose between two table parsing methods, *Stream* and *Lattice*. These names for parsing methods inside Camelot were inspired from `Tabula `_. +You can choose between two table parsing methods, *Stream* and *Lattice*. These names for parsing methods inside pypdf_table_extraction were inspired from `Tabula `_. .. _stream: @@ -43,7 +43,7 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step. :height: 674 :width: 1366 :scale: 50% - :align: left + :align: center 2. Line intersections are detected, by overlapping the detected line segments and "`and`_"ing their pixel intensities. @@ -53,7 +53,7 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step. :height: 674 :width: 1366 :scale: 50% - :align: left + :align: center 3. 
Table boundaries are computed by overlapping the detected line segments again, this time by "`or`_"ing their pixel intensities. @@ -63,7 +63,7 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step. :height: 674 :width: 1366 :scale: 50% - :align: left + :align: center 4. Since dimensions of the PDF page and its image vary, the detected table boundaries, line intersections, and line segments are scaled and translated to the PDF page's coordinate space, and a representation of the table is created. @@ -71,7 +71,7 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step. :height: 674 :width: 1366 :scale: 50% - :align: left + :align: center 5. Spanning cells are detected using the line segments and line intersections. @@ -79,6 +79,6 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step. :height: 674 :width: 1366 :scale: 50% - :align: left + :align: center 6. Finally, the words found on the page are assigned to the table's cells based on their *x* and *y* coordinates. diff --git a/docs/user/install-deps.rst b/docs/user/install-deps.rst index f4ab87d9..792857e7 100755 --- a/docs/user/install-deps.rst +++ b/docs/user/install-deps.rst @@ -10,16 +10,24 @@ OS-specific instructions Ubuntu ^^^^^^ -:: +.. code-block:: console $ apt install ghostscript python3-tk MacOS ^^^^^ -:: + +.. code-block:: console $ brew install ghostscript tcl-tk +.. note:: + You might encounter the problem that the ghostscript module cannot be found. This can be fixed with the following commands. + + ``mkdir -p ~/lib`` + + ``ln -s "$(brew --prefix gs)/lib/libgs.dylib" ~/lib`` + Windows ^^^^^^^ @@ -35,13 +43,17 @@ For Ghostscript Open the Python REPL and run the following: -For Ubuntu/MacOS:: +For Ubuntu/MacOS + +.. code-block:: pycon >>> from ctypes.util import find_library >>> find_library("gs") "libgs.so.9" -For Windows:: +For Windows + +.. 
code-block:: pycon >>> import ctypes >>> from ctypes.util import find_library @@ -55,7 +67,9 @@ If the output is empty, then it's possible that the Ghostscript library is not a For Tkinter ^^^^^^^^^^^ -Launch Python and then import Tkinter:: +Launch Python and then import Tkinter + +.. code-block:: pycon >>> import tkinter diff --git a/docs/user/install.rst b/docs/user/install.rst index 20208ce6..22a7eac8 100644 --- a/docs/user/install.rst +++ b/docs/user/install.rst @@ -1,40 +1,47 @@ .. _install: -Installation of Camelot -======================= +Installation +============= -This part of the documentation covers the steps to install Camelot. +This part of the documentation covers the steps to install pypdf_table_extraction. -After :ref:`installing the dependencies `, which include `Ghostscript `_ and `Tkinter `_, you can use one of the following methods to install Camelot: +After :ref:`installing the dependencies `, which include `Ghostscript `_ and `Tkinter `_, you can use one of the following methods to install pypdf_table_extraction: .. warning:: The ``lattice`` flavor will fail to run if Ghostscript is not installed. You may run into errors as shown in `issue #193 `_. pip --- -To install Camelot from PyPI using ``pip``, please include the extra ``cv`` requirement as shown:: +To install pypdf_table_extraction from PyPI using ``pip`` - $ pip install "camelot-py[base]" +.. code-block:: console + + $ pip install "pypdf-table-extraction[base]" conda ----- -`conda`_ is a package manager and environment management system for the `Anaconda `_ distribution. It can be used to install Camelot from the ``conda-forge`` channel:: - $ conda install -c conda-forge camelot-py +`conda`_ is a package manager and environment management system for the `Anaconda `_ distribution. It can be used to install pypdf_table_extraction from the ``conda-forge`` channel + +.. 
code-block:: console + + $ conda install -c conda-forge pypdf-table-extraction From the source code -------------------- -After :ref:`installing the dependencies `, you can install Camelot from source by: +After :ref:`installing the dependencies `, you can install pypdf_table_extraction from source by: 1. Cloning the GitHub repository. -:: - $ git clone https://www.github.com/camelot-dev/camelot +.. code-block:: console + + $ git clone https://github.com/py-pdf/pypdf_table_extraction.git 2. And then simply using pip again. -:: + +.. code-block:: console $ cd camelot $ pip install ".[base]" diff --git a/docs/user/intro.rst b/docs/user/intro.rst index bdd1b5a8..825e10e4 100644 --- a/docs/user/intro.rst +++ b/docs/user/intro.rst @@ -19,9 +19,9 @@ Why another PDF table extraction library? There are both open (`Tabula`_, `pdf-table-extract`_) and closed-source (`smallpdf`_, `PDFTables`_) tools that are widely used to extract tables from PDF files. They either give a nice output or fail miserably. There is no in between. This is not helpful since everything in the real world, including PDF table extraction, is fuzzy. This leads to the creation of ad-hoc table extraction scripts for each type of PDF table. -Camelot was created to offer users complete control over table extraction. If you can't get your desired output with the default settings, you can tweak them and get the job done! +pypdf_table_extraction (formerly `Camelot`_) was created to offer users complete control over table extraction. If you can't get your desired output with the default settings, you can tweak them and get the job done! -Here is a `comparison`_ of Camelot's output with outputs from other open-source PDF parsing libraries and tools. +Here is a `comparison`_ of pypdf_table_extraction's output with outputs from other open-source PDF parsing libraries and tools. .. _Tabula: http://tabula.technology/ .. 
_pdf-table-extract: https://github.com/ashima/pdf-table-extract @@ -32,15 +32,18 @@ Here is a `comparison`_ of Camelot's output with outputs from other open-source What's in a name? ----------------- -As you can already guess, this library is named after `The Camelot Project`_. +pypdf_table_extraction is the community-maintained fork of `Camelot`_. +As you can already guess, that library is named after `The Camelot Project`_. -Fun fact: In the British comedy film `Monty Python and the Holy Grail`_ (and in the `Arthurian legend`_ depicted in the film), "Camelot" is the name of the castle where Arthur leads his men, the Knights of the Round Table, and then sets off elsewhere after deciding that it is "a silly place". Interestingly, the language in which this library is written (Python) was named after Monty Python. +.. note:: + Fun fact: In the British comedy film `Monty Python and the Holy Grail`_ (and in the `Arthurian legend`_ depicted in the film), "Camelot" is the name of the castle where Arthur leads his men, the Knights of the Round Table, and then sets off elsewhere after deciding that it is "a silly place". Interestingly, the language in which this library is written (Python) was named after Monty Python. +.. _Camelot: https://github.com/camelot-dev/camelot .. _The Camelot Project: https://web.archive.org/web/20210203041543/http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf .. _Monty Python and the Holy Grail: https://en.wikipedia.org/wiki/Monty_Python_and_the_Holy_Grail .. _Arthurian legend: https://en.wikipedia.org/wiki/King_Arthur -Camelot License ---------------- +pypdf_table_extraction License +------------------------------ .. include:: ../../LICENSE diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst index c3cff640..3f4c52d4 100644 --- a/docs/user/quickstart.rst +++ b/docs/user/quickstart.rst @@ -3,14 +3,16 @@ Quickstart ========== -In a hurry to extract tables from PDFs?
This document gives a good introduction to help you get started with Camelot. +In a hurry to extract tables from PDFs? This document gives a good introduction to help you get started with pypdf_table_extraction. Read the PDF ------------ -Reading a PDF to extract tables with Camelot is very simple. +Reading a PDF to extract tables with pypdf_table_extraction is very simple. -Begin by importing the Camelot module:: +Begin by importing the Camelot module + +.. code-block:: pycon >>> import camelot @@ -20,7 +22,7 @@ Now, let's try to read a PDF. (You can check out the PDF used in this example `h .. _here: ../_static/pdf/foo.pdf -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('foo.pdf') >>> tables @@ -30,14 +32,14 @@ Now, we have a :class:`TableList ` object called ``table We can access each table using its index. From the code snippet above, we can see that the ``tables`` object has only one table, since ``n=1``. Let's access the table using the index ``0`` and take a look at its ``shape``. -:: +.. code-block:: pycon >>> tables[0] Let's print the parsing report. -:: +.. code-block:: pycon >>> print tables[0].parsing_report { @@ -49,7 +51,7 @@ Let's print the parsing report. Woah! The accuracy is top-notch and there is less whitespace, which means the table was most likely extracted correctly. You can access the table as a pandas DataFrame by using the :class:`table ` object's ``df`` property. -:: +.. code-block:: pycon >>> tables[0].df @@ -58,7 +60,7 @@ Woah! The accuracy is top-notch and there is less whitespace, which means the ta Looks good! You can now export the table as a CSV file using its :meth:`to_csv() ` method. Alternatively you can use :meth:`to_json() `, :meth:`to_excel() ` :meth:`to_html() ` :meth:`to_markdown() ` or :meth:`to_sqlite() ` methods to export the table as JSON, Excel, HTML files or a sqlite database respectively. -:: +.. 
code-block:: pycon >>> tables[0].to_csv('foo.csv') @@ -66,13 +68,14 @@ This will export the table as a CSV file at the path specified. In this case, it You can also export all tables at once, using the :class:`tables ` object's :meth:`export() ` method. -:: +.. code-block:: pycon >>> tables.export('foo.csv', f='csv') .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot --format csv --output foo.csv lattice foo.pdf @@ -80,20 +83,21 @@ This will export all tables as CSV files at the path specified. Alternatively, y .. note:: The :meth:`export() ` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files. -.. note:: Camelot handles rotated PDF pages automatically. As an exercise, try to extract the table out of `this PDF`_. +.. note:: pypdf_table_extraction handles rotated PDF pages automatically. As an exercise, try to extract the table out of `this PDF`_. .. _this PDF: ../_static/pdf/rotated.pdf Specify page numbers -------------------- -By default, Camelot only uses the first page of the PDF to extract tables. To specify multiple pages, you can use the ``pages`` keyword argument:: +By default, pypdf_table_extraction only uses the first page of the PDF to extract tables. To specify multiple pages, you can use the ``pages`` keyword argument:: >>> camelot.read_pdf('your.pdf', pages='1,2,3') .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. 
code-block:: console $ camelot --pages 1,2,3 lattice your.pdf @@ -102,9 +106,9 @@ The ``pages`` keyword argument accepts pages as comma-separated string of page n Extract tables in parallel -------------------------- -Camelot supports extracting tables in parrallel using all the available CPU cores. +pypdf_table_extraction supports extracting tables in parallel using all the available CPU cores. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('foo.pdf', page='all', parallel=True) >>> tables @@ -112,7 +116,8 @@ Camelot supports extracting tables in parrallel using all the available CPU core .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot --pages all --parallel lattice foo.pdf @@ -124,7 +129,7 @@ Reading encrypted PDFs To extract tables from encrypted PDF files you must provide a password when calling :meth:`read_pdf() `. -:: +.. code-block:: pycon >>> tables = camelot.read_pdf('foo.pdf', password='userpass') >>> tables @@ -132,15 +137,16 @@ To extract tables from encrypted PDF files you must provide a password when call .. tip:: Here's how you can do the same with the :ref:`command-line interface `. - :: + + .. code-block:: console $ camelot --password userpass lattice foo.pdf -Camelot supports PDFs with all encryption types supported by `pypdf`_. This might require installing PyCryptodome. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm. +pypdf_table_extraction supports PDFs with all encryption types supported by `pypdf`_. This might require installing PyCryptodome. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm.
Further encryption support may be added in future, however in the meantime if your PDF files are using unsupported encryption algorithms you are advised to remove encryption before calling :meth:`read_pdf() `. This can been successfully achieved with third-party tools such as `QPDF`_. -:: +.. code-block:: console $ qpdf --password= --decrypt input.pdf output.pdf diff --git a/noxfile.py b/noxfile.py index 4360cd60..2fc6422b 100644 --- a/noxfile.py +++ b/noxfile.py @@ -213,7 +213,7 @@ def xdoctest(session: Session) -> None: session.run("python", "-m", "xdoctest", *args) -@session(name="docs-build", python=python_versions[2]) +@session(name="docs-build", python=python_versions[0]) def docs_build(session: Session) -> None: """Build the documentation.""" args = session.posargs or ["docs", "docs/_build"] @@ -222,7 +222,7 @@ def docs_build(session: Session) -> None: session.install(".") session.install( - "sphinx", "sphinx-click", "furo", "myst-parser", *base_requires, *plot_requires + "sphinx", "sphinx-click", "sphinx-book-theme", "myst-parser", "sphinx-copybutton", "sphinx-prompt", *base_requires, *plot_requires ) build_dir = Path("docs", "_build") @@ -241,8 +241,10 @@ def docs(session: Session) -> None: "sphinx", "sphinx-autobuild", "sphinx-click", - "furo", + "sphinx-book-theme", "myst-parser", + "sphinx-copybutton", + "sphinx-prompt", *base_requires, *plot_requires, ) diff --git a/poetry.lock b/poetry.lock index 8a7f1e78..99da244f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,19 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+[[package]] +name = "accessible-pygments" +version = "0.0.4" +description = "A collection of accessible pygments styles" +optional = false +python-versions = "*" +files = [ + {file = "accessible-pygments-0.0.4.tar.gz", hash = "sha256:e7b57a9b15958e9601c7e9eb07a440c813283545a20973f2574a5f453d0e953e"}, + {file = "accessible_pygments-0.0.4-py2.py3-none-any.whl", hash = "sha256:416c6d8c1ea1c5ad8701903a20fcedf953c6e720d64f33dc47bfb2d3f2fa4e8d"}, +] + +[package.dependencies] +pygments = ">=1.5" + [[package]] name = "alabaster" version = "0.7.13" @@ -699,13 +713,13 @@ files = [ [[package]] name = "docutils" -version = "0.20.1" +version = "0.19" description = "Docutils -- Python Documentation Utilities" optional = false python-versions = ">=3.7" files = [ - {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"}, - {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, + {file = "docutils-0.19-py3-none-any.whl", hash = "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc"}, + {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"}, ] [[package]] @@ -903,23 +917,6 @@ ufo = ["fs (>=2.2.0,<3)"] unicode = ["unicodedata2 (>=15.1.0)"] woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] -[[package]] -name = "furo" -version = "2024.8.6" -description = "A clean customisable Sphinx documentation theme." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "furo-2024.8.6-py3-none-any.whl", hash = "sha256:6cd97c58b47813d3619e63e9081169880fbe331f0ca883c871ff1f3f11814f5c"}, - {file = "furo-2024.8.6.tar.gz", hash = "sha256:b63e4cee8abfc3136d3bc03a3d45a76a850bada4d6374d24c1716b0e01394a01"}, -] - -[package.dependencies] -beautifulsoup4 = "*" -pygments = ">=2.7" -sphinx = ">=6.0,<9.0" -sphinx-basic-ng = ">=1.0.0.beta2" - [[package]] name = "ghostscript" version = "0.7" @@ -1567,11 +1564,11 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.21.0", markers = "python_version <= \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\" and python_version >= \"3.8\""}, - {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.17.3", markers = "(platform_system != \"Darwin\" and platform_system != \"Linux\") and python_version >= \"3.8\" and python_version < \"3.9\" or platform_system != \"Darwin\" and python_version >= \"3.8\" and python_version < \"3.9\" and platform_machine != \"aarch64\" or platform_machine != \"arm64\" and python_version >= \"3.8\" and python_version < \"3.9\" and platform_system != \"Linux\" or (platform_machine != \"arm64\" and platform_machine != \"aarch64\") and python_version >= \"3.8\" and python_version < \"3.9\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < 
\"3.11\""}, + {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.17.3", markers = "(platform_system != \"Darwin\" and platform_system != \"Linux\") and python_version >= \"3.8\" and python_version < \"3.9\" or platform_system != \"Darwin\" and python_version >= \"3.8\" and python_version < \"3.9\" and platform_machine != \"aarch64\" or platform_machine != \"arm64\" and python_version >= \"3.8\" and python_version < \"3.9\" and platform_system != \"Linux\" or (platform_machine != \"arm64\" and platform_machine != \"aarch64\") and python_version >= \"3.8\" and python_version < \"3.9\""}, ] [[package]] @@ -2038,6 +2035,33 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +[[package]] +name = "pydata-sphinx-theme" +version = "0.14.4" +description = "Bootstrap-based Sphinx theme from the PyData community" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydata_sphinx_theme-0.14.4-py3-none-any.whl", hash = "sha256:ac15201f4c2e2e7042b0cad8b30251433c1f92be762ddcefdb4ae68811d918d9"}, + {file = "pydata_sphinx_theme-0.14.4.tar.gz", hash = "sha256:f5d7a2cb7a98e35b9b49d3b02cec373ad28958c2ed5c9b1ffe6aff6c56e9de5b"}, +] + +[package.dependencies] +accessible-pygments = "*" +Babel = "*" +beautifulsoup4 = "*" +docutils = "!=0.17.0" +packaging = "*" +pygments = ">=2.7" +sphinx = ">=5.0" +typing-extensions = "*" + +[package.extras] +a11y = ["pytest-playwright"] +dev = ["nox", "pre-commit", "pydata-sphinx-theme[doc,test]", "pyyaml"] +doc = ["ablog (>=0.11.0rc2)", "colorama", "ipykernel", "ipyleaflet", "jupyter_sphinx", "jupyterlite-sphinx", "linkify-it-py", 
"matplotlib", "myst-parser", "nbsphinx", "numpy", "numpydoc", "pandas", "plotly", "rich", "sphinx-autoapi (>=3.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-favicon (>=1.0.1)", "sphinx-sitemap", "sphinx-togglebutton", "sphinxcontrib-youtube (<1.4)", "sphinxext-rediraffe", "xarray"] +test = ["pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "pyflakes" version = "2.5.0" @@ -2488,20 +2512,20 @@ files = [ [[package]] name = "sphinx" -version = "7.1.2" +version = "6.2.1" description = "Python documentation generator" optional = false python-versions = ">=3.8" files = [ - {file = "sphinx-7.1.2-py3-none-any.whl", hash = "sha256:d170a81825b2fcacb6dfd5a0d7f578a053e45d3f2b153fecc948c37344eb4cbe"}, - {file = "sphinx-7.1.2.tar.gz", hash = "sha256:780f4d32f1d7d1126576e0e5ecc19dc32ab76cd24e950228dcf7b1f6d3d9e22f"}, + {file = "Sphinx-6.2.1.tar.gz", hash = "sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b"}, + {file = "sphinx-6.2.1-py3-none-any.whl", hash = "sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912"}, ] [package.dependencies] alabaster = ">=0.7,<0.8" babel = ">=2.9" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} -docutils = ">=0.18.1,<0.21" +docutils = ">=0.18.1,<0.20" imagesize = ">=1.3" importlib-metadata = {version = ">=4.8", markers = "python_version < \"3.10\""} Jinja2 = ">=3.0" @@ -2541,21 +2565,24 @@ sphinx = "*" test = ["pytest", "pytest-cov"] [[package]] -name = "sphinx-basic-ng" -version = "1.0.0b2" -description = "A modern skeleton for Sphinx themes." 
+name = "sphinx-book-theme" +version = "1.0.1" +description = "A clean book theme for scientific explanations and documentation with Sphinx" optional = false python-versions = ">=3.7" files = [ - {file = "sphinx_basic_ng-1.0.0b2-py3-none-any.whl", hash = "sha256:eb09aedbabfb650607e9b4b68c9d240b90b1e1be221d6ad71d61c52e29f7932b"}, - {file = "sphinx_basic_ng-1.0.0b2.tar.gz", hash = "sha256:9ec55a47c90c8c002b5960c57492ec3021f5193cb26cebc2dc4ea226848651c9"}, + {file = "sphinx_book_theme-1.0.1-py3-none-any.whl", hash = "sha256:d15f8248b3718a9a6be0ba617a32d1591f9fa39c614469bface777ba06a73b75"}, + {file = "sphinx_book_theme-1.0.1.tar.gz", hash = "sha256:927b399a6906be067e49c11ef1a87472f1b1964075c9eea30fb82c64b20aedee"}, ] [package.dependencies] -sphinx = ">=4.0" +pydata-sphinx-theme = ">=0.13.3" +sphinx = ">=4,<7" [package.extras] -docs = ["furo", "ipython", "myst-parser", "sphinx-copybutton", "sphinx-inline-tabs"] +code-style = ["pre-commit"] +doc = ["ablog", "docutils (==0.17.1)", "folium", "ipywidgets", "matplotlib", "myst-nb", "nbclient", "numpy", "numpydoc", "pandas", "plotly", "sphinx-copybutton", "sphinx-design", "sphinx-examples", "sphinx-tabs (<=3.4.0)", "sphinx-thebe", "sphinx-togglebutton", "sphinxcontrib-bibtex", "sphinxcontrib-youtube", "sphinxext-opengraph"] +test = ["beautifulsoup4", "coverage", "myst-nb", "pytest", "pytest-cov", "pytest-regressions", "sphinx_thebe"] [[package]] name = "sphinx-click" @@ -2573,6 +2600,38 @@ click = ">=8.0" docutils = "*" sphinx = ">=4.0" +[[package]] +name = "sphinx-copybutton" +version = "0.5.2" +description = "Add a copy button to each of your code cells." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "sphinx-copybutton-0.5.2.tar.gz", hash = "sha256:4cf17c82fb9646d1bc9ca92ac280813a3b605d8c421225fd9913154103ee1fbd"}, + {file = "sphinx_copybutton-0.5.2-py3-none-any.whl", hash = "sha256:fb543fd386d917746c9a2c50360c7905b605726b9355cd26e9974857afeae06e"}, +] + +[package.dependencies] +sphinx = ">=1.8" + +[package.extras] +code-style = ["pre-commit (==2.12.1)"] +rtd = ["ipython", "myst-nb", "sphinx", "sphinx-book-theme", "sphinx-examples"] + +[[package]] +name = "sphinx-prompt" +version = "1.5.0" +description = "Sphinx directive to add unselectable prompt" +optional = false +python-versions = "*" +files = [ + {file = "sphinx_prompt-1.5.0-py3-none-any.whl", hash = "sha256:fa4e90d8088b5a996c76087d701fc7e31175f8b9dc4aab03a507e45051067162"}, +] + +[package.dependencies] +pygments = "*" +Sphinx = "*" + [[package]] name = "sphinxcontrib-applehelp" version = "1.0.4" @@ -2865,4 +2924,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "b6550fa6eb18972e1601fed58a52a45838ac2e66c577337f81588deb0b1a173d" +content-hash = "0be8560c848acfa22b10ac0a4b84b5418ca2be0697ec6d6a82f7ce64fc821a8b" diff --git a/pyproject.toml b/pyproject.toml index 7a3e8813..7c743ddc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ flake8 = ">=4.0.1" flake8-bandit = ">=2.1.2" flake8-bugbear = ">=21.9.2" flake8-rst-docstrings = ">=0.2.5" -furo = ">=2021.11.12" +sphinx-book-theme = ">=1.0.1" isort = ">=5.10.1" mypy = ">=0.930" pep8-naming = ">=0.12.1" @@ -50,9 +50,11 @@ safety = ">=2.2.3" sphinx = ">=4.3.2" sphinx-autobuild = ">=2021.3.14" sphinx-click = ">=3.0.2" +sphinx-copybutton = ">=0.5.2" +sphinx-prompt = ">=1.5.0" typeguard = ">=2.13.3" xdoctest = {extras = ["colors"], version = ">=0.15.10"} -myst-parser = {version = ">=0.16.1"} +myst-parser = {version = ">=2.0.0"} [tool.poetry.scripts] camelot = 
"camelot.__main__:main"