diff --git a/camelot/handlers.py b/camelot/handlers.py
index 74ddde7a..d5c402d9 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -36,7 +36,7 @@ class PDFHandler:
 
     """
 
-    def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
+    def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None, multi=None):
         if is_url(filepath):
             filepath = download_url(filepath)
         self.filepath: Union[StrByteType, Path] = filepath
@@ -51,6 +51,27 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
             if sys.version_info[0] < 3:
                 self.password = self.password.encode("ascii")
         self.pages = self._get_pages(pages)
+        # Page keys are normalised to str (and the mapping copied) so that int
+        # keys work and the caller's dict is never aliased.
+        self.multi = {str(page): params for page, params in (multi or {}).items()}
+
+    def _get_page_parser(self, page, flavor, parser, kwargs):
+        """Return the parser to use for a single page.
+
+        Pages without an entry in ``self.multi`` share ``parser``. Pages with
+        one get a dedicated parser built from ``kwargs`` overlaid with that
+        page's overrides.
+
+        """
+        overrides = self.multi.get(str(page))
+        if not overrides:
+            return parser
+        # Merge into a new dict: updating ``kwargs`` in place would leak one
+        # page's overrides into every later page listed in ``multi``.
+        page_kwargs = {**kwargs, **overrides}
+        if flavor == "lattice":
+            return Lattice(**page_kwargs)
+        return Stream(**page_kwargs)
 
     def _get_pages(self, pages):
         """Converts pages string to list of ints.
@@ -188,8 +209,10 @@ def parse(
                 with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
                     jobs = []
                     for p in self.pages:
+                        page_parser = self._get_page_parser(p, flavor, parser, kwargs)
                         j = pool.apply_async(
-                            self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                            self._parse_page,
+                            (p, tempdir, page_parser, suppress_stdout, layout_kwargs),
                         )
                         jobs.append(j)
 
@@ -198,7 +221,8 @@ def parse(
                         tables.extend(t)
             else:
                 for p in self.pages:
-                    t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                    page_parser = self._get_page_parser(p, flavor, parser, kwargs)
+                    t = self._parse_page(p, tempdir, page_parser, suppress_stdout, layout_kwargs)
                     tables.extend(t)
 
         return TableList(sorted(tables))
diff --git a/camelot/io.py b/camelot/io.py
index 12718828..ad71f41f 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -17,6 +17,7 @@ def read_pdf(
     suppress_stdout=False,
     parallel=False,
     layout_kwargs=None,
+    multi=None,
     **kwargs
 ):
     """Read PDF and return extracted tables.
@@ -46,3 +47,7 @@ def read_pdf(
+    multi : dict, optional (default: None)
+        Per-page parser parameters, keyed by page number, e.g.
+        ``{"2": {"table_regions": ["120,210,400,90"]}}``. Parameters given
+        for a page override the matching ``kwargs`` for that page only.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@@ -120,7 +125,7 @@ def read_pdf(
             warnings.simplefilter("ignore")
 
         validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, multi=multi)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,
diff --git a/tests/data.py b/tests/data.py
index 2309ab78..910feccf 100644
--- a/tests/data.py
+++ b/tests/data.py
@@ -3838,3 +3838,20 @@
         "vivek.garg@incablenet.net",
     ],
 ]
+
+data_multi_params1 = [
+    ["Number of Coils", "Number of Paperclips"],
+    ["5", "3, 5, 4"],
+    ["10", "7, 8, 6"],
+    ["15", "11, 10, 12"],
+    ["20", "15, 13, 14"],
+]
+
+data_multi_params2 = [
+    ["Time (drops of water)", "Distance (cm)"],
+    ["1", "10,11,9"],
+    ["2", "29, 31, 30"],
+    ["3", "59, 58, 61"],
+    ["4", "102, 100, 98"],
+    ["5", "122, 125, 127"],
+]
diff --git a/tests/files/multi_params.pdf b/tests/files/multi_params.pdf
new file mode 100644
index 00000000..0fe11133
Binary files /dev/null and b/tests/files/multi_params.pdf differ
diff --git a/tests/test_common.py b/tests/test_common.py
index 0563a3da..5072e8f8 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -214,3 +214,20 @@ def _make_table(page, order):
     assert iterator_b is not None
     item_c = next(iterator_b)
     assert item_c is not None
+
+
+@skip_on_windows
+def test_multi_params(testdir):
+    df1 = pd.DataFrame(data_multi_params1)
+    df2 = pd.DataFrame(data_multi_params2)
+
+    filename = os.path.join(testdir, "multi_params.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        pages="all",
+        multi={"2": {"table_regions": ["120, 210, 400, 90"]}},
+        split_text=True,
+    )
+
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)