From e473ec7566db0ce7286c86b7ebd6b56edc6e3650 Mon Sep 17 00:00:00 2001 From: hpierre001 <58784889+hpierre001@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:27:42 +0200 Subject: [PATCH 1/2] ENH: Add ``layout_mode_font_height_weight`` argument to ``PageObject.extract_text()`` --- pypdf/_page.py | 8 +++++++- pypdf/_text_extraction/_layout_mode/_fixed_width_page.py | 5 +++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index a6f8abaeb..3c794d7a0 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -2210,6 +2210,7 @@ def _layout_mode_text( scale_weight: float = 1.25, strip_rotated: bool = True, debug_path: Optional[Path] = None, + font_height_weight: float = 1, ) -> str: """ Get text preserving fidelity to source PDF text layout. @@ -2229,6 +2230,8 @@ def _layout_mode_text( - bts.json: text render ops left justified and grouped by BT/ET operators - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines) Defaults to None. + font_height_weight: multiplier for font height when calculating + blank lines. Defaults to 1. Returns: str: multiline string containing page text in a fixed width format that @@ -2260,7 +2263,7 @@ def _layout_mode_text( char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight) - return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically) + return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight) def extract_text( self, @@ -2335,6 +2338,8 @@ def extract_text( - tjs.json: individual text render ops with corresponding transform matrices - bts.json: text render ops left justified and grouped by BT/ET operators - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines) + layout_mode_font_height_weight (float): multiplier for font height when calculating + blank lines. Defaults to 1. 
Returns: The extracted text @@ -2358,6 +2363,7 @@ def extract_text( scale_weight=kwargs.get("layout_mode_scale_weight", 1.25), strip_rotated=kwargs.get("layout_mode_strip_rotated", True), debug_path=kwargs.get("layout_mode_debug_path"), + font_height_weight=kwargs.get("layout_mode_font_height_weight", 1) ) if len(args) >= 1: if isinstance(args[0], str): diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py index 80dc2ed26..9c516d1ea 100644 --- a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py +++ b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py @@ -343,7 +343,7 @@ def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> fl def fixed_width_page( - ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool + ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float ) -> str: """ Generate page text from text operations grouped by rendered y coordinate. @@ -352,6 +352,7 @@ def fixed_width_page( ty_groups: dict of text show ops as returned by y_coordinate_groups() char_width: fixed character width space_vertically: include blank lines inferred from y distance + font height. + font_height_weight: multiplier for font height when calculating blank lines. 
Returns: str: page text in a fixed width format that closely adheres to the rendered @@ -363,7 +364,7 @@ def fixed_width_page( for y_coord, line_data in ty_groups.items(): if space_vertically and lines: blank_lines = ( - int(abs(y_coord - last_y_coord) / line_data[0]["font_height"]) - 1 + int(abs(y_coord - last_y_coord) / (line_data[0]["font_height"] * font_height_weight)) - 1 ) lines.extend([""] * blank_lines) line = "" From df762115406a867b94e6beca4ff5f2e6fa9363a5 Mon Sep 17 00:00:00 2001 From: hpierre001 <58784889+hpierre001@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:57:22 +0200 Subject: [PATCH 2/2] TST: Add test for ``layout_mode_font_height_weight`` of ``PageObject.extract_text()`` --- resources/crazyones_layout_vertical_space.txt | 19 +++++++++ ...yout_vertical_space_font_height_weight.txt | 25 +++++++++++ tests/test_text_extraction.py | 41 +++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 resources/crazyones_layout_vertical_space.txt create mode 100644 resources/crazyones_layout_vertical_space_font_height_weight.txt diff --git a/resources/crazyones_layout_vertical_space.txt b/resources/crazyones_layout_vertical_space.txt new file mode 100644 index 000000000..b745f6f63 --- /dev/null +++ b/resources/crazyones_layout_vertical_space.txt @@ -0,0 +1,19 @@ +The Crazy Ones +October 14, 1998 + + Heres to the crazy ones. The misfits. The rebels. The troublemakers. + The round pegs in the square holes. + The ones who see things differently. Theyre not fond of rules. And + they have no respect for the status quo. You can quote them, + disagree with them, glorify or vilify them. + About the only thing you cant do is ignore them. Because they change + things. They invent. They imagine. They heal. They explore. They + create. They inspire. They push the human race forward. + Maybe they have to be crazy. + How else can you stare at an empty canvas and see a work of art? Or + sit in silence and hear a song thats never been written? 
Or gaze at + a red planet and see a laboratory on wheels? + We make tools for these kinds of people. + While some see them as the crazy ones, we see genius. Because the + people who are crazy enough to think they can change the world, + are the ones who do. \ No newline at end of file diff --git a/resources/crazyones_layout_vertical_space_font_height_weight.txt b/resources/crazyones_layout_vertical_space_font_height_weight.txt new file mode 100644 index 000000000..e90fe87e9 --- /dev/null +++ b/resources/crazyones_layout_vertical_space_font_height_weight.txt @@ -0,0 +1,25 @@ +The Crazy Ones +October 14, 1998 + + Heres to the crazy ones. The misfits. The rebels. The troublemakers. + The round pegs in the square holes. + + The ones who see things differently. Theyre not fond of rules. And + they have no respect for the status quo. You can quote them, + disagree with them, glorify or vilify them. + + About the only thing you cant do is ignore them. Because they change + things. They invent. They imagine. They heal. They explore. They + create. They inspire. They push the human race forward. + + Maybe they have to be crazy. + + How else can you stare at an empty canvas and see a work of art? Or + sit in silence and hear a song thats never been written? Or gaze at + a red planet and see a laboratory on wheels? + + We make tools for these kinds of people. + + While some see them as the crazy ones, we see genius. Because the + people who are crazy enough to think they can change the world, + are the ones who do. 
\ No newline at end of file
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index c04d4ceb0..a92525e84 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -219,3 +219,44 @@ def test_text_leading_height_unit():
     page = reader.pages[0]
     extracted = page.extract_text()
     assert "Something[cited]\n" in extracted
+
+
+def test_layout_mode_space_vertically_font_height_weight():
+    """Tests layout mode with vertical space and font height weight (issue #2915)"""
+    with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as inputfile:
+        # Load the first page of the sample PDF
+        reader = PdfReader(inputfile)
+        page = reader.pages[0]
+
+        # Normal behaviour
+        with open(RESOURCE_ROOT / "crazyones_layout_vertical_space.txt", "rb") as pdftext_file:
+            pdftext = pdftext_file.read()
+
+        text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=True).encode("utf-8")
+
+        # Compare line by line first so that a mismatch points at the offending line
+        for actual_line, expected_line in zip(text.splitlines(), pdftext.splitlines()):
+            assert actual_line == expected_line
+
+        pdftext = pdftext.replace(b"\r\n", b"\n")  # fix for windows
+        assert text == pdftext, (
+            "PDF extracted text differs from expected value.\n\n"
+            "Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (pdftext, text)
+        )
+
+        # A weight below 1 shrinks the line height used for spacing, so paragraph gaps become blank lines
+        with open(RESOURCE_ROOT / "crazyones_layout_vertical_space_font_height_weight.txt", "rb") as pdftext_file:
+            pdftext = pdftext_file.read()
+
+        text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=True,
+                                 layout_mode_font_height_weight=0.85).encode("utf-8")
+
+        # Compare line by line first so that a mismatch points at the offending line
+        for actual_line, expected_line in zip(text.splitlines(), pdftext.splitlines()):
+            assert actual_line == expected_line
+
+        pdftext = pdftext.replace(b"\r\n", b"\n")  # fix for windows
+        assert text == pdftext, (
+            "PDF extracted text differs from expected value.\n\n"
+            "Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (pdftext, text)
+        )