From e473ec7566db0ce7286c86b7ebd6b56edc6e3650 Mon Sep 17 00:00:00 2001
From: hpierre001 <58784889+hpierre001@users.noreply.github.com>
Date: Tue, 22 Oct 2024 21:27:42 +0200
Subject: [PATCH 1/2] ENH: Add ``layout_mode_font_height_weight`` argument to
 ``PageObject.extract_text()``

---
 pypdf/_page.py                                           | 8 +++++++-
 pypdf/_text_extraction/_layout_mode/_fixed_width_page.py | 5 +++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index a6f8abaeb..3c794d7a0 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2210,6 +2210,7 @@ def _layout_mode_text(
         scale_weight: float = 1.25,
         strip_rotated: bool = True,
         debug_path: Optional[Path] = None,
+        font_height_weight: float = 1,
     ) -> str:
         """
         Get text preserving fidelity to source PDF text layout.
@@ -2229,6 +2230,8 @@ def _layout_mode_text(
                   - bts.json: text render ops left justified and grouped by BT/ET operators
                   - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
                 Defaults to None.
+            font_height_weight: multiplier applied to the font height when inferring
+                blank lines; values below 1 tend to add more blank lines. Defaults to 1.
 
         Returns:
             str: multiline string containing page text in a fixed width format that
@@ -2260,7 +2263,7 @@ def _layout_mode_text(
 
         char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
 
-        return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
+        return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
 
     def extract_text(
         self,
@@ -2335,6 +2338,8 @@ def extract_text(
                   - tjs.json: individual text render ops with corresponding transform matrices
                   - bts.json: text render ops left justified and grouped by BT/ET operators
                   - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
+            layout_mode_font_height_weight (float): multiplier applied to the font height when
+                inferring blank lines; values below 1 tend to add more blank lines. Defaults to 1.
 
         Returns:
             The extracted text
@@ -2358,6 +2363,7 @@ def extract_text(
                 scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
                 strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
                 debug_path=kwargs.get("layout_mode_debug_path"),
+                font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
             )
         if len(args) >= 1:
             if isinstance(args[0], str):
diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
index 80dc2ed26..9c516d1ea 100644
--- a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
+++ b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -343,7 +343,7 @@ def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> fl
 
 
 def fixed_width_page(
-    ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
+    ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
 ) -> str:
     """
     Generate page text from text operations grouped by rendered y coordinate.
@@ -352,6 +352,7 @@ def fixed_width_page(
         ty_groups: dict of text show ops as returned by y_coordinate_groups()
         char_width: fixed character width
         space_vertically: include blank lines inferred from y distance + font height.
+        font_height_weight: multiplier for font height when calculating blank lines.
 
     Returns:
         str: page text in a fixed width format that closely adheres to the rendered
@@ -363,7 +364,7 @@ def fixed_width_page(
     for y_coord, line_data in ty_groups.items():
         if space_vertically and lines:
             blank_lines = (
-                int(abs(y_coord - last_y_coord) / line_data[0]["font_height"]) - 1
+                int(abs(y_coord - last_y_coord) / (line_data[0]["font_height"] * font_height_weight)) - 1
             )
             lines.extend([""] * blank_lines)
         line = ""

From df762115406a867b94e6beca4ff5f2e6fa9363a5 Mon Sep 17 00:00:00 2001
From: hpierre001 <58784889+hpierre001@users.noreply.github.com>
Date: Tue, 22 Oct 2024 21:57:22 +0200
Subject: [PATCH 2/2] TST: Add test for ``layout_mode_font_height_weight`` of
 ``PageObject.extract_text()``

---
 resources/crazyones_layout_vertical_space.txt | 19 +++++++++
 ...yout_vertical_space_font_height_weight.txt | 25 +++++++++++
 tests/test_text_extraction.py                 | 41 +++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 resources/crazyones_layout_vertical_space.txt
 create mode 100644 resources/crazyones_layout_vertical_space_font_height_weight.txt

diff --git a/resources/crazyones_layout_vertical_space.txt b/resources/crazyones_layout_vertical_space.txt
new file mode 100644
index 000000000..b745f6f63
--- /dev/null
+++ b/resources/crazyones_layout_vertical_space.txt
@@ -0,0 +1,19 @@
+The Crazy Ones
+October 14, 1998
+
+   Heres to the crazy ones. The misﬁts. The rebels. The troublemakers.
+       The round pegs in the square holes.
+   The ones who see things diﬀerently. Theyre not fond of rules. And
+       they have no respect for the status quo. You can quote them,
+       disagree with them, glorify or vilify them.
+   About the only thing you cant do is ignore them. Because they change
+       things. They invent. They imagine. They heal. They explore. They
+       create. They inspire. They push the human race forward.
+   Maybe they have to be crazy.
+   How else can you stare at an empty canvas and see a work of art? Or
+       sit in silence and hear a song thats never been written? Or gaze at
+       a red planet and see a laboratory on wheels?
+   We make tools for these kinds of people.
+   While some see them as the crazy ones, we see genius. Because the
+       people who are crazy enough to think they can change the world,
+       are the ones who do.
\ No newline at end of file
diff --git a/resources/crazyones_layout_vertical_space_font_height_weight.txt b/resources/crazyones_layout_vertical_space_font_height_weight.txt
new file mode 100644
index 000000000..e90fe87e9
--- /dev/null
+++ b/resources/crazyones_layout_vertical_space_font_height_weight.txt
@@ -0,0 +1,25 @@
+The Crazy Ones
+October 14, 1998
+
+   Heres to the crazy ones. The misﬁts. The rebels. The troublemakers.
+       The round pegs in the square holes.
+
+   The ones who see things diﬀerently. Theyre not fond of rules. And
+       they have no respect for the status quo. You can quote them,
+       disagree with them, glorify or vilify them.
+
+   About the only thing you cant do is ignore them. Because they change
+       things. They invent. They imagine. They heal. They explore. They
+       create. They inspire. They push the human race forward.
+
+   Maybe they have to be crazy.
+
+   How else can you stare at an empty canvas and see a work of art? Or
+       sit in silence and hear a song thats never been written? Or gaze at
+       a red planet and see a laboratory on wheels?
+
+   We make tools for these kinds of people.
+
+   While some see them as the crazy ones, we see genius. Because the
+       people who are crazy enough to think they can change the world,
+       are the ones who do.
\ No newline at end of file
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index c04d4ceb0..a92525e84 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -219,3 +219,44 @@ def test_text_leading_height_unit():
     page = reader.pages[0]
     extracted = page.extract_text()
     assert "Something[cited]\n" in extracted
+
+
+def test_layout_mode_space_vertically_font_height_weight():
+    """
+    Tests layout mode with vertical space and font height weight (issue #2915).
+
+    The default weight must reproduce the reference layout, while a weight
+    below 1 must add blank lines that separate the paragraphs.
+    """
+    # (extra extract_text() keyword arguments, file holding the expected text)
+    cases = [
+        # Normal behaviour
+        ({}, "crazyones_layout_vertical_space.txt"),
+        # Blank lines are added to truly separate paragraphs
+        (
+            {"layout_mode_font_height_weight": 0.85},
+            "crazyones_layout_vertical_space_font_height_weight.txt",
+        ),
+    ]
+    with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as inputfile:
+        # Load PDF file from file
+        reader = PdfReader(inputfile)
+        page = reader.pages[0]
+
+        for extra_kwargs, expected_name in cases:
+            expected = (RESOURCE_ROOT / expected_name).read_bytes()
+            expected = expected.replace(b"\r\n", b"\n")  # fix for windows
+
+            text = page.extract_text(
+                extraction_mode="layout", layout_mode_space_vertically=True, **extra_kwargs
+            ).encode("utf-8")
+
+            # Compare line by line first so that a failure points at the first differing line
+            for actual_line, expected_line in zip(text.splitlines(), expected.splitlines()):
+                assert actual_line == expected_line
+
+            # zip() stops at the shorter input, so also compare the complete text
+            assert text == expected, (
+                "PDF extracted text differs from expected value.\n\n"
+                f"Expected:\n\n{expected!r}\n\nExtracted:\n\n{text!r}\n\n"
+            )