Skip to content

Commit

Permalink
ENH: Add layout_mode_font_height_weight argument to ``PageObject.extract_text()``
Browse files (browse the repository at this point in the history)
  • Loading branch information
hpierre001 committed Oct 22, 2024
1 parent 80c3939 commit dad1788
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 3 deletions.
8 changes: 7 additions & 1 deletion pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2183,6 +2183,7 @@ def _layout_mode_text(
scale_weight: float = 1.25,
strip_rotated: bool = True,
debug_path: Optional[Path] = None,
font_height_weight: float = 1,
) -> str:
"""
Get text preserving fidelity to source PDF text layout.
Expand All @@ -2202,6 +2203,8 @@ def _layout_mode_text(
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
Defaults to None.
font_height_weight: multiplier for font height when calculating
blank lines. Defaults to 1.
Returns:
str: multiline string containing page text in a fixed width format that
Expand Down Expand Up @@ -2232,7 +2235,7 @@ def _layout_mode_text(

char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)

def extract_text(
self,
Expand Down Expand Up @@ -2307,6 +2310,8 @@ def extract_text(
- tjs.json: individual text render ops with corresponding transform matrices
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
layout_mode_font_height_weight (float): multiplier for font height when calculating
blank lines. Defaults to 1.
Returns:
The extracted text
Expand All @@ -2329,6 +2334,7 @@ def extract_text(
scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
debug_path=kwargs.get("layout_mode_debug_path", None),
font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
)
if len(args) >= 1:
if isinstance(args[0], str):
Expand Down
5 changes: 3 additions & 2 deletions pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> fl


def fixed_width_page(
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
"""
Generate page text from text operations grouped by rendered y coordinate.
Expand All @@ -347,6 +347,7 @@ def fixed_width_page(
ty_groups: dict of text show ops as returned by y_coordinate_groups()
char_width: fixed character width
space_vertically: include blank lines inferred from y distance + font height.
font_height_weight: multiplier for font height when calculating blank lines.
Returns:
str: page text in a fixed width format that closely adheres to the rendered
Expand All @@ -357,7 +358,7 @@ def fixed_width_page(
for y_coord, line_data in ty_groups.items():
if space_vertically and lines:
blank_lines = (
int(abs(y_coord - last_y_coord) / line_data[0]["font_height"]) - 1
int(abs(y_coord - last_y_coord) / (line_data[0]["font_height"] * font_height_weight)) - 1
)
lines.extend([""] * blank_lines)
line = ""
Expand Down

0 comments on commit dad1788

Please sign in to comment.