py-pdf · MartinThoma · Jan 11, 2024 · Jan 3, 2024 · Jan 3, 2024 · Jan 3, 2024
diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
@@ -20,6 +20,21 @@ print(page.extract_text(0))
 print(page.extract_text((0, 90)))
 ```
 
+You can also extract text in "layout" mode:
+
+```python
+# extract text in a fixed width format that closely adheres to the rendered
+# layout in the source pdf
+print(page.extract_text(extraction_mode="layout"))
+
+# extract text preserving horizontal positioning without excess vertical
+# whitespace (removes blank and "whitespace only" lines)
+print(page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False))
+
+# adjust horizontal spacing
+print(page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0))
+```
+
 Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extract_text) for more details.
 
 ## Using a visitor

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -27,9 +27,12 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
+import json
 import math
 import re
+import sys
 from decimal import Decimal
+from pathlib import Path
 from typing import (
  Any,
  Callable,
@@ -50,6 +53,7 @@
 from ._protocols import PdfReaderProtocol, PdfWriterProtocol
 from ._text_extraction import (
  OrientationNotFoundError,
+ _layout_mode,
  crlf_space_check,
  handle_tj,
  mult,
@@ -83,6 +87,12 @@
  StreamObject,
 )
 
+if sys.version_info >= (3, 8):
+ from typing import Literal
+else:
+ from typing_extensions import Literal
+
+
 MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'
 
 
@@ -1868,6 +1878,83 @@
  visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
  return output
 
+ def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
+ """
+ Get fonts formatted for "layout" mode text extraction.
+
+ Walks up the "/Parent" chain from this page until an object carrying
+ resources is found, then builds one _layout_mode.Font per entry of that
+ object's "/Font" dictionary. The result is empty when there is no
+ "/Font" entry or when self.pdf is None.
+
+ Returns:
+ Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name
+ """
+ # Font retrieval logic adapted from pypdf.PageObject._extract_text()
+ objr: Any = self
+ while NameObject(PG.RESOURCES) not in objr:
+ objr = objr["/Parent"].get_object()
+ resources_dict: Any = objr[PG.RESOURCES]
+ fonts: Dict[str, _layout_mode.Font] = {}
+ if "/Font" in resources_dict and self.pdf is not None:
+ for font_name in resources_dict["/Font"]:
+ # the last value returned by build_char_map is the font dictionary
+ # object; the preceding values are forwarded unchanged to Font
+ *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
+ font_dict = {
+ k: self.pdf.get_object(v)
+ if isinstance(v, IndirectObject)
+ else [
+ self.pdf.get_object(_v) if isinstance(_v, IndirectObject) else _v
+ for _v in v
+ ]
+ if isinstance(v, ArrayObject)
+ else v
+ for k, v in font_dict_obj.items()
+ }
- font_dict = {
- k: self.pdf.get_object(v)
- if isinstance(v, IndirectObject)
- else [
- self.pdf.get_object(_v) if isinstance(_v, IndirectObject) else _v
- for _v in v
- ]
- if isinstance(v, ArrayObject)
- else v
- for k, v in font_dict_obj.items()
- }
+ font_dict = {
+ k: v.get_object() for k, v in font_dict_obj.items()
+ }
- font_dict = {
- k: self.pdf.get_object(v)
- if isinstance(v, IndirectObject)
- else [
- self.pdf.get_object(_v) if isinstance(_v, IndirectObject) else _v
- for _v in v
- ]
- if isinstance(v, ArrayObject)
- else v
- for k, v in font_dict_obj.items()
- }
+ font_dict = {
+ k: v.get_object() for k, v in font_dict_obj.items()
+ }
+ # mypy cannot type-check the star-unpacked cmap values, hence the ignore
+ fonts[font_name] = _layout_mode.Font(*cmap, font_dict) # type: ignore
+ return fonts
+
+ def _layout_mode_text(
+     self,
+     space_vertically: bool = True,
+     scale_weight: float = 1.25,
+     debug_path: Union[Path, None] = None,
+ ) -> str:
+     """
+     Get text preserving fidelity to source PDF text layout.
+
+     Args:
+         space_vertically: include blank lines inferred from y distance + font
+             height. Defaults to True.
+         scale_weight: multiplier for string length when calculating weighted
+             average character width. Defaults to 1.25.
+         debug_path (Path | None): if supplied, must target a directory.
+             Debug information for the layout mode functions is written to
+             the following files inside that directory:
+               - fonts.json: output of self._layout_mode_fonts
+               - tjs.json: individual text render ops with corresponding xform matrices
+               - bts.json: text render ops left justified and grouped by BT/ET operators
+               - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
+             Defaults to None.
+
+     Returns:
+         str: multiline string containing page text in a fixed width format that
+         closely adheres to the rendered layout in the source pdf. An empty
+         string is returned when no BT groups are found.
+     """
+     fonts = self._layout_mode_fonts()
+     if debug_path is not None:
+         # debug_path is a directory, so the file must be created *inside* it.
+         # Path.with_name() would replace the directory's own name and drop
+         # fonts.json next to the directory instead.
+         debug_path.joinpath("fonts.json").write_text(
+             json.dumps(fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)),
+             "utf-8",
+         )
+
+     ops = iter(ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations)
+     bt_groups = _layout_mode.text_show_operations(ops, fonts, debug_path)
+
+     if not bt_groups:
+         return ""
+
+     ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
+
+     char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
+
+     return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
+
  def extract_text(
  self,
  *args: Any,
@@ -1876,6 +1963,8 @@
  visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
  visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
  visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+ extraction_mode: Literal["plain", "layout"] = "plain",
+ **kwargs: Any,
  ) -> str:
  """
  Locate all text drawing commands, in the order they are provided in the
@@ -1913,10 +2002,35 @@
  text matrix, font-dictionary and font-size.
  The font-dictionary may be None in case of unknown fonts.
  If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
+ extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
+ "layout" for experimental layout mode functionality.
+ NOTE: orientations, space_width, and visitor_* parameters are NOT respected
+ in "layout" mode.
+
+ KwArgs:
+ layout_mode_space_vertically: include blank lines inferred from y distance + font
+ height. Defaults to True.
+ layout_mode_scale_weight: multiplier for string length when calculating weighted
+ average character width. Defaults to 1.25.
+ layout_mode_debug_path (Path | None): if supplied, must target a directory.
+ creates the following files with debug information for layout mode
+ functions if supplied:
+ - fonts.json: output of self._layout_mode_fonts
+ - tjs.json: individual text render ops with corresponding xform matrices
+ - bts.json: text render ops left justified and grouped by BT/ET operators
+ - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
 
  Returns:
  The extracted text
  """
+ if extraction_mode not in ["plain", "layout"]:
+ raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
+ if extraction_mode == "layout":
+ return self._layout_mode_text(
+ space_vertically=kwargs.get("layout_mode_space_vertically", True),
+ scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
+ debug_path=kwargs.get("layout_mode_debug_path", None)
+ )
  if len(args) >= 1:
  if isinstance(args[0], str):
  if len(args) >= 3:

diff --git a/pypdf/_text_extraction/_layout_mode/__init__.py b/pypdf/_text_extraction/_layout_mode/__init__.py
@@ -0,0 +1,5 @@
+"""layout mode text extraction extension for pypdf"""
+from ._fixed_width_page import fixed_char_width, fixed_width_page, text_show_operations, y_coordinate_groups
+from ._fonts import Font
+
+__all__ = ["fixed_char_width", "fixed_width_page", "text_show_operations", "y_coordinate_groups", "Font"]