From ad4f13d17412f529f3f595c5c295c31788110931 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Fri, 8 Sep 2023 13:39:01 +0200
Subject: [PATCH] BUG: Accept tabs in cmaps (#2174)

Closes #2173
---
 pypdf/_cmap.py     | 5 +++--
 tests/test_cmap.py | 9 +++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 369ab1904..b09119d3d 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -232,7 +232,7 @@ def parse_to_unicode(
     cm = prepare_cm(ft)
     for line in cm.split(b"\n"):
         process_rg, process_char, multiline_rg = process_cm_line(
-            line.strip(b" "),
+            line.strip(b" \t"),
             process_rg,
             process_char,
             multiline_rg,
@@ -295,8 +295,9 @@ def process_cm_line(
     map_dict: Dict[Any, Any],
     int_entry: List[int],
 ) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
-    if line in (b"", b" ") or line[0] == 37:  # 37 = %
+    if line == b"" or line[0] == 37:  # 37 = %
         return process_rg, process_char, multiline_rg
+    line = line.replace(b"\t", b" ")
     if b"beginbfrange" in line:
         process_rg = True
     elif b"endbfrange" in line:
diff --git a/tests/test_cmap.py b/tests/test_cmap.py
index fe769e1c5..262869c94 100644
--- a/tests/test_cmap.py
+++ b/tests/test_cmap.py
@@ -189,3 +189,12 @@ def test_unixxx_glyphs():
     txt = reader.pages[0].extract_text()  # no error
     for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"):
         assert pat in txt
+
+
+@pytest.mark.enable_socket()
+def test_tabs_in_cmap():
+    """Issue #2173"""
+    url = "https://github.com/py-pdf/pypdf/files/12552700/tt.pdf"
+    name = "iss2173.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    reader.pages[0].extract_text()