Skip to content

Commit

Permalink
BUG: Accept tabs in cmaps (#2174)
Browse files: browse the repository at this point in the history
Closes #2173
  • Loading branch information
pubpub-zz authored Sep 8, 2023
1 parent 05f2a65 commit ad4f13d
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
5 changes: 3 additions & 2 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def parse_to_unicode(
cm = prepare_cm(ft)
for line in cm.split(b"\n"):
process_rg, process_char, multiline_rg = process_cm_line(
line.strip(b" "),
line.strip(b" \t"),
process_rg,
process_char,
multiline_rg,
Expand Down Expand Up @@ -295,8 +295,9 @@ def process_cm_line(
map_dict: Dict[Any, Any],
int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
if line in (b"", b" ") or line[0] == 37: # 37 = %
if line == b"" or line[0] == 37: # 37 = %
return process_rg, process_char, multiline_rg
line = line.replace(b"\t", b" ")
if b"beginbfrange" in line:
process_rg = True
elif b"endbfrange" in line:
Expand Down
9 changes: 9 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,12 @@ def test_unixxx_glyphs():
txt = reader.pages[0].extract_text() # no error
for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"):
assert pat in txt


@pytest.mark.enable_socket()
def test_tabs_in_cmap():
    """Regression test for issue #2173 (tabs inside a cmap).

    Downloads the sample PDF attached to the issue and extracts the text of
    its first page. There is no assertion on the extracted text: the test
    passes as long as extraction does not raise.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/12552700/tt.pdf",
        name="iss2173.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    first_page.extract_text()  # must complete without error

0 comments on commit ad4f13d

Please sign in to comment.