def type1_alternative(
    ft: "DictionaryObject",
    map_dict: Dict[Any, Any],
    space_code: int,
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], int, List[int]]:
    """
    Build a character map from the encoding embedded in a Type1 font program.

    The font program is read from ``ft["/FontDescriptor"]["/FontFile"]``.
    Only the clear-text part (everything before ``eexec``) is inspected, and
    within it the section following the first ``/Encoding`` keyword.  Each
    line of the form ``dup <code> <glyph name> put`` whose glyph name is known
    to ``adobe_glyphs`` adds one entry to the map.

    Args:
        ft: Font dictionary.
        map_dict: Mapping updated in place; ``chr(code)`` is mapped to the
            text of the glyph.
        space_code: Current code of the space character; replaced by the code
            of a glyph whose text is ``" "`` when such a glyph is found.
        int_entry: List updated in place with every code that was mapped.

    Returns:
        The tuple ``(map_dict, space_code, int_entry)``.  The inputs are
        returned unchanged when the font has no ``/FontDescriptor``, no
        ``/FontFile`` or no ``/Encoding`` section.
    """
    if "/FontDescriptor" not in ft:
        return map_dict, space_code, int_entry
    font_file = cast("DictionaryObject", ft["/FontDescriptor"]).get("/FontFile")
    if font_file is None:
        return map_dict, space_code, int_entry

    # Only the part before "eexec" is clear text; the rest is encrypted.
    clear_text = font_file.get_object().get_data().split(b"eexec\n")[0]
    sections = clear_text.split(b"/Encoding")
    if len(sections) < 2:
        # No encoding definition in the clear part: nothing to extract.
        return map_dict, space_code, int_entry

    for line in sections[1].replace(b"\r", b"\n").split(b"\n"):
        if not line.startswith(b"dup"):
            continue
        words = [word for word in line.split(b" ") if word]
        # A usable entry has exactly the shape: dup <code> <glyph name> put.
        # Truncated or otherwise malformed lines are skipped, not fatal.
        if len(words) < 4 or words[3] != b"put":
            continue
        try:
            code = int(words[1])
            glyph = adobe_glyphs[words[2].decode()]
        except (ValueError, KeyError):
            # Non-numeric code, undecodable name or unknown glyph name.
            continue
        if glyph == " ":
            space_code = code
        map_dict[chr(code)] = glyph
        int_entry.append(code)
    return map_dict, space_code, int_entry
+13424,540 @@ "/zuhiragana": "\u305A", "/zukatakana": "\u30BA", "/zwarakay": "\u0659", + # manually added from + # https://github.com/serviceprototypinglab/latex-pdfa/blob/master/glyphtounicode-cmr.tex + "/angbracketleftBig": "\u27E8", + "/angbracketleftBigg": "\u27E8", + "/angbracketleftbig": "\u27E8", + "/angbracketleftbigg": "\u27E8", + "/angbracketrightBig": "\u27E9", + "/angbracketrightBigg": "\u27E9", + "/angbracketrightbig": "\u27E9", + "/angbracketrightbigg": "\u27E9", + "/arrowbt": "\u2193", + "/arrowdblbt": "\u21D3", + "/arrowdbltp": "\u21D1", + "/arrowhookleft": "\u21AA", + "/arrowhookright": "\u21A9", + "/arrowtp": "\u2191", + # diff : "/arrowvertex": "\u23D0", + "/arrowvertexdbl": "\uED12", + "/backslashBig": "\u005C", + "/backslashBigg": "\u005C", + "/backslashbig": "\u005C", + "/backslashbigg": "\u005C", + # diff : "/braceex": "\u23AA", + "/bracehtipdownleft": "\uED17", + "/bracehtipdownright": "\uED18", + "/bracehtipupleft": "\uED19", + "/bracehtipupright": "\uED1A", + "/braceleftBig": "\u007B", + "/braceleftBigg": "\u007B", + "/braceleftbig": "\u007B", + "/braceleftbigg": "\u007B", + # diff : "/braceleftbt": "\u23A9", + # diff : "/braceleftmid": "\u23A8", + # diff : "/bracelefttp": "\u23A7", + "/bracerightBig": "\u007D", + "/bracerightBigg": "\u007D", + "/bracerightbig": "\u007D", + "/bracerightbigg": "\u007D", + # diff : "/bracerightbt": "\u23AD", + # diff : "/bracerightmid": "\u23AC", + # diff : "/bracerighttp": "\u23AB", + "/bracketleftBig": "\u005B", + "/bracketleftBigg": "\u005B", + "/bracketleftbig": "\u005B", + "/bracketleftbigg": "\u005B", + # diff : "/bracketleftbt": "\u23A3", + # diff : "/bracketleftex": "\u23A2", + # diff : "/bracketlefttp": "\u23A1", + "/bracketrightBig": "\u005D", + "/bracketrightBigg": "\u005D", + "/bracketrightbig": "\u005D", + "/bracketrightbigg": "\u005D", + # diff : "/bracketrightbt": "\u23A6", + # diff : "/bracketrightex": "\u23A5", + # diff : "/bracketrighttp": "\u23A4", + "/ceilingleftBig": "\u2308", +
"/ceilingleftBigg": "\u2308", + "/ceilingleftbig": "\u2308", + "/ceilingleftbigg": "\u2308", + "/ceilingrightBig": "\u2309", + "/ceilingrightBigg": "\u2309", + "/ceilingrightbig": "\u2309", + "/ceilingrightbigg": "\u2309", + "/circledotdisplay": "\u2A00", + "/circledottext": "\u2A00", + "/circlemultiplydisplay": "\u2A02", + "/circlemultiplytext": "\u2A02", + "/circleplusdisplay": "\u2A01", + "/circleplustext": "\u2A01", + "/contintegraldisplay": "\u222E", + "/contintegraltext": "\u222E", + "/coproductdisplay": "\u2210", + "/coproducttext": "\u2210", + "/floorleftBig": "\u230A", + "/floorleftBigg": "\u230A", + "/floorleftbig": "\u230A", + "/floorleftbigg": "\u230A", + "/floorrightBig": "\u230B", + "/floorrightBigg": "\u230B", + "/floorrightbig": "\u230B", + "/floorrightbigg": "\u230B", + "/hatwide": "\u02C6", + "/hatwider": "\u02C6", + "/hatwidest": "\u02C6", + "/integraldisplay": "\u222B", + "/integraltext": "\u222B", + "/intersectiondisplay": "\u22C2", + "/intersectiontext": "\u22C2", + "/logicalanddisplay": "\u22C0", + "/logicalandtext": "\u22C0", + "/logicalordisplay": "\u22C1", + "/logicalortext": "\u22C1", + "/mapsto": "\u21A6", + "/parenleftBig": "\u0028", + "/parenleftBigg": "\u0028", + "/parenleftbig": "\u0028", + "/parenleftbigg": "\u0028", + # diff : "/parenleftbt": "\u239D", + # diff : "/parenleftex": "\u239C", + # diff : "/parenlefttp": "\u239B", + "/parenrightBig": "\u0029", + "/parenrightBigg": "\u0029", + "/parenrightbig": "\u0029", + "/parenrightbigg": "\u0029", + # diff : "/parenrightbt": "\u23A0", + # diff : "/parenrightex": "\u239F", + # diff : "/parenrighttp": "\u239E", + "/productdisplay": "\u220F", + "/producttext": "\u220F", + "/radicalBig": "\u221A", + "/radicalBigg": "\u221A", + "/radicalbig": "\u221A", + "/radicalbigg": "\u221A", + "/radicalbt": "\u221A", + "/radicaltp": "\uED6A", + "/radicalvertex": "\uED6B", + "/slashBig": "\u002F", + "/slashBigg": "\u002F", + "/slashbig": "\u002F", + "/slashbigg": "\u002F", + "/summationdisplay": 
"\u2211", + "/summationtext": "\u2211", + "/tie": "\u2040", + "/tildewide": "\u02DC", + "/tildewider": "\u02DC", + "/tildewidest": "\u02DC", + "/uniondisplay": "\u22C3", + "/unionmultidisplay": "\u2A04", + "/unionmultitext": "\u2A04", + "/unionsqdisplay": "\u2A06", + "/unionsqtext": "\u2A06", + "/uniontext": "\u22C3", + "/vextenddouble": "\uED79", + "/vextendsingle": "\u23D0", + "/a1": "\u25C1", + "/a2": "\u22B4", + "/a3": "\u25B7", + "/a4": "\u22B5", + "/a40": "\u02C2", + "/a41": "\u02C3", + "/a42": "\u2303", + "/a43": "\u2304", + "/a48": "\u2127", + "/a49": "\u22C8", + "/a50": "\u25A1", + "/a51": "\u25C7", + "/a58": "\u2053", + "/a59": "\u219D", + "/a60": "\u228F", + "/a61": "\u2290", + "/d0": "\u2199", + "/d1": "\u2199", + "/d2": "\u2199", + "/d3": "\u2199", + "/d4": "\u2199", + "/d5": "\u2199", + "/d6": "\u2199", + "/d7": "\u2193", + "/d8": "\u2193", + "/d9": "\u2193", + "/d10": "\u2193", + "/d11": "\u2193", + "/d12": "\u2193", + "/d13": "\u2193", + "/d14": "\u2193", + "/d15": "\u2193", + "/d16": "\u2193", + "/d17": "\u2193", + "/d18": "\u2193", + "/d19": "\u2193", + "/d20": "\u2193", + "/d21": "\u2193", + "/d22": "\u2193", + "/d23": "\u2193", + "/d24": "\u2198", + "/d25": "\u2198", + "/d26": "\u2198", + "/d27": "\u2198", + "/d28": "\u2198", + "/d29": "\u2198", + "/d30": "\u2198", + "/d31": "\u2198", + "/d32": "\u2198", + "/d33": "\u2198", + "/d34": "\u2198", + "/d35": "\u2198", + "/d36": "\u2198", + "/d37": "\u2198", + "/d38": "\u2198", + "/d39": "\u2192", + "/d40": "\u2192", + "/d41": "\u2192", + "/d42": "\u2192", + "/d43": "\u2192", + "/d44": "\u2192", + "/d45": "\u2192", + "/d46": "\u2192", + "/d47": "\u2192", + "/d48": "\u2192", + "/d49": "\u2192", + "/d50": "\u2192", + "/d51": "\u2192", + "/d52": "\u2192", + "/d53": "\u2192", + "/d54": "\u2192", + "/d55": "\u2192", + "/d56": "\u2197", + "/d57": "\u2197", + "/d58": "\u2197", + "/d59": "\u2197", + "/d60": "\u2197", + "/d61": "\u2197", + "/d62": "\u2197", + "/d63": "\u2197", + "/d64": "\u2197", + "/d65": 
"\u2197", + "/d66": "\u2197", + "/d67": "\u2197", + "/d68": "\u2197", + "/d69": "\u2197", + "/d70": "\u2197", + "/d71": "\u2191", + "/d72": "\u2191", + "/d73": "\u2191", + "/d74": "\u2191", + "/d75": "\u2191", + "/d76": "\u2191", + "/d77": "\u2191", + "/d78": "\u2191", + "/d79": "\u2191", + "/d80": "\u2191", + "/d81": "\u2191", + "/d82": "\u2191", + "/d83": "\u2191", + "/d84": "\u2191", + "/d85": "\u2191", + "/d86": "\u2191", + "/d87": "\u2191", + "/d88": "\u2196", + "/d89": "\u2196", + "/d90": "\u2196", + "/d91": "\u2196", + "/d92": "\u2196", + "/d93": "\u2196", + "/d94": "\u2196", + "/d95": "\u2196", + "/d96": "\u2196", + "/d97": "\u2196", + "/d98": "\u2196", + "/d99": "\u2196", + "/d100": "\u2196", + "/d101": "\u2196", + "/d102": "\u2196", + "/d103": "\u2190", + "/d104": "\u2190", + "/d105": "\u2190", + "/d106": "\u2190", + "/d107": "\u2190", + "/d108": "\u2190", + "/d109": "\u2190", + "/d110": "\u2190", + "/d111": "\u2190", + "/d112": "\u2190", + "/d113": "\u2190", + "/d114": "\u2190", + "/d115": "\u2190", + "/d116": "\u2190", + "/d117": "\u2190", + "/d118": "\u2190", + "/d119": "\u2190", + "/d120": "\u2199", + "/d121": "\u2199", + "/d122": "\u2199", + "/d123": "\u2199", + "/d124": "\u2199", + "/d125": "\u2199", + "/d126": "\u2199", + "/d127": "\u2199", + # manually added from + # https://github.com/kohler/lcdf-typetools/blob/master/texglyphlist.txt + "/Ifractur": "\u2111", + "/FFsmall": "\uF766", + "/FFIsmall": "\uF766", + "/FFLsmall": "\uF766", + "/FIsmall": "\uF766", + "/FLsmall": "\uF766", + # diff : "/Germandbls": "\u0053", + "/Germandblssmall": "\uF773", + "/Ng": "\u014A", + "/Rfractur": "\u211C", + "/SS": "\u0053", + "/SSsmall": "\uF773", + "/altselector": "\uD802", + "/angbracketleft": "\u27E8", + "/angbracketright": "\u27E9", + "/arrowbothv": "\u2195", + "/arrowdblbothv": "\u21D5", + "/arrowleftbothalf": "\u21BD", + "/arrowlefttophalf": "\u21BC", + "/arrownortheast": "\u2197", + "/arrownorthwest": "\u2196", + "/arrowrightbothalf": "\u21C1", + 
"/arrowrighttophalf": "\u21C0", + "/arrowsoutheast": "\u2198", + "/arrowsouthwest": "\u2199", + "/ascendercompwordmark": "\uD80A", + "/asteriskcentered": "\u2217", + "/bardbl": "\u2225", + "/capitalcompwordmark": "\uD809", + "/circlecopyrt": "\u20DD", + "/circledivide": "\u2298", + "/circleminus": "\u2296", + "/coproduct": "\u2A3F", + "/ct": "\u0063", + "/cwm": "\u200C", + "/dblbracketleft": "\u27E6", + "/dblbracketright": "\u27E7", + # diff : "/diamond": "\u2662", + "/diamondmath": "\u22C4", + # diff : "/dotlessj": "\u0237", + "/emptyslot": "\uD801", + "/epsilon1": "\u03F5", + "/epsiloninv": "\u03F6", + "/equivasymptotic": "\u224D", + "/flat": "\u266D", + "/follows": "\u227B", + "/followsequal": "\u2AB0", + "/followsorcurly": "\u227D", + "/greatermuch": "\u226B", + # diff : "/heart": "\u2661", + "/interrobangdown": "\u2E18", + "/intersectionsq": "\u2293", + "/latticetop": "\u22A4", + "/lessmuch": "\u226A", + "/longdbls": "\u017F", + "/longsh": "\u017F", + "/longsi": "\u017F", + "/longsl": "\u017F", + "/longst": "\uFB05", + "/lscript": "\u2113", + "/natural": "\u266E", + "/negationslash": "\u0338", + "/ng": "\u014B", + "/owner": "\u220B", + "/pertenthousand": "\u2031", + # diff : "/phi": "\u03D5", + # diff : "/phi1": "\u03C6", + "/pi1": "\u03D6", + "/precedesequal": "\u2AAF", + "/precedesorcurly": "\u227C", + "/prime": "\u2032", + "/rho1": "\u03F1", + "/ringfitted": "\uD80D", + "/sharp": "\u266F", + "/similarequal": "\u2243", + "/slurabove": "\u2322", + "/slurbelow": "\u2323", + "/st": "\uFB06", + "/subsetsqequal": "\u2291", + "/supersetsqequal": "\u2292", + "/triangle": "\u25B3", + "/triangleinv": "\u25BD", + "/triangleleft": "\u25C1", + # diff : "/triangleright": "\u25B7", + "/turnstileleft": "\u22A2", + "/turnstileright": "\u22A3", + "/twelveudash": "\uD80C", + "/unionmulti": "\u228E", + "/unionsq": "\u2294", + "/vector": "\u20D7", + "/visualspace": "\u2423", + "/Dbar": "\u0110", + "/compwordmark": "\u200C", + "/dbar": "\u0111", + "/rangedash": "\u2013", + 
"/hyphenchar": "\u002D", + "/punctdash": "\u2014", + "/visiblespace": "\u2423", + "/Yen": "\u00A5", + "/anticlockwise": "\u27F2", + "/arrowparrleftright": "\u21C6", + "/arrowparrrightleft": "\u21C4", + "/arrowtailleft": "\u21A2", + "/arrowtailright": "\u21A3", + "/arrowtripleleft": "\u21DA", + "/arrowtripleright": "\u21DB", + "/check": "\u2713", + "/circleR": "\u00AE", + "/circleS": "\u24C8", + "/circleasterisk": "\u229B", + "/circleequal": "\u229C", + "/circlering": "\u229A", + "/clockwise": "\u27F3", + "/curlyleft": "\u21AB", + "/curlyright": "\u21AC", + "/dblarrowdwn": "\u21CA", + "/dblarrowheadleft": "\u219E", + "/dblarrowheadright": "\u21A0", + # diff : "/dblarrowup": "\u21C8", + "/defines": "\u225C", + "/diamondsolid": "\u2666", + "/difference": "\u224F", + "/downfall": "\u22CE", + "/equaldotleftright": "\u2252", + "/equaldotrightleft": "\u2253", + "/equalorfollows": "\u22DF", + # diff : "/equalorgreater": "\u2A96", + # diff : "/equalorless": "\u2A95", + "/equalsdots": "\u2251", + "/followsorequal": "\u227F", + "/forcesbar": "\u22AA", + # diff : "/fork": "\u22D4", + "/geomequivalent": "\u224E", + "/greaterdbleqlless": "\u2A8C", + "/greaterdblequal": "\u2267", + "/greaterlessequal": "\u22DB", + "/greaterorapproxeql": "\u2A86", + "/greaterorequalslant": "\u2A7E", + "/greaterorsimilar": "\u2273", + "/harpoondownleft": "\u21C3", + "/harpoondownright": "\u21C2", + "/harpoonleftright": "\u21CC", + "/harpoonrightleft": "\u21CB", + "/harpoonupleft": "\u21BF", + "/harpoonupright": "\u21BE", + "/intercal": "\u22BA", + "/lessdbleqlgreater": "\u2A8B", + "/lessdblequal": "\u2266", + "/lessequalgreater": "\u22DA", + "/lessorapproxeql": "\u2A85", + "/lessorequalslant": "\u2A7D", + "/lessorsimilar": "\u2272", + "/maltesecross": "\u2720", + "/multiopenleft": "\u22CB", + "/multiopenright": "\u22CC", + "/orunderscore": "\u22BB", + "/perpcorrespond": "\u2A5E", + # diff : "/precedesorequal": "\u227E", + "/primereverse": "\u2035", + "/revasymptequal": "\u22CD", + "/revsimilar": 
"\u223D", + "/rightanglene": "\u231D", + "/rightanglenw": "\u231C", + "/rightanglese": "\u231F", + "/rightanglesw": "\u231E", + "/satisfies": "\u22A8", + "/shiftleft": "\u21B0", + "/shiftright": "\u21B1", + "/square": "\u25A1", + "/squaredot": "\u22A1", + "/squareminus": "\u229F", + "/squaremultiply": "\u22A0", + "/squareplus": "\u229E", + "/squaresolid": "\u25A0", + "/squiggleleftright": "\u21AD", + "/squiggleright": "\u21DD", + "/subsetdblequal": "\u2AC5", + "/supersetdbl": "\u22D1", + "/supersetdblequal": "\u2AC6", + "/triangledownsld": "\u25BC", + "/triangleleftequal": "\u22B4", + "/triangleleftsld": "\u25C0", + "/trianglerightequal": "\u22B5", + "/trianglerightsld": "\u25B6", + "/trianglesolid": "\u25B2", + "/uprise": "\u22CF", + # diff : "/Digamma": "\u1D7C", + "/Finv": "\u2132", + "/Gmir": "\u2141", + "/Omegainv": "\u2127", + "/approxorequal": "\u224A", + "/archleftdown": "\u21B6", + "/archrightdown": "\u21B7", + "/beth": "\u2136", + "/daleth": "\u2138", + "/dividemultiply": "\u22C7", + "/downslope": "\u29F9", + "/equalorsimilar": "\u2242", + "/follownotdbleqv": "\u2ABA", + "/follownotslnteql": "\u2AB6", + "/followornoteqvlnt": "\u22E9", + "/greaternotdblequal": "\u2A8A", + "/greaternotequal": "\u2A88", + "/greaterornotdbleql": "\u2269", + "/greaterornotequal": "\u2269", + "/integerdivide": "\u2216", + "/lessnotdblequal": "\u2A89", + "/lessnotequal": "\u2A87", + "/lessornotdbleql": "\u2268", + "/lessornotequal": "\u2268", + "/multicloseleft": "\u22C9", + "/multicloseright": "\u22CA", + "/notapproxequal": "\u2247", + "/notarrowboth": "\u21AE", + "/notarrowleft": "\u219A", + "/notarrowright": "\u219B", + "/notbar": "\u2224", + "/notdblarrowboth": "\u21CE", + "/notdblarrowleft": "\u21CD", + "/notdblarrowright": "\u21CF", + "/notfollows": "\u2281", + "/notfollowsoreql": "\u2AB0", + "/notforces": "\u22AE", + "/notforcesextra": "\u22AF", + "/notgreaterdblequal": "\u2267", + "/notgreaterequal": "\u2271", + "/notgreaterorslnteql": "\u2A7E", + "/notlessdblequal": 
"\u2266", + "/notlessequal": "\u2270", + "/notlessorslnteql": "\u2A7D", + "/notprecedesoreql": "\u2AAF", + "/notsatisfies": "\u22AD", + "/notsimilar": "\u2241", + "/notsubseteql": "\u2288", + "/notsubsetordbleql": "\u2AC5", + "/notsubsetoreql": "\u228A", + "/notsuperseteql": "\u2289", + "/notsupersetordbleql": "\u2AC6", + "/notsupersetoreql": "\u228B", + "/nottriangeqlleft": "\u22EC", + "/nottriangeqlright": "\u22ED", + "/nottriangleleft": "\u22EA", + "/nottriangleright": "\u22EB", + "/notturnstile": "\u22AC", + "/planckover2pi": "\u210F", + "/planckover2pi1": "\u210F", + "/precedenotdbleqv": "\u2AB9", + "/precedenotslnteql": "\u2AB5", + "/precedeornoteqvlnt": "\u22E8", + "/subsetnoteql": "\u228A", + "/subsetornotdbleql": "\u2ACB", + "/supersetnoteql": "\u228B", + "/supersetornotdbleql": "\u2ACC", + "/upslope": "\u29F8", } diff --git a/tests/test_cmap.py b/tests/test_cmap.py index ce91fd23c..f74da326d 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -168,3 +168,14 @@ def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text) for contained in within_text: assert contained in extracted assert caplog_text in caplog.text + + +@pytest.mark.enable_socket() +def test_latex(): + url = "https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf" + name = "math_latex.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + txt = reader.pages[0].extract_text() # no error + for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"): + assert pat in txt + # actually the ϕ and φ seems to be crossed in latex