diff --git a/font_collector/font.py b/font_collector/font.py index 63d2859..180926f 100644 --- a/font_collector/font.py +++ b/font_collector/font.py @@ -2,9 +2,21 @@ import os from .exceptions import InvalidFontException from .font_parser import FontParser, NameID -from fontTools.ttLib.tables._c_m_a_p import CmapSubtable +from ctypes import byref from fontTools.ttLib.ttFont import TTFont from fontTools.ttLib.ttCollection import TTCollection +from freetype import ( + FT_Done_Face, + FT_Done_FreeType, + FT_Exception, + FT_Face, + FT_Get_Char_Index, + FT_Get_CMap_Format, + FT_Init_FreeType, + FT_Library, + FT_New_Memory_Face, + FT_Set_Charmap, +) from typing import Any, Dict, List, Sequence, Set, Tuple _logger = logging.getLogger(__name__) @@ -243,54 +255,94 @@ def __hash__(self): def __repr__(self): return f'Filename: "{self.filename}" Family_names: "{self.family_names}", Weight: "{self.weight}", Italic: "{self.italic}, Exact_names: "{self.exact_names}", Named_instance_coordinates: "{self.named_instance_coordinates}"' - def get_missing_glyphs(self, text: Sequence[str]) -> Set[str]: + def get_missing_glyphs( + self, + text: Sequence[str], + support_only_ascii_char_for_symbol_font: bool = False + ) -> Set[str]: """ Parameters: text (Sequence[str]): Text + support_only_ascii_char_for_symbol_font (bool): + Libass only supports ASCII characters for symbol cmaps, but VSFilter can support more characters. + If you wish to use libass, we recommend that you set this param to True. + If you wish to use VSFilter, we recommend that you set this param to False. + For more details, see the issue: https://github.com/libass/libass/issues/319 Returns: A set of all the character that the font cannot display. 
""" - ttFont = TTFont(self.filename, fontNumber=self.font_index) char_not_found: Set[str] = set() - cmap_tables: List[CmapSubtable] = list( - filter(lambda table: table.platformID == 3, ttFont["cmap"].tables) - ) + library = FT_Library() + face = FT_Face() + + error = FT_Init_FreeType(byref(library)) + if error: raise FT_Exception(error) + + # We cannot use FT_New_Face due to this issue: https://github.com/rougier/freetype-py/issues/157 + with open(self.filename, mode="rb") as f: + filebody = f.read() + error = FT_New_Memory_Face(library, filebody, len(filebody), self.font_index, byref(face)) + if error: raise FT_Exception(error) + + supported_charmaps = [face.contents.charmaps[i] for i in range(face.contents.num_charmaps) if FT_Get_CMap_Format(face.contents.charmaps[i]) != -1 and face.contents.charmaps[i].contents.platform_id == 3] # GDI seems to take apple cmap if there isn't any microsoft cmap: https://github.com/libass/libass/issues/679 - if len(cmap_tables) == 0: - cmap_tables = list( - filter( - lambda table: table.platformID == 1 and table.platEncID == 0, - ttFont["cmap"].tables, - ) - ) + if len(supported_charmaps) == 0: + supported_charmaps = [face.contents.charmaps[i] for i in range(face.contents.num_charmaps) if FT_Get_CMap_Format(face.contents.charmaps[i]) != -1 and face.contents.charmaps[i].contents.platform_id == 1 and face.contents.charmaps[i].contents.encoding_id == 0] for char in text: char_found = False - for cmap_table in cmap_tables: - cmap_encoding = FontParser.get_cmap_encoding(cmap_table) + for charmap in supported_charmaps: + error = FT_Set_Charmap(face, charmap) + if error: raise FT_Exception(error) + + platform_id = charmap.contents.platform_id + encoding_id = charmap.contents.encoding_id - # Cmap isn't supported + cmap_encoding = FontParser.get_cmap_encoding(platform_id, encoding_id) + + # cmap not supported if cmap_encoding is None: continue - try: - codepoint = int.from_bytes(char.encode(cmap_encoding), "big") - except 
UnicodeEncodeError: - continue + if cmap_encoding == "unicode": + codepoint = ord(char) + else: + if cmap_encoding == "unknown": + if platform_id == 3 and encoding_id == 0: + if support_only_ascii_char_for_symbol_font and not char.isascii(): + continue + cmap_encoding = FontParser.get_symbol_cmap_encoding(face) + + if cmap_encoding is None: + # Fallback if guess fails + cmap_encoding = "cp1252" + else: + # cmap not supported + continue + + try: + codepoint = int.from_bytes(char.encode(cmap_encoding), "big") + except UnicodeEncodeError: + continue # GDI/Libass modify the codepoint for microsoft symbol cmap: https://github.com/libass/libass/blob/04a208d5d200360d2ac75f8f6cfc43dd58dd9225/libass/ass_font.c#L249-L250 - if cmap_table.platformID == 3 and cmap_table.platEncID == 0: + if platform_id == 3 and encoding_id == 0: codepoint = 0xF000 | codepoint - if codepoint in cmap_table.cmap: + index = FT_Get_Char_Index(face, codepoint) + + if index: char_found = True break if not char_found: char_not_found.add(char) + FT_Done_Face(face) + FT_Done_FreeType(library) + return char_not_found diff --git a/font_collector/font_parser.py b/font_collector/font_parser.py index 0b2412b..9c9f781 100644 --- a/font_collector/font_parser.py +++ b/font_collector/font_parser.py @@ -1,14 +1,15 @@ import freetype import logging from .exceptions import NameNotFoundException +from ctypes import byref, c_uint, create_string_buffer from enum import IntEnum from io import BufferedReader from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple from fontTools.ttLib.ttFont import TTFont -from fontTools.ttLib.tables._c_m_a_p import CmapSubtable from fontTools.ttLib.tables._n_a_m_e import NameRecord from fontTools.varLib.instancer.names import ELIDABLE_AXIS_VALUE_NAME +from freetype import FT_Face, FT_Get_Glyph_Name from struct import error as struct_error _logger = logging.getLogger(__name__) @@ -51,14 +52,14 @@ 
class FontParser: 0: "mac_roman", }, 3: { # Microsoft - 0: "utf_16_be", - 1: "utf_16_be", + 0: "unknown", + 1: "unicode", 2: "cp932", 3: "cp936", 4: "cp950", 5: "cp949", 6: "cp1361", - 10: "utf_16_be", + 10: "unicode", }, } @@ -566,18 +567,56 @@ def get_font_family_fullname_property( return families, fullnames + + @staticmethod + def get_symbol_cmap_encoding(face: FT_Face) -> Optional[str]: + """ + Parameters: + face (FT_Face): A font face + Returns: + The ANSI code page encoding of the cmap. + If the encoding cannot be guessed, it returns None. + This happens when the font doesn't use any character that is unique to a single ANSI code page. + Note: Chinese (cp936 or cp950) and Korean (cp949) don't contain any unique characters. + So, we can't recognize them. + Libass currently has an issue about this problem: https://github.com/libass/libass/issues/319 + Once libass adds logic based on the track language, this method will be deprecated. + """ + font_glyph_names: Set[str] = set() + # This is a limit set by Adobe: http://adobe-type-tools.github.io/afdko/OpenTypeFeatureFileSpecification.html#2fi-glyph-name + buffer_max = 64 + for i in range(face.contents.num_glyphs): + buffer = create_string_buffer(buffer_max) + error = FT_Get_Glyph_Name(face, c_uint(i), byref(buffer), c_uint(buffer_max)) + + if error: + continue + font_glyph_names.add(buffer.value.decode("ascii").lower()) + + count_codepage: dict[str, int] = {} + for code_page, glyph_names in UNIQUE_ADOBE_GLYPH_NAME_BY_CODE_PAGE.items(): + count = sum(1 for font_glyph_name in font_glyph_names if font_glyph_name in glyph_names) + count_codepage[code_page] = count + # If there is a tie, prefer a code page other than cp1252 + codepage_encoding = max(count_codepage, key=lambda codepage: (count_codepage[codepage], codepage != 'cp1252')) + + if count_codepage[codepage_encoding]: + return codepage_encoding + + return None + + @staticmethod - def get_cmap_encoding(cmap_table: CmapSubtable) -> Optional[str]: 
+ def get_cmap_encoding(platform_id: int, encoding_id: int) -> Optional[str]: """ Parameters: - cmap_table (CmapSubtable): CMAP table + platform_id (int): CMAP platform id + encoding_id (int): CMAP encoding id Returns: The cmap codepoint encoding. If GDI does not support the platform_id and/or platform_encoding_id, return None. """ - return FontParser.CMAP_ENCODING_MAP.get(cmap_table.platformID, {}).get( - cmap_table.platEncID, None - ) + return FontParser.CMAP_ENCODING_MAP.get(platform_id, {}).get(encoding_id, None) @staticmethod def get_name_encoding(name: NameRecord) -> Optional[str]: @@ -633,3 +672,35 @@ def is_file_font(filepath: str) -> bool: or FontParser.is_file_opentype(fontFile) or FontParser.is_file_truetype_collection(fontFile) ) + + +# The Chinese (cp936 or cp950) and Korean (cp949) aren't in this dict since they doesn't have any unique char. +# This dict have been generated with "proof/[Symbol Font] Find unique char by ansi code page.py" +# The name of those glyph is from this list: https://raw.githubusercontent.com/adobe-type-tools/agl-aglfn/4036a9ca80a62f64f9de4f7321a9a045ad0ecfd6/glyphlist.txt +UNIQUE_ADOBE_GLYPH_NAME_BY_CODE_PAGE: dict[str, Set[str]] = { + "cp874": {'angkhankhuthai', 'lolingthai', 'thanthakhatthai', 'phosamphaothai', 'phophanthai', 'paiyannoithai', 'phinthuthai', 'threethai', 'kokaithai', 'topatakthai', 'lochulathai', 'nonuthai', 'thothungthai', 'lakkhangyaothai', 'ngonguthai', 'thothanthai', 'khokhaithai', 'khokhwaithai', 'oangthai', 'sixthai', 'saraueethai', 'saraaathai', 'wowaenthai', 'chochangthai', 'fourthai', 'maihanakatthai', 'nonenthai', 'maiekthai', 'sosalathai', 'sorusithai', 'saraamthai', 'saraethai', 'saraithai', 'fofanthai', 'fofathai', 'poplathai', 'thothongthai', 'roruathai', 'chochingthai', 'nikhahitthai', 'saraiithai', 'yamakkanthai', 'luthai', 'onethai', 'sosothai', 'maitaikhuthai', 'seventhai', 'khorakhangthai', 'yoyingthai', 'sarauthai', 'dochadathai', 'ruthai', 'maichattawathai', + 'bobaimaithai', 
'sarauethai', 'saraaimaimuanthai', 'chochoethai', 'twothai', 'sarauuthai', 'phophungthai', 'saraothai', 'khomutthai', 'thophuthaothai', 'fongmanthai', 'fivethai', 'honokhukthai', 'zerothai', 'maiyamokthai', 'hohipthai', 'khokhonthai', 'ninethai', 'bahtthai', 'saraaethai', 'dodekthai', 'chochanthai', 'eightthai', 'yoyakthai', 'khokhuatthai', 'saraaimaimalaithai', 'maithothai', 'thothahanthai', 'sosuathai', 'saraathai', 'totaothai', 'maitrithai', 'momathai', 'thonangmonthothai'}, + + "cp932": {'nekatakanahalfwidth', 'okatakanahalfwidth', 'yukatakanahalfwidth', 'sekatakanahalfwidth', 'hakatakanahalfwidth', 'sakatakanahalfwidth', 'yokatakanahalfwidth', 'mekatakanahalfwidth', 'osmallkatakanahalfwidth', 'sokatakanahalfwidth', 'wakatakanahalfwidth', 'hokatakanahalfwidth', 'ismallkatakanahalfwidth', 'rakatakanahalfwidth', 'katahiraprolongmarkhalfwidth', 'ikatakanahalfwidth', 'nakatakanahalfwidth', 'mikatakanahalfwidth', 'kikatakanahalfwidth', 'tikatakanahalfwidth', 'tusmallkatakanahalfwidth', 'semivoicedmarkkanahalfwidth', 'sikatakanahalfwidth', 'middledotkatakanahalfwidth', 'mokatakanahalfwidth', 'ekatakanahalfwidth', 'hekatakanahalfwidth', 'tekatakanahalfwidth', 'wokatakanahalfwidth', 'makatakanahalfwidth', 'asmallkatakanahalfwidth', 'tokatakanahalfwidth', 'cornerbracketlefthalfwidth', 'mukatakanahalfwidth', 'kukatakanahalfwidth', 'yusmallkatakanahalfwidth', 'yosmallkatakanahalfwidth', 'nokatakanahalfwidth', 'kekatakanahalfwidth', 'takatakanahalfwidth', 'rikatakanahalfwidth', 'ukatakanahalfwidth', 'cornerbracketrighthalfwidth', 'braceleftmid', 'esmallkatakanahalfwidth', 'tukatakanahalfwidth', 'rukatakanahalfwidth', 'nukatakanahalfwidth', 'bracelefttp', 'voicedmarkkanahalfwidth', 'rokatakanahalfwidth', 'kokatakanahalfwidth', 'usmallkatakanahalfwidth', 'bracketleftbt', 'hukatakanahalfwidth', 'kakatakanahalfwidth', 'sukatakanahalfwidth', 'hikatakanahalfwidth', 'periodhalfwidth', 'braceleftbt', 'yasmallkatakanahalfwidth', 'nkatakanahalfwidth', 'rekatakanahalfwidth', 
'yakatakanahalfwidth', 'ideographiccommaleft', 'akatakanahalfwidth', 'nikatakanahalfwidth'}, + + "cp1250": {'lacute', 'tcedilla', 'dcaron', 'breve', 'lcaron', 'uhungarumlaut', 'racute', 'tcommaaccent', 'ecaron', 'hungarumlaut', 'uring', 'tcaron', 'ncaron', 'rcaron', 'odblacute', 'udblacute', 'ohungarumlaut'}, + + "cp1251": {'acyrillic', 'efcyrillic', 'afii10055', 'afii10069', 'afii10097', 'afii10037', 'afii10085', 'afii10053', 'kjecyrillic', 'afii10096', 'afii10019', 'afii10086', 'afii10030', 'afii10034', 'iucyrillic', 'elcyrillic', 'ushortcyrillic', 'afii10052', 'afii10036', 'shchacyrillic', 'iishortcyrillic', 'afii10105', 'afii10061', 'afii10026', 'ercyrillic', 'tecyrillic', 'afii10060', 'afii10106', 'afii10028', 'afii10098', 'afii10042', 'yicyrillic', 'afii10193', 'emcyrillic', 'jecyrillic', 'afii10021', 'afii10054', 'afii10101', 'encyrillic', 'afii10022', 'becyrillic', 'njecyrillic', 'softsigncyrillic', 'afii10075', 'afii10100', 'afii10068', 'afii10089', 'afii10065', 'afii61352', 'iicyrillic', 'iecyrillic', 'afii10088', 'tshecyrillic', 'afii10059', 'vecyrillic', 'zhecyrillic', 'afii10102', 'afii10099', 'iocyrillic', 'pecyrillic', 'afii10050', 'afii10048', 'afii10080', 'gecyrillic', 'afii10035', 'djecyrillic', 'ucyrillic', 'afii10066', 'afii10027', 'afii10045', 'afii10082', 'afii10017', 'afii10076', 'afii10067', 'afii10087', 'iacyrillic', 'afii10024', 'dzhecyrillic', 'afii10058', 'afii10029', 'afii10110', 'afii10084', 'afii10095', 'afii10040', 'yericyrillic', 'afii10072', 'afii10109', 'gjecyrillic', 'ljecyrillic', 'afii10074', 'afii10081', 'dzecyrillic', 'afii10093', 'khacyrillic', 'afii10023', 'afii10079', 'afii10031', 'afii10047', 'escyrillic', 'decyrillic', 'afii10062', 'afii10070', 'afii10049', 'tsecyrillic', 'afii10094', 'afii10025', 'afii10041', 'afii10077', 'afii10073', 'afii10038', 'gheupturncyrillic', 'ereversedcyrillic', 'kacyrillic', 'afii10107', 'afii10108', 'ocyrillic', 'afii10145', 'afii10103', 'afii10032', 'shacyrillic', 'checyrillic', 
'afii10090', 'numero', 'zecyrillic', 'afii10104', 'afii10092', 'afii10083', 'afii10056', 'afii10039', 'hardsigncyrillic', 'afii10044', 'afii10078', 'afii10018', 'icyrillic', 'afii10043', 'afii10091', 'afii10046', 'afii10057', 'afii10051', 'afii10071', 'ecyrillic', 'afii10020', 'afii10033'}, + + "cp1252": {'thorn', 'eth'}, + + "cp1253": {'sigmafinal', 'iotatonos', 'pi', 'kappa', 'dieresistonos', 'lambda', 'chi', 'sigma', 'delta', 'psi', 'rho', 'sigma1', 'epsilontonos', 'alphatonos', 'epsilon', 'beta', 'deltagreek', 'zeta', 'iotadieresis', 'upsilontonos', 'afii00208', 'omicrontonos', 'iota', 'alpha', 'omega', 'omicron', 'gamma', 'upsilon', 'omegagreek', 'tonos', 'omegatonos', 'upsilondieresistonos', 'theta', 'nu', 'dialytikatonos', 'phi', 'mu', 'etatonos', 'iotadieresistonos', 'tau', 'upsilondieresis', 'horizontalbar', 'xi', 'eta', 'mugreek'}, + + "cp1254": {'gbreve', 'dotlessi', 'idot', 'idotaccent'}, + + "cp1255": {'daletsegol', 'het', 'vav', 'reshhatafpatahhebrew', 'reshtserehebrew', 'zayinhebrew', 'tsere12', 'hatafsegolhebrew', 'reshholam', 'afii57678', 'tsere', 'qamatsqatanhebrew', 'daletpatah', 'qamatsqatanquarterhebrew', 'daletholamhebrew', 'finaltsadi', 'afii57668', 'pehebrew', 'mem', 'afii57803', 'qubutsquarterhebrew', 'rafe', 'qamats', 'hiriqhebrew', 'qoftsere', 'hatafsegol30', 'zayin', 'hiriqquarterhebrew', 'afii57670', 'afii57671', 'afii57801', 'afii57636', 'hatafqamats34', 'patahquarterhebrew', 'qamatswidehebrew', 'dalethiriq', 'qubuts31', 'afii57672', 'hehebrew', 'qofsegolhebrew', 'afii57680', 'holamquarterhebrew', 'reshqamats', 'afii57717', 'qofpatahhebrew', 'qofhiriq', 'tserehebrew', 'hiriq', 'qubutshebrew', 'kafhebrew', 'nun', 'afii57689', 'shindothebrew', 'afii57793', 'qubuts18', 'pe', 'memhebrew', 'yodhebrew', 'daletpatahhebrew', 'finalmemhebrew', 'finalpehebrew', 'hatafsegolquarterhebrew', 'qamatsqatannarrowhebrew', 'afii57645', 'shevaquarterhebrew', 'dalethebrew', 'patahwidehebrew', 'dalethatafsegol', 'lamedholamdageshhebrew', 'afii57664', 
'shevahebrew', 'afii57842', 'finalkafsheva', 'tsere1e', 'hiriqnarrowhebrew', 'sheva22', 'tethebrew', 'hatafqamats28', 'hatafqamats', 'segolnarrowhebrew', 'afii57804', 'gershayimhebrew', 'tav', 'nunhebrew', 'holamhebrew', 'afii57716', 'patahnarrowhebrew', 'he', 'rafehebrew', 'qofsegol', 'afii57687', 'hatafqamatsnarrowhebrew', 'qofqamatshebrew', 'dalet', 'qubutswidehebrew', 'vavhebrew', 'qofshevahebrew', 'gimelhebrew', 'dalethiriqhebrew', 'patah', 'sheqelhebrew', 'qofholam', 'ayinhebrew', 'segol13', 'reshhebrew', 'finalnun', 'newsheqelsign', 'hatafsegol', 'hatafsegol24', 'sofpasuqhebrew', 'qamats1c', 'dalethatafpatah', 'reshshevahebrew', 'sheva2e', 'reshhatafsegolhebrew', 'afii57839', 'afii57681', 'hatafqamatsquarterhebrew', 'afii57667', 'lamedhebrew', 'qofpatah', 'tserewidehebrew', 'finalnunhebrew', 'sheva', 'daletqubutshebrew', 'finaltsadihebrew', 'qamatsde', 'tsadihebrew', 'finalkafshevahebrew', 'hatafqamatswidehebrew', 'reshhatafpatah', 'afii57794', 'reshqubuts', 'samekh', 'sheqel', 'finalpe', 'shevawidehebrew', 'segol', 'reshhiriq', + 'holamwidehebrew', 'qamatsquarterhebrew', 'lamedholamhebrew', 'afii57841', 'vavyodhebrew', 'hatafqamats1b', 'hatafsegolnarrowhebrew', 'afii57795', 'hatafpatahhebrew', 'qofhatafpatah', 'finalkafqamats', 'shinhebrew', 'afii57684', 'reshqamatshebrew', 'hatafqamatshebrew', 'afii57806', 'shevanarrowhebrew', 'sindothebrew', 'patah2a', 'segolhebrew', 'afii57798', 'qofhatafpatahhebrew', 'afii57800', 'qamatsqatanwidehebrew', 'hiriq14', 'qofqubuts', 'hatafpatah', 'hatafpatahnarrowhebrew', 'reshholamhebrew', 'afii57675', 'samekhhebrew', 'shin', 'tavhebrew', 'holam', 'finalkafhebrew', 'finalkafqamatshebrew', 'afii57679', 'tserequarterhebrew', 'holamnarrowhebrew', 'hatafsegolwidehebrew', 'qamatsnarrowhebrew', 'segolquarterhebrew', 'hatafpatahwidehebrew', 'tet', 'hiriq21', 'qamats27', 'afii57674', 'dalethatafpatahhebrew', 'bet', 'bethebrew', 'afii57685', 'yod', 'lamedholamdagesh', 'gereshhebrew', 'alef', 'daletqubuts', 'segol2c', 
'qoftserehebrew', 'afii57677', 'finalkaf', 'daletqamatshebrew', 'ayin', 'hatafpatah16', 'paseqhebrew', 'qubuts25', 'tsere2b', 'afii57802', 'afii57669', 'dalethatafsegolhebrew', 'qofhatafsegolhebrew', 'daletqamats', 'qofholamhebrew', 'qamats10', 'afii57718', 'yodyodhebrew', 'afii57807', 'afii57799', 'qofhiriqhebrew', 'qofqubutshebrew', 'tsadi', 'qubutsnarrowhebrew', 'maqafhebrew', 'reshsegolhebrew', 'holam26', 'sheva15', 'lamedholam', 'vavvavhebrew', 'reshqubutshebrew', 'patah11', 'patah1d', 'kaf', 'daletshevahebrew', 'qamats1a', 'sheva115', 'dalettserehebrew', 'qofsheva', 'hethebrew', 'segol1f', 'hiriq2d', + 'afii57673', 'afii57797', 'dalettsere', 'qofhebrew', 'hiriqwidehebrew', 'reshhiriqhebrew', 'tserenarrowhebrew', 'patahhebrew', 'reshhatafsegol', 'daletholam', 'reshpatah', 'qubuts', 'afii57665', 'hatafpatah23', 'afii57658', 'gimel', 'daletsheva', 'hatafpatahquarterhebrew', 'alefhebrew', 'siluqlefthebrew', 'reshsegol', 'qofqamats', 'hatafsegol17', 'afii57676', 'afii57688', 'lamed', 'qof', 'reshpatahhebrew', 'afii57683', 'segolwidehebrew', 'finalmem', 'qofhatafsegol', 'dageshhebrew', 'afii57686', 'qamatshebrew', 'qamats33', 'qamats29', 'afii57682', 'daletsegolhebrew', 'reshtsere', 'holam32', 'afii57690', 'afii57666', 'dagesh', 'holam19', 'siluqhebrew', 'afii57796', 'resh', 'reshsheva', 'hatafpatah2f'}, + + "cp1256": {'afii57449', 'afii57440', 'afii57450', 'beharabic', 'afii57419', 'sadarabic', 'meemarabic', 'afii57508', 'afii57512', 'semicolonarabic', 'afii57423', 'dadarabic', 'taharabic', 'afii57442', 'afii57513', 'afii57448', 'gafarabic', 'ddalarabic', 'afii57446', 'afii57441', 'afii57430', 'lamarabic', 'afii57470', 'afii57421', 'dammalowarabic', 'alefmaksuraarabic', 'hamzadammaarabic', + 'afii57426', 'noonarabic', 'dammatanarabic', 'zerowidthnonjoiner', 'tehmarbutaarabic', 'qafarabic', 'hamzaarabic', 'afii57454', 'hamzafathatanarabic', 'dalarabic', 'jeemarabic', 'afii57506', 'afii57458', 'afii57445', 'rehyehaleflamarabic', 'hamzalowkasraarabic', 'afii57519', 
'afii57412', 'noonghunnaarabic', 'hamzasukunarabic', 'shaddafathatanarabic', 'zainarabic', 'afii57444', 'alefhamzabelowarabic', 'feharabic', 'fathaarabic', 'afii61664', 'afii57415', 'afii57403', 'kashidaautoarabic', 'afii57422', 'wawarabic', 'afii57409', 'sukunarabic', 'kafarabic', 'tcheharabic', 'afii57453', 'afii57433', 'yeharabic', 'jeharabic', 'hehaltonearabic', 'afii57411', 'alefmaddaabovearabic', 'afii57432', 'alefhamzaabovearabic', 'afii57511', 'afii57414', 'hamzadammatanarabic', 'shaddaarabic', 'khaharabic', 'rreharabic', 'kashidaautonosidebearingarabic', 'kasratanarabic', 'teharabic', 'peharabic', 'afii57429', 'afii57452', 'hamzalowkasratanarabic', 'haaltonearabic', 'heharabic', 'fathatanarabic', 'questionarabic', 'kasraarabic', 'afii57420', 'afii57418', 'tatweelarabic', 'fathalowarabic', 'afii57451', 'afii57507', 'afii57455', 'wawhamzaabovearabic', 'afii57416', 'dammatanaltonearabic', 'afii57424', 'afii57410', 'afii57388', 'thalarabic', 'afii57443', 'haharabic', 'commaarabic', 'afii57413', 'sheenarabic', 'ainarabic', 'afii57417', 'hamzafathaarabic', 'yehhamzaabovearabic', 'afii57456', 'afii57428', 'afii57425', 'alefarabic', 'zaharabic', 'tteharabic', 'hamzalowarabic', 'ghainarabic', 'afii57514', 'reharabic', 'yehbarreearabic', 'afii57509', 'afii301', 'dammaarabic', 'afii57427', 'afii57457', 'afii57431', 'afii57407', 'seenarabic', 'afii57434', 'theharabic'}, + + "cp1257": {'ncommaaccent', 'rcedilla', 'lcedilla', 'emacron', 'ncedilla', 'iogonek', 'edotaccent', 'kcommaaccent', 'amacron', 'uogonek', 'gcommaaccent', 'kcedilla', 'rcommaaccent', 'umacron', 'gcedilla', 'omacron', 'edot', 'imacron', 'lcommaaccent'}, + + "cp1258": {'tildecmb', 'gravecomb', 'dong', 'acutecmb', 'gravecmb', 'dotbelowcmb', 'uhorn', 'acutecomb', 'hookcmb', 'ohorn', 'hookabovecomb', 'tildecomb', 'dotbelowcomb'} +} \ No newline at end of file diff --git a/proof/[Symbol Font] Find unique char by ansi code page.py b/proof/[Symbol Font] Find unique char by ansi code page.py new file mode 
100644 index 0000000..3e9a44a --- /dev/null +++ b/proof/[Symbol Font] Find unique char by ansi code page.py @@ -0,0 +1,94 @@ +from fontTools.agl import LEGACY_AGL2UV + + +def generate_supported_char_by_code_page(code_pages: list[int]) -> dict[int, set[str]]: + supported_char_by_code_page: dict[int, set[str]] = {} + + for code_page in code_pages: + code_page_encoding_name = f"cp{code_page}" + + for codepoint in range(0, 256): + try: + codepoint_byte = int.to_bytes(codepoint, 1, "big") + char = codepoint_byte.decode(code_page_encoding_name) + except UnicodeDecodeError: + continue + + if code_page not in supported_char_by_code_page: + supported_char_by_code_page[code_page] = set() + + supported_char_by_code_page[code_page].add(char) + + return supported_char_by_code_page + + +def generate_unique_char_by_code_page(supported_char_by_code_page: dict[int, set[str]]) -> dict[int, set[str]]: + unique_char_by_code_page: dict[int, set[str]] = {} + + for codepoint, char_set in supported_char_by_code_page.items(): + unique_char_by_code_page[codepoint] = set(char_set) + + for other_codepoint, other_char_set in supported_char_by_code_page.items(): + if other_codepoint == codepoint: + continue + + unique_char_by_code_page[codepoint] -= other_char_set + + return unique_char_by_code_page + + +def generate_unique_adobe_glyph_name_by_code_page(unique_char_by_code_page: dict[int, set[str]]) -> dict[int, set[str]]: + unique_adobe_glyph_name_by_code_page: dict[int, set[str]] = {} + + for codepoint, char_set in unique_char_by_code_page.items(): + unique_adobe_glyph_name_by_code_page[codepoint] = set() + + for char in char_set: + found = False + for legacy_adobe_glyph_name, adobe_codepoints in LEGACY_AGL2UV.items(): + for adobe_codepoint in adobe_codepoints: + if adobe_codepoint == ord(char): + unique_adobe_glyph_name_by_code_page[codepoint].add(legacy_adobe_glyph_name.lower()) + found = True + + if not found: + print(char) + print(hex(ord(char))) + + return 
unique_adobe_glyph_name_by_code_page + + +def main(): + code_pages = [ + 874, + 932, + 936, + 949, + 950, + 1250, + 1251, + 1252, + 1253, + 1254, + 1255, + 1256, + 1257, + 1258, + ] + + + supported_char_by_code_page = generate_supported_char_by_code_page(code_pages) + unique_char_by_code_page = generate_unique_char_by_code_page(supported_char_by_code_page) + unique_adobe_glyph_name_by_code_page = generate_unique_adobe_glyph_name_by_code_page(unique_char_by_code_page) + + #for codepoint, char_set in unique_char_by_code_page.items(): + # print(f"{codepoint} {char_set}") + + + #for codepoint, adobe_glyph_name in unique_adobe_glyph_name_by_code_page.items(): + # print(f"{codepoint}: {adobe_glyph_name},") + # print() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/fonts/font_cmap_encoding_0.ttf b/tests/fonts/font_cmap_encoding_0.ttf new file mode 100644 index 0000000..99f2e77 Binary files /dev/null and b/tests/fonts/font_cmap_encoding_0.ttf differ diff --git a/tests/fonts/font_cmap_encoding_1.ttf b/tests/fonts/font_cmap_encoding_1.ttf new file mode 100644 index 0000000..3f13bc4 Binary files /dev/null and b/tests/fonts/font_cmap_encoding_1.ttf differ diff --git a/tests/test_font.py b/tests/test_font.py index 27138e0..60c95dd 100644 --- a/tests/test_font.py +++ b/tests/test_font.py @@ -53,6 +53,37 @@ def test_font_without_axis_value(): assert fonts == expected_fonts +def test_font_get_missing_glyphs_cmap_encoding_0(): + + font_cmap_encoding_0 = os.path.join(dir_path, "fonts", "font_cmap_encoding_0.ttf") + + font = Font.from_font_path(font_cmap_encoding_0) + assert len(font) == 1 + font = font[0] + + # Verify that the optional param defaults to the right value + missing_glyphs = font.get_missing_glyphs("Έκθεση για Απασχόληση Dream Top Co. Οι επιλογές À a") + assert missing_glyphs == set("À") + + missing_glyphs = font.get_missing_glyphs("Έκθεση για Απασχόληση Dream Top Co. 
Οι επιλογές À a", False) + assert missing_glyphs == set("À") + + missing_glyphs = font.get_missing_glyphs("Έκθεση για Απασχόληση Dream Top Co. Οι επιλογές À a", True) + assert missing_glyphs == set("ΈκθεσηγιαπασχόλησηΟιεπιλογέςÀΑ") + + +def test_font_get_missing_glyphs_cmap_encoding_1(): + + font_cmap_encoding_1 = os.path.join(dir_path, "fonts", "font_cmap_encoding_1.ttf") + + font = Font.from_font_path(font_cmap_encoding_1) + assert len(font) == 1 + font = font[0] + + missing_glyphs = font.get_missing_glyphs(string.digits + "🇦🤍") + assert missing_glyphs == set() + + def test_font_get_missing_glyphs_cmap_encoding_2(): font_cmap_encoding_2 = os.path.join(dir_path, "fonts", "font_cmap_encoding_2.TTF") @@ -61,6 +92,7 @@ def test_font_get_missing_glyphs_cmap_encoding_2(): assert len(font) == 1 font = font[0] + # Try "é" since cp932 doesn't support this char missing_glyphs = font.get_missing_glyphs( string.ascii_letters + string.digits + "éヲ&*" ) @@ -74,9 +106,9 @@ def test_font_get_missing_glyphs_cmap_encoding_mac_platform(): font = font[0] missing_glyphs = font.get_missing_glyphs( - string.ascii_letters + string.digits + "@é¸" + string.ascii_letters + string.digits + "@é¸^Æ~" ) - assert missing_glyphs == set(["@", "¸"]) + assert missing_glyphs == set(["@", "¸", "~"]) def test_variable_font_with_invalid_fvar_axes():