forked from sdtblck/PDFextract
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_unicode.py
110 lines (93 loc) · 3.17 KB
/
fix_unicode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
import re
import unicodedata
"""
from: https://github.com/mattbierbaum/arxiv-public-datasets/blob/f0b8a4fd17e7aeed38465ec00a63eb219fe1672e/arxiv_public_data/fixunicode.py#L92
List of ligatures: https://en.wikipedia.org/wiki/Typographic_ligature
MKB removed the following elements from the list:
- et 🙰 U+1F670 🙰
- ſs, ſz ẞ, ß U+00DF ß
Additional notes:
* Some classes of characters were listed in the original utf8 fixes, but I'm
not sure they belong here rather than in end-user processing. In these cases,
a pass through unidecode should normalize them to proper ASCII. They are
listed here with reasoning:
- Ditch combining diacritics http://unicode.org/charts/PDF/U0300.pdf
r'[\u0300-\u036F]': ''
- Ditch chars that sometimes (incorrectly?) appear as combining diacritics
r'(?:\xa8|[\u02C0-\u02DF])': ''
* Should we run ftfy?
"""
ligature_table = """
AA, aa Ꜳ, ꜳ U+A732, U+A733 Ꜳ ꜳ
AE, ae Æ, æ U+00C6, U+00E6 Æ æ
AO, ao Ꜵ, ꜵ U+A734, U+A735 Ꜵ ꜵ
AU, au Ꜷ, ꜷ U+A736, U+A737 Ꜷ ꜷ
AV, av Ꜹ, ꜹ U+A738, U+A739 Ꜹ ꜹ
AV, av Ꜻ, ꜻ U+A73A, U+A73B Ꜻ ꜻ
AY, ay Ꜽ, ꜽ U+A73C, U+A73D Ꜽ ꜽ
ff ff U+FB00 ff
ffi ffi U+FB03 ffi
ffl ffl U+FB04 ffl
fi fi U+FB01 fi
fl fl U+FB02 fl
OE, oe Œ, œ U+0152, U+0153 Œ œ
OO, oo Ꝏ, ꝏ U+A74E, U+A74F Ꝏ ꝏ
st st U+FB06 st
ſt ſt U+FB05 ſt
TZ, tz Ꜩ, ꜩ U+A728, U+A729 Ꜩ ꜩ
ue ᵫ U+1D6B ᵫ
VY, vy Ꝡ, ꝡ U+A760, U+A761 Ꝡ ꝡ
db ȸ U+0238 ȸ
dz ʣ U+02A3 ʣ
dʑ ʥ U+02A5 ʥ
dʒ ʤ U+02A4 ʤ
fŋ ʩ U+02A9 ʩ
IJ, ij IJ, ij U+0132, U+0133 IJ ij
ls ʪ U+02AA ʪ
lz ʫ U+02AB ʫ
lʒ ɮ U+026E ɮ
qp ȹ U+0239 ȹ
tɕ ʨ U+02A8 ʨ
ts ʦ U+02A6 ʦ
tʃ ʧ U+02A7 ʧ
ui ꭐ U+AB50 ꭐ
ui ꭑ U+AB51 ꭐ
"""
unicode_mapping = {}
for row in ligature_table.split('\n'):
if row.count('\t') <= 1:
continue
unicode_mapping.update(
{
u.strip(): unicodedata.normalize('NFKC', a.strip())
for a, u in zip(*[c.split(',') for c in row.split('\t')[:2]])
}
)
unicode_mapping.update({
# 'ẞ, ß': careful, some use this for \beta
r'(\B)\u00DF': r'\1ss',
# Additions (manual normalization that we feel is important)
# unicode space u'\xa0' (not \x{0c} = ^L keep!)
'\xa0': ' ',
# single + double quotes, dash, and asterisk
r'[\u2018\u2019]': r"'",
r'[\u201C\u201D]': r'"',
r'[\xad\u2014]': r'-',
r'\xb7': r'*'
})
def fix_unicode(txt: str) -> str:
    """
    Replace typographical ligatures and a few special characters in a string,
    then normalize the result.

    Every pattern in ``unicode_mapping`` is applied to the text as a regular
    expression substitution, in the mapping's iteration order, and the outcome
    is finally passed through Unicode NFKC normalization.

    Parameters
    ----------
        txt : unicode string

    Returns
    -------
        output : unicode string
    """
    cleaned = txt
    for pattern in unicode_mapping:
        # Each substitution works on the output of the previous one.
        cleaned = re.sub(pattern, unicode_mapping[pattern], cleaned)
    return unicodedata.normalize('NFKC', cleaned)