From 2e411a4bcb3f27902ae42d6596de128dd42c1af4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=A2=E7=B4=A0=E5=8F=A2=E9=9B=86=E3=83=BB=E3=81=90?= =?UTF-8?q?=E3=82=89=E3=81=B5=E3=81=83=E3=83=BC=E3=82=80?= Date: Fri, 10 Mar 2023 12:15:51 +0800 Subject: [PATCH] Handle Punctuation in `get_*_text` (#3) * Handle Punctuation in `get_*_text` * Handle Negative Numbers and Optimize --- README.md | 34 +++++--- src/ToJyutping/ToJyutping.py | 15 +--- src/ToJyutping/utils.py | 163 +++++++++++++++++++++++++++++++++++ src/ToJyutping/version.py | 2 +- 4 files changed, 190 insertions(+), 24 deletions(-) create mode 100644 src/ToJyutping/utils.py diff --git a/README.md b/README.md index 98696f3..fbbc447 100644 --- a/README.md +++ b/README.md @@ -10,18 +10,27 @@ Usage: ```python >>> import ToJyutping ->>> ToJyutping.get_jyutping_list('一瓩係乜嘢嚟㗎?') -[('一', 'jat1'), ('瓩', 'cin1 ngaa5'), ('係', 'hai6'), ('乜', 'mat1'), ('嘢', 'je5'), ('嚟', 'lai4'), ('㗎', 'gaa3'), ('?', None)] ->>> ToJyutping.get_jyutping('一瓩係乜嘢嚟㗎?') -'一(jat1)瓩(cin1 ngaa5)係(hai6)乜(mat1)嘢(je5)嚟(lai4)㗎(gaa3)?' ->>> ToJyutping.get_jyutping_text('一瓩係乜嘢嚟㗎?') -'jat1 cin1 ngaa5 hai6 mat1 je5 lai4 gaa3' ->>> ToJyutping.get_ipa_list('一瓩係乜嘢嚟㗎?') -[('一', 'jɐt̚˥'), ('瓩', 't͡sʰiːn˥.ŋaː˩˧'), ('係', 'hɐi̯˨'), ('乜', 'mɐt̚˥'), ('嘢', 'jɛː˩˧'), ('嚟', 'lɐi̯˨˩'), ('㗎', 'kaː˧'), ('?', None)] ->>> ToJyutping.get_ipa('一瓩係乜嘢嚟㗎?') -'一[jɐt̚˥]瓩[t͡sʰiːn˥.ŋaː˩˧]係[hɐi̯˨]乜[mɐt̚˥]嘢[jɛː˩˧]嚟[lɐi̯˨˩]㗎[kaː˧]?' 
->>> ToJyutping.get_ipa_text('一瓩係乜嘢嚟㗎?') -'jɐt̚˥.t͡sʰiːn˥.ŋaː˩˧.hɐi̯˨.mɐt̚˥.jɛː˩˧.lɐi̯˨˩.kaː˧' +>>> ToJyutping.get_jyutping_list('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。') +[('咁', 'gam3'), ('啱', 'ngaam1'), ('老', 'lou5'), ('世', 'sai3'), ('要', 'jiu1'), ('求', 'kau4'), ('佢', 'keoi5'), ('等', 'dang2'), ('陣', 'zan6'), ('要', 'jiu3'), ('開', 'hoi1'), ('會', 'wui2'), (',', None), ('剩', 'zing6'), ('低', 'dai1'), ('嘅', 'ge2'), ('嘢', 'je5'), ('我', 'ngo5'), ('會', 'wui5'), ('搞', 'gaau2'), ('掂', 'dim6'), ('㗎', 'ga3'), ('喇', 'laa3'), ('。', None)] +>>> ToJyutping.get_jyutping('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。') +'咁(gam3)啱(ngaam1)老(lou5)世(sai3)要(jiu1)求(kau4)佢(keoi5)等(dang2)陣(zan6)要(jiu3)開(hoi1)會(wui2),剩(zing6)低(dai1)嘅(ge2)嘢(je5)我(ngo5)會(wui5)搞(gaau2)掂(dim6)㗎(ga3)喇(laa3)。' +>>> ToJyutping.get_jyutping_text('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。') +'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.' +>>> ToJyutping.get_ipa_list('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。') +[('咁', 'kɐm˧'), ('啱', 'ŋaːm˥'), ('老', 'lou̯˩˧'), ('世', 'sɐi̯˧'), ('要', 'jiːu̯˥'), ('求', 'kʰɐu̯˨˩'), ('佢', 'kʰɵy̑˩˧'), ('等', 'tɐŋ˧˥'), ('陣', 't͡sɐn˨'), ('要', 'jiːu̯˧'), ('開', 'hɔːi̯˥'), ('會', 'wuːi̯˧˥'), (',', None), ('剩', 't͡seŋ˨'), ('低', 'tɐi̯˥'), ('嘅', 'kɛː˧˥'), ('嘢', 'jɛː˩˧'), ('我', 'ŋɔː˩˧'), ('會', 'wuːi̯˩˧'), ('搞', 'kaːu̯˧˥'), ('掂', 'tiːm˨'), ('㗎', 'kɐ˧'), ('喇', 'laː˧'), ('。', None)] +>>> ToJyutping.get_ipa('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。') +'咁[kɐm˧]啱[ŋaːm˥]老[lou̯˩˧]世[sɐi̯˧]要[jiːu̯˥]求[kʰɐu̯˨˩]佢[kʰɵy̑˩˧]等[tɐŋ˧˥]陣[t͡sɐn˨]要[jiːu̯˧]開[hɔːi̯˥]會[wuːi̯˧˥],剩[t͡seŋ˨]低[tɐi̯˥]嘅[kɛː˧˥]嘢[jɛː˩˧]我[ŋɔː˩˧]會[wuːi̯˩˧]搞[kaːu̯˧˥]掂[tiːm˨]㗎[kɐ˧]喇[laː˧]。' +>>> ToJyutping.get_ipa_text('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。') +'kɐm˧.ŋaːm˥.lou̯˩˧.sɐi̯˧.jiːu̯˥.kʰɐu̯˨˩.kʰɵy̑˩˧.tɐŋ˧˥.t͡sɐn˨.jiːu̯˧.hɔːi̯˥.wuːi̯˧˥ | t͡seŋ˨.tɐi̯˥.kɛː˧˥.jɛː˩˧.ŋɔː˩˧.wuːi̯˩˧.kaːu̯˧˥.tiːm˨.kɐ˧.laː˧' +``` + +In rare cases, the pronunciation of a single character can contain more than one syllable: + +```python +>>> ToJyutping.get_jyutping_list('一瓩') +[('一', 'jat1'), ('瓩', 'cin1 
ngaa5')] +>>> ToJyutping.get_ipa_list('一瓩') +[('一', 'jɐt̚˥'), ('瓩', 't͡sʰiːn˥.ŋaː˩˧')] ``` Helper: @@ -34,3 +43,4 @@ Helper: ``` Note that autocorrection is intentionally not included in this helper, and an error is thrown if strings like `jyt6` are passed into the function. +Punctuation is ignored in the helper. diff --git a/src/ToJyutping/ToJyutping.py b/src/ToJyutping/ToJyutping.py index dd769e5..858c353 100644 --- a/src/ToJyutping/ToJyutping.py +++ b/src/ToJyutping/ToJyutping.py @@ -1,11 +1,12 @@ from os import path import pygtrie import re +from . import utils here = path.abspath(path.dirname(__file__)) t = pygtrie.CharTrie() -with open(path.join(here, 'jyut6ping3.simple.dict.yaml')) as f: +with open(path.join(here, 'jyut6ping3.simple.dict.yaml'), encoding='utf-8') as f: for line in f: k, v = line.rstrip().split('\t') t[k] = v @@ -30,11 +31,7 @@ def get_jyutping(s): return l def get_jyutping_text(s): - l = [] - for k, v in get_jyutping_list(s): - if v: - l += [v] - return ' '.join(l) + return utils.format_romanization_text(s, get_jyutping_list) def get_ipa_list(s): l = [] @@ -49,11 +46,7 @@ def get_ipa(s): return l def get_ipa_text(s): - l = [] - for k, v in get_jyutping_list(s): - if v: - l += [jyutping2ipa(v)] - return '.'.join(l) + return utils.format_ipa_text(s, get_ipa_list) initial = { 'b': 'p', 'p': 'pʰ', 'm': 'm', 'f': 'f', 'd': 't', 't': 'tʰ', 'n': 'n', 'l': 'l', 'g': 'k', 'k': 'kʰ', 'ng': 'ŋ', 'gw': 'kʷ', 'kw': 'kʷʰ', 'w': 'w', 'h': 'h', 'z': 't͡s', 'c': 't͡sʰ', 's': 's', 'j': 'j' } diff --git a/src/ToJyutping/utils.py b/src/ToJyutping/utils.py new file mode 100644 index 0000000..a1bbc31 --- /dev/null +++ b/src/ToJyutping/utils.py @@ -0,0 +1,163 @@ +import re + +punct_dict = dict( + zip( + '''!"'(),-./:;?[]{}~·‐‑‒–—―‘’“”…⋮⋯⸱⸳⸺⸻、。〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟・︐︑︒︓︔︕︖︗︘︙︱︲︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹇﹈﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹣!"'(),-./:;?[]{}~⦅⦆。「」、・''', 
'''!"'(),-./:;?[]{}~·------‘’“”………··--,.‘’“”“”‘’[][][][][]~“””·,,.:;!?[]…--(){}[][]“”‘’“”‘’[],,.;:?!-(){}[]-!"'(),-./:;?[]{}~().“”,·''' + ) +) + +left_bracket = '([{‘“' +right_bracket = ')]}’”' +left_bracket_to_right = dict(zip(left_bracket, right_bracket)) +left_bracket = {*left_bracket} +right_bracket = {*right_bracket} +left_punct = {*left_bracket} +right_punct = {*'!,.:;?…', *right_bracket} +other_punct = {*'''"'·-~'''} +left_or_other_punct = {' ', *left_punct, *other_punct} +right_or_other_punct = {*right_punct, *other_punct} + +minus_signs = {*'-﹣-'} # U+2212 is unnecessary +decimal_seps = {*'''',.·⸱⸳﹒'.'''} +digits = {*'00𝟎𝟘𝟢𝟬𝟶🯰11𝟏𝟙𝟣𝟭𝟷🯱22𝟐𝟚𝟤𝟮𝟸🯲33𝟑𝟛𝟥𝟯𝟹🯳44𝟒𝟜𝟦𝟰𝟺🯴55𝟓𝟝𝟧𝟱𝟻🯵66𝟔𝟞𝟨𝟲𝟼🯶77𝟕𝟟𝟩𝟳𝟽🯷88𝟖𝟠𝟪𝟴𝟾🯸99𝟗𝟡𝟫𝟵𝟿🯹'} +unknown_or_hyphen = {'', '-'} + +def format_romanization_text(s, conv): + def inner(m): + t = [None] + d = [None] + for k, v in conv(m[0]): + if v: + t += [v] + d += [None] + elif not k.isspace(): + t += [punct_dict.get(k, '')] + d += [k] + t += [None] + d += [None] + l = '' + b = '' + for i, (p, c, n) in enumerate(zip(t, t[1:], t[2:]), 1): + def between(): + nonlocal t, i + j = i - 1 + while j and t[j] in right_bracket: + j -= 1 + f = j and t[j] and len(t[j]) > 1 + j = i + 1 + while j < len(t) - 1 and t[j] in left_bracket: + j += 1 + g = j and t[j] and len(t[j]) > 1 + return f and g + + def lspace(): + nonlocal l + if l and l[-1] not in left_or_other_punct: + l += ' ' + + def rspace(): + nonlocal n, l + if i < len(d) - 2 and d[i + 2] in digits if d[i + 1] in minus_signs else n not in right_or_other_punct: + l += ' ' + + if len(c) > 1: + lspace() + l += c + rspace() + elif not c or d[i] in minus_signs and d[i + 1] in digits and p not in unknown_or_hyphen: + if not l.endswith('[…]'): + l += '[…]' + elif d[i] in decimal_seps and d[i + 1] in digits and d[i - 1] in digits: + continue + elif c in left_punct: + lspace() + l += c + b += left_bracket_to_right[c] + elif c in right_punct: + l += c + rspace() + try: + b = b[:b.rindex(c)] + except ValueError: + pass + 
elif c == '-': + if p == '-': + continue + if n == '-' or between(): + l += ' – ' + else: + l += c + elif c == '~': + if p == '~' and n != '~' or between(): + l += '~ ' + else: + l += c + elif c == '·': + l += c + else: + j = len(b) - 1 + y = False + while j >= 0 and b[j] not in right_bracket: + if b[j] == c: + y = True + break + j -= 1 + if y: + b = b[:j] + l += c + rspace() + else: + lspace() + l += c + b += c + return ' '.join(l.split()) + + return re.sub(r'[^\0-\x1f\x80-\x9f]+', inner, s) + +major_break = {*'.!?…'} +minor_break = {*',/:;-~()[]{}'} + +def format_ipa_text(s, conv): + def inner(m): + t = [] + d = [] + for k, v in conv(m[0]): + if v: + t += [v] + d += [None] + elif not k.isspace(): + t += [punct_dict.get(k, '')] + d += [k] + d += [None] + l = [] + for i, c in enumerate(t): + if len(c) > 1: + l += [c] + elif not c or d[i] in minus_signs and d[i + 1] in digits and i and t[i - 1] not in unknown_or_hyphen: + if not l or l[-1] != '⸨…⸩': + l += ['⸨…⸩'] + elif l: + if d[i] in decimal_seps and d[i + 1] in digits and i and d[i - 1] in digits: + continue + if c in major_break: + if len(l[-1]) > 1: + l += ['‖'] + else: + l[-1] = '‖' + elif c in minor_break and len(l[-1]) > 1: + l += ['|'] + if l and len(l[-1]) == 1: + l.pop() + s = '' + for i, c in enumerate(l): + s += c + if i < len(l) - 1: + n = l[i + 1] + if c != '⸨…⸩' and len(c) > 1 and n != '⸨…⸩' and len(n) > 1: + s += '.' + else: + s += ' ' + return s + + return re.sub(r'[^\0-\x1f\x80-\x9f]+', inner, s) diff --git a/src/ToJyutping/version.py b/src/ToJyutping/version.py index fc79d63..020ed73 100644 --- a/src/ToJyutping/version.py +++ b/src/ToJyutping/version.py @@ -1 +1 @@ -__version__ = '0.2.1' +__version__ = '0.2.2'