#!/usr/bin/env python3
## coding=UTF-8
import re
import time
import logsetup
log=logsetup.getlog(__name__)
# logsetup.setlevel('INFO',log) #for this file
logsetup.setlevel('DEBUG',log) #for this file
"""This is called from a number of places"""
framerx=re.compile('__') #replace this w/data in frames.
def slashdash(x):
return sub(r'/',r'-',x)
def urlok(x):
x=str(x) # just in case we pass a path object
    # These should each be a tuple of:
    # 1. a string of characters to replace (used as a regex character class),
    # 2. the replacement (empty string to strip them)
d=("̀́̂̌̄̃᷉̋̄̏̌̂᷄᷅̌᷆᷇᷉̈","")
# * . " / \ [ ] : ; | , # illegal in MS Windows
p=(r"][\. /?*\\:;\|,\"><'",'_')
l=("əéèêɛ",'e')
o=("ô",'o')
    for i in [d,p,l,o]: # 'o' was defined above but was missing from this list
x=re.sub('['+i[0]+']',i[1],x)
return x
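# Usage sketch (added for illustration, not in the original module): assuming the
# character classes above cover the input, a path-like name is reduced to a
# filesystem-safe form, e.g. urlok('my file/name?.txt') should return
# 'my_file_name__txt'.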
def splitxpath(x):
"""Confirm that r is correct here"""
tag,*features=split(r'[\]\[]',x)
attrib={}
for f in [i for i in features if i]:
# print('-'+f)
key,val=split('=',f)
key=split('^@',key)
val=val.strip('"\'')
# print(key,val)
attrib[key[1]]=val
return tag,attrib
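# Usage sketch (added for illustration, not in the original module); this mirrors
# the demo in the __main__ block below:
#   splitxpath('field[@type="Imperative"]')
# should return ('field', {'type': 'Imperative'}).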
def escapeattr(x):
x=str(x)
if "'" in x:
return '\"'+x+'\"'
else:
return "'"+x+"'" #b+="[@{}=\"{}\"]".format(attr,self.kwargs[attrs[attr]])
# This doesn't seem to be able to work:
# def removepicreldir(x):
# dirs=[
# r'pictures/',
# r'pictures\\'
# ]
# for d in dirs:
# if x.startswith(d):
# x=x.split(d)[-1]
# return x
def split(delre,string):
    return re.split(delre,string)
def countxiny(x,y):
    return re.subn(x, x, y)[1] #count occurrences of pattern x in string y
def linebreakwords(x):
return re.sub(' ','\n',x)
def pymoduleable(x):
    """Make a string usable as a python module name (dots become underscores)."""
    return re.sub(r'\.','_', str(x))
def delinebreak(x):
return re.sub('\n','',x)
def stripquotes(x):
    try:
        return x.strip('‘’')
    except AttributeError: #not a string (e.g., None)
        return x
"""passthrough fns"""
IGNORECASE=re.IGNORECASE
def sub(*args,**kwargs):
# pattern, repl, string, count=0, flags=0
# log.info("Running re.sub with args: {} and kwargs: {}".format(args,kwargs))
return re.sub(*args,**kwargs)
def compile(x):
return re.compile(x, re.UNICODE)
def id(x):
x=x.replace('˥','4').replace('˦','3').replace('˧','2'
).replace('˨','1').replace('˩','0')
"""Confirm that r is correct here"""
return re.sub(r'[][ .!=\(\),\'/?ꞌ\n:;+*]','_',x) #remove charcters that are invalid for ids
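# Usage sketch (added for illustration, not in the original module): tone letters
# become digits and id-invalid characters become underscores, e.g.
# id('ba˥ la!') should return 'ba4_la_'.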
def tonerxs():
return (re.compile('[˥˦˧˨˩]+', re.UNICODE),
re.compile(' ', re.UNICODE),
re.compile(' ', re.UNICODE))
def update(t,regexdict,check,value,matches=None):
    if matches is None: #avoid a shared mutable default argument
        matches=[]
    tori=t
for c in reversed(check.split('=')):
log.info("subbing {} for {} in {}, using {}".format(value,c,t,
regexdict[c]))
# log.info("found {}".format(regexdict[c].search(t)))
match=regexdict[c].search(t)
if match:
matches.append(match.groups()[-1])
t=match.expand('\\g<1>'+value)+t[match.end():]
log.info("updated {} > {}".format(tori,t))
for match in matches:
if len(match)>1:
log.info(_("NOTICE: we just matched (to remove) a set of "
"symbols representing one sound ({}). Until you are done "
"with it, we will leave it there, so both forms will be "
"found. Once you are done with it, remove it from the "
"polygraph settings.").format(match))
return t
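# Usage sketch (added for illustration, not in the original module), following the
# demo in the __main__ block below: with check='V1=V2=V3', value='a', and the
# V1/V2/V3 regexes built there, update('bobongo',regexdict,check,value) should
# rewrite each matched vowel to 'a', giving 'babanga'.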
def texmllike(x):
"""This attempts to implement TeXMLLikeCharacterConversion.java from
XLingPaper"""
repls={
'\\': "\\textbackslash{}",
'{': "\\{",
'}': "\\}",
'$': "\\textdollar{}",
'[': "{[}", #"\\textsquarebracketleft{}",
']': "{]}", #"\\textsquarebracketright{}",
'<': "\\textless{}",
'≤': "\\textless{}=",
'>': "\\textgreater{}",
'≥': "\\textgreater{}=",
'&': "\\&",
'#': "\\#",
'^': "\\^{}",
'_': "\\_",
'~': "\\textasciitilde{}",
'%': "\\%",
'|': "\\textbar{}",
'.\u200b ': ".\\ ",
'\u200c ': ".\\ \\ ",
'\u200d ': ".~"
}
for y in repls:
print("Replacing",y,"with",repls[y])
x=x.replace(y,repls[y])
"""Confirm that r is correct here"""
x=re.sub(r'\\\\textless{}(([\?!/]|tex:)[^\\\\]*)\\\\textgreater{}',"<\\1>",x)
return x
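# Usage sketch (added for illustration, not in the original module): TeX-special
# characters are escaped, e.g. texmllike('50% of x_1 & y') should return
# '50\% of x\_1 \& y' (each special character prefixed with a backslash).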
def noparenscontent(x): #remove parenthesized content, parentheses included
    if isinstance(x,str):
        # (the r prefix here is correct)
        return re.sub(r'\(.*\)','',x)
def noparens(x): #remove just the parenthesis characters, keeping their content
    if isinstance(x,str):
        # (the r prefix here is correct)
        return re.sub(r'\(|\)','',x)
def glossdeftoform(x):
# i=x
x=noparenscontent(x)
if isinstance(x,str):
# x=re.sub('\(.*\)','',x)
x=re.sub(',.*','',x) #stop at any comma
x=re.sub('^ *','',x) #no leading spaces
x=re.sub(' .*','',x) #just use the first word
# x=re.sub('^(([^() ]+)( [^() ]+){,2})(.*)*$','\\1',x) #up to three words, no parens
# x=re.sub(',$','',x)
# x=re.sub(', ',',',x)
# x=re.sub(' ','.',x)
# log.info("glossdeftoform: {} > {}".format(i,x))
return x
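# Usage sketch (added for illustration, not in the original module), matching the
# demo in the __main__ block below: glossdeftoform('be quench, extinguish')
# should return 'be' (parenthesized content, everything after the first comma,
# and everything after the first word are dropped).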
def glossifydefn(x):
if isinstance(x,str):
x=re.sub('^(([^() ]+)( [^() ]+){,2})(.*)*$','\\1',x) #up to three words, no parens
x=re.sub(',$','',x)
x=re.sub(', ',',',x)
x=re.sub(' ','.',x)
return x
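# Usage sketch (added for illustration, not in the original module): up to three
# words are kept, ', ' is tightened to ',' and spaces become dots, so
# glossifydefn('be quench, extinguish') should return 'be.quench,extinguish'.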
def makeprecomposed(x):
if x is None:
return
subs={'á':'á',
'à':'à',
'é':'é',
'è':'è',
'ê':'ê',
'í':'í',
'ì':'ì',
'ó':'ó',
'ò':'ò',
'ú':'ú',
'ù':'ù',
}
for s in subs:
x=re.sub(s,subs[s],x)
return x
def fixunicodeerrorsWindows(x):
errordict={
'É”': 'ɔ',
'É›': 'ɛ',
'ɲ': 'ɲ',
'Å‹': 'ŋ',
'î': 'î',
'ô': 'ô',
'ï': 'ï',
'û': 'û',
'Ã ': 'à',
'‘': '',
'’': '',
'Å“': 'œ',
'â': 'â'
}
for e in errordict:
if e in x:
x=re.sub(e,errordict[e],x)
return x
# ls |grep 'ɔ\|ɛ\|ɲ\|ŋ\|î\|ô\|ï\|û\|à \|‘\|’\|œ\|â'
# mv `ls |grep 'ɔ\|ɛ\|ɲ\|ŋ\|î\|ô\|ï\|û\|à \|‘\|’\|œ\|â'` messedup/
# rename -n 's/É”/ɔ/g;s/É›/ɛ/g;s/ɲ/ɲ/g;s/Å‹/ŋ/g;s/î/î/g;s/ô/ô/g;s/ï/ï/g;s/û/û/g;s/à /à/g;s/‘//g;s/’//g;s/Å“/œ/g;s/â/â/g' *
def stripdiacritics(check,x):
if 'd' in check.rx:
return check.rx['d'].sub('',x)
return x
def segmentin(forms, glyph):
# """This actually allows for dygraphs, etc., so I'm keeping it."""
# for form in forms: # as: self.citationforms[lang] + self.lexemes[lang]
    if re.search(glyph,' '.join([x for x in forms if x is not None])): #see if the glyph is there
# log.info("Found glyph '{}'".format(glyph))
return glyph #find it and stop looking, or return nothing
# log.info("Found not glyph '{}'".format(glyph))
def inxyz(db, lang, segmentlist): #This calls the above script for each character.
start_time=time.time() #this enables boot time evaluation
actuals=list()
forms=db.lcs[lang] + db.lxs[lang]
for i in segmentlist:
s=segmentin(forms,i)
#log.info(s) #to see the following run per segment
if s is not None:
actuals.append(s)
log.log(2,'{} {}'.format(time.time()-start_time, segmentlist)) # with this
return list(dict.fromkeys(actuals))
def slisttoalternations(graphemeset,group=False):
# This '|' delimited list should never go inside of [^ ], as it will be
# misinterpreted!!
    # This provides the form to go in [^ ] lists or alone, with one group
    # around the list, but with longer graphemes first (trigraphs, then
    # digraphs and decomposed characters)
output='|'.join(sorted(graphemeset,key=len,reverse=True))
if group:
output='('+output+')'
return output
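# Usage sketch (added for illustration, not in the original module): longer
# graphemes sort first so the regex engine prefers them, e.g.
# slisttoalternations({'a','ts','tsh'},group=True) should return '(tsh|ts|a)'.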
def s(sdict, stype, polyn=0, word=False, compile=False): #settings lang=None
"""join a list into regex format, sort for longer first, to capture
the largest units possible."""
"""sdict should be a dictionary value keyed by check/settings.s[analang]"""
lessdict=set()
if stype == "C-ʔ-N":
if 'ʔ' in sdict:
lessdict+=set(sdict['ʔ'])
if 'N' in sdict:
lessdict+=set(sdict['N'])
elif stype == "C-ʔ":
if 'ʔ' in sdict:
lessdict+=set(sdict['ʔ'])
elif stype == "C-N":
if 'N' in sdict:
lessdict+=set(sdict['N'])
elif stype not in sdict:
log.error("Dunno why, but this isn't in lists: {}".format(stype))
return
graphemeset=set(sdict[stype])-lessdict
if polyn:
#make the above limited by len here
graphemeset=[i for i in graphemeset if len(i) == polyn]
output=slisttoalternations(graphemeset,group=True)
if compile:
# log.info("Compiling {}[{}] regex {} (word={})"
# "".format(stype,polyn,output,word))
return make(output, word=word, compile=compile)
else:
return output
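# Usage sketch (added for illustration; the sdict shape is an assumption based on
# the docstring above): with sdict={'C':['b','mb','d'],'V':['a','i']},
# s(sdict,'C') should return '(mb|b|d)' (or '(mb|d|b)'; same-length graphemes
# fall in arbitrary set order), and s(sdict,'C',polyn=2) should return '(mb)'.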
def make(regex, **kwargs):
# if (re.match('^[^(]*\|',regex)) or (re.search('\|[^)]*$',regex)):
# log.error('Regex problem! (need parentheses around segments!):',regex)
# exit()
    word=kwargs.get('word')
    compile=kwargs.get('compile')
    caseinsensitive=kwargs.get('caseinsensitive')
    if word:
        # To make alternations and references work correctly, this should
        # already have parentheses () around each S.
        regex='^'+regex+'$'
    if caseinsensitive:
        flags=re.UNICODE|re.IGNORECASE
    else:
        flags=re.UNICODE
    if compile:
        try:
            regex=re.compile(regex, flags=flags)
        except re.error:
            log.error('Regex problem! {}'.format(regex))
    return regex
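# Usage sketch (added for illustration, not in the original module):
# make('(a|b)',word=True) should return the string '^(a|b)$', and
# make('(a|b)',word=True,compile=True) the same pattern compiled with re.UNICODE.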
def nX(segmentsin,segmentsout,n):
#Start by being clear which graphs count, and which don't.
    # These two lists should be mutually exclusive.
overlap=set(segmentsin) & set(segmentsout)
if overlap:
log.error("Your in/out segment lists overlap: {}".format(overlap))
# log.error("in: {}".format(segmentsin))
# log.error("out: {}".format(segmentsout))
# for each of in/out, make a dict keyed by length, with value listing glyphs
# with that length (automatically separate trigraphs, digraphs, etc)
sindict={n:[i for i in segmentsin if len(i) == n]
for n in range(1,len(max(segmentsin,key=len, default=''))+1)}
#?default='' b/c can be empty sometimes, early on
soutdict={n:[i for i in segmentsout if len(i) == n]
for n in range(1,len(max(segmentsout,key=len, default=''))+1)}
#default='' b/c can be empty sometimes, early on
# Convert those value lists to a string of alternations, for each key
sin={k:slisttoalternations(sindict[k]) for k in sindict}
sin.update({'all':slisttoalternations([i for j in sindict.values()
for i in j])})
sout={k:slisttoalternations(soutdict[k]) for k in soutdict}
sout.update({'all':slisttoalternations([i for j in soutdict.values()
for i in j])})
# Make a list, longest first
# this probably doesn't need the isdigit test
strlist=[sout[i] for i in range(max([j for j in sout.keys()
if str(j).isdigit()],default=0),
0,-1)]
#join list of alternations to one long alternation
notS='|'.join(strlist)
strlist+=['('+sin['all']+')'] #look for, capture this
#This needs to multiply as a unit, while getting each subpart separately:
oneS='(('+notS+')*('+sin['all']+'))'#.join(strlist)
#We need to keep each alternation set a unit, and keep all but last in \1
if n-1:
priors='('+oneS*(n-1)+')'
else:
priors=''
nS='('+priors+'('+notS+')*)('+sin['all']+')'
# for n,i in enumerate([sin,sout,oneS,notS,nS]):
# print(n,i)
# log.info("Compiling X{} regex {}".format(n,nS))
return make(nS, compile=True)
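# Usage sketch (added for illustration, not in the original module):
# nX(['a','i'],['b','d'],2) builds a pattern whose final group captures the
# second 'in' segment; searching it against 'bidado' should leave 'a' (the
# second vowel) in match.groups()[-1].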
def fromCV(CVs, sdict, distinguish, **kwargs): #check, lang
    r"""This inputs a regex variable (regexCV), a tuple of two parts:
    1. abbreviations with 'C' and 'V' in it, and/or variables for actual
    segments or back references, e.g., 1 for \1 or 2 for \2, and 'c' or 'v'.
    2. dictionary of variable meanings (e.g., {'v':'e'}).
    e.g., for a total variable: CVs=("CvC2",{'v':'e'})
    CAUTION: if you don't have this dictionary, CVs[0] is just one letter...
    It outputs language specific regex (compiled if compile=True,
    whole word if word=True).
    lang should be check.analang"""
    if not isinstance(CVs,str):
        log.error("regexCV is not string! ({})".format(CVs))
    references=range(1,5) #capture group numbers that may appear in CVs
# Replace word final C first, to get it out of the way:
if (distinguish['ʔwd'] and not distinguish['ʔ']) and (distinguish['Nwd']
and not distinguish['N']):
rxthis=s(sdict,'C-ʔ-N') #Pull out C# first, set to find only relevant Cs
CVs=re.sub('C$',rxthis,CVs)
elif distinguish['ʔwd'] and not distinguish['ʔ']:
rxthis=s(sdict,'C-ʔ') #Pull out C# first; set to find only relevant Cs
CVs=re.sub('C$',rxthis,CVs)
elif distinguish['Nwd'] and not distinguish['N']:
rxthis=s(sdict,'C-N') #Pull out C# first; set to find only relevant Cs
CVs=re.sub('C$',rxthis,CVs)
# log.info('CVs: {}'.format(CVs))
# if C includes [N,?], find C first; if it doesn't, move on to [N,?].
# if we distinguish [N,?]# (only), C# is already gone, so other C's here.
for x in sdict: #["V","C","N","ʔ","G","S"]:
# if x in check.s[lang]: #just pull out big ones first
rxthis=s(sdict,x) #this should have parens for each S
CVs=re.sub(x,rxthis,CVs)
# log.info('CVs: {}'.format(CVs))
    for x in references: #get capture group expressions
        CVrepl='\\\\{}'.format(str(x)) #this needs to be escaped to survive...
        # NOTE: CVrepl is built here but never applied below; presumably
        # something like CVs=re.sub(str(x),CVrepl,CVs) was intended.
        # log.info('x: {}; repl: {}'.format(x,CVrepl))
        # log.info('CVs: {}'.format(CVs))
        # Wrap any bare material between groups in parentheses
        # (the r prefix on this pattern is correct):
        CVs=re.sub(r'\)([^(]+)\(',')(\\1)(',CVs) #?
# log.info('Going to compile regex with CVs: {}'.format(CVs))
return make(CVs, **kwargs)
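# Usage sketch (added for illustration; the sdict and distinguish shapes are
# assumptions): with sdict={'C':['b','d'],'V':['a','i']} and
# distinguish={'ʔwd':False,'ʔ':False,'Nwd':False,'N':False},
# fromCV('CVCV',sdict,distinguish) should expand each C and V to its alternation
# group, roughly '(b|d)(a|i)(b|d)(a|i)' (same-length tie order may vary).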
if __name__ == '__main__':
x='ne [pas] plaire, (ne pas) agréer, ne pas'
rgx='(ne pas) agréer'
ts=['bobongo','bobingo']
check='V1=V2=V3'
value='a'
r='(eau|ai|ey|oe|ow|ei|ou|au|oi|yi|ie|oa|ay|oo|ea|ee|ue|é|i|a|o|I|u|O|e)'
regexdict={'V1': make("((ckw|thw|tch|cc|pp|pt|tt|ck|tw|kw|ch|ph|sh|hh|ff|sc|ss|th|sw|hw|ts|sl|gh|bb|dd|gg|mb|nd|dw|gw|zl|yw|mm|ny|gn|nn|nw|rh|wh|ll|rr|lw|rw|p|P|t|c|k|q|f|s|x|h|b|B|d|g|j|v|z|y|w|m|n|l|r|')*)(eau|ou|ei|ai|yi|ea|ay|ee|ey|ie|oa|oo|ow|ue|oe|au|oi|a|e|i|o|u|I|O|é)",compile=True),
'V2': make("((((ckw|thw|tch|cc|pp|pt|tt|ck|tw|kw|ch|ph|sh|hh|ff|sc|ss|th|sw|hw|ts|sl|gh|bb|dd|gg|mb|nd|dw|gw|zl|yw|mm|ny|gn|nn|nw|rh|wh|ll|rr|lw|rw|p|P|t|c|k|q|f|s|x|h|b|B|d|g|j|v|z|y|w|m|n|l|r|')*(eau|ou|ei|ai|yi|ea|ay|ee|ey|ie|oa|oo|ow|ue|oe|au|oi|a|e|i|o|u|I|O|é)))(ckw|thw|tch|cc|pp|pt|tt|ck|tw|kw|ch|ph|sh|hh|ff|sc|ss|th|sw|hw|ts|sl|gh|bb|dd|gg|mb|nd|dw|gw|zl|yw|mm|ny|gn|nn|nw|rh|wh|ll|rr|lw|rw|p|P|t|c|k|q|f|s|x|h|b|B|d|g|j|v|z|y|w|m|n|l|r|')*)(eau|ou|ei|ai|yi|ea|ay|ee|ey|ie|oa|oo|ow|ue|oe|au|oi|a|e|i|o|u|I|O|é)",compile=True),
'V3': make("((((ckw|thw|tch|cc|pp|pt|tt|ck|tw|kw|ch|ph|sh|hh|ff|sc|ss|th|sw|hw|ts|sl|gh|bb|dd|gg|mb|nd|dw|gw|zl|yw|mm|ny|gn|nn|nw|rh|wh|ll|rr|lw|rw|p|P|t|c|k|q|f|s|x|h|b|B|d|g|j|v|z|y|w|m|n|l|r|')*(eau|ou|ei|ai|yi|ea|ay|ee|ey|ie|oa|oo|ow|ue|oe|au|oi|a|e|i|o|u|I|O|é))((ckw|thw|tch|cc|pp|pt|tt|ck|tw|kw|ch|ph|sh|hh|ff|sc|ss|th|sw|hw|ts|sl|gh|bb|dd|gg|mb|nd|dw|gw|zl|yw|mm|ny|gn|nn|nw|rh|wh|ll|rr|lw|rw|p|P|t|c|k|q|f|s|x|h|b|B|d|g|j|v|z|y|w|m|n|l|r|')*(eau|ou|ei|ai|yi|ea|ay|ee|ey|ie|oa|oo|ow|ue|oe|au|oi|a|e|i|o|u|I|O|é)))(ckw|thw|tch|cc|pp|pt|tt|ck|tw|kw|ch|ph|sh|hh|ff|sc|ss|th|sw|hw|ts|sl|gh|bb|dd|gg|mb|nd|dw|gw|zl|yw|mm|ny|gn|nn|nw|rh|wh|ll|rr|lw|rw|p|P|t|c|k|q|f|s|x|h|b|B|d|g|j|v|z|y|w|m|n|l|r|')*)(eau|ou|ei|ai|yi|ea|ay|ee|ey|ie|oa|oo|ow|ue|oe|au|oi|a|e|i|o|u|I|O|é)",compile=True),
'V':make(r,compile=True)
}
for t in ts:
print(update(t,regexdict,check,value))
print(id(x))
impname='Imperative'
y='field[@type="{}"][@value^="{}"]'.format(impname,1)
splitxpath(y)
y="field[@type='{}'][@value^='{}']".format(impname,1)
splitxpath(y)
x='be quench, extinguish'
print(glossdeftoform(x))
# s='ááààééèèííììóóòòúúùù'
# s2=makeprecomposed(s)
# print(s,s2)