-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtextclean.py
61 lines (53 loc) · 2.3 KB
/
textclean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import string
try:
import inflect
from kitchen.text.converters import to_unicode
except:
pass # for python3 version
# Single characters to normalise and their ASCII stand-ins, paired by position.
# NOTE(review): these look like MacRoman bytes for typographic quotes and
# dashes (the kind TextEdit inserts) -- confirm the intended source encoding.
# On Python 3 the escapes denote the code points U+00CD..U+00D5, so the table
# also rewrites those genuine Latin-1 letters.
transfrom = '\xd5\xd3\xd2\xd0\xd1\xcd\xd4'
transto = '\'""--\'\''

# Python 2 builds the table with string.maketrans.  Python 3 has no such
# function, so only the resulting AttributeError selects the fallback; any
# other failure is allowed to surface instead of being silently swallowed.
try:
    unimaketrans = string.maketrans(transfrom, transto)
except AttributeError:
    unimaketrans = str.maketrans(transfrom, transto)  # for python3 version

# Multi-character sequences rewritten by plain substring replacement: the
# UTF-8 byte sequences of U+2013 / U+2014 (dashes), U+2019 (apostrophe),
# U+201C / U+201D (double quotes) and U+2026 (ellipsis), plus CR LF and bare
# CR line endings.  '\r\n' is listed before '\r' so that, wherever dict order
# is preserved, a CR LF pair collapses to one newline rather than two.
replacement_mappings = {
    "\xe2\x80\x93": " - ",
    '\xe2\x80\x94': " - ",
    '\xe2\x80\x99': "'",
    '\xe2\x80\x9c': '"',
    '\xe2\x80\x9d': '"',
    '\xe2\x80\xa6': '...',
    '\r\n': '\n',
    '\r': '\n',
}
def norm_dollar_signs(word):
    """Rewrite a '$'-prefixed token as '<amount> dollar(s)'.

    A token that does not begin with '$' is returned untouched.  A lone '$'
    yields 'dollars', '$1' yields the singular '1 dollar', and any other
    '$<rest>' yields '<rest> dollars'.
    """
    if not word.startswith('$'):
        return word
    amount = word[1:]
    if not amount:
        # Nothing follows the sign, so only the unit is left to say.
        return 'dollars'
    unit = 'dollar' if amount == '1' else 'dollars'
    return amount + ' ' + unit
def process_usertext(inputstring):
    """Clean up user-supplied text and spell out numbers and dollar amounts.

    Typographic quotes, dashes, ellipses and CR / CR LF line endings are
    replaced with plain equivalents, square brackets are dropped, hyphens and
    slashes become spaces, surrounding punctuation is stripped from every
    word, '$n' becomes 'n dollars' and words starting with a digit are
    spelled out with inflect.

    inputstring -- the raw text.  A native str gets the character-level
        clean-up; any other string type (e.g. Python 2 unicode) skips it and
        goes straight to the word-level normalisation.

    Returns a single string (not a list): the cleaned words of each input
    line joined by single spaces, every line terminated by a newline.  Empty
    input yields an empty string.
    """
    if isinstance(inputstring, str):
        # MS line breaks and stylized characters that TextEdit inserts.
        # (is there an existing module that does this?)
        # The str method is used because it exists on both Python 2 and 3,
        # whereas the string.translate function is missing on Python 3.
        inputstring = inputstring.strip().translate(unimaketrans)
        for ustr in replacement_mappings:
            inputstring = inputstring.replace(ustr, replacement_mappings[ustr])
        if not isinstance(inputstring, type(u'')):
            # Python 2 byte string: decode it, ignoring undecodable bytes.
            # A Python 3 str is already text, and to_unicode may not have
            # been imported there, so the call is skipped.
            inputstring = to_unicode(inputstring, encoding='utf-8', errors='ignore')  # catch-all?

    cleaned = inputstring.replace('[', '').replace(']', '')  # common in linguists' transcriptions
    cleaned = cleaned.replace('-', ' ').replace('/', ' ')

    # string.punctuation contains '$', so stripping the full set from both
    # ends would remove the sign before norm_dollar_signs could ever see it.
    # Keep a leading '$' and strip everything else as before.
    leading_punct = string.punctuation.replace('$', '')

    # convert digits and normalize $n
    digitconverter = inflect.engine()

    def spell_out(word):
        """Return word spelled out if it starts with a digit, else unchanged."""
        starts_numeric = word[0].isdigit() or (
            word[0] == "'" and len(word) > 1 and word[1].isdigit())
        if not starts_numeric:
            return word
        spelled = digitconverter.number_to_words(word)
        return spelled.replace('-', ' ').replace(',', '')

    outlines = []
    for line in cleaned.splitlines():
        stripped = [word.rstrip(string.punctuation).lstrip(leading_punct)
                    for word in line.split()]
        # norm_dollar_signs can turn one token into two words, and tokens
        # that were pure punctuation are now empty; joining and re-splitting
        # handles both, so spell_out never receives an empty word.
        wordlist = ' '.join(norm_dollar_signs(word) for word in stripped).split()
        outlines.append(' '.join(spell_out(word) for word in wordlist) + '\n')
    return ''.join(outlines)