-
Notifications
You must be signed in to change notification settings - Fork 3
/
util.py
42 lines (32 loc) · 1.01 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# -*- coding:utf-8 -*-
from __future__ import unicode_literals
import re
import unicodedata
import sys
import os
import codecs
def preprocess_en(s):
    """Normalize an English sentence so punctuation becomes separate tokens.

    A single space is inserted before every ``.``, ``!``, ``?`` and ``,`` and
    runs of whitespace are collapsed to one space, so that a plain
    ``str.split()`` yields punctuation marks as their own tokens. Decimal /
    thousands separators, common titles and a few abbreviations are then
    glued back together so they stay single tokens.

    Args:
        s: The raw sentence (text string).

    Returns:
        The normalized sentence with trailing whitespace removed.
    """
    s = s.rstrip()  # drop trailing spaces, tabs and newlines
    # Put a space in front of each sentence punctuation mark.
    s = re.sub(r'([.!?,])', r' \1', s)
    s = re.sub(r'\s+', ' ', s)  # normalize the number of spaces

    # From here on every '.'/',' is preceded by exactly one space, while the
    # space *after* it exists only if the input already had one. The repair
    # patterns therefore treat the following space as optional.

    # "0 . 1" / "0 .1" -> "0.1". The next digit is only looked ahead at, not
    # consumed, so chained separators such as "1,000,000" are fully rejoined.
    s = re.sub(r'(\d) ([.,]) ?(?=\d)', r'\1\2', s)
    # "Mr ." -> "Mr.". The dot is escaped so that "Mr Smith" is left alone,
    # and \b keeps words merely ending in a title (e.g. "Oxygen") untouched.
    s = re.sub(r'\b(Dr|Jr|Prof|Rev|Gen|Mr|Mt|Mrs|Ms) \.', r'\1.', s)
    s = re.sub(r'\be \. ?g \.', 'e.g.', s)
    s = re.sub(r'\bi \. ?e \.', 'i.e.', s)
    s = re.sub(r'\bU \. ?S \.', 'U.S.', s)
    return s
def preprocess_ja(s):
    """Normalize a Japanese sentence.

    Trailing whitespace (spaces, tabs, newlines) is stripped first, then the
    text is converted to Unicode NFKC form.

    Args:
        s: The raw sentence (text string).

    Returns:
        The right-stripped, NFKC-normalized sentence.
    """
    return unicodedata.normalize('NFKC', s.rstrip())
def preprocess(s, lang):
    """Run the language-specific preprocessor on ``s``.

    The handler is the module-level callable named ``preprocess_<lang>``
    (for example ``preprocess_en`` for ``lang='en'``), looked up by name in
    this module's globals.

    Args:
        s: The sentence to preprocess.
        lang: Language code used to build the handler name.

    Returns:
        Whatever the selected handler returns for ``s``.

    Raises:
        KeyError: If no global named ``preprocess_<lang>`` exists.
    """
    handler = globals()['preprocess_{0}'.format(lang)]
    return handler(s)
def split(s, lang):
    """Tokenize ``s`` according to the language.

    Args:
        s: The sentence to tokenize.
        lang: Language code; ``'ja'`` selects per-character tokens.

    Returns:
        A list of single characters when ``lang`` is ``'ja'``; otherwise the
        whitespace-separated pieces of ``s``.
    """
    return list(s) if lang == 'ja' else s.split()