forked from CodeW1zard/WordsMining
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Cleaner.py
27 lines (22 loc) · 792 Bytes
/
Cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import re
from time import time
class Cleaner(object):
    """Text-cleaning helpers: punctuation stripping, n-grams, file preprocessing.

    Every method is a classmethod, so the class is used as a namespace and
    never needs to be instantiated.
    """

    # Matches every character that is NOT an ASCII letter, an ASCII digit, or
    # a code point in U+4E00..U+9FA5 (CJK ideographs). Compiled once here
    # instead of on every remove_punc() call.
    _NON_WORD_RE = re.compile(r"[^a-zA-Z0-9\u4e00-\u9fa5]")

    @classmethod
    def n_gram(cls, line, n):
        """Return every contiguous slice of length ``n`` taken from ``line``.

        Args:
            line: A sliceable sequence, typically a string.
            n: Length of each slice.

        Returns:
            A list of slices in left-to-right order. The list is empty when
            ``line`` is shorter than ``n``.
        """
        # range() of a non-positive count is already empty, so no explicit
        # length guard is needed for inputs shorter than n.
        return [line[i:i + n] for i in range(len(line) - n + 1)]

    @classmethod
    def remove_punc(cls, line):
        """Split ``line`` into fragments of letters, digits and CJK characters.

        Every other character (punctuation, whitespace, symbols) is treated
        as a separator.

        Args:
            line: The string to clean.

        Returns:
            A list of non-empty fragments; an empty list if ``line`` contains
            no retained characters.
        """
        # split() without an argument collapses runs of separators and drops
        # empty strings; split(' ') would emit '' for consecutive punctuation
        # and for blank lines.
        return cls._NON_WORD_RE.sub(' ', line).split()

    @classmethod
    def preprocess_text(cls, rfpath):
        """Read a UTF-8 text file and return its fragments wrapped in markers.

        Each line is cleaned with ``remove_punc`` and every resulting fragment
        is wrapped as ``'*' + fragment + '*'``. A one-line summary with the
        elapsed time and a size count is printed to stdout.

        Args:
            rfpath: Path of the text file to read.

        Returns:
            A flat list of marker-wrapped fragments, in file order.
        """
        tic = time()
        EOS = '*'
        # The context manager closes the file even if cleaning raises.
        with open(rfpath, 'r', encoding='UTF-8') as fh:
            # One flat comprehension replaces sum(list_of_lists, []), which
            # re-copies the accumulated list on every addition.
            text = [
                EOS + sent + EOS
                for line in fh
                for sent in cls.remove_punc(line)
            ]
        # NOTE: this is the total number of characters, markers included,
        # even though the message calls them "words".
        cnt = sum(map(len, text))
        print("peprocess done! %.2fs %d words in total" % (time() - tic, cnt))
        return text