-
Notifications
You must be signed in to change notification settings - Fork 3
/
Cleaner.py
81 lines (73 loc) · 2.33 KB
/
Cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import re
from time import time
from Entropy import line_entropy
class Cleaner(object):
    """Stateless text-cleaning helpers for plain text and danmu log files.

    Every method is a classmethod; the class keeps no instance state and is
    used purely as a namespace (``Cleaner.preprocess_danmu(path)``).
    """

    # Any character that is not an ASCII letter, an ASCII digit, or in the
    # \u4e00-\u9fa5 range is treated as a separator.
    _NON_WORD = re.compile(r"[^a-zA-Z0-9\u4e00-\u9fa5]")
    # Marker placed on both sides of every token returned by preprocess_*.
    _EOS = '*'
    # Number of leading characters dropped from each danmu line; equals the
    # length of a "YYYY-MM-DD HH:MM:SS," prefix.
    _TIMESTAMP_LEN = len('2019-04-08 14:45:24,')

    @classmethod
    def n_gram(cls, line, n):
        """Return all contiguous slices of length ``n`` of ``line``, in order.

        Args:
            line: Any sliceable sequence (typically a string).
            n: Window length.

        Returns:
            A list of slices; empty when ``line`` is shorter than ``n``.
        """
        if len(line) < n:
            return []
        return [line[i:i + n] for i in range(len(line) - n + 1)]

    @classmethod
    def remove_punc(cls, line):
        """Blank out separator characters and split the result on single spaces.

        Args:
            line: Input string.

        Returns:
            A list of string pieces. Because the split is on one literal
            space, consecutive separators produce empty strings in the list
            (and a line made only of separators yields ``['']``); callers are
            expected to filter those out.
        """
        return cls._NON_WORD.sub(' ', line).strip().split(' ')

    @classmethod
    def z_alg(cls, s):
        """Return the shortest repeating unit of ``s`` found via the Z-array.

        ``z[i]`` is the length of the longest common prefix of ``s`` and
        ``s[i:]``. The first index ``i >= 1`` with ``z[i] + i == len(s)`` means
        the suffix starting at ``i`` is entirely a prefix of ``s``, so ``s`` is
        covered by repetitions of ``s[:i]`` (the last repetition may be
        partial, e.g. ``"abcab" -> "abc"``).

        Args:
            s: Input string.

        Returns:
            ``s[:i]`` for the smallest such ``i``, or ``''`` when no suffix of
            ``s`` (other than ``s`` itself) is also a prefix.
        """
        n = len(s)
        z = [0] * n
        # [L, R] is the right-most window known to match a prefix of s.
        L = 0
        R = 0
        for i in range(n):
            if i > R:
                # i lies outside the window: compare from scratch.
                L = R = i
                while R < n and s[R-L] == s[R]:
                    R += 1
                z[i] = R-L
                R -= 1
            else:
                # i lies inside the window: reuse the value at the mirrored
                # position k, extending only if it reaches the window edge.
                k = i-L
                if z[k] < R-i+1:
                    z[i] = z[k]
                else:
                    L = i
                    while R < n and s[R-L] == s[R]:
                        R += 1
                    z[i] = R-L
                    R -= 1
            # z[0] stays 0, so this can only fire for i >= 1.
            if z[i] + i == n:
                return s[:i]
        return ''

    @classmethod
    def preprocess_text(cls, rfpath):
        """Read a UTF-8 text file and return its cleaned, EOS-wrapped tokens.

        Each line is split with ``remove_punc``; empty pieces and pieces for
        which ``line_entropy(piece) < 1`` are discarded; every survivor is
        returned as ``'*' + piece + '*'``.

        Args:
            rfpath: Path of the file to read.

        Returns:
            A list of wrapped token strings, in file order.
        """
        # The context manager closes the handle even if decoding fails.
        with open(rfpath, 'r', encoding='UTF-8') as fh:
            lines = fh.readlines()
        # Flatten in linear time (sum(list_of_lists, []) is quadratic).
        pieces = [piece for line in lines for piece in cls.remove_punc(line)]
        return [
            cls._EOS + piece + cls._EOS
            for piece in pieces
            if piece and line_entropy(piece) >= 1
        ]

    @classmethod
    def preprocess_danmu(cls, rfpath):
        """Read a UTF-8 danmu log and return its cleaned, EOS-wrapped words.

        For each line the fixed-width timestamp prefix is dropped, separator
        characters are replaced by spaces and the text is lower-cased. A line
        whose cleaned text equals that of the previously kept line is skipped.
        Words with ``line_entropy(word) < 1`` are discarded. Each remaining
        word is collapsed to its repeating unit via ``z_alg`` when one exists,
        then wrapped as ``'*' + word + '*'``.

        Args:
            rfpath: Path of the file to read.

        Returns:
            A list of wrapped word strings, in file order.
        """
        words = []
        prev_line = ''
        # The context manager closes the handle even if decoding fails.
        with open(rfpath, 'r', encoding='utf-8') as fh:
            for raw in fh:
                line = cls._NON_WORD.sub(' ', raw[cls._TIMESTAMP_LEN:]).lower()
                if line == prev_line:
                    # Consecutive duplicate message: keep only the first.
                    continue
                prev_line = line
                # str.split() never yields empty strings, so only the
                # entropy filter is needed here.
                words.extend(
                    word for word in line.split() if line_entropy(word) >= 1
                )
        # z_alg returns '' when the word has no repeating unit; fall back to
        # the word itself in that case.
        return [cls._EOS + (cls.z_alg(word) or word) + cls._EOS for word in words]
if __name__ == '__main__':
    # Manual smoke run: clean one sample danmu file and print the tokens.
    sample_path = "data/bilibili_txt/20181222_5_67235555.txt"
    print(Cleaner.preprocess_danmu(sample_path))