-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathtext_filter.py
140 lines (121 loc) · 5.8 KB
/
text_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
# encoding:utf-8
"""
文本过滤器
基于DFA与字典树实现的高效文本过滤器
>>>t = TextFilter() # 初始化 # 贪婪模式,匹配所有敏感词
>>>t.is_contain('气死我了,卧槽. 免费提供无抵押贷款') # 监测是否有敏感词,返回(敏感词在字符串的起始位置,敏感词,敏感词类型)构成的列表
[(5, u'\u5367\u69fd', 'dirty'), (13, u'\u65e0\u62b5\u62bc\u8d37\u6b3e', 'ad')]
>>>t.filter('习近平修宪') # 敏感词过滤 str
***修宪
>>>t.filter(u'卧槽,我真是草泥马') # 敏感词过滤 unicode
**,我真是***
>>>t.filter(u'法论功大发好,真善忍好',replace_char=u'-') # 敏感词过滤,指定替换字符
---大发好,真善忍好
>>>t.filter('高效低价英雄联盟代练') # 测试添加敏感词功能
高效低价英雄联盟代练
>>>t.add_word(u'英雄联盟代练')
>>>t.filter('高效低价英雄联盟代练')
高效低价******
>>>t.classifie('出售幼,女私房照,小萝,莉私房,联系QQxxx') # 文本敏感词统计(敏感词类型,出现次数) (会提前过滤符号)
[('pron-child', 2), ('ad', 1)]
author : @h-j-13
time : 2018-7-19
"""
import re
from collections import Counter
from sensitive_word import SensitiveWords
class Node(object):
    """One node of the sensitive-word trie.

    A freshly created node has no children and marks no word; the code that
    builds the trie fills the attributes in afterwards.
    """

    def __init__(self):
        # Both stay None unless a complete sensitive word ends at this node,
        # in which case they hold the word itself and its category.
        self.sensitive_word = self.sensitive_word_type = None
        # Mapping {character: Node} of outgoing edges; None until the first
        # child is attached.
        self.children = None
class TextFilter(object):
    """Sensitive-word filter backed by a character trie.

    The class is a singleton: every ``TextFilter()`` call returns the same
    object.  NOTE: ``__init__`` still runs on each such call, so constructing
    the filter again rebuilds the trie from the dictionary and discards any
    words that were added through ``add_word`` in the meantime.
    """

    # The single shared instance handed out by ``__new__``.
    _instance = None

    # Whitespace and punctuation removed by ``classifie`` before matching, so
    # that symbols inserted between the characters of a word do not hide it.
    # Compiled once here instead of on every call.
    _PUNCTUATION_RE = re.compile(
        u"[\\s+\\.\\!\\/_,$%^*(+\"']+|[+——!,。?、~@#¥%……&*()]+")

    def __new__(cls, *args, **kw):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            # ``object.__new__`` accepts no extra arguments, so none are
            # forwarded.
            cls._instance = super(TextFilter, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        """Build the trie from the ``SensitiveWords`` dictionary."""
        self.root = Node()
        # Mapping {word_type: iterable of words} supplied by the project.
        self.sensitive_word = SensitiveWords().sensitive_word_dict
        for word_type, words in self.sensitive_word.items():
            for word in words:
                self.add_word(word, word_type)

    @staticmethod
    def _to_text(value):
        """Decode UTF-8 byte strings to text; return anything else as is."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def add_word(self, word, word_type=u'common'):
        """Insert a sensitive word and its category into the trie.

        :param word: the word to add, as text or a UTF-8 byte string.
        :param word_type: category label stored with the word.
        """
        word = self._to_text(word)
        if not word:
            # An empty word would only tag the root node and can never match.
            return
        node = self.root
        for char in word:
            if not node.children:  # leaf node: create its child map lazily
                node.children = {}
            if char not in node.children:
                node.children[char] = Node()
            node = node.children[char]
        # The node reached by the last character records the whole word.
        node.sensitive_word = word
        node.sensitive_word_type = word_type

    def is_contain(self, message):
        """Find the sensitive words occurring in ``message``.

        Scans left to right; at each position the longest word starting there
        wins and scanning resumes right after it, so matches never overlap.

        :param message: text or UTF-8 byte string to scan.
        :return: list of ``(start_index, word, word_type)`` tuples in order
            of appearance.
        """
        message = self._to_text(message)
        result = []
        i, message_length = 0, len(message)
        while i < message_length:
            node = self.root
            longest = None  # deepest word-ending node seen on this walk
            j = i
            while (j < message_length and node.children is not None
                   and message[j] in node.children):
                node = node.children[message[j]]
                j += 1
                if node.sensitive_word:
                    # Remember it: the walk may continue along a longer path
                    # that never completes, and this word must still count.
                    longest = node
            if longest is not None:
                result.append((i,
                               longest.sensitive_word,
                               longest.sensitive_word_type))
                # Jump past the matched word before scanning again.
                i += len(longest.sensitive_word)
            else:
                i += 1
        return result

    def filter(self, message, replace_char=u'*'):
        """Mask every sensitive word in ``message``.

        :param message: text or UTF-8 byte string to clean.
        :param replace_char: string emitted once per masked character.
        :return: the message with each sensitive word replaced.
        """
        message = self._to_text(message)
        pieces = []
        cursor = 0
        # Assemble the output from slices of the untouched message so the
        # match positions stay valid whatever the length of replace_char.
        for start, word, _ in self.is_contain(message):
            pieces.append(message[cursor:start])
            pieces.append(replace_char * len(word))
            cursor = start + len(word)
        pieces.append(message[cursor:])
        return u"".join(pieces)

    def classifie(self, message):
        """Count the sensitive words of ``message`` per category.

        Whitespace and punctuation are stripped first, then the remaining
        text is scanned with ``is_contain``.

        :param message: text or UTF-8 byte string to analyse.
        :return: list of ``(word_type, occurrences)`` tuples, most frequent
            first.
        """
        message = self._to_text(message)
        message = self._PUNCTUATION_RE.sub(u"", message)
        counts = Counter(
            word_type for _, _, word_type in self.is_contain(message))
        return counts.most_common()
if __name__ == '__main__':
    # Demo run mirroring the walkthrough in the module docstring.  print is
    # called with a single parenthesised argument, which prints the same
    # thing under Python 2 and also parses under Python 3.
    t = TextFilter()  # matching is greedy: the longest word at a position wins
    # List of (start position, sensitive word, word type) for each match.
    print(t.is_contain('气死我了,卧槽. 免费提供无抵押贷款'))
    print(t.filter('习近平修宪'))  # filtering a plain str
    print(t.filter(u'卧槽,我真是草泥马'))  # filtering unicode text
    # Filtering with a custom replacement character.
    print(t.filter(u'法论功大发好,真善忍好', replace_char=u'-'))
    # Same sentence before and after registering a new sensitive word.
    print(t.filter('高效低价英雄联盟代练'))
    t.add_word(u'英雄联盟代练')
    print(t.filter('高效低价英雄联盟代练'))
    # Per-category counts; punctuation is stripped before matching.
    print(t.classifie('出售幼,女私房照,小萝,莉私房,联系QQxxx'))