tokenization.py
import re
from pyhanlp import *  # provides the HanLP segmenter

def Tokenizer(sent, stopwords=None):
    # Tokenizer for Chinese: segment with HanLP, then strip digits and punctuation.
    pat = re.compile(r'[0-9!"#$%&\'()*+,-./:;<=>?@—,。:★、¥…【】()《》?“”‘’!\[\\\]^_`{|}~\u3000]+')
    tokens = [t.word for t in HanLP.segment(sent)]
    # Remove digit/punctuation characters from each token.
    tokens = [pat.sub('', t).strip() for t in tokens]
    # Drop tokens that became empty after cleaning.
    tokens = [t for t in tokens if t != '']
    if stopwords is not None:
        tokens = [t for t in tokens if t not in stopwords]
    return tokens
'''
def Tokenizer(sent, stopwords=None):
    # Tokenizer for English: strip digits/punctuation, then lemmatize and lowercase.
    pat = re.compile(r'[0-9!"#$%&\'()*+,-./:;<=>?@—,。:★、¥…【】()《》?“”‘’!\[\\\]^_`{|}~\u3000]+')
    tokens = [pat.sub('', t).strip() for t in sent.split(' ')]
    tokens = [t for t in tokens if t != '']
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()
    tokens = [wnl.lemmatize(t).lower() for t in tokens]
    return tokens
'''
if __name__ == '__main__':
    print(Tokenizer('他拿的是《红楼梦》?!我还以为他是个Foreigner———'))
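    # A minimal usage sketch (the stopword set below is assumed for illustration,
    # not part of the original repo): tokens found in `stopwords` are filtered out.
    print(Tokenizer('他拿的是《红楼梦》?!我还以为他是个Foreigner———', stopwords={'的', '是'}))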