tokenizer.py
import MeCab
from morph import Morph


def load_parser(tokenizer, dic, stop_words):
    if tokenizer == 'ja':
        parser = lambda x: mecab_parser(x, dic=dic, stop_words=stop_words)
    elif tokenizer == 'space':
        parser = lambda x: space_split_parser(x, stop_words=stop_words)
    elif callable(tokenizer):
        parser = lambda x: user_parser(tokenizer, x, stop_words=stop_words)
    else:
        raise ValueError("tokenizer must be 'ja', 'space', or a user-defined function.")
    return parser


def mecab_parser(line, dic='', stop_words=None):
    """
    Parser for a Japanese sentence.
    :param str line:
    :param str dic: path to a MeCab dictionary directory
    :param stop_words: 'default' or list[str]
    :return: parsed words
    :rtype: list[morph.Morph]
    """
morphs = []
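    # If a dictionary path was given, turn it into MeCab's -d (dicdir) option.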
if dic:
dic = '-d ' + dic
stop = None
if stop_words == 'default':
stop = 'default'
elif isinstance(stop_words, list) and all([isinstance(stop_word, str) for stop_word in stop_words]):
stop = 'list'
elif stop_words is None:
pass
else:
raise ValueError("stop_words must be 'default' or list[str]")
    tagger = MeCab.Tagger(dic)
    # Parse an empty string once before parseToNode; this is the usual workaround
    # for the mecab-python issue where node.surface comes back empty or garbled.
    tagger.parse('')
node = tagger.parseToNode(line)
while node:
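        # MeCab emits BOS/EOS sentinel nodes at the start and end of the
        # sentence; they carry no surface form, so skip them.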
if not node.feature.startswith('BOS/EOS'):
features = node.feature.split(',')
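            # Feature CSV layout (assuming the default IPADIC dictionary):
            # 0: part of speech, 1: POS subcategory, 4: conjugation type,
            # 5: conjugation form, 6: base form.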
morph = Morph(surf=node.surface,
pos1=features[0],
pos2=features[1],
conj1=features[4],
conj2=features[5],
base=features[6])
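            # MeCab reports '*' when no base form is known (e.g. unknown words);
            # fall back to the surface form in that case.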
if morph.base == '*':
morph.base = morph.surf
if stop == 'default':
if not morph.is_stopword():
morphs.append(morph)
elif stop == 'list':
if morph.surf not in stop_words:
morphs.append(morph)
else:
morphs.append(morph)
node = node.next
return morphs


def space_split_parser(line, stop_words=None):
"""
:param str line:
:param list[str] stop_words:
:return: parsed words
:rtype: list[morph.Morph]
"""
if stop_words is None:
stop_words = []
return [Morph(word, '', '', '', '', word) for word in line.split(' ') if word not in stop_words]


def words_parser(words, stop_words=None):
"""
:param list[str] words:
:param list[str] stop_words:
:return: parsed words
:rtype: list[morph.Morph]
"""
if stop_words is None:
stop_words = []
return [Morph(word, '', '', '', '', word) for word in words if word not in stop_words]


def user_parser(parser, line, stop_words=None):
    """
    :param function parser: user-defined function that splits a sentence into words
    :param str line:
    :param list[str] stop_words:
    :return: parsed words
    :rtype: list[morph.Morph]
    """
if stop_words is None:
stop_words = []
return [Morph(word, '', '', '', '', word) for word in parser(line) if word not in stop_words]
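

# Minimal usage sketch: assumes the morph.Morph class imported above and,
# for the 'ja' parser, a working MeCab installation. The sample sentences
# and the comma splitter are illustrative only.
if __name__ == '__main__':
    # Space-separated input: no morphological analysis, just splitting.
    parse_space = load_parser('space', dic='', stop_words=['a'])
    print([m.surf for m in parse_space('this is a pen')])

    # Any callable that maps a sentence to a list of words can be passed in.
    parse_user = load_parser(lambda s: s.split(','), dic='', stop_words=None)
    print([m.surf for m in parse_user('foo,bar,baz')])

    # Japanese input via MeCab, filtering with the Morph class's built-in
    # stop-word check.
    parse_ja = load_parser('ja', dic='', stop_words='default')
    print([m.base for m in parse_ja('すもももももももものうち')])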