tokenizer.py
import re

import nltk
from nltk.stem.snowball import GermanStemmer
from nltk.corpus import stopwords


def setup():
    # Set up needed libraries, directories etc. TODO: construct needed directories automatically.
    nltk.download('punkt')


class TokenizerBase():
    def split_to_words(self, s, delimiter=r'[.,?!:; {}()"\[\]\n]'):
        # Split on punctuation, brackets, quotes, spaces and newlines.
        l = re.split(delimiter, s)
        l = [v for v in l if v != '']  # remove all empty strings
        return l

    def replace_umlauts(self, text):
        # Currently a passthrough: umlauts are kept unchanged.
        res = text
        return res

    def replace_special_chars(self, text):
        res = text
        res = res.replace(u'ß', 'ss')
        res = res.replace(u'—', '-')
        return res
class SimpleGermanTokenizer(TokenizerBase):
    def tokenize(self, s):
        words = self.split_to_words(s)
        stemmed_words = self.stem_words(words)
        return stemmed_words

    def stem_words(self, words):
        stemmer = GermanStemmer()
        stemmed_words = []
        for word in words:
            stemmed_words.append(stemmer.stem(word))
        return stemmed_words
class NonStemmingTokenizer(TokenizerBase):
    # Adapted from https://github.com/devmount/GermanWordEmbeddings/blob/master/preprocessing.py
    def tokenize(self, s):
        # punctuation and stopwords
        punctuation_tokens = ['.', '..', '...', ',', ';', ':', '"', u'„', '„', u'“', '“', '\'',
                              '[', ']', '{', '}', '(', ')', '<', '>', '?', '!', '-', u'–', '+',
                              '*', '--', '\\', '\'\'', '``', '‚', '‘', '\n', '\\n', '']
        punctuation = ['?', '.', '!', '/', ';', ':', '(', ')', '&', '\n']

        # Define at which chars you want to split words
        # split_chars = ['-', '/', '\\\\', '+', '|']
        split_chars = ['/', '\\\\', '+', '|']

        # stop_words = [self.replace_umlauts(token) for token in stopwords.words('german')]

        # replace umlauts
        s = self.replace_umlauts(s)

        # replace newline chars
        def remove_newlines(document):
            document = re.sub('\\n', ' ', document)
            document = re.sub('\\\\n', ' ', document)
            document = re.sub('\n', ' ', document)
            return document
        s = remove_newlines(s)

        s = self.replace_special_chars(s)

        # get word tokens
        words = nltk.word_tokenize(s)

        # filter punctuation tokens
        words = [x for x in words if x not in punctuation_tokens]

        # remove stopwords
        # words = [x for x in words if x not in stop_words]

        # split words at the defined split characters
        delimiters = '[' + "".join(split_chars) + ']'
        flat_words = []
        for x in words:
            flat_words.extend(re.split(delimiters, x))
        words = flat_words

        # helpers to strip punctuation left at the beginning or end of a word
        # (in case something in nltk.word_tokenize() was left over)
        def remove_start_punct(word):
            while word and (word[0] in punctuation_tokens):
                word = word[1:]
            return word

        def remove_end_punct(word):
            while word and (word[-1] in punctuation_tokens):
                word = word[:-1]
            return word

        # remove all punctuation at the beginning and end of each word
        words = [remove_start_punct(x) for x in words]
        words = [remove_end_punct(x) for x in words]

        # remove all undesired punctuation at any position
        words = [re.sub('[' + "".join(punctuation) + ']', '', x) for x in words]

        # lowercase all words
        words = [x.lower() for x in words]

        # remove everything except the allowed characters
        words = [re.sub(r'[^a-z0-9%ÜÖÄÉÈÀéèàöäü=><†@≥≤\s\-\/]', '', x) for x in words]

        # remove stopwords TODO: activate maybe
        # words = [x for x in words if x not in stop_words]
        return words
def get_tokenizer(tk):
    if tk == 'sgt':
        tokenizer = SimpleGermanTokenizer()
    elif tk == 'nst':
        tokenizer = NonStemmingTokenizer()
    else:
        # Default
        print("Warning: Couldn't find specified tokenizer. Continuing with default tokenizer.")
        tokenizer = NonStemmingTokenizer()
    return tokenizer
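

# Illustrative usage (a minimal sketch, not part of the original module):
# selects the non-stemming tokenizer via get_tokenizer() and tokenizes a
# made-up German sample sentence. Assumes the NLTK 'punkt' data has been
# fetched via setup(); the expected output below is approximate.
if __name__ == '__main__':
    setup()
    tokenizer = get_tokenizer('nst')
    sample = 'Die Kinder spielen draußen, obwohl es regnet!'
    print(tokenizer.tokenize(sample))
    # roughly: ['die', 'kinder', 'spielen', 'draussen', 'obwohl', 'es', 'regnet']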