train_test_tokenizer.py
"""
train_test_tokenizer.py

Loads the Twitter sentiment datasets, tokenizes the tweets, optionally adds
n-gram tokens, pads the sequences and pickles the result (labels included)
for the downstream model scripts.
"""
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import _pickle as cPickle
from nltk import ngrams
from collections import Counter


def import_tweets(filepath, FULL=False):
    """
    Imports the tweets from the text files in `filepath` and returns the
    training tweets, the test tweets (both as lists of strings) and the
    label vector (first half negative, second half positive).
    """
    # Sort the listing so the negative training file comes before the positive
    # one, matching the label vector built below.
    all_files = sorted(os.listdir(filepath))
    if FULL:
        train_files = [file for file in all_files if 'full' in file]
        train_size = 2500000
    else:
        train_files = [file for file in all_files if 'full' not in file and 'test' not in file]
        train_size = 200000
    test_file = [file for file in all_files if 'test' in file]
    train_tweets = []
    test_tweets = []
    labels = np.array(train_size // 2 * [0] + train_size // 2 * [1])
    print("Loading tweet data...")
    for file in train_files:
        with open(os.path.join(filepath, file), mode='rt', encoding='utf-8') as f:
            for line in f:
                train_tweets.append(line.strip())
    # Test lines are of the form "<id>,<tweet>"; drop the id and keep the text.
    with open(os.path.join(filepath, test_file[0]), mode='rt', encoding='utf-8') as f:
        for line in f:
            test_tweets.append(' '.join(line.strip().split(',')[1:]))
    return train_tweets, test_tweets, labels
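# Illustrative usage (a sketch, not executed here; the directory layout and file
# names below are assumptions consistent with the 'full'/'test' filters above):
#
#   ../data/twitter-datasets/
#       train_neg.txt        train_pos.txt         (small set, ~200k tweets total)
#       train_neg_full.txt   train_pos_full.txt    (full set, ~2.5M tweets total)
#       test_data.txt        (lines formatted as "<id>,<tweet text>")
#
#   train_tweets, test_tweets, labels = import_tweets('../data/twitter-datasets')
#   # train_tweets[0] is a raw tweet string; labels is an array of 0s then 1s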


def tokenize(train_tweets, test_tweets, max_word=None):
    """
    Assigns a unique integer token to each word and converts every tweet
    (a string) into a list of tokens.
    IN:
        train_tweets: list of tweet strings
        test_tweets: list of tweet strings
        max_word: int, number of most frequent words to keep (None keeps all)
    OUT:
        train_tweets_tokenized: list of lists of tokens (int)
        test_tweets_tokenized: list of lists of tokens (int)
        word_index: dict mapping each word to its token
    """
    # filters='' keeps punctuation and special characters as part of the words.
    if max_word is None:
        tokenizer = Tokenizer(filters='')
    else:
        tokenizer = Tokenizer(num_words=max_word, filters='')
    print("Fitting tokenizer...")
    # The tokenizer is fitted on the training tweets only.
    tokenizer.fit_on_texts(train_tweets)
    word_index = tokenizer.word_index
    nb_token = len(word_index)
    print("Found {} unique tokens.".format(nb_token))
    train_tweets_tokenized = tokenizer.texts_to_sequences(train_tweets)
    test_tweets_tokenized = tokenizer.texts_to_sequences(test_tweets)
    return train_tweets_tokenized, test_tweets_tokenized, word_index
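# Illustrative sketch (the words and token values are made up for the example):
#
#   train_tok, test_tok, word_index = tokenize(["i love pizza", "i hate rain"],
#                                              ["love pizza"])
#   # word_index might look like {'i': 1, 'love': 2, 'pizza': 3, 'hate': 4, 'rain': 5}
#   # train_tok -> [[1, 2, 3], [1, 4, 5]],  test_tok -> [[2, 3]]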


def create_ngram_dic(tweets, max_token, ngram_range=2, min_occ=0, max_occ=None, n_first=None):
    '''
    Creates a dictionary mapping each n-gram of tokens to a unique int token.
    IN:
        tweets: list of tokenized tweets (lists of ints)
        max_token: int, largest token value already used in the tweets
        ngram_range: largest n-gram to compute (2 = bigrams, 3 = bi- and trigrams, ...)
        min_occ: minimum number of occurrences an n-gram needs to be kept
        max_occ: when not None, maximum number of occurrences an n-gram may have to be kept
        n_first: when not None, keep only the n_first most frequent n-grams
    OUT:
        ngram_dic: dict mapping n-grams (tuples of tokens) to unique int tokens
    '''
    # Collect every n-gram (from bigrams up to ngram_range) present in the tweets
    ngram_list = []
    for tweet in tweets:
        for i in range(2, ngram_range + 1):
            for n in ngrams(tweet, i):
                ngram_list.append(n)
    # Count the n-grams so they can be filtered by number of occurrences
    counter = Counter(ngram_list)
    # Keep the n-grams selected by the filtering options
    if n_first:
        new_ngram_list = [val[0] for val in counter.most_common(n_first)]
    elif max_occ:
        new_ngram_list = [val[0] for val in counter.items() if min_occ <= val[1] <= max_occ]
    else:
        new_ngram_list = [val[0] for val in counter.items() if val[1] >= min_occ]
    # Create unique tokens for the n-grams; they must not collide with the word
    # tokens, so numbering starts after max_token
    new_tokens = range(max_token + 1, len(new_ngram_list) + max_token + 1)
    ngram_dic = dict(zip(new_ngram_list, new_tokens))
    return ngram_dic
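# Illustrative sketch (token values chosen for the example): with the default
# ngram_range=2, a single tokenized tweet [1, 2, 3] yields the bigrams (1, 2)
# and (2, 3); with max_token=10 the new tokens start at 11:
#
#   create_ngram_dic([[1, 2, 3]], max_token=10)
#   # -> something like {(1, 2): 11, (2, 3): 12} (exact order depends on the Counter)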


def add_ngrams(tweet_list, ngram_dic, ngram_range=2):
    '''
    Appends to each tweet the token of every n-gram it contains, provided the
    n-gram is present in ngram_dic (i.e. it passed the occurrence filtering).
    IN:
        tweet_list: list of tokenized tweets (lists of ints)
        ngram_dic: dict mapping each n-gram to a unique token
        ngram_range: largest n-gram to compute (bigram, trigram, etc.)
    OUT: list of lists of tokens for each tweet, with the n-gram tokens appended
    '''
    new_train_tweets = []
    for tweet in tweet_list:
        ngram_list = []
        # Copy the tweet so the input list is not modified in place
        new_tweet = list(tweet)
        for i in range(2, ngram_range + 1):
            for n in ngrams(tweet, i):
                ngram_list.append(n)
        # Each distinct n-gram is appended once; unknown n-grams are skipped
        for ngram in set(ngram_list):
            try:
                new_tweet.append(ngram_dic[ngram])
            except KeyError:
                continue
        new_train_tweets.append(new_tweet)
    return new_train_tweets
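# Illustrative sketch, continuing the hypothetical dictionary from the example above:
#
#   add_ngrams([[1, 2, 3]], {(1, 2): 11, (2, 3): 12})
#   # -> [[1, 2, 3, 11, 12]] (the order of the appended n-gram tokens follows
#   #    set iteration, so it is not guaranteed)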


def load_embedding_matrix(filepath, EMB_DIM, word_index):
    """
    Loads the pretrained GloVe Twitter embeddings of dimension EMB_DIM and
    returns an embedding matrix aligned with word_index: row word_index[word]-1
    holds the vector of `word`; words missing from GloVe get a zero row.
    """
    embedding_mat = np.zeros((len(word_index.keys()), EMB_DIM))
    embed_dict = {}
    with open(os.path.join(filepath, 'glove.twitter.27B.{}d.txt'.format(EMB_DIM)),
              mode='rt', encoding='utf-8') as f:
        print("Loading embeddings...")
        for i, line in enumerate(f):
            # Each line is "<word> <v1> <v2> ... <vEMB_DIM>"
            key_vec = line.split()
            embed_dict[key_vec[0]] = np.array(key_vec[1:], dtype='float32')
            if i % 1e5 == 0:
                print("Loaded {:1.1E} words".format(i))
    print("Creating embedding matrix...")
    for word in word_index.keys():
        row = word_index[word] - 1
        try:
            embedding_mat[row, :] = embed_dict[word]
        except KeyError:
            # Word not covered by GloVe: leave its row as zeros
            embedding_mat[row, :] = np.zeros(EMB_DIM)
    return embedding_mat
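# Illustrative sketch: with the GloVe file ../data/glove.twitter.27B.200d.txt in
# place (the path and the word 'hello' are assumptions for the example),
#
#   emb = load_embedding_matrix('../data', 200, word_index)
#   # emb[word_index['hello'] - 1] is the 200-d GloVe vector of 'hello',
#   # or a zero vector if 'hello' is not in the GloVe vocabulary.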


def main():
    DATA_PATH = "../data"
    TWEET_PATH = os.path.join(DATA_PATH, "twitter-datasets")
    FULL = True
    EMB_DIM = 200
    NGRAM_RANGE = 3
    MAXLEN = 40
    n_first_ngram = None

    train_tweets, test_tweets, labels = import_tweets(TWEET_PATH, FULL)
    train_tweets, test_tweets, word_index = tokenize(train_tweets, test_tweets)
    max_word = len(word_index.keys())
    if NGRAM_RANGE:
        # The n-gram dictionary must cover the same n-gram range used by add_ngrams
        ngram_dic = create_ngram_dic(train_tweets, max_word, ngram_range=NGRAM_RANGE,
                                     n_first=n_first_ngram)
        train_tweets_ngram = add_ngrams(train_tweets, ngram_dic, ngram_range=NGRAM_RANGE)
        test_tweets_ngram = add_ngrams(test_tweets, ngram_dic, ngram_range=NGRAM_RANGE)
        # Sequences grow with the added n-gram tokens, so the padding length scales with NGRAM_RANGE
        train_tweets_ngram = sequence.pad_sequences(train_tweets_ngram, maxlen=(MAXLEN * NGRAM_RANGE))
        test_tweets_ngram = sequence.pad_sequences(test_tweets_ngram, maxlen=(MAXLEN * NGRAM_RANGE))
        with open(os.path.join(DATA_PATH, 'for_graph_trigram.pkl'), 'wb') as f:
            cPickle.dump([train_tweets_ngram, labels, test_tweets_ngram,
                          max_word + len(ngram_dic)], f)
    else:
        train_tweets = sequence.pad_sequences(train_tweets, maxlen=MAXLEN)
        test_tweets = sequence.pad_sequences(test_tweets, maxlen=MAXLEN)
        embedding_matrix = load_embedding_matrix(DATA_PATH, EMB_DIM, word_index)
        with open(os.path.join(DATA_PATH, 'for_graph_1gram.pkl'), 'wb') as f:
            cPickle.dump([train_tweets, labels, test_tweets,
                          len(word_index.keys()), embedding_matrix], f)


if __name__ == '__main__':
    main()