-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlog_reg_twitt_glove.py
91 lines (34 loc) · 1.76 KB
/
log_reg_twitt_glove.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# -*- coding: utf-8 -*-
import os
import numpy as np
import _pickle as cPickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from src import implementations as imp
def _sum_token_embeddings(tweets, emb_matrix, emb_dim):
    """Build one feature vector per tweet by summing its token embeddings.

    Token id 0 is skipped (it contributes nothing to the sum); any other
    id ``t`` selects row ``t - 1`` of ``emb_matrix``.

    Args:
        tweets: Array whose first axis indexes tweets; each entry is a
            sequence of integer token ids.
        emb_matrix: Embedding table indexed as ``emb_matrix[row, :]``; rows
            are expected to have ``emb_dim`` entries.
        emb_dim: Width of the produced feature vectors.

    Returns:
        Float64 array of shape ``(tweets.shape[0], emb_dim)``.
    """
    features = np.zeros((tweets.shape[0], emb_dim))
    for row, tweet in enumerate(tweets):
        # Force an integer dtype so that an empty tweet still yields a valid
        # (empty) index array instead of a float one.
        token_ids = np.asarray(tweet, dtype=np.intp)
        # Drop id 0 *before* applying the -1 offset so it can never wrap
        # around to the last embedding row.
        token_ids = token_ids[token_ids != 0]
        # Accumulate in float64 regardless of the embedding dtype; an empty
        # selection sums to a zero vector.
        features[row, :] = emb_matrix[token_ids - 1, :].sum(axis=0, dtype=np.float64)
    return features


def main():
    """Quick sanity check of the preprocessing, tokenizing and embedding steps.

    Loads the pickled tweets and embedding matrix from ``data/``, represents
    every training tweet as the sum of its token embeddings, shuffles the
    samples with a fixed seed, standardizes the features, fits a logistic
    regression on the first ``N_TRAIN`` samples and prints the accuracy of
    that fitted model on the last ``N_HOLDOUT`` samples.
    """
    DATA_PATH = "data/"
    EMB_DIM = 200
    # Sizes of the training slice (taken from the front) and of the held-out
    # slice (taken from the back) of the shuffled data. The two slices only
    # stay disjoint if there are at least N_TRAIN + N_HOLDOUT samples.
    N_TRAIN = 109000
    N_HOLDOUT = 10000

    print("Loading pickled data ...")
    pickle_path = os.path.join(DATA_PATH, "train_test_{}embedding.pkl".format(EMB_DIM))
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project's own preprocessing step.
    # The context manager guarantees the file handle is closed after loading.
    with open(pickle_path, mode='rb') as pickle_file:
        train_tweets, labels, _test_tweets, _nb_tokens, emb_matrix = \
            cPickle.load(pickle_file)

    print("Embedding ...")
    train_data = _sum_token_embeddings(train_tweets, emb_matrix, EMB_DIM)

    # Fixed seed so the shuffle (and therefore the split) is reproducible.
    np.random.seed(0)
    ind = np.random.permutation(train_tweets.shape[0])
    labels = labels[ind]
    train_data = imp.standardize(train_data[ind])

    print("Logistic regresion fitting...")
    logistic = LogisticRegression(solver='liblinear')
    logistic.fit(train_data[:N_TRAIN], labels[:N_TRAIN])

    # Score the model that was just fitted on the held-out tail.
    # cross_val_score would clone and refit the estimator on the tail alone,
    # silently discarding the fit above.
    accuracy = logistic.score(train_data[-N_HOLDOUT:], labels[-N_HOLDOUT:])
    print("Accuracy is: {}".format(accuracy))
# Run the sanity check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()