-
Notifications
You must be signed in to change notification settings - Fork 14
/
eval.py
executable file
·116 lines (99 loc) · 5.03 KB
/
eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import argparse
import nltk
from sklearn.linear_model import LogisticRegressionCV as LogitCV
from sklearn.preprocessing import normalize
from text_embedding.features import *
from text_embedding.vectors import *
from SARC.utils import *
def parse():
    """Build the CLI argument parser and return the parsed arguments.

    Flags cover the dataset choice (pol/main), n-gram settings for the
    bag-of-ngrams representation, the embedding file and weighting scheme
    for the embedding representation, and normalization/lowercasing toggles.
    """
    arg_parser = argparse.ArgumentParser()
    add = arg_parser.add_argument
    add('dataset', help='pol or main', type=str)
    add('-n', '--n', default=1, help='Number of grams', type=int)
    add('--min_count', default=1, help='Min count', type=int)
    add('--embedding', default=CCGLOVE, help='embedding file', type=str)
    add('--weights', default=None,
        help='weights to use for ngrams (e.g. sif, None)', type=str)
    add('-norm', '--normalize', action='store_true', help='Normalize vectors')
    add('-l', '--lower', action='store_true',
        help='Whether or not to lowercase text')
    add('-e', '--embed', action='store_true',
        help='Use embeddings instead of bong')
    return arg_parser.parse_args()
def main():
    """Run the SARC sarcasm-evaluation pipeline.

    Loads the balanced SARC train/test response pairs, represents each
    response either as a bag of n-grams or as a (optionally SIF-weighted)
    embedding average, trains a logistic-regression classifier on individual
    responses, then evaluates pairwise: within each (first, second) response
    pair, the response with the higher score against the learned hyperplane
    is predicted to be the sarcastic one.

    Raises:
        ValueError: if the ``dataset`` argument is neither 'pol' nor 'main'.
    """
    args = parse()
    if args.dataset.lower() == 'pol':
        SARC = SARC_POL
    elif args.dataset.lower() == 'main':
        SARC = SARC_MAIN
    else:
        # Previously an unrecognized dataset left SARC unbound and the script
        # died later with an opaque NameError; fail fast with a clear message.
        raise ValueError("dataset must be 'pol' or 'main', got %r" % args.dataset)
    train_file = SARC+'train-balanced.csv'
    test_file = SARC+'test-balanced.csv'
    comment_file = SARC+'comments.json'

    # Load SARC pol/main sequences with labels.
    print('Load SARC data')
    train_seqs, test_seqs, train_labels, test_labels =\
        load_sarc_responses(train_file, test_file, comment_file, lower=args.lower)

    # Only use responses for this method. Ignore ancestors.
    train_resp = train_seqs['responses']
    test_resp = test_seqs['responses']

    # Split into first and second responses and their labels.
    # {0: list_of_first_responses, 1: list_of_second_responses}
    train_docs = {i: [l[i] for l in train_resp] for i in range(2)}
    test_docs = {i: [l[i] for l in test_resp] for i in range(2)}
    # Map {0, 1} labels to {-1, +1} for the linear classifier.
    train_labels = {i: [2*int(l[i])-1 for l in train_labels] for i in range(2)}
    test_labels = {i: [2*int(l[i])-1 for l in test_labels] for i in range(2)}

    # Train a classifier on all responses in training data. We will later use this
    # classifier to determine for every sequence which of the 2 responses is more sarcastic.
    train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])
    test_all_docs_tok = tokenize(test_docs[0] + test_docs[1])
    train_all_labels = np.array(train_labels[0] + train_labels[1])
    test_all_labels = np.array(test_labels[0] + test_labels[1])

    # Bongs or embeddings.
    if args.embed:
        print('Create embeddings')
        weights = None
        if args.weights == 'sif':
            # Smooth-inverse-frequency word weights (Arora et al.).
            weights = sif_weights(train_all_docs_tok, 1E-3)
        if args.weights == 'snif':
            # Complement of SIF: emphasize frequent words instead of rare ones.
            weights = sif_weights(train_all_docs_tok, 1E-3)
            weights = {f: 1-w for f, w in weights.items()}
        # Only look up vectors for words that actually occur in the corpus.
        w2v = vocab2vecs({word for doc in train_all_docs_tok+test_all_docs_tok for word in doc}, vectorfile=args.embedding)
        train_all_vecs = docs2vecs(train_all_docs_tok, f2v=w2v, weights=weights)
        test_all_vecs = docs2vecs(test_all_docs_tok, f2v=w2v, weights=weights)
    else:
        print('Create bongs')
        n = args.n
        min_count = args.min_count
        # All k-grams for k = 1..n, concatenated per document.
        train_ngrams = [sum((list(nltk.ngrams(doc, k)) for k in range(1, n+1)), []) for doc in train_all_docs_tok]
        test_ngrams = [sum((list(nltk.ngrams(doc, k)) for k in range(1, n+1)), []) for doc in test_all_docs_tok]
        # Vocabulary is built from the training set only to avoid test leakage.
        vocabulary = feature_vocab(train_ngrams, min_count=min_count)
        train_all_vecs = docs2bofs(train_ngrams, vocabulary)
        test_all_vecs = docs2bofs(test_ngrams, vocabulary)

    # Normalize?
    if args.normalize:
        # In-place L2 normalization of each document vector.
        normalize(train_all_vecs, copy=False)
        normalize(test_all_vecs, copy=False)
    print('Dimension of representation: %d'%train_all_vecs.shape[1])

    # Evaluate this classifier on all responses.
    print('Evaluate the classifier on all responses')
    # dual=True is chosen when n_samples < n_features (liblinear recommendation).
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)], fit_intercept=False, cv=2, dual=np.less(*train_all_vecs.shape), solver='liblinear', n_jobs=-1, random_state=0)
    clf.fit(train_all_vecs, train_all_labels)
    print('\tTrain acc: ', clf.score(train_all_vecs, train_all_labels))
    print('\tTest acc: ', clf.score(test_all_vecs, test_all_labels))

    # Get vectors for first and second responses. The stacked matrices hold all
    # first responses in the top half and all second responses in the bottom half.
    n_tr = int(train_all_vecs.shape[0]/2)
    n_te = int(test_all_vecs.shape[0]/2)
    train_vecs = {i: train_all_vecs[i*n_tr:(i+1)*n_tr,:] for i in range(2)}
    test_vecs = {i: test_all_vecs[i*n_te:(i+1)*n_te,:] for i in range(2)}

    # Final evaluation: predict the more-sarcastic response within each pair
    # by comparing projections onto the learned hyperplane.
    print('Evaluate the classifier on the original dataset')
    hyperplane = clf.coef_[0,:]
    train_pred_labels = 2*(train_vecs[0].dot(hyperplane) > train_vecs[1].dot(hyperplane))-1
    test_pred_labels = 2*(test_vecs[0].dot(hyperplane) > test_vecs[1].dot(hyperplane))-1
    # The label of the first response determines the pair's ground truth.
    train_expect_labels = train_labels[0]
    test_expect_labels = test_labels[0]
    print('\tTrain acc: ', (train_pred_labels == train_expect_labels).sum() / train_pred_labels.shape[0])
    print('\tTest acc: ', (test_pred_labels == test_expect_labels).sum() / test_pred_labels.shape[0])
# Run the evaluation only when executed as a script, not on import.
if __name__ == '__main__':
    main()