-
Notifications
You must be signed in to change notification settings - Fork 14
/
eval.py
executable file
·116 lines (99 loc) · 5.03 KB
/
eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import argparse
import nltk
from sklearn.linear_model import LogisticRegressionCV as LogitCV
from sklearn.preprocessing import normalize
from text_embedding.features import *
from text_embedding.vectors import *
from SARC.utils import *
def parse():
    """Build the CLI argument parser and return the parsed arguments.

    Flags cover the dataset choice (pol/main), n-gram settings for the
    bag-of-ngrams representation, the embedding file and weighting scheme
    for the embedding representation, and normalization/lowercasing toggles.
    """
    arg_parser = argparse.ArgumentParser()
    add = arg_parser.add_argument
    add('dataset', help='pol or main', type=str)
    add('-n', '--n', default=1, help='Number of grams', type=int)
    add('--min_count', default=1, help='Min count', type=int)
    add('--embedding', default=CCGLOVE, help='embedding file', type=str)
    add('--weights', default=None,
        help='weights to use for ngrams (e.g. sif, None)', type=str)
    add('-norm', '--normalize', action='store_true', help='Normalize vectors')
    add('-l', '--lower', action='store_true',
        help='Whether or not to lowercase text')
    add('-e', '--embed', action='store_true',
        help='Use embeddings instead of bong')
    return arg_parser.parse_args()
def main():
    """Run the SARC sarcasm-evaluation pipeline.

    Loads the balanced SARC train/test response pairs, represents each
    response either as a bag of n-grams or as a (optionally SIF-weighted)
    embedding average, trains a logistic-regression classifier on individual
    responses, then evaluates pairwise: within each (first, second) response
    pair, the response with the higher score against the learned hyperplane
    is predicted to be the sarcastic one.

    Raises:
        ValueError: if the ``dataset`` argument is neither 'pol' nor 'main'.
    """
    args = parse()
    if args.dataset.lower() == 'pol':
        SARC = SARC_POL
    elif args.dataset.lower() == 'main':
        SARC = SARC_MAIN
    else:
        # Previously an unrecognized dataset left SARC unbound and the script
        # died later with an opaque NameError; fail fast with a clear message.
        raise ValueError("dataset must be 'pol' or 'main', got %r" % args.dataset)
    train_file = SARC+'train-balanced.csv'
    test_file = SARC+'test-balanced.csv'
    comment_file = SARC+'comments.json'

    # Load SARC pol/main sequences with labels.
    print('Load SARC data')
    train_seqs, test_seqs, train_labels, test_labels =\
        load_sarc_responses(train_file, test_file, comment_file, lower=args.lower)

    # Only use responses for this method. Ignore ancestors.
    train_resp = train_seqs['responses']
    test_resp = test_seqs['responses']

    # Split into first and second responses and their labels.
    # {0: list_of_first_responses, 1: list_of_second_responses}
    train_docs = {i: [l[i] for l in train_resp] for i in range(2)}
    test_docs = {i: [l[i] for l in test_resp] for i in range(2)}
    # Map {0, 1} labels to {-1, +1} for the linear classifier.
    train_labels = {i: [2*int(l[i])-1 for l in train_labels] for i in range(2)}
    test_labels = {i: [2*int(l[i])-1 for l in test_labels] for i in range(2)}

    # Train a classifier on all responses in training data. We will later use this
    # classifier to determine for every sequence which of the 2 responses is more sarcastic.
    train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])
    test_all_docs_tok = tokenize(test_docs[0] + test_docs[1])
    train_all_labels = np.array(train_labels[0] + train_labels[1])
    test_all_labels = np.array(test_labels[0] + test_labels[1])

    # Bongs or embeddings.
    if args.embed:
        print('Create embeddings')
        weights = None
        if args.weights == 'sif':
            # Smooth-inverse-frequency word weights (Arora et al.).
            weights = sif_weights(train_all_docs_tok, 1E-3)
        if args.weights == 'snif':
            # Complement of SIF: emphasize frequent words instead of rare ones.
            weights = sif_weights(train_all_docs_tok, 1E-3)
            weights = {f: 1-w for f, w in weights.items()}
        # Only look up vectors for words that actually occur in the corpus.
        w2v = vocab2vecs({word for doc in train_all_docs_tok+test_all_docs_tok for word in doc}, vectorfile=args.embedding)
        train_all_vecs = docs2vecs(train_all_docs_tok, f2v=w2v, weights=weights)
        test_all_vecs = docs2vecs(test_all_docs_tok, f2v=w2v, weights=weights)
    else:
        print('Create bongs')
        n = args.n
        min_count = args.min_count
        # All k-grams for k = 1..n, concatenated per document.
        train_ngrams = [sum((list(nltk.ngrams(doc, k)) for k in range(1, n+1)), []) for doc in train_all_docs_tok]
        test_ngrams = [sum((list(nltk.ngrams(doc, k)) for k in range(1, n+1)), []) for doc in test_all_docs_tok]
        # Vocabulary is built from the training set only to avoid test leakage.
        vocabulary = feature_vocab(train_ngrams, min_count=min_count)
        train_all_vecs = docs2bofs(train_ngrams, vocabulary)
        test_all_vecs = docs2bofs(test_ngrams, vocabulary)

    # Normalize?
    if args.normalize:
        # In-place L2 normalization of each document vector.
        normalize(train_all_vecs, copy=False)
        normalize(test_all_vecs, copy=False)
    print('Dimension of representation: %d'%train_all_vecs.shape[1])

    # Evaluate this classifier on all responses.
    print('Evaluate the classifier on all responses')
    # dual=True is chosen when n_samples < n_features (liblinear recommendation).
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)], fit_intercept=False, cv=2, dual=np.less(*train_all_vecs.shape), solver='liblinear', n_jobs=-1, random_state=0)
    clf.fit(train_all_vecs, train_all_labels)
    print('\tTrain acc: ', clf.score(train_all_vecs, train_all_labels))
    print('\tTest acc: ', clf.score(test_all_vecs, test_all_labels))

    # Get vectors for first and second responses. The stacked matrices hold all
    # first responses in the top half and all second responses in the bottom half.
    n_tr = int(train_all_vecs.shape[0]/2)
    n_te = int(test_all_vecs.shape[0]/2)
    train_vecs = {i: train_all_vecs[i*n_tr:(i+1)*n_tr,:] for i in range(2)}
    test_vecs = {i: test_all_vecs[i*n_te:(i+1)*n_te,:] for i in range(2)}

    # Final evaluation: predict the more-sarcastic response within each pair
    # by comparing projections onto the learned hyperplane.
    print('Evaluate the classifier on the original dataset')
    hyperplane = clf.coef_[0,:]
    train_pred_labels = 2*(train_vecs[0].dot(hyperplane) > train_vecs[1].dot(hyperplane))-1
    test_pred_labels = 2*(test_vecs[0].dot(hyperplane) > test_vecs[1].dot(hyperplane))-1
    # The label of the first response determines the pair's ground truth.
    train_expect_labels = train_labels[0]
    test_expect_labels = test_labels[0]
    print('\tTrain acc: ', (train_pred_labels == train_expect_labels).sum() / train_pred_labels.shape[0])
    print('\tTest acc: ', (test_pred_labels == test_expect_labels).sum() / test_pred_labels.shape[0])
# Run the evaluation only when executed as a script, not on import.
if __name__ == '__main__':
    main()