Commit 7feb5ee (1 parent: af2fbe1): 1 changed file with 164 additions and 0 deletions.
import utils
import random
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfTransformer

# Performs classification using Naive Bayes.

FREQ_DIST_FILE = '../train-processed-freqdist.pkl'
BI_FREQ_DIST_FILE = '../train-processed-freqdist-bi.pkl'
TRAIN_PROCESSED_FILE = '../train-processed.csv'
TEST_PROCESSED_FILE = '../test-processed.csv'
TRAIN = True
UNIGRAM_SIZE = 15000
VOCAB_SIZE = UNIGRAM_SIZE
USE_BIGRAMS = True
if USE_BIGRAMS:
    BIGRAM_SIZE = 10000
    VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE
# 'frequency' uses raw counts (later TF-IDF weighted); 'presence' uses binary indicators.
FEAT_TYPE = 'frequency'
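# Feature-matrix layout implied by the code below: columns 0..UNIGRAM_SIZE-1 hold
# unigram features; when USE_BIGRAMS is set, columns UNIGRAM_SIZE..VOCAB_SIZE-1
# hold bigram features.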


def get_feature_vector(tweet):
    """Extracts the in-vocabulary unigrams and bigrams from a tweet."""
    uni_feature_vector = []
    bi_feature_vector = []
    words = tweet.split()
    for i in range(len(words) - 1):
        word = words[i]
        next_word = words[i + 1]
        # Membership tests rather than .get() truthiness, so an entry with
        # index 0 is not silently skipped.
        if word in unigrams:
            uni_feature_vector.append(word)
        if USE_BIGRAMS:
            if (word, next_word) in bigrams:
                bi_feature_vector.append((word, next_word))
    if len(words) >= 1:
        # The loop above stops one short, so check the last word separately.
        if words[-1] in unigrams:
            uni_feature_vector.append(words[-1])
    return uni_feature_vector, bi_feature_vector
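
# Illustrative call (hypothetical vocabulary: assumes every token and adjacent
# pair below made the top-N dictionaries):
#   get_feature_vector('i love it')
#   -> (['i', 'love', 'it'], [('i', 'love'), ('love', 'it')])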


def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence'):
    """Yields (features, labels) batches as sparse matrices over the vocabulary."""
    num_batches = int(np.ceil(len(tweets) / float(batch_size)))
    for i in range(num_batches):
        batch = tweets[i * batch_size: (i + 1) * batch_size]
        # Size the matrix to the actual batch so a smaller final batch is not
        # padded with spurious zero-labeled rows.
        features = lil_matrix((len(batch), VOCAB_SIZE))
        labels = np.zeros(len(batch))
        for j, tweet in enumerate(batch):
            if test_file:
                tweet_words = tweet[1][0]
                tweet_bigrams = tweet[1][1]
            else:
                tweet_words = tweet[2][0]
                tweet_bigrams = tweet[2][1]
                labels[j] = tweet[1]
            if feat_type == 'presence':
                # Deduplicate so each feature contributes at most 1.
                tweet_words = set(tweet_words)
                tweet_bigrams = set(tweet_bigrams)
            for word in tweet_words:
                idx = unigrams.get(word)
                if idx is not None:
                    features[j, idx] += 1
            if USE_BIGRAMS:
                for bigram in tweet_bigrams:
                    idx = bigrams.get(bigram)
                    if idx is not None:
                        features[j, UNIGRAM_SIZE + idx] += 1
        yield features, labels
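
# Consumption sketch (hypothetical 'data' list in the training-tuple format):
#   for X, y in extract_features(data, batch_size=500, test_file=False):
#       clf.partial_fit(X, y, classes=[0, 1])
# lil_matrix is cheap to build incrementally; scikit-learn converts it to CSR
# internally when fitting.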


def apply_tf_idf(X):
    """Fits a TF-IDF transformer on X and returns the fitted transformer."""
    transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
    transformer.fit(X)
    return transformer
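
# The transformer should be fit on training counts only and then reused to
# weight validation/test matrices, e.g.:
#   tfidf = apply_tf_idf(train_X)
#   train_X = tfidf.transform(train_X)
#   val_X = tfidf.transform(val_X)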


def process_tweets(csv_file, test_file=True):
    """Returns a list of tuples of type (tweet_id, feature_vector)
    or (tweet_id, sentiment, feature_vector).

    Args:
        csv_file (str): Name of processed csv file generated by preprocess.py
        test_file (bool, optional): If processing test file

    Returns:
        list: Of tuples
    """
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                # Split on the leading commas only, in case the tweet text
                # itself contains commas.
                tweet_id, tweet = line.split(',', 1)
            else:
                tweet_id, sentiment, tweet = line.split(',', 2)
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
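
# Expected line formats (as produced by preprocess.py; values illustrative):
#   train: <tweet_id>,<sentiment 0 or 1>,<processed tweet text>
#   test:  <tweet_id>,<processed tweet text>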


if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = MultinomialNB()
    # With batch_size = len(train_tweets) the loop below runs exactly once, so
    # TF-IDF is fit on the full training set.
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.partial_fit(training_set_X, training_set_y, classes=[0, 1])
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                val_set_X = tfidf.transform(val_set_X)
            prediction = clf.predict(val_set_X)
            correct += np.sum(prediction == val_set_y)
            utils.write_status(i, n_val_batches)
            i += 1
        print('\nCorrect: %d/%d = %.4f %%' % (correct, total, correct * 100. / total))
    else:
        del train_tweets
        test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
        # Pass the batch size explicitly so the progress count matches the
        # batches actually yielded (previously the stale training batch size
        # was used to compute n_test_batches while the generator used its
        # default of 500).
        batch_size = 500
        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
        predictions = np.array([])
        print('Predicting batches')
        i = 1
        for test_set_X, _ in extract_features(test_tweets, test_file=True, feat_type=FEAT_TYPE, batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                test_set_X = tfidf.transform(test_set_X)
            prediction = clf.predict(test_set_X)
            predictions = np.concatenate((predictions, prediction))
            utils.write_status(i, n_test_batches)
            i += 1
        predictions = [(str(j), int(predictions[j]))
                       for j in range(len(test_tweets))]
        utils.save_results_to_csv(predictions, 'naivebayes.csv')
        print('\nSaved to naivebayes.csv')
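
# Typical workflow, assuming the pickled frequency distributions and processed
# CSVs from the preprocessing step are in place:
#   1. TRAIN = True: train on a split of the training data, report validation accuracy.
#   2. TRAIN = False: train on all training data, write test predictions to naivebayes.csv.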