
Commit

Add files via upload
Melanee-Melanee authored Apr 3, 2022
1 parent af2fbe1 commit 7feb5ee
Showing 1 changed file with 164 additions and 0 deletions.
164 changes: 164 additions & 0 deletions naivebayes.py
@@ -0,0 +1,164 @@
import utils
import random
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfTransformer

# Performs classification using Naive Bayes.
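#
# Pipeline (as implemented below): load the top-N unigram/bigram vocabularies,
# convert each tweet into sparse count (or presence) features over that
# vocabulary, optionally apply TF-IDF weighting, and train MultinomialNB
# incrementally with partial_fit.
#
# Expected input rows (as produced by preprocess.py, judging by the split
# logic in process_tweets):
#   train: tweet_id,sentiment,tweet
#   test:  tweet_id,tweet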

FREQ_DIST_FILE = '../train-processed-freqdist.pkl'
BI_FREQ_DIST_FILE = '../train-processed-freqdist-bi.pkl'
TRAIN_PROCESSED_FILE = '../train-processed.csv'
TEST_PROCESSED_FILE = '../test-processed.csv'
TRAIN = True
UNIGRAM_SIZE = 15000
VOCAB_SIZE = UNIGRAM_SIZE
USE_BIGRAMS = True
if USE_BIGRAMS:
    BIGRAM_SIZE = 10000
    VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE
FEAT_TYPE = 'frequency'


def get_feature_vector(tweet):
    """Splits a tweet into the unigrams and bigrams found in the vocabulary."""
    uni_feature_vector = []
    bi_feature_vector = []
    words = tweet.split()
    for i in range(len(words) - 1):
        word = words[i]
        next_word = words[i + 1]
        # Membership tests ('in') are used instead of dict.get() so that a
        # token whose vocabulary index is 0 is not skipped as falsy.
        if word in unigrams:
            uni_feature_vector.append(word)
        if USE_BIGRAMS:
            if (word, next_word) in bigrams:
                bi_feature_vector.append((word, next_word))
    if len(words) >= 1:
        if words[-1] in unigrams:
            uni_feature_vector.append(words[-1])
    return uni_feature_vector, bi_feature_vector
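
# Illustrative example (hypothetical tweet; assumes USE_BIGRAMS is on and all
# of these tokens appear in the loaded vocabularies):
#   get_feature_vector('good movie tonight')
#   -> (['good', 'movie', 'tonight'], [('good', 'movie'), ('movie', 'tonight')])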


def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence'):
    """Yields (features, labels) batches as sparse matrices over the vocabulary."""
    num_batches = int(np.ceil(len(tweets) / float(batch_size)))
    for i in range(num_batches):
        batch = tweets[i * batch_size: (i + 1) * batch_size]
        # Size the matrix to the actual batch so a short final batch does not
        # yield zero-padded rows with a spurious label of 0.
        features = lil_matrix((len(batch), VOCAB_SIZE))
        labels = np.zeros(len(batch))
        for j, tweet in enumerate(batch):
            if test_file:
                tweet_words = tweet[1][0]
                tweet_bigrams = tweet[1][1]
            else:
                tweet_words = tweet[2][0]
                tweet_bigrams = tweet[2][1]
                labels[j] = tweet[1]
            if feat_type == 'presence':
                # Presence features count each token at most once per tweet.
                tweet_words = set(tweet_words)
                tweet_bigrams = set(tweet_bigrams)
            for word in tweet_words:
                idx = unigrams.get(word)
                if idx is not None:
                    features[j, idx] += 1
            if USE_BIGRAMS:
                for bigram in tweet_bigrams:
                    idx = bigrams.get(bigram)
                    if idx is not None:
                        features[j, UNIGRAM_SIZE + idx] += 1
        yield features, labels


def apply_tf_idf(X):
    """Fits a TF-IDF transformer on X and returns it for reuse on later batches."""
    transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
    transformer.fit(X)
    return transformer
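
# The fitted transformer (rather than a transformed matrix) is returned so the
# idf weights learned from the training features can be reapplied to the
# validation and test batches below via transformer.transform(X).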


def process_tweets(csv_file, test_file=True):
    """Returns a list of tuples of type (tweet_id, feature_vector)
    or (tweet_id, sentiment, feature_vector).
    Args:
        csv_file (str): Name of processed csv file generated by preprocess.py
        test_file (bool, optional): If processing test file
    Returns:
        list: Of tuples
    """
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as f:
        lines = f.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            # Cap the number of splits so commas inside the tweet text do not
            # break unpacking.
            if test_file:
                tweet_id, tweet = line.split(',', 1)
            else:
                tweet_id, sentiment, tweet = line.split(',', 2)
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
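
# Illustrative example (hypothetical line; assumes the tokens are in the
# vocabularies): the train-file line '123,1,good movie' would yield
#   ('123', 1, (['good', 'movie'], [('good', 'movie')]))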


if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = MultinomialNB()
    # One batch holding the entire training set; lower batch_size to train in
    # smaller increments.
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
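        # partial_fit requires the full class list on its first call, since a
        # single batch may not contain every label; the labels here are the
        # binary sentiments 0 and 1.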
        clf.partial_fit(training_set_X, training_set_y, classes=[0, 1])
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                # Reuse the transformer fitted on the training data so the
                # validation features share the same idf weights.
                val_set_X = tfidf.transform(val_set_X)
            prediction = clf.predict(val_set_X)
            correct += np.sum(prediction == val_set_y)
            utils.write_status(i, n_val_batches)
            i += 1
        print('\nCorrect: %d/%d = %.4f %%' % (correct, total, correct * 100. / total))
    else:
        del train_tweets
        test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
        predictions = np.array([])
        print('Predicting batches')
        i = 1
        # batch_size is passed explicitly so the progress counter above
        # matches the number of batches actually yielded.
        for test_set_X, _ in extract_features(test_tweets, test_file=True, feat_type=FEAT_TYPE, batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                test_set_X = tfidf.transform(test_set_X)
            prediction = clf.predict(test_set_X)
            predictions = np.concatenate((predictions, prediction))
            utils.write_status(i, n_test_batches)
            i += 1
        predictions = [(str(j), int(predictions[j]))
                       for j in range(len(test_tweets))]
        utils.save_results_to_csv(predictions, 'naivebayes.csv')
        print('\nSaved to naivebayes.csv')
