-
Notifications
You must be signed in to change notification settings - Fork 0
/
NaiveBayesAnalysis.py
48 lines (37 loc) · 1.49 KB
/
NaiveBayesAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import pandas as pd
import numpy as np
def remove_punct(desc):
import string
import re
t = re.sub('[' + string.punctuation + ']', '', str(desc).lower())
return t
def NaiveBayesGuess(df,filename):
from textblob.classifiers import NaiveBayesClassifier
import pickle
texts = df['Clean_Text']
scores = df['ScoreLabel']
text_tuples = []
# Create Text Tuples
for i,text in enumerate(texts):
text = remove_punct(text)
text_tuples.append(tuple((text,scores[i])))
# Train a Naive Bayes Classifier (NLTK implementation) on normalized texts
nbcl = NaiveBayesClassifier(text_tuples)
# 'Pickle' the classifier for future use
f = open('nbcl_'+filename+'.pickle', 'wb')
pickle.dump(nbcl, f)
f.close()
# Load Pickled Classifier
# loaded_nbcl = pickle.load(open('nbcl_'+filename+'.pickle', 'rb'))
loaded_nbcl = nbcl #comment out when loading
# Check the classifier's predictions for the test set
nbcl_acc = loaded_nbcl.accuracy(text_tuples)
print('NB description classifier: {}'.format(nbcl_acc))
# Classify all descriptions and write class values to file
pd.DataFrame(loaded_nbcl.classify(remove_punct(text)) for text in texts).to_csv('nb_'+filename+'_guess.csv',index='False',header='False')
iot = 'iot_posts_with_readibility_measures'
ai = 'ai_posts_with_readibility_measures'
df_iot = pd.read_csv(iot+'.csv', sep=",")
df_ai = pd.read_csv(ai+'.csv', sep=",")
# NaiveBayesGuess(df_iot,iot)
NaiveBayesGuess(df_ai,ai)