forked from ucsf-ckm/machine-learning-carpentry
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClassifierOneRecord.py
63 lines (47 loc) · 1.8 KB
/
ClassifierOneRecord.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import re
# Load the labelled training reviews from a tab-separated file whose first row
# is the header. quoting=3 is csv.QUOTE_NONE, so quote characters inside the
# review text are treated as ordinary characters.
train = pd.read_csv('data/trainReviews.tsv', header=0, delimiter="\t", quoting=3)

# Normalise every review: each character that is not an ASCII letter or digit
# is replaced by a space, then the whole string is lower-cased.
train_records = [
    re.sub("[^a-zA-Z0-9]", " ", raw_text).lower()
    for raw_text in train["text"]
]
# Bag-of-words model: split on words, drop scikit-learn's built-in English
# stop words, and cap the vocabulary at 5000 terms.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words="english",
    max_features=5000,
)

# Learn the vocabulary from the training text and build the document-term
# count matrix in one step. The result is a SciPy sparse matrix and is kept
# sparse on purpose: the classifier is fitted on it directly, and a dense copy
# would allocate (documents x vocabulary) cells without changing the outcome.
train_data_features = vectorizer.fit_transform(train_records)
# Estimator selection. To try a different model, replace the active
# assignment below with one of these alternatives:
#   clf = MLPClassifier()
#   clf = MultinomialNB()
#   clf = LogisticRegression()
#   clf = SVC()
# NOTE: SVC is not among this file's imports (it lives in sklearn.svm), and
# it must be built with probability=True for the predict_proba call used
# later in this script.
clf = RandomForestClassifier()

# Train on the bag-of-words counts, using the "category" column as labels.
train_labels = train["category"]
clf.fit(train_data_features, train_labels)
# Load the records to classify; same file layout and parsing options as the
# training data (tab-separated, header row, quote characters kept literal).
test = pd.read_csv('data/samplePositive.tsv', header=0, delimiter="\t", quoting=3)

# Apply exactly the same normalisation as for the training text so that the
# tokens line up with the vocabulary the vectorizer has learned.
test_records = [
    re.sub("[^a-zA-Z0-9]", " ", raw_text).lower()
    for raw_text in test["text"]
]

# transform (not fit_transform): reuse the vocabulary learned from the
# training set. The sparse result is passed to the classifier as-is.
test_data_features = vectorizer.transform(test_records)
# predict_proba returns one row per record holding a probability for each
# category (columns follow clf.classes_). To get only the predicted category,
# use this instead:
#   predictions = clf.predict(test_data_features)
predictions = clf.predict_proba(test_data_features)

# Write one line per record: the normalised text, a tab, then the string form
# of its prediction row. The with-block guarantees the file is flushed and
# closed even if a write fails.
with open("predictions.tsv", "w") as out_file:
    for record, probabilities in zip(test_records, predictions):
        out_file.write(record + "\t" + str(probabilities) + "\n")