From efaf785d05de1281bbdf4ae050b4fa365fa4e62f Mon Sep 17 00:00:00 2001 From: Areeb-Aatif Date: Mon, 11 Nov 2019 15:11:14 -0500 Subject: [PATCH 1/6] initial implementation of decision tree --- project3/Code/decision_tree.py | 126 +++++++++++++++++++++++++++++++++ project3/Code/main.py | 19 ++++- project3/Data/dummy.txt | 10 +++ 3 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 project3/Code/decision_tree.py create mode 100644 project3/Data/dummy.txt diff --git a/project3/Code/decision_tree.py b/project3/Code/decision_tree.py new file mode 100644 index 0000000..caa5d28 --- /dev/null +++ b/project3/Code/decision_tree.py @@ -0,0 +1,126 @@ +import numpy as np +import pandas as pd +from math import log + +class decisionTree: + + def readData(self, filePath): + data = np.genfromtxt(filePath, dtype=None, delimiter="\t", encoding=None) + dataDf = pd.DataFrame(data) + labels = dataDf.iloc[:,-1] + return dataDf.iloc[:,:-1], dataDf.iloc[:,-1] + + def oneHotEncoding(self, data, labels): + for colName, colData in data.iteritems(): + if colData.dtype == np.object: + data = pd.concat([data, pd.get_dummies(colData, prefix=colName)], axis=1) + data.drop([colName], axis=1, inplace=True) + + return pd.concat([data, labels], axis=1) + + def decision(self, data): + print("Running Decision Tree Classifier ....................") + root = self.createTree(data.loc[:70*data.shape[0] / 100]) + # print(root) + testData = data.loc[70*data.shape[0] / 100:] + target = testData.iloc[:,-1].values.tolist() + predicted = self.testData(testData.iloc[:, :-1], root) + return target, predicted + + def createTree(self, data): + n = Node() + + print(data) + if data.iloc[:,-1].value_counts().shape[0] == 1: + n.feature = data.iloc[:, -1].iloc[0] + return n + + if data.shape[1] == 2: + n.feature = data.iloc[:,-1].value_counts().index[0] + return n + + bestFeature = self.getBestFeature(data) + n.feature = bestFeature + + condition = (data[bestFeature].max() + data[bestFeature].min()) / 2 + n.condition = condition + + leftChildData = data.loc[data[bestFeature] < condition] + leftChildData = leftChildData.drop(bestFeature, axis=1) + # print(leftChildData) + n.left = self.createTree(leftChildData) + + rightChildData = data.loc[data[bestFeature] >= condition] + rightChildData = rightChildData.drop(bestFeature, axis=1) + # print(rightChildData) + n.right = self.createTree(rightChildData) + + return n + + def getBestFeature(self, data): + entropy_p = self.entropy(data) + max_gain = float('-inf') + bestFeature = 0 + for colName, colData in data.iloc[:,:-1].iteritems(): + condition = (colData.max() - colData.min()) / 2 + entropy_i = 0.0 + + subData1 = data.loc[data[colName] < condition] + prob1 = len(subData1) / float(len(data)) + entropy_i += prob1 * self.entropy(subData1) + + subData2 = data.loc[data[colName] >= condition] + prob2 = len(subData2) / float(len(data)) + entropy_i += prob2 * self.entropy(subData2) + + info_gain = entropy_p - entropy_i + if info_gain > max_gain: + max_gain = info_gain + bestFeature = colName + + return bestFeature + + def entropy(self, data): + entropy = 0.0 + labelCounts = data.iloc[:,-1].value_counts() + for idx in labelCounts.index: + prob = float(labelCounts[idx]) / len(data) + entropy -= prob * log(prob, 2) + + return entropy + + def testData(self, data, root): + predicted = [] + for index, row in data.iterrows(): + predicted.append(self.testRow(row, root)) + + return predicted + + def testRow(self, data, root): + if not root.left and not root.right: + return root.feature + + if data[root.feature] < root.condition: + return self.testRow(data, root.left) + elif data[root.feature] >= root.condition: + return self.testRow(data, root.right) + + +class Node: + + def __init__(self): + self.feature = None + self.left = None + self.right = None + self.condition = None + + def __str__(self, level=0): + ret = "\t"*level+repr(self.feature)+"\n" + if self.left: + ret += self.left.__str__(level+1) + if self.right: + ret += self.right.__str__(level+1) + return ret + + def __repr__(self): + return '' diff --git a/project3/Code/main.py b/project3/Code/main.py index 78f775c..f0583ce 100644 --- a/project3/Code/main.py +++ b/project3/Code/main.py @@ -1,6 +1,6 @@ from helpers import helpers as hp from knn import knn -from naive_bayes import bayes +# from naive_bayes import bayes class main: def knn(self, kCrossValidation = 10): @@ -32,6 +32,21 @@ def knn(self, kCrossValidation = 10): print("PRECISION = {}%".format(averagePrecision*100)) print("RECALL = {}%".format(averageRecall*100)) print("F MEASURE = {}%".format(averageFscore*100)) + + def decision_tree(self, kCrossValidation = 10): + from decision_tree import decisionTree + h = hp() + fileName = h.get_fileName() + # filePath = "../Data/"+fileName+".txt" + filePath = "CSE-601/project3/Data/"+fileName+".txt" + dt = decisionTree() + data, labels = dt.readData(filePath) + data = dt.oneHotEncoding(data, labels) + target, predicted = dt.decision(data) + print(target) + print(predicted) + + def bayes_naive(self, kCrossValidation = 10): h = hp() @@ -49,4 +64,4 @@ def bayes_naive(self, kCrossValidation = 10): td = h.convertToList(tmp) -main().knn() \ No newline at end of file +main().decision_tree() \ No newline at end of file diff --git a/project3/Data/dummy.txt b/project3/Data/dummy.txt new file mode 100644 index 0000000..9553899 --- /dev/null +++ b/project3/Data/dummy.txt @@ -0,0 +1,10 @@ +Yes Single 125000 No +No Married 100000 No +No Single 70000 No +Yes Married 120000 No +No Divorced 95000 Yes +No Married 60000 No +Yes Divorced 220000 No +No Single 85000 Yes +No Married 75000 No +No Single 90000 Yes \ No newline at end of file From c4cad5e9a671980c41bf2b1800659798579c317b Mon Sep 17 00:00:00 2001 From: Areeb-Aatif Date: Mon, 11 Nov 2019 20:32:11 -0500 Subject: [PATCH 2/6] added optimizations to decision tree --- project3/Code/decision_tree.py | 107 ++++++++++++++++++++------------- project3/Code/main.py | 48 ++++++++++++--- 2 files changed, 102 insertions(+), 53 deletions(-) diff --git a/project3/Code/decision_tree.py b/project3/Code/decision_tree.py index caa5d28..4293169 100644 --- a/project3/Code/decision_tree.py +++ b/project3/Code/decision_tree.py @@ -18,19 +18,17 @@ def oneHotEncoding(self, data, labels): return pd.concat([data, labels], axis=1) - def decision(self, data): - print("Running Decision Tree Classifier ....................") - root = self.createTree(data.loc[:70*data.shape[0] / 100]) - # print(root) - testData = data.loc[70*data.shape[0] / 100:] + def decision(self, trainData, testData): + # trainData = data.loc[:percentSplit*data.shape[0]] + # testData = data.loc[percentSplit*data.shape[0]:] + root = self.createTree(trainData) target = testData.iloc[:,-1].values.tolist() - predicted = self.testData(testData.iloc[:, :-1], root) - return target, predicted + predicted = self.predictData(testData.iloc[:, :-1], root) + return target, predicted, root def createTree(self, data): n = Node() - print(data) if data.iloc[:,-1].value_counts().shape[0] == 1: n.feature = data.iloc[:, -1].iloc[0] return n @@ -39,46 +37,56 @@ def createTree(self, data): n.feature = data.iloc[:,-1].value_counts().index[0] return n - bestFeature = self.getBestFeature(data) + bestFeature, condition = self.getBestFeature(data) n.feature = bestFeature - - condition = (data[bestFeature].max() + data[bestFeature].min()) / 2 n.condition = condition leftChildData = data.loc[data[bestFeature] < condition] leftChildData = leftChildData.drop(bestFeature, axis=1) - # print(leftChildData) - n.left = self.createTree(leftChildData) + if leftChildData.shape[0] == 0: + temp = Node() + temp.feature = data.iloc[:,-1].value_counts().index[0] + n.left = temp + else: + n.left = self.createTree(leftChildData) rightChildData = data.loc[data[bestFeature] >= condition] rightChildData = rightChildData.drop(bestFeature, axis=1) - # print(rightChildData) - n.right = self.createTree(rightChildData) + if rightChildData.shape[0] == 0: + temp = Node() + temp.feature = data.iloc[:,-1].value_counts().index[0] + n.right = temp + else: + n.right = self.createTree(rightChildData) return n def getBestFeature(self, data): entropy_p = self.entropy(data) max_gain = float('-inf') - bestFeature = 0 + bestFeature = 0.0 + bestCondition = 0.0 for colName, colData in data.iloc[:,:-1].iteritems(): - condition = (colData.max() - colData.min()) / 2 - entropy_i = 0.0 + percent = [0.25, 0.5, 0.75] + for p in percent: + condition = (colData.max() - colData.min()) * p + entropy_i = 0.0 - subData1 = data.loc[data[colName] < condition] - prob1 = len(subData1) / float(len(data)) - entropy_i += prob1 * self.entropy(subData1) + subData1 = data.loc[data[colName] < condition] + prob1 = len(subData1) / float(len(data)) + entropy_i += prob1 * self.entropy(subData1) - subData2 = data.loc[data[colName] >= condition] - prob2 = len(subData2) / float(len(data)) - entropy_i += prob2 * self.entropy(subData2) + subData2 = data.loc[data[colName] >= condition] + prob2 = len(subData2) / float(len(data)) + entropy_i += prob2 * self.entropy(subData2) - info_gain = entropy_p - entropy_i - if info_gain > max_gain: - max_gain = info_gain - bestFeature = colName + info_gain = entropy_p - entropy_i + if info_gain > max_gain: + max_gain = info_gain + bestFeature = colName + bestCondition = condition - return bestFeature + return bestFeature, bestCondition def entropy(self, data): entropy = 0.0 @@ -89,21 +97,34 @@ def entropy(self, data): return entropy - def testData(self, data, root): + def predictData(self, data, root): predicted = [] for index, row in data.iterrows(): - predicted.append(self.testRow(row, root)) + predicted.append(self.predictRow(row, root)) return predicted - def testRow(self, data, root): + def predictRow(self, data, root): if not root.left and not root.right: return root.feature if data[root.feature] < root.condition: - return self.testRow(data, root.left) + return self.predictRow(data, root.left) elif data[root.feature] >= root.condition: - return self.testRow(data, root.right) + return self.predictRow(data, root.right) + + def findParams(self, predicted, target, tp=1, tn=0): + truePositives, trueNegatives, falsePositives, falseNegatives = 0,0,0,0 + for p, t in zip(predicted, target): + if p == tp and t == tp: + truePositives+=1 + elif p == tp and t == tn: + falsePositives+=1 + elif p == tn and t == tp: + falseNegatives+=1 + else: + trueNegatives+=1 + return truePositives, trueNegatives, falsePositives, falseNegatives class Node: @@ -114,13 +135,13 @@ def __init__(self): self.right = None self.condition = None - def __str__(self, level=0): - ret = "\t"*level+repr(self.feature)+"\n" - if self.left: - ret += self.left.__str__(level+1) - if self.right: - ret += self.right.__str__(level+1) - return ret + # def __str__(self, level=0): + # ret = "\t"*level+repr(self.feature)+"\n" + # if self.left: + # ret += self.left.__str__(level+1) + # if self.right: + # ret += self.right.__str__(level+1) + # return ret - def __repr__(self): - return '' + # def __repr__(self): + # return '' diff --git a/project3/Code/main.py b/project3/Code/main.py index e6906dd..4f19271 100644 --- a/project3/Code/main.py +++ b/project3/Code/main.py @@ -3,6 +3,7 @@ from naive_bayes import bayes from sklearn import preprocessing import numpy as np +import pandas as pd class main: def knn(self, predictData = None, trainData = None, kCrossValidation = 10): @@ -38,6 +39,7 @@ def knn(self, predictData = None, trainData = None, kCrossValidation = 10): return accuracy, precision, recall, f_score def decision_tree(self, kCrossValidation = 10): + print("\nRunning Decision Tree Classifier ....................\n") from decision_tree import decisionTree h = hp() fileName = h.get_fileName() @@ -46,9 +48,31 @@ def decision_tree(self, kCrossValidation = 10): dt = decisionTree() data, labels = dt.readData(filePath) data = dt.oneHotEncoding(data, labels) - target, predicted = dt.decision(data) - print(target) - print(predicted) + + accuracy = [] + precision = [] + recall = [] + f_score = [] + models = [] + + foldSize = int(data.shape[0] / kCrossValidation) + for i in range(kCrossValidation): + print("Running iteration " + str(i+1) + " of k cross validation") + testData = data.loc[foldSize*i:foldSize*(i+1)-1] + trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) + target, predicted, root = dt.decision(trainData, testData) + models.append(root) + truePositives, trueNegatives, falsePositives, falseNegatives = dt.findParams(predicted, target) + # if truePositives < trueNegatives: + # truePositives, trueNegatives, falsePositives, falseNegatives = trueNegatives, truePositives, falseNegatives, falsePositives + accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) + tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) + tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) + precision.append(tmpPrecision) + recall.append(tmpRecall) + f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) + return accuracy, precision, recall, f_score + def bayes_naive(self, predictData, trainData, kCrossValidation = 10): h = hp() @@ -88,13 +112,17 @@ def bayes_naive(self, predictData, trainData, kCrossValidation = 10): if __name__ == "__main__": m = main() h = hp() - trainData = h.get_file(h.get_fileName()) - name = h.get_fileName() - if name == '': - predictData = None - else: - predictData = h.get_file(name, fileType='predictData') + # trainData = h.get_file(h.get_fileName()) + # name = h.get_fileName() + # if name == '': + # predictData = None + # else: + # predictData = h.get_file(name, fileType='predictData') # accuracy, precision, recall, f_score = m.knn(predictData, trainData) - accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData) + # accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData) + # h.calculateMetrics(accuracy, precision, recall, f_score) + + accuracy, precision, recall, f_score = m.decision_tree() + print(accuracy, precision, recall, f_score) h.calculateMetrics(accuracy, precision, recall, f_score) From 0c0fe5bdbf63571d5094bf24f9f19a9041cfeafa Mon Sep 17 00:00:00 2001 From: Areeb-Aatif Date: Wed, 20 Nov 2019 15:57:20 -0500 Subject: [PATCH 3/6] experimented with different classification algorithms --- project3/Code/decision_tree.py | 65 +++----- project3/Code/helpers.py | 40 +++++ project3/Code/library_codes.py | 290 +++++++++++++++++++++++++++++++++ project3/Code/main.py | 98 +++++++---- project3/Code/random_forest.py | 36 ++++ 5 files changed, 453 insertions(+), 76 deletions(-) create mode 100644 project3/Code/library_codes.py create mode 100644 project3/Code/random_forest.py diff --git a/project3/Code/decision_tree.py b/project3/Code/decision_tree.py index 4293169..53a977d 100644 --- a/project3/Code/decision_tree.py +++ b/project3/Code/decision_tree.py @@ -4,31 +4,22 @@ class decisionTree: - def readData(self, filePath): - data = np.genfromtxt(filePath, dtype=None, delimiter="\t", encoding=None) - dataDf = pd.DataFrame(data) - labels = dataDf.iloc[:,-1] - return dataDf.iloc[:,:-1], dataDf.iloc[:,-1] - - def oneHotEncoding(self, data, labels): - for colName, colData in data.iteritems(): - if colData.dtype == np.object: - data = pd.concat([data, pd.get_dummies(colData, prefix=colName)], axis=1) - data.drop([colName], axis=1, inplace=True) - - return pd.concat([data, labels], axis=1) - - def decision(self, trainData, testData): + def decision(self, trainData): # trainData = data.loc[:percentSplit*data.shape[0]] # testData = data.loc[percentSplit*data.shape[0]:] root = self.createTree(trainData) - target = testData.iloc[:,-1].values.tolist() - predicted = self.predictData(testData.iloc[:, :-1], root) - return target, predicted, root + # target = testData.iloc[:,-1].values.tolist() + # predicted = self.predictData(testData.iloc[:, :-1], root) + # return target, predicted, root + return root - def createTree(self, data): + def createTree(self, data, depth=float('inf'), minLeafRows=0): n = Node() + if depth <= 0 or data.shape[0] <= minLeafRows: + n.feature = data.iloc[:,-1].value_counts().index[0] + return n + if data.iloc[:,-1].value_counts().shape[0] == 1: n.feature = data.iloc[:, -1].iloc[0] return n @@ -48,7 +39,7 @@ def createTree(self, data): temp.feature = data.iloc[:,-1].value_counts().index[0] n.left = temp else: - n.left = self.createTree(leftChildData) + n.left = self.createTree(leftChildData, depth-1, minLeafRows) rightChildData = data.loc[data[bestFeature] >= condition] rightChildData = rightChildData.drop(bestFeature, axis=1) @@ -57,7 +48,7 @@ def createTree(self, data): temp.feature = data.iloc[:,-1].value_counts().index[0] n.right = temp else: - n.right = self.createTree(rightChildData) + n.right = self.createTree(rightChildData, depth-1, minLeafRows) return n @@ -71,7 +62,6 @@ def getBestFeature(self, data): for p in percent: condition = (colData.max() - colData.min()) * p entropy_i = 0.0 - subData1 = data.loc[data[colName] < condition] prob1 = len(subData1) / float(len(data)) entropy_i += prob1 * self.entropy(subData1) @@ -113,19 +103,6 @@ def predictRow(self, data, root): elif data[root.feature] >= root.condition: return self.predictRow(data, root.right) - def findParams(self, predicted, target, tp=1, tn=0): - truePositives, trueNegatives, falsePositives, falseNegatives = 0,0,0,0 - for p, t in zip(predicted, target): - if p == tp and t == tp: - truePositives+=1 - elif p == tp and t == tn: - falsePositives+=1 - elif p == tn and t == tp: - falseNegatives+=1 - else: - trueNegatives+=1 - return truePositives, trueNegatives, falsePositives, falseNegatives - class Node: @@ -135,13 +112,13 @@ def __init__(self): self.right = None self.condition = None - # def __str__(self, level=0): - # ret = "\t"*level+repr(self.feature)+"\n" - # if self.left: - # ret += self.left.__str__(level+1) - # if self.right: - # ret += self.right.__str__(level+1) - # return ret + def __str__(self, level=0): + ret = "\t"*level+repr(self.feature)+"\n" + if self.left: + ret += self.left.__str__(level+1) + if self.right: + ret += self.right.__str__(level+1) + return ret - # def __repr__(self): - # return '' + def __repr__(self): + return '' diff --git a/project3/Code/helpers.py b/project3/Code/helpers.py index 16d318f..1cfa1f0 100644 --- a/project3/Code/helpers.py +++ b/project3/Code/helpers.py @@ -1,6 +1,7 @@ import numpy as np from point import point import math +import pandas as pd class helpers: def get_fileName(self): @@ -192,3 +193,42 @@ def calculateMetrics(self, accuracy, precision, recall, f_score): print("PRECISION = {}%".format(averagePrecision*100)) print("RECALL = {}%".format(averageRecall*100)) print("F MEASURE = {}%".format(averageFscore*100)) + + def readData(self, filePath): + ''' + Read input data for decision tree and random forest classifier + input: filepath + output: Data Points- a pandas dataframe of input data + Labels - a pandas dataframe of labels for each data point + ''' + data = np.genfromtxt(filePath, dtype=None, delimiter="\t", encoding=None) + dataDf = pd.DataFrame(data) + labels = dataDf.iloc[:,-1] + return dataDf.iloc[:,:-1], dataDf.iloc[:,-1] + + def oneHotEncoding(self, data, labels): + ''' + One Hot Encode the input data file and then concat the labels to return a single dataframe + input: data - pandas dataframe of input data + labels - pandas dataframe of labels associated with input data points + output: returns a dataframe with one hot encoding and joining the labels to the data points + ''' + for colName, colData in data.iteritems(): + if colData.dtype == np.object: + data = pd.concat([data, pd.get_dummies(colData, prefix=colName)], axis=1) + data.drop([colName], axis=1, inplace=True) + + return pd.concat([data, labels], axis=1) + + def findParameters(self, predicted, target, tp='1', tn='0'): + truePositives, trueNegatives, falsePositives, falseNegatives = 0,0,0,0 + for p, t in zip(predicted, target): + if p == tp and t == tp: + truePositives+=1 + elif p == tp and t == tn: + falsePositives+=1 + elif p == tn and t == tp: + falseNegatives+=1 + else: + trueNegatives+=1 + return truePositives, trueNegatives, falsePositives, falseNegatives diff --git a/project3/Code/library_codes.py b/project3/Code/library_codes.py new file mode 100644 index 0000000..e3b2297 --- /dev/null +++ b/project3/Code/library_codes.py @@ -0,0 +1,290 @@ +from helpers import helpers +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import AdaBoostClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn import metrics +import xgboost +import matplotlib.pyplot as plt +from sklearn import preprocessing +from sklearn.neural_network import MLPClassifier +from sklearn.linear_model import LinearRegression +from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing import normalize +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA + +def readTrainData(): + + fileName = input("enter file name (without extension): ") + filePath = "CSE-601/project3/Data/"+fileName+".csv" + data = np.genfromtxt(filePath, dtype=None, delimiter=",", encoding=None) + data = pd.DataFrame(data) + data.drop(data.columns[0], axis=1, inplace=True) + + fileName = input("enter file name (without extension): ") + filePath = "CSE-601/project3/Data/"+fileName+".csv" + labels = np.genfromtxt(filePath, dtype=None, delimiter=",", encoding=None) + labels = pd.DataFrame(labels) + labels.drop(labels.index[0], axis=0, inplace=True) + labels.drop(labels.columns[0], axis=1, inplace=True) + labels = labels.reset_index(drop=True) + + return data, labels + +def splitData(data, labels): + + train_features, test_features, train_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = 5) + test_labels = test_labels.ravel() + train_labels = train_labels.ravel() + print('Training Features Shape:', train_features.shape) + print('Training Labels Shape:', train_labels.shape) + print('Testing Features Shape:', test_features.shape) + print('Testing Labels Shape:', test_labels.shape) + + return train_features, test_features, train_labels, test_labels + +def calMetrics(test_labels, pred_labels): + + print("Accuracy:", metrics.accuracy_score(test_labels, pred_labels)) + print("Precision:", metrics.precision_score(test_labels, pred_labels, pos_label='1')) + print("Recall:", metrics.recall_score(test_labels, pred_labels, pos_label='1')) + print("F-Measure:", metrics.f1_score(test_labels, pred_labels, pos_label='1')) + print("F Beta Score:", metrics.fbeta_score(test_labels, pred_labels, beta=0.5, pos_label='1')) + # print("MSE:", metrics.mean_squared_error(test_labels, pred_labels)) + +def readTestData(): + + fileName = input("enter test file name (without extension): ") + filePath = "CSE-601/project3/Data/"+fileName+".csv" + test_features = np.genfromtxt(filePath, dtype=None, delimiter=",", encoding=None) + test_features = pd.DataFrame(test_features) + test_features.set_index(['f0'], inplace=True) + return test_features + +def writeToFile(test_features, pred_labels): + + f = open('CSE-601/project3/Data/output8.csv', 'w') + for y, i in zip(pred_labels, test_features.index.values): + f.write(str(i)) + f.write(',') + f.write(str(y)) + f.write('\n') + f.close() + +def predict(clf, test_features): + + return clf.predict(test_features) + +def rf(data, labels, test_features=None): + + from random_forest import randomForest + from helpers import helpers as hp + from decision_tree import decisionTree + h = hp() + rf = randomForest() + dt = decisionTree() + + data = pd.concat([data, labels], axis=1) + print(data) + + accuracy = [] + precision = [] + recall = [] + f_score = [] + models = [] + fb_score = [] + + foldSize = int(data.shape[0] / 5) + for i in range(5): + print("Running iteration " + str(i+1) + " of k cross validation") + testData = data.loc[foldSize*i:foldSize*(i+1)-1] + trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) + forest = rf.forest(trainData) + target = testData.iloc[:,-1].values.tolist() + predicted = rf.predictForest(testData.iloc[:, :-1], forest) + models.append(forest) + calMetrics(target, predicted) + # truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target) + # print(truePositives, trueNegatives, falsePositives, falseNegatives) + # accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) + # tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) + # tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) + # precision.append(tmpPrecision) + # recall.append(tmpRecall) + # tm_fscore = h.findFMeasure(tmpPrecision, tmpRecall) + # print(tm_fscore) + # f_score.append(tm_fscore) + + h.calculateMetrics(accuracy, precision, recall, f_score) + + # for i in range(3): + # print("Running iteration " + str(i+1) + " of k cross validation") + # testData = data.loc[foldSize*i:foldSize*(i+1)-1] + # trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) + # root = dt.decision(trainData) + # target = testData.iloc[:,-1].values.tolist() + # predicted = dt.predictData(testData.iloc[:, :-1], root) + # models.append(root) + # truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target) + # accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) + # tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) + # tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) + # precision.append(tmpPrecision) + # recall.append(tmpRecall) + # f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) + + # print(accuracy, precision, recall, f_score) + # h.calculateMetrics(accuracy, precision, recall, f_score) + + # ind = f_score.index(min(f_score)) + # print(f_score[ind]) + # pred = rf.predictForest(test_features, models[ind]) + # print(pred) + predicted = pd.DataFrame() + for root in models: + pred = dt.predictData(test_features, root) + predicted = pd.concat([predicted, pd.DataFrame(pred)], axis=1) + + print(predicted) + + p = pd.DataFrame() + + p = [] + for idx, row in predicted.iterrows(): + p.append(row.value_counts().index.tolist()[0]) + + print(p) + + return p + # print([max(set(pred), key=pred.count) for pred in predicted]) + +def randomForest(train_features, train_labels): + + clf = RandomForestClassifier(n_estimators=100) + clf.fit(train_features, train_labels) + return clf + +def adaBoost(train_features, train_labels): + + clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), n_estimators=200) + clf.fit(train_features, train_labels) + + return clf + +def xgb(train_features, train_labels): + + clf = xgboost.XGBClassifier(random_state=1, learning_rate=0.01, n_estimators=200, max_depth=5) + clf.fit(train_features, train_labels) + return clf + +def nn(train_features, train_labels): + mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000) + mlp.fit(train_features, train_labels) + return mlp + +def lr(train_features, train_labels): + clf = LinearRegression() + clf.fit(train_features, train_labels) + return clf + +if __name__ == "__main__": + + data, labels = readTrainData() + # print(labels.iloc[:,0].value_counts()) + # print(zeros, ones) + # trainData = data.iloc[:int(data.shape[0]*0.8)] + # trainLabels = labels.iloc[:int(labels.shape[0]*0.8)] + # testData = data.iloc[int(data.shape[0]*0.8):] + # testLabels = labels.iloc[int(labels.shape[0]*0.8):] + # minmaxScaler = preprocessing.MinMaxScaler() + # scaledData = minmaxScaler.fit_transform(data) + # StandardScaler().fit_transform(data) + # pca = PCA(n_components=2) + # principalComponents = pca.fit_transform(data.values) + # principalDf = pd.DataFrame(data = principalComponents + # , columns = ['principal component 1', 'principal component 2']) + # finalDf = pd.concat([principalDf, labels], axis = 1) + # print(finalDf.head()) + + # zeros = finalDf[finalDf[1] == '0'] + # ones = finalDf[finalDf[1] == '1'] + + # newDf = pd.concat([zeros, ones]) + # print(newDf.head()) + # newdf = newDf.sample(frac=1, random_state=42) + + # data = newdf.drop([1], axis=1) + # labels = pd.DataFrame(newdf[1]) + + # print(pd.DataFrame(data).shape) + # exit() + # exit() + # labels = np.array(labels) + data = np.array(data) + train_features, test_features, train_labels, test_labels = splitData(data, np.array(labels)) + train_features = normalize(train_features) + test_features=normalize(test_features) + neighbours = np.arange(1,25) + train_accuracy =np.empty(len(neighbours)) + test_accuracy = np.empty(len(neighbours)) + for i,k in enumerate(neighbours): + # knn=KNeighborsClassifier(n_neighbors=k,algorithm="kd_tree",n_jobs=-1) + # knn.fit(train_features,train_labels.ravel()) + # train_accuracy[i] = knn.score(train_features, train_labels.ravel()) + # test_accuracy[i] = knn.score(test_features, test_labels.ravel()) + clf = xgb(train_features, train_labels) + + + + plt.title('k-NN Varying number of neighbors') + plt.plot(neighbours, test_accuracy, label='Testing Accuracy') + plt.plot(neighbours, train_accuracy, label='Training accuracy') + plt.legend() + plt.xlabel('Number of neighbors') + plt.ylabel('Accuracy') + plt.show() + + idx = np.where(test_accuracy == max(test_accuracy)) + x = neighbours[idx] + + knn=KNeighborsClassifier(n_neighbors=x[0],algorithm="kd_tree",n_jobs=-1) + knn.fit(train_features,train_labels.ravel()) + pred = knn.predict(test_features) + # calMetrics(test_labels, pred) + # exit() + # rf(data, labels) + # clf = adaBoost(train_features, train_labels) + # clf = nn(train_features, train_labels) + # clf = lr(train_features, train_labels) + + # xgboost.plot_importance(clf) + # plt.show() + + # pred = predict(clf, test_features) + # p = [] + # for pr in pred: + # if pr <= 0: p.append('0') + # else: p.append('1') + # print(p) + # calMetrics(test_labels, p) + + # pred = predict(clf, train_features) + # calMetrics(train_labels, pred) + + test_features = normalize(readTestData()) + pred = knn.predict(np.array(test_features)) + print(pred) + # print(train_features, test_features) + # exit() + + # pred = predict(clf, np.array(testData)) + # calMetrics(np.array(testLabels), pred) + + writeToFile(test_features, pred) + + + + diff --git a/project3/Code/main.py b/project3/Code/main.py index 4f19271..d22ac1d 100644 --- a/project3/Code/main.py +++ b/project3/Code/main.py @@ -36,6 +36,41 @@ def knn(self, predictData = None, trainData = None, kCrossValidation = 10): recall.append(tmpRecall) f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) predictData = pd + return accuracy, precision, recall, f_score + + def bayes_naive(self, predictData, trainData, kCrossValidation = 10): + h = hp() + nb = bayes() + accuracy = [] + precision = [] + recall = [] + f_score = [] + pd = predictData + for i in range(len(trainData)): + tmp = None + if predictData == None: + predictData = trainData[i] + tmp = [lt for j, lt in enumerate(trainData) if j != i] + else: + tmp = trainData + h.normalizeData(tmp) + h.normalizeEvaluationSet(predictData) + td = h.convertToList(tmp) + classPriorProbabilities = nb.findClassPriorProbability(td) + classes = nb.segregateClasses(td) + descriptorPosteriorProbabilites = nb.findDescriptorPosteriorProbabilites(classes) + nb.classify(predictData, classPriorProbabilities, descriptorPosteriorProbabilites) + truePositives, trueNegatives, falsePositives, falseNegatives = h.findParams(predictData) + # print(truePositives, trueNegatives, falsePositives, falseNegatives) + if truePositives < trueNegatives: + truePositives, trueNegatives, falsePositives, falseNegatives = trueNegatives, truePositives, falseNegatives, falsePositives + accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) + tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) + tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) + precision.append(tmpPrecision) + recall.append(tmpRecall) + f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) + predictData = pd return accuracy, precision, recall, f_score def decision_tree(self, kCrossValidation = 10): @@ -45,9 +80,9 @@ def decision_tree(self, kCrossValidation = 10): fileName = h.get_fileName() # filePath = "../Data/"+fileName+".txt" filePath = "CSE-601/project3/Data/"+fileName+".txt" + data, labels = h.readData(filePath) + data = h.oneHotEncoding(data, labels) dt = decisionTree() - data, labels = dt.readData(filePath) - data = dt.oneHotEncoding(data, labels) accuracy = [] precision = [] @@ -60,11 +95,11 @@ def decision_tree(self, kCrossValidation = 10): print("Running iteration " + str(i+1) + " of k cross validation") testData = data.loc[foldSize*i:foldSize*(i+1)-1] trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) - target, predicted, root = dt.decision(trainData, testData) + root = dt.decision(trainData) + target = testData.iloc[:,-1].values.tolist() + predicted = dt.predictData(testData.iloc[:, :-1], root) models.append(root) - truePositives, trueNegatives, falsePositives, falseNegatives = dt.findParams(predicted, target) - # if truePositives < trueNegatives: - # truePositives, trueNegatives, falsePositives, falseNegatives = trueNegatives, truePositives, falseNegatives, falsePositives + truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target) accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) @@ -72,41 +107,41 @@ def decision_tree(self, kCrossValidation = 10): recall.append(tmpRecall) f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) return accuracy, precision, recall, f_score - - - def bayes_naive(self, predictData, trainData, kCrossValidation = 10): + + def random_forest(self, kCrossValidation = 10): + print("\nRunning Random Forest Classifier ....................\n") + from random_forest import randomForest h = hp() - nb = bayes() + fileName = h.get_fileName() + # filePath = "../Data/"+fileName+".txt" + filePath = "CSE-601/project3/Data/"+fileName+".txt" + data, labels = h.readData(filePath) + data = h.oneHotEncoding(data, labels) + rf = randomForest() + accuracy = [] precision = [] recall = [] f_score = [] - pd = predictData - for i in range(len(trainData)): - tmp = None - if predictData == None: - predictData = trainData[i] - tmp = [lt for j, lt in enumerate(trainData) if j != i] - else: - tmp = trainData - h.normalizeData(tmp) - h.normalizeEvaluationSet(predictData) - td = h.convertToList(tmp) - classPriorProbabilities = nb.findClassPriorProbability(td) - classes = nb.segregateClasses(td) - descriptorPosteriorProbabilites = nb.findDescriptorPosteriorProbabilites(classes) - nb.classify(predictData, classPriorProbabilities, descriptorPosteriorProbabilites) - truePositives, trueNegatives, falsePositives, falseNegatives = h.findParams(predictData) - # print(truePositives, trueNegatives, falsePositives, falseNegatives) - if truePositives < trueNegatives: - truePositives, trueNegatives, falsePositives, falseNegatives = trueNegatives, truePositives, falseNegatives, falsePositives + models = [] + + foldSize = int(data.shape[0] / kCrossValidation) + for i in range(kCrossValidation): + print("Running iteration " + str(i+1) + " of k cross validation") + testData = data.loc[foldSize*i:foldSize*(i+1)-1] + trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) + forest = rf.forest(trainData) + target = testData.iloc[:,-1].values.tolist() + predicted = rf.predictForest(testData.iloc[:, :-1], forest) + models.append(forest) + truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target) + print(truePositives, trueNegatives, falsePositives, falseNegatives) accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) precision.append(tmpPrecision) recall.append(tmpRecall) f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) - predictData = pd return accuracy, precision, recall, f_score if __name__ == "__main__": @@ -122,7 +157,6 @@ def bayes_naive(self, predictData, trainData, kCrossValidation = 10): # accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData) # h.calculateMetrics(accuracy, precision, recall, f_score) - accuracy, precision, recall, f_score = m.decision_tree() + accuracy, precision, recall, f_score = m.random_forest() print(accuracy, precision, recall, f_score) h.calculateMetrics(accuracy, precision, recall, f_score) - diff --git a/project3/Code/random_forest.py b/project3/Code/random_forest.py new file mode 100644 index 0000000..0b7f48d --- /dev/null +++ b/project3/Code/random_forest.py @@ -0,0 +1,36 @@ +import numpy as np +import pandas as pd +from decision_tree import decisionTree as dt + +class randomForest: + + def forest(self, trainData, numTrees=5, numFeatures=None, numRows=None, maxDepth=10, minLeafRows=5, randomSeed=12): + if numFeatures == None: + numFeatures = int(np.sqrt(trainData.shape[1])) + + if numRows == None: + # numRows = int(trainData.shape[0] * 0.8) + numRows = trainData.shape[0] + + forest = [self.createForest(trainData.iloc[:, :-1], trainData.iloc[:,-1], numFeatures, numRows, maxDepth, minLeafRows, randomSeed) for i in range(numTrees)] + return forest + + + def createForest(self, trainData, labels, numFeatures, numRows, maxDepth, minLeafRows, randomSeed): + + trainData = trainData.sample(numFeatures, axis=1, random_state=randomSeed, replace=False) + trainData = pd.concat([trainData, labels], axis=1) + trainData = trainData.sample(numRows, axis=0, random_state=randomSeed, replace=False) + + return dt().createTree(trainData, maxDepth, minLeafRows) + + def predictForest(self, testData, forest): + predicted = [] + for _, row in testData.iterrows(): + predictedRow = [dt().predictRow(row, root) for root in forest] + predicted.append(max(set(predictedRow), key=predictedRow.count)) + return predicted + + + + From e67544f6b268a114b6fe55bfa4694ca041aad4b1 Mon Sep 17 00:00:00 2001 From: Areeb-Aatif Date: Tue, 26 Nov 2019 14:36:42 -0500 Subject: [PATCH 4/6] modified the random forest implementation --- project3/Code/decision_tree.py | 34 ++++---- project3/Code/helpers.py | 4 +- project3/Code/library_codes.py | 151 ++++++++++++++++++++++++++------- project3/Code/main.py | 14 ++- project3/Code/random_forest.py | 11 ++- 5 files changed, 156 insertions(+), 58 deletions(-) diff --git a/project3/Code/decision_tree.py b/project3/Code/decision_tree.py index 53a977d..f4528f4 100644 --- a/project3/Code/decision_tree.py +++ b/project3/Code/decision_tree.py @@ -1,19 +1,18 @@ import numpy as np import pandas as pd from math import log +import random class decisionTree: - def decision(self, trainData): - # trainData = data.loc[:percentSplit*data.shape[0]] - # testData = data.loc[percentSplit*data.shape[0]:] - root = self.createTree(trainData) - # target = testData.iloc[:,-1].values.tolist() - # predicted = self.predictData(testData.iloc[:, :-1], root) - # return target, predicted, root + def decision(self, trainData, maxFeatures=None, depth=float('inf'), minLeafRows=0, rf=False): + features = trainData.columns.values.tolist() + features.pop() + root = self.createTree(trainData, features, maxFeatures, depth, minLeafRows, rf) + # print(root) return root - def createTree(self, data, depth=float('inf'), minLeafRows=0): + def createTree(self, data, features, maxFeatures, depth, minLeafRows, rf): n = Node() if depth <= 0 or data.shape[0] <= minLeafRows: @@ -21,34 +20,37 @@ def createTree(self, data, depth=float('inf'), minLeafRows=0): return n if data.iloc[:,-1].value_counts().shape[0] == 1: - n.feature = data.iloc[:, -1].iloc[0] + n.feature = data.iloc[:,-1].iloc[0] return n - if data.shape[1] == 2: + if len(features) == 0: n.feature = data.iloc[:,-1].value_counts().index[0] return n - bestFeature, condition = self.getBestFeature(data) + if rf == True: + sampledData = pd.concat([data[random.sample(features, k=maxFeatures)], data.iloc[:,-1]], axis=1) + bestFeature, condition = self.getBestFeature(sampledData) + else: + bestFeature, condition = self.getBestFeature(pd.concat([data[features], data.iloc[:,-1]], axis=1)) + features = [x for _,x in enumerate(features) if x != bestFeature] n.feature = bestFeature n.condition = condition leftChildData = data.loc[data[bestFeature] < condition] - leftChildData = leftChildData.drop(bestFeature, axis=1) if leftChildData.shape[0] == 0: temp = Node() temp.feature = data.iloc[:,-1].value_counts().index[0] n.left = temp else: - n.left = self.createTree(leftChildData, depth-1, minLeafRows) + n.left = self.createTree(leftChildData, features, maxFeatures, depth-1, minLeafRows, rf) rightChildData = data.loc[data[bestFeature] >= condition] - rightChildData = rightChildData.drop(bestFeature, axis=1) if rightChildData.shape[0] == 0: temp = Node() temp.feature = data.iloc[:,-1].value_counts().index[0] n.right = temp else: - n.right = self.createTree(rightChildData, depth-1, minLeafRows) + n.right = self.createTree(rightChildData, features, maxFeatures, depth-1, minLeafRows, rf) return n @@ -58,7 +60,7 @@ def getBestFeature(self, data): bestFeature = 0.0 bestCondition = 0.0 for colName, colData in data.iloc[:,:-1].iteritems(): - percent = [0.25, 0.5, 0.75] + percent = [0.2, 0.5, 0.8] for p in percent: condition = (colData.max() - colData.min()) * p entropy_i = 0.0 diff --git a/project3/Code/helpers.py b/project3/Code/helpers.py index b904b6b..38211d2 100644 --- a/project3/Code/helpers.py +++ b/project3/Code/helpers.py @@ -6,7 +6,7 @@ class helpers: def get_fileName(self): - filename = input("enter file name (without extension): ") + filename = input("Enter file name (without extension): ") return filename def get_file_bayes(self, filename, kCrossValidation = 10, fileType='trainData'): @@ -253,7 +253,7 @@ def oneHotEncoding(self, data, labels): return pd.concat([data, labels], axis=1) - def findParameters(self, predicted, target, tp='1', tn='0'): + def findParameters(self, predicted, target, tp=1, tn=0): truePositives, trueNegatives, falsePositives, falseNegatives = 0,0,0,0 for p, t in zip(predicted, target): if p == tp and t == tp: diff --git a/project3/Code/library_codes.py b/project3/Code/library_codes.py index e3b2297..a47b767 100644 --- a/project3/Code/library_codes.py +++ b/project3/Code/library_codes.py @@ -15,6 +15,7 @@ from sklearn.preprocessing import normalize from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA +from scipy import stats def readTrainData(): @@ -36,6 +37,7 @@ def readTrainData(): def splitData(data, labels): + train_features, test_features, train_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = 5) test_labels = test_labels.ravel() train_labels = train_labels.ravel() @@ -66,7 +68,7 @@ def readTestData(): def writeToFile(test_features, pred_labels): - f = open('CSE-601/project3/Data/output8.csv', 'w') + f = open('CSE-601/project3/Data/output5.csv', 'w') for y, i in zip(pred_labels, test_features.index.values): f.write(str(i)) f.write(',') @@ -78,6 +80,48 @@ def predict(clf, test_features): return clf.predict(test_features) +def dt(data, labels, test_features=None): + + from helpers import helpers as hp + from decision_tree import decisionTree + h = hp() + dt = decisionTree() + data = pd.concat([data, labels], axis=1) + data.dropna(inplace=True) + print(data.head()) + trainAccuracy = [] + testAccuracy = [] + precision = [] + recall = [] + f_score = [] + models = [] + + foldSize = int(data.shape[0] / 10) + for i in range(10): + print("Running iteration " + str(i+1) + " of k cross validation") + testData = data.loc[foldSize*i:foldSize*(i+1)-1] + # testData = pd.DataFrame(stats.zscore(testData.iloc[:,:-1], axis=1), columns=testData.columns) + trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) + trainData = trainData[(np.abs(stats.zscore(trainData.iloc[:,:-1])) < 3).all(axis=1)] + root = dt.decision(trainData, depth=10, minLeafRows=5) + testTarget = testData.iloc[:,-1].values.tolist() + # testPredicted = dt.predictData(testData.iloc[:, :-1], root) + testPredicted = dt.predictData(pd.DataFrame(stats.zscore(testData.iloc[:,:-1], axis=1), columns=testData.columns.values.tolist()[:-1]), root) + trainTarget = trainData.iloc[:,-1].values.tolist() + trainPredicted = dt.predictData(trainData.iloc[:, :-1], root) + models.append(root) + truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(trainPredicted, trainTarget) + trainAccuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) + truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(testPredicted, testTarget) + testAccuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) + tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) + tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) + precision.append(tmpPrecision) + recall.append(tmpRecall) + f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) + h.calculateMetrics(testAccuracy, precision, recall, f_score) + return trainAccuracy, testAccuracy, precision, recall, models, dt + def rf(data, labels, test_features=None): from random_forest import randomForest @@ -88,7 +132,7 @@ def rf(data, labels, test_features=None): dt = decisionTree() data = pd.concat([data, labels], axis=1) - print(data) + # print(data) accuracy = [] precision = [] @@ -120,21 +164,7 @@ def rf(data, labels, test_features=None): h.calculateMetrics(accuracy, precision, recall, f_score) - # for i in range(3): - # print("Running iteration " + str(i+1) + " of k cross validation") - # testData = data.loc[foldSize*i:foldSize*(i+1)-1] - # trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) - # root = dt.decision(trainData) - # target = testData.iloc[:,-1].values.tolist() - # predicted = dt.predictData(testData.iloc[:, :-1], root) - # models.append(root) - # truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target) - # accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) - # tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) - # tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) - # precision.append(tmpPrecision) - # recall.append(tmpRecall) - # f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) + # print(accuracy, precision, recall, f_score) # h.calculateMetrics(accuracy, precision, recall, f_score) @@ -193,6 +223,18 @@ def lr(train_features, train_labels): if __name__ == "__main__": data, labels = readTrainData() + # labels.rename(columns={1: 100}, inplace=True) + # data = stats.zscore(data, axis=1) + # data = pd.DataFrame(data) + # print(data.skew(axis=0)) + # data.drop(data.columns[1], axis=1, inplace=True) + # print(data.skew(axis=0)) + # print(data.head()) + # exit() + # for c in data.columns: + # data[c] = np.log10(data[c]) + # print(data.skew(axis=0)) + # exit() # print(labels.iloc[:,0].value_counts()) # print(zeros, ones) # trainData = data.iloc[:int(data.shape[0]*0.8)] @@ -223,37 +265,69 @@ def lr(train_features, train_labels): # exit() # exit() # labels = np.array(labels) + data = data.sample(data.shape[0], axis=0, random_state=12, replace=False) data = np.array(data) train_features, test_features, train_labels, test_labels = splitData(data, np.array(labels)) - train_features = normalize(train_features) - test_features=normalize(test_features) + train_features = pd.DataFrame(train_features) + train_labels = pd.DataFrame(train_labels) + test_features = pd.DataFrame(test_features) + test_labels = pd.DataFrame(test_labels) + # print(train_features.skew(axis=0).sort_values(ascending=False)) + train_features = pd.DataFrame(stats.zscore(train_features, axis=1), columns=train_features.columns) + print(train_features.skew(axis=0).sort_values(ascending=False)) + test_features = pd.DataFrame(stats.zscore(test_features, axis=1), columns=test_features.columns) + print(test_features.skew(axis=0).sort_values(ascending=False)) + # train_features = train_features[(np.abs(stats.zscore(train_features)) < 1).all(axis=1)] + + train_features.drop(columns=[29, 91], axis=1, inplace=True) + # train_features.drop(columns=[41], inplace=True) + # print(train_features.skew(axis=0).sort_values(ascending=False)) + trainData = pd.concat([train_features, train_labels], axis=1) + trainData.dropna(inplace=True) + train_features = np.array(trainData.iloc[:,:-1]) + train_labels = np.array(trainData.iloc[:,-1]) + + test_features.drop(columns=[29, 91], axis=1, inplace=True) + # test_features.drop(columns=[41], axis=1, inplace=True) + testData = pd.concat([test_features, test_labels], axis=1) + testData.dropna(inplace=True) + test_features = np.array(testData.iloc[:,:-1]) + test_labels = np.array(testData.iloc[:,-1]) + # train_features = normalize(train_features) + # test_features=normalize(test_features) neighbours = np.arange(1,25) train_accuracy =np.empty(len(neighbours)) test_accuracy = np.empty(len(neighbours)) for i,k in enumerate(neighbours): - # knn=KNeighborsClassifier(n_neighbors=k,algorithm="kd_tree",n_jobs=-1) - # knn.fit(train_features,train_labels.ravel()) - # train_accuracy[i] = knn.score(train_features, train_labels.ravel()) - # test_accuracy[i] = knn.score(test_features, test_labels.ravel()) - clf = xgb(train_features, train_labels) + knn=KNeighborsClassifier(n_neighbors=k,algorithm="kd_tree",n_jobs=-1) + knn.fit(train_features,train_labels.ravel()) + train_accuracy[i] = knn.score(train_features, train_labels.ravel()) + test_accuracy[i] = knn.score(test_features, test_labels.ravel()) + # clf = xgb(train_features, train_labels) - - + # trainAccuracy, testAccuracy, precision, recall, models, dt = dt(data, labels) + # neighbours = np.arange(1,11) plt.title('k-NN Varying number of neighbors') - plt.plot(neighbours, test_accuracy, label='Testing Accuracy') - plt.plot(neighbours, train_accuracy, label='Training accuracy') + plt.plot(neighbours, train_accuracy, label='Training Accuracy') + plt.plot(neighbours, test_accuracy, label='Testing accuracy') plt.legend() plt.xlabel('Number of neighbors') plt.ylabel('Accuracy') plt.show() idx = np.where(test_accuracy == max(test_accuracy)) + # idx = testAccuracy.index(max(testAccuracy)) + # idx = sorted(range(len(testAccuracy)), key=lambda i: testAccuracy[i], reverse=True)[:5] + # print(idx) + # print(testAccuracy[idx]) + # print(trainAccuracy[idx]) x = neighbours[idx] - + # roots = [models[i] for i in idx] + # print(testAccuracy) knn=KNeighborsClassifier(n_neighbors=x[0],algorithm="kd_tree",n_jobs=-1) knn.fit(train_features,train_labels.ravel()) pred = knn.predict(test_features) - # calMetrics(test_labels, pred) + calMetrics(test_labels, pred) # exit() # rf(data, labels) # clf = adaBoost(train_features, train_labels) @@ -273,10 +347,23 @@ def lr(train_features, train_labels): # pred = predict(clf, train_features) # calMetrics(train_labels, pred) - - test_features = normalize(readTestData()) + # print(root) + test_features = readTestData() + test_features.drop(columns=['f30', 'f92'], inplace=True) + # test_features = pd.DataFrame(stats.zscore(test_features), index=test_features.index) + print(test_features.skew(axis=0).sort_values(ascending=False)) + # test_features = pd.DataFrame(stats.zscore(test_features, axis=1), index=test_features.index) + # test_features.drop(test_features.columns[1], axis=1, inplace=True) + # test_features = pd.DataFrame(stats.zscore(test_features, axis=1)) + # print(test_features.skew(axis=0)) + # pred = [] + # for _, row in test_features.iterrows(): + # predictedRow = [dt.predictRow(row, root) for root in roots] + # pred.append(max(set(predictedRow), key=predictedRow.count)) pred = knn.predict(np.array(test_features)) print(pred) + # pred = knn.predict(np.array(test_features)) + # print(pred) # print(train_features, test_features) # exit() diff --git a/project3/Code/main.py b/project3/Code/main.py index 179979e..2bf7370 100644 --- a/project3/Code/main.py +++ b/project3/Code/main.py @@ -4,6 +4,7 @@ from sklearn import preprocessing import numpy as np import pandas as pd +import matplotlib.pyplot as plt class main: def knn(self, predictData = None, trainData = None): @@ -90,9 +91,10 @@ def decision_tree(self, kCrossValidation = 10): foldSize = int(data.shape[0] / kCrossValidation) for i in range(kCrossValidation): - print("Running iteration " + str(i+1) + " of k cross validation") + print("Running iteration " + str(i+1) + " of k cross validation .....") testData = data.loc[foldSize*i:foldSize*(i+1)-1] trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) + # root = dt.decision(trainData, depth=10, minLeafRows=5) root = dt.decision(trainData) target = testData.iloc[:,-1].values.tolist() predicted = dt.predictData(testData.iloc[:, :-1], root) @@ -125,7 +127,7 @@ def random_forest(self, kCrossValidation = 10): foldSize = int(data.shape[0] / kCrossValidation) for i in range(kCrossValidation): - print("Running iteration " + str(i+1) + " of k cross validation") + print("Running iteration " + str(i+1) + " of k cross validation .....") testData = data.loc[foldSize*i:foldSize*(i+1)-1] trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) forest = rf.forest(trainData) @@ -157,6 +159,9 @@ def random_forest(self, kCrossValidation = 10): predictData = h.get_file(name, fileType='predictData') accuracy, precision, recall, f_score = m.knn(predictData, trainData) h.calculateMetrics(accuracy, precision, recall, f_score) + elif algorithm == 2: + accuracy, precision, recall, f_score = m.decision_tree() + h.calculateMetrics(accuracy, precision, recall, f_score) elif algorithm == 3: print("Enter train File name") trainData = h.get_file_bayes(h.get_fileName(), kCrossValidation = 10) @@ -168,3 +173,8 @@ def random_forest(self, kCrossValidation = 10): predictData = h.get_file_bayes(name, fileType='predictData') accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData) h.calculateMetrics(accuracy, precision, recall, f_score) + elif algorithm == 4: + accuracy, precision, recall, f_score = m.random_forest() + h.calculateMetrics(accuracy, precision, recall, f_score) + else: + print("\nWrong input") diff --git a/project3/Code/random_forest.py b/project3/Code/random_forest.py index 0b7f48d..10052a6 100644 --- a/project3/Code/random_forest.py +++ b/project3/Code/random_forest.py @@ -4,7 +4,7 @@ class randomForest: - def forest(self, trainData, numTrees=5, numFeatures=None, numRows=None, maxDepth=10, minLeafRows=5, randomSeed=12): + def forest(self, trainData, numTrees=5, numFeatures=None, numRows=None, maxDepth=10, minLeafRows=3, randomSeed=12): if numFeatures == None: numFeatures = int(np.sqrt(trainData.shape[1])) @@ -12,17 +12,16 @@ def forest(self, trainData, numTrees=5, numFeatures=None, numRows=None, maxDepth # numRows = int(trainData.shape[0] * 0.8) numRows = trainData.shape[0] - forest = [self.createForest(trainData.iloc[:, :-1], trainData.iloc[:,-1], numFeatures, numRows, maxDepth, minLeafRows, randomSeed) for i in range(numTrees)] + forest = [self.createForest(trainData, numFeatures, numRows, maxDepth, minLeafRows, randomSeed) for i in range(numTrees)] return forest - def createForest(self, trainData, labels, numFeatures, numRows, maxDepth, minLeafRows, randomSeed): + def createForest(self, trainData, numFeatures, numRows, maxDepth, minLeafRows, randomSeed): - trainData = trainData.sample(numFeatures, axis=1, random_state=randomSeed, replace=False) - trainData = pd.concat([trainData, labels], axis=1) + # trainData = trainData.sample(numFeatures, axis=1, random_state=randomSeed, replace=False) trainData = trainData.sample(numRows, axis=0, random_state=randomSeed, replace=False) - return dt().createTree(trainData, maxDepth, minLeafRows) + return dt().decision(trainData, maxFeatures=numFeatures, depth=maxDepth, minLeafRows=minLeafRows, rf=True) def predictForest(self, testData, forest): predicted = [] From 7efc5b572b2e396ee1ae35410990080248615de0 Mon Sep 17 00:00:00 2001 From: Areeb-Aatif Date: Wed, 27 Nov 2019 18:48:43 -0500 Subject: [PATCH 5/6] limited the depth of decision tree --- project3/Code/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/project3/Code/main.py b/project3/Code/main.py index 2bf7370..5a60ced 100644 --- a/project3/Code/main.py +++ b/project3/Code/main.py @@ -95,7 +95,7 @@ def decision_tree(self, kCrossValidation = 10): testData = data.loc[foldSize*i:foldSize*(i+1)-1] trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) # root = dt.decision(trainData, depth=10, minLeafRows=5) - root = dt.decision(trainData) + root = dt.decision(trainData, depth=15, minLeafRows=5) target = testData.iloc[:,-1].values.tolist() predicted = dt.predictData(testData.iloc[:, :-1], root) models.append(root) @@ -135,7 +135,6 @@ def random_forest(self, kCrossValidation = 10): predicted = rf.predictForest(testData.iloc[:, :-1], forest) models.append(forest) truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target) - print(truePositives, trueNegatives, falsePositives, falseNegatives) accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)) tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) From 37686e8a8e70007a93df8cfea2a6bf9e5f1c68a7 Mon Sep 17 00:00:00 2001 From: Areeb-Aatif Date: Thu, 28 Nov 2019 02:22:50 -0500 Subject: [PATCH 6/6] modified decision tree and random forest code to handle separate train and test data files --- project3/Code/main.py | 58 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/project3/Code/main.py b/project3/Code/main.py index 5a60ced..e187002 100644 --- a/project3/Code/main.py +++ b/project3/Code/main.py @@ -94,8 +94,8 @@ def decision_tree(self, kCrossValidation = 10): print("Running iteration " + str(i+1) + " of k cross validation .....") testData = data.loc[foldSize*i:foldSize*(i+1)-1] trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):]) - # root = dt.decision(trainData, depth=10, minLeafRows=5) - root = dt.decision(trainData, depth=15, minLeafRows=5) + # root = dt.decision(trainData) + root = dt.decision(trainData, depth=10, minLeafRows=3) target = testData.iloc[:,-1].values.tolist() predicted = dt.predictData(testData.iloc[:, :-1], root) models.append(root) @@ -106,7 +106,28 @@ def decision_tree(self, kCrossValidation = 10): precision.append(tmpPrecision) recall.append(tmpRecall) f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) - return accuracy, precision, recall, f_score + + print("\nMetrics on train data with k-cross validation") + h.calculateMetrics(accuracy, precision, recall, f_score) + + fileName = input("\nEnter test data file name without extension (if no test file, just press enter): ") + if fileName != '': + # filePath = "../Data/"+fileName+".txt" + filePath = "CSE-601/project3/Data/"+fileName+".txt" + testData, testLabels = h.readData(filePath) + testData = h.oneHotEncoding(testData, testLabels) + predLabels = [] + for _,row in testData.iloc[:,:-1].iterrows(): + predictedRow = [dt.predictRow(row, root) for root in models] + predLabels.append(max(set(predictedRow), key=predictedRow.count)) + print(predLabels) + truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predLabels, testData.iloc[:,-1].values.tolist()) + accuracy = [h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)] + precision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) + recall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) + f_score = [h.findFMeasure(precision, recall)] + print("\nMetrics on test data with bagging") + h.calculateMetrics(accuracy, [precision], [recall], f_score) def random_forest(self, kCrossValidation = 10): print("\nRunning Random Forest Classifier ....................\n") @@ -141,7 +162,30 @@ def random_forest(self, kCrossValidation = 10): precision.append(tmpPrecision) recall.append(tmpRecall) f_score.append(h.findFMeasure(tmpPrecision, tmpRecall)) - return accuracy, precision, recall, f_score + + print("\nMetrics on train data with k-cross validation") + h.calculateMetrics(accuracy, precision, recall, f_score) + + fileName = input("\nEnter test data file name without extension (if no test file, just press enter): ") + if fileName != '': + # filePath = "../Data/"+fileName+".txt" + filePath = "CSE-601/project3/Data/"+fileName+".txt" + testData, testLabels = h.readData(filePath) + testData = h.oneHotEncoding(testData, testLabels) + predLabels = [] + for forest in models: + predLabels.append(rf.predictForest(testData, forest)) + predLabels = pd.DataFrame(predLabels) + pred = [] + for _, colData in predLabels.iteritems(): + pred.append(colData.value_counts().index[0]) + truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(pred, testData.iloc[:,-1].values.tolist()) + accuracy = [h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)] + precision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) + recall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) + f_score = [h.findFMeasure(precision, recall)] + print("\nMetrics on test data with bagging") + h.calculateMetrics(accuracy, [precision], [recall], f_score) if __name__ == "__main__": m = main() @@ -159,8 +203,7 @@ def random_forest(self, kCrossValidation = 10): accuracy, precision, recall, f_score = m.knn(predictData, trainData) h.calculateMetrics(accuracy, precision, recall, f_score) elif algorithm == 2: - accuracy, precision, recall, f_score = m.decision_tree() - h.calculateMetrics(accuracy, precision, recall, f_score) + m.decision_tree() elif algorithm == 3: print("Enter train File name") trainData = h.get_file_bayes(h.get_fileName(), kCrossValidation = 10) @@ -173,7 +216,6 @@ def random_forest(self, kCrossValidation = 10): accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData) h.calculateMetrics(accuracy, precision, recall, f_score) elif algorithm == 4: - accuracy, precision, recall, f_score = m.random_forest() - h.calculateMetrics(accuracy, precision, recall, f_score) + m.random_forest() else: print("\nWrong input")