From efaf785d05de1281bbdf4ae050b4fa365fa4e62f Mon Sep 17 00:00:00 2001
From: Areeb-Aatif <areebuddin95@gmail.com>
Date: Mon, 11 Nov 2019 15:11:14 -0500
Subject: [PATCH 1/6] initial implementation of decision tree

---
 project3/Code/decision_tree.py | 126 +++++++++++++++++++++++++++++++++
 project3/Code/main.py          |  19 ++++-
 project3/Data/dummy.txt        |  10 +++
 3 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100644 project3/Code/decision_tree.py
 create mode 100644 project3/Data/dummy.txt

diff --git a/project3/Code/decision_tree.py b/project3/Code/decision_tree.py
new file mode 100644
index 0000000..caa5d28
--- /dev/null
+++ b/project3/Code/decision_tree.py
@@ -0,0 +1,126 @@
+import numpy as np
+import pandas as pd
+from math import log
+
+class decisionTree:
+
+    def readData(self, filePath):
+        data = np.genfromtxt(filePath, dtype=None, delimiter="\t", encoding=None)
+        dataDf = pd.DataFrame(data)
+        labels = dataDf.iloc[:,-1]
+        return dataDf.iloc[:,:-1], dataDf.iloc[:,-1]
+
+    def oneHotEncoding(self, data, labels):
+        for colName, colData in data.iteritems():
+            if colData.dtype == np.object:
+                data = pd.concat([data, pd.get_dummies(colData, prefix=colName)], axis=1)
+                data.drop([colName], axis=1, inplace=True)
+
+        return pd.concat([data, labels], axis=1)
+
+    def decision(self, data):
+        print("Running Decision Tree Classifier ....................")
+        root = self.createTree(data.loc[:70*data.shape[0] / 100])
+        # print(root)
+        testData = data.loc[70*data.shape[0] / 100:]
+        target = testData.iloc[:,-1].values.tolist()
+        predicted = self.testData(testData.iloc[:, :-1], root)
+        return target, predicted
+
+    def createTree(self, data):
+        n = Node()
+
+        print(data)
+        if data.iloc[:,-1].value_counts().shape[0] == 1:
+            n.feature = data.iloc[:, -1].iloc[0]
+            return n
+
+        if data.shape[1] == 2:
+            n.feature = data.iloc[:,-1].value_counts().index[0]
+            return n
+
+        bestFeature = self.getBestFeature(data)
+        n.feature = bestFeature
+
+        condition = (data[bestFeature].max() + data[bestFeature].min()) / 2
+        n.condition = condition
+
+        leftChildData = data.loc[data[bestFeature] < condition]
+        leftChildData = leftChildData.drop(bestFeature, axis=1)
+        # print(leftChildData)
+        n.left = self.createTree(leftChildData)
+
+        rightChildData = data.loc[data[bestFeature] >= condition]
+        rightChildData = rightChildData.drop(bestFeature, axis=1)
+        # print(rightChildData)
+        n.right = self.createTree(rightChildData)
+
+        return n
+
+    def getBestFeature(self, data):
+        entropy_p = self.entropy(data)
+        max_gain = float('-inf')
+        bestFeature = 0
+        for colName, colData in data.iloc[:,:-1].iteritems():
+            condition = (colData.max() - colData.min()) / 2
+            entropy_i = 0.0
+
+            subData1 = data.loc[data[colName] < condition]
+            prob1 = len(subData1) / float(len(data))
+            entropy_i += prob1 * self.entropy(subData1)
+
+            subData2 = data.loc[data[colName] >= condition]
+            prob2 = len(subData2) / float(len(data))
+            entropy_i += prob2 * self.entropy(subData2)
+
+            info_gain = entropy_p - entropy_i
+            if info_gain > max_gain:
+                max_gain = info_gain
+                bestFeature = colName
+
+        return bestFeature
+
+    def entropy(self, data):
+        entropy = 0.0
+        labelCounts = data.iloc[:,-1].value_counts()
+        for idx in labelCounts.index:
+            prob = float(labelCounts[idx]) / len(data)
+            entropy -= prob * log(prob, 2)
+
+        return entropy
+
+    def testData(self, data, root):
+        predicted = []
+        for index, row in data.iterrows():
+            predicted.append(self.testRow(row, root))
+
+        return predicted
+
+    def testRow(self, data, root):
+        if not root.left and not root.right:
+            return root.feature
+
+        if data[root.feature] < root.condition:
+            return self.testRow(data, root.left)
+        elif data[root.feature] >= root.condition:
+            return self.testRow(data, root.right)
+
+
+class Node:
+
+    def __init__(self):
+        self.feature = None
+        self.left = None
+        self.right = None
+        self.condition = None
+
+    def __str__(self, level=0):
+        ret = "\t"*level+repr(self.feature)+"\n"
+        if self.left:
+            ret += self.left.__str__(level+1)
+        if self.right:
+            ret += self.right.__str__(level+1)
+        return ret
+
+    def __repr__(self):
+        return '<tree node representation>'
diff --git a/project3/Code/main.py b/project3/Code/main.py
index 78f775c..f0583ce 100644
--- a/project3/Code/main.py
+++ b/project3/Code/main.py
@@ -1,6 +1,6 @@
 from helpers import helpers as hp
 from knn import knn
-from naive_bayes import bayes
+# from naive_bayes import bayes
 
 class main:
     def knn(self, kCrossValidation = 10):
@@ -32,6 +32,21 @@ def knn(self, kCrossValidation = 10):
         print("PRECISION = {}%".format(averagePrecision*100))
         print("RECALL = {}%".format(averageRecall*100))
         print("F MEASURE = {}%".format(averageFscore*100))
+
+    def decision_tree(self, kCrossValidation = 10):
+        from decision_tree import decisionTree
+        h = hp()
+        fileName = h.get_fileName()
+        # filePath = "../Data/"+fileName+".txt"
+        filePath = "CSE-601/project3/Data/"+fileName+".txt"
+        dt = decisionTree()
+        data, labels = dt.readData(filePath)
+        data = dt.oneHotEncoding(data, labels)
+        target, predicted = dt.decision(data)
+        print(target)
+        print(predicted)
+
+        
     
     def bayes_naive(self, kCrossValidation = 10):
         h = hp()
@@ -49,4 +64,4 @@ def bayes_naive(self, kCrossValidation = 10):
             td = h.convertToList(tmp)
 
 
-main().knn()
\ No newline at end of file
+main().decision_tree()
\ No newline at end of file
diff --git a/project3/Data/dummy.txt b/project3/Data/dummy.txt
new file mode 100644
index 0000000..9553899
--- /dev/null
+++ b/project3/Data/dummy.txt
@@ -0,0 +1,10 @@
+Yes	Single	125000	No
+No	Married	100000	No
+No	Single	70000	No
+Yes	Married	120000	No
+No	Divorced	95000	Yes
+No	Married	60000	No
+Yes	Divorced	220000	No
+No	Single	85000	Yes
+No	Married	75000	No
+No	Single	90000	Yes
\ No newline at end of file

From c4cad5e9a671980c41bf2b1800659798579c317b Mon Sep 17 00:00:00 2001
From: Areeb-Aatif <areebuddin95@gmail.com>
Date: Mon, 11 Nov 2019 20:32:11 -0500
Subject: [PATCH 2/6] added optimizations to decision tree

---
 project3/Code/decision_tree.py | 107 ++++++++++++++++++++-------------
 project3/Code/main.py          |  48 ++++++++++++---
 2 files changed, 102 insertions(+), 53 deletions(-)

diff --git a/project3/Code/decision_tree.py b/project3/Code/decision_tree.py
index caa5d28..4293169 100644
--- a/project3/Code/decision_tree.py
+++ b/project3/Code/decision_tree.py
@@ -18,19 +18,17 @@ def oneHotEncoding(self, data, labels):
 
         return pd.concat([data, labels], axis=1)
 
-    def decision(self, data):
-        print("Running Decision Tree Classifier ....................")
-        root = self.createTree(data.loc[:70*data.shape[0] / 100])
-        # print(root)
-        testData = data.loc[70*data.shape[0] / 100:]
+    def decision(self, trainData, testData):
+        # trainData = data.loc[:percentSplit*data.shape[0]]
+        # testData = data.loc[percentSplit*data.shape[0]:]
+        root = self.createTree(trainData)
         target = testData.iloc[:,-1].values.tolist()
-        predicted = self.testData(testData.iloc[:, :-1], root)
-        return target, predicted
+        predicted = self.predictData(testData.iloc[:, :-1], root)
+        return target, predicted, root
 
     def createTree(self, data):
         n = Node()
 
-        print(data)
         if data.iloc[:,-1].value_counts().shape[0] == 1:
             n.feature = data.iloc[:, -1].iloc[0]
             return n
@@ -39,46 +37,56 @@ def createTree(self, data):
             n.feature = data.iloc[:,-1].value_counts().index[0]
             return n
 
-        bestFeature = self.getBestFeature(data)
+        bestFeature, condition = self.getBestFeature(data)
         n.feature = bestFeature
-
-        condition = (data[bestFeature].max() + data[bestFeature].min()) / 2
         n.condition = condition
 
         leftChildData = data.loc[data[bestFeature] < condition]
         leftChildData = leftChildData.drop(bestFeature, axis=1)
-        # print(leftChildData)
-        n.left = self.createTree(leftChildData)
+        if leftChildData.shape[0] == 0:
+            temp = Node()
+            temp.feature = data.iloc[:,-1].value_counts().index[0]
+            n.left = temp
+        else:
+            n.left = self.createTree(leftChildData)
 
         rightChildData = data.loc[data[bestFeature] >= condition]
         rightChildData = rightChildData.drop(bestFeature, axis=1)
-        # print(rightChildData)
-        n.right = self.createTree(rightChildData)
+        if rightChildData.shape[0] == 0:
+            temp = Node()
+            temp.feature = data.iloc[:,-1].value_counts().index[0]
+            n.right = temp
+        else:
+            n.right = self.createTree(rightChildData)
 
         return n
 
     def getBestFeature(self, data):
         entropy_p = self.entropy(data)
         max_gain = float('-inf')
-        bestFeature = 0
+        bestFeature = 0.0
+        bestCondition = 0.0
         for colName, colData in data.iloc[:,:-1].iteritems():
-            condition = (colData.max() - colData.min()) / 2
-            entropy_i = 0.0
+            percent = [0.25, 0.5, 0.75]
+            for p in percent:
+                condition = (colData.max() - colData.min()) * p
+                entropy_i = 0.0
 
-            subData1 = data.loc[data[colName] < condition]
-            prob1 = len(subData1) / float(len(data))
-            entropy_i += prob1 * self.entropy(subData1)
+                subData1 = data.loc[data[colName] < condition]
+                prob1 = len(subData1) / float(len(data))
+                entropy_i += prob1 * self.entropy(subData1)
 
-            subData2 = data.loc[data[colName] >= condition]
-            prob2 = len(subData2) / float(len(data))
-            entropy_i += prob2 * self.entropy(subData2)
+                subData2 = data.loc[data[colName] >= condition]
+                prob2 = len(subData2) / float(len(data))
+                entropy_i += prob2 * self.entropy(subData2)
 
-            info_gain = entropy_p - entropy_i
-            if info_gain > max_gain:
-                max_gain = info_gain
-                bestFeature = colName
+                info_gain = entropy_p - entropy_i
+                if info_gain > max_gain:
+                    max_gain = info_gain
+                    bestFeature = colName
+                    bestCondition = condition
 
-        return bestFeature
+        return bestFeature, bestCondition
 
     def entropy(self, data):
         entropy = 0.0
@@ -89,21 +97,34 @@ def entropy(self, data):
 
         return entropy
 
-    def testData(self, data, root):
+    def predictData(self, data, root):
         predicted = []
         for index, row in data.iterrows():
-            predicted.append(self.testRow(row, root))
+            predicted.append(self.predictRow(row, root))
 
         return predicted
 
-    def testRow(self, data, root):
+    def predictRow(self, data, root):
         if not root.left and not root.right:
             return root.feature
 
         if data[root.feature] < root.condition:
-            return self.testRow(data, root.left)
+            return self.predictRow(data, root.left)
         elif data[root.feature] >= root.condition:
-            return self.testRow(data, root.right)
+            return self.predictRow(data, root.right)
+
+    def findParams(self, predicted, target, tp=1, tn=0):
+        truePositives, trueNegatives, falsePositives, falseNegatives = 0,0,0,0
+        for p, t in zip(predicted, target):
+            if p == tp and t == tp:
+                truePositives+=1
+            elif p == tp and t == tn:
+                falsePositives+=1
+            elif p == tn and t == tp:
+                falseNegatives+=1
+            else:
+                trueNegatives+=1
+        return truePositives, trueNegatives, falsePositives, falseNegatives
 
 
 class Node:
@@ -114,13 +135,13 @@ def __init__(self):
         self.right = None
         self.condition = None
 
-    def __str__(self, level=0):
-        ret = "\t"*level+repr(self.feature)+"\n"
-        if self.left:
-            ret += self.left.__str__(level+1)
-        if self.right:
-            ret += self.right.__str__(level+1)
-        return ret
+    # def __str__(self, level=0):
+    #     ret = "\t"*level+repr(self.feature)+"\n"
+    #     if self.left:
+    #         ret += self.left.__str__(level+1)
+    #     if self.right:
+    #         ret += self.right.__str__(level+1)
+    #     return ret
 
-    def __repr__(self):
-        return '<tree node representation>'
+    # def __repr__(self):
+    #     return '<tree node representation>'
diff --git a/project3/Code/main.py b/project3/Code/main.py
index e6906dd..4f19271 100644
--- a/project3/Code/main.py
+++ b/project3/Code/main.py
@@ -3,6 +3,7 @@
 from naive_bayes import bayes
 from sklearn import preprocessing
 import numpy as np
+import pandas as pd
 
 class main:
     def knn(self, predictData = None, trainData = None, kCrossValidation = 10):
@@ -38,6 +39,7 @@ def knn(self, predictData = None, trainData = None, kCrossValidation = 10):
         return accuracy, precision, recall, f_score
 
     def decision_tree(self, kCrossValidation = 10):
+        print("\nRunning Decision Tree Classifier ....................\n")
         from decision_tree import decisionTree
         h = hp()
         fileName = h.get_fileName()
@@ -46,9 +48,31 @@ def decision_tree(self, kCrossValidation = 10):
         dt = decisionTree()
         data, labels = dt.readData(filePath)
         data = dt.oneHotEncoding(data, labels)
-        target, predicted = dt.decision(data)
-        print(target)
-        print(predicted)
+
+        accuracy = []
+        precision = []
+        recall = []
+        f_score = []
+        models = []
+
+        foldSize = int(data.shape[0] / kCrossValidation)
+        for i in range(kCrossValidation):
+            print("Running iteration " + str(i+1) + " of k cross validation")
+            testData = data.loc[foldSize*i:foldSize*(i+1)-1]
+            trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
+            target, predicted, root = dt.decision(trainData, testData)
+            models.append(root)
+            truePositives, trueNegatives, falsePositives, falseNegatives = dt.findParams(predicted, target)
+            # if truePositives < trueNegatives:
+            #     truePositives, trueNegatives, falsePositives, falseNegatives = trueNegatives, truePositives, falseNegatives, falsePositives
+            accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
+            tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
+            tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
+            precision.append(tmpPrecision)
+            recall.append(tmpRecall)
+            f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
+        return accuracy, precision, recall, f_score
+        
     
     def bayes_naive(self, predictData, trainData, kCrossValidation = 10):
         h = hp()
@@ -88,13 +112,17 @@ def bayes_naive(self, predictData, trainData, kCrossValidation = 10):
 if __name__ == "__main__":
     m = main()
     h = hp()
-    trainData = h.get_file(h.get_fileName())
-    name = h.get_fileName()
-    if name == '':
-        predictData = None
-    else:
-        predictData = h.get_file(name, fileType='predictData')
+    # trainData = h.get_file(h.get_fileName())
+    # name = h.get_fileName()
+    # if name == '':
+    #     predictData = None
+    # else:
+    #     predictData = h.get_file(name, fileType='predictData')
     # accuracy, precision, recall, f_score = m.knn(predictData, trainData)
-    accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData)
+    # accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData)
+    # h.calculateMetrics(accuracy, precision, recall, f_score)
+
+    accuracy, precision, recall, f_score = m.decision_tree()
+    print(accuracy, precision, recall, f_score)
     h.calculateMetrics(accuracy, precision, recall, f_score)
 

From 0c0fe5bdbf63571d5094bf24f9f19a9041cfeafa Mon Sep 17 00:00:00 2001
From: Areeb-Aatif <areebuddin95@gmail.com>
Date: Wed, 20 Nov 2019 15:57:20 -0500
Subject: [PATCH 3/6] experimented with different classification algorithms

---
 project3/Code/decision_tree.py |  65 +++-----
 project3/Code/helpers.py       |  40 +++++
 project3/Code/library_codes.py | 290 +++++++++++++++++++++++++++++++++
 project3/Code/main.py          |  98 +++++++----
 project3/Code/random_forest.py |  36 ++++
 5 files changed, 453 insertions(+), 76 deletions(-)
 create mode 100644 project3/Code/library_codes.py
 create mode 100644 project3/Code/random_forest.py

diff --git a/project3/Code/decision_tree.py b/project3/Code/decision_tree.py
index 4293169..53a977d 100644
--- a/project3/Code/decision_tree.py
+++ b/project3/Code/decision_tree.py
@@ -4,31 +4,22 @@
 
 class decisionTree:
 
-    def readData(self, filePath):
-        data = np.genfromtxt(filePath, dtype=None, delimiter="\t", encoding=None)
-        dataDf = pd.DataFrame(data)
-        labels = dataDf.iloc[:,-1]
-        return dataDf.iloc[:,:-1], dataDf.iloc[:,-1]
-
-    def oneHotEncoding(self, data, labels):
-        for colName, colData in data.iteritems():
-            if colData.dtype == np.object:
-                data = pd.concat([data, pd.get_dummies(colData, prefix=colName)], axis=1)
-                data.drop([colName], axis=1, inplace=True)
-
-        return pd.concat([data, labels], axis=1)
-
-    def decision(self, trainData, testData):
+    def decision(self, trainData):
         # trainData = data.loc[:percentSplit*data.shape[0]]
         # testData = data.loc[percentSplit*data.shape[0]:]
         root = self.createTree(trainData)
-        target = testData.iloc[:,-1].values.tolist()
-        predicted = self.predictData(testData.iloc[:, :-1], root)
-        return target, predicted, root
+        # target = testData.iloc[:,-1].values.tolist()
+        # predicted = self.predictData(testData.iloc[:, :-1], root)
+        # return target, predicted, root
+        return root
 
-    def createTree(self, data):
+    def createTree(self, data, depth=float('inf'), minLeafRows=0):
         n = Node()
 
+        if depth <= 0 or data.shape[0] <= minLeafRows:
+            n.feature = data.iloc[:,-1].value_counts().index[0]
+            return n
+
         if data.iloc[:,-1].value_counts().shape[0] == 1:
             n.feature = data.iloc[:, -1].iloc[0]
             return n
@@ -48,7 +39,7 @@ def createTree(self, data):
             temp.feature = data.iloc[:,-1].value_counts().index[0]
             n.left = temp
         else:
-            n.left = self.createTree(leftChildData)
+            n.left = self.createTree(leftChildData, depth-1, minLeafRows)
 
         rightChildData = data.loc[data[bestFeature] >= condition]
         rightChildData = rightChildData.drop(bestFeature, axis=1)
@@ -57,7 +48,7 @@ def createTree(self, data):
             temp.feature = data.iloc[:,-1].value_counts().index[0]
             n.right = temp
         else:
-            n.right = self.createTree(rightChildData)
+            n.right = self.createTree(rightChildData, depth-1, minLeafRows)
 
         return n
 
@@ -71,7 +62,6 @@ def getBestFeature(self, data):
             for p in percent:
                 condition = (colData.max() - colData.min()) * p
                 entropy_i = 0.0
-
                 subData1 = data.loc[data[colName] < condition]
                 prob1 = len(subData1) / float(len(data))
                 entropy_i += prob1 * self.entropy(subData1)
@@ -113,19 +103,6 @@ def predictRow(self, data, root):
         elif data[root.feature] >= root.condition:
             return self.predictRow(data, root.right)
 
-    def findParams(self, predicted, target, tp=1, tn=0):
-        truePositives, trueNegatives, falsePositives, falseNegatives = 0,0,0,0
-        for p, t in zip(predicted, target):
-            if p == tp and t == tp:
-                truePositives+=1
-            elif p == tp and t == tn:
-                falsePositives+=1
-            elif p == tn and t == tp:
-                falseNegatives+=1
-            else:
-                trueNegatives+=1
-        return truePositives, trueNegatives, falsePositives, falseNegatives
-
 
 class Node:
 
@@ -135,13 +112,13 @@ def __init__(self):
         self.right = None
         self.condition = None
 
-    # def __str__(self, level=0):
-    #     ret = "\t"*level+repr(self.feature)+"\n"
-    #     if self.left:
-    #         ret += self.left.__str__(level+1)
-    #     if self.right:
-    #         ret += self.right.__str__(level+1)
-    #     return ret
+    def __str__(self, level=0):
+        ret = "\t"*level+repr(self.feature)+"\n"
+        if self.left:
+            ret += self.left.__str__(level+1)
+        if self.right:
+            ret += self.right.__str__(level+1)
+        return ret
 
-    # def __repr__(self):
-    #     return '<tree node representation>'
+    def __repr__(self):
+        return '<tree node representation>'
diff --git a/project3/Code/helpers.py b/project3/Code/helpers.py
index 16d318f..1cfa1f0 100644
--- a/project3/Code/helpers.py
+++ b/project3/Code/helpers.py
@@ -1,6 +1,7 @@
 import numpy as np
 from point import point
 import math
+import pandas as pd
 
 class helpers:
     def get_fileName(self):
@@ -192,3 +193,42 @@ def calculateMetrics(self, accuracy, precision, recall, f_score):
         print("PRECISION = {}%".format(averagePrecision*100))
         print("RECALL = {}%".format(averageRecall*100))
         print("F MEASURE = {}%".format(averageFscore*100))
+
+    def readData(self, filePath):
+        '''
+            Read a tab-separated input file for the decision tree and random forest classifiers
+            input: filePath - path of the file; the last column is returned separately as the labels
+            output: Data Points - a pandas dataframe of every column except the last
+                    Labels - a pandas series holding the last column (one label per data point)
+        '''
+        data = np.genfromtxt(filePath, dtype=None, delimiter="\t", encoding=None)
+        dataDf = pd.DataFrame(data)
+        labels = dataDf.iloc[:,-1]
+        return dataDf.iloc[:,:-1], labels
+
+    def oneHotEncoding(self, data, labels):
+        '''
+            One hot encode every object-dtype column of data, then append the labels as the last column
+            input:  data - pandas dataframe of input data
+                    labels - pandas series/dataframe of labels associated with the input data points
+            output: a new dataframe: untouched columns, then indicator columns named <column>_<value>, then labels
+        '''
+        for colName, colData in data.items():  # items(): iteritems() was removed in pandas 2.0
+            if colData.dtype == object:  # builtin object: the np.object alias was removed in NumPy 1.24
+                data = pd.concat([data, pd.get_dummies(colData, prefix=colName)], axis=1)
+                data.drop([colName], axis=1, inplace=True)
+
+        return pd.concat([data, labels], axis=1)
+
+    def findParameters(self, predicted, target, tp='1', tn='0'):
+        '''
+            Tally the binary confusion matrix for paired predictions and ground-truth labels
+            input:  predicted, target - label sequences compared pairwise; tp / tn - positive / negative label
+            output: (truePositives, trueNegatives, falsePositives, falseNegatives); any pair that is not
+                    a true positive, false positive or false negative is counted as a true negative
+        '''
+        cellOf = {(tp, tn): 'fp', (tn, tp): 'fn', (tp, tp): 'tp'}  # (predicted, target) -> cell
+        tally = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0}
+        for pair in zip(predicted, target):
+            tally[cellOf.get(pair, 'tn')] += 1
+        return tally['tp'], tally['tn'], tally['fp'], tally['fn']
diff --git a/project3/Code/library_codes.py b/project3/Code/library_codes.py
new file mode 100644
index 0000000..e3b2297
--- /dev/null
+++ b/project3/Code/library_codes.py
@@ -0,0 +1,290 @@
+from helpers import helpers
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn import metrics
+import xgboost
+import matplotlib.pyplot as plt
+from sklearn import preprocessing
+from sklearn.neural_network import MLPClassifier
+from sklearn.linear_model import LinearRegression
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import normalize
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
+
+def readTrainData():
+
+    fileName = input("enter file name (without extension): ")
+    filePath = "CSE-601/project3/Data/"+fileName+".csv"
+    data = np.genfromtxt(filePath, dtype=None, delimiter=",", encoding=None)
+    data = pd.DataFrame(data)
+    data.drop(data.columns[0], axis=1, inplace=True)
+
+    fileName = input("enter file name (without extension): ")
+    filePath = "CSE-601/project3/Data/"+fileName+".csv"
+    labels = np.genfromtxt(filePath, dtype=None, delimiter=",", encoding=None)
+    labels = pd.DataFrame(labels)
+    labels.drop(labels.index[0], axis=0, inplace=True)
+    labels.drop(labels.columns[0], axis=1, inplace=True)
+    labels = labels.reset_index(drop=True)
+
+    return data, labels
+
+def splitData(data, labels):
+
+    train_features, test_features, train_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = 5)
+    test_labels = test_labels.ravel()
+    train_labels = train_labels.ravel()
+    print('Training Features Shape:', train_features.shape)
+    print('Training Labels Shape:', train_labels.shape)
+    print('Testing Features Shape:', test_features.shape)
+    print('Testing Labels Shape:', test_labels.shape)
+
+    return train_features, test_features, train_labels, test_labels
+
+def calMetrics(test_labels, pred_labels):
+
+    print("Accuracy:", metrics.accuracy_score(test_labels, pred_labels))
+    print("Precision:", metrics.precision_score(test_labels, pred_labels, pos_label='1'))
+    print("Recall:", metrics.recall_score(test_labels, pred_labels, pos_label='1'))
+    print("F-Measure:", metrics.f1_score(test_labels, pred_labels, pos_label='1'))
+    print("F Beta Score:", metrics.fbeta_score(test_labels, pred_labels, beta=0.5, pos_label='1'))
+    # print("MSE:", metrics.mean_squared_error(test_labels, pred_labels))
+
+def readTestData():
+
+    fileName = input("enter test file name (without extension): ")
+    filePath = "CSE-601/project3/Data/"+fileName+".csv"
+    test_features = np.genfromtxt(filePath, dtype=None, delimiter=",", encoding=None)
+    test_features = pd.DataFrame(test_features)
+    test_features.set_index(['f0'], inplace=True)
+    return test_features
+
+def writeToFile(test_features, pred_labels):
+    '''
+        Write one "<row index>,<predicted label>" line per prediction to CSE-601/project3/Data/output8.csv
+        input:  test_features - object whose .index.values supplies the row identifiers
+                pred_labels - predictions, paired positionally with those identifiers
+    '''
+    with open('CSE-601/project3/Data/output8.csv', 'w') as f:  # closed even if a write fails
+        for y, i in zip(pred_labels, test_features.index.values):
+            f.write(str(i) + ',' + str(y) + '\n')
+
+def predict(clf, test_features):
+
+    return clf.predict(test_features)
+
+def rf(data, labels, test_features=None):
+
+    from random_forest import randomForest
+    from helpers import helpers as hp
+    from decision_tree import decisionTree
+    h = hp()
+    rf = randomForest()
+    dt = decisionTree()
+
+    data = pd.concat([data, labels], axis=1)
+    print(data)
+
+    accuracy = []
+    precision = []
+    recall = []
+    f_score = []
+    models = []
+    fb_score = []
+
+    foldSize = int(data.shape[0] / 5)
+    for i in range(5):
+        print("Running iteration " + str(i+1) + " of k cross validation")
+        testData = data.loc[foldSize*i:foldSize*(i+1)-1]
+        trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
+        forest = rf.forest(trainData)
+        target = testData.iloc[:,-1].values.tolist()
+        predicted = rf.predictForest(testData.iloc[:, :-1], forest)
+        models.append(forest)
+        calMetrics(target, predicted)
+        # truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target)
+        # print(truePositives, trueNegatives, falsePositives, falseNegatives)
+        # accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
+        # tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
+        # tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
+        # precision.append(tmpPrecision)
+        # recall.append(tmpRecall)
+        # tm_fscore = h.findFMeasure(tmpPrecision, tmpRecall)
+        # print(tm_fscore)
+        # f_score.append(tm_fscore)
+    
+    h.calculateMetrics(accuracy, precision, recall, f_score)
+
+    # for i in range(3):
+    #     print("Running iteration " + str(i+1) + " of k cross validation")
+    #     testData = data.loc[foldSize*i:foldSize*(i+1)-1]
+    #     trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
+    #     root = dt.decision(trainData)
+    #     target = testData.iloc[:,-1].values.tolist()
+    #     predicted = dt.predictData(testData.iloc[:, :-1], root)
+    #     models.append(root)
+    #     truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target)
+    #     accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
+    #     tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
+    #     tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
+    #     precision.append(tmpPrecision)
+    #     recall.append(tmpRecall)
+    #     f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
+    
+    # print(accuracy, precision, recall, f_score)
+    # h.calculateMetrics(accuracy, precision, recall, f_score)
+
+    # ind = f_score.index(min(f_score))
+    # print(f_score[ind])
+    # pred = rf.predictForest(test_features, models[ind])
+    # print(pred)
+    predicted = pd.DataFrame()
+    for root in models:
+        pred = dt.predictData(test_features, root)
+        predicted = pd.concat([predicted, pd.DataFrame(pred)], axis=1)
+
+    print(predicted)
+
+    p = pd.DataFrame()
+
+    p = []
+    for idx, row in predicted.iterrows():
+        p.append(row.value_counts().index.tolist()[0])
+
+    print(p)
+
+    return p
+    # print([max(set(pred), key=pred.count) for pred in predicted])
+
+def randomForest(train_features, train_labels):
+
+    clf = RandomForestClassifier(n_estimators=100)
+    clf.fit(train_features, train_labels)
+    return clf
+
+def adaBoost(train_features, train_labels):
+
+    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), n_estimators=200)
+    clf.fit(train_features, train_labels)
+
+    return clf
+
+def xgb(train_features, train_labels):
+
+    clf = xgboost.XGBClassifier(random_state=1, learning_rate=0.01, n_estimators=200, max_depth=5)
+    clf.fit(train_features, train_labels)
+    return clf
+
+def nn(train_features, train_labels):
+    mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)
+    mlp.fit(train_features, train_labels)
+    return mlp
+
+def lr(train_features, train_labels):
+    clf = LinearRegression()
+    clf.fit(train_features, train_labels)
+    return clf
+
+if __name__ == "__main__":
+    # Experiment driver: choose k for k-NN by held-out accuracy, then label the unlabeled test file
+    data, labels = readTrainData()
+    # print(labels.iloc[:,0].value_counts())
+    # print(zeros, ones)
+    # trainData = data.iloc[:int(data.shape[0]*0.8)]
+    # trainLabels = labels.iloc[:int(labels.shape[0]*0.8)]
+    # testData = data.iloc[int(data.shape[0]*0.8):]
+    # testLabels = labels.iloc[int(labels.shape[0]*0.8):]
+    # minmaxScaler = preprocessing.MinMaxScaler()
+    # scaledData = minmaxScaler.fit_transform(data)
+    # StandardScaler().fit_transform(data)
+    # pca = PCA(n_components=2)
+    # principalComponents = pca.fit_transform(data.values)
+    # principalDf = pd.DataFrame(data = principalComponents
+    #          , columns = ['principal component 1', 'principal component 2'])
+    # finalDf = pd.concat([principalDf, labels], axis = 1)
+    # print(finalDf.head())
+    
+    # zeros = finalDf[finalDf[1] == '0']
+    # ones = finalDf[finalDf[1] == '1']
+
+    # newDf = pd.concat([zeros, ones])
+    # print(newDf.head())
+    # newdf = newDf.sample(frac=1, random_state=42)
+
+    # data = newdf.drop([1], axis=1)
+    # labels = pd.DataFrame(newdf[1])
+
+    # print(pd.DataFrame(data).shape)
+    # exit()
+    # exit()
+    # labels = np.array(labels)
+    data = np.array(data)
+    train_features, test_features, train_labels, test_labels = splitData(data, np.array(labels))
+    train_features = normalize(train_features)
+    test_features=normalize(test_features)
+    neighbours = np.arange(1,25)
+    train_accuracy =np.empty(len(neighbours))
+    test_accuracy = np.empty(len(neighbours))
+    for i,k in enumerate(neighbours):
+        # score every candidate k so the np.empty arrays plotted and searched below hold real values
+        knn=KNeighborsClassifier(n_neighbors=k,algorithm="kd_tree",n_jobs=-1)
+        knn.fit(train_features,train_labels.ravel())
+        train_accuracy[i] = knn.score(train_features, train_labels.ravel())
+        test_accuracy[i] = knn.score(test_features, test_labels.ravel())
+        
+
+
+    plt.title('k-NN Varying number of neighbors')
+    plt.plot(neighbours, test_accuracy, label='Testing Accuracy')
+    plt.plot(neighbours, train_accuracy, label='Training accuracy')
+    plt.legend()
+    plt.xlabel('Number of neighbors')
+    plt.ylabel('Accuracy')
+    plt.show()
+
+    idx = np.where(test_accuracy == max(test_accuracy))
+    x = neighbours[idx]
+
+    knn=KNeighborsClassifier(n_neighbors=x[0],algorithm="kd_tree",n_jobs=-1)
+    knn.fit(train_features,train_labels.ravel())
+    pred = knn.predict(test_features)
+    # calMetrics(test_labels, pred)
+    # exit()
+    # rf(data, labels)
+    # clf = adaBoost(train_features, train_labels)
+    # clf = nn(train_features, train_labels)
+    # clf = lr(train_features, train_labels)
+
+    # xgboost.plot_importance(clf)
+    # plt.show()  
+
+    # pred = predict(clf, test_features)
+    # p = []
+    # for pr in pred:
+    #     if pr <= 0: p.append('0')
+    #     else: p.append('1')
+    # print(p)
+    # calMetrics(test_labels, p)
+
+    # pred = predict(clf, train_features)
+    # calMetrics(train_labels, pred)
+
+    test_df = readTestData()  # keep the DataFrame: writeToFile reads the row ids from its index
+    pred = knn.predict(normalize(test_df))
+    print(pred)
+    # print(train_features, test_features)
+    # exit()
+
+    # pred = predict(clf, np.array(testData))
+    # calMetrics(np.array(testLabels), pred)
+
+    writeToFile(test_df, pred)
+
+
+
+
diff --git a/project3/Code/main.py b/project3/Code/main.py
index 4f19271..d22ac1d 100644
--- a/project3/Code/main.py
+++ b/project3/Code/main.py
@@ -36,6 +36,41 @@ def knn(self, predictData = None, trainData = None, kCrossValidation = 10):
             recall.append(tmpRecall)
             f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
             predictData = pd 
+        return accuracy, precision, recall, f_score    
+    
+    def bayes_naive(self, predictData, trainData, kCrossValidation = 10):
+        """Score the naive Bayes classifier once per entry of trainData.
+
+        With predictData None, entry i is held out for evaluation and the remaining entries train the model; otherwise all of
+        trainData trains and predictData is evaluated on every pass. Returns (accuracy, precision, recall, f_score) lists."""
+        h = hp()
+        nb = bayes()
+        accuracy, precision, recall, f_score = [], [], [], []
+        requestedPredictData = predictData  # deliberately not named `pd`, which is the pandas alias in this module
+        for i in range(len(trainData)):
+            if predictData is None:
+                predictData = trainData[i]
+                tmp = [lt for j, lt in enumerate(trainData) if j != i]
+            else:
+                tmp = trainData
+            h.normalizeData(tmp)
+            h.normalizeEvaluationSet(predictData)
+            td = h.convertToList(tmp)
+            classPriorProbabilities = nb.findClassPriorProbability(td)
+            classes = nb.segregateClasses(td)
+            descriptorPosteriorProbabilites = nb.findDescriptorPosteriorProbabilites(classes)
+            nb.classify(predictData, classPriorProbabilities, descriptorPosteriorProbabilites)
+            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParams(predictData)
+            # When fewer true positives than true negatives were counted, the positive/negative roles are swapped before scoring.
+            if truePositives < trueNegatives:
+                truePositives, trueNegatives, falsePositives, falseNegatives = trueNegatives, truePositives, falseNegatives, falsePositives
+            accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
+            tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
+            tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
+            precision.append(tmpPrecision)
+            recall.append(tmpRecall)
+            f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
+            predictData = requestedPredictData
         return accuracy, precision, recall, f_score
 
     def decision_tree(self, kCrossValidation = 10):
@@ -45,9 +80,9 @@ def decision_tree(self, kCrossValidation = 10):
         fileName = h.get_fileName()
         # filePath = "../Data/"+fileName+".txt"
         filePath = "CSE-601/project3/Data/"+fileName+".txt"
+        data, labels = h.readData(filePath)
+        data = h.oneHotEncoding(data, labels)
         dt = decisionTree()
-        data, labels = dt.readData(filePath)
-        data = dt.oneHotEncoding(data, labels)
 
         accuracy = []
         precision = []
@@ -60,11 +95,11 @@ def decision_tree(self, kCrossValidation = 10):
             print("Running iteration " + str(i+1) + " of k cross validation")
             testData = data.loc[foldSize*i:foldSize*(i+1)-1]
             trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
-            target, predicted, root = dt.decision(trainData, testData)
+            root = dt.decision(trainData)
+            target = testData.iloc[:,-1].values.tolist()
+            predicted = dt.predictData(testData.iloc[:, :-1], root)
             models.append(root)
-            truePositives, trueNegatives, falsePositives, falseNegatives = dt.findParams(predicted, target)
-            # if truePositives < trueNegatives:
-            #     truePositives, trueNegatives, falsePositives, falseNegatives = trueNegatives, truePositives, falseNegatives, falsePositives
+            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target)
             accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
             tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
             tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
@@ -72,41 +107,41 @@ def decision_tree(self, kCrossValidation = 10):
             recall.append(tmpRecall)
             f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
         return accuracy, precision, recall, f_score
-        
-    
-    def bayes_naive(self, predictData, trainData, kCrossValidation = 10):
+
+    def random_forest(self, kCrossValidation = 10):
+        print("\nRunning Random Forest Classifier ....................\n")
+        from random_forest import randomForest
         h = hp()
-        nb = bayes()
+        fileName = h.get_fileName()
+        # filePath = "../Data/"+fileName+".txt"
+        filePath = "CSE-601/project3/Data/"+fileName+".txt"
+        data, labels = h.readData(filePath)
+        data = h.oneHotEncoding(data, labels)
+        rf = randomForest()
+
         accuracy = []
         precision = []
         recall = []
         f_score = []
-        pd = predictData
-        for i in range(len(trainData)):
-            tmp = None
-            if predictData == None:
-                predictData = trainData[i]
-                tmp = [lt for j, lt in enumerate(trainData) if j != i]
-            else:
-                tmp = trainData
-            h.normalizeData(tmp)
-            h.normalizeEvaluationSet(predictData)
-            td = h.convertToList(tmp)
-            classPriorProbabilities = nb.findClassPriorProbability(td)
-            classes = nb.segregateClasses(td)
-            descriptorPosteriorProbabilites = nb.findDescriptorPosteriorProbabilites(classes)
-            nb.classify(predictData, classPriorProbabilities, descriptorPosteriorProbabilites)
-            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParams(predictData)
-            # print(truePositives, trueNegatives, falsePositives, falseNegatives)
-            if truePositives < trueNegatives:
-                truePositives, trueNegatives, falsePositives, falseNegatives = trueNegatives, truePositives, falseNegatives, falsePositives
+        models = []
+
+        foldSize = int(data.shape[0] / kCrossValidation)
+        for i in range(kCrossValidation):
+            print("Running iteration " + str(i+1) + " of k cross validation")
+            testData = data.loc[foldSize*i:foldSize*(i+1)-1]
+            trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
+            forest = rf.forest(trainData)
+            target = testData.iloc[:,-1].values.tolist()
+            predicted = rf.predictForest(testData.iloc[:, :-1], forest)
+            models.append(forest)
+            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target)
+            print(truePositives, trueNegatives, falsePositives, falseNegatives)
             accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
             tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
             tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
             precision.append(tmpPrecision)
             recall.append(tmpRecall)
             f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
-            predictData = pd
         return accuracy, precision, recall, f_score
 
 if __name__ == "__main__":
@@ -122,7 +157,6 @@ def bayes_naive(self, predictData, trainData, kCrossValidation = 10):
     # accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData)
     # h.calculateMetrics(accuracy, precision, recall, f_score)
 
-    accuracy, precision, recall, f_score = m.decision_tree()
+    accuracy, precision, recall, f_score = m.random_forest()
     print(accuracy, precision, recall, f_score)
     h.calculateMetrics(accuracy, precision, recall, f_score)
-
diff --git a/project3/Code/random_forest.py b/project3/Code/random_forest.py
new file mode 100644
index 0000000..0b7f48d
--- /dev/null
+++ b/project3/Code/random_forest.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pandas as pd
+from decision_tree import decisionTree as dt
+
+class randomForest:
+
+    def forest(self, trainData, numTrees=5, numFeatures=None, numRows=None, maxDepth=10, minLeafRows=5, randomSeed=12):
+        if numFeatures == None:
+            numFeatures = int(np.sqrt(trainData.shape[1]))
+
+        if numRows == None:
+            # numRows = int(trainData.shape[0] * 0.8)
+            numRows = trainData.shape[0]
+
+        forest = [self.createForest(trainData.iloc[:, :-1], trainData.iloc[:,-1], numFeatures, numRows, maxDepth, minLeafRows, randomSeed) for i in range(numTrees)]
+        return forest
+
+        
+    def createForest(self, trainData, labels, numFeatures, numRows, maxDepth, minLeafRows, randomSeed):
+
+        trainData = trainData.sample(numFeatures, axis=1, random_state=randomSeed, replace=False)
+        trainData = pd.concat([trainData, labels], axis=1)
+        trainData = trainData.sample(numRows, axis=0, random_state=randomSeed, replace=False)
+
+        return dt().createTree(trainData, maxDepth, minLeafRows)
+
+    def predictForest(self, testData, forest):
+        predicted = []
+        for _, row in testData.iterrows():
+            predictedRow = [dt().predictRow(row, root) for root in forest]
+            predicted.append(max(set(predictedRow), key=predictedRow.count))
+        return predicted
+
+
+
+

From e67544f6b268a114b6fe55bfa4694ca041aad4b1 Mon Sep 17 00:00:00 2001
From: Areeb-Aatif <areebuddin95@gmail.com>
Date: Tue, 26 Nov 2019 14:36:42 -0500
Subject: [PATCH 4/6] modified the random forest implementation

---
 project3/Code/decision_tree.py |  34 ++++----
 project3/Code/helpers.py       |   4 +-
 project3/Code/library_codes.py | 151 ++++++++++++++++++++++++++-------
 project3/Code/main.py          |  14 ++-
 project3/Code/random_forest.py |  11 ++-
 5 files changed, 156 insertions(+), 58 deletions(-)

diff --git a/project3/Code/decision_tree.py b/project3/Code/decision_tree.py
index 53a977d..f4528f4 100644
--- a/project3/Code/decision_tree.py
+++ b/project3/Code/decision_tree.py
@@ -1,19 +1,18 @@
 import numpy as np
 import pandas as pd
 from math import log
+import random
 
 class decisionTree:
 
-    def decision(self, trainData):
-        # trainData = data.loc[:percentSplit*data.shape[0]]
-        # testData = data.loc[percentSplit*data.shape[0]:]
-        root = self.createTree(trainData)
-        # target = testData.iloc[:,-1].values.tolist()
-        # predicted = self.predictData(testData.iloc[:, :-1], root)
-        # return target, predicted, root
+    def decision(self, trainData, maxFeatures=None, depth=float('inf'), minLeafRows=0, rf=False):
+        features = trainData.columns.values.tolist()
+        features.pop()
+        root = self.createTree(trainData, features, maxFeatures, depth, minLeafRows, rf)
+        # print(root)
         return root
 
-    def createTree(self, data, depth=float('inf'), minLeafRows=0):
+    def createTree(self, data, features, maxFeatures, depth, minLeafRows, rf):
         n = Node()
 
         if depth <= 0 or data.shape[0] <= minLeafRows:
@@ -21,34 +20,37 @@ def createTree(self, data, depth=float('inf'), minLeafRows=0):
             return n
 
         if data.iloc[:,-1].value_counts().shape[0] == 1:
-            n.feature = data.iloc[:, -1].iloc[0]
+            n.feature = data.iloc[:,-1].iloc[0]
             return n
 
-        if data.shape[1] == 2:
+        if len(features) == 0:
             n.feature = data.iloc[:,-1].value_counts().index[0]
             return n
 
-        bestFeature, condition = self.getBestFeature(data)
+        if rf == True: 
+            sampledData = pd.concat([data[random.sample(features, k=maxFeatures)], data.iloc[:,-1]], axis=1)
+            bestFeature, condition = self.getBestFeature(sampledData)
+        else:
+            bestFeature, condition = self.getBestFeature(pd.concat([data[features], data.iloc[:,-1]], axis=1))
+            features = [x for _,x in enumerate(features) if x != bestFeature]
         n.feature = bestFeature
         n.condition = condition
 
         leftChildData = data.loc[data[bestFeature] < condition]
-        leftChildData = leftChildData.drop(bestFeature, axis=1)
         if leftChildData.shape[0] == 0:
             temp = Node()
             temp.feature = data.iloc[:,-1].value_counts().index[0]
             n.left = temp
         else:
-            n.left = self.createTree(leftChildData, depth-1, minLeafRows)
+            n.left = self.createTree(leftChildData, features, maxFeatures, depth-1, minLeafRows, rf)
 
         rightChildData = data.loc[data[bestFeature] >= condition]
-        rightChildData = rightChildData.drop(bestFeature, axis=1)
         if rightChildData.shape[0] == 0:
             temp = Node()
             temp.feature = data.iloc[:,-1].value_counts().index[0]
             n.right = temp
         else:
-            n.right = self.createTree(rightChildData, depth-1, minLeafRows)
+            n.right = self.createTree(rightChildData, features, maxFeatures, depth-1, minLeafRows, rf)
 
         return n
 
@@ -58,7 +60,7 @@ def getBestFeature(self, data):
         bestFeature = 0.0
         bestCondition = 0.0
         for colName, colData in data.iloc[:,:-1].iteritems():
-            percent = [0.25, 0.5, 0.75]
+            percent = [0.2, 0.5, 0.8]
             for p in percent:
                 condition = (colData.max() - colData.min()) * p
                 entropy_i = 0.0
diff --git a/project3/Code/helpers.py b/project3/Code/helpers.py
index b904b6b..38211d2 100644
--- a/project3/Code/helpers.py
+++ b/project3/Code/helpers.py
@@ -6,7 +6,7 @@
 
 class helpers:
     def get_fileName(self):
-        filename = input("enter file name (without extension): ")
+        filename = input("Enter file name (without extension): ")
         return filename
 
     def get_file_bayes(self, filename, kCrossValidation = 10,  fileType='trainData'):
@@ -253,7 +253,7 @@ def oneHotEncoding(self, data, labels):
 
         return pd.concat([data, labels], axis=1)
 
-    def findParameters(self, predicted, target, tp='1', tn='0'):
+    def findParameters(self, predicted, target, tp=1, tn=0):
         truePositives, trueNegatives, falsePositives, falseNegatives = 0,0,0,0
         for p, t in zip(predicted, target):
             if p == tp and t == tp:
diff --git a/project3/Code/library_codes.py b/project3/Code/library_codes.py
index e3b2297..a47b767 100644
--- a/project3/Code/library_codes.py
+++ b/project3/Code/library_codes.py
@@ -15,6 +15,7 @@
 from sklearn.preprocessing import normalize
 from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
+from scipy import stats
 
 def readTrainData():
 
@@ -36,6 +37,7 @@ def readTrainData():
 
 def splitData(data, labels):
 
+
     train_features, test_features, train_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = 5)
     test_labels = test_labels.ravel()
     train_labels = train_labels.ravel()
@@ -66,7 +68,7 @@ def readTestData():
 
 def writeToFile(test_features, pred_labels):
 
-    f = open('CSE-601/project3/Data/output8.csv', 'w')
+    f = open('CSE-601/project3/Data/output5.csv', 'w')
     for y, i in zip(pred_labels, test_features.index.values):
         f.write(str(i))
         f.write(',')
@@ -78,6 +80,48 @@ def predict(clf, test_features):
 
     return clf.predict(test_features)
 
+def dt(data, labels, test_features=None):
+    """Run 10-fold cross-validation of the project decisionTree on data+labels; returns (trainAccuracy, testAccuracy, precision, recall, models, dt). test_features is unused."""
+    from helpers import helpers as hp
+    from decision_tree import decisionTree
+    h = hp()
+    dt = decisionTree()
+    data = pd.concat([data, labels], axis=1)
+    data.dropna(inplace=True)
+    print(data.head())
+    trainAccuracy = []
+    testAccuracy = []
+    precision = []
+    recall = []
+    f_score = []
+    models = []
+
+    foldSize = int(data.shape[0] / 10)
+    for i in range(10):
+        print("Running iteration " + str(i+1) + " of k cross validation")
+        testData = data.loc[foldSize*i:foldSize*(i+1)-1]
+        # Training rows are everything outside the fold, minus rows where any feature's column-wise z-score is not below 3 in magnitude.
+        trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
+        trainData = trainData[(np.abs(stats.zscore(trainData.iloc[:,:-1])) < 3).all(axis=1)]
+        root = dt.decision(trainData, depth=10, minLeafRows=5)
+        testTarget = testData.iloc[:,-1].values.tolist()
+        # NOTE(review): test features are z-scored per row (axis=1) below, but the tree was fit on unscaled training features - confirm intended.
+        testPredicted = dt.predictData(pd.DataFrame(stats.zscore(testData.iloc[:,:-1], axis=1), columns=testData.columns.values.tolist()[:-1]), root)
+        trainTarget = trainData.iloc[:,-1].values.tolist()
+        trainPredicted = dt.predictData(trainData.iloc[:, :-1], root)
+        models.append(root)
+        truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(trainPredicted, trainTarget)
+        trainAccuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
+        truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(testPredicted, testTarget)
+        testAccuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
+        tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
+        tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
+        precision.append(tmpPrecision)
+        recall.append(tmpRecall)
+        f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
+    h.calculateMetrics(testAccuracy, precision, recall, f_score)
+    return trainAccuracy, testAccuracy, precision, recall, models, dt
+
 def rf(data, labels, test_features=None):
 
     from random_forest import randomForest
@@ -88,7 +132,7 @@ def rf(data, labels, test_features=None):
     dt = decisionTree()
 
     data = pd.concat([data, labels], axis=1)
-    print(data)
+    # print(data)
 
     accuracy = []
     precision = []
@@ -120,21 +164,7 @@ def rf(data, labels, test_features=None):
     
     h.calculateMetrics(accuracy, precision, recall, f_score)
 
-    # for i in range(3):
-    #     print("Running iteration " + str(i+1) + " of k cross validation")
-    #     testData = data.loc[foldSize*i:foldSize*(i+1)-1]
-    #     trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
-    #     root = dt.decision(trainData)
-    #     target = testData.iloc[:,-1].values.tolist()
-    #     predicted = dt.predictData(testData.iloc[:, :-1], root)
-    #     models.append(root)
-    #     truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target)
-    #     accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
-    #     tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
-    #     tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
-    #     precision.append(tmpPrecision)
-    #     recall.append(tmpRecall)
-    #     f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
+    
     
     # print(accuracy, precision, recall, f_score)
     # h.calculateMetrics(accuracy, precision, recall, f_score)
@@ -193,6 +223,18 @@ def lr(train_features, train_labels):
 if __name__ == "__main__":
 
     data, labels = readTrainData()
+    # labels.rename(columns={1: 100}, inplace=True)
+    # data = stats.zscore(data, axis=1)
+    # data = pd.DataFrame(data)
+    # print(data.skew(axis=0))
+    # data.drop(data.columns[1], axis=1, inplace=True)
+    # print(data.skew(axis=0))
+    # print(data.head())
+    # exit()
+    # for c in data.columns:
+    #     data[c] = np.log10(data[c])
+    # print(data.skew(axis=0))
+    # exit()
     # print(labels.iloc[:,0].value_counts())
     # print(zeros, ones)
     # trainData = data.iloc[:int(data.shape[0]*0.8)]
@@ -223,37 +265,69 @@ def lr(train_features, train_labels):
     # exit()
     # exit()
     # labels = np.array(labels)
+    data = data.sample(data.shape[0], axis=0, random_state=12, replace=False)
     data = np.array(data)
     train_features, test_features, train_labels, test_labels = splitData(data, np.array(labels))
-    train_features = normalize(train_features)
-    test_features=normalize(test_features)
+    train_features = pd.DataFrame(train_features)
+    train_labels = pd.DataFrame(train_labels)
+    test_features = pd.DataFrame(test_features)
+    test_labels = pd.DataFrame(test_labels)
+    # print(train_features.skew(axis=0).sort_values(ascending=False))
+    train_features = pd.DataFrame(stats.zscore(train_features, axis=1), columns=train_features.columns)
+    print(train_features.skew(axis=0).sort_values(ascending=False))
+    test_features = pd.DataFrame(stats.zscore(test_features, axis=1), columns=test_features.columns)
+    print(test_features.skew(axis=0).sort_values(ascending=False))
+    # train_features = train_features[(np.abs(stats.zscore(train_features)) < 1).all(axis=1)]
+
+    train_features.drop(columns=[29, 91], axis=1, inplace=True)
+    # train_features.drop(columns=[41], inplace=True)
+    # print(train_features.skew(axis=0).sort_values(ascending=False))
+    trainData = pd.concat([train_features, train_labels], axis=1)
+    trainData.dropna(inplace=True)
+    train_features = np.array(trainData.iloc[:,:-1])
+    train_labels = np.array(trainData.iloc[:,-1])
+
+    test_features.drop(columns=[29, 91], axis=1, inplace=True)
+    # test_features.drop(columns=[41], axis=1, inplace=True)
+    testData = pd.concat([test_features, test_labels], axis=1)
+    testData.dropna(inplace=True)
+    test_features = np.array(testData.iloc[:,:-1])
+    test_labels = np.array(testData.iloc[:,-1])
+    # train_features = normalize(train_features)
+    # test_features=normalize(test_features)
     neighbours = np.arange(1,25)
     train_accuracy =np.empty(len(neighbours))
     test_accuracy = np.empty(len(neighbours))
     for i,k in enumerate(neighbours):
-        # knn=KNeighborsClassifier(n_neighbors=k,algorithm="kd_tree",n_jobs=-1)
-        # knn.fit(train_features,train_labels.ravel())
-        # train_accuracy[i] = knn.score(train_features, train_labels.ravel())
-        # test_accuracy[i] = knn.score(test_features, test_labels.ravel())
-        clf = xgb(train_features, train_labels)
+        knn=KNeighborsClassifier(n_neighbors=k,algorithm="kd_tree",n_jobs=-1)
+        knn.fit(train_features,train_labels.ravel())
+        train_accuracy[i] = knn.score(train_features, train_labels.ravel())
+        test_accuracy[i] = knn.score(test_features, test_labels.ravel())
+        # clf = xgb(train_features, train_labels)
         
-
-
+    # trainAccuracy, testAccuracy, precision, recall, models, dt = dt(data, labels)
+    # neighbours = np.arange(1,11)
     plt.title('k-NN Varying number of neighbors')
-    plt.plot(neighbours, test_accuracy, label='Testing Accuracy')
-    plt.plot(neighbours, train_accuracy, label='Training accuracy')
+    plt.plot(neighbours, train_accuracy, label='Training Accuracy')
+    plt.plot(neighbours, test_accuracy, label='Testing accuracy')
     plt.legend()
     plt.xlabel('Number of neighbors')
     plt.ylabel('Accuracy')
     plt.show()
 
     idx = np.where(test_accuracy == max(test_accuracy))
+    # idx = testAccuracy.index(max(testAccuracy))
+    # idx = sorted(range(len(testAccuracy)), key=lambda i: testAccuracy[i], reverse=True)[:5]
+    # print(idx)
+    # print(testAccuracy[idx])
+    # print(trainAccuracy[idx])
     x = neighbours[idx]
-
+    # roots = [models[i] for i in idx]
+    # print(testAccuracy)
     knn=KNeighborsClassifier(n_neighbors=x[0],algorithm="kd_tree",n_jobs=-1)
     knn.fit(train_features,train_labels.ravel())
     pred = knn.predict(test_features)
-    # calMetrics(test_labels, pred)
+    calMetrics(test_labels, pred)
     # exit()
     # rf(data, labels)
     # clf = adaBoost(train_features, train_labels)
@@ -273,10 +347,23 @@ def lr(train_features, train_labels):
 
     # pred = predict(clf, train_features)
     # calMetrics(train_labels, pred)
-
-    test_features = normalize(readTestData())
+    # print(root)
+    test_features = readTestData()
+    test_features.drop(columns=['f30', 'f92'], inplace=True)
+    # test_features = pd.DataFrame(stats.zscore(test_features), index=test_features.index)
+    print(test_features.skew(axis=0).sort_values(ascending=False))
+    # test_features = pd.DataFrame(stats.zscore(test_features, axis=1), index=test_features.index)
+    # test_features.drop(test_features.columns[1], axis=1, inplace=True)
+    # test_features = pd.DataFrame(stats.zscore(test_features, axis=1))
+    # print(test_features.skew(axis=0))
+    # pred = []
+    # for _, row in test_features.iterrows():
+    #     predictedRow = [dt.predictRow(row, root) for root in roots]
+    #     pred.append(max(set(predictedRow), key=predictedRow.count))
     pred = knn.predict(np.array(test_features))
     print(pred)
+    # pred = knn.predict(np.array(test_features))
+    # print(pred)
     # print(train_features, test_features)
     # exit()
 
diff --git a/project3/Code/main.py b/project3/Code/main.py
index 179979e..2bf7370 100644
--- a/project3/Code/main.py
+++ b/project3/Code/main.py
@@ -4,6 +4,7 @@
 from sklearn import preprocessing
 import numpy as np
 import pandas as pd
+import matplotlib.pyplot as plt
 
 class main:
     def knn(self, predictData = None, trainData = None):
@@ -90,9 +91,10 @@ def decision_tree(self, kCrossValidation = 10):
 
         foldSize = int(data.shape[0] / kCrossValidation)
         for i in range(kCrossValidation):
-            print("Running iteration " + str(i+1) + " of k cross validation")
+            print("Running iteration " + str(i+1) + " of k cross validation .....")
             testData = data.loc[foldSize*i:foldSize*(i+1)-1]
             trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
+            # root = dt.decision(trainData, depth=10, minLeafRows=5)
             root = dt.decision(trainData)
             target = testData.iloc[:,-1].values.tolist()
             predicted = dt.predictData(testData.iloc[:, :-1], root)
@@ -125,7 +127,7 @@ def random_forest(self, kCrossValidation = 10):
 
         foldSize = int(data.shape[0] / kCrossValidation)
         for i in range(kCrossValidation):
-            print("Running iteration " + str(i+1) + " of k cross validation")
+            print("Running iteration " + str(i+1) + " of k cross validation .....")
             testData = data.loc[foldSize*i:foldSize*(i+1)-1]
             trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
             forest = rf.forest(trainData)
@@ -157,6 +159,9 @@ def random_forest(self, kCrossValidation = 10):
             predictData = h.get_file(name, fileType='predictData')
         accuracy, precision, recall, f_score = m.knn(predictData, trainData)
         h.calculateMetrics(accuracy, precision, recall, f_score)
+    elif algorithm == 2:
+        accuracy, precision, recall, f_score = m.decision_tree()
+        h.calculateMetrics(accuracy, precision, recall, f_score)
     elif algorithm == 3:
         print("Enter train File name")
         trainData = h.get_file_bayes(h.get_fileName(), kCrossValidation = 10)
@@ -168,3 +173,8 @@ def random_forest(self, kCrossValidation = 10):
             predictData = h.get_file_bayes(name, fileType='predictData')
         accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData)
         h.calculateMetrics(accuracy, precision, recall, f_score)
+    elif algorithm == 4:
+        accuracy, precision, recall, f_score = m.random_forest()
+        h.calculateMetrics(accuracy, precision, recall, f_score)
+    else:
+        print("\nWrong input")
diff --git a/project3/Code/random_forest.py b/project3/Code/random_forest.py
index 0b7f48d..10052a6 100644
--- a/project3/Code/random_forest.py
+++ b/project3/Code/random_forest.py
@@ -4,7 +4,7 @@
 
 class randomForest:
 
-    def forest(self, trainData, numTrees=5, numFeatures=None, numRows=None, maxDepth=10, minLeafRows=5, randomSeed=12):
+    def forest(self, trainData, numTrees=5, numFeatures=None, numRows=None, maxDepth=10, minLeafRows=3, randomSeed=12):
         if numFeatures == None:
             numFeatures = int(np.sqrt(trainData.shape[1]))
 
@@ -12,17 +12,16 @@ def forest(self, trainData, numTrees=5, numFeatures=None, numRows=None, maxDepth
             # numRows = int(trainData.shape[0] * 0.8)
             numRows = trainData.shape[0]
 
-        forest = [self.createForest(trainData.iloc[:, :-1], trainData.iloc[:,-1], numFeatures, numRows, maxDepth, minLeafRows, randomSeed) for i in range(numTrees)]
+        forest = [self.createForest(trainData,  numFeatures, numRows, maxDepth, minLeafRows, randomSeed) for i in range(numTrees)]
         return forest
 
         
-    def createForest(self, trainData, labels, numFeatures, numRows, maxDepth, minLeafRows, randomSeed):
+    def createForest(self, trainData, numFeatures, numRows, maxDepth, minLeafRows, randomSeed):
 
-        trainData = trainData.sample(numFeatures, axis=1, random_state=randomSeed, replace=False)
-        trainData = pd.concat([trainData, labels], axis=1)
+        # trainData = trainData.sample(numFeatures, axis=1, random_state=randomSeed, replace=False)
         trainData = trainData.sample(numRows, axis=0, random_state=randomSeed, replace=False)
 
-        return dt().createTree(trainData, maxDepth, minLeafRows)
+        return dt().decision(trainData, maxFeatures=numFeatures, depth=maxDepth, minLeafRows=minLeafRows, rf=True)
 
     def predictForest(self, testData, forest):
         predicted = []

From 7efc5b572b2e396ee1ae35410990080248615de0 Mon Sep 17 00:00:00 2001
From: Areeb-Aatif <areebuddin95@gmail.com>
Date: Wed, 27 Nov 2019 18:48:43 -0500
Subject: [PATCH 5/6] limited the depth of decision tree

---
 project3/Code/main.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/project3/Code/main.py b/project3/Code/main.py
index 2bf7370..5a60ced 100644
--- a/project3/Code/main.py
+++ b/project3/Code/main.py
@@ -95,7 +95,7 @@ def decision_tree(self, kCrossValidation = 10):
             testData = data.loc[foldSize*i:foldSize*(i+1)-1]
             trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
             # root = dt.decision(trainData, depth=10, minLeafRows=5)
-            root = dt.decision(trainData)
+            root = dt.decision(trainData, depth=15, minLeafRows=5)
             target = testData.iloc[:,-1].values.tolist()
             predicted = dt.predictData(testData.iloc[:, :-1], root)
             models.append(root)
@@ -135,7 +135,6 @@ def random_forest(self, kCrossValidation = 10):
             predicted = rf.predictForest(testData.iloc[:, :-1], forest)
             models.append(forest)
             truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target)
-            print(truePositives, trueNegatives, falsePositives, falseNegatives)
             accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
             tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
             tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)

From 37686e8a8e70007a93df8cfea2a6bf9e5f1c68a7 Mon Sep 17 00:00:00 2001
From: Areeb-Aatif <areebuddin95@gmail.com>
Date: Thu, 28 Nov 2019 02:22:50 -0500
Subject: [PATCH 6/6] modified decision tree and random forest code to handle
 separate train and test data files

---
 project3/Code/main.py | 58 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/project3/Code/main.py b/project3/Code/main.py
index 5a60ced..e187002 100644
--- a/project3/Code/main.py
+++ b/project3/Code/main.py
@@ -94,8 +94,8 @@ def decision_tree(self, kCrossValidation = 10):
             print("Running iteration " + str(i+1) + " of k cross validation .....")
             testData = data.loc[foldSize*i:foldSize*(i+1)-1]
             trainData = data.loc[:foldSize*i-1].append(data.loc[foldSize*(i+1):])
-            # root = dt.decision(trainData, depth=10, minLeafRows=5)
-            root = dt.decision(trainData, depth=15, minLeafRows=5)
+            # root = dt.decision(trainData)
+            root = dt.decision(trainData, depth=10, minLeafRows=3)
             target = testData.iloc[:,-1].values.tolist()
             predicted = dt.predictData(testData.iloc[:, :-1], root)
             models.append(root)
@@ -106,7 +106,28 @@ def decision_tree(self, kCrossValidation = 10):
             precision.append(tmpPrecision)
             recall.append(tmpRecall)
             f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
-        return accuracy, precision, recall, f_score
+        
+        print("\nMetrics on train data with k-cross validation")
+        h.calculateMetrics(accuracy, precision, recall, f_score)  # the per-fold metric lists built in the loop above
+
+        fileName = input("\nEnter test data file name without extension (if no test file, just press enter): ")
+        if fileName != '':  # empty input skips the separate test-file evaluation
+            # filePath = "../Data/"+fileName+".txt"
+            filePath = "CSE-601/project3/Data/"+fileName+".txt"  # NOTE(review): relative to the current working directory - confirm the launch directory
+            testData, testLabels = h.readData(filePath)
+            testData = h.oneHotEncoding(testData, testLabels)  # presumably returns features with the labels as last column (read via iloc[:,-1] below) - verify in helper
+            predLabels = []  # one voted label per test row
+            for _,row in testData.iloc[:,:-1].iterrows():  # the last column is left out of the features
+                predictedRow = [dt.predictRow(row, root) for root in models]  # one prediction from each fold's tree kept in models
+                predLabels.append(max(set(predictedRow), key=predictedRow.count))  # most frequent prediction wins; ties follow set iteration order
+            print(predLabels)
+            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predLabels, testData.iloc[:,-1].values.tolist())  # predictions vs. last-column values
+            accuracy = [h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)]  # rebinds the per-fold list to a one-element list
+            precision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)  # single value; wrapped in a list for calculateMetrics below
+            recall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)  # single value; wrapped in a list for calculateMetrics below
+            f_score = [h.findFMeasure(precision, recall)]
+            print("\nMetrics on test data with bagging")
+            h.calculateMetrics(accuracy, [precision], [recall], f_score)
 
     def random_forest(self, kCrossValidation = 10):
         print("\nRunning Random Forest Classifier ....................\n")
@@ -141,7 +162,30 @@ def random_forest(self, kCrossValidation = 10):
             precision.append(tmpPrecision)
             recall.append(tmpRecall)
             f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
-        return accuracy, precision, recall, f_score
+        
+        print("\nMetrics on train data with k-cross validation")
+        h.calculateMetrics(accuracy, precision, recall, f_score)  # the per-fold metric lists built in the loop above
+
+        fileName = input("\nEnter test data file name without extension (if no test file, just press enter): ")
+        if fileName != '':  # empty input skips the separate test-file evaluation
+            # filePath = "../Data/"+fileName+".txt"
+            filePath = "CSE-601/project3/Data/"+fileName+".txt"  # NOTE(review): relative to the current working directory - confirm the launch directory
+            testData, testLabels = h.readData(filePath)
+            testData = h.oneHotEncoding(testData, testLabels)
+            predLabels = []  # one predictForest result per forest kept in models
+            for forest in models:
+                predLabels.append(rf.predictForest(testData.iloc[:, :-1], forest))  # features only, as in the cross-validation call; last column is the target
+            predLabels = pd.DataFrame(predLabels)  # presumably one row per forest and one column per test row - verify predictForest's return shape
+            pred = []  # majority label per column of predLabels
+            for _, colData in predLabels.items():  # items() instead of iteritems(), which pandas 2.0 removed
+                pred.append(colData.value_counts().index[0])  # value_counts orders by frequency, so index[0] is the most common label
+            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(pred, testData.iloc[:,-1].values.tolist())  # predictions vs. last-column values
+            accuracy = [h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives)]  # rebinds the per-fold list to a one-element list
+            precision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)  # single value; wrapped in a list for calculateMetrics below
+            recall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)  # single value; wrapped in a list for calculateMetrics below
+            f_score = [h.findFMeasure(precision, recall)]
+            print("\nMetrics on test data with bagging")
+            h.calculateMetrics(accuracy, [precision], [recall], f_score)
 
 if __name__ == "__main__":
     m = main()
@@ -159,8 +203,7 @@ def random_forest(self, kCrossValidation = 10):
         accuracy, precision, recall, f_score = m.knn(predictData, trainData)
         h.calculateMetrics(accuracy, precision, recall, f_score)
     elif algorithm == 2:
-        accuracy, precision, recall, f_score = m.decision_tree()
-        h.calculateMetrics(accuracy, precision, recall, f_score)
+        m.decision_tree()
     elif algorithm == 3:
         print("Enter train File name")
         trainData = h.get_file_bayes(h.get_fileName(), kCrossValidation = 10)
@@ -173,7 +216,6 @@ def random_forest(self, kCrossValidation = 10):
         accuracy, precision, recall, f_score = m.bayes_naive(predictData, trainData)
         h.calculateMetrics(accuracy, precision, recall, f_score)
     elif algorithm == 4:
-        accuracy, precision, recall, f_score = m.random_forest()
-        h.calculateMetrics(accuracy, precision, recall, f_score)
+        m.random_forest()
     else:
         print("\nWrong input")