Skip to content

Commit

Permalink
Merge pull request #35 from mkamran37/decisionTree
Browse files Browse the repository at this point in the history
Decision tree
  • Loading branch information
Areeb-Aatif authored Nov 28, 2019
2 parents 51bb06c + 37686e8 commit 1562f4f
Show file tree
Hide file tree
Showing 6 changed files with 713 additions and 2 deletions.
126 changes: 126 additions & 0 deletions project3/Code/decision_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import numpy as np
import pandas as pd
from math import log
import random

class decisionTree:
    '''
    Binary decision tree classifier grown greedily by information gain.

    Training frames hold the feature columns first and the class label in
    the last column. Every internal node tests one feature against a numeric
    threshold: rows with value < threshold go left, rows with
    value >= threshold go right. Leaf nodes have no children and store the
    predicted label in their ``feature`` attribute.
    '''

    # Fractions of a feature's [min, max] range that are tried as thresholds.
    SPLIT_FRACTIONS = (0.2, 0.5, 0.8)

    def decision(self, trainData, maxFeatures=None, depth=float('inf'), minLeafRows=0, rf=False):
        '''
        Train a tree and return its root node.
        input: trainData - pandas dataframe whose last column is the label
               maxFeatures - number of features sampled at each node when rf is True
               depth - maximum number of split levels below the root
               minLeafRows - a node with this many rows or fewer becomes a leaf
               rf - when True, each node only considers a random feature subset
        output: root Node of the trained tree
        '''
        features = trainData.columns[:-1].tolist()
        return self.createTree(trainData, features, maxFeatures, depth, minLeafRows, rf)

    def createTree(self, data, features, maxFeatures, depth, minLeafRows, rf):
        '''
        Recursively grow the subtree for the rows in data.
        input: data - dataframe of the rows reaching this node (label in last column)
               features - names of the features still available for splitting
               maxFeatures, depth, minLeafRows, rf - see decision()
        output: Node that is the root of the subtree
        '''
        labels = data.iloc[:, -1]
        labelCounts = labels.value_counts()
        # value_counts sorts by frequency, so the first index is the majority label.
        majority = labelCounts.index[0] if len(labelCounts) else None

        # Every stopping rule (depth exhausted, too few rows, pure node, no
        # features left) resolves to the same majority-label leaf.
        if (depth <= 0 or data.shape[0] <= minLeafRows
                or len(labelCounts) <= 1 or len(features) == 0):
            return self._makeLeaf(majority)

        if rf:
            # The feature list shrinks as the tree deepens, so clamp the sample
            # size; random.sample raises when k exceeds the population.
            if maxFeatures is None:
                sampleSize = len(features)
            else:
                sampleSize = max(1, min(maxFeatures, len(features)))
            candidates = random.sample(features, k=sampleSize)
        else:
            candidates = features
        bestFeature, condition = self.getBestFeature(
            pd.concat([data[candidates], labels], axis=1))

        # A feature is used at most once along any root-to-leaf path.
        remaining = [x for x in features if x != bestFeature]

        n = Node()
        n.feature = bestFeature
        n.condition = condition

        # Two explicit comparisons (rather than a negated mask) so rows with a
        # missing value in the split column are excluded from both children.
        leftChildData = data.loc[data[bestFeature] < condition]
        rightChildData = data.loc[data[bestFeature] >= condition]

        if leftChildData.shape[0] == 0:
            n.left = self._makeLeaf(majority)
        else:
            n.left = self.createTree(leftChildData, remaining, maxFeatures, depth-1, minLeafRows, rf)

        if rightChildData.shape[0] == 0:
            n.right = self._makeLeaf(majority)
        else:
            n.right = self.createTree(rightChildData, remaining, maxFeatures, depth-1, minLeafRows, rf)

        return n

    def _makeLeaf(self, label):
        '''Return a childless Node whose feature attribute holds the predicted label.'''
        leaf = Node()
        leaf.feature = label
        return leaf

    def getBestFeature(self, data):
        '''
        Find the (feature, threshold) pair with the highest information gain.
        input: data - dataframe of candidate feature columns followed by the label column
        output: bestFeature - column name of the chosen feature
                bestCondition - threshold; rows with value < threshold go left
        '''
        bestFeature = 0.0
        bestCondition = 0.0
        total = float(len(data))
        if total == 0:
            # Nothing to evaluate; keep the placeholder defaults instead of dividing by zero.
            return bestFeature, bestCondition

        entropy_p = self.entropy(data)
        max_gain = float('-inf')
        for colName, colData in data.iloc[:, :-1].items():
            if colData.dtype == bool:
                # NumPy does not support subtraction on booleans.
                colData = colData.astype(int)
            low = colData.min()
            span = colData.max() - low
            for p in self.SPLIT_FRACTIONS:
                # Thresholds sit inside the observed range: min + fraction * (max - min).
                condition = low + span * p
                subData1 = data.loc[colData < condition]
                subData2 = data.loc[colData >= condition]

                entropy_i = (len(subData1) / total) * self.entropy(subData1)
                entropy_i += (len(subData2) / total) * self.entropy(subData2)

                info_gain = entropy_p - entropy_i
                if info_gain > max_gain:
                    max_gain = info_gain
                    bestFeature = colName
                    bestCondition = condition

        return bestFeature, bestCondition

    def entropy(self, data):
        '''
        Compute the base-2 entropy of the label column (the last column) of data.
        input: data - dataframe whose last column is the label
        output: entropy as a float; 0.0 for an empty frame
        '''
        total = len(data)
        entropy = 0.0
        for count in data.iloc[:, -1].value_counts():
            if count == 0:
                # Zero counts contribute nothing and log(0) is undefined.
                continue
            prob = float(count) / total
            entropy -= prob * log(prob, 2)

        return entropy

    def predictData(self, data, root):
        '''
        Predict a label for every row of data.
        input: data - dataframe of feature columns
               root - root Node of a trained tree
        output: list of predicted labels, one per row, in row order
        '''
        return [self.predictRow(row, root) for _, row in data.iterrows()]

    def predictRow(self, data, root):
        '''
        Walk the tree for a single row and return the label of the leaf reached.
        input: data - one row, indexable by feature name
               root - root Node of a trained tree
        output: predicted label, or None when the row's value for a tested
                feature compares as neither < nor >= the threshold (e.g. NaN)
        '''
        node = root
        while node.left is not None or node.right is not None:
            value = data[node.feature]
            if value < node.condition:
                node = node.left
            elif value >= node.condition:
                node = node.right
            else:
                return None
        return node.feature


class Node:
    '''
    One node of the decision tree.

    feature   - split column for an internal node, predicted label for a leaf
    condition - split threshold (None on leaves)
    left      - child for rows below the threshold
    right     - child for rows at or above the threshold
    '''

    def __init__(self):
        self.feature, self.condition = None, None
        self.left, self.right = None, None

    def __str__(self, level=0):
        # One line per node, indented with one tab per tree level; the left
        # subtree is printed before the right one.
        pieces = ["\t" * level, repr(self.feature), "\n"]
        for child in (self.left, self.right):
            if child:
                pieces.append(child.__str__(level + 1))
        return "".join(pieces)

    def __repr__(self):
        return '<tree node representation>'
42 changes: 41 additions & 1 deletion project3/Code/helpers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import numpy as np
from point import point
import math
import pandas as pd
from collections import defaultdict

class helpers:
def get_fileName(self):
    '''
    Prompt the user once for a file name and return what was typed.
    output: the entered file name (without extension), as a string
    '''
    # Only the current prompt is kept; the stale lowercase prompt made the
    # user answer twice and threw the first answer away.
    return input("Enter file name (without extension): ")

def get_file_bayes(self, filename, kCrossValidation = 10, fileType='trainData'):
Expand Down Expand Up @@ -225,3 +226,42 @@ def calculateMetrics(self, accuracy, precision, recall, f_score):
print("PRECISION = {}%".format(averagePrecision*100))
print("RECALL = {}%".format(averageRecall*100))
print("F MEASURE = {}%".format(averageFscore*100))

def readData(self, filePath):
    '''
    Load a tab-delimited data file for the decision tree / random forest classifiers.
    input: filePath - path of the tab-separated file to read
    output: Data Points - a pandas dataframe with every column except the last
            Labels - the last column of the file (a pandas series)
    '''
    # dtype=None lets numpy infer each column's type; encoding=None is passed
    # through unchanged from the original call.
    parsed = np.genfromtxt(filePath, dtype=None, delimiter="\t", encoding=None)
    frame = pd.DataFrame(parsed)
    points = frame.iloc[:, :-1]
    labels = frame.iloc[:, -1]
    return points, labels

def oneHotEncoding(self, data, labels):
    '''
    One hot encode the text columns of the input data, then append the labels.
    input: data - pandas dataframe of input data points
           labels - pandas series/dataframe of labels associated with the data points
    output: a new dataframe holding the untouched columns in their original order,
            followed by the 0/1 indicator columns (named <column>_<value>) that
            replace each text column, followed by the labels. The caller's
            dataframe is not modified.
    '''
    encoded = data
    for colName, colData in data.items():
        # np.object was only an alias for the builtin object and is gone from
        # current NumPy; the string check also catches pandas' dedicated
        # string dtype.
        if colData.dtype == object or pd.api.types.is_string_dtype(colData):
            # uint8 keeps the indicators numeric (0/1) on every pandas version.
            dummies = pd.get_dummies(colData, prefix=colName, dtype=np.uint8)
            encoded = pd.concat([encoded.drop(columns=[colName]), dummies], axis=1)

    return pd.concat([encoded, labels], axis=1)

def findParameters(self, predicted, target, tp=1, tn=0):
    '''
    Tally the confusion-matrix counts for a binary classifier.
    input: predicted - iterable of predicted labels
           target - iterable of true labels, paired with predicted by position
           tp - value that marks the positive class
           tn - value that marks the negative class
    output: (truePositives, trueNegatives, falsePositives, falseNegatives)
            any pair that is not a true positive, false positive or false
            negative is counted as a true negative
    '''
    hits = falseAlarms = misses = rejections = 0
    for guess, truth in zip(predicted, target):
        if guess == tp:
            if truth == tp:
                hits += 1
                continue
            if truth == tn:
                falseAlarms += 1
                continue
        elif guess == tn and truth == tp:
            misses += 1
            continue
        rejections += 1
    return hits, rejections, falseAlarms, misses
Loading

0 comments on commit 1562f4f

Please sign in to comment.