data.py
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
class Data:
    # Import dataset
    df = pd.read_csv('Tumor Cancer Prediction Data.csv')
    # Drop the index column; inplace=True performs the drop on this DataFrame
    # rather than returning a modified copy.
    df.drop("Index", axis=1, inplace=True)
    # To check for null values:
    # print('Number of null values:', df.isnull().sum())
    # Drop rows that contain at least one NaN value
    df = df.dropna()
    # Map the diagnosis strings to numbers: benign (B) -> 0, malignant (M) -> 1
    df['diagnosis'] = df.diagnosis.map({"B": 0, "M": 1})
    # Split the data into train and test sets
    x = df.iloc[:, 0:30]   # Features
    y = df[['diagnosis']]  # Target
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=1)  # 25% test, 75% training
    # Feature scaling (standardization): X = (X - X.mean()) / X.std()
    stndScal = StandardScaler()
    x_train = stndScal.fit_transform(x_train)
    x_test = stndScal.transform(x_test)
    # Feature selection: drop features whose correlation with the diagnosis
    # is weak (|r| <= 0.3) -- drops F5, F10, F12, F15, F20, F23 and F25
    for pos, i in enumerate(df.iloc[:, 0:30]):
        x = df[i]
        y = df["diagnosis"]
        r = x.corr(y).round(2)
        if abs(r) <= 0.3:
            df.drop(i, axis=1, inplace=True)
    def get_head(self):
        print(self.df.head())

    def get_shape(self):
        shape = self.df.shape
        print('\nDataFrame Shape :', shape)
        print('\nNumber of rows :', shape[0])
        print('\nNumber of columns :', shape[1])
    def get_train_test_shape(self):
        print(
            "X train : ", self.x_train.shape,
            "\nX test : ", self.x_test.shape,
            "\nY train : ", self.y_train.shape,
            "\nY test : ", self.y_test.shape
        )
    # Diagnosis bar chart
    def get_diagnosis(self):
        diagnosis_count = Counter(self.df.diagnosis)
        diagnosis_bar = pd.DataFrame.from_dict(diagnosis_count, orient='index')
        diagnosis_bar.plot(kind='bar')
        plt.show()
    # Majority vote over the predictions of five models: a sample is
    # labelled 1 when at least three of the five models predict 1.
    def voting(self, model1, model2, model3, model4, model5):
        ans = []
        for i in range(len(model1)):
            cnt = 0
            if model1[i] == 1:
                cnt += 1
            if model2[i] == 1:
                cnt += 1
            if model3[i] == 1:
                cnt += 1
            if model4[i] == 1:
                cnt += 1
            if model5[i] == 1:
                cnt += 1
            if cnt > 2:
                ans.append(1)
            else:
                ans.append(0)
        return ans
    # # Correlation coefficient
    # def show_correlation(self):
    #     for pos, i in enumerate(self.df.iloc[:, 0:30]):
    #         x = self.df[i]
    #         y = self.df["diagnosis"]
    #         r = x.corr(y).round(2)
    #         print("Correlation between", i, "and diagnosis:")
    #         if abs(r) <= 0.3:
    #             print("Weak relation --> r =", round(r, 2), "\n")
    #         elif abs(r) <= 0.6:
    #             print("Moderate relation --> r =", round(r, 2), "\n")
    #         elif abs(r) <= 0.9:
    #             print("***************")
    #             print("Strong relation --> r =", round(r, 2), "\n***************\n")
    #         else:
    #             print("Perfect relation --> r =", round(r, 2), "\n")