# Analysis_of_Patients_dataset.py
# Importing the libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split
#from datetime import datetime, date, time, timedelta
# Function for importing a CSV file
def importcsv(var):
    """Load the CSV at ``var`` and return it as a pandas DataFrame.

    ``var`` is handed unchanged to ``pandas.read_csv``.
    """
    return pd.read_csv(var)
# Function to plot a histogram
def histogram(attribute, bins):
    """Plot a histogram of ``attribute`` with ``plt.hist``.

    ``attribute`` is the data to bin and ``bins`` is passed through as the
    ``bins`` argument of ``plt.hist``. Nothing is returned.
    """
    plt.hist(attribute, bins=bins)
# Columns to correlate. Kept as None at import time because the module-level
# ``df`` does not exist until the __main__ section has loaded the CSV;
# evaluating ``df.columns`` here raised NameError on a fresh interpreter.
# corr_plot() resolves None to columns 1..23 of the loaded frame.
df_list = None


# Correlation Plot Function
def corr_plot(list_of_df=None):
    """Draw an annotated correlation heatmap for columns of the global ``df``.

    Args:
        list_of_df: Column labels of ``df`` to correlate. When None, the
            labels at positions 1 through 23 of ``df.columns`` are used.

    The function reads the module-level ``df`` at call time, so ``df`` must
    have been assigned before it is called. Nothing is returned.
    """
    if list_of_df is None:
        list_of_df = list(df.columns[1:24])
    corr = df[list_of_df].corr()
    # Large canvas so that the per-cell annotations stay legible
    plt.figure(figsize=(25, 25))
    sns.heatmap(corr, cmap='coolwarm', xticklabels=list_of_df,
                yticklabels=list_of_df, annot=True)
# Function for Random Forest Classification
def RandomForest(dataset, feature_cols=None, target_col=24, test_size=0.2,
                 random_state=None):
    """Train a random forest on ``dataset`` and return the test confusion matrix.

    Args:
        dataset: DataFrame indexed positionally via ``.iloc``.
        feature_cols: Positional column selector for the features (a list of
            indices or a slice). Defaults to ``[1, 23]``.
        target_col: Positional index of the label column. Defaults to 24.
        test_size: Fraction of rows held out for testing. Defaults to 0.2.
        random_state: Seed passed to both the train/test split and the
            classifier. None (the default) leaves both unseeded, so results
            vary between runs.

    Returns:
        The confusion matrix of true vs. predicted labels on the test split.
    """
    if feature_cols is None:
        # NOTE(review): [1, 23] selects exactly two columns (1 and 23), not
        # the range 1..23. If every column before the label is meant to be a
        # feature, pass feature_cols=slice(1, 24) -- confirm intent.
        feature_cols = [1, 23]
    X = dataset.iloc[:, feature_cols].values
    y = dataset.iloc[:, target_col].values

    # Splitting the dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Feature scaling: fit on the training rows only, then apply the same
    # transform to the test rows
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_test = std_scaler.transform(X_test)

    # Fitting the random forest to the training set
    classifier = RandomForestClassifier(
        n_estimators=100, criterion='entropy', random_state=random_state)
    classifier.fit(X_train, y_train)

    # Predicting the test set and summarising the outcome
    y_pred = classifier.predict(X_test)
    return confusion_matrix(y_test, y_pred)
if __name__ == "__main__":
    import sys

    # The CSV location may be given as the first command-line argument; the
    # original hard-coded path remains the fallback.
    csv_path = (sys.argv[1] if len(sys.argv) > 1
                else "C:\\Users\\DTP\\Desktop\\cancer_info.csv")

    # Importing the data frame (kept at module level: corr_plot reads `df`)
    df = importcsv(csv_path)

    # Plotting histogram of the Age column
    histogram(df.Age, bins=5)

    # Plotting correlation plot over columns 1..23 of the loaded frame
    corr_plot(list(df.columns[1:24]))

    # Applying the random forest function and reporting its result, which
    # was previously computed and then discarded
    conf_mat = RandomForest(df)
    print(conf_mat)

    # Display the figures when run as a plain script
    plt.show()