Feature_Selection.py
import heapq
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
def apply_Model(X, Y):
    # Chi-squared based feature selection: keep the 15 highest-scoring features
    CH = SelectKBest(chi2, k=15).fit(X, Y)
    Feature_CH = X.iloc[:, CH.get_support(True)].columns

    # Mutual-information based feature selection
    MI = SelectKBest(mutual_info_classif, k=15).fit(X, Y)
    Feature_MI = X.iloc[:, MI.get_support(True)].columns

    # Information-gain based feature selection: rank features by the
    # importances of an entropy-criterion decision tree
    DT = DecisionTreeClassifier(criterion='entropy', random_state=0)
    DT = DT.fit(X, Y)
    h = heapq.nlargest(15, range(len(DT.feature_importances_)),
                       DT.feature_importances_.take)
    Feature_IG = X.columns[h]

    # Logistic-regression based feature selection: rank features by the
    # mean coefficient across classes
    LR = linear_model.LogisticRegression(C=1e5).fit(X, Y)
    h = np.mean(LR.coef_, axis=0)
    # print(h.shape)  # debug: one weight per feature
    h = heapq.nlargest(15, range(len(h)), h.take)
    Feature_LR = X.columns[h]

    # Exploratory check: words picked by both chi2 and mutual information
    # T = np.intersect1d(Feature_CH, Feature_MI)
    # print(np.intersect1d(T, Feature_IG))

    return Feature_CH, Feature_MI, Feature_IG, Feature_LR
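

# Usage sketch (illustrative only): apply_Model on a small synthetic
# term-count matrix. The names and data below are made up for demonstration
# and are not part of this pipeline; uncomment to try it.
# demo_X = pd.DataFrame(np.random.randint(0, 5, size=(40, 30)),
#                       columns=['term_%d' % i for i in range(30)])
# demo_Y = pd.Series(np.random.randint(0, 3, size=40))
# ch2_f, mi_f, ig_f, lr_f = apply_Model(demo_X, demo_Y)
# # Each return value is a pandas Index holding 15 selected column names.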
def import_Data():
    # The feature columns come first; the last two columns are assumed to be
    # the label columns 'Class' and 'Subject'
    Data = pd.read_csv('Disease_Data_BiGram.csv')
    NumOfFeatures = Data.shape[1] - 2
    X = Data.iloc[:, 0:NumOfFeatures]
    Y = Data['Class']
    Y_ = Data['Subject']
    return X, Y, Y_

# Select features against the 'Subject' label
X, Y, Y_ = import_Data()
Ch2, MI, IG, LR = apply_Model(X, Y_)

# Optionally sort each selected-feature list alphabetically before saving:
# Ch2 = np.sort(Ch2); MI = np.sort(MI); IG = np.sort(IG); LR = np.sort(LR)

# Save the four 15-word lists side by side, one column per method
SelectWords = {}
SelectWords['MI'] = MI
SelectWords['Ch2'] = Ch2
SelectWords['IG'] = IG
SelectWords['LR'] = LR
SelectWords = pd.DataFrame(SelectWords)
# print(SelectWords)
SelectWords.to_csv('Selected_Features_BiG.csv')

# Disabled experiment: build a 0/1 indicator table over the union of all
# selected words (rows = words, columns = methods, 1 = the word was selected
# by that method)
# words = list(set(MI) | set(IG) | set(LR) | set(Ch2))
# Temp = pd.DataFrame([], index=words)
# MI_df = pd.DataFrame(MI, index=MI)
# Ch2_df = pd.DataFrame(Ch2, index=Ch2)
# IG_df = pd.DataFrame(IG, index=IG)
# LR_df = pd.DataFrame(LR, index=LR)
# result = pd.concat([MI_df, Ch2_df], axis=1)
# Temp['MI'] = MI_df
# Temp['Ch2'] = Ch2_df
# Temp['IG'] = IG_df
# Temp['LR'] = LR_df
# Temp[Temp.notnull()] = 1
# Temp[Temp.isnull()] = 0
# print(Temp.shape)
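
# Hedged sketch of the disabled indicator-table idea above: a compact way to
# see how much the four 15-word lists overlap, assuming Ch2, MI, IG and LR as
# produced earlier in this script. Uncomment to run.
# all_words = sorted(set(MI) | set(IG) | set(LR) | set(Ch2))
# overlap = pd.DataFrame(
#     {name: [int(w in set(sel)) for w in all_words]
#      for name, sel in [('MI', MI), ('Ch2', Ch2), ('IG', IG), ('LR', LR)]},
#     index=all_words)
# print(overlap.sum(axis=1).value_counts())  # how many methods agree per word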