-
Notifications
You must be signed in to change notification settings - Fork 3
/
smote.py
157 lines (121 loc) · 5.88 KB
/
smote.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from numpy import where, random
def smote_binary(df, num_samples_dict=None):
    """
    Performs data augmentation and/or downsampling on a dataframe with binary classes.

    The SMOTE algorithm is used for augmentation and random sampling is used for
    downsampling of the specified classes. The input dataframe is never modified;
    a new dataframe is returned.

    Args:
        df: the dataframe to manipulate. It must contain a 'Status' column holding the
            class labels 0 and 1 and a 'Gender' column; every other column is treated
            as a feature. A leading 'Unnamed: 0' column is discarded.
        num_samples_dict: a dictionary of the form {0: num_desired_samples, 1: num_desired_samples}
            that indicates the number of samples that the returned dataframe should have in
            each class. When None, SMOTE's 'auto' strategy is used.

    Returns: the augmented and/or downsampled dataframe, with the same column order as the
        input (minus any leading 'Unnamed: 0' column). Gender values for rows that have none
        are filled with a random choice of 0 or 1.

    Raises:
        KeyError: if num_samples_dict is given but lacks an entry for class 0 or 1.
    """
    class_labels = (0, 1)

    # Drop the leftover index column by label, without touching the caller's dataframe.
    if len(df.columns) > 0 and df.columns[0] == 'Unnamed: 0':
        df = df.drop(columns=df.columns[0])

    if num_samples_dict is not None:
        strategy = num_samples_dict
        # downsampling classes if needed
        counter = Counter(df['Status'])
        class_samples = []
        for label in class_labels:
            class_rows = df.loc[df['Status'] == label]
            target = num_samples_dict[label]
            if counter[label] > target:  # this class needs to be downsampled
                class_rows = class_rows.sample(n=target)
            class_samples.append(class_rows)
        df = pd.concat(class_samples)
    else:
        strategy = 'auto'

    # A clean 0..n-1 index is required in BOTH branches: the Gender column is later
    # re-attached to the resampled rows by index alignment.
    df = df.reset_index(drop=True)

    y = df['Status']  # labels
    gender = df['Gender']
    # features only, Gender also removed because it must be binary
    x = df.drop(columns=['Status', 'Gender'])

    oversample = SMOTE(sampling_strategy=strategy)
    x_oversampled, y_oversampled = oversample.fit_resample(x, y)

    # get sample counts for augmented dataset
    counter = Counter(y_oversampled)  # {0: num_samples, 1: num_samples}
    if num_samples_dict is not None:
        failed = any(counter[label] != num_samples_dict[label] for label in class_labels)
    else:
        failed = counter[0] != counter[1]
    if failed:
        print("Sampling failed")

    # Regenerate the dataframe with explicit column names so that every column keeps its
    # own data regardless of where 'Status' and 'Gender' sat in the input.
    y_new = pd.Series(y_oversampled, name='Status').reset_index(drop=True)
    x_new = pd.DataFrame(x_oversampled).reset_index(drop=True)
    x_new.columns = x.columns
    # NOTE: assumes fit_resample returns the original rows first and appends the synthetic
    # ones, so the original Gender values line up with rows 0..len(df)-1.
    df_concat = pd.concat([y_new, gender, x_new], axis=1)
    df_concat = df_concat[list(df.columns)]  # restore the input column order

    # fill missing gender values with a random choice of 0 or 1
    missing = df_concat['Gender'].isnull()
    if missing.any():
        df_concat.loc[missing, 'Gender'] = np.random.randint(0, 2, size=int(missing.sum()))
    return df_concat
def smote_multiclass(df, num_samples_dict=None, one_hot_encoded=False):
    """
    Performs data augmentation and/or downsampling on a dataframe with multiple classes.

    Supports both numerical and one-hot encoded class values.
    The SMOTE algorithm is used for augmentation and random sampling is used for
    downsampling of the specified classes. The input dataframe is never modified;
    a new dataframe is returned.

    Args:
        df: the dataframe to manipulate. It must contain a 'Gender' column and either a
            'Status' column with the class labels 0-3, or (when one_hot_encoded is True)
            the columns 'Control', 'Parkinsons', 'ALS' and 'Cerebral_palsy', which map to
            the labels 0, 1, 2 and 3 respectively. Every other column is treated as a
            feature. A leading 'Unnamed: 0' column is discarded.
        num_samples_dict: a dictionary of the form {0: num_desired_samples, 1: num_desired_samples, ...}
            that indicates the number of samples that the returned dataframe should have in
            each class. When None, SMOTE's 'auto' strategy is used.
        one_hot_encoded: Boolean value indicating if the dataframe is one-hot encoded

    Returns: the augmented and/or downsampled dataframe. Gender values for rows that have
        none are filled with a random choice of 0 or 1. When one_hot_encoded is True the
        class label is returned as four leading 0/1 integer columns instead of 'Status'.

    Raises:
        ValueError: if one_hot_encoded is True and some row has none of the four class
            columns set to 1.
        KeyError: if num_samples_dict is given but lacks an entry for one of the classes 0-3.
    """
    class_names = ['Control', 'Parkinsons', 'ALS', 'Cerebral_palsy']
    class_labels = list(range(len(class_names)))

    # Drop the leftover index column by label, without touching the caller's dataframe.
    if len(df.columns) > 0 and df.columns[0] == 'Unnamed: 0':
        df = df.drop(columns=df.columns[0])

    # change one hot encoded dataset into numerical labels
    if one_hot_encoded:
        # np.select takes the first matching condition, mirroring an if/elif chain
        # over the class columns in class_names order.
        conditions = [df[name] == 1 for name in class_names]
        status = np.select(conditions, class_labels, default=-1)
        if (status == -1).any():
            raise ValueError(
                "one-hot encoded rows must have exactly one of %s set to 1" % class_names
            )
        df = df.drop(columns=class_names)  # new frame, so the insert below is local
        df.insert(loc=0, column='Status', value=status)

    if num_samples_dict is not None:
        strategy = num_samples_dict
        # downsampling classes if needed
        counter = Counter(df['Status'])
        class_samples = []
        for label in class_labels:
            class_rows = df.loc[df['Status'] == label]
            target = num_samples_dict[label]
            if counter[label] > target:  # this class needs to be downsampled
                class_rows = class_rows.sample(n=target)
            class_samples.append(class_rows)
        df = pd.concat(class_samples)
    else:
        strategy = 'auto'

    # A clean 0..n-1 index is required in BOTH branches: the Gender column is later
    # re-attached to the resampled rows by index alignment.
    df = df.reset_index(drop=True)

    y = df['Status']  # labels
    gender = df['Gender']
    # features only, Gender also removed because it must be binary
    x = df.drop(columns=['Status', 'Gender'])

    oversample = SMOTE(sampling_strategy=strategy)
    x_oversampled, y_oversampled = oversample.fit_resample(x, y)

    # get sample counts for augmented dataset
    counter = Counter(y_oversampled)
    if num_samples_dict is not None:
        failed = any(counter[label] != num_samples_dict[label] for label in class_labels)
    else:
        failed = len({counter[label] for label in class_labels}) != 1
    if failed:
        print("Sampling failed")

    # Regenerate the dataframe with explicit column names so that every column keeps its
    # own data regardless of where 'Status' and 'Gender' sat in the input.
    y_new = pd.Series(y_oversampled, name='Status').reset_index(drop=True)
    x_new = pd.DataFrame(x_oversampled).reset_index(drop=True)
    x_new.columns = x.columns
    # NOTE: assumes fit_resample returns the original rows first and appends the synthetic
    # ones, so the original Gender values line up with rows 0..len(df)-1.
    df_concat = pd.concat([y_new, gender, x_new], axis=1)
    df_concat = df_concat[list(df.columns)]  # restore the input column order

    # fill missing gender values with a random choice of 0 or 1
    missing = df_concat['Gender'].isnull()
    if missing.any():
        df_concat.loc[missing, 'Gender'] = np.random.randint(0, 2, size=int(missing.sum()))

    # change numerical class values back into one-hot encoded values
    if one_hot_encoded:
        one_hot = pd.get_dummies(df_concat['Status'].astype(int), dtype=int)
        # Guarantee all four class columns exist, in a fixed order, even if a class
        # ended up with no rows.
        one_hot = one_hot.reindex(columns=class_labels, fill_value=0)
        one_hot.columns = class_names
        df_concat = pd.concat([one_hot, df_concat.drop(columns=['Status'])], axis=1)
    return df_concat