forked from DongZhouGu/MathModel-Pretrain
-
Notifications
You must be signed in to change notification settings - Fork 0
/
imputer.py
63 lines (54 loc) · 2.19 KB
/
imputer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
### 绘制缺失变量的热图
def miss_heatamp(data, title):
cols = data.columns
colours = ['#006699', '#ffff99'] ## 第一项没缺失的颜色,第二项,缺失的颜色
plt.figure(figsize=(12, 8))
plt.rcParams['font.sans-serif'] = ['SimHei'] # 黑体
plt.rcParams['axes.unicode_minus'] = False # 解决无法显示符号的问题
sns.set(font='SimHei', font_scale=1)
f = sns.heatmap(data[cols].isnull(), cmap=sns.color_palette(colours))
f.set_title(title)
### 缺失值的直方图
def miss_bar(data, title):
'''
data : dataframe格式的数据,行为数据,列为特征
title : 图表名字
'''
missValue2miss_num = {}
for col in data.columns:
missing = data[col].isnull()
num_missing = np.sum(missing)
if num_missing > 0:
missValue2miss_num[col] = num_missing
df = pd.DataFrame([missValue2miss_num])
df.index = ['miss_num']
df = df.T
plt.figure(figsize=(10, 8))
plt.rcParams['font.sans-serif'] = ['SimHei'] # 黑体
plt.rcParams['axes.unicode_minus'] = False # 解决无法显示符号的问题
sns.set(font='SimHei', font_scale=1)
f = sns.barplot(x=df.index, y=df['miss_num'], color='#336699')
f.set_title(title)
### 随机森林插值
def rf_impute(data):
'''
data:dataframe格式
'''
copy_data = data.copy()
miss_columns = copy_data.isnull().sum()[copy_data.isnull().sum() != 0].sort_values().index.tolist()
unmiss_columns = copy_data.isnull().sum()[copy_data.isnull().sum() == 0].sort_values().index.tolist()
for col in miss_columns:
X_train = copy_data[copy_data[col].notnull()][unmiss_columns].values
Y_train = copy_data[copy_data[col].notnull()][col].values
X_test = copy_data[copy_data[col].isnull()][unmiss_columns][unmiss_columns].values
rfr=RandomForestRegressor()
rfr.fit(X_train,Y_train)
predict_value = rfr.predict(X_test)
copy_data.loc[(copy_data[col].isnull()),col] = predict_value
unmiss_columns.append(col)
return copy_data