# cluster.py
# Plots per-dataset missing-value, anomaly and drift ratios as a 3D scatter,
# coloured by cluster and shaped by task.
# Per-dataset inputs. All lists below are parallel: position i in every list
# refers to the same dataset, and the plotting loop indexes them together.

# Cluster id of each dataset; used as an index into the colour table when plotting.
cluster=[0,0,3,1,1,0,4,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,2,4,3,0,0,0,2,2,4,4,2,1,0,2,2,2,2,2]
# Missing-value metrics; the two lists are later averaged into the x-axis value.
avg_missing_value=[0,0,0,0,0,0,0,0.01022055,0.007987719,0.007383295,0.011889146,0.006365741,0.00817419,0.011209169,0.011532279,0.01204186,0.013549704,0.008201517,0.008280285,0.112722603,0.190958904,0.152106164,0.112722603,0.152106164,0.139931507,0.151541096,0,0,0,0,0,0,0,0,0,0,0,0.123016975,0,0,0,0,0,0.000309725,0.004038066,0,0.001713293,0,0,0,0.130628302,0.080526457,0.09630898,0.091694646,0.099507566]
avg_missing_row=[0,0,0,0,0,0,0,0.060108025,0.039737654,0.033179012,0.102237654,0.028626543,0.050617284,0.068518519,0.119675926,0.063888889,0.066975309,0.057638889,0.056635802,0.126575342,0.205479452,0.162739726,0.126575342,0.162739726,0.151780822,0.164383562,0,0,0,0,0,0,0,0,0,0,0,0.203055556,0,0,0,0,0,0.001045455,0.019764957,0,0.008878601,0,0,0,0.378431373,0.176470588,0.180941358,0.294607843,0.295098039
]
# Anomaly metrics; average and maximum are later averaged into the y-axis value.
avg_anomaly=[0.011483696,0.010615079,0.017455965,0.016326531,0.002417373,0.01333649,0.008464171,0.005585106,0.008244681,0.007299054,0.006309102,0.008008274,0.005998818,0.00660461,0.006102246,0.007742317,0.006589835,0.007239953,0.007579787,0.016526734,0.01727795,0.016084843,0.016526734,0.016084843,0.016349978,0.01723376,0.016371445,0.012827072,0.017454083,0.015614035,0.016362039,0.014968672,0.015663408,0.016925996,0.017252711,0.017186324,0.015020833,0.019570707,0.019695423,0.009908537,0.019459162,0.006432512,0.006965812,0.015772222,0.011640772,0.009348291,0.02708161,0.027626001,0.01108644,0.007288596,0.010802469,0.011130401,0.014400077,0.009616127,0.010088735]
max_anomaly=[0.01475,0.040277778,0.070140281,0.038888889,0.008016032,0.09077381,0.036111111,0.038888889,0.05625,0.061805556,0.050694444,0.047916667,0.048611111,0.044444444,0.0375,0.077777778,0.051388889,0.054166667,0.076388889,0.034246575,0.026027397,0.024657534,0.034246575,0.024657534,0.026027397,0.028767123,0.085,0.065,0.075,0.065,0.05,0.095,0.06,0.065,0.07,0.07,0.055,0.051388889,0.026775,0.202083333,0.088996711,0.008428571,0.039583333,0.03725,0.091666667,0.111805556,0.042164352,0.055699482,0.074802632,0.008266667,0.102083333,0.050694444,0.065972222,0.046527778,0.124305556
]
# Drift metrics; the four lists below are later combined into the z-axis value
# (drift and concept drift weighted 1/2 each, the two warning series 1/4 each).
avg_drift=[0.085726272,0.39484127,0.263157895,0.267219388,0.050964455,0.384848485,0.7,0.281323877,0.280732861,0.266548463,0.263790386,0.270291568,0.285657998,0.286052009,0.280929866,0.270094563,0.273049645,0.278171789,0.26536643,0.155913978,0.212365591,0.077956989,0.155913978,0.077956989,0.215053763,0.10483871,0.187633872,0.101849155,0.257408115,0.262778158,0.202268733,0.266613503,0.217374513,0.186403887,0.243181986,0.229349816,0.213794192,0.598484848,0.430897887,0.417301829,0.52393617,0.10454055,0.148040639,0.15,0.363559322,0.648504274,0.795212766,0.156013986,0.44367284,0.092451524,0.377480159,0.404017857,0.432539683,0.365079365,0.362847222]
avg_concept_drift=[0.003333008,0.000295101,0.000351432,0.002324256,0.000106716,0.002934588,0.003329619,0.003329063,0.003329063,0.003329063,0.003329063,0.003329063,0.00332235,0.003329063,0.003261123,0.003329063,0.003329063,0.003329063,0.003329063,0.003324949,0.003324949,0.003324949,0.003324949,0.003324949,0.003324949,0.003324949,7.79E-05,6.26E-06,5.39E-05,0.000408916,8.33E-05,0.00046316,0.000118696,0.000284371,0.000151936,0.000123909,0.00040194,0.003319054,0.003333231,0.001651667,0.000587951,8.96E-05,0.003327437,0.000142483,0.003333024,0.003330704,0.001047603,0.003324395,0.005131865,1.26E-05,0.003226387,0.000989768,0.00332922,0.00332922,0.002969304]
avg_concept_drift_warning=[0,0.358023477,0.328589,0.23582448,0.341113466,0.107706093,0,0,0,0,0,0,0,0,0.002203199,0.000349406,0,0,0,0,0,0,0,0,0,0,0.101457967,0.052173465,0.039147696,0.223297365,0.308049794,0.235982108,0.359423184,0.126146963,0.365071041,0.327999339,0.348676369,0,0,0.012674015,0.156640383,0.362781846,0,0.347366075,0,0,0,0,0.092308195,0.061925641,0.003271376,0.374344439,0,0,0.008882205]
avg_drift_warning=[0.000301671,0,0.013157895,0.021683673,0.000101544,0.001515152,0.002173913,0.005910165,0.00394011,0.005516154,0.00177305,0.005910165,0.003743105,0.003349094,0.00748621,0.006895193,0.004925138,0.006501182,0.006501182,0.005376344,0.010752688,0.01344086,0.005376344,0.01344086,0.021505376,0.005376344,0.023493169,0.011515152,0.019062611,0.016537556,0.020730149,0.015844536,0.01919862,0.022986027,0.018876528,0.019726475,0.016824495,0.007575758,0.000880282,0.000381098,0.000443262,4.92E-05,0.003338171,0.027777778,0.003813559,0.000356125,0.00177305,0.000839161,0.00308642,0.002308403,0.004960317,0.004712302,0.002232143,0.005704365,0.004712302]
# Task flag of each dataset (0 or 1); used as an index into the marker table
# when plotting. NOTE(review): the meaning of each task value is not stated here.
task=[1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1]
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
# Promote every per-dataset metric list to a NumPy array in one pass so the
# aggregation below can rely on element-wise arithmetic.
(
    avg_missing_value,
    avg_missing_row,
    avg_anomaly,
    max_anomaly,
    avg_drift,
    avg_concept_drift,
    avg_concept_drift_warning,
    avg_drift_warning,
) = (
    np.array(series)
    for series in (
        avg_missing_value,
        avg_missing_row,
        avg_anomaly,
        max_anomaly,
        avg_drift,
        avg_concept_drift,
        avg_concept_drift_warning,
        avg_drift_warning,
    )
)

# Anomaly axis: midpoint between the average and the maximum anomaly values.
avg_anomaly = 0.5 * (avg_anomaly + max_anomaly)

# Drift axis: detected drift (data + concept) carries half weight per series,
# the corresponding warning series a quarter weight each.
drift_term = 0.5 * (avg_drift + avg_concept_drift)
warning_term = 0.25 * (avg_drift_warning + avg_concept_drift_warning)
avg_drift = drift_term + warning_term

# Missing-value axis: midpoint between the row-level and value-level metrics.
avg_missing_value = 0.5 * (avg_missing_row + avg_missing_value)

# Scaling each axis by its standard deviation is intentionally left disabled:
# avg_missing_value /= np.std(avg_missing_value)
# avg_anomaly /= np.std(avg_anomaly)
# avg_drift /= np.std(avg_drift)
# Build a 3D scatter of the datasets: x = missing-value ratio, y = anomaly
# ratio, z = drift ratio. Colour encodes the cluster, marker shape the task.
fig = plt.figure()
# Let the axes fill the whole canvas (no margin around the plot).
fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
ax = fig.add_subplot(111, projection='3d')

# Colour per cluster id and marker per task flag (square for 0, triangle for 1).
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
markers = ['s', '^']

# Dataset indices that are emphasised with a bigger, black-edged marker.
outline_datasets = [5, 6, 16, 29, 39]
normal_size = 20
large_size = 300

# One scatter call per dataset so each point gets its own colour, marker,
# size and edge settings.
for i, cluster_id in enumerate(cluster):
    if i in outline_datasets:
        point_size, edge_color = large_size, 'k'
    else:
        point_size, edge_color = normal_size, None
    ax.scatter(
        avg_missing_value[i],
        avg_anomaly[i],
        avg_drift[i],
        c=colors[cluster_id],
        marker=markers[task[i]],
        edgecolor=edge_color,
        s=point_size,
    )

# Axis titles, using a larger font than the default.
ax.set_xlabel('Missing Value Ratios', fontsize=16, labelpad=10)
ax.set_ylabel('Anomaly Ratios', fontsize=16, labelpad=10)
ax.set_zlabel('Drift Ratios', fontsize=16, labelpad=10)

# Bigger tick labels.
ax.tick_params(axis='both', which='major', labelsize=14)

# Tighten each axis: lower bound at the data minimum, upper bound at 95% of
# the data maximum.
for apply_limits, values in (
    (ax.set_xlim, avg_missing_value),
    (ax.set_ylim, avg_anomaly),
    (ax.set_zlim, avg_drift),
):
    apply_limits([values.min(), values.max() * 0.95])

# Keep the number of ticks per axis small.
ax.locator_params(nbins=4)

# Display the figure.
plt.show()
# Disabled alternative visualisation, kept as a bare string literal so it is
# never executed. It draws one box plot per aggregated metric (anomaly, drift,
# missing value), comparing three index groups: all datasets ('Ours'), a fixed
# index list labelled 'USP DS', and the outlined datasets ('Selected').
'''
# 3,4,5,26-36,41,48
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Create a DataFrame
import pandas as pd
dataframe = pd.DataFrame({
'avg_anomaly': avg_anomaly,
'avg_drift': avg_drift,
'avg_missing_value': avg_missing_value
})
# IDs for the different groups
group1_ids = range(len(dataframe)) # All data
group2_ids = [3,4,5,26,27,28,29,30,31,32,33,34,35,36,41,48] # IDs start from 0
group3_ids = [5, 6, 16, 29, 39]
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Adjusting default text sizes
plt.rc('font', size=20) # controls default text sizes
plt.rc('axes', titlesize=20) # fontsize of the axes title
plt.rc('axes', labelsize=20) # fontsize of the x and y labels
plt.rc('xtick', labelsize=20) # fontsize of the tick labels
plt.rc('ytick', labelsize=20) # fontsize of the tick labels
plt.rc('legend', fontsize=20) # legend fontsize
plt.rc('figure', titlesize=20) # fontsize of the figure title
# Assuming 'dataframe' is your DataFrame and 'group1_ids', 'group2_ids', 'group3_ids' are your group of IDs
# Selecting the data for each group
df_all = dataframe.loc[dataframe.index.isin(group1_ids)]
df_group2 = dataframe.loc[dataframe.index.isin(group2_ids)]
df_group3 = dataframe.loc[dataframe.index.isin(group3_ids)]
# List of variables of interest
variables = ['avg_anomaly', 'avg_drift', 'avg_missing_value']
name_list = ['Anomaly ratio', 'Drift ratio', 'Missing value ratio']
# Setting up the plot
fig, ax = plt.subplots(1, len(variables), figsize=(15, 5))
# For each variable of interest
for i, var in enumerate(variables):
# Create a combined dataframe with a new 'Group' column indicating the group
df_combined = pd.concat([
df_group2[var].reset_index().assign(Group='USP DS'),
df_all[var].reset_index().assign(Group='Ours'),
df_group3[var].reset_index().assign(Group='Selected')
])
# Create boxplot
sns.boxplot(x='Group', y=var, data=df_combined, ax=ax[i], width=0.4) # less width = thinner bars
# Set title and y-label
ax[i].set_title(name_list[i])
ax[i].set_ylabel('')
ax[i].set_xlabel('') # remove x-axis label
plt.tight_layout()
plt.show()
'''