-
Notifications
You must be signed in to change notification settings - Fork 0
/
script_plot_UMAP.py
135 lines (107 loc) · 5.55 KB
/
script_plot_UMAP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# SCRIPT PLOT UMAP
# This scripts perform UMAP calculation over the different features of the sounds
# A DBSCAN in performed on the UMAP to identify clusters
#%% IMPORT packages
import pandas as pd
import scipy as sp
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns
from datetime import datetime
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import umap
import umap.plot # pip install "umap-learn[plot]"
from babyplots import Babyplot
from sklearn.preprocessing import QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
#%% ------------------------------------------------
# =================== Load file===========================
df_indices=pd.read_csv("df_indices_weather.csv")
# Retrieve datetime index
fileformat='%Y%m%d_%H%M%S.WAV'
df_indices['Date']=df_indices["filename"].apply(lambda x: datetime.strptime(x, fileformat))
df_indices.set_index('Date', inplace=True)
df_indices=df_indices.sort_values(['period','recpos','Date'])
# Calculate missing H indice
df_indices["H"]=df_indices["Ht"]*df_indices["Hf"]
# Drop EPS which is not calculated
df_indices.drop('EPS',inplace=True, axis=1)
df_indices["date"]=pd.to_datetime(df_indices['date'])
df_indices['Time'] = df_indices['date'].dt.strftime('%H:%M')
df_indices['hour']=df_indices['date'].dt.hour
df_indices['hourstr']=df_indices['hour'].astype(str)
df_indices['hourstr']=df_indices['hourstr'].str.zfill(2)+'h' # add zeros to get double digits hours and h
df_indices['isloud']=df_indices['LEQf']>60
ALL_FEATURES=['MEANf','VARf','SKEWf','KURTf','NBPEAKS','LEQf',
'ENRf','BGNf','SNRf','Hf', 'EAS','ECU','ECV','EPS_KURT','EPS_SKEW','ACI',
'NDSI','rBA','AnthroEnergy','BioEnergy','BI','ROU','ADI','AEI','LFC','MFC','HFC',
'ACTspFract','ACTspCount','ACTspMean', 'EVNspFract','EVNspMean','EVNspCount',
'TFSD','H_Havrda','H_Renyi','H_pairedShannon', 'H_gamma', 'H_GiniSimpson','RAOQ',
'AGI','ROItotal','ROIcover','ZCR','MEANt', 'VARt', 'SKEWt', 'KURTt',
'LEQt','BGNt', 'SNRt','MED', 'Ht','H','ACTtFraction', 'ACTtCount','EVNtFraction', 'EVNtMean', 'EVNtCount']
SHORT_FEATURES=['LEQf','SNRf','BioEnergy','AnthroEnergy','ROItotal','NDSI','H','Hf','Ht','ACI']
#SHORT_FEATURES=['AnthroEnergy','ROItotal','NDSI','LEQf']
# %% ------------------------------------------------
# ================= UMAP input ===================
#----------- USER INPUT------------
usefull_dates=["2021-03-26","2021-04-27"] # P1=["2021-03-26","2021-04-27"] P2=["2021-07-28","2021-08-26"]
sel_recopos=["CH9","CH10","CH11","CH12"]#["CH9"] # ["CH9","CH10","CH11","CH12"]
Ndim_umap=3 # Number of dimensions for the UMAP
n_neighbors_umap=15# Number of neighbors for UMAP
random_seed=42 # Random seed for UMAP
FEATURES=ALL_FEATURES # All features or a subset ALL_FEATURES or SHORT_FEATURES
#----------------------------------
# Compute the subset of dates and recpos
df=df_indices[df_indices['recpos'].isin(sel_recopos)]
df=df[((df.index >= usefull_dates[0]) & (df.index <= usefull_dates[1]))]
#df=df[(df.index.hour>=3) & (df.index.hour=<6)]
X=df[ALL_FEATURES].to_numpy()
# ==================== PROCESSING UMAP=======================
# Preprocess with a quantile transformer
pipe = make_pipeline(SimpleImputer(strategy="mean"), QuantileTransformer())
X = pipe.fit_transform(X.copy())
# Fit UMAP to processed data
manifold = umap.UMAP(random_state=random_seed,n_neighbors=n_neighbors_umap,n_components=Ndim_umap).fit(X)
X_reduced = manifold.transform(X)
# %% ------------------------------------------------
# ==============Cluster Analysis with DBSCAN ==============
from sklearn.cluster import DBSCAN
db=DBSCAN(eps=0.6, min_samples=20)
db.fit(X_reduced)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labelsdb= db.labels_
df['labelsdbscan']=labelsdb
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labelsdb)) - (1 if -1 in labelsdb else 0)
n_noise_ = list(labelsdb).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
#%% ------------------------------------------------
# ================== PLOT UMAP ====================
umap_label_by='recpos' # 'recpos' 'israining' 'hour' 'labelsdbscan' 'isloud'
mycolorscale="custom" # 'custom'
my_colors=["#F0F000","#F00000","#00A0F0","#00A000"]
if Ndim_umap==2: # Use UMAP.plot
umap.plot.points(manifold, labels=df[umap_label_by],color_key_cmap=mycolorscale)
elif Ndim_umap==3: # Use babyplot
outname_html='UMAP_3D.html'
outname_json='UMAP_3D.json'
labelbb=df[umap_label_by].to_numpy().tolist() # labels
bp=Babyplot(turntable=False,width=1280,height=960)
bp.add_plot(X_reduced[:,:].tolist(), "shapeCloud", "categories",labelbb, {"shape": "sphere",
"size": 0.4,
"colorScale": mycolorscale,
"customColorScale": my_colors,
"showAxes": [False, False, False],
"axisLabels": ["UMAP 1", "UMAP 2", "UMAP 3"],
"showLegend": True,
"fontSize": 12,
"labelSize":20})
#bp
bp.save_as_html(outname_html,fullscreen=True,title="Corridors UMAP plot")
bp.save_as_json(outname_json)
bp
# %%