-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocalization_segmenting.py
129 lines (94 loc) · 3.42 KB
/
vocalization_segmenting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import sys
from scipy.io import wavfile
from scipy.signal import stft, spectrogram
from scipy.ndimage.filters import gaussian_filter
import os
import numpy as np
from librosa import feature as libfeature
def get_spec_min_max(wav_path, start_s=0, stop_s=-1):
"""
Automatically calculate the 'spec_min_val' and 'spec_max_val' parameter for ava segmenting.
Parameters:
wav_path: str, path
Path to an audio file with vocalizations
start_s, stop_s: int
If you're audio file is huge, use a small segment with a diversity of loud and soft sounds, bounded onset start_s (seconds) and offset stop_s (seconds)
Returns:
percentiles: tuple
spec_min_val and spec_max_val for segmentation
sr: int
audio sampling rate for segmentation
"""
#load audio
sr, audio = wavfile.read(wav_path)
#chunk the audio, if your file is huge
audio_segment = audio[int(start_s*sr):int(stop_s*sr)]
#compute stft of chunk
_, _, scipy_spec = stft(audio_segment,
fs=sr,
nperseg=512,
noverlap=256)
#compute spectrogram
scipy_spec = np.log(np.abs(scipy_spec) + 1e-9)
#calculate percentiles for ava spectrogram processing
percentiles = np.quantile(scipy_spec, (.95, .975))
return percentiles, sr
def load_oo(wav_path):
"""
Loads onsets/offsets from segment files.
Parameters:
wav_path: str, path
Path to an audio file with vocalizations
"""
dirname, basename = os.path.split(wav_path)
segment_txt_file = os.path.join(dirname, 'segment', basename.replace('wav', 'txt'))
data = np.genfromtxt(segment_txt_file)
if data.shape == (0,):
return
if data.shape == (2,):
data = np.reshape(data, (1,2))
#load in onset/offset times from text file
onsets, offsets = data[:,0], data[:,1]
return onsets, offsets
def get_sf(audio):
"""
Get the mean spectral flatness of an audio snippet.
Paramters:
audio: np.array
Audio of a single segment extracted by AVA.
Returns:
sf: float
Spectral flatness of the snippet. Typically less than 0.3 is a vocalization.
"""
sf = libfeature.spectral_flatness(y=audio, n_fft=256,
hop_length=128, win_length=256,
center = False, power=2.0)[0]
return np.mean(sf)
def filter_segments(wav_path, sf_threshold=0.3):
"""
This function filters out putative vocalizations from noise using spectral flatness.
Parameters:
wav_path: str, path
Path to an audio file with vocalizations
sf_threshold: float
Spectral flatness threshold. More strict is towards 0.0.
Returns:
onsets_filt, offsets_filt: np.array
Onsets/offsets of putatitve vocalizations
"""
#load audio
sr, audio = wavfile.read(wav_path)
#get onsets/offsets
onsets, offsets = load_oo(wav_path)
#get spectral flatness for all sounds detected
sfs = np.array([get_sf(audio[on:off]) for on, off in zip((onsets*sr).astype(int), (offsets*sr).astype(int))])
#filter the onsets/offsets
onsets_filt, offsets_filt = onsets[sfs<sf_threshold], offsets[sfs<sf_threshold]
#save new onsets/offsets
dirname, basename = os.path.split(wav_path)
new_seg_dir = os.path.join(dirname, 'segment_filt')
if not os.path.exists(new_seg_dir):
os.makedirs(new_seg_dir)
outfile = os.path.join(new_seg_dir, basename.replace('wav', 'txt'))
np.savetxt(outfile, np.vstack((onsets_filt, offsets_filt)).T)
print('{} segments after filtering. Saved to: {}'.format(len(onsets_filt), outfile))