import warnings
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import librosa
import seaborn as sns
import scipy.signal as sig
from librosa.display import waveshow, specshow
from IPython.display import Audio, Video
import parselmouth
import math
import soundfile as sf
import ffmpeg
sns.set_theme(rc={"xtick.bottom" : True, "ytick.left" : False, "xtick.major.size":4, "xtick.minor.size":2, "ytick.major.size":4, "ytick.minor.size":2, "xtick.labelsize": 10, "ytick.labelsize": 10})
def readCycleAnnotation(cyclePath, numDiv, startTime, duration):
'''Function to read cycle annotation and add divisions in the middle if required.
Parameters:
cyclePath (str): path to the cycle annotation file
numDiv (int): number of equally spaced divisions to add between pairs of annotations (numDiv - 1 timestamps will be added between each pair)
startTime (float): start time of audio being analysed
duration (float): duration of the audio to be analysed
Returns:
provided (list): list with a single pd.DataFrame of annotations read from the file (padded with one annotation on either side of the requested window, where available)
computed (list): list of timestamps of the divisions computed between consecutive annotations
'''
cycle_df = pd.read_csv(cyclePath)
index_values = cycle_df.loc[(cycle_df['Time'] >= startTime) & (cycle_df['Time'] <= startTime + duration)].index.values
if len(index_values) == 0:
return None, None
provided = cycle_df.iloc[max(index_values[0]-1, 0):min(index_values[-1]+2, cycle_df.shape[0])]
# add divisions in the middle
computed = []
for ind, val in enumerate(provided['Time'].values[:-1]):
computed.extend(np.around(np.linspace(val, provided['Time'].values[ind+1], num = numDiv, endpoint=False), 2)[1:])
return [provided], computed
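# Usage sketch (not part of the original API; 'Data/cycles.csv' is a hypothetical
# annotation file with a 'Time' column, as assumed by readCycleAnnotation above):
def _exampleReadCycleAnnotation():
    provided, computed = readCycleAnnotation('Data/cycles.csv', numDiv=4, startTime=10, duration=30)
    if provided is not None:
        print(provided[0]['Time'].values)  # annotated timestamps in [10, 40], padded by one annotation on each side
        print(computed)  # numDiv - 1 = 3 interpolated timestamps between each consecutive pair of annotations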
def readOnsetAnnotation(onsetPath, startTime, duration, onsetKeyword=['Inst']):
'''Function to read onset annotations from a file.
Parameters:
onsetPath (str): path to the onset annotation file
startTime (float): start time of audio being analysed
duration (float): duration of the audio to be analysed
onsetKeyword (list): list of column names in the onset file to take onsets from
Returns:
provided (list): list of pd.DataFrames, one per keyword in onsetKeyword, with onset timestamps from the file
'''
onset_df = pd.read_csv(onsetPath)
provided = [] # variable to store onset timestamps
for keyword in onsetKeyword:
provided.append(onset_df.loc[(onset_df[keyword] >= startTime) & (onset_df[keyword] <= startTime + duration)])
return provided
def drawAnnotation(cyclePath=None, onsetPath=None, onsetTimeKeyword='Inst', onsetLabelKeyword='Label', numDiv=0, startTime=0, duration=None, ax=None, annotLabel=True, c='purple', alpha=0.8, y=0.7, size=10):
'''Draws annotations on ax
Parameters
cyclePath (str): path to the cycle annotation file
onsetPath (str): path to onset annotations; only considered if cyclePath is None
onsetTimeKeyword (str or list): column name(s) in the onset file to take onsets from
onsetLabelKeyword (str or list): column name(s) with labels for the onsets; if None, no label will be printed
numDiv (int): number of equally spaced divisions to add between pairs of annotations (numDiv - 1 timestamps will be added between each pair)
startTime (float): start time of audio being analysed
duration (float): duration of the audio to be analysed
ax (plt.Axes.axis): axis to plot in
annotLabel (bool): if True, will print annotation label along with line
c (str or list): colour(s) to plot annotation lines in, one for each onsetTimeKeyword (when lists are given)
alpha (float): controls opacity of the annotation lines drawn
y (float): float value from [0, 1] indicating where the label should occur on the y-axis. 0 indicates the lower ylim, 1 indicates the higher ylim.
size (int): font size for annotated text
Returns
ax (plt.Axes.axis): axis that has been plotted in
'''
if cyclePath is not None:
provided, computed = readCycleAnnotation(cyclePath, numDiv, startTime, duration)
timeCol = ['Time'] # name of column with time readings
labelCol = ['Cycle'] # name of column to extract label of annotation from
c = c if isinstance(c, list) else [c]
elif onsetPath is not None:
if annotLabel and isinstance(onsetTimeKeyword, list) and isinstance(onsetLabelKeyword, list):
# check if length of lists is the same
if len(onsetTimeKeyword) != len(onsetLabelKeyword):
raise Exception('Length of onsetTimeKeyword and onsetLabelKeyword should match')
if isinstance(onsetTimeKeyword, list) and isinstance(c, list):
# check if length of lists is the same
if len(onsetTimeKeyword) != len(c):
raise Exception('Length of onsetTimeKeyword and c should match')
timeCol = onsetTimeKeyword if isinstance(onsetTimeKeyword, list) else [onsetTimeKeyword] # name of column with time readings
labelCol = onsetLabelKeyword if isinstance(onsetLabelKeyword, list) else [onsetLabelKeyword] # name of column to extract label of annotation from
c = c if isinstance(c, list) else [c]
provided = readOnsetAnnotation(onsetPath, startTime, duration, onsetKeyword=timeCol)
computed = None
else:
raise Exception('A cycle or onset path has to be provided for annotation')
if computed is not None:
for computedVal in computed:
ax.axvline(computedVal - startTime, linestyle='--', c=c[0], alpha=0.4)
if provided is not None:
for i, providedListVal in enumerate(provided):
firstLabel = True # marker for first line being plotted; to prevent duplicates from occurring in the legend
for _, providedVal in providedListVal.iterrows():
ax.axvline((providedVal[timeCol[i]]) - startTime, linestyle='-', c=c[i], label=timeCol[i] if firstLabel and cyclePath is None else '', alpha=alpha) # add label only for first line of onset for each keyword
if firstLabel: firstLabel = False
if annotLabel:
ylims = ax.get_ylim() # used to place the label at a fraction y of the plot height
if isinstance(providedVal[labelCol[i]], str):
ax.annotate(f"{providedVal[labelCol[i]]}", (providedVal[timeCol[i]]-startTime, (ylims[1]-ylims[0])*y + ylims[0]), bbox=dict(facecolor='grey', edgecolor='white'), c='white')
else:
ax.annotate(f"{float(providedVal[labelCol[i]]):g}", (providedVal[timeCol[i]]-startTime, (ylims[1]-ylims[0])*y + ylims[0]), bbox=dict(facecolor='grey', edgecolor='white'), c='white')
if onsetPath is not None and cyclePath is None: # add a legend only if onsets are drawn
ax.legend()
return ax
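# Usage sketch (hypothetical file names): overlays onset annotations from two columns
# of 'Data/onsets.csv' on a waveform plot, one colour per onset column.
def _exampleDrawAnnotation():
    fig, ax = plt.subplots(figsize=(14, 3))
    audio, sr = librosa.load('Data/audio.wav', sr=16000, offset=10, duration=20)
    waveshow(audio, sr=sr, ax=ax)
    drawAnnotation(onsetPath='Data/onsets.csv', onsetTimeKeyword=['Inst', 'Tabla'],
                   onsetLabelKeyword=['Label', 'Label'], startTime=10, duration=20,
                   ax=ax, c=['purple', 'green'])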
def pitchContour(audio=None, sr=16000, audioPath=None, startTime=0, duration=None, minPitch=98, maxPitch=660, notes=None, tonic=220, timeStep=0.01, octaveJumpCost=0.9, veryAccurate=True, ax=None, freqXlabels=5, annotate=False, cyclePath=None, numDiv=0, onsetPath=None, onsetTimeKeyword='Inst', onsetLabelKeyword='Label', xticks=False, yticks=False, xlabel=True, ylabel=True, title='Pitch Contour (Cents)', annotLabel=True, cAnnot='purple', ylim=None, annotAlpha=0.8):
'''Returns pitch contour (in cents) for the audio
Uses `plotPitch` to plot pitch contour if ax is not None.
Parameters
audio (np.ndarray): loaded audio time series
sr (int): sample rate of audio time series/ to load the audio at
audioPath (str): path to audio file; only needed if audio is None
startTime (float): time to start reading audio file
duration (float): duration of the audio file to read
minPitch (float): minimum pitch to read for contour extraction
maxPitch (float): maximum pitch to read for contour extraction
notes (list): list of dicts, one per note in the raga, with keys 'cents' and 'label'; used to label the y-axis
tonic (float): tonic of the audio
timeStep (float): time steps in which audio is extracted
octaveJumpCost (float): parameter passed to pitch detection function
veryAccurate (bool): parameter passed to pitch detection function
ax (matplotlib.axes.Axes): axis to plot the pitch contour in
freqXlabels (float): time (in seconds) after which each x label occurs
annotate (bool): if True, will annotate tala markings
cyclePath (str): path to file with tala cycle annotations
numDiv (int): number of divisions to put between each annotation marking
onsetPath (str): path to file with onset annotations; only considered if cyclePath is None
onsetTimeKeyword (str): column name in the onset file to take onsets from
onsetLabelKeyword (str): column name with labels for the onsets; if None, no label will be printed
xticks (bool): if True, will plot xticklabels
yticks (bool): if True, will plot yticklabels
xlabel (bool): if True, will print xlabel
ylabel (bool): if True, will print ylabel
title (str): Title to add to the plot
annotLabel (bool): if True, will print annotation label along with line; used only if annotate is True
cAnnot (str): colour to draw annotation lines in; passed to the colour (c) parameter of ax.annotate
ylim (tuple): (min, max) limits for the y axis; if None, will be directly interpreted from the data
annotAlpha (float): controls opacity of the annotation lines
Returns:
ax (matplotlib.axes.Axes): plot of pitch contour if ax was not None
(pitchvals, timevals): tuple with arrays of pitch values (in cents) and time stamps; returned if ax was None
'''
startTime = math.floor(startTime) # set start time to an integer, for better readability on the x axis of the plot
if audio is None:
# if audio is not given, load audio from audioPath
audio, sr = librosa.load(audioPath, sr=sr, mono=True, offset=startTime, duration=duration)
if duration is None:
duration = librosa.get_duration(y=audio, sr=sr)
duration = math.floor(duration) # set duration to an integer, for better readability on the x axis of the plot
audio = audio[:int(duration*sr)] # ensure that audio length = duration
snd = parselmouth.Sound(audio, sr)
pitch = snd.to_pitch_ac(time_step=timeStep, pitch_floor=minPitch, very_accurate=veryAccurate, octave_jump_cost=octaveJumpCost, pitch_ceiling=maxPitch)
pitchvals = pitch.selected_array['frequency']
pitchvals[pitchvals==0] = np.nan # mark unvoiced regions as np.nan
if tonic is None: raise Exception('No tonic provided')
pitchvals[~(np.isnan(pitchvals))] = 1200*np.log2(pitchvals[~(np.isnan(pitchvals))]/tonic) #convert Hz to cents
timevals = pitch.xs()
if ax is None:
warnings.warn('ax not provided; returning pitch and time values')
return (pitchvals, timevals)
else:
# plot the contour
return plotPitch(pitchvals, timevals, notes, ax, tonic, startTime, duration, freqXlabels, annotate=annotate, cyclePath=cyclePath, numDiv=numDiv, onsetPath=onsetPath, onsetTimeKeyword=onsetTimeKeyword, onsetLabelKeyword=onsetLabelKeyword, xticks=xticks, yticks=yticks, xlabel=xlabel, ylabel=ylabel, title=title, cAnnot=cAnnot, annotLabel=annotLabel, ylim=ylim, annotAlpha=annotAlpha)
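# Usage sketch (hypothetical path and tonic): extracts the contour without plotting
# (ax=None), then plots it via plotPitch, defined below.
def _examplePitchContour():
    pitchvals, timevals = pitchContour(audioPath='Data/audio.wav', startTime=10, duration=20,
                                       tonic=220, ax=None)
    fig, ax = plt.subplots(figsize=(14, 4))
    plotPitch(pitchvals, timevals, ax=ax, startTime=10, duration=20)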
def plotPitch(pitchvals=None, timevals=None, notes=None, ax=None, tonic=None, startTime=0, duration=None, freqXlabels=5, xticks=True, yticks=True, xlabel=True, ylabel=True, title='Pitch Contour (Cents)', annotate=False, cyclePath=None, numDiv=0, onsetPath=None, onsetTimeKeyword='Inst', onsetLabelKeyword='Label', cAnnot='purple', annotLabel=True, ylim=None, annotAlpha=0.8, yAnnot=0.7, sizeAnnot=10):
'''Plots the pitch contour
Parameters
pitchvals (np.ndarray): pitch values in cents
timevals (np.ndarray): time values in seconds
notes (list): list of dicts (keys 'cents' and 'label'), one per note, used for labelling the y-axis
ax (matplotlib.Axes.axes): axis object on which plot is to be plotted
tonic (float): tonic (in Hz) of audio clip
startTime (float): start time for x labels in the plot
duration (float): duration of audio in the plot (used for x labels)
freqXlabels (int): time (in seconds) after which each x label occurs
xticks (bool): if True, will print x tick labels
yticks (bool): if True, will print y tick labels
xlabel (bool): if True, will add label to x axis
ylabel (bool): if True, will add label to y axis
title (str): title to add to the plot
annotate (bool): if True, will add beat annotations to the plot
cyclePath (str): path to file with cycle annotations; used only if annotate is True
numDiv (int): number of divisions to add between each marked cycle; used only if annotate is True
onsetPath (str): path to file with onset annotations; only considered if cyclePath is None
onsetTimeKeyword (str): column name in the onset file to take onsets from
onsetLabelKeyword (str): column name with labels for the onsets; if None, no label will be printed
cAnnot: input to the ax.annotate function for the colour (c) parameter; used only if annotate is True
annotLabel (bool): if True, will print annotation label along with line; used only if annotate is True
ylim (tuple): (min, max) limits for the y axis; if None, will be directly interpreted from the data
annotAlpha (float): controls opacity of the line drawn
yAnnot (float): float value from [0, 1] indicating where the label should occur on the y-axis. 0 indicates the lower ylim, 1 indicates the higher ylim.
sizeAnnot (int): font size for annotated text
Returns
ax: plotted axis
'''
# Check that all required parameters are present
if pitchvals is None:
raise Exception('No pitch contour provided')
if timevals is None:
warnings.warn('No time values provided, assuming 0.01 s time steps in pitch contour')
timevals = np.arange(len(pitchvals))*0.01 # compute from the index to avoid a length mismatch from floating point arange
if ax is None:
raise Exception('ax parameter has to be provided')
ax = sns.lineplot(x=timevals, y=pitchvals, ax=ax)
ax.set(xlabel='Time Stamp (s)' if xlabel else '',
ylabel='Notes' if ylabel else '',
title=title,
xlim=(0, duration),
xticks=np.around(np.arange(math.ceil(startTime)-startTime, duration, freqXlabels)).astype(int), # start the xticks such that each one corresponds to an integer with xticklabels
xticklabels=np.around(np.arange(startTime, duration+startTime, freqXlabels) ).astype(int) if xticks else [])
if notes is not None and yticks:
# add yticks with note labels for notes that lie within the pitch range
voiced = pitchvals[~np.isnan(pitchvals)]
notesInRange = [note for note in notes if min(voiced) <= note['cents'] <= max(voiced)]
ax.set(
yticks=[note['cents'] for note in notesInRange],
yticklabels=[note['label'] for note in notesInRange])
if ylim is not None:
ax.set(ylim=ylim)
if annotate:
ax = drawAnnotation(cyclePath, onsetPath, onsetTimeKeyword, onsetLabelKeyword, numDiv, startTime, duration, ax, c=cAnnot, annotLabel=annotLabel, alpha=annotAlpha, y=yAnnot, size=sizeAnnot)
return ax
def spectrogram(audio=None, sr=16000, audioPath=None, startTime=0, duration=None, winSize=0.04, hopSize=0.01, n_fft=None, cmap='Blues', ax=None, amin=1e-5, freqXlabels=5, xticks=False, yticks=False, xlabel=True, ylabel=True, title='Spectrogram', annotate=False, cyclePath=None, numDiv=0, onsetPath=None, onsetTimeKeyword='Inst', onsetLabelKeyword='Label', cAnnot='purple', annotLabel=True):
'''Plots spectrogram
Parameters
audio (np.ndarray): loaded audio time series
sr (int): sample rate that audio time series is loaded/ is to be loaded in
audioPath (str): path to the audio file; only needed if audio is None
startTime (float): time to start reading the audio at
duration (float): duration of audio
winSize (float): size of window for STFT in seconds
hopSize (float): size of hop for STFT in seconds
n_fft (int): DFT size
cmap (matplotlib.colors.Colormap or str): colormap to use to plot spectrogram
ax (plt.Axes.axes): axis to plot spectrogram in
amin (float): controls the contrast of the spectrogram; passed into librosa.power_to_db function
freqXlabels (float): time (in seconds) after which each x label occurs
xticks (bool): if true, will print x labels
yticks (bool): if true, will print y labels
xlabel (bool): if true, will add an xlabel
ylabel (bool): if true, will add a ylabel
title (str): title for the plot
annotate (bool): if True, will annotate either tala or onset markings; if both are provided, tala annotations will be marked
cyclePath (str): path to file with tala cycle annotations
numDiv (int): number of divisions to put between each tala annotation marking
onsetPath (str): path to file with onset annotations; only considered if cyclePath is None
onsetTimeKeyword (str): column name in the onset file to take onsets from
onsetLabelKeyword (str): column name with labels for the onsets; if None, no label will be printed
cAnnot: input to the ax.annotate function for the colour (c) parameter; used only if annotate is True
annotLabel (bool): if True, will print annotation label along with line; used only if annotate is True
'''
if ax is None:
raise Exception('ax parameter has to be provided')
startTime = math.floor(startTime) # set start time to an integer, for better readability on the x axis of the plot
if audio is None:
audio, sr = librosa.load(audioPath, sr=sr, mono=True, offset=startTime, duration=duration)
if duration is None:
duration = librosa.get_duration(y=audio, sr=sr)
duration = math.floor(duration) # set duration to an integer, for better readability on the x axis of the plot
audio = audio[:int(duration*sr)] # ensure that audio length = duration
# convert winSize and hopSize from seconds to samples
winSize = int(np.ceil(sr*winSize))
hopSize = int(np.ceil(sr*hopSize))
if n_fft is None:
n_fft = int(2**np.ceil(np.log2(winSize)))
# STFT
f,t,X = sig.stft(audio, fs=sr, window='hann', nperseg=winSize, noverlap=(winSize-hopSize), nfft=n_fft)
X_dB = librosa.power_to_db(np.abs(X)**2, ref=np.max, amin=amin) # square the magnitude STFT to obtain a power spectrogram before converting to dB
specshow(X_dB, x_axis='time', y_axis='linear', sr=sr, fmax=sr//2, hop_length=hopSize, ax=ax, cmap=cmap)
ax.set(ylabel='Frequency (Hz)' if ylabel else '',
xlabel='Time (s)' if xlabel else '',
title=title,
xlim=(0, duration),
xticks=np.around(np.arange(math.ceil(startTime)-startTime, duration, freqXlabels)).astype(int), # start the xticks such that each one corresponds to an integer with xticklabels
xticklabels=np.around(np.arange(startTime, duration+startTime, freqXlabels) ).astype(int) if xticks else [],
ylim=(0, 5000),
yticks=[0, 2e3, 4e3] if yticks else [],
yticklabels=['0', '2k', '4k'] if yticks else [])
if annotate:
ax = drawAnnotation(cyclePath, onsetPath, onsetTimeKeyword, onsetLabelKeyword, numDiv, startTime, duration, ax, c=cAnnot, annotLabel=annotLabel)
return ax
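# Usage sketch (hypothetical paths): spectrogram with tala cycle annotations marked.
def _exampleSpectrogram():
    fig, ax = plt.subplots(figsize=(14, 4))
    spectrogram(audioPath='Data/audio.wav', startTime=10, duration=20, ax=ax,
                xticks=True, yticks=True, annotate=True, cyclePath='Data/cycles.csv', numDiv=4)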
def drawWave(audio=None, sr=16000, audioPath=None, startTime=0, duration=None, ax=None, xticks=False, yticks=True, xlabel=True, ylabel=True, freqXlabels=5, annotate=False, cyclePath=None, numDiv=0, onsetPath=None, cAnnot='purple', annotLabel=True, odf=False, winSize_odf=0.4, hopSize_odf=0.01, nFFT_odf=1024, source_odf='vocal', cOdf='black', title='Waveform'):
'''Plots the wave plot of the audio
audio (np.ndarray): loaded audio time series
sr (int): sample rate that audio time series is loaded/ is to be loaded in
audioPath (str): path to the audio file
startTime (float): time to start reading the audio at
duration (float): duration of audio to load
ax (plt.Axes.axes): axis to plot waveshow in
xticks (bool): if True, will plot xticklabels
yticks (bool): if True, will plot yticklabels
xlabel (bool): if True, will add a x label
ylabel (bool): if True will add a y label
freqXlabels (float): time (in seconds) after which each x label occurs
annotate (bool): if True, will annotate tala markings
cyclePath (str): path to file with tala cycle annotations
numDiv (int): number of divisions to put between each annotation marking
onsetPath (str): path to file with onset annotations; only considered if cyclePath is None
cAnnot: colour for the annotation marking; input to the ax.annotate function for the colour (c) parameter; used only if annotate is True
annotLabel (bool): if True, will print annotation label along with line; used only if annotate is True
odf (bool): if True, will plot the onset detection function over the wave form
winSize_odf (float): window size in seconds, fed to the onset detection function; valid only if odf is true
hopSize_odf (float): hop size in seconds, fed to the onset detection function; valid only if odf is true
nFFT_odf (int): size of DFT used in onset detection function; valid only if odf is true
source_odf (str): type of instrument - vocal or pakhawaj, fed to odf; valid only if odf is true
cOdf: colour to plot onset detection function in; valid only if odf is true
title (str): title of the plot
'''
if ax is None:
raise Exception('ax parameter has to be provided')
startTime = math.floor(startTime) # set start time to an integer, for better readability on the x axis of the plot
if audio is None:
audio, sr = librosa.load(audioPath, sr=sr, offset=startTime, duration=duration)
if duration is None:
duration = librosa.get_duration(y=audio, sr=sr)
duration = math.floor(duration) # set duration to an integer, for better readability on the x axis of the plot
audio = audio[:int(duration*sr)] # ensure that audio length = duration
waveshow(audio, sr=sr, ax=ax)
if odf:
plotODF(audio=audio, sr=sr, startTime=0, duration=None, ax=ax, winSize_odf=winSize_odf, hopSize_odf=hopSize_odf, nFFT_odf=nFFT_odf, source_odf=source_odf, cOdf=cOdf, ylim=True)
ax.set(xlabel='' if not xlabel else 'Time (s)',
ylabel = '' if not ylabel else 'Amplitude',
xlim=(0, duration),
xticks=[] if not xticks else np.around(np.arange(0, duration, freqXlabels)),
xticklabels=[] if not xticks else np.around(np.arange(startTime, duration+startTime, freqXlabels), 2),
yticks=[] if not yticks else np.around(np.linspace(min(audio), max(audio), 3), 2),
yticklabels=[] if not yticks else np.around(np.linspace(min(audio), max(audio), 3), 2),
title=title)
if annotate:
ax = drawAnnotation(cyclePath=cyclePath, onsetPath=onsetPath, numDiv=numDiv, startTime=startTime, duration=duration, ax=ax, c=cAnnot, annotLabel=annotLabel)
return ax
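# Usage sketch (hypothetical path): waveform with the vocal onset detection function
# overlaid on the same axis via odf=True.
def _exampleDrawWave():
    fig, ax = plt.subplots(figsize=(14, 3))
    drawWave(audioPath='Data/audio.wav', startTime=10, duration=20, ax=ax,
             xticks=True, odf=True, source_odf='vocal', cOdf='black')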
def plotODF(audio=None, sr=16000, audioPath=None, startTime=0, duration=None, ax=None, winSize_odf=0.4, hopSize_odf=0.01, nFFT_odf=1024, source_odf='vocal', cOdf='black', freqXlabels=5, ylim=True, xlabel=False, ylabel=False, xticks=False, yticks=False, title='Onset Detection Function'):
'''
Plots the onset detection function (ODF) if ax is provided. If ax is None, returns a tuple of two arrays: the onset detection function values and the corresponding time stamps
Parameters
audio (np.ndarray): loaded audio time series
sr (int): sample rate that audio time series is loaded/ is to be loaded in
audioPath (str): path to the audio file
startTime (float): time to start reading the audio at
duration (float): duration of audio to load
ax (plt.Axes.axes): axis to plot waveshow in
winSize_odf (float): window size in seconds, fed to the onset detection function
hopSize_odf (float): hop size in seconds, fed to the onset detection function
nFFT_odf (int): size of DFT used in onset detection function
source_odf (str): type of instrument - vocal or pakhawaj, fed to odf
cOdf: colour to plot onset detection function in
freqXlabels (float): time (in seconds) after which each x label occurs
ylim (bool): if True, will reset the ylim to the range of the output of the ODF function; this is added because when the ODF is plotted over another plot, say the waveform, it is easier to see if the ylim is readjusted
xticks (bool): if True, will plot xticklabels
yticks (bool): if True, will plot yticklabels
xlabel (bool): if True, will add a x label
ylabel (bool): if True will add a y label
title (str): title of the plot
Returns
ax (matplotlib.Axes.axes): if ax is not None, returns a plot
(odf_vals, time_vals): if ax is None, returns a tuple with ODF values and time stamps.
'''
startTime = math.floor(startTime) # set start time to an integer, for better readability on the x axis of the plot
if audio is None:
audio, sr = librosa.load(audioPath, sr=sr, offset=startTime, duration=duration)
if duration is None:
duration = librosa.get_duration(y=audio, sr=sr)
duration = math.floor(duration) # set duration to an integer, for better readability on the x axis of the plot
audio = audio[:int(duration*sr)] # ensure that audio length = duration
odf_vals, _, _ = getOnsetActivation(x=audio, audioPath=None, startTime=startTime, endTime=duration+startTime, fs=sr, winSize=winSize_odf, hopSize=hopSize_odf, nFFT=nFFT_odf, source=source_odf)
# set time and odf values in variables
time_vals = np.arange(0, duration, hopSize_odf)
odf_vals = odf_vals[:-1] # disregard the last frame of odf_vals since it is centered around the frame at time stamp 'duration'
if ax is None:
# if ax is None, return (odf_vals, time_vals)
return (odf_vals, time_vals)
else:
ax.plot(time_vals, odf_vals, c=cOdf) # plot the ODF values; the last frame was already dropped above
max_abs_val = max(abs(min(odf_vals)), abs(max(odf_vals))) # find maximum value to set y limits to ensure symmetrical plot
# set ax parameters only if they are not None
ax.set(xlabel='' if not xlabel else 'Time (s)',
ylabel = '' if not ylabel else 'ODF',
xlim=(0, duration),
xticks=[] if not xticks else np.around(np.arange(0, duration, freqXlabels)),
xticklabels=[] if not xticks else np.around(np.arange(startTime, duration+startTime, freqXlabels), 2),
yticks=[] if not yticks else np.around(np.linspace(min(odf_vals), max(odf_vals), 3), 2),
yticklabels=[] if not yticks else np.around(np.linspace(min(odf_vals), max(odf_vals), 3), 2),
ylim= ax.get_ylim() if not ylim else (-max_abs_val, max_abs_val),
title=title)
return ax
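# Usage sketch (hypothetical path): computes the ODF without plotting (ax=None); the
# returned time stamps are spaced hopSize_odf (10 ms by default) apart.
def _examplePlotODF():
    odf_vals, time_vals = plotODF(audioPath='Data/audio.wav', startTime=10, duration=20, ax=None)
    print(len(odf_vals), len(time_vals))  # equal lengths, one value per hop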
def plotEnergy(audio=None, sr=16000, audioPath=None, startTime=0, duration=None, ax=None, xticks=False, freqXlabels=5, annotate=False, cyclePath=None, numDiv=0, onsetPath=None, cAnnot='purple', annotLabel=True, winSize_odf=0.4, hopSize_odf=0.01, nFFT_odf=1024, source_odf='vocal', cOdf='black'):
'''
For debugging purposes only - plots the energy function used to calculate the odf
Parameters
audio: loaded audio time series
sr: sample rate that audio time series is loaded/ is to be loaded in
audioPath: path to the audio file
startTime: time to start reading the audio at
duration: duration of audio to load
ax: axis to plot waveshow in
xticks: if True, will plot xticklabels
freqXlabels: time (in seconds) after which each x label occurs
annotate: if True, will annotate tala markings
cyclePath: path to file with tala cycle annotations
numDiv: number of divisions to put between each annotation marking
onsetPath: path to file with onset annotations; only considered if cyclePath is None
cAnnot: colour for the annotation marking
annotLabel: if True, will print annotation label along with line; used only if annotate is True
winSize_odf: window size in seconds, fed to the onset detection function
hopSize_odf: hop size in seconds, fed to the onset detection function
nFFT_odf: size of DFT used in onset detection function
source_odf: type of instrument - vocal or pakhawaj, fed to the odf
cOdf: colour to plot the energy contour in
'''
if ax is None:
raise Exception('ax parameter has to be provided')
startTime = math.floor(startTime) # set start time to an integer, for better readability on the x axis of the plot
if audio is None:
audio, sr = librosa.load(audioPath, sr=sr, offset=startTime, duration=duration)
audio /= np.max(np.abs(audio))
if duration is None:
duration = librosa.get_duration(y=audio, sr=sr)
duration = math.floor(duration) # set duration to an integer, for better readability on the x axis of the plot
audio = audio[:int(duration*sr)] # ensure that audio length = duration
ax.set(xlabel='' if not xticks else 'Time (s)',
xlim=(0, duration),
xticks=[] if not xticks else np.around(np.arange(0, duration, freqXlabels)),
xticklabels=[] if not xticks else np.around(np.arange(startTime, duration+startTime, freqXlabels), 2),
title='Energy Contour',
ylabel='dB')
_, _, energy = getOnsetActivation(x=audio, audioPath=None, startTime=0, endTime=duration, fs=sr, winSize=winSize_odf, hopSize=hopSize_odf, nFFT=nFFT_odf, source=source_odf)
ax.plot(np.arange(0, duration, hopSize_odf), energy[:-1], c=cOdf)
if annotate:
ax = drawAnnotation(cyclePath=cyclePath, onsetPath=onsetPath, numDiv=numDiv, startTime=startTime, duration=duration, ax=ax, c=cAnnot, annotLabel=annotLabel)
return ax
def playAudio(audio=None, sr=16000, audioPath=None, startTime=0, duration=None):
'''Plays relevant part of audio
Parameters
audio: loaded audio sample
sr: sample rate of audio; valid only if audio is not None
audioPath: path to audio file
startTime: time to start reading audio at
duration: duration of the audio to load
Returns:
iPython.display.Audio object that plays the audio
'''
if audio is None:
audio, sr = librosa.load(audioPath, sr=None, offset=startTime, duration=duration)
return Audio(audio, rate=sr)
def playAudioWClicks(audio=None, sr=16000, audioPath=None, startTime=0, duration=None, onsetFile=None, onsetLabels=['Inst', 'Tabla'], destPath=None):
'''Plays relevant part of audio along with clicks at timestamps in onsetTimes
Parameters
audio (np.ndarray): loaded audio sample
sr (float): sample rate of audio
audioPath (str): path to audio file
startTime (float): time to start reading audio at
duration (float): duration of the audio to load
onsetFile (str): file path to onset values
onsetLabels (list): column names in onsetFile to mark with clicks
destPath (str): path to save audio file at; if None, will not save any audio file
Returns:
iPython.display.Audio object that plays the audio
'''
if audio is None:
audio, sr = librosa.load(audioPath, sr=None, offset=startTime, duration=duration)
if duration is None:
duration = librosa.get_duration(y=audio, sr=sr)
onsetFileVals = pd.read_csv(onsetFile)
onsetTimes = []
for onsetLabel in onsetLabels:
onsetTimes.append(onsetFileVals.loc[(onsetFileVals[onsetLabel] >= startTime) & (onsetFileVals[onsetLabel] <= startTime+duration), onsetLabel].values)
clickTracks = [librosa.clicks(times=onsetTime-startTime, sr=sr, length=len(audio), click_freq=1000*(2*i+1)) for i, onsetTime in enumerate(onsetTimes)] # one click track per onset label, with a distinct click frequency per label
audioWClicks = 0.8*audio # add clicks to this variable
for clickTrack in clickTracks:
audioWClicks += 0.2/len(clickTracks)*clickTrack
if destPath is not None:
sf.write(destPath, audioWClicks, sr)
return Audio(audioWClicks, rate=sr)
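# Usage sketch (hypothetical paths): plays 20 s of audio with 1 kHz clicks at 'Inst'
# onsets and 3 kHz clicks at 'Tabla' onsets, saving the click-mixed audio to disk.
def _examplePlayAudioWClicks():
    return playAudioWClicks(audioPath='Data/audio.wav', startTime=10, duration=20,
                            onsetFile='Data/onsets.csv', onsetLabels=['Inst', 'Tabla'],
                            destPath='Data/Temp/audioWClicks.wav')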
def playVideo(video=None, videoPath=None, startTime=0, duration=None, destPath='Data/Temp/VideoPart.mp4', videoOffset=0):
'''Plays relevant part of a video
Parameters
video (np.ndarray): loaded video sample
videoPath (str): path to video file
startTime (float): time to start reading the video from
duration (float): duration of the video to load
destPath (str): path to store shortened video
videoOffset (float): number of seconds of offset between video and audio; time in audio + videoOffset = time in video
Returns:
iPython.display.Video object that plays the video
'''
if video is None:
if duration is None and startTime == 0:
# play the entire video
return Video(videoPath, embed=True)
else:
# store a shortened video in destPath
vid = ffmpeg.input(videoPath)
joined = ffmpeg.concat(
vid.video.filter('trim', start=startTime+videoOffset, duration=duration).filter('setpts', 'PTS-STARTPTS'),
vid.audio.filter('atrim', start=startTime+videoOffset, duration=duration).filter('asetpts', 'PTS-STARTPTS'),
v=1,
a=1
).node
v3 = joined['v']
a3 = joined['a']
out = ffmpeg.output(v3, a3, destPath).overwrite_output()
out.run()
return Video(destPath, embed=True)
else:
return Video(data=video, embed=True)
def generateFig(noRows, figSize=(14, 7), heightRatios=None):
'''Generates a matplotlib.pyplot.figure and axes to plot in
Axes in the plot are stacked vertically in one column, with height of each axis determined by heightRatios
Parameters
noRows: number of rows in the figure
figSize: (width, height) in inches of the figure
heightRatios: list of the fraction of height that each axis should take; len(heightRatios) has to be equal to noRows
Returns:
fig: figure object
axs: list of axes objects
'''
if heightRatios is None:
heightRatios = np.ones(noRows)
if len(heightRatios) != noRows:
raise Exception("Length of heightRatios has to be equal to noRows")
fig = plt.figure(figsize=figSize)
specs = fig.add_gridspec(noRows, 1, height_ratios = heightRatios)
axs = [fig.add_subplot(specs[i, 0]) for i in range(noRows)]
return fig, axs
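# Usage sketch (hypothetical path): a two-row figure with the spectrogram three times
# as tall as the waveform below it, built with the helpers defined above.
def _exampleGenerateFig():
    fig, axs = generateFig(noRows=2, figSize=(14, 6), heightRatios=[3, 1])
    spectrogram(audioPath='Data/audio.wav', startTime=10, duration=20, ax=axs[0], yticks=True)
    drawWave(audioPath='Data/audio.wav', startTime=10, duration=20, ax=axs[1], xticks=True)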
def to_dB(x, C):
'''Applies logarithmic (base 10) transformation
Parameters
x: input signal
C: scaling constant
Returns
log-scaled x
'''
return np.log10(1 + x*C)/(np.log10(1+C))
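# Quick numerical check: to_dB maps 0 -> 0 and 1 -> 1 for any C, compressing
# intermediate values more strongly as C grows.
def _exampleToDB():
    x = np.linspace(0, 1, 5)
    print(to_dB(x, 100))  # approx. [0, 0.71, 0.85, 0.94, 1.0]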
def subBandEner(X,fs,band):
'''Computes spectral sub-band energy (suitable for vocal onset detection)
Parameters
X: STFT of an audio signal x
fs: sampling rate
band: edge frequencies (in Hz) of the sub-band of interest
Returns
sbe: array with each value representing the magnitude STFT values in a short-time frame squared & summed over the sub-band
'''
binLow = int(np.ceil(band[0]*X.shape[0]/(fs/2)))
binHi = int(np.ceil(band[1]*X.shape[0]/(fs/2)))
sbe = np.sum(np.abs(X[binLow:binHi])**2, 0)
return sbe
def biphasicDerivative(x, tHop, norm=1, rectify=1):
'''Computes a biphasic derivative (See [1] for a detailed explanation of the algorithm)
Parameters
x: input signal
tHop: frame- or hop-length used to obtain input signal values (reciprocal of sampling rate of x)
norm: if output is to be normalized
rectify: if output is to be rectified to keep only positive values (sufficient for peak-picking)
Returns
x: after performing the biphasic derivative of input x (i.e, convolving with a biphasic derivative filter)
'''
n = np.arange(-0.1, 0.1, tHop)
tau1 = 0.015 # = (1/(T_1*sqrt(2))) || -ve lobe width
tau2 = 0.025 # = (1/(T_2*sqrt(2))) || +ve lobe width
d1 = 0.02165 # -ve lobe position
d2 = 0.005 # +ve lobe position
A = np.exp(-pow((n-d1)/(np.sqrt(2)*tau1), 2))/(tau1*np.sqrt(2*np.pi))
B = np.exp(-pow((n+d2)/(np.sqrt(2)*tau2), 2))/(tau2*np.sqrt(2*np.pi))
biphasic = A-B
x = np.convolve(x, biphasic, mode='same')
x = -1*x
if norm==1:
x/=np.max(x)
x-=np.mean(x)
if rectify==1:
x*=(x>0)
return x
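# Usage sketch: the biphasic derivative turns a smoothed step (a gradual energy rise)
# into a single positive pulse near the step, which suits peak-picking for onsets.
def _exampleBiphasicDerivative():
    tHop = 0.01  # 10 ms hop, matching the ODF frame rate used elsewhere in this module
    step = np.concatenate([np.zeros(100), np.ones(100)])
    smooth = np.convolve(step, np.hanning(21)/np.hanning(21).sum(), mode='same')
    pulse = biphasicDerivative(smooth, tHop)
    print(np.argmax(pulse))  # peak of the pulse, close to the step at index 100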
def getOnsetActivation(x=None, audioPath=None, startTime=0, endTime=None, fs=16000, winSize=0.4, hopSize=0.01, nFFT=1024, source='vocal'):
'''Computes onset activation function
Parameters
x: audio signal array
audioPath: path to the audio file
startTime: time to start reading the audio at
endTime: time to stop reading audio at
fs: sampling rate to read audio at
winSize: window size in seconds for STFT
hopSize: hop size in seconds for STFT
nFFT: DFT size
source: choice of instrument - vocal or pakhawaj
Returns
odf: the frame-wise onset activation function (at a sampling rate of 1/hopSize)
onsets: time locations of detected onset peaks in the odf (peaks detected using the peak picker from librosa)
energy: the energy/flux contour from which the odf is derived (before the biphasic derivative is applied)
'''
winSize = int(np.ceil(winSize*fs))
hopSize = int(np.ceil(hopSize*fs))
nFFT = int(2**(np.ceil(np.log2(winSize)))) # the nFFT argument is overridden with the next power of 2 >= winSize
if x is not None:
x = fadeIn(x,int(0.5*fs))
x = fadeOut(x,int(0.5*fs))
x = x[int(np.ceil(startTime*fs)):int(np.ceil(endTime*fs))]
elif audioPath is not None:
x, _ = librosa.load(audioPath, sr=fs, offset=startTime, duration=endTime-startTime)
else:
raise ValueError('Provide either the audio signal or the path to the stored audio file on disk')
X,_ = librosa.magphase(librosa.stft(x,win_length=winSize, hop_length=hopSize, n_fft=nFFT))
if source=='vocal':
sub_band = [600,2500]
odf = subBandEner(X, fs, sub_band)
odf = to_dB(odf, 100)
energy = odf.copy()
odf = biphasicDerivative(odf, hopSize/fs, norm=1, rectify=1)
onsets = librosa.onset.onset_detect(onset_envelope=odf.copy(), sr=fs, hop_length=hopSize, pre_max=4, post_max=4, pre_avg=6, post_avg=6, wait=50, delta=0.12)*hopSize/fs
else:
sub_band = [0,fs/2]
odf = spectralFlux(X, fs, sub_band, aMin=1e-4, normalize=True)
energy = odf.copy()
odf = biphasicDerivative(odf, hopSize/fs, norm=1, rectify=1) # hopSize is in samples here, so convert to seconds as expected by biphasicDerivative
onsets = librosa.onset.onset_detect(onset_envelope=odf, sr=fs, hop_length=hopSize, pre_max=1, post_max=1, pre_avg=1, post_avg=1, wait=10, delta=0.05)*hopSize/fs
return odf, onsets, energy
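# Usage sketch (hypothetical path): vocal onset activation and detected onset times
# (in seconds, relative to startTime).
def _exampleGetOnsetActivation():
    odf, onsets, energy = getOnsetActivation(audioPath='Data/audio.wav', startTime=10,
                                             endTime=30, fs=16000, source='vocal')
    print(onsets)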
def spectralFlux(X, fs, band, aMin=1e-4, normalize=True):
'''Computes 1st order rectified spectral flux (difference) of a given STFT input
Parameters
X: input STFT matrix
fs: sampling rate of audio signal
band: frequency band over which to compute flux from STFT (sub-band spectral flux)
aMin: lower threshold value to prevent log(0)
normalize: whether to normalize output before returning
Returns
specFlux: array with frame-wise spectral flux values
'''
X = 20*np.log10(aMin+abs(X)/np.max(np.abs(X)))
binLow = int(band[0]*X.shape[0]/(fs/2))
binHi = int(band[1]*X.shape[0]/(fs/2))
specFlux = np.array([0])
for hop in range(1,X.shape[1]):
diff = X[binLow:binHi,hop]-X[binLow:binHi,hop-1]
diff = (diff + abs(diff))/2
specFlux=np.append(specFlux,sum(diff))
if normalize:
specFlux/=max(specFlux)
return specFlux
def fadeIn(x, length):
'''Applies a half-Hanning fade-in over the first `length` samples of x'''
fade_func = np.ones(len(x))
fade_func[:length] = np.hanning(2*length)[:length]
x*=fade_func
return x
def fadeOut(x, length):
'''Applies a half-Hanning fade-out over the last `length` samples of x'''
fade_func = np.ones(len(x))
fade_func[-length:] = np.hanning(2*length)[length:]
x*=fade_func
return x
def ACF_DFT_sal(signal, t_ACF_lag, t_ACF_frame, t_ACF_hop, fs):
'''Computes the frame-wise autocorrelation function (ACF), magnitude DFT and salience (maximum ACF value per frame) of a signal
Parameters
signal: input signal array
t_ACF_lag: maximum ACF lag in seconds
t_ACF_frame: frame length in seconds
t_ACF_hop: hop length in seconds
fs: sampling rate of the signal
Returns
(ACF, DFT, sal): tuple with the frame-wise normalised ACF matrix, normalised magnitude DFT matrix and salience values
'''
n_ACF_lag = int(t_ACF_lag*fs)
n_ACF_frame = int(t_ACF_frame*fs)
n_ACF_hop = int(t_ACF_hop*fs)
signal = subsequences(signal, n_ACF_frame, n_ACF_hop)
ACF = np.zeros((len(signal), n_ACF_lag))
for i in range(len(ACF)):
ACF[i][0] = np.dot(signal[i], signal[i])
for j in range(1, n_ACF_lag):
ACF[i][j] = np.dot(signal[i][:-j], signal[i][j:])
DFT = (abs(np.fft.rfft(signal)))
sal = np.zeros(len(ACF))
for i in range(len(ACF)):
sal[i] = max(ACF[i])
for i in range(len(ACF)):
if max(ACF[i])!=0:
ACF[i] = ACF[i]/max(ACF[i])
if max(DFT[i])!=0:
DFT[i] = DFT[i]/max(DFT[i])
return (ACF, DFT, sal)
def subsequences(signal, frame_length, hop_length):
'''Slices signal into overlapping frames of length frame_length with a hop of hop_length samples, as a zero-copy strided view'''
shape = (int(1 + (len(signal) - frame_length)/hop_length), frame_length)
strides = (hop_length*signal.strides[0], signal.strides[0])
return np.lib.stride_tricks.as_strided(signal, shape=shape, strides=strides)
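# Quick check: subsequences frames a short ramp into overlapping windows.
def _exampleSubsequences():
    x = np.arange(10).astype(float)
    print(subsequences(x, frame_length=4, hop_length=2))
    # rows: [0 1 2 3], [2 3 4 5], [4 5 6 7], [6 7 8 9]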
def plot_matrix(X, Fs=1, Fs_F=1, T_coef=None, F_coef=None, xlabel='Time (seconds)', ylabel='Frequency (Hz)', title='',
dpi=72, colorbar=True, colorbar_aspect=20.0, ax=None, figsize=(6, 3), **kwargs):
"""Plot a matrix, e.g. a spectrogram or a tempogram (function from Notebook: B/B_PythonVisualization.ipynb in [2])
Args:
X: The matrix
Fs: Sample rate for axis 1
Fs_F: Sample rate for axis 0
T_coef: Time coefficients. If None, will be computed, based on Fs.
F_coef: Frequency coefficients. If None, will be computed, based on Fs_F.
xlabel: Label for x axis
ylabel: Label for y axis
title: Title for plot
dpi: Dots per inch
colorbar: Create a colorbar.
colorbar_aspect: Aspect used for colorbar, in case only a single axes is used.
ax: Either (1.) a list of two axes (first used for matrix, second for colorbar), or (2.) a list with a single
axes (used for matrix), or (3.) None (an axes will be created).
figsize: Width, height in inches
**kwargs: Keyword arguments for matplotlib.pyplot.imshow
Returns:
fig: The created matplotlib figure or None if ax was given.
ax: The used axes.
im: The image plot
"""
fig = None
if ax is None:
fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
ax = [ax]
if T_coef is None:
T_coef = np.arange(X.shape[1]) / Fs
if F_coef is None:
F_coef = np.arange(X.shape[0]) / Fs_F
if 'extent' not in kwargs:
x_ext1 = (T_coef[1] - T_coef[0]) / 2
x_ext2 = (T_coef[-1] - T_coef[-2]) / 2
y_ext1 = (F_coef[1] - F_coef[0]) / 2
y_ext2 = (F_coef[-1] - F_coef[-2]) / 2
kwargs['extent'] = [T_coef[0] - x_ext1, T_coef[-1] + x_ext2, F_coef[0] - y_ext1, F_coef[-1] + y_ext2]
if 'cmap' not in kwargs:
kwargs['cmap'] = 'gray_r'
if 'aspect' not in kwargs:
kwargs['aspect'] = 'auto'
if 'origin' not in kwargs:
kwargs['origin'] = 'lower'
im = ax[0].imshow(X, **kwargs)
if len(ax) == 2 and colorbar:
plt.colorbar(im, cax=ax[1])
elif len(ax) == 2 and not colorbar:
ax[1].set_axis_off()
elif len(ax) == 1 and colorbar:
plt.sca(ax[0])
plt.colorbar(im, aspect=colorbar_aspect)
ax[0].set_xlabel(xlabel, fontsize=14)
ax[0].set_ylabel(ylabel, fontsize=14)
ax[0].set_title(title, fontsize=18)
if fig is not None:
plt.tight_layout()
return fig, ax, im
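# Usage sketch (hypothetical path): displays the frame-wise ACF from ACF_DFT_sal above
# as a time-lag image; Fs is the frame rate (1/t_ACF_hop), Fs_F converts lag samples to seconds.
def _examplePlotMatrix():
    audio, sr = librosa.load('Data/audio.wav', sr=16000, offset=10, duration=20)
    ACF, DFT, sal = ACF_DFT_sal(audio, t_ACF_lag=0.5, t_ACF_frame=1.0, t_ACF_hop=0.5, fs=sr)
    plot_matrix(ACF.T, Fs=2, Fs_F=sr, ylabel='Lag (s)', title='Frame-wise ACF')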
'''
References
[1] Rao, P., Vinutha, T.P. and Rohit, M.A., 2020. Structural Segmentation of Alap in Dhrupad Vocal Concerts.
Transactions of the International Society for Music Information Retrieval, 3(1), pp.137–152. DOI: http://doi.org/10.5334/tismir.64
[2] Meinard Müller and Frank Zalkow: FMP Notebooks: Educational Material for Teaching and Learning Fundamentals of Music Processing.
Proceedings of the International Conference on Music Information Retrieval (ISMIR), Delft, The Netherlands, 2019.
'''