Skip to content

Commit

Permalink
centralize pydub samples loading logic.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 340573401
  • Loading branch information
cghawthorne authored and Magenta Team committed Nov 4, 2020
1 parent e1d767d commit 05458cf
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 0 deletions.
44 changes: 44 additions & 0 deletions note_seq/audio_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import librosa
import numpy as np
import pydub
import scipy


Expand Down Expand Up @@ -49,6 +50,49 @@ def float_samples_to_int16(y):
return (y * np.iinfo(np.int16).max).astype(np.int16)


def wav_data_to_samples_pydub(wav_data: bytes,
                              sample_rate: int,
                              remove_dc_bias: bool = False,
                              num_channels: int = None,
                              normalize_db: float = None):
  """Convert audio file data (in bytes) into a numpy array using Pydub.

  The audio is optionally remixed to `num_channels`, DC-offset corrected and
  normalized (in that order), then resampled to `sample_rate` before being
  converted to floating point.

  Args:
    wav_data: A byte stream of audio data.
    sample_rate: Resample recorded audio to this sample rate.
    remove_dc_bias: If true, will remove DC bias from audio.
    num_channels: If not specified, output shape will be based on the contents
      of wav_data. Otherwise, will force to be 1 or 2 channels.
    normalize_db: Normalize the audio to this many decibels (forwarded to
      pydub's `normalize` as its `headroom` argument). Set to None to skip
      normalization step.

  Returns:
    A float32 array of the recorded audio at sample_rate, obtained by dividing
    the integer samples by the maximum value of their integer type. If mono,
    will be shape [samples], otherwise [channels, samples].
  """
  # Parse and normalize the audio.
  aseg = pydub.AudioSegment.from_file(io.BytesIO(wav_data))
  if num_channels:
    aseg = aseg.set_channels(num_channels)
  if remove_dc_bias:
    aseg = aseg.remove_dc_offset()
  if normalize_db is not None:
    # pydub effects return a new AudioSegment rather than modifying in place,
    # so the result must be reassigned or the normalization is silently lost.
    aseg = aseg.normalize(headroom=normalize_db)
  aseg = aseg.set_frame_rate(sample_rate)

  # Convert to numpy array, one row per channel.
  channel_asegs = aseg.split_to_mono()
  samples = [s.get_array_of_samples() for s in channel_asegs]
  fp_arr = np.array(samples).astype(np.float32)
  # Scale by the max of the underlying integer sample type (taken from the
  # array typecode) so the output does not depend on the source bit depth.
  fp_arr /= np.iinfo(samples[0].typecode).max

  # If only 1 channel, remove extra dim.
  if fp_arr.shape[0] == 1:
    fp_arr = fp_arr[0]

  return fp_arr


def wav_data_to_samples(wav_data, sample_rate):
"""Read PCM-formatted WAV data and return a NumPy array of samples.
Expand Down
23 changes: 23 additions & 0 deletions note_seq/audio_io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,29 @@ def testFloatWavDataToSamples(self):
wav_io.getvalue(), sample_rate=16000)
np.testing.assert_array_equal(y, y_from_float)

def testWavDataToSamplesPydub(self):
  """Checks output length and basic signal sanity of pydub-based decoding."""
  # Compute the expected resampled lengths from the WAV headers. The files
  # are opened in `with` blocks so the handles are closed deterministically
  # instead of being leaked until garbage collection.
  with wave.open(self.wav_filename, 'rb') as w:
    expected_len = round(16000.0 * w.getnframes() / w.getframerate())
  with wave.open(self.wav_filename_mono, 'rb') as w_mono:
    expected_len_mono = round(
        22050.0 * w_mono.getnframes() / w_mono.getframerate())

  # Check content size.
  y = audio_io.wav_data_to_samples_pydub(
      self.wav_data, sample_rate=16000, num_channels=1)
  y_mono = audio_io.wav_data_to_samples_pydub(
      self.wav_data_mono, sample_rate=22050, num_channels=1)
  self.assertEqual(expected_len, y.shape[0])
  self.assertEqual(expected_len_mono, y_mono.shape[0])

  # Check a few obvious failure modes: the decoded signal must not be
  # (near-)constant and must swing both below -0.1 and above 0.1.
  self.assertLess(0.01, y.std())
  self.assertLess(0.01, y_mono.std())
  self.assertGreater(-0.1, y.min())
  self.assertGreater(-0.1, y_mono.min())
  self.assertLess(0.1, y.max())
  self.assertLess(0.1, y_mono.max())

def testRepeatSamplesToDuration(self):
samples = np.arange(5)
repeated = audio_io.repeat_samples_to_duration(
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
'pandas >= 0.18.1',
'pretty_midi >= 0.2.6',
'protobuf >= 3.6.1',
'pydub',
'scipy >= 0.18.1',
]

Expand Down

0 comments on commit 05458cf

Please sign in to comment.