From 05458cfc9f4e4a85e58c44296a4122daa7afd82c Mon Sep 17 00:00:00 2001
From: Curtis Hawthorne <fjord@google.com>
Date: Tue, 3 Nov 2020 20:13:18 -0800
Subject: [PATCH] centralize pydub samples loading logic.

PiperOrigin-RevId: 340573401
---
 note_seq/audio_io.py      | 44 +++++++++++++++++++++++++++++++++++++++
 note_seq/audio_io_test.py | 23 ++++++++++++++++++++
 setup.py                  |  1 +
 3 files changed, 68 insertions(+)

diff --git a/note_seq/audio_io.py b/note_seq/audio_io.py
index f917ec7..ca26c32 100644
--- a/note_seq/audio_io.py
+++ b/note_seq/audio_io.py
@@ -20,6 +20,7 @@
 
 import librosa
 import numpy as np
+import pydub
 import scipy
 
 
@@ -49,6 +50,49 @@ def float_samples_to_int16(y):
   return (y * np.iinfo(np.int16).max).astype(np.int16)
 
 
+def wav_data_to_samples_pydub(wav_data: bytes,
+                              sample_rate: int,
+                              remove_dc_bias: bool = False,
+                              num_channels: int = None,
+                              normalize_db: float = None):
+  """Convert audio file data (in bytes) into a numpy array using Pydub.
+
+  Args:
+    wav_data: A byte stream of audio data.
+    sample_rate: Resample recorded audio to this sample rate.
+    remove_dc_bias: If true, will remove DC bias from audio.
+    num_channels: If not specified, output shape will be based on the contents
+      of wav_data. Otherwise, will force to be 1 or 2 channels.
+    normalize_db: Headroom in dB left below full scale when peak-normalizing
+      the audio. Set to None to skip the normalization step.
+
+  Returns:
+    An array of the recorded audio at sample_rate. If mono, will be shape
+    [samples], otherwise [channels, samples].
+  """
+  # Parse and normalize; AudioSegment ops return new objects, so reassign.
+  aseg = pydub.AudioSegment.from_file(io.BytesIO(wav_data))
+  if num_channels:
+    aseg = aseg.set_channels(num_channels)
+  if remove_dc_bias:
+    aseg = aseg.remove_dc_offset()
+  if normalize_db is not None:
+    aseg = aseg.normalize(headroom=normalize_db)
+  aseg = aseg.set_frame_rate(sample_rate)
+
+  # Convert to a float array scaled by the integer sample type's max value.
+  channel_asegs = aseg.split_to_mono()
+  samples = [s.get_array_of_samples() for s in channel_asegs]
+  fp_arr = np.array(samples).astype(np.float32)
+  fp_arr /= np.iinfo(samples[0].typecode).max
+
+  # If only 1 channel, remove extra dim.
+  if fp_arr.shape[0] == 1:
+    fp_arr = fp_arr[0]
+
+  return fp_arr
+
+
 def wav_data_to_samples(wav_data, sample_rate):
   """Read PCM-formatted WAV data and return a NumPy array of samples.
 
diff --git a/note_seq/audio_io_test.py b/note_seq/audio_io_test.py
index 9b69bed..5303bf8 100644
--- a/note_seq/audio_io_test.py
+++ b/note_seq/audio_io_test.py
@@ -65,6 +65,29 @@ def testFloatWavDataToSamples(self):
         wav_io.getvalue(), sample_rate=16000)
     np.testing.assert_array_equal(y, y_from_float)
 
+  def testWavDataToSamplesPydub(self):
+    with wave.open(self.wav_filename, 'rb') as w:
+      expected_len = round(16000.0 * w.getnframes() / w.getframerate())
+    with wave.open(self.wav_filename_mono, 'rb') as w_mono:
+      expected_len_mono = round(
+          22050.0 * w_mono.getnframes() / w_mono.getframerate())
+
+    # Check content size against lengths derived from the WAV headers.
+    y = audio_io.wav_data_to_samples_pydub(
+        self.wav_data, sample_rate=16000, num_channels=1)
+    y_mono = audio_io.wav_data_to_samples_pydub(
+        self.wav_data_mono, sample_rate=22050, num_channels=1)
+    self.assertEqual(expected_len, y.shape[0])
+    self.assertEqual(expected_len_mono, y_mono.shape[0])
+
+    # Check a few obvious failure modes.
+    self.assertLess(0.01, y.std())
+    self.assertLess(0.01, y_mono.std())
+    self.assertGreater(-0.1, y.min())
+    self.assertGreater(-0.1, y_mono.min())
+    self.assertLess(0.1, y.max())
+    self.assertLess(0.1, y_mono.max())
+
   def testRepeatSamplesToDuration(self):
     samples = np.arange(5)
     repeated = audio_io.repeat_samples_to_duration(
diff --git a/setup.py b/setup.py
index 24e31dc..13c5cf9 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@
     'pandas >= 0.18.1',
     'pretty_midi >= 0.2.6',
     'protobuf >= 3.6.1',
+    'pydub',
     'scipy >= 0.18.1',
 ]