From f1b9fa4dbe8992d1492a9f2d0a285239949c460d Mon Sep 17 00:00:00 2001 From: Tom Denton Date: Fri, 4 Aug 2023 13:15:19 -0700 Subject: [PATCH] Seamlessly load audio from Xeno-Canto or URL (eg, using the A2O API). PiperOrigin-RevId: 553893595 --- chirp/audio_utils.py | 34 +++++++++++++++++++++---- chirp/inference/search_embeddings.ipynb | 11 ++++---- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/chirp/audio_utils.py b/chirp/audio_utils.py index beca66fe..86d385dc 100644 --- a/chirp/audio_utils.py +++ b/chirp/audio_utils.py @@ -19,6 +19,7 @@ """ import concurrent import functools +import io import logging import os import tempfile @@ -46,7 +47,17 @@ _BOUNDARY_TO_PADDING_MODE = {'zeros': 'CONSTANT'} -def load_audio( +def load_audio(path: str, target_sample_rate: int, **kwargs) -> jnp.ndarray: + """Load a general audio resource.""" + if path.startswith('xc'): + return load_xc_audio(path, target_sample_rate) + elif path.startswith('http'): + return load_url_audio(path, target_sample_rate) + else: + return load_audio_file(path, target_sample_rate, **kwargs) + + +def load_audio_file( filepath: str | epath.Path, target_sample_rate: int, resampling_type: str = 'polyphase', @@ -170,7 +181,7 @@ def multi_load_audio_window( yield futures.pop(0).result() -def load_xc_audio(xc_id: str, sample_rate: int) -> jnp.ndarray | None: +def load_xc_audio(xc_id: str, sample_rate: int) -> jnp.ndarray: """Load audio from Xeno-Canto given an ID like 'xc12345'.""" if not xc_id.startswith('xc'): raise ValueError(f'XenoCanto id {xc_id} does not start with "xc".') @@ -190,12 +201,25 @@ def load_xc_audio(xc_id: str, sample_rate: int) -> jnp.ndarray | None: try: data = session.get(url=url).content except requests.exceptions.RequestException as e: - print(f'Failed to load audio from Xeno-Canto {xc_id}') - return None + raise requests.exceptions.RequestException( + f'Failed to load audio from Xeno-Canto {xc_id}' + ) from e with tempfile.NamedTemporaryFile(suffix='.mp3', mode='wb') as f: f.write(data) f.flush() - audio = load_audio(f.name, target_sample_rate=sample_rate) + audio = load_audio_file(f.name, target_sample_rate=sample_rate) + return audio + + +def load_url_audio(url: str, sample_rate: int) -> jnp.ndarray: + """Load audio from a URL.""" + data = requests.get(url).content + with io.BytesIO(data) as f: + sf = soundfile.SoundFile(f) + audio = sf.read(dtype='float32') + audio = librosa.resample( + audio, sf.samplerate, sample_rate, res_type='polyphase' + ) return audio diff --git a/chirp/inference/search_embeddings.ipynb b/chirp/inference/search_embeddings.ipynb index baa571a4..048e483c 100644 --- a/chirp/inference/search_embeddings.ipynb +++ b/chirp/inference/search_embeddings.ipynb @@ -110,17 +110,14 @@ "source": [ "#@title Load query audio. { vertical-output: true }\n", "\n", - "# Point to an audio file or Xeno-Canto id (like 'xc12345') of your choice.\n", + "# Point to an audio file, Xeno-Canto id (like 'xc12345') or audio file URL.\n", "audio_path = 'xc12345' #@param\n", "# Muck around with manual selection of the query start time...\n", "start_s = 1 #@param\n", "\n", "window_s = config.model_config['window_size_s']\n", "sample_rate = config.model_config['sample_rate']\n", - "if audio_path.startswith('xc'):\n", - " audio = audio_utils.load_xeno_canto_audio(audio_path, sample_rate)\n", - "else:\n", - " audio = audio_utils.load_audio(audio_path, sample_rate)\n", + "audio = audio_utils.load_audio(audio_path, sample_rate)\n", "\n", "# Display the full file.\n", "display.plot_audio_melspec(audio, sample_rate)\n", @@ -128,9 +125,11 @@ "# Display the selected window.\n", "print('-' * 80)\n", "print('Selected audio window:')\n", - "# TODO(tomdenton): Pad or shift if too close to the end of the file.\n", "st = int(start_s * sample_rate)\n", "end = int(st + window_s * sample_rate)\n", + "if end \u003e audio.shape[0]:\n", + " end = audio.shape[0]\n", + " st = max([0, end - window_s * sample_rate])\n", "audio_window = audio[st:end]\n", "display.plot_audio_melspec(audio_window, sample_rate)\n", "\n",