From a4b22d65908d9d6d681e921b50a517f829c41c1b Mon Sep 17 00:00:00 2001
From: mbsantiago <santiago.mbal@gmail.com>
Date: Sun, 10 Nov 2024 22:39:10 +0000
Subject: [PATCH] Improve the pad_audio function

This function was the culprit of the error. Split it into smaller
helper functions to make the flow easier to follow.
---
 batdetect2/utils/audio_utils.py | 201 +++++++++++++++++++++++---------
 1 file changed, 147 insertions(+), 54 deletions(-)

diff --git a/batdetect2/utils/audio_utils.py b/batdetect2/utils/audio_utils.py
index 7c5852a..a60ea94 100644
--- a/batdetect2/utils/audio_utils.py
+++ b/batdetect2/utils/audio_utils.py
@@ -6,6 +6,8 @@
 import numpy as np
 import torch
 
+from batdetect2.detector import parameters
+
 from . import wavfile
 
 __all__ = [
@@ -15,18 +17,42 @@
 ]
 
 
-def time_to_x_coords(time_in_file, sampling_rate, fft_win_length, fft_overlap):
-    nfft = np.floor(fft_win_length * sampling_rate)  # int() uses floor
-    noverlap = np.floor(fft_overlap * nfft)
-    return (time_in_file * sampling_rate - noverlap) / (nfft - noverlap)
+def time_to_x_coords(
+    time_in_file: float,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+) -> float:
+    nfft = np.floor(window_duration * samplerate)  # int() uses floor
+    noverlap = np.floor(window_overlap * nfft)
+    return (time_in_file * samplerate - noverlap) / (nfft - noverlap)
+
+
+def x_coords_to_time(
+    x_pos: int,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+) -> float:
+    n_fft = np.floor(window_duration * samplerate)
+    n_overlap = np.floor(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    return ((x_pos * n_step) + n_overlap) / samplerate
+    # return (1.0 - fft_overlap) * fft_win_length * (x_pos + 0.5)  # 0.5 is for center of temporal window
 
 
-# NOTE this is also defined in post_process
-def x_coords_to_time(x_pos, sampling_rate, fft_win_length, fft_overlap):
-    nfft = np.floor(fft_win_length * sampling_rate)
-    noverlap = np.floor(fft_overlap * nfft)
-    return ((x_pos * (nfft - noverlap)) + noverlap) / sampling_rate
-    # return (1.0 - fft_overlap) * fft_win_length * (x_pos + 0.5)  # 0.5 is for center of temporal window
+def x_coord_to_sample(
+    x_pos: int,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+) -> int:
+    n_fft = np.floor(window_duration * samplerate)
+    n_overlap = np.floor(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    x_pos = int(x_pos / resize_factor)
+    return int((x_pos * n_step) + n_overlap)
 
 
 def generate_spectrogram(
@@ -184,55 +210,118 @@ def load_audio(
     return sampling_rate, audio_raw
 
 
+def compute_spectrogram_width(
+    length: int,
+    samplerate: int = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+) -> int:
+    n_fft = int(window_duration * samplerate)
+    n_overlap = int(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    width = (length - n_overlap) // n_step
+    return int(width * resize_factor)
+
+
 def pad_audio(
-    audio_raw,
-    fs,
-    ms,
-    overlap_perc,
-    resize_factor,
-    divide_factor,
-    fixed_width=None,
+    audio: np.ndarray,
+    samplerate: int = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+    divide_factor: int = parameters.SPEC_DIVIDE_FACTOR,
+    fixed_width: Optional[int] = None,
 ):
-    # Adds zeros to the end of the raw data so that the generated sepctrogram
-    # will be evenly divisible by `divide_factor`
-    # Also deals with very short audio clips and fixed_width during training
+    """Pad audio so its spectrogram width is divisible by `divide_factor`.
+
+    This function pads the audio signal with zeros to ensure that the
+    generated spectrogram length will be evenly divisible by `divide_factor`.
+    This is important for the model to work correctly.
+
+    This `divide_factor` comes from the model architecture as it downscales
+    the spectrogram by this factor, so the input must be divisible by this
+    integer number.
+
+    Parameters
+    ----------
+    audio : np.ndarray
+        The audio signal.
+    samplerate : int
+        The sampling rate of the audio signal.
+    window_duration : float
+        The window duration in seconds used for the spectrogram computation.
+    window_overlap : float
+        The overlap between windows in the spectrogram computation.
+    resize_factor : float
+        This factor is used to resize the spectrogram after the STFT
+        computation. Default is 0.5 which means that the spectrogram will be
+        reduced by half. Important to take into account for the final size of
+        the spectrogram.
+    divide_factor : int
+        The factor by which the spectrogram will be divided.
+    fixed_width : int, optional
+        If provided, the audio will be padded or cut so that the resulting
+        spectrogram width will be equal to this value.
+
+    Returns
+    -------
+    np.ndarray
+        The padded audio signal.
+    """
+    spec_width = compute_spectrogram_width(
+        audio.shape[0],
+        samplerate=samplerate,
+        window_duration=window_duration,
+        window_overlap=window_overlap,
+        resize_factor=resize_factor,
+    )
 
-    # This code could be clearer, clean up
-    nfft = int(ms * fs)
-    noverlap = int(overlap_perc * nfft)
-    step = nfft - noverlap
-    min_size = int(divide_factor * (1.0 / resize_factor))
-    spec_width = (audio_raw.shape[0] - noverlap) // step
-    spec_width_rs = spec_width * resize_factor
-
-    if fixed_width is not None and spec_width < fixed_width:
-        # too small
-        # used during training to ensure all the batches are the same size
-        diff = fixed_width * step + noverlap - audio_raw.shape[0]
-        audio_raw = np.hstack(
-            (audio_raw, np.zeros(diff, dtype=audio_raw.dtype))
+    if fixed_width:
+        target_samples = x_coord_to_sample(
+            fixed_width,
+            samplerate=samplerate,
+            window_duration=window_duration,
+            window_overlap=window_overlap,
+            resize_factor=resize_factor,
         )
 
-    elif fixed_width is not None and spec_width > fixed_width:
-        # too big
-        # used during training to ensure all the batches are the same size
-        diff = fixed_width * step + noverlap - audio_raw.shape[0]
-        audio_raw = audio_raw[:diff]
-
-    elif (
-        spec_width_rs < min_size
-        or (np.floor(spec_width_rs) % divide_factor) != 0
-    ):
-        # need to be at least min_size
-        div_amt = np.ceil(spec_width_rs / float(divide_factor))
-        div_amt = np.maximum(1, div_amt)
-        target_size = int(div_amt * divide_factor * (1.0 / resize_factor))
-        diff = target_size * step + noverlap - audio_raw.shape[0]
-        audio_raw = np.hstack(
-            (audio_raw, np.zeros(diff, dtype=audio_raw.dtype))
-        )
+        if spec_width < fixed_width:
+            # too short: zero-pad up to the fixed width
+            diff = target_samples - audio.shape[0]
+            return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
+
+        if spec_width > fixed_width:
+            return audio[:target_samples]
 
-    return audio_raw
+        return audio
+
+    min_width = int(divide_factor / resize_factor)
+
+    if spec_width < min_width:
+        target_samples = x_coord_to_sample(
+            min_width,
+            samplerate=samplerate,
+            window_duration=window_duration,
+            window_overlap=window_overlap,
+            resize_factor=resize_factor,
+        )
+        diff = target_samples - audio.shape[0]
+        return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
+
+    if (spec_width % divide_factor) == 0:
+        return audio
+
+    target_width = int(np.ceil(spec_width / divide_factor)) * divide_factor
+    target_samples = x_coord_to_sample(
+        target_width,
+        samplerate=samplerate,
+        window_duration=window_duration,
+        window_overlap=window_overlap,
+        resize_factor=resize_factor,
+    )
+    diff = target_samples - audio.shape[0]
+    return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
 
 
 def gen_mag_spectrogram(x, fs, ms, overlap_perc):
@@ -247,7 +336,11 @@ def gen_mag_spectrogram(x, fs, ms, overlap_perc):
 
     # compute spec
     spec, _ = librosa.core.spectrum._spectrogram(
-        y=x, power=1, n_fft=nfft, hop_length=step, center=False
+        y=x,
+        power=1,
+        n_fft=nfft,
+        hop_length=step,
+        center=False,
     )
 
     # remove DC component and flip vertical orientation