Skip to content

Commit

Permalink
Update beam and pandas dependencies, and add instructions to download and prepare the BIRB evaluation data
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 557524484
  • Loading branch information
vdumoulin authored and copybara-github committed Aug 16, 2023
1 parent e4cd2c8 commit aaf8ccb
Show file tree
Hide file tree
Showing 7 changed files with 1,697 additions and 520 deletions.
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ You might need the following dependencies.
# Install Poetry for package management
curl -sSL https://install.python-poetry.org | python3 -

# Install dependencies for librosa (required for testing only)
sudo apt-get install libsndfile1
# Install dependencies for librosa
sudo apt-get install libsndfile1 ffmpeg

# Install all dependencies specified in the poetry configs.
poetry install
Expand All @@ -26,4 +26,18 @@ dependencies, in which you can run the Chirp codebase. To run the tests, try
poetry run python -m unittest discover -s chirp/tests -p "*test.py"
```

## BIRB data preparation

### Evaluation data

After [installing](#installation) the `chirp` package, run the following command from the repository's root directory:

```bash
poetry run tfds build -i chirp.data.bird_taxonomy,chirp.data.soundscapes \
soundscapes/{ssw,hawaii,coffee_farms,sierras_kahl,high_sierras,peru}_full_length \
bird_taxonomy/{downstream_full_length,class_representatives_slice_peaked}
```

The process should take 36 to 48 hours to complete and use around 256 GiB of disk space.

*This is not an officially supported Google product.*
54 changes: 45 additions & 9 deletions chirp/data/bird_taxonomy/bird_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import dataclasses
import functools
import resource
import tempfile
from typing import Any, Callable
import warnings
Expand All @@ -27,6 +28,7 @@
from chirp.data.bird_taxonomy import premade_queries
from chirp.taxonomy import namespace_db
from etils import epath
import jax
from jax import numpy as jnp
import numpy as np
import pandas as pd
Expand All @@ -50,6 +52,10 @@
be retrieved from the 'filename' feature: 'XC{xeno_canto_id}.mp3'.
"""

# The maximum audio sequence length to consider if a localization function is
# provided. This is 5 * 60 seconds = 5 minutes.
_MAX_LOCALIZATION_LENGTH_S = 5 * 60

LocalizationFn = Callable[[Any, int, float], jnp.ndarray]


Expand Down Expand Up @@ -335,6 +341,11 @@ def _info(self) -> tfds.core.DatasetInfo:
)

def _split_generators(self, dl_manager: tfds.download.DownloadManager):
# Increase the file handle resource soft limit to the hard limit. The
# dataset is large enough that it causes TFDS to hit the soft limit.
_low, _high = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (_high, _high))

# No checksum is found for the new taxonomy_info. dl_manager may raise
# an error when removing the line below.
dl_manager._force_checksums_validation = (
Expand Down Expand Up @@ -435,6 +446,9 @@ def _process_example(row):
# Resampling can introduce artifacts that push the signal outside the
# [-1, 1) interval.
audio = np.clip(audio, -1.0, 1.0 - (1.0 / float(1 << 15)))
# Skip empty audio files.
if audio.shape[0] == 0 or np.max(np.abs(audio)) == 0.0:
return None
# The scrubbed foreground annotations are replaced by ''. When this is the
# case, we translate this annotation into [] rather than [''].
foreground_label = (
Expand Down Expand Up @@ -463,19 +477,24 @@ def _process_example(row):
'sound_type': source['sound_type'],
}

pipeline = beam.Create(source_info.iterrows()) | beam.Map(_process_example)

if self.builder_config.localization_fn:

def _localize_intervals(args):
def localize_intervals_fn(args):
key, example = args
sample_rate_hz = self.builder_config.sample_rate_hz
interval_length_s = self.builder_config.interval_length_s
target_length = int(sample_rate_hz * interval_length_s)

audio = audio_utils.pad_to_length_if_shorter(
example['audio'], target_length
)
audio = example['audio']

# We limit audio sequence length to _MAX_LOCALIZATION_LENGTH_S when
# localizing intervals because the localization function can result in
# very large memory consumption for long audio sequences.
max_length = sample_rate_hz * _MAX_LOCALIZATION_LENGTH_S
if audio.shape[0] > max_length:
audio = audio[:max_length]

audio = audio_utils.pad_to_length_if_shorter(audio, target_length)
# Pass padded audio to avoid localization_fn having to pad again
audio_intervals = self.builder_config.localization_fn(
audio, sample_rate_hz, interval_length_s
Expand All @@ -499,6 +518,23 @@ def _localize_intervals(args):
))
return interval_examples

pipeline = pipeline | beam.FlatMap(_localize_intervals)

return pipeline
else:
localize_intervals_fn = None

for i, key_and_example in enumerate(
map(_process_example, source_info.iterrows())
):
# Since the audio files have variable length, the JAX compilation cache
# can use up a large amount of memory after a while.
if i % 100 == 0:
jax.clear_caches()

# Skip empty audio files.
if key_and_example is None:
continue

if localize_intervals_fn:
for key_and_example in localize_intervals_fn(key_and_example):
yield key_and_example
else:
yield key_and_example
7 changes: 3 additions & 4 deletions chirp/data/soundscapes/soundscapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,6 @@ def _process_group(
beam.metrics.Metrics.counter('soundscapes', 'examples').inc()
return valid_segments

pipeline = beam.Create(
enumerate(segments.groupby('filename'))
) | beam.FlatMap(_process_group)
return pipeline
for group in enumerate(segments.groupby('filename')):
for key, example in _process_group(group):
yield key, example
12 changes: 9 additions & 3 deletions chirp/tests/bird_taxonomy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,15 @@ def setUpClass(cls):
subdir = epath.Path(cls.tempdir) / 'audio-data' / 'comter'
subdir.mkdir(parents=True)
for i in range(4):
tfds.core.lazy_imports.pydub.AudioSegment.silent(duration=10000).export(
subdir / f'XC{i:05d}.mp3', format='mp3'
)
tfds.core.lazy_imports.pydub.AudioSegment(
b'\0\1' * int(10_000 * 10),
metadata={
'channels': 1,
'sample_width': 2,
'frame_rate': 10_000,
'frame_width': 2,
},
).export(subdir / f'XC{i:05d}.mp3', format='mp3')

@classmethod
def tearDownClass(cls):
Expand Down
5 changes: 4 additions & 1 deletion chirp/tests/filter_scrub_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,10 @@ def test_merge_concat_no_duplicates(self):
# of .to_dict().
self.assertTrue(
fsu.apply_parallel(self.toy_df, query_parallel).equals(
self.toy_df.loc[[0]].append([scrubbed_r0, self.toy_df.loc[1]])
pd.concat([
self.toy_df.loc[[0]],
pd.DataFrame([scrubbed_r0, self.toy_df.loc[1]]),
])
)
)

Expand Down
Loading

0 comments on commit aaf8ccb

Please sign in to comment.