Skip to content

Commit

Permalink
Update beam and pandas dependencies, and add instructions to download and prepare the BIRB evaluation data
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 557524484
  • Loading branch information
vdumoulin authored and copybara-github committed Aug 16, 2023
1 parent e4cd2c8 commit aaf8ccb
Show file tree
Hide file tree
Showing 7 changed files with 1,697 additions and 520 deletions.
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ You might need the following dependencies.
# Install Poetry for package management
curl -sSL https://install.python-poetry.org | python3 -

# Install dependencies for librosa (required for testing only)
sudo apt-get install libsndfile1
# Install dependencies for librosa
sudo apt-get install libsndfile1 ffmpeg

# Install all dependencies specified in the poetry configs.
poetry install
Expand All @@ -26,4 +26,18 @@ dependencies, in which you can run the Chirp codebase. To run the tests, try
poetry run python -m unittest discover -s chirp/tests -p "*test.py"
```

## BIRB data preparation

### Evaluation data

After [installing](#installation) the `chirp` package, run the following command from the repository's root directory:

```bash
poetry run tfds build -i chirp.data.bird_taxonomy,chirp.data.soundscapes \
soundscapes/{ssw,hawaii,coffee_farms,sierras_kahl,high_sierras,peru}_full_length \
bird_taxonomy/{downstream_full_length,class_representatives_slice_peaked}
```

The process should take 36 to 48 hours to complete and use around 256 GiB of disk space.

*This is not an officially supported Google product.*
54 changes: 45 additions & 9 deletions chirp/data/bird_taxonomy/bird_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import dataclasses
import functools
import resource
import tempfile
from typing import Any, Callable
import warnings
Expand All @@ -27,6 +28,7 @@
from chirp.data.bird_taxonomy import premade_queries
from chirp.taxonomy import namespace_db
from etils import epath
import jax
from jax import numpy as jnp
import numpy as np
import pandas as pd
Expand All @@ -50,6 +52,10 @@
be retrieved from the 'filename' feature: 'XC{xeno_canto_id}.mp3'.
"""

# The maximum audio sequence length to consider if a localization function is
# provided. This is 5 * 60 seconds = 5 minutes.
_MAX_LOCALIZATION_LENGTH_S = 5 * 60

LocalizationFn = Callable[[Any, int, float], jnp.ndarray]


Expand Down Expand Up @@ -335,6 +341,11 @@ def _info(self) -> tfds.core.DatasetInfo:
)

def _split_generators(self, dl_manager: tfds.download.DownloadManager):
# Increase the file handle resource soft limit to the hard limit. The
# dataset is large enough that it causes TFDS to hit the soft limit.
_low, _high = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (_high, _high))

# No checksum is found for the new taxonomy_info. dl_manager may raise
# an error when removing the line below.
dl_manager._force_checksums_validation = (
Expand Down Expand Up @@ -435,6 +446,9 @@ def _process_example(row):
# Resampling can introduce artifacts that push the signal outside the
# [-1, 1) interval.
audio = np.clip(audio, -1.0, 1.0 - (1.0 / float(1 << 15)))
# Skip empty audio files.
if audio.shape[0] == 0 or np.max(np.abs(audio)) == 0.0:
return None
# The scrubbed foreground annotations are replaced by ''. When this is the
# case, we translate this annotation into [] rather than [''].
foreground_label = (
Expand Down Expand Up @@ -463,19 +477,24 @@ def _process_example(row):
'sound_type': source['sound_type'],
}

pipeline = beam.Create(source_info.iterrows()) | beam.Map(_process_example)

if self.builder_config.localization_fn:

def _localize_intervals(args):
def localize_intervals_fn(args):
key, example = args
sample_rate_hz = self.builder_config.sample_rate_hz
interval_length_s = self.builder_config.interval_length_s
target_length = int(sample_rate_hz * interval_length_s)

audio = audio_utils.pad_to_length_if_shorter(
example['audio'], target_length
)
audio = example['audio']

# We limit audio sequence length to _MAX_LOCALIZATION_LENGTH_S when
# localizing intervals because the localization function can result in
# very large memory consumption for long audio sequences.
max_length = sample_rate_hz * _MAX_LOCALIZATION_LENGTH_S
if audio.shape[0] > max_length:
audio = audio[:max_length]

audio = audio_utils.pad_to_length_if_shorter(audio, target_length)
# Pass padded audio to avoid localization_fn having to pad again
audio_intervals = self.builder_config.localization_fn(
audio, sample_rate_hz, interval_length_s
Expand All @@ -499,6 +518,23 @@ def _localize_intervals(args):
))
return interval_examples

pipeline = pipeline | beam.FlatMap(_localize_intervals)

return pipeline
else:
localize_intervals_fn = None

for i, key_and_example in enumerate(
map(_process_example, source_info.iterrows())
):
# Since the audio files have variable length, the JAX compilation cache
# can use up a large amount of memory after a while.
if i % 100 == 0:
jax.clear_caches()

# Skip empty audio files.
if key_and_example is None:
continue

if localize_intervals_fn:
for key_and_example in localize_intervals_fn(key_and_example):
yield key_and_example
else:
yield key_and_example
7 changes: 3 additions & 4 deletions chirp/data/soundscapes/soundscapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,6 @@ def _process_group(
beam.metrics.Metrics.counter('soundscapes', 'examples').inc()
return valid_segments

pipeline = beam.Create(
enumerate(segments.groupby('filename'))
) | beam.FlatMap(_process_group)
return pipeline
for group in enumerate(segments.groupby('filename')):
for key, example in _process_group(group):
yield key, example
12 changes: 9 additions & 3 deletions chirp/tests/bird_taxonomy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,15 @@ def setUpClass(cls):
subdir = epath.Path(cls.tempdir) / 'audio-data' / 'comter'
subdir.mkdir(parents=True)
for i in range(4):
tfds.core.lazy_imports.pydub.AudioSegment.silent(duration=10000).export(
subdir / f'XC{i:05d}.mp3', format='mp3'
)
tfds.core.lazy_imports.pydub.AudioSegment(
b'\0\1' * int(10_000 * 10),
metadata={
'channels': 1,
'sample_width': 2,
'frame_rate': 10_000,
'frame_width': 2,
},
).export(subdir / f'XC{i:05d}.mp3', format='mp3')

@classmethod
def tearDownClass(cls):
Expand Down
5 changes: 4 additions & 1 deletion chirp/tests/filter_scrub_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,10 @@ def test_merge_concat_no_duplicates(self):
# of .to_dict().
self.assertTrue(
fsu.apply_parallel(self.toy_df, query_parallel).equals(
self.toy_df.loc[[0]].append([scrubbed_r0, self.toy_df.loc[1]])
pd.concat([
self.toy_df.loc[[0]],
pd.DataFrame([scrubbed_r0, self.toy_df.loc[1]]),
])
)
)

Expand Down
Loading

0 comments on commit aaf8ccb

Please sign in to comment.