From b7e122a5410d3d90ec11508c25ff20b616fa243c Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Thu, 11 Nov 2021 06:47:33 +0000 Subject: [PATCH 01/11] Modify audio stats func to avoid failing at directories without audio --- hearpreprocess/util/audio.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hearpreprocess/util/audio.py b/hearpreprocess/util/audio.py index d5eb824b..88a52c20 100644 --- a/hearpreprocess/util/audio.py +++ b/hearpreprocess/util/audio.py @@ -160,6 +160,9 @@ def get_audio_dir_stats( all_file_paths, ) ) + if len(audio_paths) == 0: + print("No audio files present in the folder") + return {} rng = random.Random(0) rng.shuffle(audio_paths) From 4b65a9a7cd9d14a3043307f397c3556a9aecc9e7 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Thu, 11 Nov 2021 06:53:46 +0000 Subject: [PATCH 02/11] Modify shchema to allow for actual sample duration without any trim and pad --- hearpreprocess/util/task_config.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/hearpreprocess/util/task_config.py b/hearpreprocess/util/task_config.py index 14d3bed2..59ae0f97 100644 --- a/hearpreprocess/util/task_config.py +++ b/hearpreprocess/util/task_config.py @@ -113,7 +113,9 @@ def validate_generic_task_config( "embedding_type": Or("scene", "event", str), "prediction_type": Or("multiclass", "multilabel", str), "split_mode": Or("trainvaltest", "presplit_kfold", "new_split_kfold"), - "sample_duration": Or(float, int), + # When the sample duration is None, the original audio is retained + # without any trimming and padding + "sample_duration": Or(float, int, None), "evaluation": Schema([str]), "default_mode": Or("5h", "50h", "full", str), } @@ -177,6 +179,12 @@ def validate_generic_task_config( ): object, } ) + # If the sample duration is set to None, the max_task_duration_by_split + # should also be None and no subsampling will be done + if task_config["sample_duration"] is None: + schema["max_task_duration_by_split"] = Schema( + 
{split: Or(int, float, None) for split in SPLITS} + ) elif split_mode in ["presplit_kfold", "new_split_kfold"]: assert ( @@ -203,6 +211,10 @@ def validate_generic_task_config( ): object, } ) + # If the sample duration is set to None, the max_task_duration_by_fold + # should also be None and no subsampling will be done + if task_config["sample_duration"] is None: + schema["max_task_duration_by_fold"] = None else: raise ValueError("Invalid split_mode") From 498ddfb82e7162318ffb2d394df8d3a1763bad9d Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Thu, 11 Nov 2021 07:06:18 +0000 Subject: [PATCH 03/11] Modify sampler to work for custom download functions --- hearpreprocess/sampler.py | 63 +++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py index 6179740d..33fedd12 100644 --- a/hearpreprocess/sampler.py +++ b/hearpreprocess/sampler.py @@ -17,9 +17,10 @@ import random import shutil from pathlib import Path -from typing import Optional, Dict, Any +from typing import Callable, Optional, Dict, Any from urllib.parse import urlparse import tempfile +import copy import click import luigi @@ -30,6 +31,7 @@ from hearpreprocess import dcase2016_task2, nsynth_pitch, speech_commands, spoken_digit from hearpreprocess.util.luigi import WorkTask import hearpreprocess.util.audio as audio_util +import hearpreprocess.util.luigi as luigi_util logger = logging.getLogger("luigi-interface") # Currently the sampler is only allowed to run for open tasks @@ -59,21 +61,25 @@ "task_config": dcase2016_task2.generic_task_config, "audio_sample_size": 4, "necessary_keys": [], + "get_download_and_extract_tasks": pipeline.get_download_and_extract_tasks, }, "nsynth_pitch": { "task_config": nsynth_pitch.generic_task_config, "audio_sample_size": 100, "necessary_keys": [], + "get_download_and_extract_tasks": pipeline.get_download_and_extract_tasks, }, "speech_commands": { "task_config": 
speech_commands.generic_task_config, "audio_sample_size": 100, "necessary_keys": [], + "get_download_and_extract_tasks": pipeline.get_download_and_extract_tasks, }, "spoken_digit": { "task_config": spoken_digit.generic_task_config, "audio_sample_size": 100, "necessary_keys": [], + "get_download_and_extract_tasks": tfds_pipeline.get_download_and_extract_tasks_tfds, }, # Add the sampler config for the secrets task if the secret task config was found. # Not available for participants @@ -81,23 +87,10 @@ } -class RandomSampleOriginalDataset(WorkTask): +class _RandomSampleOriginalDataset(WorkTask): necessary_keys = luigi.ListParameter() audio_sample_size = luigi.IntParameter() - def requires(self): - # If this is a TensorFlow dataset then use the tfds pipeline - if "tfds_task_name" in self.task_config: - return tfds_pipeline.get_download_and_extract_tasks_tfds(self.task_config) - - return pipeline.get_download_and_extract_tasks(self.task_config) - - @staticmethod - def safecopy(src, dst): - # Make sure the parent destination directory exists - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dst) - @staticmethod def trimcopy_audio(src, tmp_dst, fin_dst, small_duration): """ @@ -167,7 +160,9 @@ def run(self): # Copy all the non audio files for file in tqdm(copy_files): - self.safecopy(src=copy_from.joinpath(file), dst=copy_to.joinpath(file)) + luigi_util.safecopy( + src=copy_from.joinpath(file), dst=copy_to.joinpath(file) + ) # Save all the audio after trimming them to small sample duration # The small sample duration(in seconds) is specified in the small @@ -189,6 +184,29 @@ def run(self): shutil.make_archive(copy_to, "zip", copy_to) +def get_sampler_task(sampler_config: Dict[str, Any]) -> _RandomSampleOriginalDataset: + """ + Returns a task to do sampling after downloading the dataset with + download and extract tasks from the dataset specific + `get_download_and_extract_tasks` function + """ + _task_config: Dict[str, Any] = 
copy.deepcopy(sampler_config["task_config"]) + _task_config["mode"] = _task_config["default_mode"] + _get_download_and_extract_tasks: Callable = sampler_config[ + "get_download_and_extract_tasks" + ] + + class RandomSampleOriginalDataset(_RandomSampleOriginalDataset): + task_config = _task_config + audio_sample_size = sampler_config["audio_sample_size"] + necessary_keys = sampler_config["necessary_keys"] + + def requires(self): + return _get_download_and_extract_tasks(self.task_config) + + return RandomSampleOriginalDataset + + @click.command() @click.argument("task") @click.option( @@ -202,15 +220,10 @@ def main(task: str, num_workers: Optional[int] = None): if num_workers is None: num_workers = multiprocessing.cpu_count() logger.info(f"Using {num_workers} workers") - config: Dict[str, Any] = configs[task] - default_config: str = config["task_config"]["default_mode"] - config["task_config"]["mode"] = default_config - sampler = RandomSampleOriginalDataset( - task_config=config["task_config"], - audio_sample_size=config["audio_sample_size"], - necessary_keys=config["necessary_keys"], - ) - pipeline.run(sampler, num_workers=num_workers) + + sampler_config: Dict[str, Any] = configs[task] + sampler = get_sampler_task(sampler_config) + pipeline.run(sampler(), num_workers=num_workers) if __name__ == "__main__": From b5b3719dd3e87f69a46f4ec448723d6d7e64c15b Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Thu, 11 Nov 2021 07:07:55 +0000 Subject: [PATCH 04/11] Modify pipeline for variable audio size --- hearpreprocess/pipeline.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/hearpreprocess/pipeline.py b/hearpreprocess/pipeline.py index 0851de4f..1a38d39f 100644 --- a/hearpreprocess/pipeline.py +++ b/hearpreprocess/pipeline.py @@ -26,6 +26,7 @@ download_file, new_basedir, str2int, + safecopy, ) INCLUDE_DATESTR_IN_FINAL_PATHS = False @@ -665,9 +666,10 @@ def run(self): # duration # sample duration is specified in 
the task config. # The specified sample duration is in seconds - metadata = self.trim_event_metadata( - metadata, duration=self.task_config["sample_duration"] - ) + if self.task_config["sample_duration"] is not None: + metadata = self.trim_event_metadata( + metadata, duration=self.task_config["sample_duration"] + ) else: raise ValueError( "%s embedding_type unknown" % self.task_config["embedding_type"] @@ -818,6 +820,14 @@ def run(self): # minutes or the timestamp embeddings will explode sample_duration = self.task_config["sample_duration"] max_split_duration = self.get_max_split_duration() + if sample_duration is None: + assert max_split_duration is None, ( + "If the sample duration is set to None i.e. orignal audio files " + "are being used without any trimming or padding, then the " + "max_split_duration should also be None, so that no " + "subsampling is done as the audio file length is not " + "consistent." + ) # If max_split_duration is not None set the max_files so that # the total duration of all the audio files after subsampling @@ -919,15 +929,18 @@ def requires(self): def run(self): self.createsplit() - for audiofile in tqdm(list(self.requires()["corpus"].splitdir.iterdir())): newaudiofile = self.splitdir.joinpath(f"{audiofile.stem}.wav") - audio_util.trim_pad_wav( - str(audiofile), - str(newaudiofile), - duration=self.task_config["sample_duration"], - ) - + if self.task_config["sample_duration"] is not None: + audio_util.trim_pad_wav( + str(audiofile), + str(newaudiofile), + duration=self.task_config["sample_duration"], + ) + else: + # If the sample_duration is None, the file will be copied + # without any trimming or padding + safecopy(src=audiofile, dst=newaudiofile) self.mark_complete() From e3f15fcde5836e2e135c46036872413d4e8239d4 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Thu, 11 Nov 2021 07:08:26 +0000 Subject: [PATCH 05/11] Add safecopy Utilility function --- hearpreprocess/util/luigi.py | 11 +++++++++++ 1 file changed, 11 insertions(+) 
diff --git a/hearpreprocess/util/luigi.py b/hearpreprocess/util/luigi.py index 5c293058..af68e835 100644 --- a/hearpreprocess/util/luigi.py +++ b/hearpreprocess/util/luigi.py @@ -8,6 +8,7 @@ import os.path from functools import partial from pathlib import Path +import shutil import luigi import requests @@ -219,3 +220,13 @@ def str2int(s: str) -> int: https://stackoverflow.com/a/16008760/82733 """ return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (2 ** 32 - 1) + + +def safecopy(src, dst): + """ + Copies a file after checking if the parent destination directory exists + If the parent doesnot exists, the parent directory will be made and the + file will be copied + """ + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) From 0ca020b6d04765b9b1c18c2126db9128457eee29 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 14 Nov 2021 10:29:00 +0000 Subject: [PATCH 06/11] Add description --- hearpreprocess/pipeline.py | 7 +++++++ hearpreprocess/util/task_config.py | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/hearpreprocess/pipeline.py b/hearpreprocess/pipeline.py index 1a38d39f..1737f3a3 100644 --- a/hearpreprocess/pipeline.py +++ b/hearpreprocess/pipeline.py @@ -666,6 +666,13 @@ def run(self): # duration # sample duration is specified in the task config. # The specified sample duration is in seconds + + # If the sample duration is set to None, no trimming of events will + # be done and the full audio file will be selected. This mode is + # only for special tasks and should not be generally used. 
+ # Having all the audio files of the same length is more + # efficient for downstream pipelines + if self.task_config["sample_duration"] is not None: metadata = self.trim_event_metadata( metadata, duration=self.task_config["sample_duration"] diff --git a/hearpreprocess/util/task_config.py b/hearpreprocess/util/task_config.py index 59ae0f97..2063b912 100644 --- a/hearpreprocess/util/task_config.py +++ b/hearpreprocess/util/task_config.py @@ -73,6 +73,17 @@ def validate_generic_task_config( This validator checks if the tfds task configuration is correctly defined + * sample_duration can also be set to `None`, rather than integer or + float, in which case, the audio files in the dataset will not + be trimmed or padded, rather the original file duration will be + retained in the output of the pipeline. + In this case, the max split duration should be set to + None and no subsampling can be done, as file durations are + not consistent. + However, this is only for specific tasks and should not be generally + used as it is not efficient for downstream pipelines, particularly + embedding generation in heareval + Args: task_config: Task config to be used with the pipeline ignore_extra_keys: Flag for ignoring extra keys in the task configuration. From 466ff28b29ea9dcd7f8ece8eabbe1d0515640e49 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Wed, 17 Nov 2021 04:08:06 +0000 Subject: [PATCH 07/11] Update comment and reverse variable name --- hearpreprocess/sampler.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py index 33fedd12..4efb7aa9 100644 --- a/hearpreprocess/sampler.py +++ b/hearpreprocess/sampler.py @@ -56,6 +56,10 @@ # Note: Necessary key helps to select audios with the necessary keys in there name +# Note: The `get_download_and_extract_tasks` is the task specific function which +# returns the tasks to download and extract the dataset for the task. 
This is +# required here, because the sampling task needs to download and extract the +# tasks before actual sampling configs = { "dcase2016_task2": { "task_config": dcase2016_task2.generic_task_config, @@ -87,7 +91,7 @@ } -class _RandomSampleOriginalDataset(WorkTask): +class RandomSampleOriginalDataset(WorkTask): necessary_keys = luigi.ListParameter() audio_sample_size = luigi.IntParameter() @@ -184,7 +188,7 @@ def run(self): shutil.make_archive(copy_to, "zip", copy_to) -def get_sampler_task(sampler_config: Dict[str, Any]) -> _RandomSampleOriginalDataset: +def get_sampler_task(sampler_config: Dict[str, Any]) -> RandomSampleOriginalDataset: """ Returns a task to do sampling after downloading the dataset with download and extract tasks from the dataset specific @@ -196,7 +200,7 @@ def get_sampler_task(sampler_config: Dict[str, Any]) -> _RandomSampleOriginalDat "get_download_and_extract_tasks" ] - class RandomSampleOriginalDataset(_RandomSampleOriginalDataset): + class _RandomSampleOriginalDataset(RandomSampleOriginalDataset): task_config = _task_config audio_sample_size = sampler_config["audio_sample_size"] necessary_keys = sampler_config["necessary_keys"] @@ -204,7 +208,7 @@ class RandomSampleOriginalDataset(_RandomSampleOriginalDataset): def requires(self): return _get_download_and_extract_tasks(self.task_config) - return RandomSampleOriginalDataset + return _RandomSampleOriginalDataset @click.command() From 28e8f28cde754a523c3afe4f1e2c94a669aa6fcc Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Wed, 17 Nov 2021 04:14:26 +0000 Subject: [PATCH 08/11] Modify task config --- hearpreprocess/util/task_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hearpreprocess/util/task_config.py b/hearpreprocess/util/task_config.py index 2063b912..d3af6daa 100644 --- a/hearpreprocess/util/task_config.py +++ b/hearpreprocess/util/task_config.py @@ -194,7 +194,7 @@ def validate_generic_task_config( # should also be None and no subsampling will be done
if task_config["sample_duration"] is None: schema["max_task_duration_by_split"] = Schema( - {split: Or(int, float, None) for split in SPLITS} + {split: None for split in SPLITS} ) elif split_mode in ["presplit_kfold", "new_split_kfold"]: From e17732c29ab6f68dafb1ac8b5568e77d41daebf1 Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Thu, 18 Nov 2021 16:44:13 +0000 Subject: [PATCH 09/11] imports --- hearpreprocess/nsynth_pitch_kfold.py | 4 ++-- hearpreprocess/pipeline.py | 2 +- hearpreprocess/sampler.py | 8 +++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/hearpreprocess/nsynth_pitch_kfold.py b/hearpreprocess/nsynth_pitch_kfold.py index 49f26670..6a1051ba 100644 --- a/hearpreprocess/nsynth_pitch_kfold.py +++ b/hearpreprocess/nsynth_pitch_kfold.py @@ -7,8 +7,8 @@ import copy import hearpreprocess.nsynth_pitch as nsynth_pitch -from hearpreprocess.nsynth_pitch import ( # noqa: F401 - ExtractMetadata, +from hearpreprocess.nsynth_pitch import ( + ExtractMetadata, # noqa: F401 extract_metadata_task, ) diff --git a/hearpreprocess/pipeline.py b/hearpreprocess/pipeline.py index 3e890fbf..15b32f95 100644 --- a/hearpreprocess/pipeline.py +++ b/hearpreprocess/pipeline.py @@ -25,8 +25,8 @@ diagnostics, download_file, new_basedir, - str2int, safecopy, + str2int, ) INCLUDE_DATESTR_IN_FINAL_PATHS = False diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py index 3a996e0e..6a437dcc 100644 --- a/hearpreprocess/sampler.py +++ b/hearpreprocess/sampler.py @@ -12,16 +12,15 @@ it simple to scale across multiple dataset """ +import copy import logging import multiprocessing import random import shutil import tempfile from pathlib import Path -from typing import Callable, Optional, Dict, Any +from typing import Any, Callable, Dict, Optional from urllib.parse import urlparse -import tempfile -import copy import click import luigi @@ -30,10 +29,9 @@ import hearpreprocess.pipeline as pipeline import hearpreprocess.tfds_pipeline as tfds_pipeline import 
hearpreprocess.util.audio as audio_util +import hearpreprocess.util.luigi as luigi_util from hearpreprocess import dcase2016_task2, nsynth_pitch, speech_commands, spoken_digit from hearpreprocess.util.luigi import WorkTask -import hearpreprocess.util.audio as audio_util -import hearpreprocess.util.luigi as luigi_util logger = logging.getLogger("luigi-interface") # Currently the sampler is only allowed to run for open tasks From 34992b117eebec67eee792aef3f18f1505c5f520 Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Thu, 18 Nov 2021 16:48:35 +0000 Subject: [PATCH 10/11] flake8 --- hearpreprocess/nsynth_pitch_kfold.py | 4 ++-- hearpreprocess/sampler.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hearpreprocess/nsynth_pitch_kfold.py b/hearpreprocess/nsynth_pitch_kfold.py index 6a1051ba..49f26670 100644 --- a/hearpreprocess/nsynth_pitch_kfold.py +++ b/hearpreprocess/nsynth_pitch_kfold.py @@ -7,8 +7,8 @@ import copy import hearpreprocess.nsynth_pitch as nsynth_pitch -from hearpreprocess.nsynth_pitch import ( - ExtractMetadata, # noqa: F401 +from hearpreprocess.nsynth_pitch import ( # noqa: F401 + ExtractMetadata, extract_metadata_task, ) diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py index 6a437dcc..ff1d7277 100644 --- a/hearpreprocess/sampler.py +++ b/hearpreprocess/sampler.py @@ -83,7 +83,7 @@ "task_config": spoken_digit.generic_task_config, "audio_sample_size": 100, "necessary_keys": [], - "get_download_and_extract_tasks": tfds_pipeline.get_download_and_extract_tasks_tfds, + "get_download_and_extract_tasks": tfds_pipeline.get_download_and_extract_tasks_tfds, # noqa: E501 }, # Add the sampler config for the secrets task if the secret task config was found. 
# Not available for participants From 8a56aced50937f8914827ca97058a11446e3d9ff Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Thu, 18 Nov 2021 16:51:48 +0000 Subject: [PATCH 11/11] type fix --- hearpreprocess/sampler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py index ff1d7277..fc09d148 100644 --- a/hearpreprocess/sampler.py +++ b/hearpreprocess/sampler.py @@ -19,7 +19,7 @@ import shutil import tempfile from pathlib import Path -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional, Type from urllib.parse import urlparse import click @@ -188,7 +188,9 @@ def run(self): shutil.make_archive(copy_to, "zip", copy_to) -def get_sampler_task(sampler_config: Dict[str, Any]) -> RandomSampleOriginalDataset: +def get_sampler_task( + sampler_config: Dict[str, Any] +) -> Type[RandomSampleOriginalDataset]: """ Returns a task to do sampling after downloading the dataset with download and extract tasks from the dataset specific