From b7e122a5410d3d90ec11508c25ff20b616fa243c Mon Sep 17 00:00:00 2001
From: Humair Raj Khan <khumairraj@gmail.com>
Date: Thu, 11 Nov 2021 06:47:33 +0000
Subject: [PATCH 01/11] Modify audio stats func to avoid failing at directories
 without audio

---
 hearpreprocess/util/audio.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hearpreprocess/util/audio.py b/hearpreprocess/util/audio.py
index d5eb824b..88a52c20 100644
--- a/hearpreprocess/util/audio.py
+++ b/hearpreprocess/util/audio.py
@@ -160,6 +160,9 @@ def get_audio_dir_stats(
             all_file_paths,
         )
     )
+    if len(audio_paths) == 0:
+        print("No audio files present in the folder")
+        return {}
     rng = random.Random(0)
     rng.shuffle(audio_paths)
 

From 4b65a9a7cd9d14a3043307f397c3556a9aecc9e7 Mon Sep 17 00:00:00 2001
From: Humair Raj Khan <khumairraj@gmail.com>
Date: Thu, 11 Nov 2021 06:53:46 +0000
Subject: [PATCH 02/11] Modify schema to allow for actual sample duration
 without any trim and pad

---
 hearpreprocess/util/task_config.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/hearpreprocess/util/task_config.py b/hearpreprocess/util/task_config.py
index 14d3bed2..59ae0f97 100644
--- a/hearpreprocess/util/task_config.py
+++ b/hearpreprocess/util/task_config.py
@@ -113,7 +113,9 @@ def validate_generic_task_config(
             "embedding_type": Or("scene", "event", str),
             "prediction_type": Or("multiclass", "multilabel", str),
             "split_mode": Or("trainvaltest", "presplit_kfold", "new_split_kfold"),
-            "sample_duration": Or(float, int),
+            # When the sample duration is None, the original audio is retained
+            # without any trimming and padding
+            "sample_duration": Or(float, int, None),
             "evaluation": Schema([str]),
             "default_mode": Or("5h", "50h", "full", str),
         }
@@ -177,6 +179,12 @@ def validate_generic_task_config(
                     ): object,
                 }
             )
+            # If the sample duration is set to None, the max_task_duration_by_split
+            # should also be None and no subsampling will be done
+            if task_config["sample_duration"] is None:
+                schema["max_task_duration_by_split"] = Schema(
+                    {split: Or(int, float, None) for split in SPLITS}
+                )
         elif split_mode in ["presplit_kfold", "new_split_kfold"]:
 
             assert (
@@ -203,6 +211,10 @@ def validate_generic_task_config(
                     ): object,
                 }
             )
+            # If the sample duration is set to None, the max_task_duration_by_fold
+            # should also be None and no subsampling will be done
+            if task_config["sample_duration"] is None:
+                schema["max_task_duration_by_fold"] = None
         else:
             raise ValueError("Invalid split_mode")
 

From 498ddfb82e7162318ffb2d394df8d3a1763bad9d Mon Sep 17 00:00:00 2001
From: Humair Raj Khan <khumairraj@gmail.com>
Date: Thu, 11 Nov 2021 07:06:18 +0000
Subject: [PATCH 03/11] Modify sampler to work for custom download functions

---
 hearpreprocess/sampler.py | 63 +++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 25 deletions(-)

diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py
index 6179740d..33fedd12 100644
--- a/hearpreprocess/sampler.py
+++ b/hearpreprocess/sampler.py
@@ -17,9 +17,10 @@
 import random
 import shutil
 from pathlib import Path
-from typing import Optional, Dict, Any
+from typing import Callable, Optional, Dict, Any
 from urllib.parse import urlparse
 import tempfile
+import copy
 
 import click
 import luigi
@@ -30,6 +31,7 @@
 from hearpreprocess import dcase2016_task2, nsynth_pitch, speech_commands, spoken_digit
 from hearpreprocess.util.luigi import WorkTask
 import hearpreprocess.util.audio as audio_util
+import hearpreprocess.util.luigi as luigi_util
 
 logger = logging.getLogger("luigi-interface")
 # Currently the sampler is only allowed to run for open tasks
@@ -59,21 +61,25 @@
         "task_config": dcase2016_task2.generic_task_config,
         "audio_sample_size": 4,
         "necessary_keys": [],
+        "get_download_and_extract_tasks": pipeline.get_download_and_extract_tasks,
     },
     "nsynth_pitch": {
         "task_config": nsynth_pitch.generic_task_config,
         "audio_sample_size": 100,
         "necessary_keys": [],
+        "get_download_and_extract_tasks": pipeline.get_download_and_extract_tasks,
     },
     "speech_commands": {
         "task_config": speech_commands.generic_task_config,
         "audio_sample_size": 100,
         "necessary_keys": [],
+        "get_download_and_extract_tasks": pipeline.get_download_and_extract_tasks,
     },
     "spoken_digit": {
         "task_config": spoken_digit.generic_task_config,
         "audio_sample_size": 100,
         "necessary_keys": [],
+        "get_download_and_extract_tasks": tfds_pipeline.get_download_and_extract_tasks_tfds,
     },
     # Add the sampler config for the secrets task if the secret task config was found.
     # Not available for participants
@@ -81,23 +87,10 @@
 }
 
 
-class RandomSampleOriginalDataset(WorkTask):
+class _RandomSampleOriginalDataset(WorkTask):
     necessary_keys = luigi.ListParameter()
     audio_sample_size = luigi.IntParameter()
 
-    def requires(self):
-        # If this is a TensorFlow dataset then use the tfds pipeline
-        if "tfds_task_name" in self.task_config:
-            return tfds_pipeline.get_download_and_extract_tasks_tfds(self.task_config)
-
-        return pipeline.get_download_and_extract_tasks(self.task_config)
-
-    @staticmethod
-    def safecopy(src, dst):
-        # Make sure the parent destination directory exists
-        dst.parent.mkdir(parents=True, exist_ok=True)
-        shutil.copy2(src, dst)
-
     @staticmethod
     def trimcopy_audio(src, tmp_dst, fin_dst, small_duration):
         """
@@ -167,7 +160,9 @@ def run(self):
 
             # Copy all the non audio files
             for file in tqdm(copy_files):
-                self.safecopy(src=copy_from.joinpath(file), dst=copy_to.joinpath(file))
+                luigi_util.safecopy(
+                    src=copy_from.joinpath(file), dst=copy_to.joinpath(file)
+                )
 
             # Save all the audio after trimming them to small sample duration
             # The small sample duration(in seconds) is specified in the small
@@ -189,6 +184,29 @@ def run(self):
             shutil.make_archive(copy_to, "zip", copy_to)
 
 
+def get_sampler_task(sampler_config: Dict[str, Any]) -> _RandomSampleOriginalDataset:
+    """
+    Returns a task to do sampling after downloading the dataset with
+    download and extract tasks from the dataset specific
+    `get_download_and_extract_tasks` function
+    """
+    _task_config: Dict[str, Any] = copy.deepcopy(sampler_config["task_config"])
+    _task_config["mode"] = _task_config["default_mode"]
+    _get_download_and_extract_tasks: Callable = sampler_config[
+        "get_download_and_extract_tasks"
+    ]
+
+    class RandomSampleOriginalDataset(_RandomSampleOriginalDataset):
+        task_config = _task_config
+        audio_sample_size = sampler_config["audio_sample_size"]
+        necessary_keys = sampler_config["necessary_keys"]
+
+        def requires(self):
+            return _get_download_and_extract_tasks(self.task_config)
+
+    return RandomSampleOriginalDataset
+
+
 @click.command()
 @click.argument("task")
 @click.option(
@@ -202,15 +220,10 @@ def main(task: str, num_workers: Optional[int] = None):
     if num_workers is None:
         num_workers = multiprocessing.cpu_count()
     logger.info(f"Using {num_workers} workers")
-    config: Dict[str, Any] = configs[task]
-    default_config: str = config["task_config"]["default_mode"]
-    config["task_config"]["mode"] = default_config
-    sampler = RandomSampleOriginalDataset(
-        task_config=config["task_config"],
-        audio_sample_size=config["audio_sample_size"],
-        necessary_keys=config["necessary_keys"],
-    )
-    pipeline.run(sampler, num_workers=num_workers)
+
+    sampler_config: Dict[str, Any] = configs[task]
+    sampler = get_sampler_task(sampler_config)
+    pipeline.run(sampler(), num_workers=num_workers)
 
 
 if __name__ == "__main__":

From b5b3719dd3e87f69a46f4ec448723d6d7e64c15b Mon Sep 17 00:00:00 2001
From: Humair Raj Khan <khumairraj@gmail.com>
Date: Thu, 11 Nov 2021 07:07:55 +0000
Subject: [PATCH 04/11] Modify pipeline for variable audio size

---
 hearpreprocess/pipeline.py | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/hearpreprocess/pipeline.py b/hearpreprocess/pipeline.py
index 0851de4f..1a38d39f 100644
--- a/hearpreprocess/pipeline.py
+++ b/hearpreprocess/pipeline.py
@@ -26,6 +26,7 @@
     download_file,
     new_basedir,
     str2int,
+    safecopy,
 )
 
 INCLUDE_DATESTR_IN_FINAL_PATHS = False
@@ -665,9 +666,10 @@ def run(self):
             # duration
             # sample duration is specified in the task config.
             # The specified sample duration is in seconds
-            metadata = self.trim_event_metadata(
-                metadata, duration=self.task_config["sample_duration"]
-            )
+            if self.task_config["sample_duration"] is not None:
+                metadata = self.trim_event_metadata(
+                    metadata, duration=self.task_config["sample_duration"]
+                )
         else:
             raise ValueError(
                 "%s embedding_type unknown" % self.task_config["embedding_type"]
@@ -818,6 +820,14 @@ def run(self):
         # minutes or the timestamp embeddings will explode
         sample_duration = self.task_config["sample_duration"]
         max_split_duration = self.get_max_split_duration()
+        if sample_duration is None:
+            assert max_split_duration is None, (
+                "If the sample duration is set to None i.e. orignal audio files "
+                "are being used without any trimming or padding, then the "
+                "max_split_duration should also be None, so that no "
+                "subsampling is done as the audio file length is not "
+                "consistent."
+            )
 
         # If max_split_duration is not None set the max_files so that
         # the total duration of all the audio files after subsampling
@@ -919,15 +929,18 @@ def requires(self):
 
     def run(self):
         self.createsplit()
-
         for audiofile in tqdm(list(self.requires()["corpus"].splitdir.iterdir())):
             newaudiofile = self.splitdir.joinpath(f"{audiofile.stem}.wav")
-            audio_util.trim_pad_wav(
-                str(audiofile),
-                str(newaudiofile),
-                duration=self.task_config["sample_duration"],
-            )
-
+            if self.task_config["sample_duration"] is not None:
+                audio_util.trim_pad_wav(
+                    str(audiofile),
+                    str(newaudiofile),
+                    duration=self.task_config["sample_duration"],
+                )
+            else:
+                # If the sample_duration is None, the file will be copied
+                # without any trimming or padding
+                safecopy(src=audiofile, dst=newaudiofile)
         self.mark_complete()
 
 

From e3f15fcde5836e2e135c46036872413d4e8239d4 Mon Sep 17 00:00:00 2001
From: Humair Raj Khan <khumairraj@gmail.com>
Date: Thu, 11 Nov 2021 07:08:26 +0000
Subject: [PATCH 05/11] Add safecopy utility function

---
 hearpreprocess/util/luigi.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/hearpreprocess/util/luigi.py b/hearpreprocess/util/luigi.py
index 5c293058..af68e835 100644
--- a/hearpreprocess/util/luigi.py
+++ b/hearpreprocess/util/luigi.py
@@ -8,6 +8,7 @@
 import os.path
 from functools import partial
 from pathlib import Path
+import shutil
 
 import luigi
 import requests
@@ -219,3 +220,13 @@ def str2int(s: str) -> int:
     https://stackoverflow.com/a/16008760/82733
     """
     return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (2 ** 32 - 1)
+
+
+def safecopy(src, dst):
+    """
+    Copies a file, first ensuring the destination's parent directory exists.
+    If the parent does not exist, it is created (including any missing
+    ancestors) before the file is copied.
+    """
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(src, dst)

From 0ca020b6d04765b9b1c18c2126db9128457eee29 Mon Sep 17 00:00:00 2001
From: Humair Raj Khan <khumairraj@gmail.com>
Date: Sun, 14 Nov 2021 10:29:00 +0000
Subject: [PATCH 06/11] Add description

---
 hearpreprocess/pipeline.py         |  7 +++++++
 hearpreprocess/util/task_config.py | 11 +++++++++++
 2 files changed, 18 insertions(+)

diff --git a/hearpreprocess/pipeline.py b/hearpreprocess/pipeline.py
index 1a38d39f..1737f3a3 100644
--- a/hearpreprocess/pipeline.py
+++ b/hearpreprocess/pipeline.py
@@ -666,6 +666,13 @@ def run(self):
             # duration
             # sample duration is specified in the task config.
             # The specified sample duration is in seconds
+
+            # If the sample duration is set to None, no trimming of events will
+            # be done and the full audio file will be selected. This mode is
+            # only for special tasks and should not be generally used.
+            # Having all the audio files of the same length is more
+            # efficient for downstream pipelines
+
             if self.task_config["sample_duration"] is not None:
                 metadata = self.trim_event_metadata(
                     metadata, duration=self.task_config["sample_duration"]
diff --git a/hearpreprocess/util/task_config.py b/hearpreprocess/util/task_config.py
index 59ae0f97..2063b912 100644
--- a/hearpreprocess/util/task_config.py
+++ b/hearpreprocess/util/task_config.py
@@ -73,6 +73,17 @@ def validate_generic_task_config(
             This validator checks if the tfds task configuration is correctly
             defined
 
+        * sample_duration can also be set to `None`, rather than integer or
+            float, in which case, the audio files in the dataset will not
+            be trimmed or padded, rather the original file duration will be
+            retained in the output of the pipeline.
+            In this case, the max split duration should be set to
+            None and no subsampling can be done, as file durations are
+            not consistent.
+            However, this is only for specific tasks and should not be generally
+            used as it is not efficient for downstream pipelines, particularly
+            embedding generation in heareval
+
     Args:
         task_config: Task config to be used with the pipeline
         ignore_extra_keys: Flag for ignoring extra keys in the task configuration.

From 466ff28b29ea9dcd7f8ece8eabbe1d0515640e49 Mon Sep 17 00:00:00 2001
From: Humair Raj Khan <khumairraj@gmail.com>
Date: Wed, 17 Nov 2021 04:08:06 +0000
Subject: [PATCH 07/11] Update comment and reverse variable name

---
 hearpreprocess/sampler.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py
index 33fedd12..4efb7aa9 100644
--- a/hearpreprocess/sampler.py
+++ b/hearpreprocess/sampler.py
@@ -56,6 +56,10 @@
 
 
 # Note: Necessary key helps to select audios with the necessary keys in there name
+# Note: The `get_download_and_extract_tasks` is the task specific function which
+#   returns the tasks to download and extract the dataset for the task. This is
+#   required here, because the sampling task needs to download and extract the
+#   dataset before actual sampling
 configs = {
     "dcase2016_task2": {
         "task_config": dcase2016_task2.generic_task_config,
@@ -87,7 +91,7 @@
 }
 
 
-class _RandomSampleOriginalDataset(WorkTask):
+class RandomSampleOriginalDataset(WorkTask):
     necessary_keys = luigi.ListParameter()
     audio_sample_size = luigi.IntParameter()
 
@@ -184,7 +188,7 @@ def run(self):
             shutil.make_archive(copy_to, "zip", copy_to)
 
 
-def get_sampler_task(sampler_config: Dict[str, Any]) -> _RandomSampleOriginalDataset:
+def get_sampler_task(sampler_config: Dict[str, Any]) -> RandomSampleOriginalDataset:
     """
     Returns a task to do sampling after downloading the dataset with
     download and extract tasks from the dataset specific
@@ -196,7 +200,7 @@ def get_sampler_task(sampler_config: Dict[str, Any]) -> _RandomSampleOriginalDat
         "get_download_and_extract_tasks"
     ]
 
-    class RandomSampleOriginalDataset(_RandomSampleOriginalDataset):
+    class _RandomSampleOriginalDataset(RandomSampleOriginalDataset):
         task_config = _task_config
         audio_sample_size = sampler_config["audio_sample_size"]
         necessary_keys = sampler_config["necessary_keys"]
@@ -204,7 +208,7 @@ class RandomSampleOriginalDataset(_RandomSampleOriginalDataset):
         def requires(self):
             return _get_download_and_extract_tasks(self.task_config)
 
-    return RandomSampleOriginalDataset
+    return _RandomSampleOriginalDataset
 
 
 @click.command()

From 28e8f28cde754a523c3afe4f1e2c94a669aa6fcc Mon Sep 17 00:00:00 2001
From: Humair Raj Khan <khumairraj@gmail.com>
Date: Wed, 17 Nov 2021 04:14:26 +0000
Subject: [PATCH 08/11] Modify task config

---
 hearpreprocess/util/task_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hearpreprocess/util/task_config.py b/hearpreprocess/util/task_config.py
index 2063b912..d3af6daa 100644
--- a/hearpreprocess/util/task_config.py
+++ b/hearpreprocess/util/task_config.py
@@ -194,7 +194,7 @@ def validate_generic_task_config(
             # should also be None and no subsampling will be done
             if task_config["sample_duration"] is None:
                 schema["max_task_duration_by_split"] = Schema(
-                    {split: Or(int, float, None) for split in SPLITS}
+                    {split: None for split in SPLITS}
                 )
         elif split_mode in ["presplit_kfold", "new_split_kfold"]:
 

From e17732c29ab6f68dafb1ac8b5568e77d41daebf1 Mon Sep 17 00:00:00 2001
From: Joseph Turian <turian@gmail.com>
Date: Thu, 18 Nov 2021 16:44:13 +0000
Subject: [PATCH 09/11] imports

---
 hearpreprocess/nsynth_pitch_kfold.py | 4 ++--
 hearpreprocess/pipeline.py           | 2 +-
 hearpreprocess/sampler.py            | 8 +++-----
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/hearpreprocess/nsynth_pitch_kfold.py b/hearpreprocess/nsynth_pitch_kfold.py
index 49f26670..6a1051ba 100644
--- a/hearpreprocess/nsynth_pitch_kfold.py
+++ b/hearpreprocess/nsynth_pitch_kfold.py
@@ -7,8 +7,8 @@
 import copy
 
 import hearpreprocess.nsynth_pitch as nsynth_pitch
-from hearpreprocess.nsynth_pitch import (  # noqa: F401
-    ExtractMetadata,
+from hearpreprocess.nsynth_pitch import (
+    ExtractMetadata,  # noqa: F401
     extract_metadata_task,
 )
 
diff --git a/hearpreprocess/pipeline.py b/hearpreprocess/pipeline.py
index 3e890fbf..15b32f95 100644
--- a/hearpreprocess/pipeline.py
+++ b/hearpreprocess/pipeline.py
@@ -25,8 +25,8 @@
     diagnostics,
     download_file,
     new_basedir,
-    str2int,
     safecopy,
+    str2int,
 )
 
 INCLUDE_DATESTR_IN_FINAL_PATHS = False
diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py
index 3a996e0e..6a437dcc 100644
--- a/hearpreprocess/sampler.py
+++ b/hearpreprocess/sampler.py
@@ -12,16 +12,15 @@
 it simple to scale across multiple dataset
 """
 
+import copy
 import logging
 import multiprocessing
 import random
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Callable, Optional, Dict, Any
+from typing import Any, Callable, Dict, Optional
 from urllib.parse import urlparse
-import tempfile
-import copy
 
 import click
 import luigi
@@ -30,10 +29,9 @@
 import hearpreprocess.pipeline as pipeline
 import hearpreprocess.tfds_pipeline as tfds_pipeline
 import hearpreprocess.util.audio as audio_util
+import hearpreprocess.util.luigi as luigi_util
 from hearpreprocess import dcase2016_task2, nsynth_pitch, speech_commands, spoken_digit
 from hearpreprocess.util.luigi import WorkTask
-import hearpreprocess.util.audio as audio_util
-import hearpreprocess.util.luigi as luigi_util
 
 logger = logging.getLogger("luigi-interface")
 # Currently the sampler is only allowed to run for open tasks

From 34992b117eebec67eee792aef3f18f1505c5f520 Mon Sep 17 00:00:00 2001
From: Joseph Turian <turian@gmail.com>
Date: Thu, 18 Nov 2021 16:48:35 +0000
Subject: [PATCH 10/11] flake8

---
 hearpreprocess/nsynth_pitch_kfold.py | 4 ++--
 hearpreprocess/sampler.py            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hearpreprocess/nsynth_pitch_kfold.py b/hearpreprocess/nsynth_pitch_kfold.py
index 6a1051ba..49f26670 100644
--- a/hearpreprocess/nsynth_pitch_kfold.py
+++ b/hearpreprocess/nsynth_pitch_kfold.py
@@ -7,8 +7,8 @@
 import copy
 
 import hearpreprocess.nsynth_pitch as nsynth_pitch
-from hearpreprocess.nsynth_pitch import (
-    ExtractMetadata,  # noqa: F401
+from hearpreprocess.nsynth_pitch import (  # noqa: F401
+    ExtractMetadata,
     extract_metadata_task,
 )
 
diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py
index 6a437dcc..ff1d7277 100644
--- a/hearpreprocess/sampler.py
+++ b/hearpreprocess/sampler.py
@@ -83,7 +83,7 @@
         "task_config": spoken_digit.generic_task_config,
         "audio_sample_size": 100,
         "necessary_keys": [],
-        "get_download_and_extract_tasks": tfds_pipeline.get_download_and_extract_tasks_tfds,
+        "get_download_and_extract_tasks": tfds_pipeline.get_download_and_extract_tasks_tfds,  # noqa: E501
     },
     # Add the sampler config for the secrets task if the secret task config was found.
     # Not available for participants

From 8a56aced50937f8914827ca97058a11446e3d9ff Mon Sep 17 00:00:00 2001
From: Joseph Turian <turian@gmail.com>
Date: Thu, 18 Nov 2021 16:51:48 +0000
Subject: [PATCH 11/11] type fix

---
 hearpreprocess/sampler.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hearpreprocess/sampler.py b/hearpreprocess/sampler.py
index ff1d7277..fc09d148 100644
--- a/hearpreprocess/sampler.py
+++ b/hearpreprocess/sampler.py
@@ -19,7 +19,7 @@
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Type
 from urllib.parse import urlparse
 
 import click
@@ -188,7 +188,9 @@ def run(self):
             shutil.make_archive(copy_to, "zip", copy_to)
 
 
-def get_sampler_task(sampler_config: Dict[str, Any]) -> RandomSampleOriginalDataset:
+def get_sampler_task(
+    sampler_config: Dict[str, Any]
+) -> Type[RandomSampleOriginalDataset]:
     """
     Returns a task to do sampling after downloading the dataset with
     download and extract tasks from the dataset specific