Merge branch 'main' of github.com:nomadkaraoke/python-audio-separator

nomadkaraoke · Nov 2, 2024 · dae33d0 · dae33d0
2 parents ec4bfcf + 5e83a96
commit dae33d0
Show file tree

Hide file tree

Showing 6 changed files with 863 additions and 783 deletions.
diff --git a/README.md b/README.md
@@ -133,7 +133,7 @@ Any file listed in the list models output can be specified (with file extension)
 
 ```sh
 usage: audio-separator [-h] [-v] [-d] [-e] [-l] [--log_level LOG_LEVEL] [-m MODEL_FILENAME] [--output_format OUTPUT_FORMAT] [--output_dir OUTPUT_DIR] [--model_file_dir MODEL_FILE_DIR] [--invert_spect]
-                       [--normalization NORMALIZATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP] [--mdx_batch_size MDX_BATCH_SIZE]
+                       [--normalization NORMALIZATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--use_autocast] [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP] [--mdx_batch_size MDX_BATCH_SIZE]
                        [--mdx_hop_length MDX_HOP_LENGTH] [--mdx_enable_denoise] [--vr_batch_size VR_BATCH_SIZE] [--vr_window_size VR_WINDOW_SIZE] [--vr_aggression VR_AGGRESSION] [--vr_enable_tta]
                        [--vr_high_end_process] [--vr_enable_post_process] [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_segment_size DEMUCS_SEGMENT_SIZE] [--demucs_shifts DEMUCS_SHIFTS]
                        [--demucs_overlap DEMUCS_OVERLAP] [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED] [--mdxc_segment_size MDXC_SEGMENT_SIZE] [--mdxc_override_model_segment_size]
@@ -166,6 +166,7 @@ Common Separation Parameters:
   --normalization NORMALIZATION                          value by which to multiply the amplitude of the output files (default: 0.9). Example: --normalization=0.7
   --single_stem SINGLE_STEM                              output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental
   --sample_rate SAMPLE_RATE                              set the sample rate of the output audio (default: 44100). Example: --sample_rate=44100
+  --use_autocast                                         use PyTorch autocast for faster inference (default: False). Do not use for CPU inference. Example: --use_autocast
 
 MDX Architecture Parameters:
   --mdx_segment_size MDX_SEGMENT_SIZE                    larger consumes more resources, but may give better results (default: 256). Example: --mdx_segment_size=256
@@ -315,6 +316,7 @@ output_file_paths_6 = separator.separate('audio3.wav')
 - invert_using_spec: (Optional) Flag to invert using spectrogram. Default: False
 - sample_rate: (Optional) Set the sample rate of the output audio. Default: 44100
 - use_soundfile: (Optional) Use soundfile for output writing, can solve OOM issues, especially on longer audio. 
+- use_autocast: (Optional) Flag to use PyTorch autocast for faster inference. Do not use for CPU inference. Default: False
 - mdx_params: (Optional) MDX Architecture Specific Attributes & Defaults. Default: {"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1}
 - vr_params: (Optional) VR Architecture Specific Attributes & Defaults. Default: {"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}
 - demucs_params: (Optional) Demucs Architecture Specific Attributes & Defaults. Default: {"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True}

diff --git a/audio_separator/separator/architectures/mdxc_separator.py b/audio_separator/separator/architectures/mdxc_separator.py
@@ -239,7 +239,6 @@ def demix(self, mix: np.ndarray) -> dict:
             # Transfer to the weighting plate for the same device as the other tensors
             window = window.to(device)
 
-            # with torch.cuda.amp.autocast():
             with torch.no_grad():
                 req_shape = (len(self.model_data_cfgdict.training.instruments),) + tuple(mix.shape)
                 result = torch.zeros(req_shape, dtype=torch.float32).to(device)

diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py
@@ -15,6 +15,7 @@
 import yaml
 import requests
 import torch
+import torch.amp.autocast_mode as autocast_mode
 import onnxruntime as ort
 from tqdm import tqdm
 
@@ -43,6 +44,7 @@ class Separator:
         invert_using_spec (bool): Flag to invert using spectrogram.
         sample_rate (int): The sample rate of the audio.
         use_soundfile (bool): Use soundfile for audio writing, can solve OOM issues.
+        use_autocast (bool): Flag to use PyTorch autocast for faster inference.
 
     MDX Architecture Specific Attributes:
         hop_length (int): The hop length for STFT.
@@ -78,6 +80,7 @@ def __init__(
         invert_using_spec=False,
         sample_rate=44100,
         use_soundfile=False,
+        use_autocast=False,
         mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
         vr_params={"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
         demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
@@ -150,6 +153,7 @@ def __init__(
             raise ValueError("The sample rate must be a non-zero whole number. Please provide a valid integer.")
 
         self.use_soundfile = use_soundfile
+        self.use_autocast = use_autocast
 
         # These are parameters which users may want to configure so we expose them to the top-level Separator class,
         # even though they are specific to a single model architecture
@@ -737,15 +741,25 @@ def separate(self, audio_file_path):
         Returns:
         - output_files (list of str): A list containing the paths to the separated audio stem files.
         """
+        if not (self.torch_device and self.model_instance):
+            raise ValueError("Initialization failed or model not loaded. Please load a model before attempting to separate.")
+
         # Starting the separation process
         self.logger.info(f"Starting separation process for audio_file_path: {audio_file_path}")
         separate_start_time = time.perf_counter()
 
         self.logger.debug(f"Normalization threshold set to {self.normalization_threshold}, waveform will lowered to this max amplitude to avoid clipping.")
         self.logger.debug(f"Amplification threshold set to {self.amplification_threshold}, waveform will scaled up to this max amplitude if below it.")
 
-        # Run separation method for the loaded model
-        output_files = self.model_instance.separate(audio_file_path)
+        # Run separation method for the loaded model with autocast enabled if supported by the device.
+        output_files = None
+        if self.use_autocast and autocast_mode.is_autocast_available(self.torch_device.type):
+            self.logger.debug("Autocast available.")
+            with autocast_mode.autocast(self.torch_device.type):
+                output_files = self.model_instance.separate(audio_file_path)
+        else:
+            self.logger.debug("Autocast unavailable.")
+            output_files = self.model_instance.separate(audio_file_path)
 
         # Clear GPU cache to free up memory
         self.model_instance.clear_gpu_cache()

diff --git a/audio_separator/utils/cli.py b/audio_separator/utils/cli.py
@@ -55,6 +55,7 @@ def main():
     single_stem_help = "output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental"
     sample_rate_help = "modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100"
     use_soundfile_help = "Use soundfile to write audio output (default: %(default)s). Example: --use_soundfile"
+    use_autocast_help = "use PyTorch autocast for faster inference (default: %(default)s). Do not use for CPU inference. Example: --use_autocast"
 
     common_params = parser.add_argument_group("Common Separation Parameters")
     common_params.add_argument("--invert_spect", action="store_true", help=invert_spect_help)
@@ -63,6 +64,7 @@ def main():
     common_params.add_argument("--single_stem", default=None, help=single_stem_help)
     common_params.add_argument("--sample_rate", type=int, default=44100, help=sample_rate_help)
     common_params.add_argument("--use_soundfile", action="store_true", help=use_soundfile_help)
+    common_params.add_argument("--use_autocast", action="store_true", help=use_autocast_help)
 
     mdx_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256"
     mdx_overlap_help = "amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25"
@@ -163,6 +165,8 @@ def main():
         output_single_stem=args.single_stem,
         invert_using_spec=args.invert_spect,
         sample_rate=args.sample_rate,
+        use_soundfile=args.use_soundfile,
+        use_autocast=args.use_autocast,
         mdx_params={
             "hop_length": args.mdx_hop_length,
             "segment_size": args.mdx_segment_size,