Skip to content

Commit

Permalink
Merge branch 'main' of github.com:nomadkaraoke/python-audio-separator
Browse files Browse the repository at this point in the history
  • Loading branch information
beveradb committed Nov 2, 2024
2 parents ec4bfcf + 5e83a96 commit dae33d0
Show file tree
Hide file tree
Showing 6 changed files with 863 additions and 783 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ Any file listed in the list models output can be specified (with file extension)

```sh
usage: audio-separator [-h] [-v] [-d] [-e] [-l] [--log_level LOG_LEVEL] [-m MODEL_FILENAME] [--output_format OUTPUT_FORMAT] [--output_dir OUTPUT_DIR] [--model_file_dir MODEL_FILE_DIR] [--invert_spect]
[--normalization NORMALIZATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP] [--mdx_batch_size MDX_BATCH_SIZE]
[--normalization NORMALIZATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--use_autocast] [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP] [--mdx_batch_size MDX_BATCH_SIZE]
[--mdx_hop_length MDX_HOP_LENGTH] [--mdx_enable_denoise] [--vr_batch_size VR_BATCH_SIZE] [--vr_window_size VR_WINDOW_SIZE] [--vr_aggression VR_AGGRESSION] [--vr_enable_tta]
[--vr_high_end_process] [--vr_enable_post_process] [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_segment_size DEMUCS_SEGMENT_SIZE] [--demucs_shifts DEMUCS_SHIFTS]
[--demucs_overlap DEMUCS_OVERLAP] [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED] [--mdxc_segment_size MDXC_SEGMENT_SIZE] [--mdxc_override_model_segment_size]
Expand Down Expand Up @@ -166,6 +166,7 @@ Common Separation Parameters:
--normalization NORMALIZATION value by which to multiply the amplitude of the output files (default: 0.9). Example: --normalization=0.7
--single_stem SINGLE_STEM output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental
--sample_rate SAMPLE_RATE set the sample rate of the output audio (default: 44100). Example: --sample_rate=44100
--use_autocast use PyTorch autocast for faster inference (default: False). Do not use for CPU inference. Example: --use_autocast
MDX Architecture Parameters:
--mdx_segment_size MDX_SEGMENT_SIZE larger consumes more resources, but may give better results (default: 256). Example: --mdx_segment_size=256
Expand Down Expand Up @@ -315,6 +316,7 @@ output_file_paths_6 = separator.separate('audio3.wav')
- invert_using_spec: (Optional) Flag to invert using spectrogram. Default: False
- sample_rate: (Optional) Set the sample rate of the output audio. Default: 44100
- use_soundfile: (Optional) Use soundfile for output writing, can solve OOM issues, especially on longer audio.
- use_autocast: (Optional) Flag to use PyTorch autocast for faster inference. Do not use for CPU inference. Default: False
- mdx_params: (Optional) MDX Architecture Specific Attributes & Defaults. Default: {"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False}
- vr_params: (Optional) VR Architecture Specific Attributes & Defaults. Default: {"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}
- demucs_params: (Optional) Demucs Architecture Specific Attributes & Defaults. Default: {"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True}
Expand Down
1 change: 0 additions & 1 deletion audio_separator/separator/architectures/mdxc_separator.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,6 @@ def demix(self, mix: np.ndarray) -> dict:
# Move the weighting window to the same device as the other tensors
window = window.to(device)

# with torch.cuda.amp.autocast():
with torch.no_grad():
req_shape = (len(self.model_data_cfgdict.training.instruments),) + tuple(mix.shape)
result = torch.zeros(req_shape, dtype=torch.float32).to(device)
Expand Down
18 changes: 16 additions & 2 deletions audio_separator/separator/separator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import yaml
import requests
import torch
import torch.amp.autocast_mode as autocast_mode
import onnxruntime as ort
from tqdm import tqdm

Expand Down Expand Up @@ -43,6 +44,7 @@ class Separator:
invert_using_spec (bool): Flag to invert using spectrogram.
sample_rate (int): The sample rate of the audio.
use_soundfile (bool): Use soundfile for audio writing, can solve OOM issues.
use_autocast (bool): Flag to use PyTorch autocast for faster inference.
MDX Architecture Specific Attributes:
hop_length (int): The hop length for STFT.
Expand Down Expand Up @@ -78,6 +80,7 @@ def __init__(
invert_using_spec=False,
sample_rate=44100,
use_soundfile=False,
use_autocast=False,
mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
vr_params={"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
Expand Down Expand Up @@ -150,6 +153,7 @@ def __init__(
raise ValueError("The sample rate must be a non-zero whole number. Please provide a valid integer.")

self.use_soundfile = use_soundfile
self.use_autocast = use_autocast

# These are parameters which users may want to configure so we expose them to the top-level Separator class,
# even though they are specific to a single model architecture
Expand Down Expand Up @@ -737,15 +741,25 @@ def separate(self, audio_file_path):
Returns:
- output_files (list of str): A list containing the paths to the separated audio stem files.
"""
if not (self.torch_device and self.model_instance):
raise ValueError("Initialization failed or model not loaded. Please load a model before attempting to separate.")

# Starting the separation process
self.logger.info(f"Starting separation process for audio_file_path: {audio_file_path}")
separate_start_time = time.perf_counter()

self.logger.debug(f"Normalization threshold set to {self.normalization_threshold}, waveform will lowered to this max amplitude to avoid clipping.")
self.logger.debug(f"Amplification threshold set to {self.amplification_threshold}, waveform will scaled up to this max amplitude if below it.")

# Run separation method for the loaded model
output_files = self.model_instance.separate(audio_file_path)
# Run separation method for the loaded model with autocast enabled if supported by the device.
output_files = None
if self.use_autocast and autocast_mode.is_autocast_available(self.torch_device.type):
self.logger.debug("Autocast available.")
with autocast_mode.autocast(self.torch_device.type):
output_files = self.model_instance.separate(audio_file_path)
else:
self.logger.debug("Autocast unavailable.")
output_files = self.model_instance.separate(audio_file_path)

# Clear GPU cache to free up memory
self.model_instance.clear_gpu_cache()
Expand Down
4 changes: 4 additions & 0 deletions audio_separator/utils/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def main():
single_stem_help = "output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental"
sample_rate_help = "modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100"
use_soundfile_help = "Use soundfile to write audio output (default: %(default)s). Example: --use_soundfile"
use_autocast_help = "use PyTorch autocast for faster inference (default: %(default)s). Do not use for CPU inference. Example: --use_autocast"

common_params = parser.add_argument_group("Common Separation Parameters")
common_params.add_argument("--invert_spect", action="store_true", help=invert_spect_help)
Expand All @@ -63,6 +64,7 @@ def main():
common_params.add_argument("--single_stem", default=None, help=single_stem_help)
common_params.add_argument("--sample_rate", type=int, default=44100, help=sample_rate_help)
common_params.add_argument("--use_soundfile", action="store_true", help=use_soundfile_help)
common_params.add_argument("--use_autocast", action="store_true", help=use_autocast_help)

mdx_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256"
mdx_overlap_help = "amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25"
Expand Down Expand Up @@ -163,6 +165,8 @@ def main():
output_single_stem=args.single_stem,
invert_using_spec=args.invert_spect,
sample_rate=args.sample_rate,
use_soundfile=args.use_soundfile,
use_autocast=args.use_autocast,
mdx_params={
"hop_length": args.mdx_hop_length,
"segment_size": args.mdx_segment_size,
Expand Down
Loading

0 comments on commit dae33d0

Please sign in to comment.