1.3.0 ( 7/* - Renewal Process... ) BACKUP

Bebra777228 · Nov 12, 2024 · ad10752 · ad10752
1 parent 84f9af4
commit ad10752
Show file tree

Hide file tree

Showing 9 changed files with 328 additions and 82 deletions.
diff --git a/app.py b/app.py
@@ -14,7 +14,7 @@
     zip_upload,
 )
 from tabs.uvr.uvr import uvr_tab
-from tabs.info.welcome import welcome_tab
+from tabs.welcome import welcome_tab
 
 DEFAULT_PORT = 4000
 MAX_PORT_ATTEMPTS = 10

diff --git a/app_offline.py b/app_offline.py
@@ -6,7 +6,7 @@
 from tabs.inference.inference_batch import inference_batch_tab
 from tabs.inference.inference_single import inference_single_tab
 from tabs.install.install import files_upload, output_message, zip_upload
-from tabs.info.welcome import welcome_tab
+from tabs.welcome import welcome_tab
 
 DEFAULT_PORT = 4000
 MAX_PORT_ATTEMPTS = 10

diff --git a/rvc/cli/edge_tts_cli.py b/rvc/cli/edge_tts_cli.py
@@ -1,7 +1,7 @@
 import argparse
 import os
 
-from rvc.infer.infer import edge_tts_pipeline
+from tabs.edge_tts.edge_tts import edge_tts_pipeline
 
 rvc_models_dir = os.path.join(os.getcwd(), "models")
 

diff --git a/rvc/cli/rvc_cli.py b/rvc/cli/rvc_cli.py
@@ -1,7 +1,7 @@
 import argparse
 import os
 
-from rvc.infer.infer import voice_pipeline_single
+from tabs.inference.inference_single import voice_pipeline_single
 
 rvc_models_dir = os.path.join(os.getcwd(), "models")
 

diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py
@@ -1,83 +1,134 @@
 import gc
 import os
+
 import librosa
 import numpy as np
 import soundfile as sf
 import torch
-import asyncio
-import gradio as gr
 from fairseq import checkpoint_utils
 from pydub import AudioSegment
 from scipy.io import wavfile
-import random
 
 from rvc.infer.config import Config
 from rvc.infer.pipeline import VC
 from rvc.lib.algorithm.synthesizers import Synthesizer
 from rvc.lib.my_utils import load_audio
 
+# Initialize the shared configuration object
 config = Config()
 
+
 RVC_MODELS_DIR = os.path.join(os.getcwd(), "models")
 EMBEDDERS_DIR = os.path.join(os.getcwd(), "rvc", "models", "embedders")
 HUBERT_BASE_PATH = os.path.join(EMBEDDERS_DIR, "hubert_base.pt")
-OUTPUT_DIRS = {
-    "single": os.path.join(os.getcwd(), "output", "converted_audio", "single"),
-    "batch": os.path.join(os.getcwd(), "output", "converted_audio", "batch"),
-    "tts": os.path.join(os.getcwd(), "output", "converted_audio", "tts")
-}
-
-for dir_path in OUTPUT_DIRS.values():
-    os.makedirs(dir_path, exist_ok=True)
-
-def create_unique_output_dir(base_dir, prefix=""):
-    counter = 1
-    while True:
-        unique_dir = os.path.join(base_dir, f"{prefix}{counter}")
-        if not os.path.exists(unique_dir):
-            os.makedirs(unique_dir)
-            return unique_dir
-        counter += 1
 
+os.makedirs(EMBEDDERS_DIR, exist_ok=True)
+os.makedirs(RVC_MODELS_DIR, exist_ok=True)
+
+
+# Locate the RVC model file and its index file by model name.
+#
+# rvc_model: name of a sub-directory of RVC_MODELS_DIR.
+# Returns a (model_path, index_path) tuple; index_path is None when the
+# directory holds no ".index" file.
+# Raises ValueError when the directory holds no ".pth" file.
 def load_rvc_model(rvc_model):
+    # Build the path to the model directory
     model_dir = os.path.join(RVC_MODELS_DIR, rvc_model)
+    # List the files in the model directory
     model_files = os.listdir(model_dir)
-    rvc_model_path = next((os.path.join(model_dir, f) for f in model_files if f.endswith(".pth")), None)
-    rvc_index_path = next((os.path.join(model_dir, f) for f in model_files if f.endswith(".index")), None)
+
+    # Pick the first listed file with the .pth extension (the model weights)
+    rvc_model_path = next(
+        (os.path.join(model_dir, f) for f in model_files if f.endswith(".pth")), None
+    )
+    # Pick the first listed file with the .index extension (optional index)
+    rvc_index_path = next(
+        (os.path.join(model_dir, f) for f in model_files if f.endswith(".index")), None
+    )
+
+    # A model file is mandatory; fail if none was found
     if not rvc_model_path:
-        raise ValueError(f"Модель {rvc_model} не найдена.")
+        raise ValueError(
+            f"Модель {rvc_model} не существует. Возможно, вы ввели имя неправильно."
+        )
+
     return rvc_model_path, rvc_index_path
 
+
+# Load the Hubert model from a checkpoint and prepare it for inference.
+#
+# model_path: path to the checkpoint file.
+# Returns the first model of the loaded ensemble, moved to config.device,
+# cast to half or float precision according to config.is_half, in eval mode.
 def load_hubert(model_path):
-    models, _, _ = checkpoint_utils.load_model_ensemble_and_task([model_path])
+    # Load the Hubert model ensemble together with its config and task
+    # (saved_cfg and task are not used below)
+    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+        [model_path], suffix=""
+    )
+    # Move the model to the configured device (GPU or CPU)
     hubert = models[0].to(config.device)
-    hubert.eval().half() if config.is_half else hubert.float()
+    # Cast the model to half precision or to full (float) precision
+    hubert = hubert.half() if config.is_half else hubert.float()
+    # Switch the model to evaluation (inference) mode
+    hubert.eval()
     return hubert
 
+
+# Build the voice converter for a given RVC model checkpoint.
+#
+# model_path: path to the ".pth" checkpoint.
+# Returns a (cpt, version, net_g, tgt_sr, vc) tuple: the loaded checkpoint
+# dict, its version string ("v1" when absent), the synthesizer network, the
+# last entry of the checkpoint config, and a VC object built from it.
+# Raises ValueError when the checkpoint lacks a "config" or "weight" key.
 def get_vc(model_path):
-    cpt = torch.load(model_path, map_location="cpu")
+    # Load the checkpoint from disk onto the CPU (weights only)
+    cpt = torch.load(model_path, map_location="cpu", weights_only=True)
+
+    # Validate the checkpoint format
+    if "config" not in cpt or "weight" not in cpt:
+        raise ValueError(
+            f"Некорректный формат для {model_path}. Используйте голосовую модель, обученную с RVC v2."
+        )
+
+    # Extract the model parameters
+    # NOTE(review): the last config entry is presumably the target sample
+    # rate, and the third-from-last presumably the speaker count, which is
+    # overwritten with the first dimension of emb_g.weight - confirm.
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+    pitch_guidance = cpt.get("f0", 1)
+    version = cpt.get("version", "v1")
+    input_dim = 768 if version == "v2" else 256
+
+    # Instantiate the synthesizer
     net_g = Synthesizer(
         *cpt["config"],
-        use_f0=cpt.get("f0", 1),
-        input_dim=768 if cpt.get("version") == "v2" else 256,
+        use_f0=pitch_guidance,
+        input_dim=input_dim,
         is_half=config.is_half,
     )
+
+    # Remove the layer that is not needed here
     del net_g.enc_q
-    net_g.eval().to(config.device).half() if config.is_half else net_g.float()
-    return cpt, cpt.get("version", "v1"), net_g, tgt_sr, VC(tgt_sr, config)
+    # Load the weights non-strictly and print the result of the load
+    print(net_g.load_state_dict(cpt["weight"], strict=False))
+    # Switch to evaluation mode and move the model to the configured device
+    net_g.eval().to(config.device)
+    net_g = net_g.half() if config.is_half else net_g.float()
+
+    # Create the voice converter object
+    vc = VC(tgt_sr, config)
+    return cpt, version, net_g, tgt_sr, vc
+
+
+# Convert an audio file to stereo and save it as FLAC.
+#
+# input_path:  audio file to read.
+# output_path: destination of the FLAC result. It may be the same path as
+#              input_path, because the input is read completely before the
+#              output is written.
+def convert_to_stereo(input_path, output_path):
+    # Read every sample. soundfile returns a 1-D array for mono input and a
+    # 2-D (frames, channels) array otherwise.
+    y, sr = sf.read(input_path)
+
+    if y.ndim == 1:
+        # Mono: duplicate the single channel into left and right
+        y = np.vstack([y, y]).T
+    elif y.shape[1] > 2:
+        # More than two channels: keep only the first two. The channel
+        # count is on axis 1; checking the number of dimensions would
+        # never detect multi-channel audio.
+        y = y[:, :2]
+
+    # The data is always written as FLAC, whatever the extension of output_path
+    sf.write(output_path, y, sr, format="FLAC")
+
 
-def convert_audio(input_path, output_path, stereo=True, output_format="flac"):
-    audio, sr = sf.read(input_path)
-    if stereo and len(audio.shape) == 1:
-        audio = np.vstack([audio, audio]).T
-    sf.write(output_path, audio, sr, format=output_format)
+# Convert an audio file to the format chosen by the user.
+#
+# input_path:    audio file to read.
+# output_path:   destination file.
+# output_format: format name passed to AudioSegment.export.
+def convert_to_user_format(input_path, output_path, output_format):
+    # Load the audio file
+    audio = AudioSegment.from_file(input_path)
 
-async def text_to_speech(text, voice, output_path):
-    communicate = edge_tts.Communicate(text=text, voice=voice)
-    await communicate.save(output_path)
+    # Export the audio in the requested format
+    audio.export(output_path, format=output_format)
 
+
+# Выполняет инференс с использованием RVC
 def rvc_infer(
     voice_model,
     input_path,
@@ -92,21 +143,24 @@ def rvc_infer(
     f0_min,
     f0_max,
     output_format,
-    progress=None,
 ):
-    progress(0.1, "Загрузка модели Hubert...")
+    # Загружаем модель Hubert
     hubert_model = load_hubert(HUBERT_BASE_PATH)
-
-    progress(0.4, "Загрузка RVC модели...")
+    # Загружаем модель RVC и индекс
     model_path, index_path = load_rvc_model(voice_model)
+    # Получаем конвертер голоса
     cpt, version, net_g, tgt_sr, vc = get_vc(model_path)
+    # Загружаем аудиофайл
     audio = load_audio(input_path, 16000)
 
+    # Формируем имя выходного файла
     base_name = os.path.splitext(os.path.basename(input_path))[0]
     temp_output_path = os.path.join(output_dir, f"{base_name}_(Converted).flac")
-    final_output_path = os.path.join(output_dir, f"{base_name}_(Converted).{output_format}")
+    final_output_path = os.path.join(
+        output_dir, f"{base_name}_(Converted).{output_format}"
+    )
 
-    progress(0.5, "Выполнение конверсии голоса...")
+    # Выполняем конвертацию голоса
     audio_opt = vc.pipeline(
         hubert_model,
         net_g,
@@ -125,48 +179,86 @@ def rvc_infer(
         hop_length,
         f0_min,
         f0_max,
-        None,
+        f0_file=None,
     )
+    # Сохраняем результат в файл
     wavfile.write(temp_output_path, tgt_sr, audio_opt)
 
-    progress(0.8, "Конвертация в стерео и финальный формат...")
-    convert_audio(temp_output_path, final_output_path, stereo=True, output_format=output_format)
+    # Конвертируем файл в стерео формат с использованием scipy
+    convert_to_stereo(temp_output_path, temp_output_path)
+
+    # Конвертируем файл в выбранный пользователем формат
+    convert_to_user_format(temp_output_path, final_output_path, output_format)
+
+    # Удаляем временный файл
     os.remove(temp_output_path)
 
-    # Очистка памяти
+    # Освобождаем память
     del hubert_model, cpt, net_g, vc
     gc.collect()
     torch.cuda.empty_cache()
 
-    progress(1, "Обработка завершена.")
-    return final_output_path
-
+    return final_output_path  # Возвращаем путь к выходному файлу
 
-def voice_pipeline_single(uploaded_file, voice_model, *args, **kwargs):
-    progress = kwargs.pop('progress', None)
-    output_dir = create_unique_output_dir(OUTPUT_DIRS["single"], prefix=os.path.splitext(os.path.basename(uploaded_file))[0])
-    return rvc_infer(voice_model, uploaded_file, output_dir, *args, **kwargs, progress=progress)
 
-def voice_pipeline_batch(uploaded_files, voice_model, *args, **kwargs):
-    progress = kwargs.pop('progress', None)
-    output_dir = create_unique_output_dir(OUTPUT_DIRS["batch"], prefix="batch_")
-    completed_files = []
-    num_files = len(uploaded_files)
-
-    for i, input_file in enumerate(uploaded_files):
-        progress(i / num_files, f"Преобразование файла {input_file}...")
-        output_path = rvc_infer(voice_model, input_file, output_dir, *args, **kwargs, progress=None)
-        completed_files.append(f"{os.path.basename(input_file)} - Готово")
-
-    progress(1, "Все файлы обработаны.")
-    return "\n".join(completed_files) + f"\n\nФайлы расположены в {output_dir}"
+# Run rvc_infer on every audio file found directly inside a directory.
+#
+# input_dir is the directory to scan and output_dir receives the results.
+# All other arguments are forwarded unchanged to rvc_infer for each file.
+# The function returns nothing.
+def rvc_infer_batch(
+    voice_model,
+    input_dir,
+    output_dir,
+    index_rate,
+    pitch,
+    f0_method,
+    filter_radius,
+    volume_envelope,
+    protect,
+    hop_length,
+    f0_min,
+    f0_max,
+    output_format,
+):
+    # Collect the entries of input_dir whose names end with a known suffix.
+    # NOTE(review): the suffixes carry no leading dot and the comparison is
+    # case-sensitive, so "song.WAV" is skipped while a name such as
+    # "notes_wav" is accepted - confirm whether that is intended.
+    input_files = [
+        f
+        for f in os.listdir(input_dir)
+        if f.endswith(
+            (
+                "wav",
+                "mp3",
+                "flac",
+                "ogg",
+                "opus",
+                "m4a",
+                "mp4",
+                "aac",
+                "alac",
+                "wma",
+                "aiff",
+                "webm",
+                "ac3",
+            )
+        )
+    ]
 
-def edge_tts_pipeline(text, voice_model, voice, *args, **kwargs):
-    progress = kwargs.pop('progress', None)
-    tts_path = os.path.join(OUTPUT_DIRS["tts"], "TTS_Voice.wav")
+    for input_file in input_files:
+        # Build the path to the input file
+        input_path = os.path.join(input_dir, input_file)
 
-    progress(0.2, "Синтез речи...")
-    asyncio.run(text_to_speech(text, voice, tts_path))
+        print(f"Преобразование {input_file}...")
 
-    output_dir = create_unique_output_dir(OUTPUT_DIRS["tts"], prefix="tts_")
-    return rvc_infer(voice_model, tts_path, output_dir, *args, **kwargs, progress=progress)
+        # Convert the current file; the returned output path is not used
+        rvc_infer(
+            voice_model,
+            input_path,
+            output_dir,
+            index_rate,
+            pitch,
+            f0_method,
+            filter_radius,
+            volume_envelope,
+            protect,
+            hop_length,
+            f0_min,
+            f0_max,
+            output_format,
+        )