diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 99eafa2..a6acd35 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,7 +1,7 @@ # Pull request checklist - [ ] The PR has a proper title. Use [Semantic Commit Messages](https://seesparkbox.com/foundry/semantic_commit_messages). (No more branch-name title please) -- [ ] Make sure you are requesting the right branch. +- [ ] Make sure you are requesting the right branch: `dev`. - [ ] Make sure this is ready to be merged into the relevant branch. Please don't create a PR and let it hang for a few days. - [ ] Ensure all tests are passing. - [ ] Ensure linting is passing. diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 68f74fa..e983560 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,4 +1,4 @@ -name: Build And Push Docker Image +name: Build and Push Docker Image on: workflow_dispatch: diff --git a/.github/workflows/genlocale.yml b/.github/workflows/genlocale.yml index 96a29e8..c56f7d0 100644 --- a/.github/workflows/genlocale.yml +++ b/.github/workflows/genlocale.yml @@ -1,15 +1,14 @@ -name: genlocale +name: Generate and Sync Locale on: push: branches: - main + - dev jobs: genlocale: - name: genlocale runs-on: ubuntu-latest steps: - - name: Check out - uses: actions/checkout@master + - uses: actions/checkout@master - name: Run locale generation run: | @@ -19,15 +18,21 @@ jobs: - name: Commit back if: ${{ !github.head_ref }} + id: commitback continue-on-error: true run: | git config --local user.name 'github-actions[bot]' git config --local user.email 'github-actions[bot]@users.noreply.github.com' git add --all - git commit -m "🎨 同步 locale" + git commit -m "chore(i18n): sync locale on ${{github.ref_name}}" - name: Create Pull Request - if: ${{ !github.head_ref }} + if: steps.commitback.outcome == 'success' continue-on-error: true - uses: peter-evans/create-pull-request@v4 - + uses: 
peter-evans/create-pull-request@v5 + with: + delete-branch: true + body: "Automatically sync i18n translation jsons" + title: "chore(i18n): sync locale on ${{github.ref_name}}" + commit-message: "chore(i18n): sync locale on ${{github.ref_name}}" + branch: genlocale-${{github.ref_name}} diff --git a/.github/workflows/pull_format.yml b/.github/workflows/pull_format.yml index 57a830d..484a214 100644 --- a/.github/workflows/pull_format.yml +++ b/.github/workflows/pull_format.yml @@ -1,24 +1,34 @@ -name: pull format +name: Check Pull Format -on: [pull_request] - -permissions: - contents: write +on: + pull_request_target: + types: [opened, reopened] jobs: - pull_format: - runs-on: ${{ matrix.os }} + # This job closes PRs that do not target the dev branch + close_pr: + # The type of runner that the job will run on + runs-on: ubuntu-latest + permissions: write-all + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + - name: Close PR if it is not pointed to dev branch + if: github.event.pull_request.base.ref != 'dev' + uses: superbrothers/close-pull-request@v3 + with: + # Optional. Post an issue comment just before closing a pull request. + comment: "Invalid PR to `non-dev` branch `${{ github.event.pull_request.base.ref }}`." 
- strategy: - matrix: - python-version: ["3.10"] - os: [ubuntu-latest] - fail-fast: false + pull_format: + runs-on: ubuntu-latest + permissions: + contents: write continue-on-error: true steps: - - name: checkout + - name: Checkout continue-on-error: true uses: actions/checkout@v3 with: diff --git a/.github/workflows/push_format.yml b/.github/workflows/push_format.yml index 0a72789..7278a81 100644 --- a/.github/workflows/push_format.yml +++ b/.github/workflows/push_format.yml @@ -1,23 +1,18 @@ -name: push format +name: Standardize Code Format on: push: branches: - main - -permissions: - contents: write - pull-requests: write + - dev jobs: push_format: - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.10"] - os: [ubuntu-latest] - fail-fast: false + permissions: + contents: write + pull-requests: write steps: - uses: actions/checkout@v3 @@ -43,7 +38,7 @@ jobs: git config --local user.email "github-actions[bot]@users.noreply.github.com" git config --local user.name "github-actions[bot]" git add --all - git commit -m "Format code" + git commit -m "chore(format): run black on ${{github.ref_name}}" - name: Create Pull Request if: steps.commitback.outcome == 'success' @@ -51,6 +46,7 @@ jobs: uses: peter-evans/create-pull-request@v5 with: delete-branch: true - body: Apply Code Formatter Change - title: Apply Code Formatter Change - commit-message: Automatic code format + body: "Automatically apply code formatter change" + title: "chore(format): run black on ${{github.ref_name}}" + commit-message: "chore(format): run black on ${{github.ref_name}}" + branch: formatter-${{github.ref_name}} diff --git a/.github/workflows/sync_dev.yml b/.github/workflows/sync_dev.yml new file mode 100644 index 0000000..4696511 --- /dev/null +++ b/.github/workflows/sync_dev.yml @@ -0,0 +1,23 @@ +name: Merge dev into main + +on: + workflow_dispatch: + +jobs: + sync_dev: + runs-on: ubuntu-latest + + permissions: + contents: write + pull-requests: 
write + + steps: + - uses: actions/checkout@v3 + with: + ref: main + + - name: Create Pull Request + run: | + gh pr create --title "chore(sync): merge dev into main" --body "Merge dev to main" --base main --head dev + env: + GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index 1ab65f6..904195c 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -1,4 +1,4 @@ -name: unitest +name: Unit Test on: [ push, pull_request ] jobs: build: @@ -7,7 +7,7 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10"] os: [ubuntu-latest] - fail-fast: false + fail-fast: true steps: - uses: actions/checkout@master diff --git a/README.md b/README.md index f221877..5385a04 100644 --- a/README.md +++ b/README.md @@ -68,12 +68,16 @@ poetry install 你也可以通过 pip 来安装依赖: ```bash N卡: - -pip install -r requirements.txt + pip install -r requirements.txt A卡/I卡: -pip install -r requirements-dml.txt + pip install -r requirements-dml.txt + +A卡Rocm(Linux): + pip install -r requirements-amd.txt +I卡IPEX(Linux): + pip install -r requirements-ipex.txt ``` ------ @@ -122,11 +126,34 @@ https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/rmvpe.pt ```bash python infer-web.py ``` - 如果你正在使用Windows 或 macOS,你可以直接下载并解压`RVC-beta.7z`,前者可以运行`go-web.bat`以启动WebUI,后者则运行命令`sh ./run.sh`以启动WebUI。 +对于需要使用IPEX技术的I卡用户,请先在终端执行`source /opt/intel/oneapi/setvars.sh`(仅Linux)。 + 仓库内还有一份`小白简易教程.doc`以供参考。 +## AMD显卡Rocm相关(仅Linux) +如果你想基于AMD的Rocm技术在Linux系统上运行RVC,请先在[这里](https://rocm.docs.amd.com/en/latest/deploy/linux/os-native/install.html)安装所需的驱动。 + +若你使用的是Arch Linux,可以使用pacman来安装所需驱动: +```` +pacman -S rocm-hip-sdk rocm-opencl-sdk +```` +对于某些型号的显卡,你可能需要额外配置如下的环境变量(如:RX6700XT): +```` +export ROCM_PATH=/opt/rocm +export HSA_OVERRIDE_GFX_VERSION=10.3.0 +```` +同时确保你的当前用户处于`render`与`video`用户组内: +```` +sudo usermod -aG render $USERNAME +sudo usermod -aG video $USERNAME +```` +之后运行WebUI: +```bash +python infer-web.py +``` + ## 参考项目 + 
[ContentVec](https://github.com/auspicious3000/contentvec/) + [VITS](https://github.com/jaywalnut310/vits) diff --git a/assets/Synthesizer_inputs.pth b/assets/Synthesizer_inputs.pth new file mode 100644 index 0000000..faa509e Binary files /dev/null and b/assets/Synthesizer_inputs.pth differ diff --git a/assets/hubert/.gitignore b/assets/hubert/.gitignore index d6b7ef3..03dfb38 100644 --- a/assets/hubert/.gitignore +++ b/assets/hubert/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!hubert_inputs.pth \ No newline at end of file diff --git a/assets/hubert/hubert_inputs.pth b/assets/hubert/hubert_inputs.pth new file mode 100644 index 0000000..46d2886 Binary files /dev/null and b/assets/hubert/hubert_inputs.pth differ diff --git a/assets/rmvpe/.gitignore b/assets/rmvpe/.gitignore index d6b7ef3..dbb24a6 100644 --- a/assets/rmvpe/.gitignore +++ b/assets/rmvpe/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!rmvpe_inputs.pth \ No newline at end of file diff --git a/assets/rmvpe/rmvpe_inputs.pth b/assets/rmvpe/rmvpe_inputs.pth new file mode 100644 index 0000000..a4cfb86 Binary files /dev/null and b/assets/rmvpe/rmvpe_inputs.pth differ diff --git a/configs/config.json b/configs/config.json index 8e9c176..668299d 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1,15 +1,15 @@ { - "pth_path": "assets/weights/kikiV1.pth", - "index_path": "logs/kikiV1.index", - "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", - "sg_output_device": "VoiceMeeter Aux Input (VB-Audio (MME)", - "threhold": -45.0, - "pitch": 12.0, - "index_rate": 0.0, - "rms_mix_rate": 0.0, - "block_time": 0.25, - "crossfade_length": 0.04, - "extra_time": 2.0, - "n_cpu": 6.0, - "f0method": "rmvpe" + "pth_path": "assets/weights/kikiV1.pth", + "index_path": "logs/kikiV1.index", + "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", + "sg_output_device": "VoiceMeeter Aux Input (VB-Audio (MME)", + "threhold": -45.0, + "pitch": 12.0, + "index_rate": 0.0, + "rms_mix_rate": 0.0, + "block_time": 0.25, + 
"crossfade_length": 0.04, + "extra_time": 2.0, + "n_cpu": 6.0, + "f0method": "rmvpe" } diff --git a/configs/config.py b/configs/config.py index 20bbb36..b40a9af 100644 --- a/configs/config.py +++ b/configs/config.py @@ -43,7 +43,8 @@ def wrapper(*args, **kwargs): class Config: def __init__(self): self.device = "cuda:0" - self.is_half = True + self.is_half = False + self.use_jit = True self.n_cpu = 0 self.gpu_name = None self.json_config = self.load_config_json() diff --git a/gui_v1.py b/gui_v1.py index f86809d..f804148 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -14,6 +14,7 @@ import multiprocessing logger = logging.getLogger(__name__) +stream_latency = -1 class Harvest(multiprocessing.Process): @@ -61,9 +62,11 @@ def run(self): import tools.rvc_for_realtime as rvc_for_realtime from i18n.i18n import I18nAuto + from configs.config import Config i18n = I18nAuto() - device = rvc_for_realtime.config.device + + # device = rvc_for_realtime.config.device # device = torch.device( # "cuda" # if torch.cuda.is_available() @@ -98,9 +101,11 @@ def __init__(self) -> None: class GUI: def __init__(self) -> None: - self.config = GUIConfig() + self.gui_config = GUIConfig() + self.config = Config() self.flag_vc = False - + self.function = "vc" + self.delay_time = 0 self.launcher() def load(self): @@ -112,6 +117,10 @@ def load(self): data["harvest"] = data["f0method"] == "harvest" data["crepe"] = data["f0method"] == "crepe" data["rmvpe"] = data["f0method"] == "rmvpe" + if data["sg_input_device"] not in input_devices: + data["sg_input_device"] = input_devices[sd.default.device[0]] + if data["sg_output_device"] not in output_devices: + data["sg_output_device"] = output_devices[sd.default.device[1]] except: with open("configs/config.json", "w") as j: data = { @@ -127,6 +136,7 @@ def load(self): "crossfade_length": "0.04", "extra_time": "2", "f0method": "rmvpe", + "use_jit": False, } data["pm"] = data["f0method"] == "pm" data["harvest"] = data["f0method"] == "harvest" @@ -136,6 +146,7 @@ 
def load(self): def launcher(self): data = self.load() + self.config.use_jit = data.get("use_jit", self.config.use_jit) sg.theme("LightBlue3") input_devices, output_devices, _, _ = self.get_devices() layout = [ @@ -288,6 +299,17 @@ def launcher(self): enable_events=True, ), ], + # [ + # sg.Text("设备延迟"), + # sg.Slider( + # range=(0, 1), + # key="device_latency", + # resolution=0.001, + # orientation="h", + # default_value=data.get("device_latency", "0.1"), + # enable_events=True, + # ), + # ], [ sg.Text(i18n("harvest进程数")), sg.Slider( @@ -296,7 +318,7 @@ def launcher(self): resolution=1, orientation="h", default_value=data.get( - "n_cpu", min(self.config.n_cpu, n_cpu) + "n_cpu", min(self.gui_config.n_cpu, n_cpu) ), enable_events=True, ), @@ -334,7 +356,14 @@ def launcher(self): key="O_noise_reduce", enable_events=True, ), + sg.Checkbox( + "JIT加速", + default=self.config.use_jit, + key="use_jit", + enable_events=True, + ), ], + [sg.Text("注:首次使用JIT加速时,会出现卡顿,\n 并伴随一些噪音,但这是正常现象!")], ], title=i18n("性能设置"), ), @@ -342,6 +371,22 @@ def launcher(self): [ sg.Button(i18n("开始音频转换"), key="start_vc"), sg.Button(i18n("停止音频转换"), key="stop_vc"), + sg.Radio( + i18n("输入监听"), + "function", + key="im", + default=False, + enable_events=True, + ), + sg.Radio( + i18n("输出变声"), + "function", + key="vc", + default=True, + enable_events=True, + ), + sg.Text(i18n("算法延迟(ms):")), + sg.Text("0", key="delay_time"), sg.Text(i18n("推理时间(ms):")), sg.Text("0", key="infer_time"), ], @@ -360,20 +405,20 @@ def event_handler(self): prev_output = self.window["sg_output_device"].get() input_devices, output_devices, _, _ = self.get_devices(update=True) if prev_input not in input_devices: - self.config.sg_input_device = input_devices[0] + self.gui_config.sg_input_device = input_devices[0] else: - self.config.sg_input_device = prev_input + self.gui_config.sg_input_device = prev_input self.window["sg_input_device"].Update(values=input_devices) self.window["sg_input_device"].Update( - 
value=self.config.sg_input_device + value=self.gui_config.sg_input_device ) if prev_output not in output_devices: - self.config.sg_output_device = output_devices[0] + self.gui_config.sg_output_device = output_devices[0] else: - self.config.sg_output_device = prev_output + self.gui_config.sg_output_device = prev_output self.window["sg_output_device"].Update(values=output_devices) self.window["sg_output_device"].Update( - value=self.config.sg_output_device + value=self.gui_config.sg_output_device ) if event == "start_vc" and self.flag_vc == False: if self.set_values(values) == True: @@ -388,10 +433,12 @@ def event_handler(self): "pitch": values["pitch"], "rms_mix_rate": values["rms_mix_rate"], "index_rate": values["index_rate"], + # "device_latency": values["device_latency"], "block_time": values["block_time"], "crossfade_length": values["crossfade_length"], "extra_time": values["extra_time"], "n_cpu": values["n_cpu"], + "use_jit": values["use_jit"], "f0method": ["pm", "harvest", "crepe", "rmvpe"][ [ values["pm"], @@ -403,31 +450,51 @@ def event_handler(self): } with open("configs/config.json", "w") as j: json.dump(settings, j) + global stream_latency + while stream_latency < 0: + time.sleep(0.01) + self.delay_time = ( + stream_latency + + values["block_time"] + + values["crossfade_length"] + + 0.01 + ) + if values["I_noise_reduce"]: + self.delay_time += values["crossfade_length"] + self.window["delay_time"].update(int(self.delay_time * 1000)) if event == "stop_vc" and self.flag_vc == True: self.flag_vc = False - + stream_latency = -1 # Parameter hot update if event == "threhold": - self.config.threhold = values["threhold"] + self.gui_config.threhold = values["threhold"] elif event == "pitch": - self.config.pitch = values["pitch"] + self.gui_config.pitch = values["pitch"] if hasattr(self, "rvc"): self.rvc.change_key(values["pitch"]) elif event == "index_rate": - self.config.index_rate = values["index_rate"] + self.gui_config.index_rate = values["index_rate"] if 
hasattr(self, "rvc"): self.rvc.change_index_rate(values["index_rate"]) elif event == "rms_mix_rate": - self.config.rms_mix_rate = values["rms_mix_rate"] + self.gui_config.rms_mix_rate = values["rms_mix_rate"] elif event in ["pm", "harvest", "crepe", "rmvpe"]: - self.config.f0method = event + self.gui_config.f0method = event elif event == "I_noise_reduce": - self.config.I_noise_reduce = values["I_noise_reduce"] + self.gui_config.I_noise_reduce = values["I_noise_reduce"] + if stream_latency > 0: + self.delay_time += ( + 1 if values["I_noise_reduce"] else -1 + ) * values["crossfade_length"] + self.window["delay_time"].update(int(self.delay_time * 1000)) elif event == "O_noise_reduce": - self.config.O_noise_reduce = values["O_noise_reduce"] + self.gui_config.O_noise_reduce = values["O_noise_reduce"] + elif event in ["vc", "im"]: + self.function = event elif event != "start_vc" and self.flag_vc == True: # Other parameters do not support hot update self.flag_vc = False + stream_latency = -1 def set_values(self, values): if len(values["pth_path"].strip()) == 0: @@ -444,19 +511,21 @@ def set_values(self, values): sg.popup(i18n("index文件路径不可包含中文")) return False self.set_devices(values["sg_input_device"], values["sg_output_device"]) - self.config.pth_path = values["pth_path"] - self.config.index_path = values["index_path"] - self.config.threhold = values["threhold"] - self.config.pitch = values["pitch"] - self.config.block_time = values["block_time"] - self.config.crossfade_time = values["crossfade_length"] - self.config.extra_time = values["extra_time"] - self.config.I_noise_reduce = values["I_noise_reduce"] - self.config.O_noise_reduce = values["O_noise_reduce"] - self.config.rms_mix_rate = values["rms_mix_rate"] - self.config.index_rate = values["index_rate"] - self.config.n_cpu = values["n_cpu"] - self.config.f0method = ["pm", "harvest", "crepe", "rmvpe"][ + self.config.use_jit = values["use_jit"] + # self.device_latency = values["device_latency"] + 
self.gui_config.pth_path = values["pth_path"] + self.gui_config.index_path = values["index_path"] + self.gui_config.threhold = values["threhold"] + self.gui_config.pitch = values["pitch"] + self.gui_config.block_time = values["block_time"] + self.gui_config.crossfade_time = values["crossfade_length"] + self.gui_config.extra_time = values["extra_time"] + self.gui_config.I_noise_reduce = values["I_noise_reduce"] + self.gui_config.O_noise_reduce = values["O_noise_reduce"] + self.gui_config.rms_mix_rate = values["rms_mix_rate"] + self.gui_config.index_rate = values["index_rate"] + self.gui_config.n_cpu = values["n_cpu"] + self.gui_config.f0method = ["pm", "harvest", "crepe", "rmvpe"][ [ values["pm"], values["harvest"], @@ -470,34 +539,48 @@ def start_vc(self): torch.cuda.empty_cache() self.flag_vc = True self.rvc = rvc_for_realtime.RVC( - self.config.pitch, - self.config.pth_path, - self.config.index_path, - self.config.index_rate, - self.config.n_cpu, + self.gui_config.pitch, + self.gui_config.pth_path, + self.gui_config.index_path, + self.gui_config.index_rate, + self.gui_config.n_cpu, inp_q, opt_q, - device, + self.config, self.rvc if hasattr(self, "rvc") else None, ) - self.config.samplerate = self.rvc.tgt_sr + self.gui_config.samplerate = self.rvc.tgt_sr self.zc = self.rvc.tgt_sr // 100 self.block_frame = ( - int(np.round(self.config.block_time * self.config.samplerate / self.zc)) + int( + np.round( + self.gui_config.block_time + * self.gui_config.samplerate + / self.zc + ) + ) * self.zc ) self.block_frame_16k = 160 * self.block_frame // self.zc self.crossfade_frame = ( int( np.round( - self.config.crossfade_time * self.config.samplerate / self.zc + self.gui_config.crossfade_time + * self.gui_config.samplerate + / self.zc ) ) * self.zc ) self.sola_search_frame = self.zc self.extra_frame = ( - int(np.round(self.config.extra_time * self.config.samplerate / self.zc)) + int( + np.round( + self.gui_config.extra_time + * self.gui_config.samplerate + / self.zc + ) + ) * 
self.zc ) self.input_wav: torch.Tensor = torch.zeros( @@ -505,12 +588,12 @@ def start_vc(self): + self.crossfade_frame + self.sola_search_frame + self.block_frame, - device=device, + device=self.config.device, dtype=torch.float32, ) self.input_wav_res: torch.Tensor = torch.zeros( 160 * self.input_wav.shape[0] // self.zc, - device=device, + device=self.config.device, dtype=torch.float32, ) self.pitch: np.ndarray = np.zeros( @@ -522,12 +605,12 @@ def start_vc(self): dtype="float64", ) self.sola_buffer: torch.Tensor = torch.zeros( - self.crossfade_frame, device=device, dtype=torch.float32 + self.crossfade_frame, device=self.config.device, dtype=torch.float32 ) self.nr_buffer: torch.Tensor = self.sola_buffer.clone() self.output_buffer: torch.Tensor = self.input_wav.clone() self.res_buffer: torch.Tensor = torch.zeros( - 2 * self.zc, device=device, dtype=torch.float32 + 2 * self.zc, device=self.config.device, dtype=torch.float32 ) self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] self.fade_in_window: torch.Tensor = ( @@ -538,7 +621,7 @@ def start_vc(self): 0.0, 1.0, steps=self.crossfade_frame, - device=device, + device=self.config.device, dtype=torch.float32, ) ) @@ -546,11 +629,13 @@ def start_vc(self): ) self.fade_out_window: torch.Tensor = 1 - self.fade_in_window self.resampler = tat.Resample( - orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32 - ).to(device) + orig_freq=self.gui_config.samplerate, + new_freq=16000, + dtype=torch.float32, + ).to(self.config.device) self.tg = TorchGate( - sr=self.config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 - ).to(device) + sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 + ).to(self.config.device) thread_vc = threading.Thread(target=self.soundinput) thread_vc.start() @@ -563,11 +648,13 @@ def soundinput(self): channels=channels, callback=self.audio_callback, blocksize=self.block_frame, - samplerate=self.config.samplerate, + samplerate=self.gui_config.samplerate, 
dtype="float32", - ): + ) as stream: + global stream_latency + stream_latency = stream.latency[-1] while self.flag_vc: - time.sleep(self.config.block_time) + time.sleep(self.gui_config.block_time) logger.debug("Audio block passed.") logger.debug("ENDing VC") @@ -579,12 +666,12 @@ def audio_callback( """ start_time = time.perf_counter() indata = librosa.to_mono(indata.T) - if self.config.threhold > -60: + if self.gui_config.threhold > -60: rms = librosa.feature.rms( y=indata, frame_length=4 * self.zc, hop_length=self.zc ) db_threhold = ( - librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold + librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold ) for i in range(db_threhold.shape[0]): if db_threhold[i]: @@ -592,12 +679,14 @@ def audio_callback( self.input_wav[: -self.block_frame] = self.input_wav[ self.block_frame : ].clone() - self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to(device) + self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to( + self.config.device + ) self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[ self.block_frame_16k : ].clone() # input noise reduction and resampling - if self.config.I_noise_reduce: + if self.gui_config.I_noise_reduce and self.function == "vc": input_wav = self.input_wav[ -self.crossfade_frame - self.block_frame - 2 * self.zc : ] @@ -621,23 +710,32 @@ def audio_callback( self.input_wav[-self.block_frame - 2 * self.zc :] )[160:] # infer - f0_extractor_frame = self.block_frame_16k + 800 - if self.config.f0method == "rmvpe": - f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 - infer_wav = self.rvc.infer( - self.input_wav_res, - self.input_wav_res[-f0_extractor_frame:].cpu().numpy(), - self.block_frame_16k, - self.valid_rate, - self.pitch, - self.pitchf, - self.config.f0method, - ) - infer_wav = infer_wav[ - -self.crossfade_frame - self.sola_search_frame - self.block_frame : - ] + if self.function == "vc": + f0_extractor_frame = 
self.block_frame_16k + 800 + if self.gui_config.f0method == "rmvpe": + f0_extractor_frame = ( + 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 + ) + infer_wav = self.rvc.infer( + self.input_wav_res, + self.input_wav_res[-f0_extractor_frame:].cpu().numpy(), + self.block_frame_16k, + self.valid_rate, + self.pitch, + self.pitchf, + self.gui_config.f0method, + ) + infer_wav = infer_wav[ + -self.crossfade_frame - self.sola_search_frame - self.block_frame : + ] + else: + infer_wav = self.input_wav[ + -self.crossfade_frame - self.sola_search_frame - self.block_frame : + ].clone() # output noise reduction - if self.config.O_noise_reduce: + if (self.gui_config.O_noise_reduce and self.function == "vc") or ( + self.gui_config.I_noise_reduce and self.function == "im" + ): self.output_buffer[: -self.block_frame] = self.output_buffer[ self.block_frame : ].clone() @@ -646,7 +744,7 @@ def audio_callback( infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0) ).squeeze(0) # volume envelop mixing - if self.config.rms_mix_rate < 1: + if self.gui_config.rms_mix_rate < 1 and self.function == "vc": rms1 = librosa.feature.rms( y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] .cpu() @@ -654,7 +752,7 @@ def audio_callback( frame_length=640, hop_length=160, ) - rms1 = torch.from_numpy(rms1).to(device) + rms1 = torch.from_numpy(rms1).to(self.config.device) rms1 = F.interpolate( rms1.unsqueeze(0), size=infer_wav.shape[0] + 1, @@ -666,7 +764,7 @@ def audio_callback( frame_length=4 * self.zc, hop_length=self.zc, ) - rms2 = torch.from_numpy(rms2).to(device) + rms2 = torch.from_numpy(rms2).to(self.config.device) rms2 = F.interpolate( rms2.unsqueeze(0), size=infer_wav.shape[0] + 1, @@ -675,7 +773,7 @@ def audio_callback( )[0, 0, :-1] rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3) infer_wav *= torch.pow( - rms1 / rms2, torch.tensor(1 - self.config.rms_mix_rate) + rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate) ) # SOLA algorithm from 
https://github.com/yxlllc/DDSP-SVC conv_input = infer_wav[ @@ -685,7 +783,7 @@ def audio_callback( cor_den = torch.sqrt( F.conv1d( conv_input**2, - torch.ones(1, 1, self.crossfade_frame, device=device), + torch.ones(1, 1, self.crossfade_frame, device=self.config.device), ) + 1e-8 ) diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json index 0fb158b..884674d 100644 --- a/i18n/locale/en_US.json +++ b/i18n/locale/en_US.json @@ -38,6 +38,7 @@ "加载模型": "Load model", "加载预训练底模D路径": "Load pre-trained base model D path:", "加载预训练底模G路径": "Load pre-trained base model G path:", + "单人": "Single", "卸载音色省显存": "Unload voice to save GPU memory:", "变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):", "后处理重采样至最终采样率,0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling:", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "Unfortunately, there is no compatible GPU available to support your training.", "性能设置": "Performance settings", "总训练轮数total_epoch": "Total training epochs (total_epoch):", + "批次": "Batch", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion. Enter the folder containing the audio files to be converted or upload multiple audio files. The converted audio will be output in the specified folder (default: 'opt').", "指定输出主人声文件夹": "Specify the output folder for vocals:", "指定输出文件夹": "Specify output folder:", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "Path to the feature index file. Leave blank to use the selected result from the dropdown:", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recommended +12 key for male to female conversion, and -12 key for female to male conversion. 
If the sound range goes too far and the voice is distorted, you can also adjust it to the appropriate range by yourself.", "目标采样率": "Target sample rate:", + "算法延迟(ms):": "Algorithmic delay (ms):", "自动检测index路径,下拉式选择(dropdown)": "Auto-detect index path and select from the dropdown:", "融合": "Fusion", "要改的模型信息": "Model information to be modified:", @@ -95,8 +98,8 @@ "训练特征索引": "Train feature index", "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Training complete. You can check the training logs in the console or the 'train.log' file under the experiment folder.", "请指定说话人id": "Please specify the speaker/singer ID:", - "请选择index文件": "请选择index文件", - "请选择pth文件": "请选择pth文件", + "请选择index文件": "Please choose the .index file", + "请选择pth文件": "Please choose the .pth file", "请选择说话人id": "Select Speaker/Singer ID:", "转换": "Convert", "输入实验名": "Enter the experiment name:", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Enter the path of the audio folder to be processed (copy it from the address bar of the file manager):", "输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (default is the correct format example):", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Adjust the volume envelope scaling. Closer to 0, the more it mimicks the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. 
Closer to 1 will be more of a consistently loud volume:", + "输入监听": "Input voice monitor", "输入训练文件夹路径": "Enter the path of the training folder:", "输入设备": "Input device", "输入降噪": "Input noise reduction", "输出信息": "Output information", + "输出变声": "Output converted voice", "输出设备": "Output device", "输出降噪": "Output noise reduction", "输出音频(右下角三个点,点了可以下载)": "Export audio (click on the three dots in the lower right corner to download)", diff --git a/i18n/locale/es_ES.json b/i18n/locale/es_ES.json index 74c30e5..fcd5f83 100644 --- a/i18n/locale/es_ES.json +++ b/i18n/locale/es_ES.json @@ -38,6 +38,7 @@ "加载模型": "Cargar modelo", "加载预训练底模D路径": "Cargue la ruta del modelo D base pre-entrenada.", "加载预训练底模G路径": "Cargue la ruta del modelo G base pre-entrenada.", + "单人": "Individual", "卸载音色省显存": "Descargue la voz para ahorrar memoria GPU", "变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)", "后处理重采样至最终采样率,0为不进行重采样": "Remuestreo posterior al proceso a la tasa de muestreo final, 0 significa no remuestrear", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "Lamentablemente, no tiene una tarjeta gráfica adecuada para soportar su entrenamiento", "性能设置": "Configuración de rendimiento", "总训练轮数total_epoch": "Total de épocas de entrenamiento (total_epoch)", + "批次": "Lote", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversión por lotes, ingrese la carpeta que contiene los archivos de audio para convertir o cargue varios archivos de audio. El audio convertido se emitirá en la carpeta especificada (opción predeterminada).", "指定输出主人声文件夹": "Especifique la carpeta de salida para la voz principal", "指定输出文件夹": "Especificar carpeta de salida", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "Ruta del archivo de la biblioteca de características, si está vacío, se utilizará el resultado de la selección desplegable", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "Tecla +12 recomendada para conversión de voz de hombre a mujer, tecla -12 para conversión de voz de mujer a hombre. Si el rango de tono es demasiado amplio y causa distorsión, ajústelo usted mismo a un rango adecuado.", "目标采样率": "Tasa de muestreo objetivo", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "Detección automática de la ruta del índice, selección desplegable (dropdown)", "融合": "Fusión", "要改的模型信息": "Información del modelo a modificar", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Ingrese la ruta a la carpeta de audio que se procesará (simplemente cópiela desde la barra de direcciones del administrador de archivos)", "输入待处理音频文件路径(默认是正确格式示例)": "Ingrese la ruta del archivo del audio que se procesará (el formato predeterminado es el ejemplo correcto)", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Proporción de fusión para reemplazar el sobre de volumen de entrada con el sobre de volumen de salida, cuanto más cerca de 1, más se utiliza el sobre de salida", + "输入监听": "输入监听", "输入训练文件夹路径": "Introduzca la ruta de la carpeta de entrenamiento", "输入设备": "Dispositivo de entrada", "输入降噪": "Reducción de ruido de entrada", "输出信息": "Información de salida", + "输出变声": "输出变声", "输出设备": "Dispositivo de salida", "输出降噪": "Reducción de ruido de salida", "输出音频(右下角三个点,点了可以下载)": "Salida de audio (haga clic en los tres puntos en la esquina inferior derecha para descargar)", diff --git a/i18n/locale/fr_FR.json b/i18n/locale/fr_FR.json index 5991dab..d2fa618 100644 --- a/i18n/locale/fr_FR.json +++ b/i18n/locale/fr_FR.json @@ -1,125 +1,130 @@ -{ - ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Si >=3 : appliquer un filtrage médian aux résultats de la reconnaissance de la hauteur de récolte. La valeur représente le rayon du filtre et peut réduire la respiration.", - "A模型权重": "Poids (w) pour le modèle A :", - "A模型路径": "Chemin d'accès au modèle A :", - "B模型路径": "Chemin d'accès au modèle B :", - "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Fichier de courbe F0 (facultatif). 
Une hauteur par ligne. Remplace la fréquence fondamentale par défaut et la modulation de la hauteur :", - "Index Rate": "Taux d'indexation", - "Onnx导出": "Exporter en ONNX", - "Onnx输出路径": "Chemin d'exportation ONNX :", - "RVC模型路径": "Chemin du modèle RVC :", - "ckpt处理": "Traitement des fichiers .ckpt", - "harvest进程数": "Nombre de processus CPU utilisés pour l'algorithme de reconnaissance de la hauteur (pitch) dans le cadre de la récolte (harvest).", - "index文件路径不可包含中文": "Le chemin du fichier d'index ne doit pas contenir de caractères chinois.", - "pth文件路径不可包含中文": "Le chemin du fichier .pth ne doit pas contenir de caractères chinois.", - "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Configuration des numéros de carte RMVPE : séparez les index GPU par des tirets \"-\", par exemple, 0-0-1 pour utiliser 2 processus sur GPU0 et 1 processus sur GPU1.", - "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Étape 1 : Remplissez la configuration expérimentale. Les données expérimentales sont stockées dans le dossier 'logs', avec chaque expérience ayant un dossier distinct. Entrez manuellement le chemin du nom de l'expérience, qui contient la configuration expérimentale, les journaux et les fichiers de modèle entraînés.", - "step1:正在处理数据": "Étape 1 : Traitement des données en cours.", - "step2:正在提取音高&正在提取特征": "Étape 2 : Extraction de la hauteur et extraction des caractéristiques en cours.", - "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Étape 2a : Parcours automatique de tous les fichiers du dossier d'entraînement qui peuvent être décodés en fichiers audio et réalisation d'une normalisation par tranches. Génère 2 dossiers wav dans le répertoire de l'expérience. 
Actuellement, seule la formation avec un seul chanteur/locuteur est prise en charge.", - "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Étape 2b : Utilisez le CPU pour extraire la hauteur (si le modèle le permet), utilisez le GPU pour extraire les caractéristiques (sélectionnez l'index du GPU) :", - "step3: 填写训练设置, 开始训练模型和索引": "Étape 3 : Remplissez les paramètres d'entraînement et démarrez l'entraînement du modèle ainsi que l'indexation.", - "step3a:正在训练模型": "Étape 3a : L'entraînement du modèle a commencé.", - "一键训练": "Entraînement en un clic", - "也可批量输入音频文件, 二选一, 优先读文件夹": "Il est également possible d'importer plusieurs fichiers audio. Si un chemin de dossier existe, cette entrée est ignorée.", - "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Traitement en lot pour la séparation de la voix et de l'accompagnement vocal à l'aide du modèle UVR5.
Exemple d'un format de chemin de dossier valide : D:\\chemin\\vers\\dossier\\d'entrée (copiez-le depuis la barre d'adresse du gestionnaire de fichiers).
Le modèle est divisé en trois catégories :
1. Préserver la voix : Choisissez cette option pour l'audio sans harmonies. Elle préserve la voix mieux que HP5. Il comprend deux modèles intégrés : HP2 et HP3. HP3 peut légèrement laisser passer l'accompagnement mais préserve légèrement mieux la voix que HP2.
2. Préserver uniquement la voix principale : Choisissez cette option pour l'audio avec harmonies. Cela peut affaiblir la voix principale. Il comprend un modèle intégré : HP5.
3. Modèles de suppression de la réverbération et du délai (par FoxJoy) :
  (1) MDX-Net : Le meilleur choix pour la suppression de la réverbération stéréo, mais ne peut pas supprimer la réverbération mono.
  (234) DeEcho : Supprime les effets de délai. Le mode Aggressive supprime plus efficacement que le mode Normal. DeReverb supprime également la réverbération et peut supprimer la réverbération mono, mais pas très efficacement pour les contenus à haute fréquence fortement réverbérés.
Notes sur la suppression de la réverbération et du délai :
1. Le temps de traitement pour le modèle DeEcho-DeReverb est environ deux fois plus long que pour les autres deux modèles DeEcho.
2. Le modèle MDX-Net-Dereverb est assez lent.
3. La configuration la plus propre recommandée est d'appliquer d'abord MDX-Net, puis DeEcho-Aggressive.", - "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Entrez le(s) index GPU séparé(s) par '-', par exemple, 0-1-2 pour utiliser les GPU 0, 1 et 2 :", - "伴奏人声分离&去混响&去回声": "Séparation des voix/accompagnement et suppression de la réverbération", - "保存名": "Nom de sauvegarde :", - "保存的文件名, 默认空为和源文件同名": "Nom du fichier de sauvegarde (par défaut : identique au nom du fichier source) :", - "保存的模型名不带后缀": "Nom du modèle enregistré (sans extension) :", - "保存频率save_every_epoch": "Fréquence de sauvegarde (save_every_epoch) :", - "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Protéger les consonnes sourdes et les bruits de respiration pour éviter les artefacts tels que le déchirement dans la musique électronique. Réglez à 0,5 pour désactiver. Diminuez la valeur pour renforcer la protection, mais cela peut réduire la précision de l'indexation :", - "修改": "Modifier", - "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifier les informations du modèle (uniquement pris en charge pour les petits fichiers de modèle extraits du dossier 'weights')", - "停止音频转换": "Arrêter la conversion audio", - "全流程结束!": "Toutes les étapes ont été terminées !", - "刷新音色列表和索引路径": "Actualiser la liste des voix et le vers l'index.", - "加载模型": "Charger le modèle.", - "加载预训练底模D路径": "Charger le chemin du modèle de base pré-entraîné D :", - "加载预训练底模G路径": "Charger le chemin du modèle de base pré-entraîné G :", - "卸载音色省显存": "Décharger la voix pour économiser la mémoire GPU.", - "变调(整数, 半音数量, 升八度12降八度-12)": "Transposer (entier, nombre de demi-tons, monter d'une octave : 12, descendre d'une octave : -12) :", - "后处理重采样至最终采样率,0为不进行重采样": "Rééchantillonner l'audio de sortie en post-traitement à la fréquence d'échantillonnage finale. 
Réglez sur 0 pour ne pas effectuer de rééchantillonnage :", - "否": "Non", - "响应阈值": "Seuil de réponse", - "响度因子": "Facteur de volume sonore", - "处理数据": "Traitement des données", - "导出Onnx模型": "Exporter le modèle au format ONNX.", - "导出文件格式": "Format de fichier d'exportation", - "常见问题解答": "FAQ (Foire Aux Questions)", - "常规设置": "Paramètres généraux", - "开始音频转换": "Démarrer la conversion audio.", - "很遗憾您这没有能用的显卡来支持您训练": "Malheureusement, il n'y a pas de GPU compatible disponible pour prendre en charge votre entrainement.", - "性能设置": "Paramètres de performance", - "总训练轮数total_epoch": "Nombre total d'époques d'entraînement (total_epoch) :", - "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversion en lot. Entrez le dossier contenant les fichiers audio à convertir ou téléchargez plusieurs fichiers audio. Les fichiers audio convertis seront enregistrés dans le dossier spécifié (par défaut : 'opt').", - "指定输出主人声文件夹": "Spécifiez le dossier de sortie pour les fichiers de voix :", - "指定输出文件夹": "Spécifiez le dossier de sortie :", - "指定输出非主人声文件夹": "Spécifiez le dossier de sortie pour l'accompagnement :", - "推理时间(ms):": "Temps d'inférence (ms) :", - "推理音色": "Voix pour l'inférence", - "提取": "Extraire", - "提取音高和处理数据使用的CPU进程数": "Nombre de processus CPU utilisés pour l'extraction de la hauteur et le traitement des données :", - "是": "Oui", - "是否仅保存最新的ckpt文件以节省硬盘空间": "Enregistrer uniquement le dernier fichier '.ckpt' pour économiser de l'espace disque :", - "是否在每次保存时间点将最终小模型保存至weights文件夹": "Enregistrer un petit modèle final dans le dossier 'weights' à chaque point de sauvegarde :", - "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Mettre en cache tous les ensembles d'entrainement dans la mémoire GPU. 
Mettre en cache de petits ensembles de données (moins de 10 minutes) peut accélérer l'entrainement, mais mettre en cache de grands ensembles de données consommera beaucoup de mémoire GPU et peut ne pas apporter beaucoup d'amélioration de vitesse :", - "显卡信息": "Informations sur la carte graphique (GPU)", - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Ce logiciel est open source sous la licence MIT. L'auteur n'a aucun contrôle sur le logiciel. Les utilisateurs qui utilisent le logiciel et distribuent les sons exportés par le logiciel en sont entièrement responsables.
Si vous n'acceptez pas cette clause, vous ne pouvez pas utiliser ou faire référence à aucun code ni fichier contenu dans le package logiciel. Consultez le fichier Agreement-LICENSE.txt dans le répertoire racine pour plus de détails.", - "查看": "Voir", - "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Afficher les informations sur le modèle (uniquement pour les petits fichiers de modèle extraits du dossier \"weights\")", - "检索特征占比": "Rapport de recherche de caractéristiques (contrôle l'intensité de l'accent, un rapport trop élevé provoque des artefacts) :", - "模型": "Modèle", - "模型推理": "Inférence du modèle", - "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Extraction du modèle (saisissez le chemin d'accès au modèle du grand fichier dans le dossier \"logs\"). Cette fonction est utile si vous souhaitez arrêter l'entrainement à mi-chemin et extraire et enregistrer manuellement un petit fichier de modèle, ou si vous souhaitez tester un modèle intermédiaire :", - "模型是否带音高指导": "Indique si le modèle dispose d'un guidage en hauteur :", - "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Indique si le modèle dispose d'un système de guidage de la hauteur (obligatoire pour le chant, facultatif pour la parole) :", - "模型是否带音高指导,1是0否": "Le modèle dispose-t-il d'un guide de hauteur (1 : oui, 0 : non) ?", - "模型版本型号": "Version de l'architecture du modèle :", - "模型融合, 可用于测试音色融合": "Fusion de modèles, peut être utilisée pour tester la fusion de timbres", - "模型路径": "Le chemin vers le modèle :", - "每张显卡的batch_size": "Taille du batch par GPU :", - "淡入淡出长度": "Longueur de la transition", - "版本": "Version", - "特征提取": "Extraction des caractéristiques", - "特征检索库文件路径,为空则使用下拉的选择结果": "Chemin d'accès au fichier d'index des caractéristiques. Laisser vide pour utiliser le résultat sélectionné dans la liste déroulante :", - "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Il est recommandé d'utiliser la clé +12 pour la conversion homme-femme et la clé -12 pour la conversion femme-homme. 
Si la plage sonore est trop large et que la voix est déformée, vous pouvez également l'ajuster vous-même à la plage appropriée.", - "目标采样率": "Taux d'échantillonnage cible :", - "自动检测index路径,下拉式选择(dropdown)": "Détecter automatiquement le chemin d'accès à l'index et le sélectionner dans la liste déroulante :", - "融合": "Fusion", - "要改的模型信息": "Informations sur le modèle à modifier :", - "要置入的模型信息": "Informations sur le modèle à placer :", - "训练": "Entraîner", - "训练模型": "Entraîner le modèle", - "训练特征索引": "Entraîner l'index des caractéristiques", - "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Entraînement terminé. Vous pouvez consulter les rapports d'entraînement dans la console ou dans le fichier 'train.log' situé dans le dossier de l'expérience.", - "请指定说话人id": "Veuillez spécifier l'ID de l'orateur ou du chanteur :", - "请选择index文件": "Veuillez sélectionner le fichier d'index", - "请选择pth文件": "Veuillez sélectionner le fichier pth", - "请选择说话人id": "Sélectionner l'ID de l'orateur ou du chanteur :", - "转换": "Convertir", - "输入实验名": "Saisissez le nom de l'expérience :", - "输入待处理音频文件夹路径": "Entrez le chemin du dossier audio à traiter :", - "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Entrez le chemin du dossier audio à traiter (copiez-le depuis la barre d'adresse du gestionnaire de fichiers) :", - "输入待处理音频文件路径(默认是正确格式示例)": "Entrez le chemin d'accès du fichier audio à traiter (par défaut, l'exemple de format correct) :", - "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Ajustez l'échelle de l'enveloppe de volume. Plus il est proche de 0, plus il imite le volume des voix originales. Cela peut aider à masquer les bruits et à rendre le volume plus naturel lorsqu'il est réglé relativement bas. 
Plus le volume est proche de 1, plus le volume sera fort et constant :", - "输入训练文件夹路径": "Indiquez le chemin d'accès au dossier d'entraînement :", - "输入设备": "Dispositif d'entrée", - "输入降噪": "Réduction du bruit d'entrée", - "输出信息": "Informations sur la sortie", - "输出设备": "Dispositif de sortie", - "输出降噪": "Réduction du bruit de sortie", - "输出音频(右下角三个点,点了可以下载)": "Exporter l'audio (cliquer sur les trois points dans le coin inférieur droit pour télécharger)", - "选择.index文件": "Sélectionner le fichier .index", - "选择.pth文件": "Sélectionner le fichier .pth", - "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Sélection de l'algorithme d'extraction de la hauteur, les voix d'entrée peuvent être accélérées avec pm, harvest a de bonnes basses mais est très lent, crepe est bon mais consomme beaucoup de ressources GPU.", - "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Sélectionnez l'algorithme d'extraction de la hauteur de ton (\"pm\" : extraction plus rapide mais parole de moindre qualité ; \"harvest\" : meilleure basse mais extrêmement lente ; \"crepe\" : meilleure qualité mais utilisation intensive du GPU), \"rmvpe\" : meilleure qualité et peu d'utilisation du GPU.", - "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Sélection de l'algorithme d'extraction de la hauteur : la chanson d'entrée peut être traitée plus rapidement par pm, avec une voix de haute qualité mais un CPU médiocre, par dio, harvest est meilleur mais plus lent, rmvpe est le meilleur, mais consomme légèrement le CPU/GPU.", - "采样长度": "Longueur de l'échantillon", - "重载设备列表": "Recharger la liste des dispositifs", - "音调设置": "Réglages de la hauteur", - "音频设备(请使用同种类驱动)": "Périphérique audio (veuillez utiliser le même type de pilote)", - "音高算法": "algorithme de détection de la hauteur", - "额外推理时长": "Temps d'inférence supplémentaire" -} \ No newline at end of file +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Si >=3 : appliquer un filtrage médian aux 
résultats de la reconnaissance de la hauteur de récolte. La valeur représente le rayon du filtre et peut réduire la respiration.", + "A模型权重": "Poids (w) pour le modèle A :", + "A模型路径": "Chemin d'accès au modèle A :", + "B模型路径": "Chemin d'accès au modèle B :", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Fichier de courbe F0 (facultatif). Une hauteur par ligne. Remplace la fréquence fondamentale par défaut et la modulation de la hauteur :", + "Index Rate": "Taux d'indexation", + "Onnx导出": "Exporter en ONNX", + "Onnx输出路径": "Chemin d'exportation ONNX :", + "RVC模型路径": "Chemin du modèle RVC :", + "ckpt处理": "Traitement des fichiers .ckpt", + "harvest进程数": "Nombre de processus CPU utilisés pour l'algorithme de reconnaissance de la hauteur (pitch) dans le cadre de la récolte (harvest).", + "index文件路径不可包含中文": "Le chemin du fichier d'index ne doit pas contenir de caractères chinois.", + "pth文件路径不可包含中文": "Le chemin du fichier .pth ne doit pas contenir de caractères chinois.", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Configuration des numéros de carte RMVPE : séparez les index GPU par des tirets \"-\", par exemple, 0-0-1 pour utiliser 2 processus sur GPU0 et 1 processus sur GPU1.", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Étape 1 : Remplissez la configuration expérimentale. Les données expérimentales sont stockées dans le dossier 'logs', avec chaque expérience ayant un dossier distinct. Entrez manuellement le chemin du nom de l'expérience, qui contient la configuration expérimentale, les journaux et les fichiers de modèle entraînés.", + "step1:正在处理数据": "Étape 1 : Traitement des données en cours.", + "step2:正在提取音高&正在提取特征": "Étape 2 : Extraction de la hauteur et extraction des caractéristiques en cours.", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. 
": "Étape 2a : Parcours automatique de tous les fichiers du dossier d'entraînement qui peuvent être décodés en fichiers audio et réalisation d'une normalisation par tranches. Génère 2 dossiers wav dans le répertoire de l'expérience. Actuellement, seule la formation avec un seul chanteur/locuteur est prise en charge.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Étape 2b : Utilisez le CPU pour extraire la hauteur (si le modèle le permet), utilisez le GPU pour extraire les caractéristiques (sélectionnez l'index du GPU) :", + "step3: 填写训练设置, 开始训练模型和索引": "Étape 3 : Remplissez les paramètres d'entraînement et démarrez l'entraînement du modèle ainsi que l'indexation.", + "step3a:正在训练模型": "Étape 3a : L'entraînement du modèle a commencé.", + "一键训练": "Entraînement en un clic", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Il est également possible d'importer plusieurs fichiers audio. Si un chemin de dossier existe, cette entrée est ignorée.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Traitement en lot pour la séparation de la voix et de l'accompagnement vocal à l'aide du modèle UVR5.
Exemple d'un format de chemin de dossier valide : D:\\chemin\\vers\\dossier\\d'entrée (copiez-le depuis la barre d'adresse du gestionnaire de fichiers).
Le modèle est divisé en trois catégories :
1. Préserver la voix : Choisissez cette option pour l'audio sans harmonies. Elle préserve la voix mieux que HP5. Il comprend deux modèles intégrés : HP2 et HP3. HP3 peut légèrement laisser passer l'accompagnement mais préserve légèrement mieux la voix que HP2.
2. Préserver uniquement la voix principale : Choisissez cette option pour l'audio avec harmonies. Cela peut affaiblir la voix principale. Il comprend un modèle intégré : HP5.
3. Modèles de suppression de la réverbération et du délai (par FoxJoy) :
  (1) MDX-Net : Le meilleur choix pour la suppression de la réverbération stéréo, mais ne peut pas supprimer la réverbération mono.
  (234) DeEcho : Supprime les effets de délai. Le mode Aggressive supprime plus efficacement que le mode Normal. DeReverb supprime également la réverbération et peut supprimer la réverbération mono, mais pas très efficacement pour les contenus à haute fréquence fortement réverbérés.
Notes sur la suppression de la réverbération et du délai :
1. Le temps de traitement pour le modèle DeEcho-DeReverb est environ deux fois plus long que pour les deux autres modèles DeEcho.
2. Le modèle MDX-Net-Dereverb est assez lent.
3. La configuration la plus propre recommandée est d'appliquer d'abord MDX-Net, puis DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Entrez le(s) index GPU séparé(s) par '-', par exemple, 0-1-2 pour utiliser les GPU 0, 1 et 2 :", + "伴奏人声分离&去混响&去回声": "Séparation des voix/accompagnement et suppression de la réverbération", + "保存名": "Nom de sauvegarde :", + "保存的文件名, 默认空为和源文件同名": "Nom du fichier de sauvegarde (par défaut : identique au nom du fichier source) :", + "保存的模型名不带后缀": "Nom du modèle enregistré (sans extension) :", + "保存频率save_every_epoch": "Fréquence de sauvegarde (save_every_epoch) :", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Protéger les consonnes sourdes et les bruits de respiration pour éviter les artefacts tels que le déchirement dans la musique électronique. Réglez à 0,5 pour désactiver. Diminuez la valeur pour renforcer la protection, mais cela peut réduire la précision de l'indexation :", + "修改": "Modifier", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifier les informations du modèle (uniquement pris en charge pour les petits fichiers de modèle extraits du dossier 'weights')", + "停止音频转换": "Arrêter la conversion audio", + "全流程结束!": "Toutes les étapes ont été terminées !", + "刷新音色列表和索引路径": "Actualiser la liste des voix et le vers l'index.", + "加载模型": "Charger le modèle.", + "加载预训练底模D路径": "Charger le chemin du modèle de base pré-entraîné D :", + "加载预训练底模G路径": "Charger le chemin du modèle de base pré-entraîné G :", + "单人": "Individu", + "卸载音色省显存": "Décharger la voix pour économiser la mémoire GPU.", + "变调(整数, 半音数量, 升八度12降八度-12)": "Transposer (entier, nombre de demi-tons, monter d'une octave : 12, descendre d'une octave : -12) :", + "后处理重采样至最终采样率,0为不进行重采样": "Rééchantillonner l'audio de sortie en post-traitement à la fréquence d'échantillonnage finale. 
Réglez sur 0 pour ne pas effectuer de rééchantillonnage :", + "否": "Non", + "响应阈值": "Seuil de réponse", + "响度因子": "Facteur de volume sonore", + "处理数据": "Traitement des données", + "导出Onnx模型": "Exporter le modèle au format ONNX.", + "导出文件格式": "Format de fichier d'exportation", + "常见问题解答": "FAQ (Foire Aux Questions)", + "常规设置": "Paramètres généraux", + "开始音频转换": "Démarrer la conversion audio.", + "很遗憾您这没有能用的显卡来支持您训练": "Malheureusement, il n'y a pas de GPU compatible disponible pour prendre en charge votre entrainement.", + "性能设置": "Paramètres de performance", + "总训练轮数total_epoch": "Nombre total d'époques d'entraînement (total_epoch) :", + "批次": "Lote", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversion en lot. Entrez le dossier contenant les fichiers audio à convertir ou téléchargez plusieurs fichiers audio. Les fichiers audio convertis seront enregistrés dans le dossier spécifié (par défaut : 'opt').", + "指定输出主人声文件夹": "Spécifiez le dossier de sortie pour les fichiers de voix :", + "指定输出文件夹": "Spécifiez le dossier de sortie :", + "指定输出非主人声文件夹": "Spécifiez le dossier de sortie pour l'accompagnement :", + "推理时间(ms):": "Temps d'inférence (ms) :", + "推理音色": "Voix pour l'inférence", + "提取": "Extraire", + "提取音高和处理数据使用的CPU进程数": "Nombre de processus CPU utilisés pour l'extraction de la hauteur et le traitement des données :", + "是": "Oui", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Enregistrer uniquement le dernier fichier '.ckpt' pour économiser de l'espace disque :", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Enregistrer un petit modèle final dans le dossier 'weights' à chaque point de sauvegarde :", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Mettre en cache tous les ensembles d'entrainement dans la mémoire GPU. 
Mettre en cache de petits ensembles de données (moins de 10 minutes) peut accélérer l'entrainement, mais mettre en cache de grands ensembles de données consommera beaucoup de mémoire GPU et peut ne pas apporter beaucoup d'amélioration de vitesse :", + "显卡信息": "Informations sur la carte graphique (GPU)", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Ce logiciel est open source sous la licence MIT. L'auteur n'a aucun contrôle sur le logiciel. Les utilisateurs qui utilisent le logiciel et distribuent les sons exportés par le logiciel en sont entièrement responsables.
Si vous n'acceptez pas cette clause, vous ne pouvez pas utiliser ou faire référence à aucun code ni fichier contenu dans le package logiciel. Consultez le fichier Agreement-LICENSE.txt dans le répertoire racine pour plus de détails.", + "查看": "Voir", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Afficher les informations sur le modèle (uniquement pour les petits fichiers de modèle extraits du dossier \"weights\")", + "检索特征占比": "Rapport de recherche de caractéristiques (contrôle l'intensité de l'accent, un rapport trop élevé provoque des artefacts) :", + "模型": "Modèle", + "模型推理": "Inférence du modèle", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Extraction du modèle (saisissez le chemin d'accès au modèle du grand fichier dans le dossier \"logs\"). Cette fonction est utile si vous souhaitez arrêter l'entrainement à mi-chemin et extraire et enregistrer manuellement un petit fichier de modèle, ou si vous souhaitez tester un modèle intermédiaire :", + "模型是否带音高指导": "Indique si le modèle dispose d'un guidage en hauteur :", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Indique si le modèle dispose d'un système de guidage de la hauteur (obligatoire pour le chant, facultatif pour la parole) :", + "模型是否带音高指导,1是0否": "Le modèle dispose-t-il d'un guide de hauteur (1 : oui, 0 : non) ?", + "模型版本型号": "Version de l'architecture du modèle :", + "模型融合, 可用于测试音色融合": "Fusion de modèles, peut être utilisée pour tester la fusion de timbres", + "模型路径": "Le chemin vers le modèle :", + "每张显卡的batch_size": "Taille du batch par GPU :", + "淡入淡出长度": "Longueur de la transition", + "版本": "Version", + "特征提取": "Extraction des caractéristiques", + "特征检索库文件路径,为空则使用下拉的选择结果": "Chemin d'accès au fichier d'index des caractéristiques. Laisser vide pour utiliser le résultat sélectionné dans la liste déroulante :", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Il est recommandé d'utiliser la clé +12 pour la conversion homme-femme et la clé -12 pour la conversion femme-homme. 
Si la plage sonore est trop large et que la voix est déformée, vous pouvez également l'ajuster vous-même à la plage appropriée.", + "目标采样率": "Taux d'échantillonnage cible :", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Détecter automatiquement le chemin d'accès à l'index et le sélectionner dans la liste déroulante :", + "融合": "Fusion", + "要改的模型信息": "Informations sur le modèle à modifier :", + "要置入的模型信息": "Informations sur le modèle à placer :", + "训练": "Entraîner", + "训练模型": "Entraîner le modèle", + "训练特征索引": "Entraîner l'index des caractéristiques", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Entraînement terminé. Vous pouvez consulter les rapports d'entraînement dans la console ou dans le fichier 'train.log' situé dans le dossier de l'expérience.", + "请指定说话人id": "Veuillez spécifier l'ID de l'orateur ou du chanteur :", + "请选择index文件": "Veuillez sélectionner le fichier d'index", + "请选择pth文件": "Veuillez sélectionner le fichier pth", + "请选择说话人id": "Sélectionner l'ID de l'orateur ou du chanteur :", + "转换": "Convertir", + "输入实验名": "Saisissez le nom de l'expérience :", + "输入待处理音频文件夹路径": "Entrez le chemin du dossier audio à traiter :", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Entrez le chemin du dossier audio à traiter (copiez-le depuis la barre d'adresse du gestionnaire de fichiers) :", + "输入待处理音频文件路径(默认是正确格式示例)": "Entrez le chemin d'accès du fichier audio à traiter (par défaut, l'exemple de format correct) :", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Ajustez l'échelle de l'enveloppe de volume. Plus il est proche de 0, plus il imite le volume des voix originales. Cela peut aider à masquer les bruits et à rendre le volume plus naturel lorsqu'il est réglé relativement bas. 
Plus le volume est proche de 1, plus le volume sera fort et constant :", + "输入监听": "输入监听", + "输入训练文件夹路径": "Indiquez le chemin d'accès au dossier d'entraînement :", + "输入设备": "Dispositif d'entrée", + "输入降噪": "Réduction du bruit d'entrée", + "输出信息": "Informations sur la sortie", + "输出变声": "输出变声", + "输出设备": "Dispositif de sortie", + "输出降噪": "Réduction du bruit de sortie", + "输出音频(右下角三个点,点了可以下载)": "Exporter l'audio (cliquer sur les trois points dans le coin inférieur droit pour télécharger)", + "选择.index文件": "Sélectionner le fichier .index", + "选择.pth文件": "Sélectionner le fichier .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Sélection de l'algorithme d'extraction de la hauteur, les voix d'entrée peuvent être accélérées avec pm, harvest a de bonnes basses mais est très lent, crepe est bon mais consomme beaucoup de ressources GPU.", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Sélectionnez l'algorithme d'extraction de la hauteur de ton (\"pm\" : extraction plus rapide mais parole de moindre qualité ; \"harvest\" : meilleure basse mais extrêmement lente ; \"crepe\" : meilleure qualité mais utilisation intensive du GPU), \"rmvpe\" : meilleure qualité et peu d'utilisation du GPU.", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Sélection de l'algorithme d'extraction de la hauteur : la chanson d'entrée peut être traitée plus rapidement par pm, avec une voix de haute qualité mais un CPU médiocre, par dio, harvest est meilleur mais plus lent, rmvpe est le meilleur, mais consomme légèrement le CPU/GPU.", + "采样长度": "Longueur de l'échantillon", + "重载设备列表": "Recharger la liste des dispositifs", + "音调设置": "Réglages de la hauteur", + "音频设备(请使用同种类驱动)": "Périphérique audio (veuillez utiliser le même type de pilote)", + "音高算法": "algorithme de détection de la hauteur", + "额外推理时长": "Temps d'inférence supplémentaire" +} diff --git a/i18n/locale/it_IT.json b/i18n/locale/it_IT.json index a77e82a..00dc3b8 100644 --- 
a/i18n/locale/it_IT.json +++ b/i18n/locale/it_IT.json @@ -38,6 +38,7 @@ "加载模型": "Carica modello", "加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:", "加载预训练底模G路径": "Carica il percorso G del modello base pre-addestrato:", + "单人": "Individuale", "卸载音色省显存": "Scarica la voce per risparmiare memoria della GPU:", "变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):", "后处理重采样至最终采样率,0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. ", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "Sfortunatamente, non è disponibile alcuna GPU compatibile per supportare l'addestramento.", "性能设置": "Impostazioni delle prestazioni", "总训练轮数total_epoch": "Epoch totali di addestramento (total_epoch):", + "批次": "Lote", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversione massiva. Inserisci il percorso della cartella che contiene i file da convertire o carica più file audio. I file convertiti finiranno nella cartella specificata. (default: opt) ", "指定输出主人声文件夹": "Specifica la cartella di output per le voci:", "指定输出文件夹": "Specifica la cartella di output:", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "Percorso del file di indice delle caratteristiche. ", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Tonalità +12 consigliata per la conversione da maschio a femmina e tonalità -12 per la conversione da femmina a maschio. 
", "目标采样率": "Frequenza di campionamento target:", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "Rileva automaticamente il percorso dell'indice e seleziona dal menu a tendina:", "融合": "Fusione", "要改的模型信息": "Informazioni sul modello da modificare:", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Immettere il percorso della cartella audio da elaborare (copiarlo dalla barra degli indirizzi del file manager):", "输入待处理音频文件路径(默认是正确格式示例)": "Immettere il percorso del file audio da elaborare (l'impostazione predefinita è l'esempio di formato corretto):", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Regola il ridimensionamento dell'inviluppo del volume. ", + "输入监听": "输入监听", "输入训练文件夹路径": "Inserisci il percorso della cartella di addestramento:", "输入设备": "Dispositivo di input", "输入降噪": "Riduzione del rumore in ingresso", "输出信息": "Informazioni sull'uscita", + "输出变声": "输出变声", "输出设备": "Dispositivo di uscita", "输出降噪": "Riduzione del rumore in uscita", "输出音频(右下角三个点,点了可以下载)": "Esporta audio (clicca sui tre puntini in basso a destra per scaricarlo)", diff --git a/i18n/locale/ja_JP.json b/i18n/locale/ja_JP.json index 9f0f242..9f3214b 100644 --- a/i18n/locale/ja_JP.json +++ b/i18n/locale/ja_JP.json @@ -38,6 +38,7 @@ "加载模型": "モデルをロード", "加载预训练底模D路径": "事前学習済みのDモデルのパス", "加载预训练底模G路径": "事前学習済みのGモデルのパス", + "单人": "個人", "卸载音色省显存": "音源を削除してメモリを節約", "变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)", "后处理重采样至最终采样率,0为不进行重采样": "最終的なサンプリングレートへのポストプロセッシングのリサンプリング リサンプリングしない場合は0", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "トレーニングに対応したGPUが動作しないのは残念です。", "性能设置": "パフォーマンス設定", "总训练轮数total_epoch": "総エポック数", + "批次": "バッチ", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "一括変換、変換する音声フォルダを入力、または複数の音声ファイルをアップロードし、指定したフォルダ(デフォルトのopt)に変換した音声を出力します。", "指定输出主人声文件夹": "マスターの出力音声フォルダーを指定する", "指定输出文件夹": "出力フォルダを指定してください", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "特徴検索ライブラリへのパス 空の場合はドロップダウンで選択", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "男性から女性へは+12キーをお勧めします。女性から男性へは-12キーをお勧めします。音域が広すぎて音質が劣化した場合は、適切な音域に自分で調整してください。", "目标采样率": "目標サンプリングレート", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "インデックスパスの自動検出 ドロップダウンで選択", "融合": "マージ", "要改的模型信息": "変更するモデル情報", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "処理対象音声フォルダーのパスを入力してください(エクスプローラーのアドレスバーからコピーしてください)", "输入待处理音频文件路径(默认是正确格式示例)": "処理対象音声ファイルのパスを入力してください(デフォルトは正しいフォーマットの例です)", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "入力ソースの音量エンベロープと出力音量エンベロープの融合率 1に近づくほど、出力音量エンベロープの割合が高くなる", + "输入监听": "输入监听", "输入训练文件夹路径": "トレーニング用フォルダのパスを入力してください", "输入设备": "入力デバイス", "输入降噪": "入力ノイズの低減", "输出信息": "出力情報", + "输出变声": "输出变声", "输出设备": "出力デバイス", "输出降噪": "出力ノイズの低減", "输出音频(右下角三个点,点了可以下载)": "出力音声(右下の三点をクリックしてダウンロードできます)", diff --git a/i18n/locale/ru_RU.json b/i18n/locale/ru_RU.json index 26ba6dc..10df9cb 100644 --- a/i18n/locale/ru_RU.json +++ b/i18n/locale/ru_RU.json @@ -38,6 +38,7 @@ "加载模型": "Загрузить модель", "加载预训练底模D路径": "Путь к предварительно обученной базовой модели D:", "加载预训练底模G路径": "Путь к предварительно обученной базовой модели G:", + "单人": "Одиночный", "卸载音色省显存": "Выгрузить модель из памяти GPU для освобождения ресурсов", "变调(整数, 半音数量, 升八度12降八度-12)": "Изменить высоту голоса (укажите количество полутонов; чтобы поднять голос на октаву, выберите 12, понизить на октаву — -12):", "后处理重采样至最终采样率,0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. Поставьте 0, чтобы ничего не изменялось:", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "К сожалению, у вас нету графического процессора, который поддерживает обучение моделей.", "性能设置": "Настройки быстроты", "总训练轮数total_epoch": "Полное количество эпох (total_epoch):", + "批次": "Партия", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Массовое преобразование. Введите путь к папке, в которой находятся файлы для преобразования голоса или выгрузите несколько аудиофайлов. 
Сконвертированные файлы будут сохранены в указанной папке (по умолчанию: 'opt').", "指定输出主人声文件夹": "Путь к папке для сохранения вокала:", "指定输出文件夹": "Папка для результатов:", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "Путь к файлу индекса черт. Оставьте пустым, чтобы использовать выбранный вариант из списка ниже:", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Рекомендуется выбрать +12 для конвертирования мужского голоса в женский и -12 для конвертирования женского в мужской. Если диапазон голоса слишком велик, и голос искажается, можно выбрать значение на свой вкус.", "目标采样率": "Частота дискретизации аудио:", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "Автоматически найденные файлы индексов черт (выберите вариант из списка):", "融合": "Запустить слияние", "要改的模型信息": "Информация, которая будет изменена:", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Путь к папке с аудиофайлами для переработки (можно скопировать путь из адресной строки файлового менеджера):", "输入待处理音频文件路径(默认是正确格式示例)": "Путь к аудиофайлу, который хотите обработать (ниже указан пример пути к файлу):", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Использовать громкость входного файла для замены или перемешивания с громкостью выходного файла. 
Чем ближе соотношение к 1, тем больше используется звука из выходного файла:", + "输入监听": "输入监听", "输入训练文件夹路径": "Путь к папке с аудиозаписями, на которых будет обучаться модель:", "输入设备": "Входное устройство", "输入降噪": "Уменьшение входного шума", "输出信息": "Статистика", + "输出变声": "输出变声", "输出设备": "Выходное устройство", "输出降噪": "Уменьшение выходного шума", "输出音频(右下角三个点,点了可以下载)": "Аудиофайл (чтобы скачать, нажмите на три точки справа в плеере)", diff --git a/i18n/locale/tr_TR.json b/i18n/locale/tr_TR.json index e290c93..b8f7fa4 100644 --- a/i18n/locale/tr_TR.json +++ b/i18n/locale/tr_TR.json @@ -38,6 +38,7 @@ "加载模型": "Model yükle", "加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:", "加载预训练底模G路径": "Önceden eğitilmiş temel G modelini yükleme yolu:", + "单人": "单人", "卸载音色省显存": "GPU bellek kullanımını azaltmak için sesi kaldır", "变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):", "后处理重采样至最终采样率,0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 0 değeri için yeniden örnekleme yapılmaz:", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "Maalesef, eğitiminizi desteklemek için uyumlu bir GPU bulunmamaktadır.", "性能设置": "Performans ayarları", "总训练轮数total_epoch": "Toplam eğitim turu (total_epoch):", + "批次": "批次", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Toplu dönüştür. Dönüştürülecek ses dosyalarının bulunduğu klasörü girin veya birden çok ses dosyasını yükleyin. Dönüştürülen ses dosyaları belirtilen klasöre ('opt' varsayılan olarak) dönüştürülecektir", "指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:", "指定输出文件夹": "Çıkış klasörünü belirt:", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "Özellik indeksi dosyasının yolunu belirtin. Seçilen sonucu kullanmak için boş bırakın veya açılır menüden seçim yapın.", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "Erkekten kadına çevirmek için +12 tuş önerilir, kadından erkeğe çevirmek için ise -12 tuş önerilir. Eğer ses aralığı çok fazla genişler ve ses bozulursa, isteğe bağlı olarak uygun aralığa kendiniz de ayarlayabilirsiniz.", "目标采样率": "Hedef örnekleme oranı:", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "İndeks yolunu otomatik olarak tespit et ve açılır menüden seçim yap.", "融合": "Birleştir", "要改的模型信息": "Düzenlenecek model bilgileri:", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "İşlenecek ses klasörünün yolunu girin (dosya yöneticisinin adres çubuğundan kopyalayın):", "输入待处理音频文件路径(默认是正确格式示例)": "İşlenecek ses dosyasının yolunu girin (varsayılan doğru format örneğidir):", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Sesin hacim zarfını ayarlayın. 0'a yakın değerler, sesin orijinal vokallerin hacmine benzer olmasını sağlar. Düşük bir değerle ses gürültüsünü maskeleyebilir ve hacmi daha doğal bir şekilde duyulabilir hale getirebilirsiniz. 1'e yaklaştıkça sürekli bir yüksek ses seviyesi elde edilir:", + "输入监听": "输入监听", "输入训练文件夹路径": "Eğitim klasörünün yolunu girin:", "输入设备": "Giriş cihazı", "输入降噪": "Giriş gürültü azaltma", "输出信息": "Çıkış bilgisi", + "输出变声": "输出变声", "输出设备": "Çıkış cihazı", "输出降噪": "Çıkış gürültü azaltma", "输出音频(右下角三个点,点了可以下载)": "Ses dosyasını dışa aktar (indirmek için sağ alt köşedeki üç noktaya tıklayın)", diff --git a/i18n/locale/zh_CN.json b/i18n/locale/zh_CN.json index a65cc47..8f2c67a 100644 --- a/i18n/locale/zh_CN.json +++ b/i18n/locale/zh_CN.json @@ -38,6 +38,7 @@ "加载模型": "加载模型", "加载预训练底模D路径": "加载预训练底模D路径", "加载预训练底模G路径": "加载预训练底模G路径", + "单人": "单人", "卸载音色省显存": "卸载音色省显存", "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", "性能设置": "性能设置", "总训练轮数total_epoch": "总训练轮数total_epoch", + "批次": "批次", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. 
", "指定输出主人声文件夹": "指定输出主人声文件夹", "指定输出文件夹": "指定输出文件夹", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ", "目标采样率": "目标采样率", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)", "融合": "融合", "要改的模型信息": "要改的模型信息", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)", "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络", + "输入监听": "输入监听", "输入训练文件夹路径": "输入训练文件夹路径", "输入设备": "输入设备", "输入降噪": "输入降噪", "输出信息": "输出信息", + "输出变声": "输出变声", "输出设备": "输出设备", "输出降噪": "输出降噪", "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)", diff --git a/i18n/locale/zh_HK.json b/i18n/locale/zh_HK.json index 47ed97c..a0c010a 100644 --- a/i18n/locale/zh_HK.json +++ b/i18n/locale/zh_HK.json @@ -38,6 +38,7 @@ "加载模型": "載入模型", "加载预训练底模D路径": "加載預訓練底模D路徑", "加载预训练底模G路径": "加載預訓練底模G路徑", + "单人": "单人", "卸载音色省显存": "卸載音色節省 VRAM", "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", "性能设置": "效能設定", "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批次": "批次", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", "指定输出主人声文件夹": "指定输出主人声文件夹", "指定输出文件夹": "指定輸出資料夾", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", "融合": "融合", "要改的模型信息": "要改的模型資訊", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", "输入训练文件夹路径": "輸入訓練檔案夾路徑", "输入设备": "輸入設備", "输入降噪": "輸入降噪", "输出信息": "輸出訊息", + "输出变声": "输出变声", "输出设备": "輸出設備", "输出降噪": "輸出降噪", "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", diff --git a/i18n/locale/zh_SG.json b/i18n/locale/zh_SG.json index 47ed97c..a0c010a 100644 --- a/i18n/locale/zh_SG.json +++ b/i18n/locale/zh_SG.json @@ -38,6 +38,7 @@ "加载模型": "載入模型", "加载预训练底模D路径": "加載預訓練底模D路徑", "加载预训练底模G路径": "加載預訓練底模G路徑", + "单人": "单人", "卸载音色省显存": "卸載音色節省 VRAM", "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", "性能设置": "效能設定", "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批次": "批次", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", "指定输出主人声文件夹": "指定输出主人声文件夹", "指定输出文件夹": "指定輸出資料夾", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", "融合": "融合", "要改的模型信息": "要改的模型資訊", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", "输入训练文件夹路径": "輸入訓練檔案夾路徑", "输入设备": "輸入設備", "输入降噪": "輸入降噪", "输出信息": "輸出訊息", + "输出变声": "输出变声", "输出设备": "輸出設備", "输出降噪": "輸出降噪", "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", diff --git a/i18n/locale/zh_TW.json b/i18n/locale/zh_TW.json index 47ed97c..a0c010a 100644 --- a/i18n/locale/zh_TW.json +++ b/i18n/locale/zh_TW.json @@ -38,6 +38,7 @@ "加载模型": "載入模型", "加载预训练底模D路径": "加載預訓練底模D路徑", "加载预训练底模G路径": "加載預訓練底模G路徑", + "单人": "单人", "卸载音色省显存": "卸載音色節省 VRAM", "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", @@ -53,6 +54,7 @@ "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", "性能设置": "效能設定", "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批次": "批次", "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", "指定输出主人声文件夹": "指定输出主人声文件夹", "指定输出文件夹": "指定輸出資料夾", @@ -86,6 +88,7 @@ "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", "融合": "融合", "要改的模型信息": "要改的模型資訊", @@ -104,10 +107,12 @@ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", "输入训练文件夹路径": "輸入訓練檔案夾路徑", "输入设备": "輸入設備", "输入降噪": "輸入降噪", "输出信息": "輸出訊息", + "输出变声": "输出变声", "输出设备": "輸出設備", "输出降噪": "輸出降噪", "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", diff --git a/infer-web.py b/infer-web.py index 542b42b..859114e 100644 --- a/infer-web.py +++ b/infer-web.py @@ -1,36 +1,36 @@ -import os, sys - -now_dir = os.getcwd() -sys.path.append(now_dir) -import logging -import shutil -import threading -import traceback -import warnings -from random import shuffle -from subprocess import Popen -from time import sleep -import json -import pathlib - -import fairseq -import faiss -import gradio as gr -import numpy as np -import torch -from dotenv import load_dotenv -from sklearn.cluster import MiniBatchKMeans - -from configs.config import Config -from i18n.i18n import I18nAuto +from infer.modules.vc.modules import VC +from infer.modules.uvr5.modules import uvr from infer.lib.train.process_ckpt import ( change_info, extract_small_model, merge, show_info, ) -from infer.modules.uvr5.modules import uvr -from infer.modules.vc.modules import VC +from i18n.i18n import I18nAuto +from configs.config import Config +from sklearn.cluster import MiniBatchKMeans +from dotenv import load_dotenv +import torch +import numpy as np +import gradio as gr +import faiss +import fairseq +import pathlib +import json +from time import sleep +from subprocess import Popen +from random import shuffle +import warnings +import traceback +import threading +import shutil +import logging +import os +import sys + +now_dir = os.getcwd() +sys.path.append(now_dir) + 
logging.getLogger("numba").setLevel(logging.WARNING) @@ -165,10 +165,10 @@ def clean(): return {"value": "", "__type__": "update"} -def export_onnx(): +def export_onnx(ModelPath, ExportedPath): from infer.modules.onnx.export import export_onnx as eo - eo() + eo(ModelPath, ExportedPath) sr_dict = { @@ -219,8 +219,9 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): per, ) logger.info(cmd) - p = Popen(cmd, shell=True) # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir - ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir + p = Popen(cmd, shell=True) + # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 done = [False] threading.Thread( target=if_done, @@ -263,7 +264,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp p = Popen( cmd, shell=True, cwd=now_dir ) # , stdin=PIPE, stdout=PIPE,stderr=PIPE - ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 done = [False] threading.Thread( target=if_done, @@ -295,7 +296,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp cmd, shell=True, cwd=now_dir ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir ps.append(p) - ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 done = [False] threading.Thread( target=if_done_multi, # @@ -331,7 +332,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp log = f.read() logger.info(log) yield log - ####对不同part分别开多进程 + # 对不同part分别开多进程 """ n_part=int(sys.argv[1]) i_part=int(sys.argv[2]) @@ -360,7 +361,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp cmd, shell=True, cwd=now_dir ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir ps.append(p) - ###煞笔gr, popen read都非得全跑完了再一次性读取, 
不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 done = [False] threading.Thread( target=if_done_multi, @@ -701,11 +702,11 @@ def get_info_str(strr): infos.append(strr) return "\n".join(infos) - ####### step1:处理数据 + # step1:处理数据 yield get_info_str(i18n("step1:正在处理数据")) [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)] - ####### step2a:提取音高 + # step2a:提取音高 yield get_info_str(i18n("step2:正在提取音高&正在提取特征")) [ get_info_str(_) @@ -714,7 +715,7 @@ def get_info_str(strr): ) ] - ####### step3a:训练模型 + # step3a:训练模型 yield get_info_str(i18n("step3a:正在训练模型")) click_train( exp_dir1, @@ -734,7 +735,7 @@ def get_info_str(strr): ) yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log")) - ####### step3b:训练索引 + # step3b:训练索引 [get_info_str(_) for _ in train_index(exp_dir1, version19)] yield get_info_str(i18n("全流程结束!")) @@ -768,6 +769,7 @@ def change_f0_method(f0method8): with gr.Blocks(title="RVC WebUI") as app: + gr.Markdown("## RVC WebUI") gr.Markdown( value=i18n( "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." @@ -775,134 +777,143 @@ def change_f0_method(f0method8): ) with gr.Tabs(): with gr.TabItem(i18n("模型推理")): - with gr.Row(): - sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names)) - refresh_button = gr.Button(i18n("刷新音色列表和索引路径"), variant="primary") - clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary") - spk_item = gr.Slider( - minimum=0, - maximum=2333, - step=1, - label=i18n("请选择说话人id"), - value=0, - visible=False, - interactive=True, - ) - clean_button.click( - fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean" - ) - with gr.Group(): - gr.Markdown( - value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ") - ) + with gr.TabItem(i18n("单人")): with gr.Row(): + sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names)) + file_index2 = gr.Dropdown( + label=i18n("自动检测index路径,下拉式选择(dropdown)"), + choices=sorted(index_paths), + interactive=True, + ) with gr.Column(): - vc_transform0 = gr.Number( - label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0 - ) - input_audio0 = gr.Textbox( - label=i18n("输入待处理音频文件路径(默认是正确格式示例)"), - value="E:\\codes\\py39\\test-20230416b\\todo-songs\\冬之花clip1.wav", - ) - f0method0 = gr.Radio( - label=i18n( - "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU" - ), - choices=["pm", "harvest", "crepe", "rmvpe"] - if config.dml == False - else ["pm", "harvest", "rmvpe"], - value="pm", - interactive=True, - ) - filter_radius0 = gr.Slider( - minimum=0, - maximum=7, - label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"), - value=3, - step=1, - interactive=True, - ) - with gr.Column(): - file_index1 = gr.Textbox( - label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), - value="", - interactive=True, - ) - file_index2 = gr.Dropdown( - label=i18n("自动检测index路径,下拉式选择(dropdown)"), - choices=sorted(index_paths), - interactive=True, - ) - refresh_button.click( - fn=change_choices, - inputs=[], - outputs=[sid0, file_index2], - api_name="infer_refresh", - ) - # file_big_npy1 = 
gr.Textbox( - # label=i18n("特征文件路径"), - # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", - # interactive=True, - # ) - index_rate1 = gr.Slider( - minimum=0, - maximum=1, - label=i18n("检索特征占比"), - value=0.75, - interactive=True, + refresh_button = gr.Button( + i18n("刷新音色列表和索引路径"), variant="primary" ) + clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary") + spk_item = gr.Slider( + minimum=0, + maximum=2333, + step=1, + label=i18n("请选择说话人id"), + value=0, + visible=False, + interactive=True, + ) + clean_button.click( + fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean" + ) + with gr.Group(): + with gr.Row(): + with gr.Column(): + vc_transform0 = gr.Number( + label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0 + ) + input_audio0 = gr.Textbox( + label=i18n("输入待处理音频文件路径(默认是正确格式示例)"), + placeholder="C:\\User\\Desktop\\audio_example.wav", + ) + file_index1 = gr.Textbox( + label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), + placeholder="C:\\User\\Desktop\\model_example.index", + interactive=True, + ) + f0method0 = gr.Radio( + label=i18n( + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU" + ), + choices=["pm", "harvest", "crepe", "rmvpe"] + if config.dml == False + else ["pm", "harvest", "rmvpe"], + value="rmvpe", + interactive=True, + ) + # f0_file = gr.File( + # label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调") + # ) + + refresh_button.click( + fn=change_choices, + inputs=[], + outputs=[sid0, file_index2], + api_name="infer_refresh", + ) + # file_big_npy1 = gr.Textbox( + # label=i18n("特征文件路径"), + # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", + # interactive=True, + # ) + + with gr.Column(): + resample_sr0 = gr.Slider( + minimum=0, + maximum=48000, + label=i18n("后处理重采样至最终采样率,0为不进行重采样"), + value=0, + step=1, + interactive=True, + ) + rms_mix_rate0 = gr.Slider( + minimum=0, + maximum=1, + label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"), + value=0.25, + interactive=True, + ) + protect0 = gr.Slider( + 
minimum=0, + maximum=0.5, + label=i18n( + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果" + ), + value=0.33, + step=0.01, + interactive=True, + ) + filter_radius0 = gr.Slider( + minimum=0, + maximum=7, + label=i18n( + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音" + ), + value=3, + step=1, + interactive=True, + ) + index_rate1 = gr.Slider( + minimum=0, + maximum=1, + label=i18n("检索特征占比"), + value=0.75, + interactive=True, + ) + with gr.Group(): with gr.Column(): - resample_sr0 = gr.Slider( - minimum=0, - maximum=48000, - label=i18n("后处理重采样至最终采样率,0为不进行重采样"), - value=0, - step=1, - interactive=True, - ) - rms_mix_rate0 = gr.Slider( - minimum=0, - maximum=1, - label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"), - value=0.25, - interactive=True, - ) - protect0 = gr.Slider( - minimum=0, - maximum=0.5, - label=i18n( - "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果" - ), - value=0.33, - step=0.01, - interactive=True, + but0 = gr.Button(i18n("转换"), variant="primary") + with gr.Row(): + vc_output1 = gr.Textbox(label=i18n("输出信息")) + vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) + + but0.click( + vc.vc_single, + [ + spk_item, + input_audio0, + vc_transform0, + # f0_file, + f0method0, + file_index1, + file_index2, + # file_big_npy1, + index_rate1, + filter_radius0, + resample_sr0, + rms_mix_rate0, + protect0, + ], + [vc_output1, vc_output2], + api_name="infer_convert", ) - f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")) - but0 = gr.Button(i18n("转换"), variant="primary") - with gr.Row(): - vc_output1 = gr.Textbox(label=i18n("输出信息")) - vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) - but0.click( - vc.vc_single, - [ - spk_item, - input_audio0, - vc_transform0, - f0_file, - f0method0, - file_index1, - file_index2, - # file_big_npy1, - index_rate1, - filter_radius0, - resample_sr0, - rms_mix_rate0, - protect0, - ], - [vc_output1, vc_output2], - api_name="infer_convert", - ) - with gr.Group(): + with gr.TabItem(i18n("批次")): gr.Markdown( 
value=i18n("批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ") ) @@ -912,6 +923,16 @@ def change_f0_method(f0method8): label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0 ) opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt") + file_index3 = gr.Textbox( + label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), + value="", + interactive=True, + ) + file_index4 = gr.Dropdown( + label=i18n("自动检测index路径,下拉式选择(dropdown)"), + choices=sorted(index_paths), + interactive=True, + ) f0method1 = gr.Radio( label=i18n( "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU" @@ -919,28 +940,16 @@ def change_f0_method(f0method8): choices=["pm", "harvest", "crepe", "rmvpe"] if config.dml == False else ["pm", "harvest", "rmvpe"], - value="pm", - interactive=True, - ) - filter_radius1 = gr.Slider( - minimum=0, - maximum=7, - label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"), - value=3, - step=1, + value="rmvpe", interactive=True, ) - with gr.Column(): - file_index3 = gr.Textbox( - label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), - value="", - interactive=True, - ) - file_index4 = gr.Dropdown( - label=i18n("自动检测index路径,下拉式选择(dropdown)"), - choices=sorted(index_paths), + format1 = gr.Radio( + label=i18n("导出文件格式"), + choices=["wav", "flac", "mp3", "m4a"], + value="wav", interactive=True, ) + refresh_button.click( fn=lambda: change_choices()[1], inputs=[], @@ -952,13 +961,7 @@ def change_f0_method(f0method8): # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", # interactive=True, # ) - index_rate2 = gr.Slider( - minimum=0, - maximum=1, - label=i18n("检索特征占比"), - value=1, - interactive=True, - ) + with gr.Column(): resample_sr1 = gr.Slider( minimum=0, @@ -985,23 +988,34 @@ def change_f0_method(f0method8): step=0.01, interactive=True, ) - with gr.Column(): - dir_input = gr.Textbox( - label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"), - value="E:\codes\py39\\test-20230416b\\todo-songs", - ) - inputs = gr.File( - file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") 
+ filter_radius1 = gr.Slider( + minimum=0, + maximum=7, + label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"), + value=3, + step=1, + interactive=True, ) - with gr.Row(): - format1 = gr.Radio( - label=i18n("导出文件格式"), - choices=["wav", "flac", "mp3", "m4a"], - value="flac", + index_rate2 = gr.Slider( + minimum=0, + maximum=1, + label=i18n("检索特征占比"), + value=1, interactive=True, ) - but1 = gr.Button(i18n("转换"), variant="primary") - vc_output3 = gr.Textbox(label=i18n("输出信息")) + with gr.Row(): + dir_input = gr.Textbox( + label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"), + placeholder="C:\\User\\Desktop\\model_example.index", + ) + inputs = gr.File( + file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") + ) + + with gr.Row(): + but1 = gr.Button(i18n("转换"), variant="primary") + vc_output3 = gr.Textbox(label=i18n("输出信息")) + but1.click( vc.vc_multi, [ @@ -1024,12 +1038,12 @@ def change_f0_method(f0method8): [vc_output3], api_name="infer_convert_batch", ) - sid0.change( - fn=vc.get_vc, - inputs=[sid0, protect0, protect1], - outputs=[spk_item, protect0, protect1, file_index2, file_index4], - api_name="infer_change_voice", - ) + sid0.change( + fn=vc.get_vc, + inputs=[sid0, protect0, protect1], + outputs=[spk_item, protect0, protect1, file_index2, file_index4], + api_name="infer_change_voice", + ) with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")): with gr.Group(): gr.Markdown( diff --git a/infer/lib/infer_pack/attentions.py b/infer/lib/infer_pack/attentions.py index 2b6060c..2cc745a 100644 --- a/infer/lib/infer_pack/attentions.py +++ b/infer/lib/infer_pack/attentions.py @@ -1,5 +1,6 @@ import copy import math +from typing import Optional import numpy as np import torch @@ -22,11 +23,11 @@ def __init__( window_size=10, **kwargs ): - super().__init__() + super(Encoder, self).__init__() self.hidden_channels = hidden_channels self.filter_channels = filter_channels self.n_heads = n_heads - self.n_layers = n_layers + self.n_layers = int(n_layers) self.kernel_size = kernel_size 
self.p_dropout = p_dropout self.window_size = window_size @@ -61,14 +62,17 @@ def __init__( def forward(self, x, x_mask): attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) x = x * x_mask - for i in range(self.n_layers): - y = self.attn_layers[i](x, x, attn_mask) + zippep = zip( + self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 + ) + for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep: + y = attn_layers(x, x, attn_mask) y = self.drop(y) - x = self.norm_layers_1[i](x + y) + x = norm_layers_1(x + y) - y = self.ffn_layers[i](x, x_mask) + y = ffn_layers(x, x_mask) y = self.drop(y) - x = self.norm_layers_2[i](x + y) + x = norm_layers_2(x + y) x = x * x_mask return x @@ -86,7 +90,7 @@ def __init__( proximal_init=True, **kwargs ): - super().__init__() + super(Decoder, self).__init__() self.hidden_channels = hidden_channels self.filter_channels = filter_channels self.n_heads = n_heads @@ -172,7 +176,7 @@ def __init__( proximal_bias=False, proximal_init=False, ): - super().__init__() + super(MultiHeadAttention, self).__init__() assert channels % n_heads == 0 self.channels = channels @@ -213,19 +217,28 @@ def __init__( self.conv_k.weight.copy_(self.conv_q.weight) self.conv_k.bias.copy_(self.conv_q.bias) - def forward(self, x, c, attn_mask=None): + def forward( + self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None + ): q = self.conv_q(x) k = self.conv_k(c) v = self.conv_v(c) - x, self.attn = self.attention(q, k, v, mask=attn_mask) + x, _ = self.attention(q, k, v, mask=attn_mask) x = self.conv_o(x) return x - def attention(self, query, key, value, mask=None): + def attention( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ): # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) + b, d, t_s = key.size() + t_t = query.size(2) query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) key = 
key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) @@ -292,16 +305,17 @@ def _matmul_with_relative_keys(self, x, y): ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) return ret - def _get_relative_embeddings(self, relative_embeddings, length): + def _get_relative_embeddings(self, relative_embeddings, length: int): max_relative_position = 2 * self.window_size + 1 # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) + pad_length: int = max(length - (self.window_size + 1), 0) slice_start_position = max((self.window_size + 1) - length, 0) slice_end_position = slice_start_position + 2 * length - 1 if pad_length > 0: padded_relative_embeddings = F.pad( relative_embeddings, - commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + # commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + [0, 0, pad_length, pad_length, 0, 0], ) else: padded_relative_embeddings = relative_embeddings @@ -317,12 +331,18 @@ def _relative_position_to_absolute_position(self, x): """ batch, heads, length, _ = x.size() # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + x = F.pad( + x, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]) + [0, 1, 0, 0, 0, 0, 0, 0], + ) # Concat extra elements so to add up to shape (len+1, 2*len-1). x_flat = x.view([batch, heads, length * 2 * length]) x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + x_flat, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, int(length) - 1]]) + [0, int(length) - 1, 0, 0, 0, 0], ) # Reshape and slice out the padded elements. 
@@ -339,15 +359,21 @@ def _absolute_position_to_relative_position(self, x): batch, heads, length, _ = x.size() # padd along column x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + x, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, int(length) - 1]]) + [0, int(length) - 1, 0, 0, 0, 0, 0, 0], ) - x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))]) # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_flat = F.pad( + x_flat, + # commons.convert_pad_shape([[0, 0], [0, 0], [int(length), 0]]) + [length, 0, 0, 0, 0, 0], + ) x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] return x_final - def _attention_bias_proximal(self, length): + def _attention_bias_proximal(self, length: int): """Bias for self-attention to encourage attention to close positions. Args: length: an integer scalar. 
@@ -367,10 +393,10 @@ def __init__( filter_channels, kernel_size, p_dropout=0.0, - activation=None, + activation: str = None, causal=False, ): - super().__init__() + super(FFN, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.filter_channels = filter_channels @@ -378,40 +404,56 @@ def __init__( self.p_dropout = p_dropout self.activation = activation self.causal = causal - - if causal: - self.padding = self._causal_padding - else: - self.padding = self._same_padding + self.is_activation = True if activation == "gelu" else False + # if causal: + # self.padding = self._causal_padding + # else: + # self.padding = self._same_padding self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) self.drop = nn.Dropout(p_dropout) - def forward(self, x, x_mask): - x = self.conv_1(self.padding(x * x_mask)) - if self.activation == "gelu": + def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + if self.causal: + padding = self._causal_padding(x * x_mask) + else: + padding = self._same_padding(x * x_mask) + return padding + + def forward(self, x: torch.Tensor, x_mask: torch.Tensor): + x = self.conv_1(self.padding(x, x_mask)) + if self.is_activation: x = x * torch.sigmoid(1.702 * x) else: x = torch.relu(x) x = self.drop(x) - x = self.conv_2(self.padding(x * x_mask)) + + x = self.conv_2(self.padding(x, x_mask)) return x * x_mask def _causal_padding(self, x): if self.kernel_size == 1: return x - pad_l = self.kernel_size - 1 - pad_r = 0 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) + pad_l: int = self.kernel_size - 1 + pad_r: int = 0 + # padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad( + x, + # commons.convert_pad_shape(padding) + [pad_l, pad_r, 0, 0, 0, 0], + ) return x def _same_padding(self, x): if self.kernel_size == 1: return x - pad_l = (self.kernel_size - 1) // 2 - pad_r = self.kernel_size 
// 2 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) + pad_l: int = (self.kernel_size - 1) // 2 + pad_r: int = self.kernel_size // 2 + # padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad( + x, + # commons.convert_pad_shape(padding) + [pad_l, pad_r, 0, 0, 0, 0], + ) return x diff --git a/infer/lib/infer_pack/commons.py b/infer/lib/infer_pack/commons.py index 7ba7d21..4ec6c24 100644 --- a/infer/lib/infer_pack/commons.py +++ b/infer/lib/infer_pack/commons.py @@ -1,3 +1,4 @@ +from typing import List, Optional import math import numpy as np @@ -16,10 +17,10 @@ def get_padding(kernel_size, dilation=1): return int((kernel_size * dilation - dilation) / 2) -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape +# def convert_pad_shape(pad_shape): +# l = pad_shape[::-1] +# pad_shape = [item for sublist in l for item in sublist] +# return pad_shape def kl_divergence(m_p, logs_p, m_q, logs_q): @@ -113,10 +114,14 @@ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): return acts -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape +# def convert_pad_shape(pad_shape): +# l = pad_shape[::-1] +# pad_shape = [item for sublist in l for item in sublist] +# return pad_shape + + +def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: + return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist() def shift_1d(x): @@ -124,7 +129,7 @@ def shift_1d(x): return x -def sequence_mask(length, max_length=None): +def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None): if max_length is None: max_length = length.max() x = torch.arange(max_length, dtype=length.dtype, device=length.device) diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index 711db22..a60ced6 100644 --- a/infer/lib/infer_pack/models.py +++ 
b/infer/lib/infer_pack/models.py @@ -1,5 +1,6 @@ import math import logging +from typing import Optional logger = logging.getLogger(__name__) @@ -28,25 +29,32 @@ def __init__( p_dropout, f0=True, ): - super().__init__() + super(TextEncoder256, self).__init__() self.out_channels = out_channels self.hidden_channels = hidden_channels self.filter_channels = filter_channels self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) self.emb_phone = nn.Linear(256, hidden_channels) self.lrelu = nn.LeakyReLU(0.1, inplace=True) if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, phone, pitch, lengths): - if pitch == None: + def forward( + self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor + ): + if pitch is None: x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) @@ -75,25 +83,30 @@ def __init__( p_dropout, f0=True, ): - super().__init__() + super(TextEncoder768, self).__init__() self.out_channels = out_channels self.hidden_channels = hidden_channels self.filter_channels = filter_channels self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) self.emb_phone = nn.Linear(768, hidden_channels) self.lrelu = nn.LeakyReLU(0.1, inplace=True) if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), ) self.proj = 
nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, phone, pitch, lengths): - if pitch == None: + def forward(self, phone: torch.Tensor, pitch: torch.Tensor, lengths: torch.Tensor): + if pitch is None: x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) @@ -121,7 +134,7 @@ def __init__( n_flows=4, gin_channels=0, ): - super().__init__() + super(ResidualCouplingBlock, self).__init__() self.channels = channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size @@ -145,19 +158,36 @@ def __init__( ) self.flows.append(modules.Flip()) - def forward(self, x, x_mask, g=None, reverse=False): + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): if not reverse: for flow in self.flows: x, _ = flow(x, x_mask, g=g, reverse=reverse) else: - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) + for flow in self.flows[::-1]: + x, _ = flow.forward(x, x_mask, g=g, reverse=reverse) return x def remove_weight_norm(self): for i in range(self.n_flows): self.flows[i * 2].remove_weight_norm() + def __prepare_scriptable__(self): + for i in range(self.n_flows): + for hook in self.flows[i * 2]._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flows[i * 2]) + + return self + class PosteriorEncoder(nn.Module): def __init__( @@ -170,7 +200,7 @@ def __init__( n_layers, gin_channels=0, ): - super().__init__() + super(PosteriorEncoder, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.hidden_channels = hidden_channels @@ -189,7 +219,9 @@ def __init__( ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, x, x_lengths, g=None): + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None + ): x_mask = 
torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( x.dtype ) @@ -203,6 +235,15 @@ def forward(self, x, x_lengths, g=None): def remove_weight_norm(self): self.enc.remove_weight_norm() + def __prepare_scriptable__(self): + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self + class Generator(torch.nn.Module): def __init__( @@ -252,7 +293,7 @@ def __init__( if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - def forward(self, x, g=None): + def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): x = self.conv_pre(x) if g is not None: x = x + self.cond(g) @@ -273,6 +314,28 @@ def forward(self, x, g=None): return x + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. 
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + + for l in self.resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + def remove_weight_norm(self): for l in self.ups: remove_weight_norm(l) @@ -293,7 +356,7 @@ class SineGen(torch.nn.Module): voiced_thoreshold: F0 threshold for U/V classification (default 0) flag_for_pulse: this SinGen is used inside PulseGen (default False) Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(np.pi) or cos(0) + segment is always sin(torch.pi) or cos(0) """ def __init__( @@ -321,7 +384,7 @@ def _f02uv(self, f0): uv = uv.float() return uv - def forward(self, f0, upp): + def forward(self, f0: torch.Tensor, upp: int): """sine_tensor, uv = forward(f0) input F0: tensor(batchsize=1, length, dim=1) f0 for unvoiced steps should be 0 @@ -333,7 +396,7 @@ def forward(self, f0, upp): f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) # fundamental component f0_buf[:, :, 0] = f0[:, :, 0] - for idx in np.arange(self.harmonic_num): + for idx in range(self.harmonic_num): f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( idx + 2 ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic @@ -347,12 +410,12 @@ def forward(self, f0, upp): tmp_over_one *= upp tmp_over_one = F.interpolate( tmp_over_one.transpose(2, 1), - scale_factor=upp, + scale_factor=float(upp), mode="linear", align_corners=True, ).transpose(2, 1) rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest" ).transpose( 2, 1 ) ####### @@ -361,12 +424,12 @@ def 
forward(self, f0, upp): cumsum_shift = torch.zeros_like(rad_values) cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 sine_waves = torch.sin( - torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi ) sine_waves = sine_waves * self.sine_amp uv = self._f02uv(f0) uv = F.interpolate( - uv.transpose(2, 1), scale_factor=upp, mode="nearest" + uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" ).transpose(2, 1) noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) @@ -414,18 +477,19 @@ def __init__( # to merge source harmonics into a single excitation self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() + # self.ddtype:int = -1 - def forward(self, x, upp=None): - if hasattr(self, "ddtype") == False: - self.ddtype = self.l_linear.weight.dtype + def forward(self, x: torch.Tensor, upp: int = 1): + # if self.ddtype ==-1: + # self.ddtype = self.l_linear.weight.dtype sine_wavs, uv, _ = self.l_sin_gen(x, upp) # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) # if self.is_half: # sine_wavs = sine_wavs.half() # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) # print(sine_wavs.dtype,self.ddtype) - if sine_wavs.dtype != self.ddtype: - sine_wavs = sine_wavs.to(self.ddtype) + # if sine_wavs.dtype != self.l_linear.weight.dtype: + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) sine_merge = self.l_tanh(self.l_linear(sine_wavs)) return sine_merge, None, None # noise, uv @@ -448,7 +512,7 @@ def __init__( self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) - self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( sampling_rate=sr, harmonic_num=0, is_half=is_half ) @@ -473,7 +537,7 @@ def __init__( ) ) if i + 1 < len(upsample_rates): - 
stride_f0 = np.prod(upsample_rates[i + 1 :]) + stride_f0 = math.prod(upsample_rates[i + 1 :]) self.noise_convs.append( Conv1d( 1, @@ -500,27 +564,36 @@ def __init__( if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - self.upp = np.prod(upsample_rates) + self.upp = math.prod(upsample_rates) + + self.lrelu_slope = modules.LRELU_SLOPE - def forward(self, x, f0, g=None): + def forward(self, x, f0, g: Optional[torch.Tensor] = None): har_source, noi_source, uv = self.m_source(f0, self.upp) har_source = har_source.transpose(1, 2) x = self.conv_pre(x) if g is not None: x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - x_source = self.noise_convs[i](har_source) - x = x + x_source - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels + # torch.jit.script() does not support direct indexing of torch modules + # That's why I wrote this + for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): + if i < self.num_upsamples: + x = F.leaky_relu(x, self.lrelu_slope) + x = ups(x) + x_source = noise_convs(har_source) + x = x + x_source + xs: Optional[torch.Tensor] = None + l = [i * self.num_kernels + j for j in range(self.num_kernels)] + for j, resblock in enumerate(self.resblocks): + if j in l: + if xs is None: + xs = resblock(x) + else: + xs += resblock(x) + # This assertion cannot be ignored! 
\ + # If ignored, it will cause torch.jit.script() compilation errors + assert isinstance(xs, torch.Tensor) + x = xs / self.num_kernels x = F.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) @@ -532,6 +605,27 @@ def remove_weight_norm(self): for l in self.resblocks: l.remove_weight_norm() + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.resblocks: + for hook in self.resblocks._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + sr2sr = { "32k": 32000, @@ -563,8 +657,8 @@ def __init__( sr, **kwargs ): - super().__init__() - if type(sr) == type("strr"): + super(SynthesizerTrnMs256NSFsid, self).__init__() + if isinstance(sr, str): sr = sr2sr[sr] self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -573,7 +667,7 @@ def __init__( self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) self.resblock = resblock self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_dilation_sizes = resblock_dilation_sizes @@ -591,7 +685,7 @@ def __init__( n_heads, n_layers, kernel_size, - p_dropout, + float(p_dropout), ) self.dec = GeneratorNSF( inter_channels, @@ -630,8 +724,42 @@ def remove_weight_norm(self): self.flow.remove_weight_norm() self.enc_q.remove_weight_norm() + def 
__prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore def forward( - self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + pitchf: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + ds: Optional[torch.Tensor] = None, ): # 这里ds是id,[bs,1] # print(1,pitch.shape)#[bs,t] g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 @@ -647,15 +775,25 @@ def forward( o = self.dec(z_slice, pitchf, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + nsff0: torch.Tensor, + sid: torch.Tensor, + rate: Optional[torch.Tensor] = None, + ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 
0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] - nsff0 = nsff0[:, -head:] + if rate is not None: + assert isinstance(rate, torch.Tensor) + head = int(z_p.shape[2] * (1 - rate.item())) + z_p = z_p[:, :, head:] + x_mask = x_mask[:, :, head:] + nsff0 = nsff0[:, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -684,8 +822,8 @@ def __init__( sr, **kwargs ): - super().__init__() - if type(sr) == type("strr"): + super(SynthesizerTrnMs768NSFsid, self).__init__() + if isinstance(sr, str): sr = sr2sr[sr] self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -694,7 +832,7 @@ def __init__( self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) self.resblock = resblock self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_dilation_sizes = resblock_dilation_sizes @@ -712,7 +850,7 @@ def __init__( n_heads, n_layers, kernel_size, - p_dropout, + float(p_dropout), ) self.dec = GeneratorNSF( inter_channels, @@ -751,6 +889,33 @@ def remove_weight_norm(self): self.flow.remove_weight_norm() self.enc_q.remove_weight_norm() + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. 
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore def forward( self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds ): # 这里ds是id,[bs,1] @@ -768,15 +933,24 @@ def forward( o = self.dec(z_slice, pitchf, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + nsff0: torch.Tensor, + sid: torch.Tensor, + rate: Optional[torch.Tensor] = None, + ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] - nsff0 = nsff0[:, -head:] + if rate is not None: + head = int(z_p.shape[2] * (1.0 - rate.item())) + z_p = z_p[:, :, head:] + x_mask = x_mask[:, :, head:] + nsff0 = nsff0[:, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -805,7 +979,7 @@ def __init__( sr=None, **kwargs ): - super().__init__() + super(SynthesizerTrnMs256NSFsid_nono, self).__init__() self.spec_channels = 
spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels @@ -813,7 +987,7 @@ def __init__( self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) self.resblock = resblock self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_dilation_sizes = resblock_dilation_sizes @@ -831,7 +1005,7 @@ def __init__( n_heads, n_layers, kernel_size, - p_dropout, + float(p_dropout), f0=False, ) self.dec = Generator( @@ -869,6 +1043,33 @@ def remove_weight_norm(self): self.flow.remove_weight_norm() self.enc_q.remove_weight_norm() + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) @@ -880,14 +1081,22 @@ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[b o = 
self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, rate=None): + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + sid: torch.Tensor, + rate: Optional[torch.Tensor] = None, + ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] + if rate is not None: + head = int(z_p.shape[2] * (1.0 - rate.item())) + z_p = z_p[:, :, head:] + x_mask = x_mask[:, :, head:] + nsff0 = nsff0[:, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -916,7 +1125,7 @@ def __init__( sr=None, **kwargs ): - super().__init__() + super(self, SynthesizerTrnMs768NSFsid_nono).__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels @@ -924,7 +1133,7 @@ def __init__( self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) self.resblock = resblock self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_dilation_sizes = resblock_dilation_sizes @@ -942,7 +1151,7 @@ def __init__( n_heads, n_layers, kernel_size, - p_dropout, + float(p_dropout), f0=False, ) self.dec = Generator( @@ -980,6 +1189,33 @@ def remove_weight_norm(self): self.flow.remove_weight_norm() self.enc_q.remove_weight_norm() + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. 
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) @@ -991,14 +1227,22 @@ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[b o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, rate=None): + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + sid: torch.Tensor, + rate: Optional[torch.Tensor] = None, + ): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] + if rate is not None: + head = int(z_p.shape[2] * (1.0 - rate.item())) + z_p = z_p[:, :, head:] + x_mask = x_mask[:, :, head:] + nsff0 = nsff0[:, head:] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py index 
3e99763..ff60414 100644 --- a/infer/lib/infer_pack/models_onnx.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -551,7 +551,7 @@ def __init__( gin_channels, sr, version, - **kwargs + **kwargs, ): super().__init__() if type(sr) == type("strr"): @@ -621,10 +621,7 @@ def __init__( self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) self.speaker_map = None logger.debug( - "gin_channels: " - + gin_channels - + ", self.spk_embed_dim: " - + self.spk_embed_dim + f"gin_channels: {gin_channels}, self.spk_embed_dim: {self.spk_embed_dim}" ) def remove_weight_norm(self): diff --git a/infer/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py index edf2207..51aeaf0 100644 --- a/infer/lib/infer_pack/modules.py +++ b/infer/lib/infer_pack/modules.py @@ -1,5 +1,6 @@ import copy import math +from typing import Optional, Tuple import numpy as np import scipy @@ -18,7 +19,7 @@ class LayerNorm(nn.Module): def __init__(self, channels, eps=1e-5): - super().__init__() + super(LayerNorm, self).__init__() self.channels = channels self.eps = eps @@ -41,13 +42,13 @@ def __init__( n_layers, p_dropout, ): - super().__init__() + super(ConvReluNorm, self).__init__() self.in_channels = in_channels self.hidden_channels = hidden_channels self.out_channels = out_channels self.kernel_size = kernel_size self.n_layers = n_layers - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) assert n_layers > 1, "Number of layers should be larger than 0." 
self.conv_layers = nn.ModuleList() @@ -58,7 +59,7 @@ def __init__( ) ) self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout))) for _ in range(n_layers - 1): self.conv_layers.append( nn.Conv1d( @@ -89,13 +90,13 @@ class DDSConv(nn.Module): """ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): - super().__init__() + super(DDSConv, self).__init__() self.channels = channels self.kernel_size = kernel_size self.n_layers = n_layers - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) - self.drop = nn.Dropout(p_dropout) + self.drop = nn.Dropout(float(p_dropout)) self.convs_sep = nn.ModuleList() self.convs_1x1 = nn.ModuleList() self.norms_1 = nn.ModuleList() @@ -117,7 +118,7 @@ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): self.norms_1.append(LayerNorm(channels)) self.norms_2.append(LayerNorm(channels)) - def forward(self, x, x_mask, g=None): + def forward(self, x, x_mask, g: Optional[torch.Tensor] = None): if g is not None: x = x + g for i in range(self.n_layers): @@ -149,11 +150,11 @@ def __init__( self.dilation_rate = dilation_rate self.n_layers = n_layers self.gin_channels = gin_channels - self.p_dropout = p_dropout + self.p_dropout = float(p_dropout) self.in_layers = torch.nn.ModuleList() self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) + self.drop = nn.Dropout(float(p_dropout)) if gin_channels != 0: cond_layer = torch.nn.Conv1d( @@ -184,15 +185,19 @@ def __init__( res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") self.res_skip_layers.append(res_skip_layer) - def forward(self, x, x_mask, g=None, **kwargs): + def forward( + self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None + ): output = torch.zeros_like(x) n_channels_tensor = torch.IntTensor([self.hidden_channels]) if g is not None: g = 
self.cond_layer(g) - for i in range(self.n_layers): - x_in = self.in_layers[i](x) + for i, (in_layer, res_skip_layer) in enumerate( + zip(self.in_layers, self.res_skip_layers) + ): + x_in = in_layer(x) if g is not None: cond_offset = i * 2 * self.hidden_channels g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] @@ -202,7 +207,7 @@ def forward(self, x, x_mask, g=None, **kwargs): acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) acts = self.drop(acts) - res_skip_acts = self.res_skip_layers[i](acts) + res_skip_acts = res_skip_layer(acts) if i < self.n_layers - 1: res_acts = res_skip_acts[:, : self.hidden_channels, :] x = (x + res_acts) * x_mask @@ -219,6 +224,30 @@ def remove_weight_norm(self): for l in self.res_skip_layers: torch.nn.utils.remove_weight_norm(l) + def __prepare_scriptable__(self): + if self.gin_channels != 0: + for hook in self.cond_layer._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): @@ -294,14 +323,15 @@ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): ] ) self.convs2.apply(init_weights) + self.lrelu_slope = LRELU_SLOPE - def forward(self, x, x_mask=None): + def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None): for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, 
LRELU_SLOPE) + xt = F.leaky_relu(x, self.lrelu_slope) if x_mask is not None: xt = xt * x_mask xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = F.leaky_relu(xt, self.lrelu_slope) if x_mask is not None: xt = xt * x_mask xt = c2(xt) @@ -316,6 +346,23 @@ def remove_weight_norm(self): for l in self.convs2: remove_weight_norm(l) + def __prepare_scriptable__(self): + for l in self.convs1: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.convs2: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): @@ -345,10 +392,11 @@ def __init__(self, channels, kernel_size=3, dilation=(1, 3)): ] ) self.convs.apply(init_weights) + self.lrelu_slope = LRELU_SLOPE - def forward(self, x, x_mask=None): + def forward(self, x, x_mask: Optional[torch.Tensor] = None): for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) + xt = F.leaky_relu(x, self.lrelu_slope) if x_mask is not None: xt = xt * x_mask xt = c(xt) @@ -361,9 +409,25 @@ def remove_weight_norm(self): for l in self.convs: remove_weight_norm(l) + def __prepare_scriptable__(self): + for l in self.convs: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: if not reverse: y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 
logdet = torch.sum(-y, [1, 2]) @@ -374,18 +438,27 @@ def forward(self, x, x_mask, reverse=False, **kwargs): class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): + # torch.jit.script() Compiled functions \ + # can't take variable number of arguments or \ + # use keyword-only arguments with defaults + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: x = torch.flip(x, [1]) if not reverse: logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) return x, logdet else: - return x + return x, torch.zeros([1], device=x.device) class ElementwiseAffine(nn.Module): def __init__(self, channels): - super().__init__() + super(ElementwiseAffine, self).__init__() self.channels = channels self.m = nn.Parameter(torch.zeros(channels, 1)) self.logs = nn.Parameter(torch.zeros(channels, 1)) @@ -414,7 +487,7 @@ def __init__( mean_only=False, ): assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() + super(ResidualCouplingLayer, self).__init__() self.channels = channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size @@ -429,14 +502,20 @@ def __init__( kernel_size, dilation_rate, n_layers, - p_dropout=p_dropout, + p_dropout=float(p_dropout), gin_channels=gin_channels, ) self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) self.post.weight.data.zero_() self.post.bias.data.zero_() - def forward(self, x, x_mask, g=None, reverse=False): + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): x0, x1 = torch.split(x, [self.half_channels] * 2, 1) h = self.pre(x0) * x_mask h = self.enc(h, x_mask, g=g) @@ -455,11 +534,20 @@ def forward(self, x, x_mask, g=None, reverse=False): else: x1 = (x1 - m) * torch.exp(-logs) * x_mask x = torch.cat([x0, x1], 1) - return x + return x, 
torch.zeros([1]) def remove_weight_norm(self): self.enc.remove_weight_norm() + def __prepare_scriptable__(self): + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self + class ConvFlow(nn.Module): def __init__( @@ -471,7 +559,7 @@ def __init__( num_bins=10, tail_bound=5.0, ): - super().__init__() + super(ConvFlow, self).__init__() self.in_channels = in_channels self.filter_channels = filter_channels self.kernel_size = kernel_size @@ -488,7 +576,13 @@ def __init__( self.proj.weight.data.zero_() self.proj.bias.data.zero_() - def forward(self, x, x_mask, g=None, reverse=False): + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse=False, + ): x0, x1 = torch.split(x, [self.half_channels] * 2, 1) h = self.pre(x0) h = self.convs(h, x_mask, g=g) diff --git a/infer/lib/jit/__init__.py b/infer/lib/jit/__init__.py new file mode 100644 index 0000000..d7f41dd --- /dev/null +++ b/infer/lib/jit/__init__.py @@ -0,0 +1,163 @@ +from io import BytesIO +import pickle +import time +import torch +from tqdm import tqdm +from collections import OrderedDict + + +def load_inputs(path, device, is_half=False): + parm = torch.load(path, map_location=torch.device("cpu")) + for key in parm.keys(): + parm[key] = parm[key].to(device) + if is_half and parm[key].dtype == torch.float32: + parm[key] = parm[key].half() + elif not is_half and parm[key].dtype == torch.float16: + parm[key] = parm[key].float() + return parm + + +def benchmark( + model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False +): + parm = load_inputs(inputs_path, device, is_half) + total_ts = 0.0 + bar = tqdm(range(epoch)) + for i in bar: + start_time = time.perf_counter() + o = model(**parm) + total_ts += time.perf_counter() - start_time + print(f"num_epoch: {epoch} | avg time(ms): 
{(total_ts*1000)/epoch}") + + +def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False): + benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half) + + +def to_jit_model( + model_path, + model_type: str, + mode: str = "trace", + inputs_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + model = None + if model_type.lower() == "synthesizer": + from .get_synthesizer import get_synthesizer + + model, _ = get_synthesizer(model_path, device) + model.forward = model.infer + elif model_type.lower() == "rmvpe": + from .get_rmvpe import get_rmvpe + + model = get_rmvpe(model_path, device) + elif model_type.lower() == "hubert": + from .get_hubert import get_hubert_model + + model = get_hubert_model(model_path, device) + model.forward = model.infer + else: + raise ValueError(f"No model type named {model_type}") + model = model.eval() + model = model.half() if is_half else model.float() + if mode == "trace": + assert not inputs_path + inputs = load_inputs(inputs_path, device, is_half) + model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) + elif mode == "script": + model_jit = torch.jit.script(model) + model_jit.to(device) + model_jit = model_jit.half() if is_half else model_jit.float() + # model = model.half() if is_half else model.float() + return (model, model_jit) + + +def export( + model: torch.nn.Module, + mode: str = "trace", + inputs: dict = None, + device=torch.device("cpu"), + is_half: bool = False, +) -> dict: + model = model.half() if is_half else model.float() + model.eval() + if mode == "trace": + assert inputs is not None + model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) + elif mode == "script": + model_jit = torch.jit.script(model) + model_jit.to(device) + model_jit = model_jit.half() if is_half else model_jit.float() + buffer = BytesIO() + # model_jit=model_jit.cpu() + torch.jit.save(model_jit, buffer) + del model_jit + cpt = OrderedDict() + cpt["model"] = buffer.getvalue() 
+ cpt["is_half"] = is_half + return cpt + + +def load(path: str): + with open(path, "rb") as f: + return pickle.load(f) + + +def save(ckpt: dict, save_path: str): + with open(save_path, "wb") as f: + pickle.dump(ckpt, f) + + +def rmvpe_jit_export( + model_path: str, + mode: str = "script", + inputs_path: str = None, + save_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + if not save_path: + save_path = model_path.rstrip(".pth") + save_path += ".half.jit" if is_half else ".jit" + if "cuda" in str(device) and ":" not in str(device): + device = torch.device("cuda:0") + from .get_rmvpe import get_rmvpe + + model = get_rmvpe(model_path, device) + inputs = None + if mode == "trace": + inputs = load_inputs(inputs_path, device, is_half) + ckpt = export(model, mode, inputs, device, is_half) + ckpt["device"] = str(device) + save(ckpt, save_path) + return ckpt + + +def synthesizer_jit_export( + model_path: str, + mode: str = "script", + inputs_path: str = None, + save_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + if not save_path: + save_path = model_path.rstrip(".pth") + save_path += ".half.jit" if is_half else ".jit" + if "cuda" in str(device) and ":" not in str(device): + device = torch.device("cuda:0") + from .get_synthesizer import get_synthesizer + + model, cpt = get_synthesizer(model_path, device) + assert isinstance(cpt, dict) + model.forward = model.infer + inputs = None + if mode == "trace": + inputs = load_inputs(inputs_path, device, is_half) + ckpt = export(model, mode, inputs, device, is_half) + cpt.pop("weight") + cpt["model"] = ckpt["model"] + cpt["device"] = device + save(cpt, save_path) + return cpt diff --git a/infer/lib/jit/get_hubert.py b/infer/lib/jit/get_hubert.py new file mode 100644 index 0000000..aec7132 --- /dev/null +++ b/infer/lib/jit/get_hubert.py @@ -0,0 +1,342 @@ +import math +import random +from typing import Optional, Tuple +from fairseq.checkpoint_utils import load_model_ensemble_and_task +import 
numpy as np +import torch +import torch.nn.functional as F + +# from fairseq.data.data_utils import compute_mask_indices +from fairseq.utils import index_put + + +# @torch.jit.script +def pad_to_multiple(x, multiple, dim=-1, value=0): + # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41 + if x is None: + return None, 0 + tsz = x.size(dim) + m = tsz / multiple + remainder = math.ceil(m) * multiple - tsz + if int(tsz % multiple) == 0: + return x, 0 + pad_offset = (0,) * (-1 - dim) * 2 + + return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder + + +def extract_features( + self, + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, +): + if padding_mask is not None: + x = index_put(x, padding_mask, 0) + + x_conv = self.pos_conv(x.transpose(1, 2)) + x_conv = x_conv.transpose(1, 2) + x = x + x_conv + + if not self.layer_norm_first: + x = self.layer_norm(x) + + # pad to the sequence length dimension + x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0) + if pad_length > 0 and padding_mask is None: + padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool) + padding_mask[:, -pad_length:] = True + else: + padding_mask, _ = pad_to_multiple( + padding_mask, self.required_seq_len_multiple, dim=-1, value=True + ) + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + layer_results = [] + r = None + for i, layer in enumerate(self.layers): + dropout_probability = np.random.random() if self.layerdrop > 0 else 1 + if not self.training or (dropout_probability > self.layerdrop): + x, (z, lr) = layer( + x, self_attn_padding_mask=padding_mask, need_weights=False + ) + if i >= min_layer: + layer_results.append((x, z, lr)) + if i == tgt_layer: + r = x + break + + if r is not None: + x = r + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + # undo paddding + if pad_length > 0: + x = x[:, :-pad_length] 
+ + def undo_pad(a, b, c): + return ( + a[:-pad_length], + b[:-pad_length] if b is not None else b, + c[:-pad_length], + ) + + layer_results = [undo_pad(*u) for u in layer_results] + + return x, layer_results + + +def compute_mask_indices( + shape: Tuple[int, int], + padding_mask: Optional[torch.Tensor], + mask_prob: float, + mask_length: int, + mask_type: str = "static", + mask_other: float = 0.0, + min_masks: int = 0, + no_overlap: bool = False, + min_space: int = 0, + require_same_masks: bool = True, + mask_dropout: float = 0.0, +) -> torch.Tensor: + """ + Computes random mask spans for a given shape + + Args: + shape: the the shape for which to compute masks. + should be of size 2 where first element is batch size and 2nd is timesteps + padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements + mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by + number of timesteps divided by length of mask span to mask approximately this percentage of all elements. + however due to overlaps, the actual number will be smaller (unless no_overlap is True) + mask_type: how to compute mask lengths + static = fixed size + uniform = sample from uniform distribution [mask_other, mask_length*2] + normal = sample from normal distribution with mean mask_length and stdev mask_other. 
mask is min 1 element + poisson = sample from possion distribution with lambda = mask length + min_masks: minimum number of masked spans + no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping + min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans + require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample + mask_dropout: randomly dropout this percentage of masks in each example + """ + + bsz, all_sz = shape + mask = torch.full((bsz, all_sz), False) + + all_num_mask = int( + # add a random number for probabilistic rounding + mask_prob * all_sz / float(mask_length) + + torch.rand([1]).item() + ) + + all_num_mask = max(min_masks, all_num_mask) + + mask_idcs = [] + for i in range(bsz): + if padding_mask is not None: + sz = all_sz - padding_mask[i].long().sum().item() + num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand()) + num_mask = max(min_masks, num_mask) + else: + sz = all_sz + num_mask = all_num_mask + + if mask_type == "static": + lengths = torch.full([num_mask], mask_length) + elif mask_type == "uniform": + lengths = torch.randint(mask_other, mask_length * 2 + 1, size=[num_mask]) + elif mask_type == "normal": + lengths = torch.normal(mask_length, mask_other, size=[num_mask]) + lengths = [max(1, int(round(x))) for x in lengths] + else: + raise Exception("unknown mask selection " + mask_type) + + if sum(lengths) == 0: + lengths[0] = min(mask_length, sz - 1) + + if no_overlap: + mask_idc = [] + + def arrange(s, e, length, keep_length): + span_start = torch.randint(low=s, high=e - length, size=[1]).item() + mask_idc.extend(span_start + i for i in range(length)) + + new_parts = [] + if span_start - s - min_space >= keep_length: + new_parts.append((s, span_start - min_space + 1)) + if e - span_start - length - min_space > keep_length: + new_parts.append((span_start + length + min_space, e)) + return 
new_parts + + parts = [(0, sz)] + min_length = min(lengths) + for length in sorted(lengths, reverse=True): + t = [e - s if e - s >= length + min_space else 0 for s, e in parts] + lens = torch.asarray(t, dtype=torch.int) + l_sum = torch.sum(lens) + if l_sum == 0: + break + probs = lens / torch.sum(lens) + c = torch.multinomial(probs.float(), len(parts)).item() + s, e = parts.pop(c) + parts.extend(arrange(s, e, length, min_length)) + mask_idc = torch.asarray(mask_idc) + else: + min_len = min(lengths) + if sz - min_len <= num_mask: + min_len = sz - num_mask - 1 + mask_idc = torch.asarray( + random.sample([i for i in range(sz - min_len)], num_mask) + ) + mask_idc = torch.asarray( + [ + mask_idc[j] + offset + for j in range(len(mask_idc)) + for offset in range(lengths[j]) + ] + ) + + mask_idcs.append(torch.unique(mask_idc[mask_idc < sz])) + + min_len = min([len(m) for m in mask_idcs]) + for i, mask_idc in enumerate(mask_idcs): + if isinstance(mask_idc, torch.Tensor): + mask_idc = torch.asarray(mask_idc, dtype=torch.float) + if len(mask_idc) > min_len and require_same_masks: + mask_idc = torch.asarray( + random.sample([i for i in range(mask_idc)], min_len) + ) + if mask_dropout > 0: + num_holes = int(round(len(mask_idc) * mask_dropout)) + mask_idc = torch.asarray( + random.sample([i for i in range(mask_idc)], len(mask_idc) - num_holes) + ) + + mask[i, mask_idc.int()] = True + + return mask + + +def apply_mask(self, x, padding_mask, target_list): + B, T, C = x.shape + torch.zeros_like(x) + if self.mask_prob > 0: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=2, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + ) + mask_indices = mask_indices.to(x.device) + x[mask_indices] = self.mask_emb + else: + mask_indices = None + + if self.mask_channel_prob > 0: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + 
self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + mask_channel_indices.to(x.device).unsqueeze(1).expand(-1, T, -1) + ) + x[mask_channel_indices] = 0 + + return x, mask_indices + + +def get_hubert_model( + model_path="assets/hubert/hubert_base.pt", device=torch.device("cpu") +): + models, _, _ = load_model_ensemble_and_task( + [model_path], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(device) + + def _apply_mask(x, padding_mask, target_list): + return apply_mask(hubert_model, x, padding_mask, target_list) + + hubert_model.apply_mask = _apply_mask + + def _extract_features( + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, + ): + return extract_features( + hubert_model.encoder, + x, + padding_mask=padding_mask, + tgt_layer=tgt_layer, + min_layer=min_layer, + ) + + hubert_model.encoder.extract_features = _extract_features + + hubert_model._forward = hubert_model.forward + + def hubert_extract_features( + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = False, + ret_conv: bool = False, + output_layer: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + res = self._forward( + source, + padding_mask=padding_mask, + mask=mask, + features_only=True, + output_layer=output_layer, + ) + feature = res["features"] if ret_conv else res["x"] + return feature, res["padding_mask"] + + def _hubert_extract_features( + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = False, + ret_conv: bool = False, + output_layer: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + return hubert_extract_features( + hubert_model, source, padding_mask, mask, ret_conv, output_layer + ) + + hubert_model.extract_features = _hubert_extract_features + + def infer(source, padding_mask, output_layer: torch.Tensor): + 
output_layer = output_layer.item() + logits = hubert_model.extract_features( + source=source, padding_mask=padding_mask, output_layer=output_layer + ) + feats = hubert_model.final_proj(logits[0]) if output_layer == 9 else logits[0] + return feats + + hubert_model.infer = infer + # hubert_model.forward=infer + # hubert_model.forward + + return hubert_model diff --git a/infer/lib/jit/get_rmvpe.py b/infer/lib/jit/get_rmvpe.py new file mode 100644 index 0000000..e71c39f --- /dev/null +++ b/infer/lib/jit/get_rmvpe.py @@ -0,0 +1,12 @@ +import torch + + +def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): + from infer.lib.rmvpe import E2E + + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location=device) + model.load_state_dict(ckpt) + model.eval() + model = model.to(device) + return model diff --git a/infer/lib/jit/get_synthesizer.py b/infer/lib/jit/get_synthesizer.py new file mode 100644 index 0000000..ef5fe58 --- /dev/null +++ b/infer/lib/jit/get_synthesizer.py @@ -0,0 +1,37 @@ +import torch + + +def get_synthesizer(pth_path, device=torch.device("cpu")): + from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, + ) + + cpt = torch.load(pth_path, map_location=torch.device("cpu")) + # tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + if_f0 = cpt.get("f0", 1) + version = cpt.get("version", "v1") + if version == "v1": + if if_f0 == 1: + net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif version == "v2": + if if_f0 == 1: + net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False) + else: + net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + del net_g.enc_q + # net_g.forward = net_g.infer + # ckpt = {} + # ckpt["config"] = cpt["config"] + # ckpt["f0"] = if_f0 + # ckpt["version"] = 
version + # ckpt["info"] = cpt.get("info", "0epoch") + net_g.load_state_dict(cpt["weight"], strict=False) + net_g = net_g.float() + net_g.eval().to(device) + return net_g, cpt diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index d305b53..eb64a6e 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -1,8 +1,11 @@ -import pdb, os - +from io import BytesIO +import os +from typing import List, Optional, Tuple import numpy as np import torch +from infer.lib import jit + try: # Fix "Torch not compiled with CUDA enabled" import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import @@ -23,58 +26,6 @@ logger = logging.getLogger(__name__) -###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py -def window_sumsquare( - window, - n_frames, - hop_length=200, - win_length=800, - n_fft=800, - dtype=np.float32, - norm=None, -): - """ - # from librosa 0.6 - Compute the sum-square envelope of a window function at a given hop length. - This is used to estimate modulation effects induced by windowing - observations in short-time fourier transforms. - Parameters - ---------- - window : string, tuple, number, callable, or list-like - Window specification, as in `get_window` - n_frames : int > 0 - The number of analysis frames - hop_length : int > 0 - The number of samples to advance between frames - win_length : [optional] - The length of the window function. By default, this matches `n_fft`. - n_fft : int > 0 - The length of each analysis frame. 
- dtype : np.dtype - The data type of the output - Returns - ------- - wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` - The sum-squared envelope of the window function - """ - if win_length is None: - win_length = n_fft - - n = n_fft + hop_length * (n_frames - 1) - x = np.zeros(n, dtype=dtype) - - # Compute the squared window at the desired length - win_sq = get_window(window, win_length, fftbins=True) - win_sq = normalize(win_sq, norm=norm) ** 2 - win_sq = pad_center(win_sq, n_fft) - - # Fill the envelope - for i in range(n_frames): - sample = i * hop_length - x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] - return x - - class STFT(torch.nn.Module): def __init__( self, filter_length=1024, hop_length=512, win_length=None, window="hann" @@ -101,17 +52,14 @@ def __init__( self.window = window self.forward_transform = None self.pad_amount = int(self.filter_length / 2) - scale = self.filter_length / self.hop_length fourier_basis = np.fft.fft(np.eye(self.filter_length)) cutoff = int((self.filter_length / 2 + 1)) fourier_basis = np.vstack( [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] ) - forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) - inverse_basis = torch.FloatTensor( - np.linalg.pinv(scale * fourier_basis).T[:, None, :] - ) + forward_basis = torch.FloatTensor(fourier_basis) + inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis)) assert filter_length >= self.win_length # get window and zero center pad it to filter_length @@ -121,12 +69,13 @@ def __init__( # window the bases forward_basis *= fft_window - inverse_basis *= fft_window + inverse_basis = (inverse_basis.T * fft_window).T self.register_buffer("forward_basis", forward_basis.float()) self.register_buffer("inverse_basis", inverse_basis.float()) + self.register_buffer("fft_window", fft_window.float()) - def transform(self, input_data): + def transform(self, input_data, return_phase=False): """Take input data 
(audio) to STFT domain. Arguments: @@ -138,33 +87,24 @@ def transform(self, input_data): phase {tensor} -- Phase of STFT with shape (num_batch, num_frequencies, num_frames) """ - num_batches = input_data.shape[0] - num_samples = input_data.shape[-1] - - self.num_samples = num_samples - - # similar to librosa, reflect-pad the input - input_data = input_data.view(num_batches, 1, num_samples) - # print(1234,input_data.shape) input_data = F.pad( - input_data.unsqueeze(1), - (self.pad_amount, self.pad_amount, 0, 0, 0, 0), + input_data, + (self.pad_amount, self.pad_amount), mode="reflect", - ).squeeze(1) - # print(2333,input_data.shape,self.forward_basis.shape,self.hop_length) - # pdb.set_trace() - forward_transform = F.conv1d( - input_data, self.forward_basis, stride=self.hop_length, padding=0 ) - + forward_transform = input_data.unfold( + 1, self.filter_length, self.hop_length + ).permute(0, 2, 1) + forward_transform = torch.matmul(self.forward_basis, forward_transform) cutoff = int((self.filter_length / 2) + 1) real_part = forward_transform[:, :cutoff, :] imag_part = forward_transform[:, cutoff:, :] - magnitude = torch.sqrt(real_part**2 + imag_part**2) - # phase = torch.atan2(imag_part.data, real_part.data) - - return magnitude # , phase + if return_phase: + phase = torch.atan2(imag_part.data, real_part.data) + return magnitude, phase + else: + return magnitude def inverse(self, magnitude, phase): """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced @@ -180,42 +120,25 @@ def inverse(self, magnitude, phase): inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. 
Of shape (num_batch, num_samples) """ - recombine_magnitude_phase = torch.cat( + cat = torch.cat( [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 ) - - inverse_transform = F.conv_transpose1d( - recombine_magnitude_phase, - self.inverse_basis, - stride=self.hop_length, - padding=0, + fold = torch.nn.Fold( + output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length), + kernel_size=(1, self.filter_length), + stride=(1, self.hop_length), ) - - if self.window is not None: - window_sum = window_sumsquare( - self.window, - magnitude.size(-1), - hop_length=self.hop_length, - win_length=self.win_length, - n_fft=self.filter_length, - dtype=np.float32, - ) - # remove modulation effects - approx_nonzero_indices = torch.from_numpy( - np.where(window_sum > tiny(window_sum))[0] - ) - window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) - inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ - approx_nonzero_indices - ] - - # scale by hop ratio - inverse_transform *= float(self.filter_length) / self.hop_length - - inverse_transform = inverse_transform[..., self.pad_amount :] - inverse_transform = inverse_transform[..., : self.num_samples] - inverse_transform = inverse_transform.squeeze(1) - + inverse_transform = torch.matmul(self.inverse_basis, cat) + inverse_transform = fold(inverse_transform)[ + :, 0, 0, self.pad_amount : -self.pad_amount + ] + window_square_sum = ( + self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0) + ) + window_square_sum = fold(window_square_sum)[ + :, 0, 0, self.pad_amount : -self.pad_amount + ] + inverse_transform /= window_square_sum return inverse_transform def forward(self, input_data): @@ -228,7 +151,7 @@ def forward(self, input_data): reconstruction {tensor} -- Reconstructed audio given magnitude and phase. 
Of shape (num_batch, num_samples) """ - self.magnitude, self.phase = self.transform(input_data) + self.magnitude, self.phase = self.transform(input_data, return_phase=True) reconstruction = self.inverse(self.magnitude, self.phase) return reconstruction @@ -276,17 +199,15 @@ def __init__(self, in_channels, out_channels, momentum=0.01): nn.BatchNorm2d(out_channels, momentum=momentum), nn.ReLU(), ) + # self.shortcut:Optional[nn.Module] = None if in_channels != out_channels: self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) - self.is_shortcut = True - else: - self.is_shortcut = False - def forward(self, x): - if self.is_shortcut: - return self.conv(x) + self.shortcut(x) - else: + def forward(self, x: torch.Tensor): + if not hasattr(self, "shortcut"): return self.conv(x) + x + else: + return self.conv(x) + self.shortcut(x) class Encoder(nn.Module): @@ -318,12 +239,12 @@ def __init__( self.out_size = in_size self.out_channel = out_channels - def forward(self, x): - concat_tensors = [] + def forward(self, x: torch.Tensor): + concat_tensors: List[torch.Tensor] = [] x = self.bn(x) - for i in range(self.n_encoders): - _, x = self.layers[i](x) - concat_tensors.append(_) + for i, layer in enumerate(self.layers): + t, x = layer(x) + concat_tensors.append(t) return x, concat_tensors @@ -342,8 +263,8 @@ def __init__( self.pool = nn.AvgPool2d(kernel_size=kernel_size) def forward(self, x): - for i in range(self.n_blocks): - x = self.conv[i](x) + for i, conv in enumerate(self.conv): + x = conv(x) if self.kernel_size is not None: return x, self.pool(x) else: @@ -364,8 +285,8 @@ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01) ) def forward(self, x): - for i in range(self.n_inters): - x = self.layers[i](x) + for i, layer in enumerate(self.layers): + x = layer(x) return x @@ -395,8 +316,8 @@ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01) def forward(self, x, concat_tensor): x = self.conv1(x) x = torch.cat((x, 
concat_tensor), dim=1) - for i in range(self.n_blocks): - x = self.conv2[i](x) + for i, conv2 in enumerate(self.conv2): + x = conv2(x) return x @@ -412,9 +333,9 @@ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): ) in_channels = out_channels - def forward(self, x, concat_tensors): - for i in range(self.n_decoders): - x = self.layers[i](x, concat_tensors[-1 - i]) + def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]): + for i, layer in enumerate(self.layers): + x = layer(x, concat_tensors[-1 - i]) return x @@ -442,7 +363,7 @@ def __init__( self.encoder.out_channel, en_de_layers, kernel_size, n_blocks ) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x, concat_tensors = self.encoder(x) x = self.intermediate(x) x = self.decoder(x, concat_tensors) @@ -536,33 +457,28 @@ def forward(self, audio, keyshift=0, speed=1, center=True): keyshift_key = str(keyshift) + "_" + str(audio.device) if keyshift_key not in self.hann_window: self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( - # "cpu"if(audio.device.type=="privateuseone") else audio.device audio.device ) - # fft = torch.stft(#doesn't support pytorch_dml - # # audio.cpu() if(audio.device.type=="privateuseone")else audio, - # audio, - # n_fft=n_fft_new, - # hop_length=hop_length_new, - # win_length=win_length_new, - # window=self.hann_window[keyshift_key], - # center=center, - # return_complex=True, - # ) - # magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) - # print(1111111111) - # print(222222222222222,audio.device,self.is_half) - if hasattr(self, "stft") == False: - # print(n_fft_new,hop_length_new,win_length_new,audio.shape) - self.stft = STFT( - filter_length=n_fft_new, + if "privateuseone" in str(audio.device): + if not hasattr(self, "stft"): + self.stft = STFT( + filter_length=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window="hann", + ).to(audio.device) + magnitude = 
self.stft.transform(audio) + else: + fft = torch.stft( + audio, + n_fft=n_fft_new, hop_length=hop_length_new, win_length=win_length_new, - window="hann", - ).to(audio.device) - magnitude = self.stft.transform(audio) # phase - # if (audio.device.type == "privateuseone"): - # magnitude=magnitude.to(audio.device) + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) if keyshift != 0: size = self.n_fft // 2 + 1 resize = magnitude.size(1) @@ -573,17 +489,16 @@ def forward(self, audio, keyshift=0, speed=1, center=True): if self.is_half == True: mel_output = mel_output.half() log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) - # print(log_mel_spec.device.type) return log_mel_spec class RMVPE: - def __init__(self, model_path, is_half, device=None): + def __init__(self, model_path: str, is_half, device=None, use_jit=False): self.resample_kernel = {} self.resample_kernel = {} self.is_half = is_half if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" + device = "cuda:0" if torch.cuda.is_available() else "cpu" self.device = device self.mel_extractor = MelSpectrogram( is_half, 128, 16000, 1024, 160, None, 30, 8000 @@ -597,13 +512,56 @@ def __init__(self, model_path, is_half, device=None): ) self.model = ort_session else: - model = E2E(4, 1, (2, 2)) - ckpt = torch.load(model_path, map_location="cpu") - model.load_state_dict(ckpt) - model.eval() - if is_half == True: - model = model.half() - self.model = model + if str(self.device) == "cuda": + self.device = torch.device("cuda:0") + + def get_jit_model(): + jit_model_path = model_path.rstrip(".pth") + jit_model_path += ".half.jit" if is_half else ".jit" + reload = False + if os.path.exists(jit_model_path): + ckpt = jit.load(jit_model_path) + model_device = ckpt["device"] + if model_device != str(self.device): + reload = True + else: + reload = True + + if reload: + ckpt = jit.rmvpe_jit_export( + 
model_path=model_path, + mode="script", + inputs_path=None, + save_path=jit_model_path, + device=device, + is_half=is_half, + ) + model = torch.jit.load(BytesIO(ckpt["model"]), map_location=device) + return model + + def get_default_model(): + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + if is_half: + model = model.half() + else: + model = model.float() + return model + + if use_jit: + if is_half and "cpu" in str(self.device): + logger.warning( + "Use default rmvpe model. \ + Jit is not supported on the CPU for half floating point" + ) + self.model = get_default_model() + else: + self.model = get_jit_model() + else: + self.model = get_default_model() + self.model = self.model.to(device) cents_mapping = 20 * np.arange(360) + 1997.3794084376191 self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 @@ -611,9 +569,9 @@ def __init__(self, model_path, is_half, device=None): def mel2hidden(self, mel): with torch.no_grad(): n_frames = mel.shape[-1] - mel = F.pad( - mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="constant" - ) + n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames + if n_pad > 0: + mel = F.pad(mel, (0, n_pad), mode="constant") if "privateuseone" in str(self.device): onnx_input_name = self.model.get_inputs()[0].name onnx_outputs_names = self.model.get_outputs()[0].name @@ -622,6 +580,7 @@ def mel2hidden(self, mel): input_feed={onnx_input_name: mel.cpu().numpy()}, )[0] else: + mel = mel.half() if self.is_half else mel.float() hidden = self.model(mel) return hidden[:, :n_frames] diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index 763ad06..d58d768 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -104,14 +104,11 @@ def main(): os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = str(randint(20000, 55555)) children = [] + logger = utils.get_logger(hps.model_dir) for i in range(n_gpus): subproc = 
mp.Process( target=run, - args=( - i, - n_gpus, - hps, - ), + args=(i, n_gpus, hps, logger), ) children.append(subproc) subproc.start() @@ -120,10 +117,10 @@ def main(): children[i].join() -def run(rank, n_gpus, hps): +def run(rank, n_gpus, hps, logger: logging.Logger): global global_step if rank == 0: - logger = utils.get_logger(hps.model_dir) + # logger = utils.get_logger(hps.model_dir) logger.info(hps) # utils.check_git_hash(hps.model_dir) writer = SummaryWriter(log_dir=hps.model_dir) diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 094e307..ca2a4f9 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -1,10 +1,15 @@ +from io import BytesIO import os +import pickle import sys import traceback import logging -logger = logging.getLogger(__name__) +from infer.lib import jit +from infer.lib.jit.get_synthesizer import get_synthesizer +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) from time import time as ttime import fairseq @@ -31,17 +36,9 @@ from configs.config import Config -config = Config() +# config = Config() mm = M() -if config.dml == True: - - def forward_dml(ctx, x, scale): - ctx.scale = scale - res = x.clone().detach() - return res - - fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml # config.device=torch.device("cpu")########强制cpu测试 @@ -56,18 +53,27 @@ def __init__( n_cpu, inp_q, opt_q, - device, + config: Config, last_rvc=None, ) -> None: """ 初始化 """ try: - global config + if config.dml == True: + + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + # global config + self.config = config self.inp_q = inp_q self.opt_q = opt_q # device="cpu"########强制cpu测试 - self.device = device + self.device = config.device self.f0_up_key = key self.time_step = 160 / 16000 * 1000 self.f0_min = 50 @@ -77,11 +83,14 @@ def __init__( self.sr = 16000 self.window = 160 self.n_cpu = 
n_cpu + self.use_jit = self.config.use_jit + self.is_half = config.is_half + if index_rate != 0: self.index = faiss.read_index(index_path) self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) logger.info("Index search enabled") - self.pth_path = pth_path + self.pth_path: str = pth_path self.index_path = index_path self.index_rate = index_rate @@ -91,8 +100,8 @@ def __init__( suffix="", ) hubert_model = models[0] - hubert_model = hubert_model.to(device) - if config.is_half: + hubert_model = hubert_model.to(self.device) + if self.is_half: hubert_model = hubert_model.half() else: hubert_model = hubert_model.float() @@ -101,41 +110,75 @@ def __init__( else: self.model = last_rvc.model - if last_rvc is None or last_rvc.pth_path != self.pth_path: - cpt = torch.load(self.pth_path, map_location="cpu") + self.net_g: nn.Module = None + + def set_default_model(): + self.net_g, cpt = get_synthesizer(self.pth_path, self.device) self.tgt_sr = cpt["config"][-1] cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] self.if_f0 = cpt.get("f0", 1) self.version = cpt.get("version", "v1") - if self.version == "v1": - if self.if_f0 == 1: - self.net_g = SynthesizerTrnMs256NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif self.version == "v2": - if self.if_f0 == 1: - self.net_g = SynthesizerTrnMs768NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - self.net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del self.net_g.enc_q - logger.debug(self.net_g.load_state_dict(cpt["weight"], strict=False)) - self.net_g.eval().to(device) - # print(2333333333,device,config.device,self.device)#net_g是device,hubert是config.device - if config.is_half: + if self.is_half: self.net_g = self.net_g.half() else: self.net_g = self.net_g.float() - self.is_half = config.is_half + + def set_jit_model(): + jit_pth_path = self.pth_path.rstrip(".pth") + jit_pth_path += ".half.jit" if self.is_half else ".jit" + 
reload = False + if str(self.device) == "cuda": + self.device = torch.device("cuda:0") + if os.path.exists(jit_pth_path): + cpt = jit.load(jit_pth_path) + model_device = cpt["device"] + if model_device != str(self.device): + reload = True + else: + reload = True + + if reload: + cpt = jit.synthesizer_jit_export( + self.pth_path, + "script", + None, + device=self.device, + is_half=self.is_half, + ) + + self.tgt_sr = cpt["config"][-1] + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + self.net_g = torch.jit.load( + BytesIO(cpt["model"]), map_location=self.device + ) + self.net_g.infer = self.net_g.forward + self.net_g.eval().to(self.device) + + def set_synthesizer(): + if self.use_jit and not config.dml: + if self.is_half and "cpu" in str(self.device): + logger.warning( + "Use default Synthesizer model. \ + Jit is not supported on the CPU for half floating point" + ) + set_default_model() + else: + set_jit_model() + else: + set_default_model() + + if last_rvc is None or last_rvc.pth_path != self.pth_path: + set_synthesizer() else: self.tgt_sr = last_rvc.tgt_sr self.if_f0 = last_rvc.if_f0 self.version = last_rvc.version - self.net_g = last_rvc.net_g self.is_half = last_rvc.is_half + if last_rvc.use_jit != self.use_jit: + set_synthesizer() + else: + self.net_g = last_rvc.net_g if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): self.model_rmvpe = last_rvc.model_rmvpe @@ -275,6 +318,7 @@ def get_f0_rmvpe(self, x, f0_up_key): "assets/rmvpe/rmvpe.pt", is_half=self.is_half, device=self.device, ####正常逻辑 + use_jit=self.config.use_jit, ) # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) @@ -292,7 +336,7 @@ def infer( f0method, ) -> np.ndarray: feats = feats.view(1, -1) - if config.is_half: + if self.config.is_half: feats = feats.half() else: feats = feats.float() @@ -319,7 +363,7 @@ def infer( weight = np.square(1 / score) weight /= 
weight.sum(axis=1, keepdims=True) npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - if config.is_half: + if self.config.is_half: npy = npy.astype("float16") feats[0][-leng_replace_head:] = ( torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate @@ -358,12 +402,17 @@ def infer( if self.if_f0 == 1: # print(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) infered_audio = self.net_g.infer( - feats, p_len, cache_pitch, cache_pitchf, sid, rate + feats, + p_len, + cache_pitch, + cache_pitchf, + sid, + torch.FloatTensor([rate]), )[0][0, 0].data.float() else: - infered_audio = self.net_g.infer(feats, p_len, sid, rate)[0][ - 0, 0 - ].data.float() + infered_audio = self.net_g.infer( + feats, p_len, sid, torch.FloatTensor([rate]) + )[0][0, 0].data.float() t5 = ttime() logger.info( "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", diff --git a/tools/torchgate/torchgate.py b/tools/torchgate/torchgate.py index 086f2ab..e4b80c4 100644 --- a/tools/torchgate/torchgate.py +++ b/tools/torchgate/torchgate.py @@ -1,4 +1,5 @@ import torch +from infer.lib.rmvpe import STFT from torch.nn.functional import conv1d, conv2d from typing import Union, Optional from .utils import linspace, temperature_sigmoid, amp_to_db @@ -139,17 +140,26 @@ def _stationary_mask( are set to 1, and the rest are set to 0. 
""" if xn is not None: - XN = torch.stft( - xn, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=self.win_length, - return_complex=True, - pad_mode="constant", - center=True, - window=torch.hann_window(self.win_length).to(xn.device), - ) - + if "privateuseone" in str(xn.device): + if not hasattr(self, "stft"): + self.stft = STFT( + filter_length=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window="hann", + ).to(xn.device) + XN = self.stft.transform(xn) + else: + XN = torch.stft( + xn, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + return_complex=True, + pad_mode="constant", + center=True, + window=torch.hann_window(self.win_length).to(xn.device), + ) XN_db = amp_to_db(XN).to(dtype=X_db.dtype) else: XN_db = X_db @@ -211,25 +221,28 @@ def forward( Returns: torch.Tensor: The denoised audio signal, with the same shape as the input signal. """ - assert x.ndim == 2 - if x.shape[-1] < self.win_length * 2: - raise Exception(f"x must be bigger than {self.win_length * 2}") - - assert xn is None or xn.ndim == 1 or xn.ndim == 2 - if xn is not None and xn.shape[-1] < self.win_length * 2: - raise Exception(f"xn must be bigger than {self.win_length * 2}") # Compute short-time Fourier transform (STFT) - X = torch.stft( - x, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=self.win_length, - return_complex=True, - pad_mode="constant", - center=True, - window=torch.hann_window(self.win_length).to(x.device), - ) + if "privateuseone" in str(x.device): + if not hasattr(self, "stft"): + self.stft = STFT( + filter_length=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window="hann", + ).to(x.device) + X, phase = self.stft.transform(x, return_phase=True) + else: + X = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + return_complex=True, + pad_mode="constant", + center=True, + 
window=torch.hann_window(self.win_length).to(x.device), + ) # Compute signal mask based on stationary or nonstationary assumptions if self.nonstationary: @@ -238,7 +251,7 @@ def forward( sig_mask = self._stationary_mask(amp_to_db(X), xn) # Propagate decrease in signal power - sig_mask = self.prop_decrease * (sig_mask * 1.0 - 1.0) + 1.0 + sig_mask = self.prop_decrease * (sig_mask.float() - 1.0) + 1.0 # Smooth signal mask with 2D convolution if self.smoothing_filter is not None: @@ -252,13 +265,16 @@ def forward( Y = X * sig_mask.squeeze(1) # Inverse STFT to obtain time-domain signal - y = torch.istft( - Y, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=self.win_length, - center=True, - window=torch.hann_window(self.win_length).to(Y.device), - ) + if "privateuseone" in str(Y.device): + y = self.stft.inverse(Y, phase) + else: + y = torch.istft( + Y, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + center=True, + window=torch.hann_window(self.win_length).to(Y.device), + ) return y.to(dtype=x.dtype)