diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/attentions.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/attentions.py index 057d786f1..fbd56acd2 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/attentions.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/attentions.py @@ -1,13 +1,11 @@ -import copy import math from typing import Optional -import numpy as np import torch from torch import nn from torch.nn import functional as F -from voice_changer.RVC.inferencer.rvc_models.infer_pack import commons, modules +from voice_changer.RVC.inferencer.rvc_models.infer_pack import commons from voice_changer.RVC.inferencer.rvc_models.infer_pack.modules import LayerNorm @@ -142,8 +140,9 @@ def forward(self, x, x_mask, h, h_mask): x: decoder input h: encoder output """ - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype + m_size = x_mask.size(2) + self_attn_mask = commons.subsequent_mask( + torch.ones(m_size, m_size, device=x.device, dtype=x.dtype), ) encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) x = x * x_mask @@ -191,6 +190,7 @@ def __init__( self.attn = None self.k_channels = channels // n_heads + self.k_channels_sqrt = math.sqrt(self.k_channels) self.conv_q = nn.Conv1d(channels, channels, 1) self.conv_k = nn.Conv1d(channels, channels, 1) self.conv_v = nn.Conv1d(channels, channels, 1) @@ -243,22 +243,21 @@ def attention( key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + query /= self.k_channels_sqrt + + scores = torch.matmul(query, key.transpose(-2, -1)) if self.window_size is not None: assert ( t_s == t_t ), "Relative attention is only available for self-attention." key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query / math.sqrt(self.k_channels), key_relative_embeddings - ) + rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings) scores_local = self._relative_position_to_absolute_position(rel_logits) scores = scores + scores_local if self.proximal_bias: assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype - ) + r = torch.arange(t_s, dtype=scores.dtype, device=scores.device) + scores = scores + self._attention_bias_proximal(r) if mask is not None: scores = scores.masked_fill(mask == 0, -1e4) if self.block_length is not None: @@ -373,14 +372,13 @@ def _absolute_position_to_relative_position(self, x): x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] return x_final - def _attention_bias_proximal(self, length: int): + def _attention_bias_proximal(self, r: torch.Tensor): """Bias for self-attention to encourage attention to close positions. Args: - length: an integer scalar. + r: torch.Tensor Returns: a Tensor with shape [1, 1, length, length] """ - r = torch.arange(length, dtype=torch.float32) diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/attentions_onnx.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/attentions_onnx.py index 9ed67859f..ecc763e25 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/attentions_onnx.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/attentions_onnx.py @@ -1,8 +1,6 @@ -import copy import math from typing import Optional -import numpy as np import torch from torch import nn from torch.nn import functional as F @@ -142,8 +140,9 @@ def forward(self, x, x_mask, h, h_mask): x: decoder input h: encoder output """ - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype + m_size = x_mask.size(2) + self_attn_mask = commons.subsequent_mask( + torch.ones(m_size, m_size, device=x.device, dtype=x.dtype), ) encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) x = x * x_mask @@ -191,6 +190,7 @@ def __init__( self.attn = None self.k_channels = channels // n_heads + self.k_channels_sqrt = math.sqrt(self.k_channels) self.conv_q = nn.Conv1d(channels, channels, 1) self.conv_k = nn.Conv1d(channels, channels, 1) self.conv_v = nn.Conv1d(channels, channels, 1) @@ -243,22 +243,21 @@ def attention( key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + query /= self.k_channels_sqrt + + scores = torch.matmul(query, key.transpose(-2, -1)) if self.window_size is not None: assert ( t_s == t_t ), "Relative attention is only available for self-attention." key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query / math.sqrt(self.k_channels), key_relative_embeddings - ) + rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings) scores_local = self._relative_position_to_absolute_position(rel_logits) scores = scores + scores_local if self.proximal_bias: assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype - ) + r = torch.arange(t_s, dtype=scores.dtype, device=scores.device) + scores = scores + self._attention_bias_proximal(r) if mask is not None: scores = scores.masked_fill(mask == 0, -1e4) if self.block_length is not None: @@ -373,14 +372,13 @@ def _absolute_position_to_relative_position(self, x): x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] return x_final - def _attention_bias_proximal(self, length: int): + def _attention_bias_proximal(self, r: torch.Tensor): """Bias for self-attention to encourage attention to close positions. Args: - length: an integer scalar. + r: torch.Tensor Returns: a Tensor with shape [1, 1, length, length] """ - r = torch.arange(length, dtype=torch.float32) diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/commons.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/commons.py index 4ec6c244e..0431cc705 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/commons.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/commons.py @@ -1,9 +1,7 @@ from typing import List, Optional import math -import numpy as np import torch -from torch import nn from torch.nn import functional as F @@ -66,7 +64,7 @@ def rand_slice_segments(x, x_lengths=None, segment_size=4): if x_lengths is None: x_lengths = t ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long) ret = slice_segments(x, ids_str, segment_size) return ret, ids_str @@ -99,9 +97,8 @@ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) -def subsequent_mask(length): - mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) - return mask +def subsequent_mask(mask: torch.Tensor): + return torch.tril(mask, out=mask).unsqueeze(0).unsqueeze(0) @torch.jit.script diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py index a2642c112..aed513c44 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py @@ -261,7 +261,7 @@ def forward( x = x + self.cond(g) for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE, inplace=True) x = self.ups[i](x) xs = None for j in range(self.num_kernels): @@ -270,9 +270,9 @@ def forward( else: xs += self.resblocks[i * self.num_kernels + j](x) x = xs / self.num_kernels - x = F.leaky_relu(x) + x = F.leaky_relu(x, inplace=True) x = self.conv_post(x) - x = torch.tanh(x) + x = torch.tanh(x, out=x) return x @@ -338,15 +338,12 @@ def __init__( self.sampling_rate = samp_rate self.voiced_threshold = voiced_threshold - def _f02uv(self, f0): + def _f02uv(self, f0: torch.Tensor): # generate uv signal uv = torch.ones_like(f0) - uv = uv * (f0 > self.voiced_threshold) - if uv.device.type == "privateuseone": # for DirectML - uv = uv.float() - return uv + return uv * (f0 > self.voiced_threshold) - def forward(self, f0: torch.Tensor, upp: int): + def forward(self, f0: torch.Tensor, upp: float): """sine_tensor, uv = forward(f0) input F0: tensor(batchsize=1, length, dim=1) f0 for unvoiced steps should be 0 @@ -376,12 +373,12 @@ def forward(self, f0: torch.Tensor, upp: int): tmp_over_one *= upp tmp_over_one = F.interpolate( tmp_over_one.transpose(2, 1), - scale_factor=float(upp), + scale_factor=upp, mode="linear", align_corners=True, ).transpose(2, 1) rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest" + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" ).transpose( 2, 1 ) ####### @@ -393,9 +390,9 @@ def forward(self, f0: torch.Tensor, upp: int): torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi ) sine_waves = sine_waves * self.sine_amp - uv = self._f02uv(f0) + uv = self._f02uv(f0).to(f0.dtype) uv = F.interpolate( - uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" + uv.transpose(2, 1), scale_factor=upp, mode="nearest" ).transpose(2, 1) noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) @@ -443,18 +440,9 @@ def __init__( # to merge source harmonics into a single excitation self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() - # self.ddtype:int = -1 - def forward(self, x: torch.Tensor, upp: int = 1): - # if self.ddtype ==-1: - # self.ddtype = self.l_linear.weight.dtype + def forward(self, x: torch.Tensor, upp: float = 1.): sine_wavs, uv, _ = self.l_sin_gen(x, upp) - # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) - # if self.is_half: - # sine_wavs = sine_wavs.half() - # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) - # print(sine_wavs.dtype,self.ddtype) - # if sine_wavs.dtype != self.l_linear.weight.dtype: sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) sine_merge = self.l_tanh(self.l_linear(sine_wavs)) return sine_merge, None, None # noise, uv @@ -540,7 +528,7 @@ def forward( g: Optional[torch.Tensor] = None, n_res: Optional[int] = None, ): - har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source, noi_source, uv = self.m_source(f0, float(self.upp)) har_source = har_source.transpose(1, 2) if n_res is not None: if (n := n_res * self.upp) != har_source.shape[-1]: @@ -554,7 +542,7 @@ def forward( # That's why I wrote this for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): if i < self.num_upsamples: - x = F.leaky_relu(x, self.lrelu_slope) + x = F.leaky_relu(x, self.lrelu_slope, inplace=True) x = ups(x) x_source = noise_convs(har_source) x = x + x_source @@ -570,9 +558,9 @@ def forward( # If ignored, it will cause torch.jit.script() compilation errors assert isinstance(xs, torch.Tensor) x = xs / self.num_kernels - x = F.leaky_relu(x) + x = F.leaky_relu(x, inplace=True) x = self.conv_post(x) - x = torch.tanh(x) + x = torch.tanh(x, out=x) return x def remove_weight_norm(self): @@ -1121,7 +1109,7 @@ def forward(self, x): for l in self.convs: x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE, inplace=True) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -1205,7 +1193,7 @@ def forward(self, x): for l in self.convs: x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE, inplace=True) fmap.append(x) x = self.conv_post(x) fmap.append(x) diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models_onnx.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models_onnx.py index b136c8449..a2723038e 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models_onnx.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models_onnx.py @@ -39,6 +39,7 @@ def __init__( self.kernel_size = kernel_size self.p_dropout = float(p_dropout) self.emb_phone = nn.Linear(in_channels, hidden_channels) + self.sqrt_hidden_channels = math.sqrt(self.hidden_channels) self.lrelu = nn.LeakyReLU(0.1, inplace=True) if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 @@ -63,7 +64,7 @@ def forward( x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = x * self.sqrt_hidden_channels # [b, t, h] x = self.lrelu(x) x = torch.transpose(x, 1, -1) # [b, h, t] x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( @@ -260,7 +261,7 @@ def forward( x = x + self.cond(g) for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE, inplace=True) x = self.ups[i](x) xs = None for j in range(self.num_kernels): @@ -269,7 +270,7 @@ def forward( else: xs += self.resblocks[i * self.num_kernels + j](x) x = xs / self.num_kernels - x = F.leaky_relu(x) + x = F.leaky_relu(x, inplace=True) x = self.conv_post(x) x = torch.tanh(x) @@ -337,15 +338,12 @@ def __init__( self.sampling_rate = samp_rate self.voiced_threshold = voiced_threshold - def _f02uv(self, f0): + def _f02uv(self, f0: torch.Tensor): # generate uv signal uv = torch.ones_like(f0) - uv = uv * (f0 > self.voiced_threshold) - if uv.device.type == "privateuseone": # for DirectML - uv = uv.float() - return uv + return uv * (f0 > self.voiced_threshold) - def forward(self, f0: torch.Tensor, upp: int): + def forward(self, f0: torch.Tensor, upp: float): """sine_tensor, uv = forward(f0) input F0: tensor(batchsize=1, length, dim=1) f0 for unvoiced steps should be 0 @@ -375,12 +373,12 @@ def forward(self, f0: torch.Tensor, upp: int): tmp_over_one *= upp tmp_over_one = F.interpolate( tmp_over_one.transpose(2, 1), - scale_factor=float(upp), + scale_factor=upp, mode="linear", align_corners=True, ).transpose(2, 1) rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest" + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" ).transpose( 2, 1 ) ####### @@ -392,9 +390,9 @@ def forward(self, f0: torch.Tensor, upp: int): torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi ) sine_waves = sine_waves * self.sine_amp - uv = self._f02uv(f0) + uv = self._f02uv(f0).to(f0.dtype) uv = F.interpolate( - uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" + uv.transpose(2, 1), scale_factor=upp, mode="nearest" ).transpose(2, 1) noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) @@ -442,18 +440,9 @@ def __init__( # to merge source harmonics into a single excitation self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() - # self.ddtype:int = -1 - def forward(self, x: torch.Tensor, upp: int = 1): - # if self.ddtype ==-1: - # self.ddtype = self.l_linear.weight.dtype + def forward(self, x: torch.Tensor, upp: float = 1.): sine_wavs, uv, _ = self.l_sin_gen(x, upp) - # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) - # if self.is_half: - # sine_wavs = sine_wavs.half() - # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) - # print(sine_wavs.dtype,self.ddtype) - # if sine_wavs.dtype != self.l_linear.weight.dtype: sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) sine_merge = self.l_tanh(self.l_linear(sine_wavs)) return sine_merge, None, None # noise, uv @@ -540,7 +529,7 @@ def forward( g: Optional[torch.Tensor] = None, n_res: Optional[int] = None, ): - har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source, noi_source, uv = self.m_source(f0, float(self.upp)) har_source = har_source.transpose(1, 2) if n_res is not None: if (n := n_res * self.upp) != har_source.shape[-1]: @@ -554,7 +543,7 @@ def forward( # That's why I wrote this for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): if i < self.num_upsamples: - x = F.leaky_relu(x, self.lrelu_slope) + x = F.leaky_relu(x, self.lrelu_slope, inplace=True) x = ups(x) x_source = noise_convs(har_source) x = x + x_source @@ -570,7 +559,7 @@ def forward( # If ignored, it will cause torch.jit.script() compilation errors assert isinstance(xs, torch.Tensor) x = xs / self.num_kernels - x = F.leaky_relu(x) + x = F.leaky_relu(x, inplace=True) x = self.conv_post(x) x = torch.tanh(x) return x @@ -916,7 +905,7 @@ def forward(self, x): for l in self.convs: x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE, inplace=True) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -1000,7 +989,7 @@ def forward(self, x): for l in self.convs: x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE, inplace=True) fmap.append(x) x = self.conv_post(x) fmap.append(x) diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/modules.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/modules.py index d190f6362..354392aa6 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/modules.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/modules.py @@ -1,12 +1,9 @@ -import copy import math from typing import Optional, Tuple -import numpy as np -import scipy import torch from torch import nn -from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import Conv1d from torch.nn import functional as F from torch.nn.utils import remove_weight_norm, weight_norm @@ -331,7 +328,7 @@ def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None): if x_mask is not None: xt = xt * x_mask xt = c1(xt) - xt = F.leaky_relu(xt, self.lrelu_slope) + xt = F.leaky_relu(xt, self.lrelu_slope, inplace=True) if x_mask is not None: xt = xt * x_mask xt = c2(xt) @@ -450,10 +447,10 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: x = torch.flip(x, [1]) if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + logdet = torch.zeros(x.size(0), dtype=x.dtype, device=x.device) return x, logdet else: - return x, torch.zeros([1], device=x.device) + return x, torch.zeros([1], dtype=x.dtype, device=x.device) class ElementwiseAffine(nn.Module): @@ -562,6 +559,7 @@ def __init__( super(ConvFlow, self).__init__() self.in_channels = in_channels self.filter_channels = filter_channels + self.filter_channels_sqrt = math.sqrt(filter_channels) self.kernel_size = kernel_size self.n_layers = n_layers self.num_bins = num_bins @@ -591,10 +589,8 @@ def forward( b, c, t = x0.shape h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] - unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( - self.filter_channels - ) + unnormalized_widths = h[..., : self.num_bins] / self.filter_channels_sqrt + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / self.filter_channels_sqrt unnormalized_derivatives = h[..., 2 * self.num_bins :] x1, logabsdet = piecewise_rational_quadratic_transform(