# svc_inference.py
# Forked from PlayVoice/whisper-vits-svc.
import os
import torch
import librosa
import argparse
import numpy as np
import torchcrepe
from omegaconf import OmegaConf
from scipy.io.wavfile import write
from vits.models import SynthesizerInfer
def load_svc_model(checkpoint_path, model):
    """Load generator weights from a checkpoint file into ``model``.

    The checkpoint is read with ``map_location="cpu"`` and its ``"model_g"``
    entry is used as the source of weights. Only the keys that appear in
    ``model.state_dict()`` are copied; extra entries in the checkpoint are
    ignored.

    Args:
        checkpoint_path: Path of a checkpoint file readable by ``torch.load``.
        model: Module whose state is overwritten in place.

    Returns:
        The same ``model`` object, after ``load_state_dict`` has been applied.

    Raises:
        FileNotFoundError: If ``checkpoint_path`` is not an existing file.
        KeyError: If the checkpoint has no ``"model_g"`` entry, or that entry
            lacks a key that ``model`` expects.
    """
    # Explicit check rather than ``assert`` so it still runs under ``python -O``.
    if not os.path.isfile(checkpoint_path):
        raise FileNotFoundError(f"checkpoint not found: {checkpoint_path}")

    # NOTE(review): depending on the torch version / ``weights_only`` default,
    # torch.load may unpickle arbitrary objects -- only load trusted files.
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    saved_state_dict = checkpoint_dict["model_g"]

    expected_keys = list(model.state_dict())
    # Report every missing weight at once instead of failing on the first one.
    missing_keys = [k for k in expected_keys if k not in saved_state_dict]
    if missing_keys:
        raise KeyError(
            f"checkpoint {checkpoint_path} is missing weights for: {missing_keys}"
        )

    model.load_state_dict({k: saved_state_dict[k] for k in expected_keys})
    return model
def compute_f0_nn(filename, device):
    """Estimate a pitch (F0) track for an audio file with torchcrepe.

    The file is loaded at 16 kHz, analysed with the "full" CREPE model at a
    320-sample hop, and every frame is then repeated twice along the last
    axis (the "320 -> 160 * 2" step). Periodicity is median-filtered and pitch
    is mean-filtered, both with a window of 9, and frames whose periodicity is
    below 0.1 have their pitch set to 0.

    Args:
        filename: Path of the audio file handed to ``librosa.load``.
        device: Device passed through to ``torchcrepe.predict``.

    Returns:
        The filtered pitch track with its leading axis squeezed out.
        NOTE(review): presumably a 1-D tensor of per-frame Hz values -- confirm
        against torchcrepe's return type.
    """
    waveform, sample_rate = librosa.load(filename, sr=16000)
    assert sample_rate == 16000

    # Copy the samples into a tensor and add a leading axis of size one.
    batch = torch.tensor(np.copy(waveform))[None]

    hop = 320                      # 20 ms hop at 16 kHz
    f0_floor, f0_ceil = 50, 1000   # search range in Hz (speech-oriented)
    capacity = "full"              # model capacity: "tiny" or "full"
    chunk = 512                    # lower this if the GPU runs out of memory

    f0, confidence = torchcrepe.predict(
        batch,
        sample_rate,
        hop,
        f0_floor,
        f0_ceil,
        capacity,
        batch_size=chunk,
        device=device,
        return_periodicity=True,
    )

    # Double every frame along the last axis: 320 -> 160 * 2.
    f0 = np.repeat(f0, 2, -1)
    confidence = np.repeat(confidence, 2, -1)

    # CREPE was not trained on silent audio, so smooth both tracks and zero
    # out the pitch wherever the periodicity is too low.
    confidence = torchcrepe.filter.median(confidence, 9)
    f0 = torchcrepe.filter.mean(f0, 9)
    f0[confidence < 0.1] = 0

    return f0.squeeze(0)
def main(args):
    """Run voice conversion for one input wave and write ``svc_out.wav``.

    Steps, in order:
      1. If no PPG path was given, run ``whisper/inference.py`` on the input
         wave to produce ``svc_tmp.ppg.npy``.
      2. Build ``SynthesizerInfer`` from the config and load its weights.
      3. Load the speaker vector and the PPG, and compute the pitch track.
      4. Optionally shift the pitch by a whole number of semitones towards the
         singer's mean pitch.
      5. Trim pitch and PPG to a common length, run the model and write the
         result to ``svc_out.wav`` at ``hp.data.sampling_rate``.

    Args:
        args: Parsed command-line namespace with the attributes ``config``,
            ``model``, ``wave``, ``spk``, ``ppg`` and ``statics``. ``args.ppg``
            is overwritten when it was ``None``.

    Raises:
        subprocess.CalledProcessError: If the automatic PPG extraction exits
            with a non-zero status.
    """
    # Imported locally so this function does not rely on file-level imports
    # beyond the ones the file already has.
    import subprocess
    import sys

    if args.ppg is None:
        args.ppg = "svc_tmp.ppg.npy"
        print(
            f"Auto run : python whisper/inference.py -w {args.wave} -p {args.ppg}")
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim. ``check=True``
        # stops here on failure rather than continuing with a missing or
        # stale PPG file. ``sys.executable`` reuses the current interpreter.
        subprocess.run(
            [sys.executable, "whisper/inference.py",
             "-w", args.wave, "-p", args.ppg],
            check=True,
        )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    hp = OmegaConf.load(args.config)
    model = SynthesizerInfer(
        hp.data.filter_length // 2 + 1,
        hp.data.segment_size // hp.data.hop_length,
        hp)
    load_svc_model(args.model, model)
    model.eval()
    model.to(device)

    spk = np.load(args.spk)
    spk = torch.FloatTensor(spk)

    ppg = np.load(args.ppg)
    ppg = np.repeat(ppg, 2, 0)  # 320 PPG -> 160 * 2
    ppg = torch.FloatTensor(ppg)

    pit = compute_f0_nn(args.wave, device)
    if args.statics is None:
        print("don't use pitch shift")
    else:
        # Statistics are taken over voiced frames only (pitch > 0).
        source = pit[pit > 0]
        source_ave = source.mean()
        source_min = source.min()
        source_max = source.max()
        print(f"source pitch statics: mean={source_ave:0.1f}, "
              f"min={source_min:0.1f}, max={source_max:0.1f}")
        singer_ave, singer_min, singer_max = np.load(args.statics)
        print(f"singer pitch statics: mean={singer_ave:0.1f}, "
              f"min={singer_min:0.1f}, max={singer_max:0.1f}")

        # Semitone distance between the two means, rounded towards zero so
        # the shift never overshoots the singer's mean.
        shift = np.log2(singer_ave/source_ave) * 12
        if singer_ave >= source_ave:
            shift = np.floor(shift)
        else:
            shift = np.ceil(shift)
        # Convert the semitone count back into a frequency ratio.
        shift = 2 ** (shift / 12)
        pit = pit * shift

    pit = torch.FloatTensor(pit)

    # Pitch and PPG may differ in frame count; keep the common prefix.
    len_pit = pit.size()[0]
    len_ppg = ppg.size()[0]
    len_min = min(len_pit, len_ppg)
    pit = pit[:len_min]
    ppg = ppg[:len_min, :]

    with torch.no_grad():
        # Add a leading batch axis of size one to every input.
        spk = spk.unsqueeze(0).to(device)
        ppg = ppg.unsqueeze(0).to(device)
        pit = pit.unsqueeze(0).to(device)
        len_min = torch.LongTensor([len_min]).to(device)
        audio = model(ppg, pit, spk, len_min)
        audio = audio[0, 0].data.cpu().detach().numpy()

    write("svc_out.wav", hp.data.sampling_rate, audio)
if __name__ == '__main__':
    # Command-line entry point: every option is a string path; the first four
    # are mandatory, the PPG and pitch-statistics paths are optional.
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, is_required, help_text in (
        ('-c', '--config', True, "yaml file for config."),
        ('-m', '--model', True, "path of model for evaluation"),
        ('-w', '--wave', True, "Path of raw audio."),
        ('-s', '--spk', True, "Path of speaker."),
        ('-p', '--ppg', False, "Path of content vector."),
        ('-t', '--statics', False, "Path of pitch statics."),
    ):
        parser.add_argument(short_flag, long_flag, type=str,
                            required=is_required, help=help_text)

    args = parser.parse_args()
    main(args)