-
Notifications
You must be signed in to change notification settings - Fork 0
/
speaker_speech.py
57 lines (44 loc) · 1.86 KB
/
speaker_speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import torch
# from speechbrain.pretrained import EncoderClassifier
from speechbrain.inference.speaker import EncoderClassifier
from transformers import SpeechT5Processor
from transliteration import transliterate_text
from model import processor
import code
def dataset_creator():
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
dataset = transliterate_text()
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
source=spk_model_name,
run_opts={"device": device},
savedir=os.path.join("/tmp", spk_model_name),
)
def create_speaker_embedding(waveform):
with torch.no_grad():
speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
return speaker_embeddings
def prepare_dataset(example):
audio = example["audio"]
example = processor(
text=example["normalized_text"],
audio_target=audio["array"],
sampling_rate=audio["sampling_rate"],
return_attention_mask=False,
)
# strip off the batch dimension
example["labels"] = example["labels"][0]
# use SpeechBrain to obtain x-vector
example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
return example
# processed_example = prepare_dataset(dataset[0])
# code.interact(local=dict(globals(), **locals()))
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
def is_not_too_long(input_ids):
input_length = len(input_ids)
return input_length < 200
dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
return dataset