-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdetect.yaml
115 lines (94 loc) · 3.23 KB
/
detect.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
##########################################
# Hyperparameters for detecting words
#
# Authors
# * Peter Plantinga 2021
##########################################
# Seed value. `__set_seed` uses the HyperPyYAML `!apply:` tag to call
# torch.manual_seed with this value while the file is being loaded.
seed: 1237
__set_seed: !apply:torch.manual_seed [!ref <seed>]
# --- Data location --------------------------------------------------------
# `!PLACEHOLDER` carries no default value; it is meant to be overridden by
# whoever loads this file.
data_folder: !PLACEHOLDER

# --- Experiment outputs (all derived from the seed-specific folder) -------
output_folder: !ref results/detect/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
stats_file: !ref <output_folder>/stats.txt

# --- Input files, given as `./`-relative paths ----------------------------
encoder_save_file: './encoder.txt'
train_manifest: './train_aligned.json'
valid_manifest: './valid_aligned.json'
train_alignments: './alignment.json'
valid_alignments: './alignment_valid.json'
# Scalar training settings referenced elsewhere in this file:
# number_of_epochs -> counter.limit, n_mels -> compute_feats / encoder,
# learning_rate -> opt_class.lr and lr_annealing.initial_value.
number_of_epochs: 30
n_mels: 40
learning_rate: 0.0001
# Empty by default; the trailing comment records `[encoder]` as an
# alternative value. Presumably consumed by the training script, which is
# not part of this file (TODO confirm).
frozen_module_keys: []  # [encoder]
# Data-loader options mapping. `batch_size` must be nested here: at column 0
# it is a separate top-level key and `dataloader_opts` itself is null.
# The code that consumes this mapping is outside this file.
dataloader_opts:
  batch_size: 4
# Encoder: an instance of TimeFreqCRDNN from the `time_freq_crdnn` module
# (not under the speechbrain package; presumably project-local).
# `!new:` builds the object from the nested keys as keyword arguments, so
# they have to be indented beneath `encoder` rather than sit at top level.
encoder: !new:time_freq_crdnn.TimeFreqCRDNN
  input_size: !ref <n_mels>  # resolves to the top-level n_mels value
  cnn_channels: [64, 128]
  rnn_neurons: 256
  rnn_bidirectional: true
  dnn_neurons: 256  # read back via <encoder[dnn_neurons]> in this file
  dropout: 0.2
# Embedding table constructed via `!new:`; the nested keys are its
# constructor arguments and must be indented under `phoneme_embedding`.
phoneme_embedding: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: 40
  # Read back via <phoneme_embedding[embedding_dim]> in this file.
  embedding_dim: 100
# LSTM constructed via `!new:`; its input size is tied to the embedding
# dimension of `phoneme_embedding`. The nested keys are constructor
# arguments and must be indented under `phn2word_embedding`.
phn2word_embedding: !new:speechbrain.nnet.RNN.LSTM
  input_size: !ref <phoneme_embedding[embedding_dim]>
  # Read back via <phn2word_embedding[hidden_size]> in this file.
  hidden_size: 100
  bidirectional: true
# Attentional RNN decoder constructed via `!new:`. All keys below are
# constructor arguments and must be indented under `prediction_model`;
# left at column 0 they collide with same-named keys of other objects
# (`input_size`, `hidden_size`).
prediction_model: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  rnn_type: gru
  attn_type: location
  # Read back via <prediction_model[hidden_size]> in this file.
  hidden_size: 100
  attn_dim: 100
  num_layers: 1
  enc_dim: !ref <encoder[dnn_neurons]>
  # Twice the LSTM hidden size; presumably because phn2word_embedding is
  # bidirectional (TODO confirm).
  input_size: !ref <phn2word_embedding[hidden_size]> * 2
  kernel_size: 20
  channels: 20
# Output for existence: a linear layer with a single output neuron, fed
# with vectors of the decoder's hidden size. The nested keys are
# constructor arguments and must be indented under `existence_output`.
existence_output: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <prediction_model[hidden_size]>
  n_neurons: 1
# Softmax activation constructed with apply_log enabled. `apply_log` is a
# constructor argument and must be indented under `log_softmax`.
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true
# Epoch counter whose limit is the top-level number_of_epochs value.
# `limit` is a constructor argument and must be indented under `counter`.
counter: !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <number_of_epochs>
# Fbank feature extractor using the top-level n_mels value, the same value
# given to the encoder as input_size. `n_mels` here is a constructor
# argument and must be indented, otherwise it duplicates the top-level key.
compute_feats: !new:speechbrain.lobes.features.Fbank
  n_mels: !ref <n_mels>
# Input normalizer constructed with no explicit arguments (library defaults).
# NOTE(review): this object is not listed under checkpointer.recoverables in
# this file; confirm whether its state needs to survive a restart.
normalize: !new:speechbrain.processing.features.InputNormalization
# SpecAugment instance configured with frequency-mask settings; time
# masking and zero replacement are switched off. The nested keys are
# constructor arguments and must be indented under `spec_augment`.
spec_augment: !new:speechbrain.lobes.augment.SpecAugment
  n_freq_mask: 4
  freq_mask_width: 5
  time_mask: false
  replace_with_zero: false
# Name -> object mapping of the five model components defined in this file.
# The entries must be nested: at column 0 they become duplicate top-level
# keys that shadow the component definitions themselves (e.g. `encoder`).
modules:
  encoder: !ref <encoder>
  phoneme_embedding: !ref <phoneme_embedding>
  phn2word_embedding: !ref <phn2word_embedding>
  prediction_model: !ref <prediction_model>
  existence_output: !ref <existence_output>
# Optimizer factory. `!name:` does not instantiate Adam here; it yields a
# callable with `lr` pre-bound, so `lr` must be indented under `opt_class`.
opt_class: !name:torch.optim.Adam
  lr: !ref <learning_rate>
# NewBob learning-rate scheduler starting from the top-level learning_rate.
# The nested keys are constructor arguments and must be indented under
# `lr_annealing`.
lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: !ref <learning_rate>
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0
# Checkpointer writing into save_folder. `recoverables` is a nested mapping
# of name -> object; both levels must be indented, otherwise its entries
# become top-level keys that overwrite the component definitions.
# NOTE(review): `normalize` and `lr_annealing` are defined in this file but
# not listed here; confirm whether their state should be checkpointed too.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    encoder: !ref <encoder>
    phoneme_embedding: !ref <phoneme_embedding>
    phn2word_embedding: !ref <phn2word_embedding>
    prediction_model: !ref <prediction_model>
    existence_output: !ref <existence_output>
    counter: !ref <counter>
# Factory (`!name:`) for a MetricStats tracker whose metric is itself a
# `!name:` partial of ctc_loss. `reduction` is nested under the ctc_loss
# partial so that it is bound to the loss function, not to MetricStats.
# NOTE(review): no blank index is bound to ctc_loss here; confirm whether
# the caller supplies it or whether this entry is unused.
ctc_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.ctc_loss
    reduction: batch
# File-based training logger writing to the train_log path defined above.
# `save_file` is a constructor argument and must be indented.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>
# Pretrainer with a single loadable: the encoder object, registered under
# the name `recognizer`. `loadables` is a nested mapping, so both levels
# must be indented under `pretrained`.
pretrained: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    recognizer: !ref <encoder>