# transcribe.py
# Forked from homink/deepspeech.pytorch.ko
import argparse
import warnings
warnings.simplefilter('ignore')
from decoder import GreedyDecoder
from torch.autograd import Variable
from data.data_loader import SpectrogramParser
from model import DeepSpeech
import os.path
import json
# Command-line interface for the transcription script. The options are parsed
# immediately and exposed through the module-level ``args`` namespace, which
# the rest of this file reads directly.
parser = argparse.ArgumentParser(description='DeepSpeech transcription')

# General options.
parser.add_argument(
    '--model-path',
    default='models/deepspeech_final.pth.tar',
    help='Path to model file created by training',
)
parser.add_argument(
    '--audio-path',
    default='audio.wav',
    help='Audio file to predict on',
)
parser.add_argument(
    '--cuda',
    action="store_true",
    help='Use cuda to test model',
)
parser.add_argument(
    '--decoder',
    type=str,
    choices=["greedy", "beam"],
    default="greedy",
    help="Decoder to use",
)
parser.add_argument(
    '--offsets',
    dest='offsets',
    action='store_true',
    help='Returns time offset information',
)

# Options that are forwarded to the beam-search decoder (or, for --top-paths,
# used to limit how many hypotheses are reported).
beam_args = parser.add_argument_group(
    "Beam Decode Options",
    "Configurations options for the CTC Beam Search decoder",
)
beam_args.add_argument(
    '--top-paths',
    type=int,
    default=1,
    help='number of beams to return',
)
beam_args.add_argument(
    '--beam-width',
    type=int,
    default=10,
    help='Beam width to use',
)
beam_args.add_argument(
    '--lm-path',
    type=str,
    default=None,
    help="Path to an (optional) kenlm language model for use with beam search (req'd with trie)",
)
beam_args.add_argument(
    '--alpha',
    type=float,
    default=0.8,
    help='Language model weight',
)
beam_args.add_argument(
    '--beta',
    type=float,
    default=1,
    help='Language model word bonus (all words)',
)
beam_args.add_argument(
    '--cutoff-top-n',
    type=int,
    default=40,
    help='Cutoff number in pruning, only top cutoff_top_n characters with highest probs in '
         'vocabulary will be used in beam search, default 40.',
)
beam_args.add_argument(
    '--cutoff-prob',
    type=float,
    default=1.0,
    help='Cutoff probability in pruning,default 1.0, no pruning.',
)
beam_args.add_argument(
    '--lm-workers',
    type=int,
    default=1,
    help='Number of LM processes to use',
)

args = parser.parse_args()
def decode_results(model, decoded_output, decoded_offsets):
    """Bundle decoded transcriptions and run metadata into one dictionary.

    Reads the module-level ``args`` namespace for the model path, language
    model path, decoder settings, ``top_paths`` and ``offsets``.

    Args:
        model: Loaded model; passed to ``DeepSpeech.get_meta`` and its result
            is merged into the ``acoustic_model`` metadata.
        decoded_output: Nested sequence indexed as ``decoded_output[b][p]``
            (presumably batch item, then hypothesis -- confirm against the
            decoder).
        decoded_offsets: Sequence indexed the same way as ``decoded_output``;
            only accessed when ``args.offsets`` is set.

    Returns:
        A dict with an ``"output"`` list (one entry per kept hypothesis, each
        holding ``'transcription'`` and optionally ``'offsets'``) and a
        ``"_meta"`` dict describing the acoustic model, language model and
        decoder configuration.
    """
    lm_path = args.lm_path
    uses_lm = lm_path is not None

    # Acoustic-model metadata: file name first, then whatever the model
    # reports about itself (which may add to or override the entries).
    acoustic_meta = {"name": os.path.basename(args.model_path)}
    language_meta = {"name": os.path.basename(lm_path) if lm_path else None}
    decoder_meta = {
        "lm": uses_lm,
        "alpha": args.alpha if uses_lm else None,
        "beta": args.beta if uses_lm else None,
        "type": args.decoder,
    }
    acoustic_meta.update(DeepSpeech.get_meta(model))

    output = []
    for item_idx, hypotheses in enumerate(decoded_output):
        # Report at most ``top_paths`` hypotheses per item.
        keep = min(args.top_paths, len(hypotheses))
        for path_idx in range(keep):
            entry = {'transcription': hypotheses[path_idx]}
            if args.offsets:
                entry['offsets'] = decoded_offsets[item_idx][path_idx]
            output.append(entry)

    return {
        "output": output,
        "_meta": {
            "acoustic_model": acoustic_meta,
            "language_model": language_meta,
            "decoder": decoder_meta,
        },
    }
if __name__ == '__main__':
    # Script entry point: load the model, build the requested decoder,
    # transcribe a single audio file and print the result as JSON on stdout.
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        # The beam decoder is imported only when it is actually requested.
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        # '_' is looked up in the label set and used as the blank index.
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    # Use a distinct name here so the module-level argparse ``parser`` is not
    # rebound to a spectrogram parser.
    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    spect = spect_parser.parse_audio(args.audio_path).contiguous()
    # Add two leading singleton dimensions in front of the 2-D spectrogram.
    spect = spect.view(1, 1, spect.size(0), spect.size(1))

    # NOTE(review): ``volatile=True`` is the pre-0.4 PyTorch way of disabling
    # autograd; confirm the targeted PyTorch version before upgrading.
    # NOTE(review): the input is not moved to the GPU here even when --cuda is
    # set -- confirm whether the model handles that itself.
    out = model(Variable(spect, volatile=True))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output, decoded_offsets = decoder.decode(out.data)
    print(json.dumps(decode_results(model, decoded_output, decoded_offsets)))