turn.py (forked from alumae/kiirkirjutaja)
66 lines (52 loc) · 2.43 KB
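
turn.py implements TurnGenerator: it feeds incoming audio chunks through online_scd's StreamingDecoder in a background thread and, whenever the speaker-change probability crosses a threshold, closes the current SpeechSegment and opens a new one, so that turns() yields one segment per detected speaker turn.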
from online_scd.model import SCDModel
from online_scd.streaming import StreamingDecoder
from vad import SpeechSegment
from queue import Queue
import torch
import threading
import logging


class TurnGenerator:
    def __init__(self, model, chunk_generator):
        # Probability threshold above which a frame is treated as a speaker change
        self.threshold = 0.1
        self.chunk_generator = chunk_generator
        self.model = model
        self.model.eval()
        # Queue of SpeechSegment objects, one per detected turn
        self.speech_segment_queue = Queue(10)
        self.streaming_decoder = StreamingDecoder(self.model)
        # Run the detector in a background daemon thread
        thread = threading.Thread(target=self.run)
        thread.daemon = True
        thread.start()

    def turns(self):
        # Yield turns as they are produced; a None sentinel ends the stream
        while True:
            speech_segment = self.speech_segment_queue.get()
            if speech_segment is not None:
                yield speech_segment
            else:
                return

    def run(self):
        with torch.no_grad():
            sample_pos = 0
            chunk_queue = Queue(10)
            # Open the first turn immediately
            speech_segment = SpeechSegment(sample_pos, chunk_queue)
            self.speech_segment_queue.put(speech_segment)
            buffer = torch.tensor([])
            for chunk in self.chunk_generator.chunks():
                buffer = torch.cat([buffer, chunk])
                pos = 0
                # The streaming decoder emits one probability vector per 100 ms frame
                for i, probs in enumerate(self.streaming_decoder.process_audio(chunk.numpy())):
                    if probs[1] > self.threshold:
                        # Close the current turn and start a new SpeechSegment
                        chunk_queue.put(None)
                        logging.debug(f"Speaker change detected at sample {sample_pos}")
                        chunk_queue = Queue(10)
                        speech_segment = SpeechSegment(sample_pos, chunk_queue)
                        self.speech_segment_queue.put(speech_segment)
                    # Forward the next 100 ms of buffered audio to the current turn
                    chunk_queue.put(buffer[:1600])
                    buffer = buffer[1600:]
                    sample_pos += 1600  # 1600 samples == 100 ms at 16 kHz
                    pos += 1
            # Flush any remaining audio and signal end of input
            chunk_queue.put(buffer)
            chunk_queue.put(None)
            self.speech_segment_queue.put(None)
            logging.debug("Ending turn detector for this speech segment")