forked from mallorbc/whisper_mic
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmic.py
executable file
·83 lines (71 loc) · 3.18 KB
/
mic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import io
from pydub import AudioSegment
import speech_recognition as sr
import whisper
import queue
import tempfile
import os
import threading
import click
import torch
import numpy as np
@click.command()
@click.option("--model", default="base", help="Model to use", type=click.Choice(["tiny","base", "small","medium","large"]))
@click.option("--english", default=False, help="Whether to use English model",is_flag=True, type=bool)
@click.option("--verbose", default=False, help="Whether to print verbose output", is_flag=True,type=bool)
@click.option("--energy", default=300, help="Energy level for mic to detect", type=int)
@click.option("--dynamic_energy", default=False,is_flag=True, help="Flag to enable dynamic energy", type=bool)
@click.option("--pause", default=0.8, help="Pause time before entry ends", type=float)
@click.option("--save_file",default=False, help="Flag to save file", is_flag=True,type=bool)
def main(model, english, verbose, energy, pause, dynamic_energy, save_file):
    """Continuously record microphone audio and print Whisper transcriptions.

    Spawns one worker thread that captures utterances from the microphone
    and one that transcribes them; the main thread prints each result as
    it arrives on the result queue.
    """
    # Temp dir is only needed when clips are written to disk (--save_file).
    temp_dir = tempfile.mkdtemp() if save_file else None

    # There are no English-only weights for the "large" model.
    if model != "large" and english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    audio_queue = queue.Queue()
    result_queue = queue.Queue()

    # daemon=True: both workers run infinite loops, so without it a
    # KeyboardInterrupt in the print loop below would leave the process
    # hanging on the non-daemon worker threads instead of exiting.
    threading.Thread(target=record_audio,
                     args=(audio_queue, energy, pause, dynamic_energy, save_file, temp_dir),
                     daemon=True).start()
    threading.Thread(target=transcribe_forever,
                     args=(audio_queue, result_queue, audio_model, english, verbose, save_file),
                     daemon=True).start()

    # Main thread: block on the result queue and print transcriptions forever.
    while True:
        print(result_queue.get())
def record_audio(audio_queue, energy, pause, dynamic_energy, save_file, temp_dir):
    """Capture utterances from the default microphone and enqueue them.

    Each captured utterance is pushed onto ``audio_queue`` either as the
    path of a wav file written under ``temp_dir`` (when ``save_file`` is
    set) or as a float32 torch tensor normalized to [-1, 1]. Runs forever;
    intended to be driven from a worker thread.
    """
    # Configure silence/endpoint detection before opening the microphone.
    recognizer = sr.Recognizer()
    recognizer.energy_threshold = energy
    recognizer.pause_threshold = pause
    recognizer.dynamic_energy_threshold = dynamic_energy

    with sr.Microphone(sample_rate=16000) as source:
        print("Say something!")
        clip_index = 0
        while True:
            # Block until one complete utterance has been captured.
            captured = recognizer.listen(source)
            if save_file:
                # Round-trip through pydub so the clip lands on disk as wav;
                # downstream receives the file path.
                wav_bytes = io.BytesIO(captured.get_wav_data())
                clip = AudioSegment.from_file(wav_bytes)
                out_path = os.path.join(temp_dir, f"temp{clip_index}.wav")
                clip.export(out_path, format="wav")
                payload = out_path
            else:
                # 16-bit PCM -> float32 samples scaled to [-1, 1].
                samples = np.frombuffer(captured.get_raw_data(), np.int16)
                payload = torch.from_numpy(samples.flatten().astype(np.float32) / 32768.0)
            audio_queue.put_nowait(payload)
            clip_index += 1
def transcribe_forever(audio_queue, result_queue, audio_model, english, verbose, save_file):
    """Drain ``audio_queue``, transcribe each item, and publish the results.

    Pushes either a ``"You said: ..."`` string (default) or the raw whisper
    result dict (when ``verbose``) onto ``result_queue``. Loops forever;
    meant to run on a worker thread.
    """
    while True:
        audio_data = audio_queue.get()

        # Pin the language for the English path; otherwise let whisper
        # auto-detect it.
        if english:
            result = audio_model.transcribe(audio_data, language='english')
        else:
            result = audio_model.transcribe(audio_data)

        if verbose:
            result_queue.put_nowait(result)
        else:
            result_queue.put_nowait("You said: " + result["text"])

        # In save-file mode, audio_data is a temp wav path we own — clean it up.
        if save_file:
            os.remove(audio_data)
main()