#!/usr/bin/env python
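"""streaming_server.py

A small WebSocket server that receives raw PCM audio, buffers it into a WAV
file, and runs OpenAI Whisper on the result.

Protocol, as implemented below:
  * the client passes "samplerate", "channels", and "task" as handshake
    headers (defaults: 44100, 1, "transcribe");
  * binary frames carry 16-bit PCM audio and are appended to audio.wav;
  * the first text frame ends the stream and triggers transcription, or
    translation to English when task == "translate"; the result is sent
    back to the client as a single text message.
"""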
import asyncio
import time
import wave

import torch
import websockets
import whisper

# Check if an NVIDIA GPU is available and pick the device accordingly.
print("NVIDIA GPU is available: " + str(torch.cuda.is_available()))
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper model:
model = whisper.load_model("base", device=DEVICE)
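# Other model sizes ("tiny", "small", "medium", "large") trade speed for
# accuracy; "base" keeps this demo responsive on modest hardware.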

async def audio_server(websocket, path):
    # Note: this handler uses the legacy websockets API (< 11): the
    # (websocket, path) signature and the `request_headers` attribute.
    print("WebSocket connection established.")
    # Stream parameters are passed by the client as handshake headers.
    headers = websocket.request_headers
    samplerate = int(headers.get("samplerate", 44100))  # default: 44.1 kHz
    channels = int(headers.get("channels", 1))  # default: mono
    task = headers.get("task", "transcribe")  # "transcribe" or "translate"
    # Configure the WAV file that incoming audio frames are appended to.
    wave_file = wave.open("audio.wav", "wb")
    wave_file.setnchannels(channels)
    wave_file.setsampwidth(2)  # 2 bytes per sample (16-bit PCM)
    wave_file.setframerate(samplerate)
    try:
        while True:
            # Receive the next message from the WebSocket client.
            audio_data = await websocket.recv()
            if isinstance(audio_data, bytes):
                # Binary frame: append the audio data to the WAV file.
                wave_file.writeframes(audio_data)
            elif task == "translate":
                # A text frame ends the stream. Close the WAV file first so
                # buffered frames are flushed before Whisper reads it.
                wave_file.close()
                audio = whisper.load_audio("audio.wav")
                audio = whisper.pad_or_trim(audio)
                # Make a log-Mel spectrogram on the model's device and detect
                # the spoken language.
                detect_start_time = time.time()
                mel = whisper.log_mel_spectrogram(audio).to(model.device)
                _, probs = model.detect_language(mel)
                detected_language = max(probs, key=probs.get)
                detect_duration = time.time() - detect_start_time
                # Translate the audio into English.
                translate_start_time = time.time()
                translate_result = model.transcribe(audio, task="translate", fp16=False)
                translate_duration = time.time() - translate_start_time
                await websocket.send(
                    f"Translation: {translate_result['text']} "
                    f"(Duration: {translate_duration:.2f}s) "
                    f"(Detected Language: {detected_language}) "
                    f"(Detection Duration: {detect_duration:.2f}s)"
                )
                break
            else:
                # A text frame ends the stream; transcribe the WAV file.
                wave_file.close()
                transcribe_start_time = time.time()
                transcribe_result = model.transcribe("audio.wav")
                transcribe_duration = time.time() - transcribe_start_time
                await websocket.send(
                    f"Transcript: {transcribe_result['text']} "
                    f"(Duration: {transcribe_duration:.2f}s)"
                )
                break
    except websockets.exceptions.ConnectionClosed:
        print("WebSocket connection closed.")
    finally:
        # A no-op if the file was already closed above.
        wave_file.close()

async def main():
    # Start the WebSocket server on all interfaces.
    server = await websockets.serve(audio_server, "0.0.0.0", 8765)
    print("WebSocket server started. Listening on port 8765.")
    # Run the server indefinitely.
    await server.wait_closed()


if __name__ == "__main__":
    asyncio.run(main())
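
# ---------------------------------------------------------------------------
# Example client (a minimal sketch, kept commented out so this module only
# ever runs the server). It assumes a legacy websockets release (< 14) where
# `websockets.connect` accepts `extra_headers`, and that speech.pcm holds raw
# 16-bit mono PCM at 16 kHz; both names are illustrative, not part of the
# server above.
#
# import asyncio
# import websockets
#
# async def send_audio(pcm_path):
#     headers = {"samplerate": "16000", "channels": "1", "task": "transcribe"}
#     async with websockets.connect("ws://localhost:8765",
#                                   extra_headers=headers) as ws:
#         with open(pcm_path, "rb") as f:
#             while chunk := f.read(4096):
#                 await ws.send(chunk)   # binary frames: audio data
#         await ws.send("done")          # any text frame ends the stream
#         print(await ws.recv())         # transcript or translation
#
# asyncio.run(send_audio("speech.pcm"))
# ---------------------------------------------------------------------------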