forked from lliWcWill/liveTranslation_openai-whisper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
audio_processing.py
190 lines (157 loc) · 6.92 KB
/
audio_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import sounddevice as sd
import wavio
import os
import subprocess
from pynput import keyboard
from datetime import datetime
import numpy as np
import time
import sys
import logging
from colorama import Fore, Style
# Constants for recording
CHANNELS = 1  # number of input channels requested when recording
SAMPLE_WIDTH = 2  # bytes per sample, passed to wavio.write as sampwidth
RATE = 16000  # samples per second, used for capture and for the WAV file
FORMAT = "int16"  # sample format name; not referenced by the functions in this section
WAVE_OUTPUT_FILENAME = "temp_audio.wav"  # file written by save_audio_frames()
# Global variables
audio_frames = []  # audio blocks appended by record_callback() while recording
is_recording = False  # polled by record_audio_continuous(); set by start/stop_recording()
def record_audio(duration, session_folder):
    """
    Capture a fixed-length recording with sounddevice.

    Args:
        duration (float): Length of the recording in seconds.
        session_folder (str): Accepted for interface compatibility; this
            function neither reads it nor writes anything to it.

    Returns:
        numpy.ndarray or None: The recorded int16 samples flattened to one
        dimension, or None if any error occurred (the error is logged).
    """
    samples = None
    try:
        frame_count = int(duration * RATE)
        buffer = sd.rec(
            frame_count,
            samplerate=RATE,
            channels=CHANNELS,
            dtype=np.int16,
        )
        # Wait for the recording to finish before reading the buffer.
        sd.wait()
        samples = buffer.flatten()
    except Exception as e:
        logging.error(f"Error during recording: {e}")
    return samples
def play_audio(audio_content=None, file_path=None):
    """
    Play audio through the external `ffplay` command.

    If `file_path` is given, ffplay plays that file and `audio_content` is
    ignored. Otherwise `audio_content` is piped to ffplay's standard input.

    Playback is best-effort: errors are caught and reported with a printed
    message rather than raised to the caller.

    Args:
        audio_content (bytes, optional): The audio content to play. Defaults to None.
        file_path (str, optional): The path to the audio file to play. Defaults to None.

    Returns:
        None
    """
    if not file_path and not audio_content:
        # With nothing to play, ffplay would be started reading from a pipe
        # that is never written to or closed, and waiting on it would hang.
        print("Error playing audio: no audio content or file path provided")
        return

    cmd = ["ffplay", "-nodisp", "-autoexit"]
    if file_path:
        cmd.append(file_path)
        stdin_pipe = None
        payload = None
    else:
        # "-" tells ffplay to read the media from standard input.
        cmd.append("-")
        stdin_pipe = subprocess.PIPE
        payload = audio_content

    try:
        # The context manager closes the pipes and reaps the child process
        # even when an error occurs part-way through.
        with subprocess.Popen(
            cmd,
            stdin=stdin_pipe,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        ) as ffplay_proc:
            # communicate() writes the payload (if any), closes stdin so
            # ffplay sees end-of-input, and waits for the process to exit.
            ffplay_proc.communicate(input=payload)
    except Exception as e:
        print(f"Error playing audio: {e}")
def save_audio_frames():
    """
    Save the recorded audio frames to a WAV file.

    `record_callback` stores every captured block as a separate NumPy array
    in the global `audio_frames` list, while `wavio.write` expects a single
    array. The blocks are therefore joined along the sample axis before
    writing. If `audio_frames` is not a list or tuple (for example, a single
    array assigned by a caller), it is written as-is.

    Returns:
        str: The path to the saved WAV file (`WAVE_OUTPUT_FILENAME`).

    Raises:
        ValueError: If `audio_frames` is an empty list or tuple, i.e. there
            is nothing to save.
    """
    frames = audio_frames
    if isinstance(frames, (list, tuple)):
        if not frames:
            raise ValueError("No audio frames recorded; nothing to save.")
        # Join the per-callback blocks into one continuous array of samples.
        samples = np.concatenate(frames, axis=0)
    else:
        samples = frames
    wavio.write(WAVE_OUTPUT_FILENAME, samples, RATE, sampwidth=SAMPLE_WIDTH)
    return WAVE_OUTPUT_FILENAME
def voice_to_text(transcribe_func):
    """
    Write the buffered audio to a WAV file and hand it to a transcriber.

    The audio comes from the global `audio_frames` buffer, which
    `save_audio_frames()` writes to a temporary WAV file first.

    Args:
        transcribe_func (callable): Called with the path of the WAV file.

    Returns:
        Whatever `transcribe_func` returns for that file (expected to be the
        transcribed text).
    """
    return transcribe_func(save_audio_frames())
def clear_audio_frames():
    """Reset the global `audio_frames` buffer to a new, empty list."""
    global audio_frames
    # Rebind rather than clear in place, so a previously obtained reference
    # to the old list keeps its contents.
    audio_frames = list()
def record_audio_continuous():
    """
    Record audio until `is_recording` is set to False, then return the data.

    The function sets the global `is_recording` flag, prints a prompt, and
    opens a `sounddevice.InputStream` whose callback (`record_callback`)
    appends each captured block to the global `audio_frames` list. The
    calling thread then polls the flag every 0.1 seconds and leaves the
    stream as soon as something else (for example `stop_recording()` called
    from another thread) clears it.

    Returns:
        bytes: All blocks currently held in `audio_frames`, concatenated.

    Notes:
        `audio_frames` is not emptied here, so blocks left over from an
        earlier recording are included unless `clear_audio_frames()` was
        called first.

    Example:
        # Begin recording; returns once is_recording becomes False.
        audio_data = record_audio_continuous()
    """
    global is_recording
    is_recording = True
    print(Fore.GREEN + "Say 'stop' to end recording..." + Style.RESET_ALL)
    stream = sd.InputStream(
        channels=CHANNELS,
        samplerate=RATE,
        callback=record_callback,
    )
    with stream:
        while True:
            if not is_recording:
                break
            time.sleep(0.1)
    return b"".join(audio_frames)
def record_callback(indata, frames, time, status):
    """
    Stream callback that buffers incoming audio while recording is active.

    Passed as the `callback` of the `sounddevice.InputStream` opened in
    `record_audio_continuous()`.

    Parameters:
        indata (numpy.ndarray): The captured audio block. A copy is stored,
            not the array itself.
        frames (int): Number of frames in the block (unused here).
        time: Timing information supplied by the stream (unused here).
        status: Stream status flags; printed to standard error when truthy.

    Notes:
        - A copy of `indata` is appended to the global `audio_frames` list
          only while the global `is_recording` flag is True.
        - The work done here is deliberately minimal.
    """
    capturing = is_recording
    if capturing:
        block = indata.copy()
        audio_frames.append(block)
    if not status:
        return
    print(status, file=sys.stderr)
def start_recording():
    """Mark recording as active by setting the global `is_recording` flag to True."""
    global is_recording
    is_recording = True
def stop_recording():
    """Mark recording as inactive by setting the global `is_recording` flag to False."""
    global is_recording
    is_recording = False
# Add more audio processing functions as needed