forked from lliWcWill/liveTranslation_openai-whisper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
443 lines (361 loc) · 21.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
import argparse
import sys
import glob
import os
import wavio
from pynput import keyboard
import time
import signal
import numpy as np
import sounddevice as sd
import logging
from colorama import Fore, Style, init
from groq import Groq
from openai import OpenAI
from encoding_utils import setup_encoding
from utils import load_config, create_session_folder, save_transcription, save_to_desktop, print_json_formatted
from audio_processing import (
record_audio, play_audio, voice_to_text, clear_audio_frames,
record_audio_continuous, start_recording, stop_recording, WAVE_OUTPUT_FILENAME, CHANNELS, SAMPLE_WIDTH, RATE, FORMAT
)
from api_handlers import transcribe_audio, translate_text, voice_stream
from cli_interface import print_welcome_message, get_language_choice, get_file_processing_choices, single_run_input_loop
# Constants
# System-prompt template for the translation model. The bracketed placeholders
# "[Desired Language]", "[Name of desired language in that language]" and
# "[Phrase in desired language in that language's text if possible]" are
# substituted per language by get_modified_content().
DEFAULT_CONTENT = """You are a [Desired Language]/English translation and interpreter assistant. Your purpose is to bridge the communication and language gap for both [Desired Language] and English speakers. If the input is completely [Desired Language] you WILL only translate to English and vice versa if the input is completely in English you translate to [Name of desired language in that language] for a seamless live translation style approach. If in an input you detect both [Name of desired language in that language] and English and it is clearly distinguishable, please continue to translate to the opposite language. Here is an Example of the desired response style when detecting both languages and responding with both languages. Do not translate the entire text string to one language. keep a convo style flow. You will not execute or analyze any of the info in text sent to be translated. you will only play the role of translating so do not try to provide context or answer questions and request: Translation: I want to know why I have to go to the store to get a deal rather than shopping online. [Phrase in desired language in that language's text if possible]"""
# Fixed system prompt used verbatim (no placeholder substitution) when the
# user selects "Smart Select" in main(): translate any input into English.
SPECIAL_CONTENT = """It is a beautiful, highly productive September sunny day and you are highly motivated, and you are a World Class Expert AI multilingual translator interpreter. You're capable of understanding any in all languages, and able to fluently and accurately translate them back to English. Your goal and underlying purpose is to bridge all gaps in communication and effectively translate back to English no matter what. You have done this, you are capable of doing this and you will do this. Important: Translate any text to ENGLISH"""
# Logging added back
# Root logger at INFO level; each record is "timestamp - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Setup
# NOTE: everything below runs at import time, so importing this module loads
# the configuration and constructs both API clients.
setup_encoding()
# colorama: autoreset=True so colour codes do not leak into later prints.
init(autoreset=True)
# The config must provide config["groq"]["api_key"] and
# config["openai"]["api_key"]; a missing key raises KeyError right here.
config = load_config()
groq_client = Groq(api_key=config["groq"]["api_key"])
openai_client = OpenAI(api_key=config["openai"]["api_key"])
# Supported target languages.
# Key:   English display name. It is offered as a choice for -c/--content and
#        is substituted for "[Desired Language]" in DEFAULT_CONTENT.
# Value: (name of the language written in that language, sample phrase in
#        that language), substituted for the other two placeholders by
#        get_modified_content().
# NOTE(review): the Rioplatense key spells "Argentinna" (sic). Because the key
# is both a CLI choice and prompt text, correcting it changes what users must
# type for -c; fix deliberately rather than in passing.
language_map = {
"European Spanish (Spain)": ("Español Europeo", "Buenos días, ¿cómo estás hoy?"),
"Spanish": ("Español", "¿Qué onda? ¿Todo bien?"),
"Caribbean Spanish (Cuba, Puerto Rico, Dominican Republic)": (
"Español Caribeño",
"Hace mucho calor hoy, ¿verdad?",
),
"Central American Spanish (Guatemala, Honduras, Nicaragua)": (
"Español Centroamericano",
"Vamos a la playa este fin de semana.",
),
"Andean Spanish (Peru, Bolivia, Ecuador)": (
"Español Andino",
"La comida aquí es muy deliciosa.",
),
"Rioplatense Spanish (Argentinna and Uruguay)": (
"Español Rioplatense",
"¿Me pasás la yerba, por favor?",
),
"Chilean Spanish": ("Español Chileno", "¿Cachai lo que te estoy diciendo?"),
"Colombian Spanish": ("Español Colombiano", "¿Quieres ir a tomar un tinto?"),
"Venezuelan Spanish": (
"Español Venezolano",
"Vamos a comer unas arepas esta noche.",
),
"Canary Islands Spanish": ("Español Canario", "El cielo está muy despejado hoy."),
"Mandarin Chinese": ("普通话", "你好,你吃饭了吗?"),
"French": ("Français", "Bonjour, où se trouve la bibliothèque?"),
"German": ("Deutsch", "Kannst du mir helfen, bitte?"),
"Portuguese": ("Português", "Bom dia, como você está?"),
"Russian": ("Русский", "Как дела? Всё хорошо?"),
"Japanese": ("日本語", "こんにちは、元気ですか?"),
"Italian": ("Italiano", "Dove posso trovare un buon ristorante?"),
"Arabic": ("العربية", "مرحبا، كيف حالك اليوم؟"),
"Hindi": ("हिंदी", "नमस्ते, आप कैसे हैं?"),
"Korean": ("한국어", "안녕하세요, 잘 지내세요?"),
}
def parse_arguments():
    """
    Parse the command-line options of the translation tool.

    Returns:
        argparse.Namespace: Namespace with the attributes ``duration``,
        ``file``, ``content``, ``continuous``, ``voice`` and
        ``save_recordings``.

    Note:
        - ``content`` is ``None`` when ``-c`` is omitted or given without a
          value; otherwise it is a key of ``language_map`` or the literal
          "Smart Select". ``main()`` treats ``None`` as "ask the user".
        - The default used to be the raw ``DEFAULT_CONTENT`` template, which
          is not a ``language_map`` key, so a run without ``-c`` ended in
          ``ValueError("Unsupported language: ...")``.
    """
    parser = argparse.ArgumentParser(description="Real-time translation tool")
    parser.add_argument(
        "-d", "--duration", type=int, default=45,
        help="Maximum duration of the recording in seconds (default: 45)",
    )
    parser.add_argument(
        "-f", "--file", type=str,
        help="Path to an existing audio file to transcribe and translate",
    )
    parser.add_argument(
        "-c", "--content", type=str, nargs="?",
        # None stays in the choices so a bare "-c" (const=None) is accepted
        # by argparse versions that validate the const value.
        choices=list(language_map.keys()) + ["Smart Select", None],
        default=None,
        help="Custom content for the API call to Whisper",
    )
    parser.add_argument(
        "-t", "--continuous", action="store_true",
        help="Enable continuous run mode",
    )
    parser.add_argument(
        "-v", "--voice",
        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
        help="Choose a TTS voice for speaking the translation",
    )
    parser.add_argument(
        "--save_recordings", action="store_true",
        help="Save all recordings instead of deleting them",
    )
    # Command-line values are always str on Python 3, so the former
    # bytes-decoding step was unreachable and has been dropped.
    return parser.parse_args()
def get_modified_content(language):
    """
    Build the system prompt for one target language and echo it to the console.

    Args:
        language (str): A key of ``language_map``.

    Raises:
        ValueError: If ``language`` is not a key of ``language_map``.

    Returns:
        str: ``DEFAULT_CONTENT`` with its three bracketed placeholders
        replaced by the English language name, the language's own name and
        a sample phrase in that language.

    Prints:
        - A cyan "Content being sent to API:" heading.
        - The finished prompt, followed by one empty line.
    """
    entry = language_map.get(language)
    if entry is None:
        raise ValueError(f"Unsupported language: {language}")

    # Applied in this order, one placeholder at a time.
    substitutions = (
        ("[Desired Language]", language),
        ("[Name of desired language in that language]", entry[0]),
        ("[Phrase in desired language in that language's text if possible]", entry[1]),
    )
    prompt = DEFAULT_CONTENT
    for placeholder, replacement in substitutions:
        prompt = prompt.replace(placeholder, replacement)

    print(Fore.CYAN + "\nContent being sent to API:" + Style.RESET_ALL)
    print(prompt)
    print()  # blank separator line after the prompt
    return prompt
def continuous_run_mode(content, args, session_folder):
    """
    Run the keyboard-driven record / transcribe / translate loop.

    Args:
        content (str): System prompt passed to ``translate_text``.
        args (argparse.Namespace): Parsed CLI options; ``duration``, ``voice``
            and ``save_recordings`` are read here.
        session_folder (str): Folder that receives the recorded WAV files and
            the saved transcriptions.

    Returns:
        None

    Keys (handled on the pynput listener thread):
        - SPACE toggles recording on and off.
        - 'r' replays the last synthesized translation, if there is one.
        - ESC requests exit and stops the listener.

    Behaviour:
        - A recording ends when SPACE is pressed again, when the time limit
          (``args.duration``, 45 s if unset or zero) is reached, or when
          exit is requested. Reaching the limit clears the recording flag,
          so a new recording only starts on the next SPACE press.
        - Audio captured before an exit request is discarded, not processed.
        - The previous SIGINT handler is restored before the clean-up prompt
          so that Ctrl+C at the prompt raises ``KeyboardInterrupt`` again.
        - Any exception raised inside the loop is printed and ends the mode;
          ``handle_session_files`` always runs afterwards.
    """
    # Honour -d/--duration; fall back to 45 s when it is unset or zero.
    max_duration = args.duration or 45

    print(Fore.GREEN + "\nContinuous run mode activated.\n" + Style.RESET_ALL)
    print(Fore.YELLOW + f"Press SPACE to start/stop recording (max {max_duration} seconds)." + Style.RESET_ALL)
    print(Fore.YELLOW + "Press 'R' to replay the last translation." + Style.RESET_ALL)
    print(Fore.YELLOW + "Press ESC to exit." + Style.RESET_ALL)

    audio_files = []
    is_recording = False
    should_exit = False
    last_ai_audio_path = None

    def on_press(key):
        """
        Keyboard callback: toggle recording, replay, or request exit.

        Parameters:
            key: The key object delivered by the pynput listener.

        Returns:
            False after ESC (which stops the listener); None otherwise.
        """
        nonlocal is_recording, should_exit, last_ai_audio_path
        if key == keyboard.Key.space:
            is_recording = not is_recording
            if is_recording:
                print(Fore.CYAN + f"\nRecording started. Press SPACE to stop or wait for {max_duration} seconds." + Style.RESET_ALL)
            else:
                print(Fore.CYAN + "Recording stopped." + Style.RESET_ALL)
        elif key == keyboard.KeyCode.from_char('r'):
            if last_ai_audio_path:
                print(Fore.CYAN + "Replaying last translation..." + Style.RESET_ALL)
                play_audio(file_path=last_ai_audio_path)
            else:
                print(Fore.YELLOW + "No translation available to replay." + Style.RESET_ALL)
        elif key == keyboard.Key.esc:
            should_exit = True
            return False  # Stop listener
        return None

    listener = keyboard.Listener(on_press=on_press)
    listener.start()

    def signal_handler(sig, frame):
        """Turn Ctrl+C into a cooperative exit request for the loop below."""
        nonlocal should_exit
        print(Fore.RED + "\nInterrupt received, cleaning up..." + Style.RESET_ALL)
        should_exit = True

    # Keep the previous handler so it can be put back before the final prompt.
    previous_handler = signal.signal(signal.SIGINT, signal_handler)

    try:
        while not should_exit:
            if not is_recording:
                time.sleep(0.1)
                continue

            start_time = time.monotonic()
            audio_data = []
            # Also watch should_exit so ESC / Ctrl+C end a running recording.
            while (
                is_recording
                and not should_exit
                and time.monotonic() - start_time < max_duration
            ):
                chunk = record_audio(0.1, session_folder)
                if chunk is not None:
                    audio_data.extend(chunk)
                time.sleep(0.05)

            if is_recording:
                # The loop ended by time limit or exit request, not by SPACE.
                # Clear the flag, otherwise the outer loop would immediately
                # start another recording.
                is_recording = False
                if not should_exit:
                    print(Fore.CYAN + "Recording stopped (time limit reached)." + Style.RESET_ALL)

            if should_exit:
                break
            if not audio_data:
                continue

            audio_array = np.array(audio_data)
            audio_file_path = os.path.join(session_folder, f"audio_{int(time.time())}.wav")
            # Use the shared SAMPLE_WIDTH constant, as single_run_mode does,
            # rather than a hard-coded 2.
            wavio.write(audio_file_path, audio_array, RATE, sampwidth=SAMPLE_WIDTH)
            audio_files.append(audio_file_path)

            logging.info("Transcribing audio file: %s", audio_file_path)
            transcribed_text = transcribe_audio(audio_file_path, groq_client)
            if not transcribed_text:
                continue

            logging.info("Translating text: %s", transcribed_text)
            translated_text = translate_text(transcribed_text, content, config["openai"]["api_key"])
            save_transcription(session_folder, transcribed_text, translated_text)

            if args.voice:
                logging.info("Generating voice for translated text: %s", translated_text)
                ai_audio_path = voice_stream(translated_text, args.voice, session_folder, openai_client, play_audio)
                # NOTE(review): what voice_stream returns on failure is not
                # visible here; a falsy result is skipped so the previous
                # replay stays available and cleanup only sees real paths.
                if ai_audio_path:
                    last_ai_audio_path = ai_audio_path
                    audio_files.append(ai_audio_path)

            print_json_formatted({"Original": transcribed_text, "Translation": translated_text})
    except Exception as e:
        print(Fore.RED + f"\nAn error occurred: {e}" + Style.RESET_ALL)
    finally:
        listener.stop()
        # signal.signal() returns None when the old handler was not installed
        # from Python; that value cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGINT, previous_handler)
        handle_session_files(audio_files, session_folder, args.save_recordings)
def single_run_mode(content, args, session_folder):
    """
    Run the prompt-driven mode: one recording per user command.

    Args:
        content (str): System prompt passed to ``translate_text``.
        args (argparse.Namespace): Parsed CLI options; ``duration``, ``voice``
            and ``save_recordings`` are read here.
        session_folder (str): Folder that receives the recorded WAV files and
            the saved transcriptions.

    Returns:
        None

    Commands (as returned by ``single_run_input_loop``):
        - " " (a single space): record for ``args.duration`` seconds (20 if
          unset or zero), save the WAV, transcribe, translate, optionally
          speak the translation with ``args.voice``, and print both texts.
        - "r" (any case): replay the last synthesized translation.
        - "exit" (any case): leave the loop.

    ``KeyboardInterrupt`` ends the loop with a message. In every case
    ``handle_session_files`` runs at the end.
    """
    audio_files = []
    last_ai_audio_path = None
    print(Fore.GREEN + "Press the space bar to start recording, 'r' to replay the last translation, or 'exit' to quit:" + Style.RESET_ALL)
    try:
        while True:
            user_input = single_run_input_loop()
            if user_input == " ":
                audio_data = record_audio(args.duration or 20, session_folder)
                # continuous_run_mode treats None as a possible result of
                # record_audio, so guard before touching .size; otherwise the
                # failure path crashes with AttributeError.
                if audio_data is None or audio_data.size == 0:
                    print("Recording was interrupted or failed. Please try again.")
                    continue

                audio_file_path = os.path.join(session_folder, f"audio_{int(time.time())}.wav")
                wavio.write(audio_file_path, audio_data, RATE, sampwidth=SAMPLE_WIDTH)
                audio_files.append(audio_file_path)

                transcribed_text = transcribe_audio(audio_file_path, groq_client)
                if not transcribed_text:
                    continue

                translated_text = translate_text(transcribed_text, content, config["openai"]["api_key"])
                save_transcription(session_folder, transcribed_text, translated_text)
                if args.voice:
                    ai_audio_path = voice_stream(translated_text, args.voice, session_folder, openai_client, play_audio)
                    # NOTE(review): what voice_stream returns on failure is
                    # not visible here; a falsy result is skipped so the
                    # previous replay stays available.
                    if ai_audio_path:
                        audio_files.append(ai_audio_path)
                        last_ai_audio_path = ai_audio_path
                print_json_formatted({"Original": transcribed_text, "Translation": translated_text})
            elif user_input.lower() == "r":
                if last_ai_audio_path:
                    play_audio(file_path=last_ai_audio_path)
                else:
                    print("No previous translation to replay.")
            elif user_input.lower() == "exit":
                break
    except KeyboardInterrupt:
        print(Fore.RED + "\nInterrupt received, cleaning up and exiting..." + Style.RESET_ALL)
    finally:
        handle_session_files(audio_files, session_folder, args.save_recordings)
def handle_session_files(audio_files, session_folder, save_recordings=False):
    """
    Ask the user whether to delete the session's audio files, then do it.

    Parameters:
        audio_files (list): Paths of the files created during the session.
            Falsy entries (e.g. ``None``) are ignored.
        session_folder (str): Folder the session files live in; only used in
            the messages.
        save_recordings (bool, optional): When ``True`` nothing is asked or
            deleted; a message reports where the files are. Defaults to
            ``False``.

    Returns:
        None

    Behaviour:
        - Answering "d" (any case, surrounding whitespace ignored) deletes
          every listed file. Each failure is printed, and the final message
          reports success only if every deletion succeeded.
        - Any other answer keeps the files.
        - Ctrl+C or end-of-input (``EOFError``, e.g. closed stdin) at the
          prompt or during deletion keeps the remaining files.
    """
    if save_recordings:
        print(Fore.GREEN + f"All audio files are saved in {session_folder}." + Style.RESET_ALL)
        return

    try:
        user_input = input(Fore.YELLOW + "Press 'd' to delete or any other key to keep the session files: " + Style.RESET_ALL)
        if user_input.strip().lower() != "d":
            print(Fore.GREEN + f"Session files are saved in {session_folder}." + Style.RESET_ALL)
            return

        failed = 0
        for file_path in audio_files:
            if not file_path:
                continue  # nothing to delete for an empty / None entry
            try:
                os.remove(file_path)
            except (OSError, TypeError, ValueError) as e:
                failed += 1
                print(Fore.RED + f"Failed to delete file {file_path}: {e}" + Style.RESET_ALL)

        # Only claim success when every deletion actually succeeded.
        if failed:
            print(Fore.YELLOW + f"{failed} session file(s) could not be deleted and remain in {session_folder}." + Style.RESET_ALL)
        else:
            print(Fore.GREEN + "All session files have been deleted." + Style.RESET_ALL)
    except (KeyboardInterrupt, EOFError):
        print(Fore.YELLOW + "\nFile deletion skipped. Session files are kept." + Style.RESET_ALL)
def process_file(file_path, content, action_choice):
    """
    Transcribe one audio file, optionally translate it, and save the text.

    Args:
        file_path (str): Path of the audio file to transcribe.
        content (str): System prompt passed to ``translate_text``.
        action_choice (str): "1" means transcribe and translate; any other
            value means transcribe only.

    Returns:
        None

    The output file is named ``<input name without extension>_transcription.txt``
    and is written through ``save_to_desktop``. Nothing is written when the
    transcription comes back empty.

    Note:
        - Uses the module-level ``groq_client`` and ``config``.
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]
    text_file_name = f"{stem}_transcription.txt"

    transcribed_text = transcribe_audio(file_path, groq_client)
    if not transcribed_text:
        return

    if action_choice != "1":
        # Transcription only
        result_content = f"Transcription: {transcribed_text}"
    else:
        # Transcription plus translation
        translated_text = translate_text(transcribed_text, content, config["openai"]["api_key"])
        result_content = f"Original: {transcribed_text}\nTranslation: {translated_text}"

    save_to_desktop(text_file_name, result_content)
def main():
    """
    Program entry point: parse the CLI, build the prompt, dispatch to a mode.

    Prompt selection (from ``args.content``):
        - 'Smart Select': use ``SPECIAL_CONTENT`` and echo it to the console.
        - A key of ``language_map``: build the prompt with
          ``get_modified_content``.
        - Anything else (``None``, an empty string, or a value that is not a
          supported language such as the raw prompt template): ask the user
          to pick a language, then build the prompt. Previously an
          unsupported value reached ``get_modified_content`` and crashed
          with ``ValueError``.

    Mode selection:
        - With ``--file``: ask for an action and a path via
          ``get_file_processing_choices``; exit with status 1 if no action
          was chosen. A directory path is expanded to its ``*.wav`` files
          (a message is printed if there are none); any other path is
          processed as a single file.
        - Otherwise: create a session folder and run continuous mode when
          ``--continuous`` is set, single-run mode if not.

    Parameters:
        None

    Returns:
        None
    """
    args = parse_arguments()
    print_welcome_message()

    if args.content == 'Smart Select':
        content = SPECIAL_CONTENT
        print(Fore.CYAN + "\nContent being sent to API (Smart Select):" + Style.RESET_ALL)
        print(content)
        print()
    elif args.content in language_map:
        content = get_modified_content(args.content)
    else:
        # No usable language on the command line: ask interactively instead
        # of failing inside get_modified_content.
        selected_language = get_language_choice(language_map)
        content = get_modified_content(selected_language)

    if args.file:
        # NOTE(review): the value given to -f/--file is only used as a flag
        # here; the path to process comes from get_file_processing_choices().
        # Confirm this is intended.
        action_choice, path = get_file_processing_choices()
        if action_choice is None:
            sys.exit(1)
        if os.path.isdir(path):
            files_to_process = sorted(glob.glob(os.path.join(path, "*.wav")))
            if not files_to_process:
                print(Fore.YELLOW + f"No .wav files found in {path}." + Style.RESET_ALL)
        else:
            files_to_process = [path]
        for file_path in files_to_process:
            process_file(file_path, content, action_choice)
        return

    session_folder = create_session_folder()
    if args.continuous:
        continuous_run_mode(content, args, session_folder)
    else:
        single_run_mode(content, args, session_folder)


if __name__ == "__main__":
    main()