Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tts/stt and styling #25

Merged
merged 2 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion neon_iris/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ def _send_audio(self, audio_file: str, lang: str,
audio_data = encode_file_to_base64_string(audio_file)
message = self._build_message("neon.audio_input",
{"lang": lang,
"audio_data": audio_data},
"audio_data": audio_data,
"utterances": []},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should have a TODO note that it's patching a connector bug

username, user_profiles)
serialized = {"msg_type": message.msg_type,
"data": message.data,
Expand Down
80 changes: 69 additions & 11 deletions neon_iris/web_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,22 @@
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from os.path import isfile
from typing import List
from os.path import isfile, join
from time import time
from typing import List, Optional

import gradio

from threading import Event
from ovos_bus_client import Message
from ovos_config import Configuration
from ovos_utils import LOG

from neon_utils.file_utils import decode_base64_string_to_file
from ovos_utils.xdg_utils import xdg_data_home

from neon_iris.client import NeonAIClient

import librosa
import soundfile as sf

class GradIOClient(NeonAIClient):
def __init__(self, lang: str = None):
Expand All @@ -45,6 +47,7 @@ def __init__(self, lang: str = None):
NeonAIClient.__init__(self, config.get("MQ"))
self._await_response = Event()
self._response = None
self._current_tts = None
self.lang = lang or self.config.get('default_lang') or \
self.config.get('languages', ['en-us'])[0]
self.chat_ui = gradio.Blocks()
Expand Down Expand Up @@ -72,6 +75,42 @@ def update_profile(self, stt_lang: str, tts_lang: str, tts_lang_2: str):
from neon_utils.user_utils import apply_local_user_profile_updates
apply_local_user_profile_updates(profile_update, self._user_config)

def send_audio(self, audio_file: str, lang: str = "en-us",
               username: Optional[str] = None,
               user_profiles: Optional[list] = None):
    """
    Convert an audio file for the speech module and send it as a request.
    The file is first run through `convert_audio`; the path returned by
    that call (not the original path) is what gets passed to `_send_audio`.
    @param audio_file: path to audio file to send to speech module
    @param lang: language code associated with request
    @param username: username associated with request
    @param user_profiles: user profiles expecting a response
    """
    converted_file = self.convert_audio(audio_file)
    self._send_audio(converted_file, lang, username, user_profiles)

def convert_audio(self, audio_file: str, target_sr=16000, target_channels=1, dtype='int16') -> str:
    """
    Convert an audio file to a 16-bit PCM WAV file for the speech model.
    The converted file is written to `<xdg_data_home>/iris/stt/`, named
    with the current timestamp.
    @param audio_file: path to audio file to convert for speech model
    @param target_sr: sample rate (Hz) of the converted file
    @param target_channels: when 1, multi-channel input is mixed down to
        mono; any other value leaves the input channel count unchanged
    @param dtype: numpy dtype the scaled samples are cast to before
        writing; the file itself is always written as PCM_16
    @returns: path to converted audio file
    """
    # Local import so this method does not depend on a file-level import
    from os import makedirs

    # Load without changing sample rate or channels
    samples, orig_sr = librosa.load(audio_file, sr=None, mono=False)

    # If the file has more than one channel, mix it down to one channel
    if samples.ndim > 1 and target_channels == 1:
        samples = librosa.to_mono(samples)

    # Resample the audio to the target sample rate
    resampled = librosa.resample(samples, orig_sr=orig_sr,
                                 target_sr=target_sr)

    # Scale float samples to the 16-bit range. A full-scale sample (1.0)
    # scales to 32768, which does not fit in int16 and would wrap around
    # to -32768, so clip to the representable range before casting.
    pcm = (resampled * 32768).clip(-32768, 32767).astype(dtype)

    # librosa arrays are (channels, frames); soundfile expects
    # (frames, channels), so transpose when channels were kept.
    if pcm.ndim > 1:
        pcm = pcm.T

    # The output directory may not exist yet (e.g. first run)
    output_dir = join(xdg_data_home(), "iris", "stt")
    makedirs(output_dir, exist_ok=True)
    output_path = join(output_dir, f"{time()}.wav")

    # Save the audio file with the new sample rate and sample width
    sf.write(output_path, pcm, target_sr, format='WAV', subtype='PCM_16')
    LOG.info(f"Converted audio file to {output_path}")
    return output_path

def on_user_input(self, utterance: str, *args, **kwargs) -> str:
"""
Callback to handle textual user input
Expand All @@ -82,12 +121,21 @@ def on_user_input(self, utterance: str, *args, **kwargs) -> str:
LOG.info(kwargs)
self._await_response.clear()
self._response = None
self.send_utterance(utterance, self.lang)
if utterance:
LOG.info(f"Sending utterance: {utterance}")
self.send_utterance(utterance, self.lang)
else:
LOG.info(f"Sending audio: {args[1]} with lang: {self.lang}")
self.send_audio(args[1], self.lang)
self._await_response.wait(30)
self._response = self._response or "ERROR"
LOG.info(f"Response={self._response}")
LOG.info(f"Got response={self._response}")
return self._response

def play_tts(self):
    """
    Log and return the most recently stored TTS file reference
    @returns: current value of `self._current_tts`
    """
    latest_tts = self._current_tts
    LOG.info(f"Playing most recent TTS file {latest_tts}")
    return latest_tts

def run(self):
"""
Blocking method to start the web server
Expand All @@ -99,22 +147,28 @@ def run(self):
address = self.config.get("server_address") or "0.0.0.0"
port = self.config.get("server_port") or 7860

audio_input = gradio.Audio(source="microphone", type="filepath")
chatbot = gradio.Chatbot(label=description)
textbox = gradio.Textbox(placeholder=placeholder)

with self.chat_ui as blocks:
# Define primary UI
audio_input = gradio.Audio(source="microphone",
type="filepath",
label="Talk to NEON",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should read from config

auto_submit=True)
gradio.ChatInterface(self.on_user_input,
chatbot=chatbot,
textbox=textbox,
additional_inputs=audio_input,
additional_inputs=[audio_input],
title=title,
retry_btn=None,
undo_btn=None)
undo_btn=None,)
tts_audio = gradio.Audio(autoplay=True, visible=True,
label="Neon's Response")
tts_button = gradio.Button("Play TTS")
tts_button.click(self.play_tts,
outputs=[tts_audio])
# Define settings UI
with gradio.Row():
submit = gradio.Button("Update User Settings")
with gradio.Row():
with gradio.Column():
stt_lang = gradio.Radio(label="Input Language",
Expand All @@ -127,6 +181,7 @@ def run(self):
choices=[None] +
self.supported_languages,
value=None)
submit = gradio.Button("Update User Settings")
with gradio.Column():
# TODO: Unit settings
pass
Expand Down Expand Up @@ -156,6 +211,9 @@ def handle_klat_response(self, message: Message):
for gender, data in response["audio"].items():
filepath = "/".join([self.audio_cache_dir] +
response[gender].split('/')[-4:])
# TODO: This only plays the most recent, so it doesn't support
# multiple languages
self._current_tts = filepath
files.append(filepath)
if not isfile(filepath):
decode_base64_string_to_file(data, filepath)
Expand Down
3 changes: 2 additions & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ click-default-group~=1.2
neon-utils~=1.0
pyyaml>=5.4,<7.0.0
neon-mq-connector~=0.7,>=0.7.1a4
ovos-bus-client~=0.0.3
ovos-bus-client~=0.0.3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be replaced with neon-utils[audio]

librosa~=0.10.1
Loading