NeonGeckoCom · NeonDaniel · Nov 6, 2023 · Nov 5, 2023 · Nov 6, 2023 · NeonDaniel
diff --git a/neon_iris/client.py b/neon_iris/client.py
@@ -283,7 +283,8 @@ def _send_audio(self, audio_file: str, lang: str,
         audio_data = encode_file_to_base64_string(audio_file)
         message = self._build_message("neon.audio_input",
                                       {"lang": lang,
-                                       "audio_data": audio_data},
+                                       "audio_data": audio_data,
+                                       "utterances": []},
                                       username, user_profiles)
         serialized = {"msg_type": message.msg_type,
                       "data": message.data,

diff --git a/neon_iris/web_client.py b/neon_iris/web_client.py
@@ -23,20 +23,22 @@
 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-from os.path import isfile
-from typing import List
+from os.path import isfile, join
+from time import time
+from typing import List, Optional
 
 import gradio
 
 from threading import Event
 from ovos_bus_client import Message
 from ovos_config import Configuration
 from ovos_utils import LOG
-
 from neon_utils.file_utils import decode_base64_string_to_file
+from ovos_utils.xdg_utils import xdg_data_home
 
 from neon_iris.client import NeonAIClient
-
+import librosa
+import soundfile as sf
 
 class GradIOClient(NeonAIClient):
     def __init__(self, lang: str = None):
@@ -45,6 +47,7 @@ def __init__(self, lang: str = None):
         NeonAIClient.__init__(self, config.get("MQ"))
         self._await_response = Event()
         self._response = None
+        self._current_tts = None
         self.lang = lang or self.config.get('default_lang') or \
                     self.config.get('languages', ['en-us'])[0]
         self.chat_ui = gradio.Blocks()
@@ -72,6 +75,42 @@ def update_profile(self, stt_lang: str, tts_lang: str, tts_lang_2: str):
         from neon_utils.user_utils import apply_local_user_profile_updates
         apply_local_user_profile_updates(profile_update, self._user_config)
 
+    def send_audio(self, audio_file: str, lang: str = "en-us",
+                   username: Optional[str] = None,
+                   user_profiles: Optional[list] = None):
+        """
+        Convert an audio file with `convert_audio`, then send the result
+        @param audio_file: path to the audio file to convert and send
+        @param lang: language code forwarded with the request
+        @param username: username forwarded with the request
+        @param user_profiles: user profiles forwarded with the request
+        """
+        converted_path = self.convert_audio(audio_file)
+        self._send_audio(converted_path, lang, username, user_profiles)
+
+    def convert_audio(self, audio_file: str, target_sr=16000, target_channels=1, dtype='int16') -> str:
+        """
+        Resample (and optionally downmix) audio to a 16-bit PCM WAV file.
+        @param audio_file: path to audio file to convert for speech model
+        @param target_sr: sample rate in Hz of the converted file
+        @param target_channels: if 1, multi-channel input is mixed to mono
+        @param dtype: numpy dtype samples are cast to before writing
+        @returns: path to converted audio file
+        """
+        from os import makedirs  # local import; module only imports os.path
+        y, sr = librosa.load(audio_file, sr=None, mono=False)
+        if y.ndim > 1 and target_channels == 1:
+            y = librosa.to_mono(y)
+        y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
+        # Clip before the cast so full-scale samples saturate, not wrap
+        y_resampled = (y_resampled * 2**15).clip(-2**15, 2**15 - 1).astype(dtype)
+        output_dir = join(xdg_data_home(), "iris", "stt")
+        makedirs(output_dir, exist_ok=True)  # sf.write does not create it
+        output_path = join(output_dir, f"{time()}.wav")
+        # soundfile expects (frames, channels), hence the transpose
+        sf.write(output_path, y_resampled.T, target_sr, format='WAV', subtype='PCM_16')
+        LOG.info(f"Converted audio file to {output_path}")
+        return output_path
+
     def on_user_input(self, utterance: str, *args, **kwargs) -> str:
         """
         Callback to handle textual user input
@@ -82,12 +121,21 @@ def on_user_input(self, utterance: str, *args, **kwargs) -> str:
         LOG.info(kwargs)
         self._await_response.clear()
         self._response = None
-        self.send_utterance(utterance, self.lang)
+        if utterance:
+            LOG.info(f"Sending utterance: {utterance}")
+            self.send_utterance(utterance, self.lang)
+        else:
+            LOG.info(f"Sending audio: {args[1]} with lang: {self.lang}")
+            self.send_audio(args[1], self.lang)
         self._await_response.wait(30)
         self._response = self._response or "ERROR"
-        LOG.info(f"Response={self._response}")
+        LOG.info(f"Got response={self._response}")
         return self._response
 
+    def play_tts(self):
+        """
+        Log and return the most recently stored TTS audio file path
+        @returns: current value of `self._current_tts`
+        """
+        latest_tts = self._current_tts
+        LOG.info(f"Playing most recent TTS file {latest_tts}")
+        return latest_tts
+
     def run(self):
         """
         Blocking method to start the web server
@@ -99,22 +147,28 @@ def run(self):
         address = self.config.get("server_address") or "0.0.0.0"
         port = self.config.get("server_port") or 7860
 
-        audio_input = gradio.Audio(source="microphone", type="filepath")
         chatbot = gradio.Chatbot(label=description)
         textbox = gradio.Textbox(placeholder=placeholder)
 
         with self.chat_ui as blocks:
             # Define primary UI
+            audio_input = gradio.Audio(source="microphone",
+                            type="filepath",
+                            label="Talk to NEON",
+                            auto_submit=True)
             gradio.ChatInterface(self.on_user_input,
                                  chatbot=chatbot,
                                  textbox=textbox,
-                                 additional_inputs=audio_input,
+                                 additional_inputs=[audio_input],
                                  title=title,
                                  retry_btn=None,
-                                 undo_btn=None)
+                                 undo_btn=None,)
+            tts_audio = gradio.Audio(autoplay=True, visible=True,
+                                     label="Neon's Response")
+            tts_button = gradio.Button("Play TTS")
+            tts_button.click(self.play_tts,
+                             outputs=[tts_audio])
             # Define settings UI
-            with gradio.Row():
-                submit = gradio.Button("Update User Settings")
             with gradio.Row():
                 with gradio.Column():
                     stt_lang = gradio.Radio(label="Input Language",
@@ -127,6 +181,7 @@ def run(self):
                                               choices=[None] +
                                               self.supported_languages,
                                               value=None)
+                    submit = gradio.Button("Update User Settings")
                 with gradio.Column():
                     # TODO: Unit settings
                     pass
@@ -156,6 +211,9 @@ def handle_klat_response(self, message: Message):
                 for gender, data in response["audio"].items():
                     filepath = "/".join([self.audio_cache_dir] +
                                         response[gender].split('/')[-4:])
+                    # TODO: This only plays the most recent, so it doesn't support
+                    # multiple languages
+                    self._current_tts = filepath
                     files.append(filepath)
                     if not isfile(filepath):
                         decode_base64_string_to_file(data, filepath)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -3,4 +3,5 @@ click-default-group~=1.2
 neon-utils~=1.0
 pyyaml>=5.4,<7.0.0
 neon-mq-connector~=0.7,>=0.7.1a4
-ovos-bus-client~=0.0.3
+ovos-bus-client~=0.0.3
+librosa~=0.10.1