diff --git a/CHANGELOG.md b/CHANGELOG.md index f4e306b..73b4876 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,60 +1,60 @@ # Changelog -## [4.2.0](https://github.com/NeonGeckoCom/neon_speech/tree/4.2.0) (2023-10-27) +## [4.2.1a7](https://github.com/NeonGeckoCom/neon_speech/tree/4.2.1a7) (2023-12-13) -[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.1.1a6...4.2.0) +[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.2.1a6...4.2.1a7) -**Fixed bugs:** +**Merged pull requests:** -- \[BUG\] Docker `start_listening` resource missing [\#170](https://github.com/NeonGeckoCom/neon_speech/issues/170) +- Update neon-utils dependency to stable release [\#186](https://github.com/NeonGeckoCom/neon_speech/pull/186) ([NeonDaniel](https://github.com/NeonDaniel)) -## [4.1.1a6](https://github.com/NeonGeckoCom/neon_speech/tree/4.1.1a6) (2023-10-26) +## [4.2.1a6](https://github.com/NeonGeckoCom/neon_speech/tree/4.2.1a6) (2023-11-29) -[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.1.1a5...4.1.1a6) +[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.2.1a5...4.2.1a6) **Merged pull requests:** -- OVOS Dinkum Listener Backwards Compat [\#178](https://github.com/NeonGeckoCom/neon_speech/pull/178) ([NeonDaniel](https://github.com/NeonDaniel)) +- Override ovos.language.stt handler for server/API usage [\#185](https://github.com/NeonGeckoCom/neon_speech/pull/185) ([NeonDaniel](https://github.com/NeonDaniel)) -## [4.1.1a5](https://github.com/NeonGeckoCom/neon_speech/tree/4.1.1a5) (2023-10-26) +## [4.2.1a5](https://github.com/NeonGeckoCom/neon_speech/tree/4.2.1a5) (2023-11-22) -[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.1.1a4...4.1.1a5) +[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.2.1a4...4.2.1a5) **Merged pull requests:** -- Stable dependencies for release [\#177](https://github.com/NeonGeckoCom/neon_speech/pull/177) ([NeonDaniel](https://github.com/NeonDaniel)) +- Update global config on local user STT language change [\#184](https://github.com/NeonGeckoCom/neon_speech/pull/184) ([NeonDaniel](https://github.com/NeonDaniel)) -## [4.1.1a4](https://github.com/NeonGeckoCom/neon_speech/tree/4.1.1a4) (2023-10-13) +## [4.2.1a4](https://github.com/NeonGeckoCom/neon_speech/tree/4.2.1a4) (2023-11-22) -[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.1.1a3...4.1.1a4) +[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.2.1a3...4.2.1a4) **Merged pull requests:** -- Update Dinkum Listener dependency [\#176](https://github.com/NeonGeckoCom/neon_speech/pull/176) ([NeonDaniel](https://github.com/NeonDaniel)) +- Add timing metrics [\#183](https://github.com/NeonGeckoCom/neon_speech/pull/183) ([NeonDaniel](https://github.com/NeonDaniel)) -## [4.1.1a3](https://github.com/NeonGeckoCom/neon_speech/tree/4.1.1a3) (2023-10-03) +## [4.2.1a3](https://github.com/NeonGeckoCom/neon_speech/tree/4.2.1a3) (2023-11-14) -[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.1.1a2...4.1.1a3) +[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.2.1a2...4.2.1a3) **Merged pull requests:** -- Add timing metrics for minerva testing [\#175](https://github.com/NeonGeckoCom/neon_speech/pull/175) ([NeonDaniel](https://github.com/NeonDaniel)) +- Improved timing context handling with unit tests [\#182](https://github.com/NeonGeckoCom/neon_speech/pull/182) ([NeonDaniel](https://github.com/NeonDaniel)) -## [4.1.1a2](https://github.com/NeonGeckoCom/neon_speech/tree/4.1.1a2) (2023-07-28) +## [4.2.1a2](https://github.com/NeonGeckoCom/neon_speech/tree/4.2.1a2) (2023-11-10) -[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.1.1a1...4.1.1a2) +[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.2.1a1...4.2.1a2) **Merged pull requests:** -- Kubernetes/No-audio server compat. [\#174](https://github.com/NeonGeckoCom/neon_speech/pull/174) ([NeonDaniel](https://github.com/NeonDaniel)) +- Add timing metrics for audio input to handler in speech service [\#181](https://github.com/NeonGeckoCom/neon_speech/pull/181) ([NeonDaniel](https://github.com/NeonDaniel)) -## [4.1.1a1](https://github.com/NeonGeckoCom/neon_speech/tree/4.1.1a1) (2023-07-27) +## [4.2.1a1](https://github.com/NeonGeckoCom/neon_speech/tree/4.2.1a1) (2023-11-09) -[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.1.0...4.1.1a1) +[Full Changelog](https://github.com/NeonGeckoCom/neon_speech/compare/4.2.0...4.2.1a1) **Merged pull requests:** -- Update container config handling and resolve logged warnings [\#173](https://github.com/NeonGeckoCom/neon_speech/pull/173) ([NeonDaniel](https://github.com/NeonDaniel)) +- Resample API input wav audio to ensure format matches listener config [\#180](https://github.com/NeonGeckoCom/neon_speech/pull/180) ([NeonDaniel](https://github.com/NeonDaniel)) diff --git a/neon_speech/__init__.py b/neon_speech/__init__.py index 718d1b0..4af0cae 100644 --- a/neon_speech/__init__.py +++ b/neon_speech/__init__.py @@ -25,3 +25,6 @@ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Import to ensure patched class is applied +from neon_speech.transformers import NeonAudioTransformerService diff --git a/neon_speech/service.py b/neon_speech/service.py index 769842b..92199a1 100644 --- a/neon_speech/service.py +++ b/neon_speech/service.py @@ -27,6 +27,8 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +from typing import Dict + import ovos_dinkum_listener.plugins from tempfile import mkstemp @@ -80,8 +82,6 @@ def on_started(): class NeonSpeechClient(OVOSDinkumVoiceService): - _stopwatch = Stopwatch("get_stt") - def __init__(self, ready_hook=on_ready, error_hook=on_error, stopping_hook=on_stopping, alive_hook=on_alive, started_hook=on_started, watchdog=lambda: None, @@ -112,6 +112,8 @@ def __init__(self, ready_hook=on_ready, error_hook=on_error, watchdog=watchdog) self.daemon = daemonic self.config.bus = self.bus + self._stt_stopwatch = Stopwatch("get_stt", allow_reporting=True, + bus=self.bus) from neon_utils.signal_utils import init_signal_handlers, \ init_signal_bus init_signal_bus(self.bus) @@ -133,6 +135,37 @@ def __init__(self, ready_hook=on_ready, error_hook=on_error, LOG.info("Skipping api_stt init") self.api_stt = None + def _record_begin(self): + self._stt_stopwatch.start() + OVOSDinkumVoiceService._record_begin(self) + + def _stt_text(self, text: str, stt_context: dict): + self._stt_stopwatch.stop() + stt_context.setdefault("timing", dict()) + stt_context["timing"]["get_stt"] = self._stt_stopwatch.time + + # This is where the first Message of the interaction is created + OVOSDinkumVoiceService._stt_text(self, text, stt_context) + self._stt_stopwatch.report() + + def _save_stt(self, audio_bytes, stt_meta, save_path=None): + stopwatch = Stopwatch("save_audio", True, self.bus) + with stopwatch: + path = OVOSDinkumVoiceService._save_stt(self, audio_bytes, stt_meta, + save_path) + stt_meta.setdefault('timing', dict()) + stt_meta['timing']['save_audio'] = stopwatch.time + return path + + def _save_ww(self, audio_bytes, ww_meta, save_path=None): + stopwatch = Stopwatch("save_ww", True, self.bus) + with stopwatch: + path = OVOSDinkumVoiceService._save_ww(self, audio_bytes, ww_meta, + save_path) + ww_meta.setdefault('timing', dict()) + ww_meta['timing']['save_ww'] = stopwatch.time + return path + def _validate_message_context(self, message: Message, native_sources=None): if message.context.get('destination') and \ "audio" not in message.context['destination']: @@ -188,6 +221,16 @@ def register_event_handlers(self): self.bus.on("neon.enable_wake_word", self.handle_enable_wake_word) self.bus.on("neon.disable_wake_word", self.handle_disable_wake_word) + def _handle_get_languages_stt(self, message): + if self.config.get('listener', {}).get('enable_voice_loop', True): + return OVOSDinkumVoiceService._handle_get_languages_stt(self, + message) + # For server use, get the API STT langs + stt_langs = self.api_stt.available_languages or \ + [self.config.get('lang') or 'en-us'] + LOG.debug(f"Got stt_langs: {stt_langs}") + self.bus.emit(message.response({'langs': list(stt_langs)})) + def handle_disable_wake_word(self, message: Message): """ Disable a wake word. If the requested wake word is the only one enabled, @@ -295,10 +338,18 @@ def handle_profile_update(self, message): :param message: Message associated with profile update """ updated_profile = message.data.get("profile") - if updated_profile["user"]["username"] == \ + if updated_profile["user"]["username"] != \ self._default_user["user"]["username"]: - apply_local_user_profile_updates(updated_profile, - self._default_user) + LOG.info(f"Ignoring profile update for " + f"{updated_profile['user']['username']}") + return + apply_local_user_profile_updates(updated_profile, + self._default_user) + if updated_profile.get("speech", {}).get("stt_language"): + new_stt_lang = updated_profile["speech"]["stt_language"] + if new_stt_lang != self.config['lang']: + from neon_speech.utils import patch_config + patch_config({"lang": new_stt_lang}) def handle_wake_words_state(self, message): """ @@ -327,6 +378,7 @@ def handle_get_stt(self, message: Message): Emits a response to the sender with stt data or error data :param message: Message associated with request """ + received_time = time() if message.data.get("audio_data"): wav_file_path = self._write_encoded_file( message.data.pop("audio_data")) @@ -334,24 +386,38 @@ def handle_get_stt(self, message: Message): wav_file_path = message.data.get("audio_file") lang = message.data.get("lang") ident = message.context.get("ident") or "neon.get_stt.response" + + message.context.setdefault("timing", dict()) LOG.info(f"Handling STT request: {ident}") if not wav_file_path: + message.context['timing']['response_sent'] = time() self.bus.emit(message.reply( ident, data={"error": f"audio_file not specified!"})) return if not os.path.isfile(wav_file_path): + message.context['timing']['response_sent'] = time() self.bus.emit(message.reply( ident, data={"error": f"{wav_file_path} Not found!"})) try: + _, parser_data, transcriptions = \ self._get_stt_from_file(wav_file_path, lang) + timing = parser_data.pop('timing') + message.context["timing"] = {**message.context["timing"], **timing} + sent_time = message.context["timing"].get("client_sent", + received_time) + if received_time != sent_time: + message.context['timing']['client_to_core'] = \ + received_time - sent_time + message.context['timing']['response_sent'] = time() self.bus.emit(message.reply(ident, data={"parser_data": parser_data, "transcripts": transcriptions})) except Exception as e: LOG.error(e) + message.context['timing']['response_sent'] = time() self.bus.emit(message.reply(ident, data={"error": repr(e)})) def handle_audio_input(self, message): @@ -370,11 +436,18 @@ def build_context(msg: Message): 'username': self._default_user["user"]["username"] or "local", 'user_profiles': [self._default_user.content]} - ctx = {**defaults, **ctx, 'destination': ['skills'], - 'timing': {'start': msg.data.get('time'), - 'transcribed': time()}} + ctx = {**defaults, **ctx, 'destination': ['skills']} + ctx['timing'] = {**ctx.get('timing', {}), + **{'start': msg.data.get('time'), + 'transcribed': time()}} return ctx + received_time = time() + sent_time = message.context.get("timing", {}).get("client_sent", + received_time) + if received_time != sent_time: + message.context['timing']['client_to_core'] = \ + received_time - sent_time ident = message.context.get("ident") or "neon.audio_input.response" LOG.info(f"Handling audio input: {ident}") if message.data.get("audio_data"): @@ -384,18 +457,23 @@ def build_context(msg: Message): wav_file_path = message.data.get("audio_file") lang = message.data.get("lang") try: - with self._stopwatch: - _, parser_data, transcriptions = \ - self._get_stt_from_file(wav_file_path, lang) + # _=transformed audio_data + _, parser_data, transcriptions = \ + self._get_stt_from_file(wav_file_path, lang) + timing = parser_data.pop('timing') message.context["audio_parser_data"] = parser_data + message.context.setdefault('timing', dict()) + message.context['timing'] = {**timing, **message.context['timing']} context = build_context(message) - context['timing']['get_stt'] = self._stopwatch.time data = { "utterances": transcriptions, "lang": message.data.get("lang", "en-us") } + # Send a new message to the skills module with proper routing ctx handled = self._emit_utterance_to_skills(Message( 'recognizer_loop:utterance', data, context)) + + # Reply to original message with transcription/audio parser data self.bus.emit(message.reply(ident, data={"parser_data": parser_data, "transcripts": transcriptions, @@ -423,7 +501,7 @@ def handle_offline(self, _): Handle notification to operate in offline mode """ LOG.info("Offline mode selected, Reloading STT Plugin") - config = dict(self.config) + config: Dict[str, dict] = dict(self.config) if config['stt'].get('offline_module'): config['stt']['module'] = config['stt'].get('offline_module') self.voice_loop.stt = STTFactory.create(config) @@ -456,35 +534,48 @@ def _get_stt_from_file(self, wav_file: str, :return: (AudioData of object, extracted context, transcriptions) """ from neon_utils.file_utils import get_audio_file_stream - lang = lang or 'en-us' # TODO: read default from config - segment = AudioSegment.from_file(wav_file) + _stopwatch = Stopwatch() + lang = lang or self.config.get('lang') + desired_sample_rate = self.config['listener'].get('sample_rate', 16000) + desired_sample_width = self.config['listener'].get('sample_width', 2) + segment = (AudioSegment.from_file(wav_file).set_channels(1) + .set_frame_rate(desired_sample_rate) + .set_sample_width(desired_sample_width)) + LOG.debug(f"Audio fr={segment.frame_rate},sw={segment.sample_width}," + f"fw={segment.frame_width},ch={segment.channels}") audio_data = AudioData(segment.raw_data, segment.frame_rate, segment.sample_width) - audio_stream = get_audio_file_stream(wav_file) if not self.api_stt: raise RuntimeError("api_stt not initialized." " is `listener['enable_stt_api'] set to False?") - if hasattr(self.api_stt, 'stream_start'): - if self.lock.acquire(True, 30): - LOG.info(f"Starting STT processing (lang={lang}): {wav_file}") - self.api_stt.stream_start(lang) - while True: - try: - data = audio_stream.read(1024) - self.api_stt.stream_data(data) - except EOFError: - break - transcriptions = self.api_stt.stream_stop() - self.lock.release() + with _stopwatch: + if hasattr(self.api_stt, 'stream_start'): + audio_stream = get_audio_file_stream(wav_file, desired_sample_rate) + if self.lock.acquire(True, 30): + LOG.info(f"Starting STT processing (lang={lang}): {wav_file}") + self.api_stt.stream_start(lang) + while True: + try: + data = audio_stream.read(1024) + self.api_stt.stream_data(data) + except EOFError: + break + transcriptions = self.api_stt.stream_stop() + self.lock.release() + else: + LOG.error(f"Timed out acquiring lock, not processing: {wav_file}") + transcriptions = [] else: - LOG.error(f"Timed out acquiring lock, not processing: {wav_file}") - transcriptions = [] - else: - transcriptions = self.api_stt.execute(audio_data, lang) - if isinstance(transcriptions, str): - LOG.warning("Transcriptions is a str, no alternatives provided") - transcriptions = [transcriptions] - audio, audio_context = self.transformers.transform(audio_data) + transcriptions = self.api_stt.execute(audio_data, lang) + if isinstance(transcriptions, str): + LOG.warning("Transcriptions is a str, no alternatives provided") + transcriptions = [transcriptions] + + get_stt = float(_stopwatch.time) + with _stopwatch: + audio, audio_context = self.transformers.transform(audio_data) + audio_context["timing"] = {"get_stt": get_stt, + "transform_audio": _stopwatch.time} LOG.info(f"Transcribed: {transcriptions}") return audio, audio_context, transcriptions diff --git a/neon_speech/transformers.py b/neon_speech/transformers.py new file mode 100644 index 0000000..dde3179 --- /dev/null +++ b/neon_speech/transformers.py @@ -0,0 +1,47 @@ +# NEON AI (TM) SOFTWARE, Software Development Kit & Application Framework +# All trademark and other rights reserved by their respective owners +# Copyright 2008-2022 Neongecko.com Inc. +# Contributors: Daniel McKnight, Guy Daniels, Elon Gasper, Richard Leeds, +# Regina Bloomstine, Casimiro Ferreira, Andrii Pernatii, Kirill Hrymailo +# BSD-3 License +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import ovos_dinkum_listener.transformers +from neon_utils.metrics_utils import Stopwatch +from ovos_dinkum_listener.transformers import AudioTransformersService + + +class NeonAudioTransformerService(AudioTransformersService): + """ + Overrides the default AudioTransformersService to add timing metrics + """ + + def transform(self, chunk: bytes) -> (bytes, dict): + stopwatch = Stopwatch("transform_audio", True, self.bus) + with stopwatch: + chunk, context = AudioTransformersService.transform(self, chunk) + context.setdefault("timing", dict()) + context['timing']['transform_audio'] = stopwatch.time + return chunk, context + + +ovos_dinkum_listener.transformers.AudioTransformersService = NeonAudioTransformerService diff --git a/requirements/requirements.txt b/requirements/requirements.txt index c9f6b97..b10137e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -4,7 +4,7 @@ ovos-utils~=0.0.30 ovos-plugin-manager~=0.0.23 click~=8.0 click-default-group~=1.2 -neon-utils[network,audio]~=1.6 +neon-utils[network,audio]~=1.7 ovos-config~=0.0.7 ovos-vad-plugin-webrtcvad~=0.0.1 diff --git a/tests/api_method_tests.py b/tests/api_method_tests.py index c1c4d07..2354b52 100644 --- a/tests/api_method_tests.py +++ b/tests/api_method_tests.py @@ -69,18 +69,18 @@ def setUpClass(cls) -> None: test_config["listener"]["VAD"]["module"] = "dummy" assert test_config["stt"]["module"] == "deepspeech_stream_local" + ready_event = Event() + + def _ready(): + ready_event.set() + cls.speech_service = NeonSpeechClient(speech_config=test_config, - daemonic=False, bus=cls.bus) + daemonic=False, bus=cls.bus, + ready_hook=_ready) assert cls.speech_service.config["stt"]["module"] == "deepspeech_stream_local" cls.speech_service.start() - ready = False - timeout = time() + 120 - while not ready and time() < timeout: - message = cls.bus.wait_for_response( - Message("mycroft.voice.is_ready")) - if message: - ready = message.data.get("status") - if not ready: + + if not ready_event.wait(120): raise TimeoutError("Speech module not ready after 120 seconds") from ovos_plugin_manager.templates import STT assert isinstance(cls.speech_service.voice_loop.stt, STT) @@ -105,6 +105,8 @@ def test_get_stt_no_file(self): {}, context), context["ident"]) self.assertEqual(stt_resp.context, context) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("error"), str) self.assertEqual(stt_resp.data["error"], "audio_file not specified!") @@ -114,8 +116,11 @@ def test_get_stt_invalid_file_path(self): "user": "TestRunner"} stt_resp = self.bus.wait_for_response( Message("neon.get_stt", {"audio_file": "~/invalid_file.wav"}, - context), context["ident"]) - self.assertEqual(stt_resp.context, context) + dict(context)), context["ident"]) + for key in context: + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("error"), str) def test_get_stt_invalid_file_type(self): @@ -125,24 +130,36 @@ def test_get_stt_invalid_file_type(self): stt_resp = self.bus.wait_for_response( Message("neon.get_stt", {"audio_file": os.path.join(AUDIO_FILE_PATH, "test.txt")}, - context), context["ident"]) - self.assertEqual(stt_resp.context, context) + dict(context)), context["ident"]) + for key in context: + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("error"), str) def test_get_stt_valid_file(self): context = {"client": "tester", "ident": "12345", - "user": "TestRunner"} + "user": "TestRunner", + "timing": {"client_sent": time()}} stt_resp = self.bus.wait_for_response(Message( "neon.get_stt", {"audio_file": os.path.join(AUDIO_FILE_PATH, "stop.wav")}, - context), context["ident"], 60.0) - self.assertEqual(stt_resp.context, context) + dict(context)), context["ident"], 60.0) + for key in context: + if key != 'timing': + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("parser_data"), dict, stt_resp.serialize()) self.assertIsInstance(stt_resp.data.get("transcripts"), list, stt_resp.serialize()) self.assertIn("stop", stt_resp.data.get("transcripts")) + self.assertEqual(stt_resp.context['timing']['client_sent'], + context['timing']['client_sent'], stt_resp.context) + self.assertIsInstance(stt_resp.context['timing']['client_to_core'], + float, stt_resp.context) def test_get_stt_valid_contents(self): context = {"client": "tester", @@ -152,8 +169,11 @@ def test_get_stt_valid_contents(self): "stop.wav")) stt_resp = self.bus.wait_for_response(Message( "neon.get_stt", {"audio_data": audio_data}, - context), context["ident"], 60.0) - self.assertEqual(stt_resp.context, context) + dict(context)), context["ident"], 60.0) + for key in context: + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("parser_data"), dict, stt_resp.serialize()) self.assertIsInstance(stt_resp.data.get("transcripts"), list, @@ -166,29 +186,38 @@ def test_audio_input_valid(self): context = {"client": "tester", "ident": "11111", "user": "TestRunner", - "extra_data": "something"} + "extra_data": "something", + "timing": {"client_sent": time()}} audio_data = encode_file_to_base64_string(os.path.join(AUDIO_FILE_PATH, "stop.wav")) stt_resp = self.bus.wait_for_response(Message( "neon.audio_input", {"audio_data": audio_data}, - context), context["ident"], 60.0) + dict(context)), context["ident"], 60.0) self.assertIsInstance(stt_resp, Message) for key in context: self.assertIn(key, stt_resp.context) - self.assertEqual(context[key], stt_resp.context[key]) + if key != "timing": + self.assertEqual(context[key], stt_resp.context[key]) self.assertIsInstance(stt_resp.data.get("skills_recv"), bool, stt_resp.serialize()) + self.assertIsInstance(stt_resp.context['timing']['client_to_core'], + float, stt_resp.context) handle_utterance.assert_called_once() message = handle_utterance.call_args[0][0] self.assertIsInstance(message, Message) for key in context: self.assertIn(key, message.context) - self.assertEqual(context[key], message.context[key]) + if key != "timing": + self.assertEqual(context[key], message.context[key]) self.assertIsInstance(message.data["utterances"], list, message.data) self.assertIn("stop", message.data["utterances"], message.data.get("utterances")) self.assertIsInstance(message.context["timing"], dict) + self.assertIsInstance(message.context['timing']['client_to_core'], + float, message.context) + self.assertIsInstance(message.context['timing']['transcribed'], float, + message.context) self.assertEqual(message.context["destination"], ["skills"]) def test_wake_words_state(self): @@ -256,19 +285,19 @@ def setUpClass(cls) -> None: test_config["listener"]["VAD"]["module"] = "dummy" assert test_config["stt"]["module"] == "neon-stt-plugin-nemo" + ready_event = Event() + + def _ready(): + ready_event.set() + cls.speech_service = NeonSpeechClient(speech_config=test_config, - daemonic=False, bus=cls.bus) + daemonic=False, bus=cls.bus, + ready_hook=_ready) assert cls.speech_service.config["stt"]["module"] == \ "neon-stt-plugin-nemo" cls.speech_service.start() - ready = False - timeout = time() + 120 - while not ready and time() < timeout: - message = cls.bus.wait_for_response( - Message("mycroft.voice.is_ready")) - if message: - ready = message.data.get("status") - if not ready: + + if not ready_event.wait(120): raise TimeoutError("Speech module not ready after 120 seconds") from ovos_plugin_manager.templates import STT assert isinstance(cls.speech_service.voice_loop.stt, STT) @@ -290,9 +319,12 @@ def test_get_stt_no_file(self): "ident": "123", "user": "TestRunner"} stt_resp = self.bus.wait_for_response(Message("neon.get_stt", - {}, context), + {}, dict(context)), context["ident"]) - self.assertEqual(stt_resp.context, context) + for key in context: + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("error"), str) self.assertEqual(stt_resp.data["error"], "audio_file not specified!") @@ -302,8 +334,11 @@ def test_get_stt_invalid_file_path(self): "user": "TestRunner"} stt_resp = self.bus.wait_for_response( Message("neon.get_stt", {"audio_file": "~/invalid_file.wav"}, - context), context["ident"]) - self.assertEqual(stt_resp.context, context) + dict(context)), context["ident"]) + for key in context: + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("error"), str) def test_get_stt_invalid_file_type(self): @@ -313,8 +348,11 @@ def test_get_stt_invalid_file_type(self): stt_resp = self.bus.wait_for_response( Message("neon.get_stt", {"audio_file": os.path.join(AUDIO_FILE_PATH, "test.txt")}, - context), context["ident"]) - self.assertEqual(stt_resp.context, context) + dict(context)), context["ident"]) + for key in context: + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("error"), str) def test_get_stt_valid_file(self): @@ -324,8 +362,11 @@ def test_get_stt_valid_file(self): stt_resp = self.bus.wait_for_response(Message( "neon.get_stt", {"audio_file": os.path.join(AUDIO_FILE_PATH, "stop.wav")}, - context), context["ident"], 60.0) - self.assertEqual(stt_resp.context, context) + dict(context)), context["ident"], 60.0) + for key in context: + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("parser_data"), dict, stt_resp.serialize()) self.assertIsInstance(stt_resp.data.get("transcripts"), list, @@ -340,8 +381,11 @@ def test_get_stt_valid_contents(self): "stop.wav")) stt_resp = self.bus.wait_for_response(Message( "neon.get_stt", {"audio_data": audio_data}, - context), context["ident"], 60.0) - self.assertEqual(stt_resp.context, context) + dict(context)), context["ident"], 60.0) + for key in context: + self.assertEqual(stt_resp.context[key], context[key]) + self.assertIsInstance(stt_resp.context['timing']['response_sent'], + float) self.assertIsInstance(stt_resp.data.get("parser_data"), dict, stt_resp.serialize()) self.assertIsInstance(stt_resp.data.get("transcripts"), list, @@ -359,11 +403,12 @@ def test_audio_input_valid(self): "stop.wav")) stt_resp = self.bus.wait_for_response(Message( "neon.audio_input", {"audio_data": audio_data}, - context), context["ident"], 60.0) + dict(context)), context["ident"], 60.0) self.assertIsInstance(stt_resp, Message) for key in context: self.assertIn(key, stt_resp.context) - self.assertEqual(context[key], stt_resp.context[key]) + if key != "timing": + self.assertEqual(context[key], stt_resp.context[key]) self.assertIsInstance(stt_resp.data.get("skills_recv"), bool, stt_resp.serialize()) @@ -372,7 +417,8 @@ def test_audio_input_valid(self): self.assertIsInstance(message, Message) for key in context: self.assertIn(key, message.context) - self.assertEqual(context[key], message.context[key]) + if key != "timing": + self.assertEqual(context[key], message.context[key]) self.assertIsInstance(message.data["utterances"], list, message.data) self.assertIn("stop", message.data["utterances"], message.data.get("utterances")) diff --git a/version.py b/version.py index de7eb74..1102bcb 100644 --- a/version.py +++ b/version.py @@ -26,4 +26,4 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -__version__ = "4.2.0" +__version__ = "4.3.0"