From fa904f6fb78b74e67464104d391541a02238a818 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bartek=20=C5=81ukawski?=
Date: Tue, 12 Dec 2023 20:17:02 +0100
Subject: [PATCH] Re-implement speechSynthesis.py

---
 programs/speechSynthesis/speechSynthesis.py | 306 +++++++++++---------
 programs/speechSynthesis/test.py            | 158 ----------
 2 files changed, 175 insertions(+), 289 deletions(-)
 delete mode 100644 programs/speechSynthesis/test.py

diff --git a/programs/speechSynthesis/speechSynthesis.py b/programs/speechSynthesis/speechSynthesis.py
index 0b81a82..b809fd9 100644
--- a/programs/speechSynthesis/speechSynthesis.py
+++ b/programs/speechSynthesis/speechSynthesis.py
@@ -1,181 +1,225 @@
 #!/usr/bin/env python3
 
-# adapted from https://github.com/MycroftAI/mimic3/blob/be72c18/mimic3_tts/__main__.py
-
 import argparse
-import queue
-import signal
-import shlex
-import shutil
-import subprocess
-import tempfile
-import threading
-import time
-
-import mimic3_tts
-import yarp
 import roboticslab_speech
+import sounddevice as sd
+import yarp
 
-PLAY_PROGRAMS = ['paplay', 'play -q', 'aplay -q']
-
-class TextToSpeechResponder(roboticslab_speech.SpeechSynthesis):
-    def __init__(self, engine):
-        super().__init__()
-        self.engine = engine
-        self.is_playing = False
-        self.p = None
-        self.result_queue = queue.Queue(maxsize=5)
-        self.result_thread = threading.Thread(target=self._process_result, daemon=True)
-        self.result_thread.start()
-
-    def setLanguage(self, language):
-        if language.startswith('#'):
-            # same voice, different speaker
-            self.engine.speaker = language[1:]
-        else:
-            # different voice
-            self.engine.voice = language
-
-        if self.engine.voice not in list(self.getSupportedLangs()):
-            print('Voice not available: %s' % self.engine.voice)
-            return False
-        else:
-            self.engine.preload_voice(self.engine.voice)
-            print('Loaded voice: %s (speaker: %s)' % (self.engine.voice, self.engine.speaker or 'default'))
-            return True
-
-    def setSpeed(self, speed):
-        self.engine.rate = float(speed) / 100
-        return True
+from abc import ABC, abstractmethod
 
-    def setPitch(self, pitch):
-        return super().setPitch(pitch)
+class SynthesizerFactory(ABC):
+    @abstractmethod
+    def create(self, player):
+        pass
 
-    def getSpeed(self):
-        return int(self.engine.rate * 100)
+class PiperSynthesizerFactory(SynthesizerFactory):
+    def __init__(self, model, rf):
+        self.model = model
+        self.rf = rf
 
-    def getPitch(self):
-        return super().getPitch()
+    def create(self, player):
+        return PiperSynthesizer(player, self.model, self.rf)
 
-    def getSupportedLangs(self):
-        all_voices = sorted(list(self.engine.get_voices()), key=lambda v: v.key)
-        local_voices = filter(lambda v: not v.location.startswith('http'), all_voices)
-        available_voices = [v.key for v in local_voices]
-        return yarp.SVector(available_voices)
+class SpeechSynthesizer(roboticslab_speech.SpeechSynthesis):
+    def __init__(self, player):
+        super().__init__()
+        self._player = player
 
     def say(self, text):
-        self.engine.begin_utterance()
-        self.engine.speak_text(text)
-
-        for result in self.engine.end_utterance():
-            self.result_queue.put(result)
-
+        self._player.set_generator(self._get_generator(text))
         return True
 
     def play(self):
-        return super().play()
+        self._player.resume()
+        return True
 
     def pause(self):
-        return super().pause()
+        self._player.pause()
+        return True
 
     def stop(self):
-        if self.p:
-            self.p.terminate()
-
+        self._player.clear_generator()
         return True
 
     def checkSayDone(self):
-        return not self.is_playing
+        return not self._player.is_playing()
 
-    def _process_result(self):
-        while True:
-            result = self.result_queue.get()
+    def setSpeed(self, speed):
+        return super().setSpeed(speed)
 
-            if result is None:
-                break
+    def setPitch(self, pitch):
+        return super().setPitch(pitch)
 
-            wav_bytes = result.to_wav_bytes()
+    def getSpeed(self):
+        return super().getSpeed()
 
-            if not wav_bytes:
-                continue
+    def getPitch(self):
+        return super().getPitch()
 
-            with tempfile.NamedTemporaryFile(mode='wb+', suffix='.wav') as wav_file:
-                wav_file.write(wav_bytes)
-                wav_file.seek(0)
+    @abstractmethod
+    def get_sample_rate(self):
+        pass
 
-                for play_program in reversed(PLAY_PROGRAMS):
-                    play_cmd = shlex.split(play_program)
+    @abstractmethod
+    def _get_generator(self, text):
+        pass
 
-                    if not shutil.which(play_cmd[0]):
-                        continue
+class PiperSynthesizer(SpeechSynthesizer):
+    def __init__(self, callback, model, rf):
+        super().__init__(callback)
+        self.rf = rf
+        self.voice = self._load_model(model)
 
-                    play_cmd.append(wav_file.name)
-                    self.is_playing = True
+    def _load_model(self, model):
+        from piper import PiperVoice
 
-                    with subprocess.Popen(play_cmd) as self.p:
-                        try:
-                            self.p.wait()
-                        except: # e.g. on keyboard interrupt
-                            self.p.kill()
+        if not model.endswith('.onnx'):
+            model += '.onnx'
 
-                    self.is_playing = False
-                    break
+        return PiperVoice.load(model, use_cuda=False) # TODO: cuda
 
-parser = argparse.ArgumentParser(prog='speechSynthesis', description='TTS service running a Mimic 3 engine')
-parser.add_argument('--voice', '-v', help='Name of voice (expected in /)', required=True)
-parser.add_argument('--speaker', '-s', help='Name or number of speaker (default: first speaker)')
-parser.add_argument('--noise-scale', type=float, help='Noise scale [0-1], default is 0.667')
-parser.add_argument('--length-scale', type=float, help='Length scale (1.0 is default speed, 0.5 is 2x faster)')
-parser.add_argument('--noise-w', type=float, help='Variation in cadence [0-1], default is 0.8')
-parser.add_argument('--cuda', action='store_true', help='Use Onnx CUDA execution provider (requires onnxruntime-gpu)')
-parser.add_argument('--port', '-p', default='/speechSynthesis', help='YARP port prefix')
+    def get_sample_rate(self):
+        return self.voice.config.sample_rate
 
-args = parser.parse_args()
+    def _get_generator(self, text):
+        return self.voice.synthesize_stream_raw(text)
 
-tts = mimic3_tts.Mimic3TextToSpeechSystem(
-    mimic3_tts.Mimic3Settings(
-        length_scale=args.length_scale,
-        noise_scale=args.noise_scale,
-        noise_w=args.noise_w,
-        use_cuda=args.cuda,
-    )
-)
+    def setLanguage(self, language):
+        try:
+            self.voice = self._load_model(language)
+            return True
+        except IOError as e:
+            print(e)
+            return False
+
+    def getSupportedLangs(self):
+        return super().getSupportedLangs()
 
+def int_or_str(text):
+    """Helper function for argument parsing."""
+    try:
+        return int(text)
+    except ValueError:
+        return text
 
-tts.voice = args.voice
-tts.speaker = args.speaker
+BACKENDS = ['piper']
 
-print('Preloading voice: %s' % args.voice)
-tts.preload_voice(args.voice)
+parser = argparse.ArgumentParser(description='YARP service that transforms text into live audio output', add_help=False)
+parser.add_argument('--list-devices', action='store_true', help='list available audio devices and exit')
+parser.add_argument('--list-backends', action='store_true', help='list available TTS backends and exit')
+args, remaining = parser.parse_known_args()
+
+if args.list_devices:
+    print(sd.query_devices())
+    raise SystemExit
+elif args.list_backends:
+    print('\n'.join(BACKENDS))
+    raise SystemExit
+
+parser = argparse.ArgumentParser(description=parser.description, formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[parser])
+parser.add_argument('--backend', '-b', type=str, required=True, help='TTS backend engine')
+parser.add_argument('--device', '-d', type=int_or_str, help='output device (numeric ID or substring)')
+parser.add_argument('--model', type=str, help='model, e.g. follow-me')
+parser.add_argument('--cuda', action='store_true', help='Use Onnx CUDA execution provider (requires onnxruntime-gpu)')
+parser.add_argument('--prefix', '-p', type=str, default='/speechSynthesis', help='YARP port prefix')
+parser.add_argument('--context', type=str, default='speechSynthesis', help='YARP context directory')
+parser.add_argument('--from', type=str, dest='ini', default='speechSynthesis.ini', help='YARP configuration (.ini) file')
+args = parser.parse_args(remaining)
 
 yarp.Network.init()
 
+rf = yarp.ResourceFinder()
+rf.setDefaultContext(args.context)
+rf.setDefaultConfigFile(args.ini)
+
+if args.backend == 'piper':
+    if args.model is None:
+        print('Model must be specified for Piper')
+        raise SystemExit
+
+    synthesizer_factory = PiperSynthesizerFactory(args.model, rf)
+else:
+    print('Backend not available, must be one of: %s' % ', '.join(BACKENDS))
+    raise SystemExit
+
 if not yarp.Network.checkNetwork():
-    print('YARP network not found')
+    print('No YARP network available')
     raise SystemExit
 
 rpc = yarp.RpcServer()
-processor = TextToSpeechResponder(tts)
 
-if not rpc.open(args.port + '/rpc:s'):
-    print('Cannot open port %s' % rpc.getName())
+if not rpc.open(args.prefix + '/rpc:s'):
+    print('Unable to open RPC port')
     raise SystemExit
 
-processor.yarp().attachAsServer(rpc)
-
-quitRequested = False
+class CallbackPlayer:
+    def __init__(self):
+        self._generator = None
+        self._queued_generator = None
+        self._is_paused = False
 
-def askToStop():
-    global quitRequested
-    quitRequested = True
+    def set_generator(self, generator):
+        self._generator = generator
+        self._queued_generator = None
+        self._is_paused = False
 
-signal.signal(signal.SIGINT, lambda signal, frame: askToStop())
-signal.signal(signal.SIGTERM, lambda signal, frame: askToStop())
+    def clear_generator(self):
+        self._generator = None
+        self._queued_generator = None
+        self._is_paused = False
 
-while not quitRequested:
-    time.sleep(0.1)
-
-rpc.interrupt()
-rpc.close()
+    def pause(self):
+        self._is_paused = True
+
+    def resume(self):
+        self._is_paused = False
+
+    def is_playing(self):
+        return self._generator is not None and not self._is_paused
+
+    def callback(self, outdata, frames, time, status):
+        # https://stackoverflow.com/a/62609827
+
+        if self.is_playing():
+            try:
+                raw = next(self._generator)
+
+                if len(outdata) > len(raw):
+                    outdata[:len(raw)] = raw
+                    outdata[len(raw):] = b'\x00' * (len(outdata) - len(raw))
+                elif len(outdata) < len(raw):
+                    outdata[:] = raw[:len(outdata)]
+                    self._queued_generator = self._generator
+                    self._generator = iter([raw[len(outdata):]])
+                else:
+                    outdata[:] = raw
+
+                return
+            except StopIteration:
+                if self._queued_generator is not None:
+                    self._generator = self._queued_generator
+                    self._queued_generator = None
+                else:
+                    self.clear_generator()
+
+        outdata[:] = b'\x00' * len(outdata)
+
+try:
+    player = CallbackPlayer()
+    synthesizer = synthesizer_factory.create(player)
+
+    with sd.RawOutputStream(samplerate=synthesizer.get_sample_rate(),
+                            blocksize=1024,
+                            device=args.device,
+                            dtype='int16',
+                            channels=1,
+                            callback=player.callback) as stream:
+        synthesizer.yarp().attachAsServer(rpc)
 
-processor.result_queue.put(None)
-processor.result_thread.join()
+        while True:
+            import time
+            time.sleep(0.1)
+except KeyboardInterrupt:
+    rpc.interrupt()
+    rpc.close()
+    parser.exit(0)
diff --git a/programs/speechSynthesis/test.py b/programs/speechSynthesis/test.py
deleted file mode 100644
index 91d09be..0000000
--- a/programs/speechSynthesis/test.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import queue
-import roboticslab_speech
-import sounddevice as sd
-import threading
-import yarp
-
-from abc import ABC, abstractmethod
-from piper import PiperVoice # TODO: hide import
-from piper.download import find_voice # TODO: hide import
-
-class SpeechSynthesizer(roboticslab_speech.SpeechSynthesis):
-    def __init__(self):
-        super().__init__()
-        self.event = threading.Event()
-        self._generator = None
-
-    @abstractmethod
-    def get_sample_rate(self):
-        pass
-
-    def say(self, text):
-        self._generator = self._make_generator(text)
-        self.event.set()
-        return True
-
-    @abstractmethod
-    def _make_generator(self, text):
-        pass
-
-    def get_generator(self):
-        return self._generator
-
-class PiperSynthesizer(SpeechSynthesizer):
-    def __init__(self, model, rf):
-        super().__init__()
-        self.model = model
-        self.rf = rf
-        self.voice = PiperVoice.load(self.model, use_cuda=False) # TODO: cuda
-
-    def get_sample_rate(self):
-        return self.voice.config.sample_rate
-
-    def _make_generator(self, text):
-        return self.voice.synthesize_stream_raw(text)
-
-    def play(self):
-        pass
-
-    def pause(self):
-        pass
-
-    def stop(self):
-        pass
-
-    def checkSayDone(self):
-        pass
-
-    def setLanguage(self, language):
-        pass
-
-    def setSpeed(self, speed):
-        pass
-
-    def setPitch(self, pitch):
-        pass
-
-    def getSpeed(self):
-        pass
-
-    def getPitch(self):
-        pass
-
-    def getSupportedLangs(self):
-        pass
-
-def int_or_str(text):
-    """Helper function for argument parsing."""
-    try:
-        return int(text)
-    except ValueError:
-        return text
-
-BACKENDS = ['piper']
-
-parser = argparse.ArgumentParser(description='YARP service that transforms text into live audio output', add_help=False)
-parser.add_argument('--list-devices', action='store_true', help='list available audio devices and exit')
-parser.add_argument('--list-backends', action='store_true', help='list available TTS backends and exit')
-args, remaining = parser.parse_known_args()
-
-if args.list_devices:
-    print(sd.query_devices())
-    raise SystemExit
-elif args.list_backends:
-    print('\n'.join(BACKENDS))
-    raise SystemExit
-
-parser = argparse.ArgumentParser(description=parser.description, formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[parser])
-parser.add_argument('--backend', '-b', type=str, required=True, help='ASR backend engine')
-parser.add_argument('--device', '-d', type=int_or_str, help='input device (numeric ID or substring)')
-parser.add_argument('--model', type=str, help='model, e.g. follow-me')
-parser.add_argument('--cuda', action='store_true', help='Use Onnx CUDA execution provider (requires onnxruntime-gpu)')
-parser.add_argument('--prefix', '-p', type=str, default='/speechSynthesis', help='YARP port prefix')
-parser.add_argument('--context', type=str, default='speechSynthesis', help='YARP context directory')
-parser.add_argument('--from', type=str, dest='ini', default='speechSynthesis.ini', help='YARP configuration (.ini) file')
-args = parser.parse_args(remaining)
-
-yarp.Network.init()
-
-rf = yarp.ResourceFinder()
-rf.setDefaultContext(args.context)
-rf.setDefaultConfigFile(args.ini)
-
-if args.backend == 'piper':
-    if args.model is None:
-        print('Model must be specified for Piper')
-        raise SystemExit
-
-    synthesizer = PiperSynthesizer(args.model, rf)
-else:
-    print('Backend not available, must be one of: %s' % ', '.join(BACKENDS))
-    raise SystemExit
-
-if not yarp.Network.checkNetwork():
-    print('No YARP network available')
-    raise SystemExit
-
-rpc = yarp.RpcServer()
-
-if not rpc.open(args.prefix + '/rpc:s'):
-    print('Unable to open RPC port')
-    raise SystemExit
-
-try:
-    with sd.RawOutputStream(samplerate=synthesizer.get_sample_rate(),
-                            blocksize=1024,
-                            device=args.device,
-                            dtype='int16',
-                            channels=1) as stream:
-        synthesizer.yarp().attachAsServer(rpc)
-
-        while True:
-            print('Waiting for text to synthesize...')
-            synthesizer.event.wait()
-            print('Synthesizing...')
-
-            for frames in synthesizer.get_generator():
-                print(stream.write_available)
-                stream.write(frames)
-
-            print('Done synthesizing')
-            synthesizer.event.clear()
-except KeyboardInterrupt:
-    rpc.interrupt()
-    rpc.close()
-    parser.exit(0)
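
Not part of the patch: a minimal client-side sketch for exercising the re-implemented service over YARP RPC, included here only for illustration. It assumes the service was started as speechSynthesis.py --backend piper --model <voice> with the default /speechSynthesis prefix, and that the roboticslab_speech Python bindings expose attachAsClient() symmetrically to the attachAsServer() call used in the patch; the client port name below is hypothetical.

    import time
    import yarp
    import roboticslab_speech

    yarp.Network.init()

    client = yarp.RpcClient()
    client.open('/speechSynthesisClient/rpc:c')  # hypothetical local port name
    yarp.Network.connect(client.getName(), '/speechSynthesis/rpc:s')

    tts = roboticslab_speech.SpeechSynthesis()
    tts.yarp().attachAsClient(client)  # mirror of attachAsServer on the service side

    tts.say('hello world')  # queues the text; audio is produced by the stream callback

    while not tts.checkSayDone():  # poll until playback has finished
        time.sleep(0.1)

    client.close()
    yarp.Network.fini()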