diff --git a/.gitignore b/.gitignore
index d63a3eee8..e87236843 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,3 +80,4 @@ jslint.mjs
 vits-piper-en_US-amy-low
 vits-piper-*-*-*
 log
+*.exe
diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp
index 895070dbe..4f775a527 100644
--- a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp
+++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp
@@ -214,7 +214,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
   param.sampleFormat = paFloat32;
   param.suggestedLatency = info->defaultLowInputLatency;
   param.hostApiSpecificStreamInfo = nullptr;
-  float sample_rate = config_.feat_config.sample_rate;
+  float sample_rate = static_cast<float>(config_.feat_config.sample_rate);
 
   pa_stream_ = nullptr;
   PaError err = Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */
@@ -259,7 +259,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
   SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_);
 
   AcceptWaveformOffline(stream, config_.feat_config.sample_rate,
-                        samples_.data(), samples_.size());
+                        samples_.data(), static_cast<int32_t>(samples_.size()));
   DecodeOfflineStream(recognizer_, stream);
   auto r = GetOfflineStreamResult(stream);
   results_.emplace_back(r->text);
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc
index 04d9b49df..3bdff8519 100644
Binary files a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc and b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc differ
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
index eaf082d75..5ad7fda4f 100644
--- a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
@@ -9,14 +9,184 @@
 #include "afxdialogex.h"
 
 #include <fstream>
+#include <condition_variable>  // NOLINT
+#include <queue>
 #include <string>
 #include <vector>
+#include <mutex>  // NOLINT
 #include <sstream>
 
 #ifdef _DEBUG
 #define new DEBUG_NEW
 #endif
 
+Microphone::Microphone() {
+  PaError err = Pa_Initialize();
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(-2);
+  }
+}
+
+Microphone::~Microphone() {
+  PaError err = Pa_Terminate();
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(-2);
+  }
+}
+
+// NOTE(fangjun): Code is copied from
+// https://github.com/k2-fsa/sherpa-onnx/blob/master/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc#L22
+static std::condition_variable g_cv;
+static std::mutex g_cv_m;
+
+struct Samples {
+  std::vector<float> data;
+  int32_t consumed = 0;
+};
+
+struct Buffer {
+  std::queue<Samples> samples;
+  std::mutex mutex;
+};
+
+static Buffer g_buffer;
+
+static bool g_started = false;
+static bool g_stopped = false;
+static bool g_killed = false;
+
+static void AudioGeneratedCallback(const float *s, int32_t n) {
+  if (n > 0) {
+    Samples samples;
+    samples.data = std::vector<float>{s, s + n};
+
+    std::lock_guard<std::mutex> lock(g_buffer.mutex);
+    g_buffer.samples.push(std::move(samples));
+    g_started = true;
+  }
+}
+
+static int PlayCallback(const void * /*in*/, void *out,
+                        unsigned long _n,  // NOLINT
+                        const PaStreamCallbackTimeInfo * /*time_info*/,
+                        PaStreamCallbackFlags /*status_flags*/,
+                        void * /*user_data*/) {
+  int32_t n = static_cast<int32_t>(_n);
+  if (g_killed) {
+    return paComplete;
+  }
+
+  float *pout = reinterpret_cast<float *>(out);
+  std::lock_guard<std::mutex> lock(g_buffer.mutex);
+
+  if (g_buffer.samples.empty()) {
+    if (g_stopped) {
+      // no more data is available and we have processed all of the samples
+      return paComplete;
+    }
+
+    // Very unlikely, but the current sentence may be so long that the
+    // model has not finished generating any samples for it yet.
+    std::fill_n(pout, n, 0);
+
+    return paContinue;
+  }
+
+  int32_t k = 0;
+  for (; k < n && !g_buffer.samples.empty();) {
+    int32_t this_block = n - k;
+
+    auto &p = g_buffer.samples.front();
+
+    int32_t remaining = static_cast<int32_t>(p.data.size()) - p.consumed;
+
+    if (this_block <= remaining) {
+      std::copy(p.data.begin() + p.consumed,
+                p.data.begin() + p.consumed + this_block, pout + k);
+      p.consumed += this_block;
+
+      k = n;
+
+      if (p.consumed == p.data.size()) {
+        g_buffer.samples.pop();
+      }
+      break;
+    }
+
+    std::copy(p.data.begin() + p.consumed, p.data.end(), pout + k);
+    k += static_cast<int32_t>(p.data.size()) - p.consumed;
+    g_buffer.samples.pop();
+  }
+
+  if (k < n) {
+    std::fill_n(pout + k, n - k, 0);
+  }
+
+  if (g_stopped && g_buffer.samples.empty()) {
+    return paComplete;
+  }
+
+  return paContinue;
+}
+
+static void PlayCallbackFinished(void *userData) { g_cv.notify_all(); }
+
+static void StartPlayback(int32_t sample_rate) {
+  int32_t frames_per_buffer = 1024;
+  PaStreamParameters outputParameters;
+  PaStream *stream;
+  PaError err;
+
+  outputParameters.device =
+      Pa_GetDefaultOutputDevice(); /* default output device */
+
+  outputParameters.channelCount = 1;         /* mono output */
+  outputParameters.sampleFormat = paFloat32; /* 32 bit floating point output */
+  outputParameters.suggestedLatency =
+      Pa_GetDeviceInfo(outputParameters.device)->defaultLowOutputLatency;
+  outputParameters.hostApiSpecificStreamInfo = nullptr;
+
+  err = Pa_OpenStream(&stream, nullptr, /* no input */
+                      &outputParameters, sample_rate, frames_per_buffer,
+                      paClipOff,  // we won't output out of range samples so
+                                  // don't bother clipping them
+                      PlayCallback, nullptr);
+  if (err != paNoError) {
+    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
+    return;
+  }
+
+  err = Pa_SetStreamFinishedCallback(stream, &PlayCallbackFinished);
+  if (err != paNoError) {
+    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
+    return;
+  }
+
+  err = Pa_StartStream(stream);
+  if (err != paNoError) {
+    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
+    return;
+  }
+
+  std::unique_lock<std::mutex> lock(g_cv_m);
+  while (!g_killed && !g_stopped &&
+         (!g_started || (g_started && !g_buffer.samples.empty()))) {
+    g_cv.wait(lock);
+  }
+
+  err = Pa_StopStream(stream);
+  if (err != paNoError) {
+    return;
+  }
+
+  err = Pa_CloseStream(stream);
+  if (err != paNoError) {
+    return;
+  }
+}
+
 
 // CAboutDlg dialog used for App About
 
@@ -261,8 +431,8 @@ void CNonStreamingTextToSpeechDlg::Init() {
     ok = false;
   }
 
-  if (!Exists("./lexicon.txt")) {
-    error_message += "Cannot find ./lexicon.txt\r\n";
+  if (!Exists("./lexicon.txt") && !Exists("./espeak-ng-data/phontab")) {
+    error_message += "Cannot find espeak-ng-data directory or ./lexicon.txt\r\n";
     ok = false;
   }
 
@@ -275,21 +445,17 @@ void CNonStreamingTextToSpeechDlg::Init() {
     generate_btn_.EnableWindow(FALSE);
     error_message += "\r\nPlease refer to\r\n"
-        "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html";
+        "https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models";
     error_message += "\r\nto download models.\r\n";
error_message += "\r\nWe given an example below\r\n\r\n"; - error_message += - "wget -O model.onnx " - "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" - "vits-aishell3.onnx\r\n"; + error_message += "\r\nWe give an example below\r\n\r\n"; error_message += - "wget " - "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" - "lexicon.txt\r\n"; - error_message += - "wget " - "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" - "tokens.txt\r\n"; + "1. Download vits-piper-en_US-amy-low.tar.bz2 from the following URL\r\n\r\n" + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2\r\n\r\n" + "2. Uncompress it and you will get a directory vits-piper-en_US-amy-low \r\n\r\n" + "3. Switch to the directory vits-piper-en_US-amy-low \r\n\r\n" + "4. Rename en_US-amy-low.onnx to model.onnx \r\n\r\n" + "5. Copy the current exe to the directory vits-piper-en_US-amy-low\r\n\r\n" + "6. Done! You can now run the exe in the directory vits-piper-en_US-amy-low\r\n\r\n"; AppendLineToMultilineEditCtrl(my_hint_, error_message); return; @@ -299,10 +465,14 @@ void CNonStreamingTextToSpeechDlg::Init() { SherpaOnnxOfflineTtsConfig config; memset(&config, 0, sizeof(config)); config.model.debug = 0; - config.model.num_threads = 1; + config.model.num_threads = 2; config.model.provider = "cpu"; config.model.vits.model = "./model.onnx"; - config.model.vits.lexicon = "./lexicon.txt"; + if (Exists("./espeak-ng-data/phontab")) { + config.model.vits.data_dir = "./espeak-ng-data"; + } else { + config.model.vits.lexicon = "./lexicon.txt"; + } config.model.vits.tokens = "./tokens.txt"; tts_ = SherpaOnnxCreateOfflineTts(&config); @@ -321,7 +491,6 @@ void CNonStreamingTextToSpeechDlg::Init() { } void CNonStreamingTextToSpeechDlg::OnBnClickedOk() { - // TODO: Add your control notification handler code here CString s; speaker_id_.GetWindowText(s); int speaker_id = _ttoi(s); @@ -338,25 +507,51 @@ void CNonStreamingTextToSpeechDlg::OnBnClickedOk() { } my_text_.GetWindowText(s); + std::string ss = ToString(s); if (ss.empty()) { AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK); return; } + if (play_thread_) { + g_killed = true; + g_stopped = true; + if (play_thread_->joinable()) { + play_thread_->join(); + } + } + + g_killed = false; + g_stopped = false; + g_started = false; + g_buffer.samples = {}; + + // Caution(fangjun): It is not efficient to re-create the thread. 
+  // for simplicity.
+  play_thread_ = std::make_unique<std::thread>(
+      StartPlayback, SherpaOnnxOfflineTtsSampleRate(tts_));
+
+  generate_btn_.EnableWindow(FALSE);
+
   const SherpaOnnxGeneratedAudio *audio =
-      SherpaOnnxOfflineTtsGenerate(tts_, ss.c_str(), speaker_id, speed);
+      SherpaOnnxOfflineTtsGenerateWithCallback(tts_, ss.c_str(), speaker_id,
+                                               speed, &AudioGeneratedCallback);
+
+  generate_btn_.EnableWindow(TRUE);
+
   output_filename_.GetWindowText(s);
   std::string filename = ToString(s);
+
   int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
                                filename.c_str());
 
   SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
 
   if (ok) {
-    AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
+    // AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
+    AppendLineToMultilineEditCtrl(my_hint_, std::string("Saved to ") + filename + " successfully");
   } else {
-    AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
+    // AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
+    AppendLineToMultilineEditCtrl(my_hint_, std::string("Failed to save to ") + filename);
   }
 
   //CDialogEx::OnOK();
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h
index 952ec53b5..2bf57493d 100644
--- a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h
@@ -6,6 +6,16 @@
 
 #include "sherpa-onnx/c-api/c-api.h"
 
+#include <memory>
+#include <thread>
+
+#include "portaudio.h"
+
+class Microphone {
+ public:
+  Microphone();
+  ~Microphone();
+};
 
 // CNonStreamingTextToSpeechDlg dialog
 
 class CNonStreamingTextToSpeechDlg : public CDialogEx
@@ -34,16 +44,21 @@ class CNonStreamingTextToSpeechDlg : public CDialogEx
   afx_msg void OnPaint();
   afx_msg HCURSOR OnQueryDragIcon();
   DECLARE_MESSAGE_MAP()
- public:
-  CEdit my_hint_;
-  CEdit speaker_id_;
-  CEdit speed_;
-  void Init();
-  void InitHint();
-  CButton generate_btn_;
-  afx_msg void OnBnClickedOk();
-
-  SherpaOnnxOfflineTts *tts_;
-  CEdit my_text_;
-  CEdit output_filename_;
+public:
+  CEdit my_hint_;
+  CEdit speaker_id_;
+  CEdit speed_;
+  void Init();
+  void InitHint();
+  CButton generate_btn_;
+  afx_msg void OnBnClickedOk();
+
+  SherpaOnnxOfflineTts *tts_ = nullptr;
+  CEdit my_text_;
+  CEdit output_filename_;
+
+private:
+  Microphone mic_;
+  std::unique_ptr<std::thread> play_thread_;
+
 };
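
Note on the playback design in NonStreamingTextToSpeechDlg.cpp: SherpaOnnxOfflineTtsGenerateWithCallback() invokes AudioGeneratedCallback() once per generated chunk, which pushes the samples into g_buffer under its mutex; PortAudio's PlayCallback() drains that queue on the audio thread, splitting or concatenating variable-length chunks to fill each fixed-size output block and zero-padding when the generator falls behind; StartPlayback() blocks on g_cv until playback finishes. Below is a minimal standalone sketch of that producer/consumer handoff, with the TTS engine and PortAudio replaced by plain threads so the drain loop can be exercised on its own. All names in it (Chunk, Produce, Consume) are illustrative and not part of sherpa-onnx.

#include <algorithm>
#include <condition_variable>  // NOLINT
#include <cstdint>
#include <cstdio>
#include <mutex>  // NOLINT
#include <queue>
#include <thread>  // NOLINT
#include <vector>

struct Chunk {
  std::vector<float> data;
  int32_t consumed = 0;
};

static std::queue<Chunk> g_q;
static std::mutex g_m;
static std::condition_variable g_ready;
static bool g_done = false;

// Producer: stands in for AudioGeneratedCallback(), which the engine
// calls once per generated chunk.
static void Produce() {
  for (int32_t i = 0; i != 5; ++i) {
    Chunk c;
    c.data.assign(300 + 100 * i, 0.1f * static_cast<float>(i));
    {
      std::lock_guard<std::mutex> lock(g_m);
      g_q.push(std::move(c));
    }
    g_ready.notify_one();
  }
  {
    std::lock_guard<std::mutex> lock(g_m);
    g_done = true;
  }
  g_ready.notify_one();
}

// Consumer: stands in for PlayCallback(); fills fixed 256-sample blocks
// from variable-length chunks, zero-padding when it runs dry.
static void Consume() {
  std::vector<float> out(256);
  for (;;) {
    std::unique_lock<std::mutex> lock(g_m);
    g_ready.wait(lock, [] { return g_done || !g_q.empty(); });
    if (g_q.empty() && g_done) {
      break;
    }
    int32_t n = static_cast<int32_t>(out.size());
    int32_t k = 0;
    while (k < n && !g_q.empty()) {
      Chunk &p = g_q.front();
      int32_t remaining = static_cast<int32_t>(p.data.size()) - p.consumed;
      int32_t take = std::min(n - k, remaining);
      std::copy(p.data.begin() + p.consumed,
                p.data.begin() + p.consumed + take, out.begin() + k);
      p.consumed += take;
      k += take;
      if (p.consumed == static_cast<int32_t>(p.data.size())) {
        g_q.pop();
      }
    }
    std::fill(out.begin() + k, out.end(), 0.0f);  // pad a short block
    lock.unlock();
    std::printf("played a block with %d real samples\n", k);
  }
}

int main() {
  std::thread producer(Produce);
  std::thread consumer(Consume);
  producer.join();
  consumer.join();
  return 0;
}

Zero-padding instead of blocking in the consumer mirrors PlayCallback(): an audio callback must return quickly, so a momentary underrun is papered over with silence rather than a wait.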
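For reference, the C-API sequence the dialog performs can also be driven from a console program. This is a sketch under assumptions, not code from this patch: it presumes the vits-piper-en_US-amy-low layout described in the hint text (model.onnx, tokens.txt, espeak-ng-data/ in the working directory), and SherpaOnnxDestroyOfflineTts() is the one call below that does not itself appear in the diff.

#include <cstdint>
#include <cstdio>
#include <cstring>

#include "sherpa-onnx/c-api/c-api.h"

// Invoked by the engine once per generated chunk, before the whole text is
// finished; the dialog enqueues samples for playback here instead.
static void OnAudio(const float *samples, int32_t n) {
  (void)samples;  // a real program would buffer these for playback
  std::printf("received a chunk of %d samples\n", n);
}

int main() {
  SherpaOnnxOfflineTtsConfig config;
  std::memset(&config, 0, sizeof(config));
  config.model.num_threads = 2;
  config.model.provider = "cpu";
  config.model.vits.model = "./model.onnx";
  config.model.vits.data_dir = "./espeak-ng-data";  // or set vits.lexicon
  config.model.vits.tokens = "./tokens.txt";

  SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
  std::printf("sample rate: %d\n", SherpaOnnxOfflineTtsSampleRate(tts));

  const SherpaOnnxGeneratedAudio *audio =
      SherpaOnnxOfflineTtsGenerateWithCallback(
          tts, "Hello from sherpa-onnx.", /*speaker_id=*/0, /*speed=*/1.0f,
          &OnAudio);

  // The complete waveform is still returned at the end, so it can be saved
  // exactly as OnBnClickedOk() does.
  int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
                               "generated.wav");
  std::printf("%s\n", ok ? "Saved to generated.wav" : "Failed to save");

  SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
  SherpaOnnxDestroyOfflineTts(tts);
  return 0;
}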