Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support playing generated audio as it is generating for MFC. #462

Merged
merged 2 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,4 @@ jslint.mjs
vits-piper-en_US-amy-low
vits-piper-*-*-*
log
*.exe
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
param.sampleFormat = paFloat32;
param.suggestedLatency = info->defaultLowInputLatency;
param.hostApiSpecificStreamInfo = nullptr;
float sample_rate = config_.feat_config.sample_rate;
float sample_rate = static_cast<float>(config_.feat_config.sample_rate);
pa_stream_ = nullptr;
PaError err =
Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */
Expand Down Expand Up @@ -259,7 +259,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_);

AcceptWaveformOffline(stream, config_.feat_config.sample_rate,
samples_.data(), samples_.size());
samples_.data(), static_cast<int32_t>(samples_.size()));
DecodeOfflineStream(recognizer_, stream);
auto r = GetOfflineStreamResult(stream);
results_.emplace_back(r->text);
Expand Down
Binary file modified mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc
Binary file not shown.
237 changes: 216 additions & 21 deletions mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,184 @@
#include "afxdialogex.h"

#include <fstream>
#include <mutex> // NOLINT
#include <queue>
#include <stdexcept>
#include <string>
#include <thread> // NOLINT
#include <vector>

#ifdef _DEBUG
#define new DEBUG_NEW
#endif

// RAII wrapper: brings PortAudio up for the lifetime of this object.
// Terminates the process if the library cannot be initialized.
Microphone::Microphone() {
  const PaError err = Pa_Initialize();
  if (err == paNoError) {
    return;
  }
  fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  exit(-2);
}

// Shuts PortAudio down when the owning dialog is destroyed.
// Terminates the process if the library cannot be torn down cleanly.
Microphone::~Microphone() {
  const PaError err = Pa_Terminate();
  if (err == paNoError) {
    return;
  }
  fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  exit(-2);
}

// NOTE(fangjun): Code is copied from
// https://github.com/k2-fsa/sherpa-onnx/blob/master/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc#L22
static std::condition_variable g_cv;
static std::mutex g_cv_m;

// One chunk of generated audio plus a cursor tracking how much of it the
// playback callback has already copied out to the audio device.
struct Samples {
  std::vector<float> data;
  int32_t consumed = 0;  // number of leading entries of `data` already played
};

// FIFO of audio chunks shared between the TTS generation callback (producer,
// main/worker thread) and the PortAudio playback callback (consumer).
struct Buffer {
  std::queue<Samples> samples;
  std::mutex mutex;  // guards `samples`
};

static Buffer g_buffer;

static bool g_started = false;
static bool g_stopped = false;
static bool g_killed = false;

static void AudioGeneratedCallback(const float *s, int32_t n) {
if (n > 0) {
Samples samples;
samples.data = std::vector<float>{s, s + n};

std::lock_guard<std::mutex> lock(g_buffer.mutex);
g_buffer.samples.push(std::move(samples));
g_started = true;
}
}

// PortAudio output callback: copies queued TTS samples into the device
// buffer. Runs on the PortAudio audio thread.
//
// @param out  Destination buffer expecting `_n` mono float32 frames.
// @param _n   Number of frames requested by PortAudio.
// @return paComplete once playback should stop (killed, or generation has
//         stopped and the queue is drained); paContinue otherwise.
static int PlayCallback(const void * /*in*/, void *out,
                        unsigned long _n,  // NOLINT
                        const PaStreamCallbackTimeInfo * /*time_info*/,
                        PaStreamCallbackFlags /*status_flags*/,
                        void * /*user_data*/) {
  int32_t n = static_cast<int32_t>(_n);
  if (g_killed) {
    return paComplete;
  }

  float *pout = reinterpret_cast<float *>(out);
  std::lock_guard<std::mutex> lock(g_buffer.mutex);

  if (g_buffer.samples.empty()) {
    if (g_stopped) {
      // no more data is available and we have processed all of the samples
      return paComplete;
    }

    // The current sentence is so long, though very unlikely, that
    // the model has not finished processing it yet. Emit silence.
    std::fill_n(pout, n, 0.0f);

    return paContinue;
  }

  int32_t k = 0;
  while (k < n && !g_buffer.samples.empty()) {
    int32_t this_block = n - k;

    auto &p = g_buffer.samples.front();

    int32_t remaining = static_cast<int32_t>(p.data.size()) - p.consumed;

    if (this_block <= remaining) {
      // The front chunk alone can satisfy the rest of this request.
      std::copy(p.data.begin() + p.consumed,
                p.data.begin() + p.consumed + this_block, pout + k);
      p.consumed += this_block;

      k = n;

      // Cast avoids a signed/unsigned comparison (consumed is int32_t).
      if (p.consumed == static_cast<int32_t>(p.data.size())) {
        g_buffer.samples.pop();
      }
      break;
    }

    // Drain the front chunk entirely and move on to the next one.
    std::copy(p.data.begin() + p.consumed, p.data.end(), pout + k);
    k += remaining;
    g_buffer.samples.pop();
  }

  if (k < n) {
    // Generation has not caught up yet; pad the tail with silence.
    std::fill_n(pout + k, n - k, 0.0f);
  }

  if (g_stopped && g_buffer.samples.empty()) {
    return paComplete;
  }

  return paContinue;
}

// Invoked by PortAudio once the output stream finishes; wakes up
// StartPlayback(), which blocks on g_cv. The parameter is unnamed to
// avoid an unused-parameter warning.
static void PlayCallbackFinished(void * /*user_data*/) { g_cv.notify_all(); }

// Opens a mono float32 PortAudio output stream at `sample_rate` and blocks
// until playback finishes (generation stopped and queue drained) or
// g_killed is set. Intended to run on a dedicated playback thread.
// On any PortAudio error the function logs and returns, closing the stream
// if it was already opened.
static void StartPlayback(int32_t sample_rate) {
  int32_t frames_per_buffer = 1024;
  PaStreamParameters outputParameters;
  PaStream *stream;
  PaError err;

  outputParameters.device =
      Pa_GetDefaultOutputDevice(); /* default output device */

  outputParameters.channelCount = 1;         /* mono output */
  outputParameters.sampleFormat = paFloat32; /* 32 bit floating point output */
  outputParameters.suggestedLatency =
      Pa_GetDeviceInfo(outputParameters.device)->defaultLowOutputLatency;
  outputParameters.hostApiSpecificStreamInfo = nullptr;

  err = Pa_OpenStream(&stream, nullptr, /* no input */
                      &outputParameters, sample_rate, frames_per_buffer,
                      paClipOff,  // we won't output out of range samples so
                                  // don't bother clipping them
                      PlayCallback, nullptr);
  if (err != paNoError) {
    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
    return;
  }

  err = Pa_SetStreamFinishedCallback(stream, &PlayCallbackFinished);
  if (err != paNoError) {
    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
    Pa_CloseStream(stream);  // don't leak the opened stream on failure
    return;
  }

  err = Pa_StartStream(stream);
  if (err != paNoError) {
    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
    Pa_CloseStream(stream);  // don't leak the opened stream on failure
    return;
  }

  // Wait until either we are killed, or generation has stopped and all
  // queued samples have been played. PlayCallbackFinished() signals g_cv
  // when the stream completes.
  std::unique_lock<std::mutex> lock(g_cv_m);
  while (!g_killed && !g_stopped &&
         (!g_started || (g_started && !g_buffer.samples.empty()))) {
    g_cv.wait(lock);
  }

  err = Pa_StopStream(stream);
  if (err != paNoError) {
    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
    // fall through: still close the stream so its resources are released
  }

  err = Pa_CloseStream(stream);
  if (err != paNoError) {
    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
  }
}


// CAboutDlg dialog used for App About

Expand Down Expand Up @@ -261,8 +431,8 @@ void CNonStreamingTextToSpeechDlg::Init() {
ok = false;
}

if (!Exists("./lexicon.txt")) {
error_message += "Cannot find ./lexicon.txt\r\n";
if (!Exists("./lexicon.txt") && !Exists("./espeak-ng-data/phontab")) {
error_message += "Cannot find espeak-ng-data directory or ./lexicon.txt\r\n";
ok = false;
}

Expand All @@ -275,21 +445,17 @@ void CNonStreamingTextToSpeechDlg::Init() {
generate_btn_.EnableWindow(FALSE);
error_message +=
"\r\nPlease refer to\r\n"
"https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html";
"https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models";
error_message += "\r\nto download models.\r\n";
error_message += "\r\nWe given an example below\r\n\r\n";
error_message +=
"wget -O model.onnx "
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
"vits-aishell3.onnx\r\n";
error_message += "\r\nWe give an example below\r\n\r\n";
error_message +=
"wget "
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
"lexicon.txt\r\n";
error_message +=
"wget "
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
"tokens.txt\r\n";
"1. Download vits-piper-en_US-amy-low.tar.bz2 from the following URL\r\n\r\n"
"https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2\r\n\r\n"
"2. Uncompress it and you will get a directory vits-piper-en_US-amy-low \r\n\r\n"
"3. Switch to the directory vits-piper-en_US-amy-low \r\n\r\n"
"4. Rename en_US-amy-low.onnx to model.onnx \r\n\r\n"
"5. Copy the current exe to the directory vits-piper-en_US-amy-low\r\n\r\n"
"6. Done! You can now run the exe in the directory vits-piper-en_US-amy-low\r\n\r\n";

AppendLineToMultilineEditCtrl(my_hint_, error_message);
return;
Expand All @@ -299,10 +465,14 @@ void CNonStreamingTextToSpeechDlg::Init() {
SherpaOnnxOfflineTtsConfig config;
memset(&config, 0, sizeof(config));
config.model.debug = 0;
config.model.num_threads = 1;
config.model.num_threads = 2;
config.model.provider = "cpu";
config.model.vits.model = "./model.onnx";
config.model.vits.lexicon = "./lexicon.txt";
if (Exists("./espeak-ng-data/phontab")) {
config.model.vits.data_dir = "./espeak-ng-data";
} else {
config.model.vits.lexicon = "./lexicon.txt";
}
config.model.vits.tokens = "./tokens.txt";

tts_ = SherpaOnnxCreateOfflineTts(&config);
Expand All @@ -321,7 +491,6 @@ void CNonStreamingTextToSpeechDlg::Init() {
}

void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
// TODO: Add your control notification handler code here
CString s;
speaker_id_.GetWindowText(s);
int speaker_id = _ttoi(s);
Expand All @@ -338,25 +507,51 @@ void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
}

my_text_.GetWindowText(s);

std::string ss = ToString(s);
if (ss.empty()) {
AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK);
return;
}

if (play_thread_) {
g_killed = true;
g_stopped = true;
if (play_thread_->joinable()) {
play_thread_->join();
}
}

g_killed = false;
g_stopped = false;
g_started = false;
g_buffer.samples = {};

// Caution(fangjun): It is not efficient to re-create the thread. We use this approach
// for simplicity
play_thread_ = std::make_unique<std::thread>(StartPlayback, SherpaOnnxOfflineTtsSampleRate(tts_));

generate_btn_.EnableWindow(FALSE);

const SherpaOnnxGeneratedAudio *audio =
SherpaOnnxOfflineTtsGenerate(tts_, ss.c_str(), speaker_id, speed);
SherpaOnnxOfflineTtsGenerateWithCallback(tts_, ss.c_str(), speaker_id, speed, &AudioGeneratedCallback);

generate_btn_.EnableWindow(TRUE);

output_filename_.GetWindowText(s);
std::string filename = ToString(s);

int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
filename.c_str());

SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);

if (ok) {
AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
// AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
AppendLineToMultilineEditCtrl(my_hint_, std::string("Saved to ") + filename + " successfully");
} else {
AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
// AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
AppendLineToMultilineEditCtrl(my_hint_, std::string("Failed to saved to ") + filename);
}

//CDialogEx::OnOK();
Expand Down
39 changes: 27 additions & 12 deletions mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@

#include "sherpa-onnx/c-api/c-api.h"

#include <memory>
#include <thread>

#include "portaudio.h"

class Microphone {
public:
Microphone();
~Microphone();
};

// CNonStreamingTextToSpeechDlg dialog
class CNonStreamingTextToSpeechDlg : public CDialogEx
Expand Down Expand Up @@ -34,16 +44,21 @@ class CNonStreamingTextToSpeechDlg : public CDialogEx
afx_msg void OnPaint();
afx_msg HCURSOR OnQueryDragIcon();
DECLARE_MESSAGE_MAP()
public:
CEdit my_hint_;
CEdit speaker_id_;
CEdit speed_;
void Init();
void InitHint();
CButton generate_btn_;
afx_msg void OnBnClickedOk();

SherpaOnnxOfflineTts *tts_;
CEdit my_text_;
CEdit output_filename_;
public:
CEdit my_hint_;
CEdit speaker_id_;
CEdit speed_;
void Init();
void InitHint();
CButton generate_btn_;
afx_msg void OnBnClickedOk();

SherpaOnnxOfflineTts *tts_ = nullptr;
CEdit my_text_;
CEdit output_filename_;

private:
Microphone mic_;
std::unique_ptr<std::thread> play_thread_;

};
Loading