-
Notifications
You must be signed in to change notification settings - Fork 477
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Pascal API for MatchaTTS models. (#1686)
- Loading branch information
1 parent
46330b2
commit c6fcd32
Showing
12 changed files
with
875 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,7 @@ | |
piper | ||
piper-playback | ||
link*.res | ||
matcha-zh | ||
matcha-en | ||
matcha-zh-playback | ||
matcha-en-playback |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,239 @@ | ||
{ Copyright (c) 2025 Xiaomi Corporation } | ||
program matcha_en_playback; | ||
{ | ||
This file shows how to use the text to speech API of sherpa-onnx | ||
with MatchaTTS models. | ||
It generates speech from text and saves it to a wave file. | ||
Note that it plays the audio back as it is still generating. | ||
} | ||
|
||
{$mode objfpc} | ||
|
||
uses | ||
{$ifdef unix} | ||
cthreads, | ||
{$endif} | ||
SysUtils, | ||
dos, | ||
ctypes, | ||
portaudio, | ||
sherpa_onnx; | ||
|
||
var | ||
CriticalSection: TRTLCriticalSection; | ||
|
||
Tts: TSherpaOnnxOfflineTts; | ||
Audio: TSherpaOnnxGeneratedAudio; | ||
Resampler: TSherpaOnnxLinearResampler; | ||
|
||
Text: AnsiString; | ||
Speed: Single = 1.0; {Use a larger value to speak faster} | ||
SpeakerId: Integer = 0; | ||
Buffer: TSherpaOnnxCircularBuffer; | ||
FinishedGeneration: Boolean = False; | ||
FinishedPlaying: Boolean = False; | ||
|
||
Version: String; | ||
EnvStr: String; | ||
Status: Integer; | ||
NumDevices: Integer; | ||
DeviceIndex: Integer; | ||
DeviceInfo: PPaDeviceInfo; | ||
|
||
{ If you get EDivByZero: Division by zero error, please change the sample rate | ||
to the one supported by your output device (speaker). | ||
} | ||
DeviceSampleRate: Integer = 48000; | ||
I: Integer; | ||
Param: TPaStreamParameters; | ||
Stream: PPaStream; | ||
Wave: TSherpaOnnxWave; | ||
|
||
{ Callback handed to Tts.Generate by the main program.

  Samples points to N freshly generated samples. They are appended to the
  global Buffer; when a Resampler has been created they are resampled first
  (flush flag False). Arg is not used.

  Always returns 1. }
function GenerateCallback(
  Samples: pcfloat; N: cint32;
  Arg: Pointer): cint; cdecl;
begin
  { 1 means to continue generating; 0 means to stop generating. }
  Result := 1;

  { Buffer is also read by PlayCallback, so every access is done under
    the critical section. }
  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
|
||
{ PortAudio stream callback: fills `output` with frameCount float samples
  taken from the global Buffer.

  If fewer than frameCount samples are queued, everything that is queued is
  consumed and the remainder of the block is padded by SetLength.

  Returns paContinue while samples remain queued or generation has not
  finished; otherwise sets FinishedPlaying and returns paComplete.
  input, timeInfo, statusFlags and userData are not used. }
function PlayCallback(
  input: Pointer; output: Pointer;
  frameCount: culong;
  timeInfo: PPaStreamCallbackTimeInfo;
  statusFlags: TPaStreamCallbackFlags;
  userData: Pointer ): cint; cdecl;
var
  Chunk: TSherpaOnnxSamplesArray;
  Queued: Integer;
  ToTake: Integer;
  K: Integer;
begin
  EnterCriticalSection(CriticalSection);
  try
    Queued := Buffer.Size;

    { Take a full block when enough samples are queued, otherwise take
      whatever is there. }
    if Queued >= frameCount then
      ToTake := frameCount
    else
      ToTake := Queued;

    if (Queued >= frameCount) or (Queued > 0) then
    begin
      Chunk := Buffer.Get(Buffer.Head, ToTake);
      Buffer.Pop(ToTake);
    end;

    { Grow a short (or still empty) chunk to exactly frameCount entries. }
    SetLength(Chunk, frameCount);

    for K := 0 to frameCount - 1 do
      pcfloat(output)[K] := Chunk[K];

    if FinishedGeneration and (Buffer.Size <= 0) then
    begin
      Result := paComplete;
      FinishedPlaying := True;
    end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
|
||
{ Creates the offline TTS object for the English MatchaTTS model
  (matcha-icefall-en_US-ljspeech) together with the hifigan_v2 vocoder.

  All paths are relative to the current working directory.
  The main program releases the returned object with FreeAndNil. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
const
  ModelDir = './matcha-icefall-en_US-ljspeech';
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  { General settings }
  Config.MaxNumSentences := 1;
  Config.Model.NumThreads := 1;
  Config.Model.Debug := False;

  { Matcha-specific files }
  Config.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  Config.Model.Matcha.AcousticModel := ModelDir + '/model-steps-3.onnx';
  Config.Model.Matcha.Tokens := ModelDir + '/tokens.txt';
  Config.Model.Matcha.DataDir := ModelDir + '/espeak-ng-data';

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;
|
||
{ Main program.

  1. Create the TTS object and, when its sample rate differs from
     DeviceSampleRate, a linear resampler.
  2. Initialize PortAudio, select the output device (the default one, or
     the index given in the SHERPA_ONNX_MIC_DEVICE environment variable)
     and open a mono float32 output stream driven by PlayCallback.
  3. Generate speech; GenerateCallback pushes the samples into Buffer while
     PlayCallback drains it.
  4. Save the whole utterance to ./matcha-en-playback.wav, wait until
     playback has finished and release all resources. }
begin
  Tts := GetOfflineTts;
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);
  Status := Pa_Initialize;
  if Status <> paNoError then
  begin
    WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;

  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);

  DeviceIndex := Pa_GetDefaultOutputDevice;

  if DeviceIndex = paNoDevice then
  begin
    WriteLn('No default output device found');
    Pa_Terminate;
    Exit;
  end;

  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
  begin
    DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
    WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  end;

  { The index may come from the environment, so make sure it names an
    existing device; Pa_GetDeviceInfo is dereferenced below. }
  if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
  begin
    WriteLn('Invalid output device index: ', DeviceIndex);
    Pa_Terminate;
    Exit;
  end;

  { List all devices and mark the selected one with '*'. }
  for I := 0 to (NumDevices - 1) do
  begin
    DeviceInfo := Pa_GetDeviceInfo(I);
    if I = DeviceIndex then
      WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
    else
      WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  end;

  WriteLn('Use device ', DeviceIndex);
  WriteLn('  Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  WriteLn('  Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);

  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;

  { Capacity: 30 * DeviceSampleRate samples. }
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);

  if Status <> paNoError then
  begin
    WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
    Pa_Terminate;
    Exit;
  end;

  { Must be ready before the stream starts, because PlayCallback uses it. }
  InitCriticalSection(CriticalSection);

  Status := Pa_StartStream(stream);
  if Status <> paNoError then
  begin
    WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
    { Release what has been acquired so far. }
    Pa_CloseStream(stream);
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';

  Audio := Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  FinishedGeneration := True;
  SherpaOnnxWriteWave('./matcha-en-playback.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./matcha-en-playback.wav');

  while not FinishedPlaying do
    Pa_Sleep(100);  {sleep for 0.1 second }
  {TODO(fangjun): Use an event to indicate the play is finished}

  { Close the stream first so that PlayCallback can no longer run, then
    destroy the objects it uses. }
  Status := Pa_CloseStream(stream);

  DoneCriticalSection(CriticalSection);

  FreeAndNil(Tts);
  FreeAndNil(Resampler);
  FreeAndNil(Buffer);

  if Status <> paNoError then
  begin
    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
    Pa_Terminate;
    Exit;
  end;

  Status := Pa_Terminate;
  if Status <> paNoError then
  begin
    WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;
end.
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
{ Copyright (c) 2025 Xiaomi Corporation } | ||
program matcha_en; | ||
{ | ||
This file shows how to use the text to speech API of sherpa-onnx | ||
with MatchaTTS models. | ||
It generates speech from text and saves it to a wave file. | ||
If you want to play it while it is generating, please see | ||
./matcha-en-playback.pas | ||
} | ||
|
||
{$mode objfpc} | ||
|
||
uses | ||
SysUtils, | ||
sherpa_onnx; | ||
|
||
{ Creates the offline TTS object for the English MatchaTTS model
  (matcha-icefall-en_US-ljspeech) together with the hifigan_v2 vocoder.

  All paths are relative to the current working directory.
  The main program releases the returned object with FreeAndNil. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
const
  ModelDir = './matcha-icefall-en_US-ljspeech';
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  { General settings }
  Config.MaxNumSentences := 1;
  Config.Model.NumThreads := 1;
  Config.Model.Debug := False;

  { Matcha-specific files }
  Config.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  Config.Model.Matcha.AcousticModel := ModelDir + '/model-steps-3.onnx';
  Config.Model.Matcha.Tokens := ModelDir + '/tokens.txt';
  Config.Model.Matcha.DataDir := ModelDir + '/espeak-ng-data';

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;
|
||
const
  { Where the generated speech is written. }
  OutputFilename = './matcha-en.wav';

var
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;

  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;

{ Main program: create the TTS object, synthesize one fixed English
  sentence, write it to OutputFilename and release the TTS object. }
begin
  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';

  Tts := GetOfflineTts;
  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Audio := Tts.Generate(Text, SpeakerId, Speed);
  SherpaOnnxWriteWave(OutputFilename, Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ', OutputFilename);

  FreeAndNil(Tts);
end.
|
Oops, something went wrong.