Skip to content

Commit

Permalink
Add Dart API for MatchaTTS models (#1687)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Jan 6, 2025
1 parent c6fcd32 commit d7c95d3
Show file tree
Hide file tree
Showing 10 changed files with 349 additions and 24 deletions.
41 changes: 25 additions & 16 deletions .github/scripts/test-dart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,31 @@ set -ex

cd dart-api-examples

pushd tts

echo '----------matcha tts----------'
./run-matcha-zh.sh
./run-matcha-en.sh
ls -lh *.wav
rm -rf matcha-icefall-*
rm *.onnx

echo '----------piper tts----------'
./run-piper.sh
rm -rf vits-piper-*

echo '----------coqui tts----------'
./run-coqui.sh
rm -rf vits-coqui-*

echo '----------zh tts----------'
./run-vits-zh.sh
rm -rf sherpa-onnx-*

ls -lh *.wav

popd # tts

pushd speaker-diarization
echo '----------speaker diarization----------'
./run.sh
Expand Down Expand Up @@ -106,22 +131,6 @@ rm -rf sherpa-onnx-*

popd # non-streaming-asr

pushd tts

echo '----------piper tts----------'
./run-piper.sh
rm -rf vits-piper-*

echo '----------coqui tts----------'
./run-coqui.sh
rm -rf vits-coqui-*

echo '----------zh tts----------'
./run-zh.sh
rm -rf sherpa-onnx-*

popd # tts

pushd streaming-asr

echo '----------streaming zipformer ctc HLG----------'
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/checksum.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:

jobs:
checksum:
if: github.repository_owner == 'k2-fsa'
runs-on: macos-latest
strategy:
matrix:
Expand Down
86 changes: 86 additions & 0 deletions dart-api-examples/tts/bin/matcha-en.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright (c) 2025 Xiaomi Corporation
import 'dart:io';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Synthesizes speech with a MatchaTTS model that uses an espeak-ng data
/// directory and saves the result as a wave file.
///
/// Required options: `--acoustic-model`, `--vocoder`, `--tokens`,
/// `--output-wav` and `--text`. When one of them is missing, or the command
/// line cannot be parsed, the usage text is printed and the process exits
/// with status 1.
Future<void> main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('acoustic-model', help: 'Path to the acoustic model')
    ..addOption('vocoder', help: 'Path to the vocoder model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption(
      'data-dir',
      help: 'Path to espeak-ng-data directory',
      defaultsTo: '',
    )
    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
    ..addOption('text', help: 'Text to generate TTS for')
    ..addOption('output-wav', help: 'Filename to save the generated audio')
    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
    ..addOption(
      'sid',
      help: 'Speaker ID to select. Used only for multi-speaker TTS',
      defaultsTo: '0',
    );

  // A malformed command line (for example an unknown option) makes
  // ArgParser.parse throw; report it together with the usage text instead of
  // letting the exception escape.
  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    print(e.message);
    print(parser.usage);
    exit(1);
  }

  // Only options without a default can be null here; 'data-dir',
  // 'rule-fsts', 'rule-fars', 'speed' and 'sid' always have a value.
  const requiredOptions = [
    'acoustic-model',
    'vocoder',
    'tokens',
    'output-wav',
    'text',
  ];
  if (requiredOptions.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  final acousticModel = res['acoustic-model'] as String;
  final vocoder = res['vocoder'] as String;
  final tokens = res['tokens'] as String;
  final dataDir = res['data-dir'] as String;
  final ruleFsts = res['rule-fsts'] as String;
  final ruleFars = res['rule-fars'] as String;
  final text = res['text'] as String;
  final outputWav = res['output-wav'] as String;
  final sid = int.tryParse(res['sid'] as String) ?? 0;

  // The length scale below is 1 / speed, so zero, negative and non-finite
  // speeds are replaced by the default of 1.0.
  final requestedSpeed = double.tryParse(res['speed'] as String) ?? 1.0;
  final speed =
      (requestedSpeed.isFinite && requestedSpeed > 0) ? requestedSpeed : 1.0;

  final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
    acousticModel: acousticModel,
    vocoder: vocoder,
    tokens: tokens,
    dataDir: dataDir,
    lengthScale: 1 / speed,
  );

  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
    matcha: matcha,
    numThreads: 1,
    debug: true,
  );
  final config = sherpa_onnx.OfflineTtsConfig(
    model: modelConfig,
    maxNumSenetences: 1,
    ruleFsts: ruleFsts,
    ruleFars: ruleFars,
  );

  final tts = sherpa_onnx.OfflineTts(config);
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);

    sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
    print('Saved to $outputWav');
  } finally {
    // Release the TTS object even when generation or writing throws.
    tts.free();
  }
}
90 changes: 90 additions & 0 deletions dart-api-examples/tts/bin/matcha-zh.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (c) 2025 Xiaomi Corporation
import 'dart:io';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Synthesizes speech with a MatchaTTS model that uses a lexicon and a jieba
/// dict directory and saves the result as a wave file.
///
/// Required options: `--acoustic-model`, `--vocoder`, `--lexicon`,
/// `--tokens`, `--output-wav` and `--text`. When one of them is missing, or
/// the command line cannot be parsed, the usage text is printed and the
/// process exits with status 1.
Future<void> main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('acoustic-model', help: 'Path to the acoustic model')
    ..addOption('vocoder', help: 'Path to the vocoder model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('lexicon', help: 'Path to lexicon.txt')
    ..addOption(
      'dict-dir',
      help: 'Path to jieba dict directory',
      defaultsTo: '',
    )
    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
    ..addOption('text', help: 'Text to generate TTS for')
    ..addOption('output-wav', help: 'Filename to save the generated audio')
    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
    ..addOption(
      'sid',
      help: 'Speaker ID to select. Used only for multi-speaker TTS',
      defaultsTo: '0',
    );

  // A malformed command line (for example an unknown option) makes
  // ArgParser.parse throw; report it together with the usage text instead of
  // letting the exception escape.
  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    print(e.message);
    print(parser.usage);
    exit(1);
  }

  // Only options without a default can be null here; 'dict-dir',
  // 'rule-fsts', 'rule-fars', 'speed' and 'sid' always have a value.
  const requiredOptions = [
    'acoustic-model',
    'vocoder',
    'lexicon',
    'tokens',
    'output-wav',
    'text',
  ];
  if (requiredOptions.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  final acousticModel = res['acoustic-model'] as String;
  final vocoder = res['vocoder'] as String;
  final lexicon = res['lexicon'] as String;
  final tokens = res['tokens'] as String;
  final dictDir = res['dict-dir'] as String;
  final ruleFsts = res['rule-fsts'] as String;
  final ruleFars = res['rule-fars'] as String;
  final text = res['text'] as String;
  final outputWav = res['output-wav'] as String;
  final sid = int.tryParse(res['sid'] as String) ?? 0;

  // The length scale below is 1 / speed, so zero, negative and non-finite
  // speeds are replaced by the default of 1.0.
  final requestedSpeed = double.tryParse(res['speed'] as String) ?? 1.0;
  final speed =
      (requestedSpeed.isFinite && requestedSpeed > 0) ? requestedSpeed : 1.0;

  final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
    acousticModel: acousticModel,
    vocoder: vocoder,
    lexicon: lexicon,
    tokens: tokens,
    dictDir: dictDir,
    lengthScale: 1 / speed,
  );

  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
    matcha: matcha,
    numThreads: 1,
    debug: true,
  );
  final config = sherpa_onnx.OfflineTtsConfig(
    model: modelConfig,
    maxNumSenetences: 1,
    ruleFsts: ruleFsts,
    ruleFars: ruleFars,
  );

  final tts = sherpa_onnx.OfflineTts(config);
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);

    sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
    print('Saved to $outputWav');
  } finally {
    // Release the TTS object even when generation or writing throws.
    tts.free();
  }
}
File renamed without changes.
32 changes: 32 additions & 0 deletions dart-api-examples/tts/run-matcha-en.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
#
# Downloads the matcha-icefall-en_US-ljspeech model and the hifigan_v2.onnx
# vocoder when they are not present yet, then runs ./bin/matcha-en.dart to
# synthesize one English sentence into matcha-en-1.wav.

set -ex

dart pub get

# Please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
# to download more models.
if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# The last argument carries no trailing backslash, so the command ends here
# regardless of what follows it.
dart run \
  ./bin/matcha-en.dart \
  --acoustic-model ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --tokens ./matcha-icefall-en_US-ljspeech/tokens.txt \
  --data-dir ./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-en-1.wav \
  --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."

ls -lh *.wav
45 changes: 45 additions & 0 deletions dart-api-examples/tts/run-matcha-zh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
#
# Downloads the matcha-icefall-zh-baker model and the hifigan_v2.onnx vocoder
# when they are not present yet, then runs ./bin/matcha-zh.dart twice:
# once with rule FSTs (matcha-zh-1.wav) and once without (matcha-zh-2.wav).

set -ex

dart pub get

# Please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
# to download more models.
if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf matcha-icefall-zh-baker.tar.bz2
  rm matcha-icefall-zh-baker.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# The last argument of each command carries no trailing backslash, so the
# command ends there regardless of what follows it.
dart run \
  ./bin/matcha-zh.dart \
  --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
  --tokens ./matcha-icefall-zh-baker/tokens.txt \
  --dict-dir ./matcha-icefall-zh-baker/dict \
  --rule-fsts ./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-zh-1.wav \
  --text "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"

dart run \
  ./bin/matcha-zh.dart \
  --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
  --tokens ./matcha-icefall-zh-baker/tokens.txt \
  --dict-dir ./matcha-icefall-zh-baker/dict \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-zh-2.wav \
  --text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔."

ls -lh *.wav
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ if [[ ! -f ./sherpa-onnx-vits-zh-ll/tokens.txt ]]; then
fi

dart run \
./bin/zh.dart \
./bin/vits-zh.dart \
--model ./sherpa-onnx-vits-zh-ll/model.onnx \
--lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \
--tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \
--dict-dir ./sherpa-onnx-vits-zh-ll/dict \
--sid 2 \
--speed 1.0 \
--text '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。' \
--output-wav zh-jieba-2.wav
--output-wav vits-zh-jieba-2.wav

dart run \
./bin/zh.dart \
./bin/vits-zh.dart \
--model ./sherpa-onnx-vits-zh-ll/model.onnx \
--lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \
--tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \
Expand All @@ -36,6 +36,6 @@ dart run \
--sid 3 \
--speed 1.0 \
--text '今天是2024年6月15号,13点23分。如果有困难,请拨打110或者18920240511。123456块钱。' \
--output-wav zh-jieba-3.wav
--output-wav vits-zh-jieba-3.wav

ls -lh *.wav
17 changes: 17 additions & 0 deletions flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,22 @@ final class SherpaOnnxOfflineTtsVitsModelConfig extends Struct {
external Pointer<Utf8> dictDir;
}

/// FFI struct describing the configuration of a MatchaTTS model.
///
/// NOTE(review): presumably mirrors the native struct of the same name in the
/// sherpa-onnx C API; confirm against the C header. The native layout of a
/// `dart:ffi` struct follows the declared field order, so fields must not be
/// reordered, removed, or retyped independently of the native definition.
final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct {
// String-valued settings, each held as a pointer to UTF-8 data.
external Pointer<Utf8> acousticModel;
external Pointer<Utf8> vocoder;
external Pointer<Utf8> lexicon;
external Pointer<Utf8> tokens;
external Pointer<Utf8> dataDir;

// Exposed to Dart as a double, stored natively as a float.
@Float()
external double noiseScale;

// Exposed to Dart as a double, stored natively as a float.
@Float()
external double lengthScale;

// Declared after the float fields; keep it in this position.
external Pointer<Utf8> dictDir;
}

final class SherpaOnnxOfflineTtsModelConfig extends Struct {
external SherpaOnnxOfflineTtsVitsModelConfig vits;
@Int32()
Expand All @@ -140,6 +156,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct {
external int debug;

external Pointer<Utf8> provider;
external SherpaOnnxOfflineTtsMatchaModelConfig matcha;
}

final class SherpaOnnxOfflineTtsConfig extends Struct {
Expand Down
Loading

0 comments on commit d7c95d3

Please sign in to comment.