-
Notifications
You must be signed in to change notification settings - Fork 477
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Dart API for MatchaTTS models (#1687)
- Loading branch information
1 parent
c6fcd32
commit d7c95d3
Showing
10 changed files
with
349 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ on: | |
|
||
jobs: | ||
checksum: | ||
if: github.repository_owner == 'k2-fsa' | ||
runs-on: macos-latest | ||
strategy: | ||
matrix: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
// Copyright (c) 2025 Xiaomi Corporation | ||
import 'dart:io'; | ||
|
||
import 'package:args/args.dart'; | ||
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; | ||
|
||
import './init.dart'; | ||
|
||
/// Synthesizes speech with a MatchaTTS acoustic model plus a vocoder and
/// saves the generated samples to a wave file.
///
/// Required options: `--acoustic-model`, `--vocoder`, `--tokens`,
/// `--output-wav` and `--text`. The remaining options have defaults.
///
/// Prints the usage text and exits with status 1 when the arguments cannot be
/// parsed or a required option is missing.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('acoustic-model', help: 'Path to the acoustic model')
    ..addOption('vocoder', help: 'Path to the vocoder model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption(
      'data-dir',
      help: 'Path to espeak-ng-data directory',
      defaultsTo: '',
    )
    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
    ..addOption('text', help: 'Text to generate TTS for')
    ..addOption('output-wav', help: 'Filename to save the generated audio')
    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
    ..addOption(
      'sid',
      help: 'Speaker ID to select. Used only for multi-speaker TTS',
      defaultsTo: '0',
    );

  // An unknown or malformed option makes parse() throw; report it the same
  // way as a missing option instead of crashing with a stack trace.
  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    print(e.message);
    print(parser.usage);
    exit(1);
  }

  // Only options without a default can be null. 'data-dir' defaults to ''
  // and therefore never needs a null check.
  const requiredOptions = [
    'acoustic-model',
    'vocoder',
    'tokens',
    'output-wav',
    'text',
  ];
  if (requiredOptions.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  final acousticModel = res['acoustic-model'] as String;
  final vocoder = res['vocoder'] as String;
  final tokens = res['tokens'] as String;
  final dataDir = res['data-dir'] as String;
  final ruleFsts = res['rule-fsts'] as String;
  final ruleFars = res['rule-fars'] as String;
  final text = res['text'] as String;
  final outputWav = res['output-wav'] as String;
  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
  final sid = int.tryParse(res['sid'] as String) ?? 0;

  // The speed is inverted below, so anything that is not a finite positive
  // number (0, negatives, NaN, infinity) falls back to the default.
  if (!speed.isFinite || speed <= 0) {
    speed = 1.0;
  }

  final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
    acousticModel: acousticModel,
    vocoder: vocoder,
    tokens: tokens,
    dataDir: dataDir,
    lengthScale: 1 / speed,
  );

  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
    matcha: matcha,
    numThreads: 1,
    debug: true,
  );
  final config = sherpa_onnx.OfflineTtsConfig(
    model: modelConfig,
    // The parameter name is spelled this way by the package API.
    maxNumSenetences: 1,
    ruleFsts: ruleFsts,
    ruleFars: ruleFars,
  );

  final tts = sherpa_onnx.OfflineTts(config);
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);
    sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
  } finally {
    // Always release the TTS object, even if generation or writing throws.
    tts.free();
  }
  print('Saved to $outputWav');
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
// Copyright (c) 2025 Xiaomi Corporation | ||
import 'dart:io'; | ||
|
||
import 'package:args/args.dart'; | ||
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; | ||
|
||
import './init.dart'; | ||
|
||
/// Synthesizes speech with a lexicon-based MatchaTTS acoustic model plus a
/// vocoder and saves the generated samples to a wave file.
///
/// Required options: `--acoustic-model`, `--vocoder`, `--lexicon`,
/// `--tokens`, `--output-wav` and `--text`. The remaining options have
/// defaults.
///
/// Prints the usage text and exits with status 1 when the arguments cannot be
/// parsed or a required option is missing.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('acoustic-model', help: 'Path to the acoustic model')
    ..addOption('vocoder', help: 'Path to the vocoder model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('lexicon', help: 'Path to lexicon.txt')
    ..addOption(
      'dict-dir',
      help: 'Path to jieba dict directory',
      defaultsTo: '',
    )
    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
    ..addOption('text', help: 'Text to generate TTS for')
    ..addOption('output-wav', help: 'Filename to save the generated audio')
    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
    ..addOption(
      'sid',
      help: 'Speaker ID to select. Used only for multi-speaker TTS',
      defaultsTo: '0',
    );

  // An unknown or malformed option makes parse() throw; report it the same
  // way as a missing option instead of crashing with a stack trace.
  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    print(e.message);
    print(parser.usage);
    exit(1);
  }

  // Only options without a default can be null. 'dict-dir' defaults to ''
  // and therefore never needs a null check.
  const requiredOptions = [
    'acoustic-model',
    'vocoder',
    'lexicon',
    'tokens',
    'output-wav',
    'text',
  ];
  if (requiredOptions.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  final acousticModel = res['acoustic-model'] as String;
  final vocoder = res['vocoder'] as String;
  final lexicon = res['lexicon'] as String;
  final tokens = res['tokens'] as String;
  final dictDir = res['dict-dir'] as String;
  final ruleFsts = res['rule-fsts'] as String;
  final ruleFars = res['rule-fars'] as String;
  final text = res['text'] as String;
  final outputWav = res['output-wav'] as String;
  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
  final sid = int.tryParse(res['sid'] as String) ?? 0;

  // The speed is inverted below, so anything that is not a finite positive
  // number (0, negatives, NaN, infinity) falls back to the default.
  if (!speed.isFinite || speed <= 0) {
    speed = 1.0;
  }

  final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
    acousticModel: acousticModel,
    vocoder: vocoder,
    lexicon: lexicon,
    tokens: tokens,
    dictDir: dictDir,
    lengthScale: 1 / speed,
  );

  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
    matcha: matcha,
    numThreads: 1,
    debug: true,
  );
  final config = sherpa_onnx.OfflineTtsConfig(
    model: modelConfig,
    // The parameter name is spelled this way by the package API.
    maxNumSenetences: 1,
    ruleFsts: ruleFsts,
    ruleFars: ruleFars,
  );

  final tts = sherpa_onnx.OfflineTts(config);
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);
    sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
  } finally {
    // Always release the TTS object, even if generation or writing throws.
    tts.free();
  }
  print('Saved to $outputWav');
}
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#!/usr/bin/env bash
#
# Downloads the English MatchaTTS model and the HiFi-GAN vocoder if they are
# not already present, then runs ./bin/matcha-en.dart to produce
# matcha-en-1.wav.

set -ex

dart pub get

# Please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
# to download more models.
#
# curl runs with -f so that an HTTP error aborts the script instead of saving
# the error page under the model's file name, which would make the
# "already downloaded" checks below pass on the next run.
if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

dart run \
  ./bin/matcha-en.dart \
  --acoustic-model ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --tokens ./matcha-icefall-en_US-ljspeech/tokens.txt \
  --data-dir ./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-en-1.wav \
  --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."

ls -lh *.wav
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/env bash
#
# Downloads the Chinese MatchaTTS model and the HiFi-GAN vocoder if they are
# not already present, then runs ./bin/matcha-zh.dart twice to produce
# matcha-zh-1.wav (with rule FSTs) and matcha-zh-2.wav (without).

set -ex

dart pub get

# Please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
# to download more models.
#
# curl runs with -f so that an HTTP error aborts the script instead of saving
# the error page under the model's file name, which would make the
# "already downloaded" checks below pass on the next run.
if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf matcha-icefall-zh-baker.tar.bz2
  rm matcha-icefall-zh-baker.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# First run: the rule FSTs are passed via --rule-fsts.
dart run \
  ./bin/matcha-zh.dart \
  --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
  --tokens ./matcha-icefall-zh-baker/tokens.txt \
  --dict-dir ./matcha-icefall-zh-baker/dict \
  --rule-fsts ./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-zh-1.wav \
  --text "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"

# Second run: no rule FSTs.
dart run \
  ./bin/matcha-zh.dart \
  --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
  --tokens ./matcha-icefall-zh-baker/tokens.txt \
  --dict-dir ./matcha-icefall-zh-baker/dict \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-zh-2.wav \
  --text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔."

ls -lh *.wav
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.