Add Dart API for MatchaTTS models (#1687)

k2-fsa · Jan 6, 2025 · d7c95d3 · d7c95d3
1 parent c6fcd32
commit d7c95d3
Show file tree

Hide file tree

Showing 10 changed files with 349 additions and 24 deletions.
diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh
@@ -4,6 +4,31 @@ set -ex
 
 cd dart-api-examples
 
+# Run every TTS example; each section removes the models it downloaded
+# before the next example starts.
+pushd tts
+
+echo '----------matcha tts----------'
+./run-matcha-zh.sh
+./run-matcha-en.sh
+ls -lh *.wav
+rm -rf matcha-icefall-*
+# -f so that an unmatched glob does not abort the job under `set -e`.
+rm -f ./*.onnx
+
+echo '----------piper tts----------'
+./run-piper.sh
+rm -rf vits-piper-*
+
+echo '----------coqui tts----------'
+./run-coqui.sh
+rm -rf vits-coqui-*
+
+echo '----------zh tts----------'
+./run-vits-zh.sh
+rm -rf sherpa-onnx-*
+
+# List all wave files generated by the examples above.
+ls -lh *.wav
+
+popd # tts
+
+
 pushd speaker-diarization
 echo '----------speaker diarization----------'
 ./run.sh
@@ -106,22 +131,6 @@ rm -rf sherpa-onnx-*
 
 popd # non-streaming-asr
 
-pushd tts
-
-echo '----------piper tts----------'
-./run-piper.sh
-rm -rf vits-piper-*
-
-echo '----------coqui tts----------'
-./run-coqui.sh
-rm -rf vits-coqui-*
-
-echo '----------zh tts----------'
-./run-zh.sh
-rm -rf sherpa-onnx-*
-
-popd # tts
-
 pushd streaming-asr
 
 echo '----------streaming zipformer ctc HLG----------'

diff --git a/.github/workflows/checksum.yaml b/.github/workflows/checksum.yaml
@@ -7,6 +7,7 @@ on:
 
 jobs:
   checksum:
+    if: github.repository_owner == 'k2-fsa'
     runs-on: macos-latest
     strategy:
       matrix:

diff --git a/dart-api-examples/tts/bin/matcha-en.dart b/dart-api-examples/tts/bin/matcha-en.dart
@@ -0,0 +1,86 @@
+// Copyright (c)  2025  Xiaomi Corporation
+import 'dart:io';
+
+import 'package:args/args.dart';
+import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
+
+import './init.dart';
+
+/// Generates speech from `--text` with a Matcha TTS model and writes it to
+/// the file given by `--output-wav`.
+///
+/// The options `--acoustic-model`, `--vocoder`, `--tokens`, `--output-wav`
+/// and `--text` are required; the usage is printed and the process exits
+/// with status 1 if any of them is missing.
+void main(List<String> arguments) async {
+  await initSherpaOnnx();
+
+  final parser = ArgParser()
+    ..addOption('acoustic-model', help: 'Path to the acoustic model')
+    ..addOption('vocoder', help: 'Path to the vocoder model')
+    ..addOption('tokens', help: 'Path to tokens.txt')
+    ..addOption(
+      'data-dir',
+      help: 'Path to espeak-ng-data directory',
+      defaultsTo: '',
+    )
+    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
+    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
+    ..addOption('text', help: 'Text to generate TTS for')
+    ..addOption('output-wav', help: 'Filename to save the generated audio')
+    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
+    ..addOption(
+      'sid',
+      help: 'Speaker ID to select. Used only for multi-speaker TTS',
+      defaultsTo: '0',
+    );
+  final res = parser.parse(arguments);
+
+  // Options declared with `defaultsTo` (e.g. `data-dir`) are never null, so
+  // only the options without a default have to be checked.
+  const requiredOptions = [
+    'acoustic-model',
+    'vocoder',
+    'tokens',
+    'output-wav',
+    'text',
+  ];
+  if (requiredOptions.any((name) => res[name] == null)) {
+    print(parser.usage);
+    exit(1);
+  }
+
+  final acousticModel = res['acoustic-model'] as String;
+  final vocoder = res['vocoder'] as String;
+  final tokens = res['tokens'] as String;
+  final dataDir = res['data-dir'] as String;
+  final ruleFsts = res['rule-fsts'] as String;
+  final ruleFars = res['rule-fars'] as String;
+  final text = res['text'] as String;
+  final outputWav = res['output-wav'] as String;
+  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
+  final sid = int.tryParse(res['sid'] as String) ?? 0;
+
+  // lengthScale below is 1 / speed, so speed must be a finite positive
+  // number. Fall back to the default for 0, negative, NaN and infinity.
+  if (!speed.isFinite || speed <= 0) {
+    speed = 1.0;
+  }
+
+  final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
+    acousticModel: acousticModel,
+    vocoder: vocoder,
+    tokens: tokens,
+    dataDir: dataDir,
+    lengthScale: 1 / speed,
+  );
+
+  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
+    matcha: matcha,
+    numThreads: 1,
+    debug: true,
+  );
+  final config = sherpa_onnx.OfflineTtsConfig(
+    model: modelConfig,
+    maxNumSenetences: 1,
+    ruleFsts: ruleFsts,
+    ruleFars: ruleFars,
+  );
+
+  final tts = sherpa_onnx.OfflineTts(config);
+  try {
+    final audio = tts.generate(text: text, sid: sid, speed: speed);
+
+    sherpa_onnx.writeWave(
+      filename: outputWav,
+      samples: audio.samples,
+      sampleRate: audio.sampleRate,
+    );
+  } finally {
+    // Free the TTS object even if generating or saving the audio throws.
+    tts.free();
+  }
+  print('Saved to $outputWav');
+}
diff --git a/dart-api-examples/tts/bin/matcha-zh.dart b/dart-api-examples/tts/bin/matcha-zh.dart
@@ -0,0 +1,90 @@
+// Copyright (c)  2025  Xiaomi Corporation
+import 'dart:io';
+
+import 'package:args/args.dart';
+import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
+
+import './init.dart';
+
+/// Generates speech from `--text` with a Matcha TTS model that uses a
+/// lexicon and writes it to the file given by `--output-wav`.
+///
+/// The options `--acoustic-model`, `--vocoder`, `--lexicon`, `--tokens`,
+/// `--output-wav` and `--text` are required; the usage is printed and the
+/// process exits with status 1 if any of them is missing.
+void main(List<String> arguments) async {
+  await initSherpaOnnx();
+
+  final parser = ArgParser()
+    ..addOption('acoustic-model', help: 'Path to the acoustic model')
+    ..addOption('vocoder', help: 'Path to the vocoder model')
+    ..addOption('tokens', help: 'Path to tokens.txt')
+    ..addOption('lexicon', help: 'Path to lexicon.txt')
+    ..addOption(
+      'dict-dir',
+      help: 'Path to jieba dict directory',
+      defaultsTo: '',
+    )
+    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
+    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
+    ..addOption('text', help: 'Text to generate TTS for')
+    ..addOption('output-wav', help: 'Filename to save the generated audio')
+    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
+    ..addOption(
+      'sid',
+      help: 'Speaker ID to select. Used only for multi-speaker TTS',
+      defaultsTo: '0',
+    );
+  final res = parser.parse(arguments);
+
+  // Options declared with `defaultsTo` (e.g. `dict-dir`) are never null, so
+  // only the options without a default have to be checked.
+  const requiredOptions = [
+    'acoustic-model',
+    'vocoder',
+    'lexicon',
+    'tokens',
+    'output-wav',
+    'text',
+  ];
+  if (requiredOptions.any((name) => res[name] == null)) {
+    print(parser.usage);
+    exit(1);
+  }
+
+  final acousticModel = res['acoustic-model'] as String;
+  final vocoder = res['vocoder'] as String;
+  final lexicon = res['lexicon'] as String;
+  final tokens = res['tokens'] as String;
+  final dictDir = res['dict-dir'] as String;
+  final ruleFsts = res['rule-fsts'] as String;
+  final ruleFars = res['rule-fars'] as String;
+  final text = res['text'] as String;
+  final outputWav = res['output-wav'] as String;
+  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
+  final sid = int.tryParse(res['sid'] as String) ?? 0;
+
+  // lengthScale below is 1 / speed, so speed must be a finite positive
+  // number. Fall back to the default for 0, negative, NaN and infinity.
+  if (!speed.isFinite || speed <= 0) {
+    speed = 1.0;
+  }
+
+  final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
+    acousticModel: acousticModel,
+    vocoder: vocoder,
+    lexicon: lexicon,
+    tokens: tokens,
+    dictDir: dictDir,
+    lengthScale: 1 / speed,
+  );
+
+  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
+    matcha: matcha,
+    numThreads: 1,
+    debug: true,
+  );
+  final config = sherpa_onnx.OfflineTtsConfig(
+    model: modelConfig,
+    maxNumSenetences: 1,
+    ruleFsts: ruleFsts,
+    ruleFars: ruleFars,
+  );
+
+  final tts = sherpa_onnx.OfflineTts(config);
+  try {
+    final audio = tts.generate(text: text, sid: sid, speed: speed);
+
+    sherpa_onnx.writeWave(
+      filename: outputWav,
+      samples: audio.samples,
+      sampleRate: audio.sampleRate,
+    );
+  } finally {
+    // Free the TTS object even if generating or saving the audio throws.
+    tts.free();
+  }
+  print('Saved to $outputWav');
+}
diff --git a/dart-api-examples/tts/bin/zh.dart → dart-api-examples/tts/bin/vits-zh.dart b/dart-api-examples/tts/bin/zh.dart → dart-api-examples/tts/bin/vits-zh.dart
diff --git a/dart-api-examples/tts/run-matcha-en.sh b/dart-api-examples/tts/run-matcha-en.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+#
+# Downloads the English Matcha acoustic model and the vocoder if they are not
+# present yet, then runs ./bin/matcha-en.dart to generate matcha-en-1.wav.
+
+set -ex
+
+dart pub get
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+# to download more models
+if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
+  rm matcha-icefall-en_US-ljspeech.tar.bz2
+fi
+
+# The vocoder is a separate download.
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+# Note: the last argument must not end with a line continuation.
+dart run \
+  ./bin/matcha-en.dart \
+  --acoustic-model ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --vocoder ./hifigan_v2.onnx \
+  --tokens ./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --data-dir ./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --sid 0 \
+  --speed 1.0 \
+  --output-wav matcha-en-1.wav \
+  --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
+
+ls -lh *.wav
diff --git a/dart-api-examples/tts/run-matcha-zh.sh b/dart-api-examples/tts/run-matcha-zh.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+set -ex
+
+dart pub get
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+# to download more models
+if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+  tar xvf matcha-icefall-zh-baker.tar.bz2
+  rm matcha-icefall-zh-baker.tar.bz2
+fi
+
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+dart run \
+  ./bin/matcha-zh.dart \
+  --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
+  --vocoder ./hifigan_v2.onnx \
+  --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
+  --tokens ./matcha-icefall-zh-baker/tokens.txt \
+  --dict-dir ./matcha-icefall-zh-baker/dict \
+  --rule-fsts ./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
+  --sid 0 \
+  --speed 1.0 \
+  --output-wav matcha-zh-1.wav \
+  --text "某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。" \
+
+dart run \
+  ./bin/matcha-zh.dart \
+  --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
+  --vocoder ./hifigan_v2.onnx \
+  --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
+  --tokens ./matcha-icefall-zh-baker/tokens.txt \
+  --dict-dir ./matcha-icefall-zh-baker/dict \
+  --sid 0 \
+  --speed 1.0 \
+  --output-wav matcha-zh-2.wav \
+  --text "当夜幕降临，星光点点，伴随着微风拂面，我在静谧中感受着时光的流转，思念如涟漪荡漾，梦境如画卷展开，我与自然融为一体，沉静在这片宁静的美丽之中，感受着生命的奇迹与温柔." \
+
+ls -lh *.wav
diff --git a/dart-api-examples/tts/run-zh.sh → dart-api-examples/tts/run-vits-zh.sh b/dart-api-examples/tts/run-zh.sh → dart-api-examples/tts/run-vits-zh.sh
@@ -16,18 +16,18 @@ if [[ ! -f ./sherpa-onnx-vits-zh-ll/tokens.txt ]]; then
 fi
 
 dart run \
-  ./bin/zh.dart \
+  ./bin/vits-zh.dart \
   --model ./sherpa-onnx-vits-zh-ll/model.onnx \
   --lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \
   --tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \
   --dict-dir ./sherpa-onnx-vits-zh-ll/dict \
   --sid 2 \
   --speed 1.0 \
   --text '当夜幕降临，星光点点，伴随着微风拂面，我在静谧中感受着时光的流转，思念如涟漪荡漾，梦境如画卷展开，我与自然融为一体，沉静在这片宁静的美丽之中，感受着生命的奇迹与温柔。' \
-  --output-wav zh-jieba-2.wav
+  --output-wav vits-zh-jieba-2.wav
 
 dart run \
-  ./bin/zh.dart \
+  ./bin/vits-zh.dart \
   --model ./sherpa-onnx-vits-zh-ll/model.onnx \
   --lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \
   --tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \
@@ -36,6 +36,6 @@ dart run \
   --sid 3 \
   --speed 1.0 \
   --text '今天是2024年6月15号，13点23分。如果有困难，请拨打110或者18920240511。123456块钱。' \
-  --output-wav zh-jieba-3.wav
+  --output-wav vits-zh-jieba-3.wav
 
 ls -lh *.wav
diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
@@ -131,6 +131,22 @@ final class SherpaOnnxOfflineTtsVitsModelConfig extends Struct {
   external Pointer<Utf8> dictDir;
 }
 
+/// FFI struct holding the configuration of a Matcha TTS model.
+///
+/// NOTE(review): presumably mirrors the native struct of the same name, so
+/// the order and types of the fields must stay in sync with it. Confirm
+/// against the C API header before changing anything here.
+final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct {
+  external Pointer<Utf8> acousticModel;
+  external Pointer<Utf8> vocoder;
+  external Pointer<Utf8> lexicon;
+  external Pointer<Utf8> tokens;
+  external Pointer<Utf8> dataDir;
+
+  // Stored as a 32-bit float on the native side.
+  @Float()
+  external double noiseScale;
+
+  // Stored as a 32-bit float on the native side.
+  @Float()
+  external double lengthScale;
+
+  external Pointer<Utf8> dictDir;
+}
+
+
 final class SherpaOnnxOfflineTtsModelConfig extends Struct {
   external SherpaOnnxOfflineTtsVitsModelConfig vits;
   @Int32()
@@ -140,6 +156,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct {
   external int debug;
 
   external Pointer<Utf8> provider;
+  external SherpaOnnxOfflineTtsMatchaModelConfig matcha;
 }
 
 final class SherpaOnnxOfflineTtsConfig extends Struct {