Skip to content

Commit

Permalink
Add streaming ASR examples for Dart API (#1009)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Jun 15, 2024
1 parent d945066 commit e307767
Show file tree
Hide file tree
Showing 30 changed files with 1,021 additions and 2 deletions.
26 changes: 25 additions & 1 deletion .github/scripts/test-dart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,30 @@ set -ex

cd dart-api-examples

# Streaming ASR examples.
# Each run-*.sh script downloads its model directory (named sherpa-onnx-*),
# runs recognition on a bundled test wave, and the model files are removed
# afterwards to keep CI disk usage low.
pushd streaming-asr

echo '----------streaming zipformer ctc HLG----------'
./run-zipformer-ctc-hlg.sh
rm -rf sherpa-onnx-*

echo '----------streaming zipformer ctc----------'
./run-zipformer-ctc.sh
rm -rf sherpa-onnx-*

echo '----------streaming zipformer transducer----------'
./run-zipformer-transducer.sh
rm -rf sherpa-onnx-*

echo '----------streaming NeMo transducer----------'
./run-nemo-transducer.sh
rm -rf sherpa-onnx-*

echo '----------streaming paraformer----------'
./run-paraformer.sh
rm -rf sherpa-onnx-*

popd # streaming-asr

pushd non-streaming-asr

echo '----------VAD with paraformer----------'
Expand Down Expand Up @@ -34,7 +58,7 @@ echo '----------zipformer transducer----------'
./run-zipformer-transducer.sh
rm -rf sherpa-onnx-*

popd
popd # non-streaming-asr

pushd vad
./run.sh
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/test-dart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ on:
- master
paths:
- '.github/workflows/test-dart.yaml'
- '.github/scripts/test-dart.sh'
- 'dart-api-examples/**'
pull_request:
branches:
- master
paths:
- '.github/workflows/test-dart.yaml'
- '.github/scripts/test-dart.sh'
- 'dart-api-examples/**'

workflow_dispatch:
Expand Down Expand Up @@ -89,5 +91,6 @@ jobs:
run: |
cp scripts/dart/vad-pubspec.yaml dart-api-examples/vad/pubspec.yaml
cp scripts/dart/non-streaming-asr-pubspec.yaml dart-api-examples/non-streaming-asr/pubspec.yaml
cp scripts/dart/streaming-asr-pubspec.yaml dart-api-examples/streaming-asr/pubspec.yaml
.github/scripts/test-dart.sh
1 change: 1 addition & 0 deletions dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

Expand Down
1 change: 1 addition & 0 deletions dart-api-examples/non-streaming-asr/bin/paraformer.dart
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

Expand Down
1 change: 1 addition & 0 deletions dart-api-examples/non-streaming-asr/bin/whisper.dart
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

Expand Down
3 changes: 3 additions & 0 deletions dart-api-examples/streaming-asr/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# https://dart.dev/guides/libraries/private-files
# Created by `dart pub`
.dart_tool/
3 changes: 3 additions & 0 deletions dart-api-examples/streaming-asr/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## 1.0.0

- Initial version.
11 changes: 11 additions & 0 deletions dart-api-examples/streaming-asr/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Introduction

This folder contains examples for streaming ASR with the Dart API.

| File | Description|
|------|------------|
|[./bin/nemo-transducer.dart](./bin/nemo-transducer.dart)| Use a NeMo transducer model for speech recognition. See [./run-nemo-transducer.sh](./run-nemo-transducer.sh)|
|[./bin/paraformer.dart](./bin/paraformer.dart)| Use a Paraformer model for speech recognition. See [./run-paraformer.sh](./run-paraformer.sh)|
|[./bin/zipformer-ctc-hlg.dart](./bin/zipformer-ctc-hlg.dart)| Use a Zipformer CTC model with HLG graph for speech recognition. See [./run-zipformer-ctc-hlg.sh](./run-zipformer-ctc-hlg.sh)|
|[./bin/zipformer-ctc.dart](./bin/zipformer-ctc.dart)| Use a Zipformer CTC model for speech recognition. See [./run-zipformer-ctc.sh](./run-zipformer-ctc.sh)|
|[./bin/zipformer-transducer.dart](./bin/zipformer-transducer.dart)| Use a Zipformer transducer model for speech recognition. See [./run-zipformer-transducer.sh](./run-zipformer-transducer.sh)|

30 changes: 30 additions & 0 deletions dart-api-examples/streaming-asr/analysis_options.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# This file configures the static analysis results for your project (errors,
# warnings, and lints).
#
# This enables the 'recommended' set of lints from `package:lints`.
# This set helps identify many issues that may lead to problems when running
# or consuming Dart code, and enforces writing Dart using a single, idiomatic
# style and format.
#
# If you want a smaller set of lints you can change this to specify
# 'package:lints/core.yaml'. These are just the most critical lints
# (the recommended set includes the core lints).
# The core lints are also what is used by pub.dev for scoring packages.

include: package:lints/recommended.yaml

# Uncomment the following section to specify additional rules.

# linter:
# rules:
# - camel_case_types

# analyzer:
# exclude:
# - path/to/excluded/files/**

# For more information about the core and recommended set of lints, see
# https://dart.dev/go/core-lints

# For additional information about configuring this file, see
# https://dart.dev/guides/language/analysis-options
1 change: 1 addition & 0 deletions dart-api-examples/streaming-asr/bin/init.dart
1 change: 1 addition & 0 deletions dart-api-examples/streaming-asr/bin/nemo-transducer.dart
92 changes: 92 additions & 0 deletions dart-api-examples/streaming-asr/bin/paraformer.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright (c) 2024 Xiaomi Corporation

// Streaming (online) speech recognition with a Paraformer model.
//
// Reads a wave file, feeds it to the recognizer in small chunks to
// simulate real-time streaming, and prints a partial result whenever
// it changes.
import 'dart:io';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('encoder', help: 'Path to the encoder model')
    ..addOption('decoder', help: 'Path to decoder model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  // All four options are required; bail out with the usage text otherwise.
  final res = parser.parse(arguments);
  if (res['encoder'] == null ||
      res['decoder'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  final encoder = res['encoder'] as String;
  final decoder = res['decoder'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  final paraformer = sherpa_onnx.OnlineParaformerModelConfig(
    encoder: encoder,
    decoder: decoder,
  );

  final modelConfig = sherpa_onnx.OnlineModelConfig(
    paraformer: paraformer,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  final config = sherpa_onnx.OnlineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OnlineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  final stream = recognizer.createStream();

  // Simulate streaming. You can choose an arbitrary chunk size;
  // a chunkSize of a single sample is also ok, i.e., chunkSize = 1.
  final chunkSize = 1600; // 0.1 second for 16kHz

  var last = '';
  // Feed the samples chunk by chunk. The final chunk may be shorter than
  // chunkSize; clamping `end` ensures the tail of the wave is not dropped
  // (the original `length ~/ chunkSize` loop discarded the remainder).
  final numSamples = waveData.samples.length;
  for (int start = 0; start < numSamples; start += chunkSize) {
    final end =
        (start + chunkSize < numSamples) ? start + chunkSize : numSamples;
    stream.acceptWaveform(
      samples: Float32List.sublistView(waveData.samples, start, end),
      sampleRate: waveData.sampleRate,
    );
    while (recognizer.isReady(stream)) {
      recognizer.decode(stream);
    }
    // Print only when the (non-empty) partial result changes.
    final result = recognizer.getResult(stream);
    if (result.text != last && result.text != '') {
      last = result.text;
      print(last);
    }
  }

  // Add 0.5 seconds of tail padding (assume sampleRate is 16kHz) so the
  // model can flush its internal buffers and emit the final tokens.
  final tailPaddings = Float32List(8000);
  stream.acceptWaveform(
    samples: tailPaddings,
    sampleRate: waveData.sampleRate,
  );

  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  final result = recognizer.getResult(stream);

  if (result.text != '') {
    print(result.text);
  }

  // Free native resources; Dart GC does not manage them.
  stream.free();
  recognizer.free();
}
94 changes: 94 additions & 0 deletions dart-api-examples/streaming-asr/bin/zipformer-ctc-hlg.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright (c) 2024 Xiaomi Corporation

// Streaming (online) speech recognition with a Zipformer2 CTC model
// decoded with an HLG graph.
//
// Reads a wave file, feeds it to the recognizer in small chunks to
// simulate real-time streaming, and prints a partial result whenever
// it changes.
import 'dart:io';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('model', help: 'Path to the model')
    ..addOption('hlg', help: 'Path to HLG.fst')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  // All four options are required; bail out with the usage text otherwise.
  final res = parser.parse(arguments);
  if (res['model'] == null ||
      res['hlg'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  final model = res['model'] as String;
  final hlg = res['hlg'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  final ctc = sherpa_onnx.OnlineZipformer2CtcModelConfig(
    model: model,
  );

  final modelConfig = sherpa_onnx.OnlineModelConfig(
    zipformer2Ctc: ctc,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  // The HLG graph drives FST-based CTC decoding.
  final config = sherpa_onnx.OnlineRecognizerConfig(
    model: modelConfig,
    ctcFstDecoderConfig: sherpa_onnx.OnlineCtcFstDecoderConfig(graph: hlg),
  );
  final recognizer = sherpa_onnx.OnlineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  final stream = recognizer.createStream();

  // Simulate streaming. You can choose an arbitrary chunk size;
  // a chunkSize of a single sample is also ok, i.e., chunkSize = 1.
  final chunkSize = 1600; // 0.1 second for 16kHz

  var last = '';
  // Feed the samples chunk by chunk. The final chunk may be shorter than
  // chunkSize; clamping `end` ensures the tail of the wave is not dropped
  // (the original `length ~/ chunkSize` loop discarded the remainder).
  final numSamples = waveData.samples.length;
  for (int start = 0; start < numSamples; start += chunkSize) {
    final end =
        (start + chunkSize < numSamples) ? start + chunkSize : numSamples;
    stream.acceptWaveform(
      samples: Float32List.sublistView(waveData.samples, start, end),
      sampleRate: waveData.sampleRate,
    );
    while (recognizer.isReady(stream)) {
      recognizer.decode(stream);
    }
    // Print only when the (non-empty) partial result changes.
    final result = recognizer.getResult(stream);
    if (result.text != last && result.text != '') {
      last = result.text;
      print(last);
    }
  }

  // Add 0.5 seconds of tail padding (assume sampleRate is 16kHz) so the
  // model can flush its internal buffers and emit the final tokens.
  final tailPaddings = Float32List(8000);
  stream.acceptWaveform(
    samples: tailPaddings,
    sampleRate: waveData.sampleRate,
  );

  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  final result = recognizer.getResult(stream);

  if (result.text != '') {
    print(result.text);
  }

  // Free native resources; Dart GC does not manage them.
  stream.free();
  recognizer.free();
}
Loading

0 comments on commit e307767

Please sign in to comment.