From ec98110e11881406e4cf677b6ab65a7215b8e70c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 31 Jul 2024 13:53:52 +0800 Subject: [PATCH] Add speaker identification and verification example for Dart API (#1194) --- .github/scripts/test-dart.sh | 5 + .github/workflows/test-dart.yaml | 1 + dart-api-examples/README.md | 7 +- .../speaker-identification/.gitignore | 3 + .../speaker-identification/README.md | 7 + .../analysis_options.yaml | 30 ++++ .../speaker-identification/bin/init.dart | 1 + .../bin/speaker_id.dart | 160 ++++++++++++++++++ .../speaker-identification/pubspec.yaml | 17 ++ .../speaker-identification/run-3d-speaker.sh | 19 +++ flutter/sherpa_onnx/example/example.md | 4 + java-api-examples/SpeakerIdentification.java | 2 +- scripts/dart/speaker-id-pubspec.yaml | 18 ++ 13 files changed, 270 insertions(+), 4 deletions(-) create mode 100644 dart-api-examples/speaker-identification/.gitignore create mode 100644 dart-api-examples/speaker-identification/README.md create mode 100644 dart-api-examples/speaker-identification/analysis_options.yaml create mode 120000 dart-api-examples/speaker-identification/bin/init.dart create mode 100644 dart-api-examples/speaker-identification/bin/speaker_id.dart create mode 100644 dart-api-examples/speaker-identification/pubspec.yaml create mode 100755 dart-api-examples/speaker-identification/run-3d-speaker.sh create mode 100644 scripts/dart/speaker-id-pubspec.yaml diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index bcdb86b67..2194467b3 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -4,6 +4,11 @@ set -ex cd dart-api-examples +pushd speaker-identification +echo '----------3d speaker----------' +./run-3d-speaker.sh +popd + pushd add-punctuations echo '----------CT Transformer----------' ./run-ct-transformer.sh diff --git a/.github/workflows/test-dart.yaml b/.github/workflows/test-dart.yaml index f280bfc84..1516f6325 100644 --- a/.github/workflows/test-dart.yaml +++
b/.github/workflows/test-dart.yaml @@ -112,6 +112,7 @@ jobs: cp scripts/dart/vad-non-streaming-asr-pubspec.yaml dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml cp scripts/dart/audio-tagging-pubspec.yaml dart-api-examples/audio-tagging/pubspec.yaml cp scripts/dart/add-punctuations-pubspec.yaml dart-api-examples/add-punctuations/pubspec.yaml + cp scripts/dart/speaker-id-pubspec.yaml dart-api-examples/speaker-identification/pubspec.yaml cp scripts/dart/sherpa-onnx-pubspec.yaml flutter/sherpa_onnx/pubspec.yaml diff --git a/dart-api-examples/README.md b/dart-api-examples/README.md index da49310e1..9370372e7 100644 --- a/dart-api-examples/README.md +++ b/dart-api-examples/README.md @@ -9,14 +9,15 @@ https://pub.dev/packages/sherpa_onnx | Directory | Description | |-----------|-------------| +| [./add-punctuations](./add-punctuations)| Example for adding punctuations to text.| +| [./audio-tagging](./audio-tagging)| Example for audio tagging.| | [./keyword-spotter](./keyword-spotter)| Example for keyword spotting| | [./non-streaming-asr](./non-streaming-asr)| Example for non-streaming speech recognition| +| [./speaker-identification](./speaker-identification)| Example for speaker identification and verification.| | [./streaming-asr](./streaming-asr)| Example for streaming speech recognition| | [./tts](./tts)| Example for text to speech| -| [./vad](./vad)| Example for voice activity detection| | [./vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| Example for voice activity detection with non-streaming speech recognition. 
You can use it to generate subtitles.| -| [./audio-tagging](./audio-tagging)| Example for audio tagging.| -| [./add-punctuations](./add-punctuations)| Example for adding punctuations to text.| +| [./vad](./vad)| Example for voice activity detection| ## How to create an example in this folder diff --git a/dart-api-examples/speaker-identification/.gitignore b/dart-api-examples/speaker-identification/.gitignore new file mode 100644 index 000000000..3a8579040 --- /dev/null +++ b/dart-api-examples/speaker-identification/.gitignore @@ -0,0 +1,3 @@ +# https://dart.dev/guides/libraries/private-files +# Created by `dart pub` +.dart_tool/ diff --git a/dart-api-examples/speaker-identification/README.md b/dart-api-examples/speaker-identification/README.md new file mode 100644 index 000000000..b25dd5099 --- /dev/null +++ b/dart-api-examples/speaker-identification/README.md @@ -0,0 +1,7 @@ +# Introduction + +This example shows how to use the Dart API from sherpa-onnx for speaker identification. + +| File | Description| +|------|------------| +|[./bin/speaker_id.dart](./bin/speaker_id.dart)| Use a speaker embedding extractor model for speaker identification and verification. See also [./run-3d-speaker.sh](./run-3d-speaker.sh)| diff --git a/dart-api-examples/speaker-identification/analysis_options.yaml b/dart-api-examples/speaker-identification/analysis_options.yaml new file mode 100644 index 000000000..dee8927aa --- /dev/null +++ b/dart-api-examples/speaker-identification/analysis_options.yaml @@ -0,0 +1,30 @@ +# This file configures the static analysis results for your project (errors, +# warnings, and lints). +# +# This enables the 'recommended' set of lints from `package:lints`. +# This set helps identify many issues that may lead to problems when running +# or consuming Dart code, and enforces writing Dart using a single, idiomatic +# style and format. +# +# If you want a smaller set of lints you can change this to specify +# 'package:lints/core.yaml'. 
These are just the most critical lints +# (the recommended set includes the core lints). +# The core lints are also what is used by pub.dev for scoring packages. + +include: package:lints/recommended.yaml + +# Uncomment the following section to specify additional rules. + +# linter: +# rules: +# - camel_case_types + +# analyzer: +# exclude: +# - path/to/excluded/files/** + +# For more information about the core and recommended set of lints, see +# https://dart.dev/go/core-lints + +# For additional information about configuring this file, see +# https://dart.dev/guides/language/analysis-options diff --git a/dart-api-examples/speaker-identification/bin/init.dart b/dart-api-examples/speaker-identification/bin/init.dart new file mode 120000 index 000000000..48508cfd3 --- /dev/null +++ b/dart-api-examples/speaker-identification/bin/init.dart @@ -0,0 +1 @@ +../../vad/bin/init.dart \ No newline at end of file diff --git a/dart-api-examples/speaker-identification/bin/speaker_id.dart b/dart-api-examples/speaker-identification/bin/speaker_id.dart new file mode 100644 index 000000000..3f1e54dda --- /dev/null +++ b/dart-api-examples/speaker-identification/bin/speaker_id.dart @@ -0,0 +1,160 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; +import './init.dart'; + +/// Reads the wave file [filename] and returns its speaker embedding computed +/// by [extractor]. The temporary stream is freed before returning. +Float32List computeEmbedding( + {required sherpa_onnx.SpeakerEmbeddingExtractor extractor, + required String filename}) { + final waveData = sherpa_onnx.readWave(filename); + final stream = extractor.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, + sampleRate: waveData.sampleRate, + ); + stream.inputFinished(); + + final embedding = extractor.compute(stream); + stream.free(); + + return embedding; +} + +/// Enrolls two speakers, then checks search, verify and remove on test files. +void main(List<String> arguments) async { + await initSherpaOnnx(); + final parser = ArgParser()..addOption('model', help: 'Path to model.onnx'); + + final
res = parser.parse(arguments); + if (res['model'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + /* + Please download test data by yourself + + curl -SL -o sr-data.tar.gz https://github.com/csukuangfj/sr-data/archive/refs/tags/v1.0.0.tar.gz + tar xvf sr-data.tar.gz + mv sr-data-1.0.0 sr-data + */ + + final config = sherpa_onnx.SpeakerEmbeddingExtractorConfig( + model: model, + numThreads: 1, + debug: true, + provider: 'cpu', + ); + final extractor = sherpa_onnx.SpeakerEmbeddingExtractor(config: config); + + final manager = sherpa_onnx.SpeakerEmbeddingManager(extractor.dim); + + final spk1Files = [ + "./sr-data/enroll/fangjun-sr-1.wav", + "./sr-data/enroll/fangjun-sr-2.wav", + "./sr-data/enroll/fangjun-sr-3.wav", + ]; + + final spk1Vec = <Float32List>[]; + for (final f in spk1Files) { + final embedding = computeEmbedding(extractor: extractor, filename: f); + spk1Vec.add(embedding); + } + + final spk2Files = [ + "./sr-data/enroll/leijun-sr-1.wav", + "./sr-data/enroll/leijun-sr-2.wav", + ]; + + final spk2Vec = <Float32List>[]; + for (final f in spk2Files) { + final embedding = computeEmbedding(extractor: extractor, filename: f); + spk2Vec.add(embedding); + } + + if (!manager.addMulti(name: "fangjun", embeddingList: spk1Vec)) { + // Note you should free extractor and manager in your app to avoid memory leak + print("Failed to register fangjun"); + return; + } + + if (!manager.addMulti(name: "leijun", embeddingList: spk2Vec)) { + print("Failed to register leijun"); + return; + } + + if (manager.numSpeakers != 2) { + print("There should be two speakers"); + return; + } + + if (!manager.contains("fangjun")) { + print("It should contain the speaker fangjun"); + return; + } + + if (!manager.contains("leijun")) { + print("It should contain the speaker leijun"); + return; + } + + print("---All speakers---"); + final allSpeakers = manager.allSpeakerNames; + for (final s in allSpeakers) { + print(s); + } + print("------------"); + + final testFiles = [ +
"./sr-data/test/fangjun-test-sr-1.wav", + "./sr-data/test/leijun-test-sr-1.wav", + "./sr-data/test/liudehua-test-sr-1.wav", + ]; + + final threshold = 0.6; + for (final file in testFiles) { + final embedding = computeEmbedding(extractor: extractor, filename: file); + + var name = manager.search(embedding: embedding, threshold: threshold); + if (name == '') { + name = "<Unknown>"; + } + print("$file: $name"); + } + + if (!manager.verify( + name: "fangjun", + embedding: computeEmbedding(extractor: extractor, filename: testFiles[0]), + threshold: threshold)) { + print("${testFiles[0]} should match fangjun!"); + return; + } + + if (!manager.remove("fangjun")) { + print("Failed to remove fangjun"); + return; + } + + if (manager.verify( + name: "fangjun", + embedding: computeEmbedding(extractor: extractor, filename: testFiles[0]), + threshold: threshold)) { + print("${testFiles[0]} should match no one!"); + return; + } + + if (manager.numSpeakers != 1) { + print("There should only 1 speaker left."); + return; + } + + extractor.free(); + manager.free(); +} diff --git a/dart-api-examples/speaker-identification/pubspec.yaml b/dart-api-examples/speaker-identification/pubspec.yaml new file mode 100644 index 000000000..fb1f3a489 --- /dev/null +++ b/dart-api-examples/speaker-identification/pubspec.yaml @@ -0,0 +1,17 @@ +name: speaker_identification + +description: > + This example demonstrates how to use the Dart API for speaker identification. + +version: 1.0.0 + +environment: + sdk: ^3.4.0 + +dependencies: + sherpa_onnx: ^1.10.20 + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/dart-api-examples/speaker-identification/run-3d-speaker.sh b/dart-api-examples/speaker-identification/run-3d-speaker.sh new file mode 100755 index 000000000..325d0998f --- /dev/null +++ b/dart-api-examples/speaker-identification/run-3d-speaker.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ !
-f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi + +if [ ! -f ./sr-data/enroll/leijun-sr-1.wav ]; then + curl -SL -o sr-data.tar.gz https://github.com/csukuangfj/sr-data/archive/refs/tags/v1.0.0.tar.gz + tar xvf sr-data.tar.gz + mv sr-data-1.0.0 sr-data +fi + +dart run \ + ./bin/speaker_id.dart \ + --model ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx diff --git a/flutter/sherpa_onnx/example/example.md b/flutter/sherpa_onnx/example/example.md index 02b0e22fb..7e7e8031d 100644 --- a/flutter/sherpa_onnx/example/example.md +++ b/flutter/sherpa_onnx/example/example.md @@ -15,3 +15,7 @@ |Non-Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/non-streaming-asr)| macOS, Windows, Linux| |Text to speech| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/tts)| macOS, Windows, Linux| |Voice activity detection (VAD)| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/vad)| macOS, Windows, Linux| +|Voice activity detection (VAD) with non-streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/vad-with-non-streaming-asr)| macOS, Windows, Linux| +|Speaker identification and verification| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/speaker-identification)| macOS, Windows, Linux| +|Audio tagging| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/audio-tagging)| macOS, Windows, Linux| +|Keyword spotter| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/keyword-spotter)| macOS, Windows, Linux| diff --git a/java-api-examples/SpeakerIdentification.java b/java-api-examples/SpeakerIdentification.java index 971dc2967..0bad7ed42 100644 --- 
a/java-api-examples/SpeakerIdentification.java +++ b/java-api-examples/SpeakerIdentification.java @@ -107,7 +107,7 @@ public static void main(String[] args) { // test verify if (!manager.verify("fangjun", computeEmbedding(extractor, testFiles[0]), threshold)) { - System.out.printf("testFiles[0] should match fangjun!"); + System.out.printf("%s should match fangjun!\n", testFiles[0]); return; } diff --git a/scripts/dart/speaker-id-pubspec.yaml b/scripts/dart/speaker-id-pubspec.yaml new file mode 100644 index 000000000..00338fb3d --- /dev/null +++ b/scripts/dart/speaker-id-pubspec.yaml @@ -0,0 +1,18 @@ +name: speaker_identification + +description: > + This example demonstrates how to use the Dart API for speaker identification. + +version: 1.0.0 + +environment: + sdk: ^3.4.0 + +dependencies: + sherpa_onnx: + path: ../../flutter/sherpa_onnx + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0