Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pascal API for speaker diarization #1420

Merged
merged 4 commits into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/pascal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,21 @@ jobs:
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
fi

# CI step: puts the FPC 3.2.2 win64 binaries first on PATH, runs the
# speaker-diarization example through its run.sh, then deletes the
# *.onnx / *.wav / sherpa-onnx-* files from that directory and lists what is left.
- name: Run Pascal test (Speaker diarization)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH

cd ./pascal-api-examples
pushd speaker-diarization

./run.sh
rm -rfv *.onnx *.wav sherpa-onnx-*
ls -lh
echo "---"

popd

- name: Run Pascal test (TTS)
shell: bash
run: |
Expand Down
1 change: 1 addition & 0 deletions pascal-api-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
|Directory| Description|
|---------|------------|
|[read-wav](./read-wav)|It shows how to read a wave file.|
|[speaker-diarization](./speaker-diarization)|It shows how to use Pascal API for speaker diarization.|
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|[vad](./vad)| It shows how to use the voice activity detection API.|
Expand Down
104 changes: 104 additions & 0 deletions pascal-api-examples/speaker-diarization/main.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use the Pascal API from sherpa-onnx
for speaker diarization.

Usage:

Step 1: Download a speaker segmentation model

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available models. The following is an example

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

Step 2: Download a speaker embedding extractor model

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
for a list of available models. The following is an example

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

Step 3: Download test wave files

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available test wave files. The following is an example

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

Step 4: Run it

See ./run.sh in this directory for how to compile and run this program.
}

program main;

{$mode delphi}

uses
sherpa_onnx,
ctypes,
SysUtils;

{
  Progress callback handed to TSherpaOnnxOfflineSpeakerDiarization.Process.

  Prints the percentage of processed chunks as 'Progress: xx.xxx%'.

  NumProcessedChunks: number of chunks processed so far.
  NumTotalChunks:     total number of chunks to process.

  Always returns 0.
  NOTE(review): the meaning of the return value is defined by the sherpa-onnx
  C API and is not visible here -- confirm before returning anything else.
}
function ProgressCallback(
  NumProcessedChunks: cint32;
  NumTotalChunks: cint32): cint32; cdecl;
var
  Progress: Single;
begin
  { This function is invoked from native code, so a floating-point
    exception raised by dividing by zero must never escape from it.
    Report 0% when the total is not positive. }
  if NumTotalChunks > 0 then
    Progress := 100.0 * NumProcessedChunks / NumTotalChunks
  else
    Progress := 0.0;

  WriteLn(Format('Progress: %.3f%%', [Progress]));

  Result := 0;
end;

{
  Program entry point.

  Reads ./0-four-speakers-zh.wav, builds a speaker diarization object from
  the segmentation and embedding models, runs diarization with a progress
  callback and prints one line per segment:

    <start> -- <stop> speaker_<id>

  On a configuration error or a sample rate mismatch, a message is printed
  and the process exits with status 1.
}
var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
  Sd: TSherpaOnnxOfflineSpeakerDiarization;
  Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  I: Integer;
begin
  Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav');

  Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx';
  Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';

  {
    Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we
    set NumClusters to 4 here.
    If you don't have such information, please set NumClusters to -1.
    In that case, you have to set Config.Clustering.Threshold.
    A larger threshold leads to fewer clusters, i.e., fewer speakers.
  }
  Config.Clustering.NumClusters := 4;
  Config.Segmentation.Debug := True;
  Config.Embedding.Debug := True;

  Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config);
  try
    { Every path below, including the early exits, goes through the
      finally section so that Sd is always released. }
    if Sd.GetHandle = nil then
    begin
      WriteLn('Please check your config');
      { Non-zero status so that run.sh (set -e) and CI notice the failure. }
      ExitCode := 1;
      Exit;
    end;

    if Sd.GetSampleRate <> Wave.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate]));
      ExitCode := 1;
      Exit;
    end;

    {
      // If you don't want to use a callback
      Segments := Sd.Process(Wave.Samples);
    }
    Segments := Sd.Process(Wave.Samples, @ProgressCallback);

    for I := Low(Segments) to High(Segments) do
    begin
      WriteLn(Format('%.3f -- %.3f speaker_%d',
        [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker]));
    end;
  finally
    FreeAndNil(Sd);
  end;
end.
49 changes: 49 additions & 0 deletions pascal-api-examples/speaker-diarization/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared C API library (only if it is not installed
# yet), compiles main.pas with fpc, downloads the models and the test wave
# file that main.pas expects, and finally runs the example.
#
# The script can be invoked from any directory: all paths are resolved
# relative to the location of this file.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$( cd -- "$SCRIPT_DIR/../.." && pwd )
BUILD_DIR="$SHERPA_ONNX_DIR/build"
INSTALL_LIB_DIR="$BUILD_DIR/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# main.pas, the downloaded files and the resulting ./main binary all live
# next to this script, and main.pas opens its inputs via ./ relative paths.
cd -- "$SCRIPT_DIR"

# Build and install the shared library unless one of the platform-specific
# library files is already present.
if [[ ! -f "$INSTALL_LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$INSTALL_LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$INSTALL_LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$INSTALL_LIB_DIR" \
  ./main.pas

export LD_LIBRARY_PATH="$INSTALL_LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$INSTALL_LIB_DIR:$DYLD_LIBRARY_PATH"

# curl is run with -f so that an HTTP error aborts the script instead of
# saving the error page under the model's file name, which would make the
# existence checks below succeed on the next run.
if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

./main
Loading
Loading