From ae0b2e071ed808b1d13c49258aa7566fa103335b Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Mon, 12 Aug 2024 08:10:38 +0000
Subject: [PATCH] add more examples

---
Review notes (this section is ignored by git am):
 - README.md: the descriptions of run-zipformer-ctc.sh and
   run-zipformer-ctc-hlg.sh were swapped; they now match the scripts.
 - zipformer_ctc.pas / zipformer_ctc_hlg.pas: exit early when the wave file
   cannot be loaded instead of dividing by a zero duration.
 - The blob ids on the "index" lines of the three files above are those of
   the original commit and were not regenerated after these amendments.

 .github/workflows/pascal.yaml                 |  9 ++
 pascal-api-examples/streaming-asr/.gitignore  |  2 +
 pascal-api-examples/streaming-asr/README.md   | 11 +++
 .../streaming-asr/zipformer_ctc.pas           | 96 ++++++++++++++++++
 .../streaming-asr/zipformer_ctc_hlg.pas       | 97 +++++++++++++++++++
 5 files changed, 215 insertions(+)
 create mode 100644 pascal-api-examples/streaming-asr/README.md
 create mode 100644 pascal-api-examples/streaming-asr/zipformer_ctc.pas
 create mode 100644 pascal-api-examples/streaming-asr/zipformer_ctc_hlg.pas

diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml
index c799daa56..9ddab6936 100644
--- a/.github/workflows/pascal.yaml
+++ b/.github/workflows/pascal.yaml
@@ -121,8 +121,17 @@ jobs:
           ./run-zipformer-transducer.sh
           rm -rf sherpa-onnx-*
           echo "---"
+
           ./run-paraformer.sh
           rm -rf sherpa-onnx-*
           echo "---"
+
+          ./run-zipformer-ctc.sh
+          echo "---"
+
+          ./run-zipformer-ctc-hlg.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
           ls -lh
           popd
diff --git a/pascal-api-examples/streaming-asr/.gitignore b/pascal-api-examples/streaming-asr/.gitignore
index 69c8eaa37..9bfca7ffb 100644
--- a/pascal-api-examples/streaming-asr/.gitignore
+++ b/pascal-api-examples/streaming-asr/.gitignore
@@ -1,2 +1,4 @@
 zipformer_transducer
 paraformer
+zipformer_ctc
+zipformer_ctc_hlg
diff --git a/pascal-api-examples/streaming-asr/README.md b/pascal-api-examples/streaming-asr/README.md
new file mode 100644
index 000000000..eb4e6ba20
--- /dev/null
+++ b/pascal-api-examples/streaming-asr/README.md
@@ -0,0 +1,11 @@
+# Introduction
+
+This folder contains examples about using sherpa-onnx's object pascal
+APIs with streaming models for speech recognition.
+
+|File|Description|
+|----|-----------|
+|./run-paraformer.sh|Use a streaming Paraformer model for speech recognition|
+|./run-zipformer-ctc-hlg.sh|Use a streaming Zipformer CTC model with HLG for speech recognition|
+|./run-zipformer-ctc.sh|Use a streaming Zipformer CTC model for speech recognition|
+|./run-zipformer-transducer.sh|Use a Zipformer transducer model for speech recognition|
diff --git a/pascal-api-examples/streaming-asr/zipformer_ctc.pas b/pascal-api-examples/streaming-asr/zipformer_ctc.pas
new file mode 100644
index 000000000..ff71853a3
--- /dev/null
+++ b/pascal-api-examples/streaming-asr/zipformer_ctc.pas
@@ -0,0 +1,96 @@
+{ Copyright (c) 2024 Xiaomi Corporation }
+
+{
+This file shows how to use a streaming Zipformer CTC model
+to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program zipformer_ctc;
+
+{$mode delphi}
+
+uses
+  sherpa_onnx,
+  DateUtils,
+  SysUtils;
+
+
+var
+  Config: TSherpaOnnxOnlineRecognizerConfig;
+  Recognizer: TSherpaOnnxOnlineRecognizer;
+  Stream: TSherpaOnnxOnlineStream;
+  RecognitionResult: TSherpaOnnxOnlineRecognizerResult;
+  Wave: TSherpaOnnxWave;
+  WaveFilename: AnsiString;
+  TailPaddings: array of Single;
+
+  Start: TDateTime;
+  Stop: TDateTime;
+
+  Elapsed: Single;
+  Duration: Single;
+  RealTimeFactor: Single;
+begin
+  Initialize(Config);
+
+  {Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  to download model files used in this file.}
+  Config.ModelConfig.Zipformer2Ctc.Model := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx';
+  Config.ModelConfig.Tokens := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := False;
+
+  WaveFilename := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav';
+
+  Wave := SherpaOnnxReadWave(WaveFilename);
+  {Stop early if the wave could not be loaded: with no samples the
+  duration is zero and the real-time factor below divides by zero.}
+  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
+  begin
+    WriteLn(Format('Failed to read %s', [WaveFilename]));
+    Halt(1);
+  end;
+
+  Recognizer := TSherpaOnnxOnlineRecognizer.Create(Config);
+
+  Start := Now;
+
+  Stream := Recognizer.CreateStream();
+
+
+  Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
+
+  SetLength(TailPaddings, Round(Wave.SampleRate * 0.5)); {0.5 seconds of padding}
+  Stream.AcceptWaveform(TailPaddings, Wave.SampleRate);
+
+  Stream.InputFinished();
+
+  while Recognizer.IsReady(Stream) do
+    Recognizer.Decode(Stream);
+
+  RecognitionResult := Recognizer.GetResult(Stream);
+
+  Stop := Now;
+
+  Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
+  Duration := Length(Wave.Samples) / Wave.SampleRate;
+  RealTimeFactor := Elapsed / Duration;
+
+  WriteLn(RecognitionResult.ToString);
+  WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
+  WriteLn(Format('Elapsed %.3f s', [Elapsed]));
+  WriteLn(Format('Wave duration %.3f s', [Duration]));
+  WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
+
+  {Free resources to avoid memory leak.
+
+  Note: You don't need to invoke them for this simple script.
+  However, you have to invoke them in your own large/complex project.
+  }
+  FreeAndNil(Stream);
+  FreeAndNil(Recognizer);
+end.
diff --git a/pascal-api-examples/streaming-asr/zipformer_ctc_hlg.pas b/pascal-api-examples/streaming-asr/zipformer_ctc_hlg.pas
new file mode 100644
index 000000000..f6c0215e1
--- /dev/null
+++ b/pascal-api-examples/streaming-asr/zipformer_ctc_hlg.pas
@@ -0,0 +1,97 @@
+{ Copyright (c) 2024 Xiaomi Corporation }
+
+{
+This file shows how to use a streaming Zipformer CTC model
+with HLG to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program zipformer_ctc_hlg;
+
+{$mode delphi}
+
+uses
+  sherpa_onnx,
+  DateUtils,
+  SysUtils;
+
+
+var
+  Config: TSherpaOnnxOnlineRecognizerConfig;
+  Recognizer: TSherpaOnnxOnlineRecognizer;
+  Stream: TSherpaOnnxOnlineStream;
+  RecognitionResult: TSherpaOnnxOnlineRecognizerResult;
+  Wave: TSherpaOnnxWave;
+  WaveFilename: AnsiString;
+  TailPaddings: array of Single;
+
+  Start: TDateTime;
+  Stop: TDateTime;
+
+  Elapsed: Single;
+  Duration: Single;
+  RealTimeFactor: Single;
+begin
+  Initialize(Config);
+
+  {Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  to download model files used in this file.}
+  Config.ModelConfig.Zipformer2Ctc.Model := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx';
+  Config.ModelConfig.Tokens := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := True;
+  Config.CtcFstDecoderConfig.Graph := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst';
+
+  WaveFilename := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav';
+
+  Wave := SherpaOnnxReadWave(WaveFilename);
+  {Stop early if the wave could not be loaded: with no samples the
+  duration is zero and the real-time factor below divides by zero.}
+  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
+  begin
+    WriteLn(Format('Failed to read %s', [WaveFilename]));
+    Halt(1);
+  end;
+
+  Recognizer := TSherpaOnnxOnlineRecognizer.Create(Config);
+
+  Start := Now;
+
+  Stream := Recognizer.CreateStream();
+
+
+  Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
+
+  SetLength(TailPaddings, Round(Wave.SampleRate * 0.5)); {0.5 seconds of padding}
+  Stream.AcceptWaveform(TailPaddings, Wave.SampleRate);
+
+  Stream.InputFinished();
+
+  while Recognizer.IsReady(Stream) do
+    Recognizer.Decode(Stream);
+
+  RecognitionResult := Recognizer.GetResult(Stream);
+
+  Stop := Now;
+
+  Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
+  Duration := Length(Wave.Samples) / Wave.SampleRate;
+  RealTimeFactor := Elapsed / Duration;
+
+  WriteLn(RecognitionResult.ToString);
+  WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
+  WriteLn(Format('Elapsed %.3f s', [Elapsed]));
+  WriteLn(Format('Wave duration %.3f s', [Duration]));
+  WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
+
+  {Free resources to avoid memory leak.
+
+  Note: You don't need to invoke them for this simple script.
+  However, you have to invoke them in your own large/complex project.
+  }
+  FreeAndNil(Stream);
+  FreeAndNil(Recognizer);
+end.