From 1e728a712718dcc7d4eca1fab33079cbb529acf4 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 15 Aug 2024 12:06:32 +0800 Subject: [PATCH 1/6] support reading wave files with 8-bit encoded samples --- sherpa-onnx/csrc/wave-reader.cc | 68 ++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/sherpa-onnx/csrc/wave-reader.cc b/sherpa-onnx/csrc/wave-reader.cc index 10bc2223f..6a4eaaa32 100644 --- a/sherpa-onnx/csrc/wave-reader.cc +++ b/sherpa-onnx/csrc/wave-reader.cc @@ -50,6 +50,10 @@ struct WaveHeader { }; static_assert(sizeof(WaveHeader) == 44); +/* +sox int16-1-channel-zh.wav -b 8 int8-1-channel-zh.wav + */ + // Read a wave file of mono-channel. // Return its samples normalized to the range [-1, 1). std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, @@ -161,8 +165,8 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, return {}; } - if (header.bits_per_sample != 16) { // we support only 16 bits per sample - SHERPA_ONNX_LOGE("Expected bits_per_sample 16. Given: %d\n", + if (header.bits_per_sample != 8 && header.bits_per_sample != 16) { + SHERPA_ONNX_LOGE("Expected bits_per_sample 8 or 16. Given: %d\n", header.bits_per_sample); *is_ok = false; return {}; @@ -199,20 +203,64 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, *sampling_rate = header.sample_rate; - // header.subchunk2_size contains the number of bytes in the data. - // As we assume each sample contains two bytes, so it is divided by 2 here - std::vector samples(header.subchunk2_size / 2); + std::vector ans; - is.read(reinterpret_cast(samples.data()), header.subchunk2_size); - if (!is) { + if (header.bits_per_sample == 16) { + // header.subchunk2_size contains the number of bytes in the data. + // As we assume each sample contains two bytes, so it is divided by 2 here + std::vector samples(header.subchunk2_size / 2); + + is.read(reinterpret_cast(samples.data()), header.subchunk2_size); + if (!is) { + SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size); + *is_ok = false; + return {}; + } + + ans.resize(samples.size()); + for (int32_t i = 0; i != static_cast(ans.size()); ++i) { + ans[i] = samples[i] / 32768.; + } + } else if (header.bits_per_sample == 8) { + // number of samples == number of bytes for 8-bit encoded samples + // + // For 8-bit encoded samples, they are unsigned! + std::vector samples(header.subchunk2_size); + + is.read(reinterpret_cast(samples.data()), header.subchunk2_size); + if (!is) { + SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size); + *is_ok = false; + return {}; + } + + ans.resize(samples.size()); + for (int32_t i = 0; i != static_cast(ans.size()); ++i) { + // Note(fangjun): We want to normalize each sample into the range [-1, 1] + // Since each original sample is in the range [0, 256], dividing + // them by 128 converts them to the range [0, 2]; + // so after subtracting 1, we get the range [-1, 1] + // + ans[i] = samples[i] / 128. - 1; + } + } else { + SHERPA_ONNX_LOGE( + "Unsupported %d bits per sample. Supported values are: 8, 16", + header.bits_per_sample); *is_ok = false; return {}; } - std::vector ans(samples.size()); - for (int32_t i = 0; i != static_cast(ans.size()); ++i) { - ans[i] = samples[i] / 32768.; + SHERPA_ONNX_LOGE("number of samples: %d", (int)ans.size()); + float mean = 0, sum = 0, max = -1000, min = 1000; + for (auto f : ans) { + sum += f; + max = (f > max) ? f : max; + min = (f < min) ? f : min; } + mean = sum / ans.size(); + SHERPA_ONNX_LOGE("sum: %.3f, mean: %.3f, n: %d, max: %.3f, min: %.3f\n", sum, + mean, (int)ans.size(), max, min); *is_ok = true; return ans; From cd319c46f96a108cbb64e2e342725df80086a3e4 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 15 Aug 2024 13:12:25 +0800 Subject: [PATCH 2/6] Support reading 32-bit wav files --- sherpa-onnx/csrc/wave-reader.cc | 57 ++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/sherpa-onnx/csrc/wave-reader.cc b/sherpa-onnx/csrc/wave-reader.cc index 6a4eaaa32..eb044504e 100644 --- a/sherpa-onnx/csrc/wave-reader.cc +++ b/sherpa-onnx/csrc/wave-reader.cc @@ -39,12 +39,18 @@ struct WaveHeader { int32_t format; int32_t subchunk1_id; int32_t subchunk1_size; + int16_t audio_format; int16_t num_channels; + int32_t sample_rate; + int32_t byte_rate; + int16_t block_align; + int16_t bits_per_sample; + int32_t subchunk2_id; // a tag of this chunk int32_t subchunk2_size; // size of subchunk2 }; @@ -118,9 +124,18 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, is.read(reinterpret_cast(&header.audio_format), sizeof(header.audio_format)); - if (header.audio_format != 1) { // 1 for PCM + if (header.audio_format != 1 && header.audio_format != 3) { + // 1 for integer PCM + // 3 for floating point PCM + // see https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + // and https://github.com/microsoft/DirectXTK/wiki/Wave-Formats SHERPA_ONNX_LOGE("Expected audio_format 1. Given: %d\n", header.audio_format); + + if (header.audio_format == static_cast(0xfffe)) { + SHERPA_ONNX_LOGE("We don't support WAVE_FORMAT_EXTENSIBLE files."); + } + *is_ok = false; return {}; } @@ -165,8 +180,9 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, return {}; } - if (header.bits_per_sample != 8 && header.bits_per_sample != 16) { - SHERPA_ONNX_LOGE("Expected bits_per_sample 8 or 16. Given: %d\n", + if (header.bits_per_sample != 8 && header.bits_per_sample != 16 && + header.bits_per_sample != 32) { + SHERPA_ONNX_LOGE("Expected bits_per_sample 8, 16 or 32. Given: %d\n", header.bits_per_sample); *is_ok = false; return {}; @@ -205,7 +221,7 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, std::vector ans; - if (header.bits_per_sample == 16) { + if (header.bits_per_sample == 16 && header.audio_format == 1) { // header.subchunk2_size contains the number of bytes in the data. // As we assume each sample contains two bytes, so it is divided by 2 here std::vector samples(header.subchunk2_size / 2); @@ -221,7 +237,7 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, for (int32_t i = 0; i != static_cast(ans.size()); ++i) { ans[i] = samples[i] / 32768.; } - } else if (header.bits_per_sample == 8) { + } else if (header.bits_per_sample == 8 && header.audio_format == 1) { // number of samples == number of bytes for 8-bit encoded samples // // For 8-bit encoded samples, they are unsigned! @@ -243,6 +259,37 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, // ans[i] = samples[i] / 128. - 1; } + } else if (header.bits_per_sample == 32 && header.audio_format == 1) { + // 32 here is for int32 + // + // header.subchunk2_size contains the number of bytes in the data. + // As we assume each sample contains 4 bytes, so it is divided by 4 here + std::vector samples(header.subchunk2_size / 4); + + is.read(reinterpret_cast(samples.data()), header.subchunk2_size); + if (!is) { + SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size); + *is_ok = false; + return {}; + } + + ans.resize(samples.size()); + for (int32_t i = 0; i != static_cast(ans.size()); ++i) { + ans[i] = static_cast(samples[i]) / (1 << 31); + } + } else if (header.bits_per_sample == 32 && header.audio_format == 3) { + // 32 here is for float32 + // + // header.subchunk2_size contains the number of bytes in the data. + // As we assume each sample contains 4 bytes, so it is divided by 4 here + ans.resize(header.subchunk2_size / 4); + + is.read(reinterpret_cast(ans.data()), header.subchunk2_size); + if (!is) { + SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size); + *is_ok = false; + return {}; + } } else { SHERPA_ONNX_LOGE( "Unsupported %d bits per sample. Supported values are: 8, 16", From 4121dbf38d4d97cc16c50d7ee9583ffe2ce6e002 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 15 Aug 2024 13:12:56 +0800 Subject: [PATCH 3/6] remove unused code --- sherpa-onnx/csrc/wave-reader.cc | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/sherpa-onnx/csrc/wave-reader.cc b/sherpa-onnx/csrc/wave-reader.cc index eb044504e..48399fb16 100644 --- a/sherpa-onnx/csrc/wave-reader.cc +++ b/sherpa-onnx/csrc/wave-reader.cc @@ -58,6 +58,9 @@ static_assert(sizeof(WaveHeader) == 44); /* sox int16-1-channel-zh.wav -b 8 int8-1-channel-zh.wav +we use audacity to generate int32-1-channel-zh.wav and float32-1-channel-zh.wav +because sox use WAVE_FORMAT_EXTENSIBLE, which is not easy to support +in sherpa-onnx. */ // Read a wave file of mono-channel. @@ -298,17 +301,6 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, return {}; } - SHERPA_ONNX_LOGE("number of samples: %d", (int)ans.size()); - float mean = 0, sum = 0, max = -1000, min = 1000; - for (auto f : ans) { - sum += f; - max = (f > max) ? f : max; - min = (f < min) ? f : min; - } - mean = sum / ans.size(); - SHERPA_ONNX_LOGE("sum: %.3f, mean: %.3f, n: %d, max: %.3f, min: %.3f\n", sum, - mean, (int)ans.size(), max, min); - *is_ok = true; return ans; } From 8612a2f2a18fc6c6dde3b211de4ba602850a24dc Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 15 Aug 2024 13:32:39 +0800 Subject: [PATCH 4/6] support reading multi-channel wave files --- sherpa-onnx/csrc/wave-reader.cc | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/sherpa-onnx/csrc/wave-reader.cc b/sherpa-onnx/csrc/wave-reader.cc index 48399fb16..87900e773 100644 --- a/sherpa-onnx/csrc/wave-reader.cc +++ b/sherpa-onnx/csrc/wave-reader.cc @@ -58,6 +58,9 @@ static_assert(sizeof(WaveHeader) == 44); /* sox int16-1-channel-zh.wav -b 8 int8-1-channel-zh.wav + +sox int16-1-channel-zh.wav -c 2 int16-2-channel-zh.wav + we use audacity to generate int32-1-channel-zh.wav and float32-1-channel-zh.wav because sox use WAVE_FORMAT_EXTENSIBLE, which is not easy to support in sherpa-onnx. @@ -147,10 +150,9 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, sizeof(header.num_channels)); if (header.num_channels != 1) { // we support only single channel for now - SHERPA_ONNX_LOGE("Expected single channel. Given: %d\n", - header.num_channels); - *is_ok = false; - return {}; + SHERPA_ONNX_LOGE( + "Warning: %d channels are found. We only use the first channel.\n", + header.num_channels); } is.read(reinterpret_cast(&header.sample_rate), @@ -228,6 +230,8 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, // header.subchunk2_size contains the number of bytes in the data. // As we assume each sample contains two bytes, so it is divided by 2 here std::vector samples(header.subchunk2_size / 2); + SHERPA_ONNX_LOGE("%d samples, bytes: %d", (int)samples.size(), + header.subchunk2_size); is.read(reinterpret_cast(samples.data()), header.subchunk2_size); if (!is) { @@ -236,9 +240,11 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, return {}; } - ans.resize(samples.size()); + ans.resize(samples.size() / header.num_channels); + + // samples are interleaved for (int32_t i = 0; i != static_cast(ans.size()); ++i) { - ans[i] = samples[i] / 32768.; + ans[i] = samples[i * header.num_channels] / 32768.; } } else if (header.bits_per_sample == 8 && header.audio_format == 1) { // number of samples == number of bytes for 8-bit encoded samples @@ -253,14 +259,14 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, return {}; } - ans.resize(samples.size()); + ans.resize(samples.size() / header.num_channels); for (int32_t i = 0; i != static_cast(ans.size()); ++i) { // Note(fangjun): We want to normalize each sample into the range [-1, 1] // Since each original sample is in the range [0, 256], dividing // them by 128 converts them to the range [0, 2]; // so after subtracting 1, we get the range [-1, 1] // - ans[i] = samples[i] / 128. - 1; + ans[i] = samples[i * header.num_channels] / 128. - 1; } } else if (header.bits_per_sample == 32 && header.audio_format == 1) { // 32 here is for int32 @@ -276,23 +282,28 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, return {}; } - ans.resize(samples.size()); + ans.resize(samples.size() / header.num_channels); for (int32_t i = 0; i != static_cast(ans.size()); ++i) { - ans[i] = static_cast(samples[i]) / (1 << 31); + ans[i] = static_cast(samples[i * header.num_channels]) / (1 << 31); } } else if (header.bits_per_sample == 32 && header.audio_format == 3) { // 32 here is for float32 // // header.subchunk2_size contains the number of bytes in the data. // As we assume each sample contains 4 bytes, so it is divided by 4 here - ans.resize(header.subchunk2_size / 4); + std::vector samples(header.subchunk2_size / 4); - is.read(reinterpret_cast(ans.data()), header.subchunk2_size); + is.read(reinterpret_cast(samples.data()), header.subchunk2_size); if (!is) { SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size); *is_ok = false; return {}; } + + ans.resize(samples.size() / header.num_channels); + for (int32_t i = 0; i != static_cast(ans.size()); ++i) { + ans[i] = samples[i * header.num_channels]; + } } else { SHERPA_ONNX_LOGE( "Unsupported %d bits per sample. Supported values are: 8, 16", From 506fe4cacdf8e9a14ba8180c604d505d1c84743a Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 15 Aug 2024 13:37:16 +0800 Subject: [PATCH 5/6] test reading multi-channel waves --- .github/scripts/test-offline-ctc.sh | 28 +++++++++++++++++++++------- .github/workflows/linux.yaml | 17 ++++++++--------- sherpa-onnx/csrc/wave-reader.cc | 13 ++++--------- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/.github/scripts/test-offline-ctc.sh b/.github/scripts/test-offline-ctc.sh index 05db35a20..57208e9da 100755 --- a/.github/scripts/test-offline-ctc.sh +++ b/.github/scripts/test-offline-ctc.sh @@ -38,14 +38,28 @@ done # test wav reader for non-standard wav files -curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/naudio.wav -curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/junk-padding.wav +waves=( + naudio.wav + junk-padding.wav + int8-1-channel-zh.wav + int8-2-channel-zh.wav + int8-4-channel-zh.wav + int16-1-channel-zh.wav + int16-2-channel-zh.wav + int32-1-channel-zh.wav + int32-2-channel-zh.wav + float32-1-channel-zh.wav + float32-2-channel-zh.wav +) +for w in ${waves[@]}; do + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$w -time $EXE \ - --tokens=$repo/tokens.txt \ - --sense-voice-model=$repo/model.int8.onnx \ - ./naudio.wav \ - ./junk-padding.wav + time $EXE \ + --tokens=$repo/tokens.txt \ + --sense-voice-model=$repo/model.int8.onnx \ + $w + rm -v $w +done rm -rf $repo diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 1f9832244..24e967d04 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -143,35 +143,34 @@ jobs: name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} path: install/* - - name: Test online punctuation + - name: Test offline CTC shell: bash run: | du -h -d1 . export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx-online-punctuation + export EXE=sherpa-onnx-offline - .github/scripts/test-online-punctuation.sh + .github/scripts/test-offline-ctc.sh du -h -d1 . - - name: Test offline transducer + - name: Test online punctuation shell: bash run: | du -h -d1 . export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx-offline + export EXE=sherpa-onnx-online-punctuation - .github/scripts/test-offline-transducer.sh + .github/scripts/test-online-punctuation.sh du -h -d1 . - - - name: Test offline CTC + - name: Test offline transducer shell: bash run: | du -h -d1 . export PATH=$PWD/build/bin:$PATH export EXE=sherpa-onnx-offline - .github/scripts/test-offline-ctc.sh + .github/scripts/test-offline-transducer.sh du -h -d1 . - name: Test online transducer diff --git a/sherpa-onnx/csrc/wave-reader.cc b/sherpa-onnx/csrc/wave-reader.cc index 87900e773..b1933bbf6 100644 --- a/sherpa-onnx/csrc/wave-reader.cc +++ b/sherpa-onnx/csrc/wave-reader.cc @@ -39,18 +39,12 @@ struct WaveHeader { int32_t format; int32_t subchunk1_id; int32_t subchunk1_size; - int16_t audio_format; int16_t num_channels; - int32_t sample_rate; - int32_t byte_rate; - int16_t block_align; - int16_t bits_per_sample; - int32_t subchunk2_id; // a tag of this chunk int32_t subchunk2_size; // size of subchunk2 }; @@ -62,7 +56,7 @@ sox int16-1-channel-zh.wav -b 8 int8-1-channel-zh.wav sox int16-1-channel-zh.wav -c 2 int16-2-channel-zh.wav we use audacity to generate int32-1-channel-zh.wav and float32-1-channel-zh.wav -because sox use WAVE_FORMAT_EXTENSIBLE, which is not easy to support +because sox uses WAVE_FORMAT_EXTENSIBLE, which is not easy to support in sherpa-onnx. */ @@ -306,8 +300,9 @@ std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, } } else { SHERPA_ONNX_LOGE( - "Unsupported %d bits per sample. Supported values are: 8, 16", - header.bits_per_sample); + "Unsupported %d bits per sample and audio format: %d. Supported values " + "are: 8, 16, 32.", + header.bits_per_sample, header.audio_format); *is_ok = false; return {}; } From ee2dd4750c88dc1d1af2b07e5edaeb5af0ff2949 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 15 Aug 2024 14:49:16 +0800 Subject: [PATCH 6/6] fix style issues --- sherpa-onnx/csrc/offline-tts-frontend.h | 1 + sherpa-onnx/jni/offline-recognizer.cc | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sherpa-onnx/csrc/offline-tts-frontend.h b/sherpa-onnx/csrc/offline-tts-frontend.h index 0216284f1..cba50e36f 100644 --- a/sherpa-onnx/csrc/offline-tts-frontend.h +++ b/sherpa-onnx/csrc/offline-tts-frontend.h @@ -6,6 +6,7 @@ #define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ #include #include +#include #include #include "sherpa-onnx/csrc/macros.h" diff --git a/sherpa-onnx/jni/offline-recognizer.cc b/sherpa-onnx/jni/offline-recognizer.cc index 3a7602dbe..8c1265bba 100644 --- a/sherpa-onnx/jni/offline-recognizer.cc +++ b/sherpa-onnx/jni/offline-recognizer.cc @@ -264,13 +264,9 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_newFromFile(JNIEnv *env, return (jlong)model; } - SHERPA_ONNX_EXTERN_C -JNIEXPORT void JNICALL -Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_setConfig(JNIEnv *env, - jobject /*obj*/, - jlong ptr, - jobject _config) { +JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_setConfig( + JNIEnv *env, jobject /*obj*/, jlong ptr, jobject _config) { auto config = sherpa_onnx::GetOfflineConfig(env, _config); SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); @@ -350,9 +346,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env, // [3]: lang, jstring // [4]: emotion, jstring // [5]: event, jstring - env->SetObjectArrayElement(obj_arr, 3, env->NewStringUTF(result.lang.c_str())); - env->SetObjectArrayElement(obj_arr, 4, env->NewStringUTF(result.emotion.c_str())); - env->SetObjectArrayElement(obj_arr, 5, env->NewStringUTF(result.event.c_str())); + env->SetObjectArrayElement(obj_arr, 3, + env->NewStringUTF(result.lang.c_str())); + env->SetObjectArrayElement(obj_arr, 4, + env->NewStringUTF(result.emotion.c_str())); + env->SetObjectArrayElement(obj_arr, 5, + env->NewStringUTF(result.event.c_str())); return obj_arr; }