diff --git a/.github/scripts/test-speaker-recognition-python.sh b/.github/scripts/test-speaker-recognition-python.sh index 7d6eff9ff..22b1367de 100755 --- a/.github/scripts/test-speaker-recognition-python.sh +++ b/.github/scripts/test-speaker-recognition-python.sh @@ -57,5 +57,19 @@ done ls -lh popd +log "Download NeMo models" +model_dir=$d/nemo +mkdir -p $model_dir +pushd $model_dir +models=( +nemo_en_titanet_large.onnx +nemo_en_titanet_small.onnx +nemo_en_speakerverification_speakernet.onnx +) +for m in ${models[@]}; do + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/$m +done +ls -lh +popd python3 sherpa-onnx/python/tests/test_speaker_recognition.py --verbose diff --git a/cmake/kaldi-native-fbank.cmake b/cmake/kaldi-native-fbank.cmake index 38751b67c..ea1c27d46 100644 --- a/cmake/kaldi-native-fbank.cmake +++ b/cmake/kaldi-native-fbank.cmake @@ -1,9 +1,9 @@ function(download_kaldi_native_fbank) include(FetchContent) - set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.18.5.tar.gz") - set(kaldi_native_fbank_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.18.5.tar.gz") - set(kaldi_native_fbank_HASH "SHA256=dce0cb3bc6fece5d8053d8780cb4ce22da57cb57ebec332641661521a0425283") + set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.18.6.tar.gz") + set(kaldi_native_fbank_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.18.6.tar.gz") + set(kaldi_native_fbank_HASH "SHA256=6202a00cd06ba8ff89beb7b6f85cda34e073e94f25fc29e37c519bff0706bf19") set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) # If you don't have access to the Internet, # please pre-download kaldi-native-fbank set(possible_file_locations - $ENV{HOME}/Downloads/kaldi-native-fbank-1.18.5.tar.gz - ${PROJECT_SOURCE_DIR}/kaldi-native-fbank-1.18.5.tar.gz - ${PROJECT_BINARY_DIR}/kaldi-native-fbank-1.18.5.tar.gz - /tmp/kaldi-native-fbank-1.18.5.tar.gz - /star-fj/fangjun/download/github/kaldi-native-fbank-1.18.5.tar.gz + $ENV{HOME}/Downloads/kaldi-native-fbank-1.18.6.tar.gz + ${PROJECT_SOURCE_DIR}/kaldi-native-fbank-1.18.6.tar.gz + ${PROJECT_BINARY_DIR}/kaldi-native-fbank-1.18.6.tar.gz + /tmp/kaldi-native-fbank-1.18.6.tar.gz + /star-fj/fangjun/download/github/kaldi-native-fbank-1.18.6.tar.gz ) foreach(f IN LISTS possible_file_locations) diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h b/sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h index eb87d9043..e819bd067 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #ifndef SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_GENERAL_IMPL_H_ #define SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_GENERAL_IMPL_H_ diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc index ea6a510cf..a9babec92 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #include "sherpa-onnx/csrc/speaker-embedding-extractor-impl.h" #include "sherpa-onnx/csrc/macros.h" diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.h b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.h index fa84b43e2..02362f89b 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.h @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor-impl.h // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #ifndef SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_IMPL_H_ #define SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_IMPL_H_ diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-model.cc b/sherpa-onnx/csrc/speaker-embedding-extractor-model.cc index fedfcab54..2c9930f8b 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-model.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-model.cc @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor-model.cc // -// Copyright (c) 2023-2024 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #include "sherpa-onnx/csrc/speaker-embedding-extractor-model.h" diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-model.h b/sherpa-onnx/csrc/speaker-embedding-extractor-model.h index 3fa94ef3f..d5f179678 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-model.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-model.h @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor-model.h // -// Copyright (c) 2023-2024 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #ifndef SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_MODEL_H_ #define SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_MODEL_H_ diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h index d07a3fb71..6678758c2 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #ifndef SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_NEMO_IMPL_H_ #define SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_NEMO_IMPL_H_ diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model-meta-data.h b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model-meta-data.h index 5b09dbf72..f0ff1f7ba 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model-meta-data.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model-meta-data.h @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model-meta-data.h // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #ifndef SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_NEMO_MODEL_META_DATA_H_ #define SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_NEMO_MODEL_META_DATA_H_ diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor.cc b/sherpa-onnx/csrc/speaker-embedding-extractor.cc index 7826e4fb6..f7d6c9b12 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor.cc @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor.cc // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #include "sherpa-onnx/csrc/speaker-embedding-extractor.h" diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor.h b/sherpa-onnx/csrc/speaker-embedding-extractor.h index cb23d40c0..2d536aa54 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor.h @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-extractor.h // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #ifndef SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_H_ #define SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_H_ diff --git a/sherpa-onnx/csrc/speaker-embedding-manager-test.cc b/sherpa-onnx/csrc/speaker-embedding-manager-test.cc index 0e1603c2b..6f115ca55 100644 --- a/sherpa-onnx/csrc/speaker-embedding-manager-test.cc +++ b/sherpa-onnx/csrc/speaker-embedding-manager-test.cc @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-manager-test.cc // -// Copyright (c) 2023 Jingzhao Ou (jingzhao.ou@gmail.com) +// Copyright (c) 2024 Jingzhao Ou (jingzhao.ou@gmail.com) #include "sherpa-onnx/csrc/speaker-embedding-manager.h" diff --git a/sherpa-onnx/csrc/speaker-embedding-manager.cc b/sherpa-onnx/csrc/speaker-embedding-manager.cc index 02894436d..dead72289 100644 --- a/sherpa-onnx/csrc/speaker-embedding-manager.cc +++ b/sherpa-onnx/csrc/speaker-embedding-manager.cc @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-manager.cc // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #include "sherpa-onnx/csrc/speaker-embedding-manager.h" diff --git a/sherpa-onnx/csrc/speaker-embedding-manager.h b/sherpa-onnx/csrc/speaker-embedding-manager.h index 25f85a930..66df665df 100644 --- a/sherpa-onnx/csrc/speaker-embedding-manager.h +++ b/sherpa-onnx/csrc/speaker-embedding-manager.h @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/speaker-embedding-manager.h // -// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2024 Xiaomi Corporation #ifndef SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_MANAGER_H_ #define SHERPA_ONNX_CSRC_SPEAKER_EMBEDDING_MANAGER_H_ diff --git a/sherpa-onnx/python/tests/test_speaker_recognition.py b/sherpa-onnx/python/tests/test_speaker_recognition.py index e05ae2a01..bd7c8edb6 100755 --- a/sherpa-onnx/python/tests/test_speaker_recognition.py +++ b/sherpa-onnx/python/tests/test_speaker_recognition.py @@ -56,7 +56,7 @@ def load_speaker_embedding_model(model_filename): return extractor -def test_wespeaker_model(model_filename: str): +def test_zh_models(model_filename: str): model_filename = str(model_filename) if "en" in model_filename: print(f"skip {model_filename}") @@ -114,8 +114,9 @@ def test_wespeaker_model(model_filename: str): assert ans == name, (name, ans) -def test_3dspeaker_model(model_filename: str): - extractor = load_speaker_embedding_model(str(model_filename)) +def test_en_and_zh_models(model_filename: str): + model_filename = str(model_filename) + extractor = load_speaker_embedding_model(model_filename) manager = sherpa_onnx.SpeakerEmbeddingManager(extractor.dim) filenames = [ @@ -124,7 +125,14 @@ def test_3dspeaker_model(model_filename: str): "speaker1_a_en_16k", "speaker2_a_en_16k", ] + is_en = "en" in model_filename for filename in filenames: + if is_en and "cn" in filename: + continue + + if not is_en and "en" in filename: + continue + name = filename.rsplit("_", maxsplit=1)[0] data, sample_rate = read_wave( f"/tmp/sr-models/sr-data/test/3d-speaker/{filename}.wav" @@ -145,6 +153,11 @@ def test_3dspeaker_model(model_filename: str): "speaker1_b_en_16k", ] for filename in filenames: + if is_en and "cn" in filename: + continue + + if not is_en and "en" in filename: + continue print(filename) name = filename.rsplit("_", maxsplit=1)[0] name = name.replace("b_cn", "a_cn") @@ -178,7 +191,8 @@ def test_wespeaker_models(self): return for filename in model_dir.glob("*.onnx"): print(filename) - test_wespeaker_model(filename) + test_zh_models(filename) + test_en_and_zh_models(filename) def test_3dpeaker_models(self): model_dir = Path(d) / "3dspeaker" @@ -187,7 +201,16 @@ def test_3dpeaker_models(self): return for filename in model_dir.glob("*.onnx"): print(filename) - test_3dspeaker_model(filename) + test_en_and_zh_models(filename) + + def test_nemo_models(self): + model_dir = Path(d) / "nemo" + if not model_dir.is_dir(): + print(f"{model_dir} does not exist - skip it") + return + for filename in model_dir.glob("*.onnx"): + print(filename) + test_en_and_zh_models(filename) if __name__ == "__main__":