Skip to content

Commit

Permalink
Encode hotwords in C++ side (#828)
Browse files Browse the repository at this point in the history
* Encode hotwords in C++ side
  • Loading branch information
pkufool authored May 20, 2024
1 parent 8af2af8 commit b012b78
Show file tree
Hide file tree
Showing 43 changed files with 713 additions and 101 deletions.
2 changes: 2 additions & 0 deletions .github/scripts/test-offline-ctc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

echo "EXE is $EXE"
echo "PATH: $PATH"

Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/test-offline-transducer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

echo "EXE is $EXE"
echo "PATH: $PATH"

Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/test-offline-tts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

echo "EXE is $EXE"
echo "PATH: $PATH"

Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/test-offline-whisper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

echo "EXE is $EXE"
echo "PATH: $PATH"

Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/test-online-ctc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

echo "EXE is $EXE"
echo "PATH: $PATH"

Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/test-online-paraformer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

echo "EXE is $EXE"
echo "PATH: $PATH"

Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/test-online-transducer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

echo "EXE is $EXE"
echo "PATH: $PATH"

Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/test-python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

log "test online NeMo CTC"

url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms.tar.bz2
Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/test-spoken-language-identification.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export GIT_CLONE_PROTECTION_ACTIVE=false

echo "EXE is $EXE"
echo "PATH: $PATH"

Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ endif()
include(kaldi-native-fbank)
include(kaldi-decoder)
include(onnxruntime)
include(simple-sentencepiece)
set(ONNXRUNTIME_DIR ${onnxruntime_SOURCE_DIR})
message(STATUS "ONNXRUNTIME_DIR: ${ONNXRUNTIME_DIR}")

Expand Down
8 changes: 5 additions & 3 deletions build-ios-no-tts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ echo "Generate xcframework"

mkdir -p "build/simulator/lib"
for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \
libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a; do
libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a libssentencepiece_core.a; do
lipo -create build/simulator_arm64/lib/${f} \
build/simulator_x86_64/lib/${f} \
-output build/simulator/lib/${f}
Expand All @@ -140,15 +140,17 @@ libtool -static -o build/simulator/sherpa-onnx.a \
build/simulator/lib/libsherpa-onnx-core.a \
build/simulator/lib/libsherpa-onnx-fst.a \
build/simulator/lib/libsherpa-onnx-kaldifst-core.a \
build/simulator/lib/libkaldi-decoder-core.a
build/simulator/lib/libkaldi-decoder-core.a \
build/simulator/lib/libssentencepiece_core.a

libtool -static -o build/os64/sherpa-onnx.a \
build/os64/lib/libkaldi-native-fbank-core.a \
build/os64/lib/libsherpa-onnx-c-api.a \
build/os64/lib/libsherpa-onnx-core.a \
build/os64/lib/libsherpa-onnx-fst.a \
build/os64/lib/libsherpa-onnx-kaldifst-core.a \
build/os64/lib/libkaldi-decoder-core.a
build/os64/lib/libkaldi-decoder-core.a \
build/os64/lib/libssentencepiece_core.a

rm -rf sherpa-onnx.xcframework

Expand Down
4 changes: 3 additions & 1 deletion build-ios.sh
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ echo "Generate xcframework"

mkdir -p "build/simulator/lib"
for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \
libsherpa-onnx-fstfar.a \
libsherpa-onnx-fstfar.a libssentencepiece_core.a \
libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \
libucd.a libpiper_phonemize.a libespeak-ng.a; do
lipo -create build/simulator_arm64/lib/${f} \
Expand All @@ -150,6 +150,7 @@ libtool -static -o build/simulator/sherpa-onnx.a \
build/simulator/lib/libucd.a \
build/simulator/lib/libpiper_phonemize.a \
build/simulator/lib/libespeak-ng.a \
build/simulator/lib/libssentencepiece_core.a

libtool -static -o build/os64/sherpa-onnx.a \
build/os64/lib/libkaldi-native-fbank-core.a \
Expand All @@ -162,6 +163,7 @@ libtool -static -o build/os64/sherpa-onnx.a \
build/os64/lib/libucd.a \
build/os64/lib/libpiper_phonemize.a \
build/os64/lib/libespeak-ng.a \
build/os64/lib/libssentencepiece_core.a


rm -rf sherpa-onnx.xcframework
Expand Down
3 changes: 2 additions & 1 deletion build-swift-macos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,5 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \
./install/lib/libkaldi-decoder-core.a \
./install/lib/libucd.a \
./install/lib/libpiper_phonemize.a \
./install/lib/libespeak-ng.a
./install/lib/libespeak-ng.a \
./install/lib/libssentencepiece_core.a
63 changes: 63 additions & 0 deletions cmake/simple-sentencepiece.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Fetch simple-sentencepiece v0.7, add it to the build as a subdirectory, and
# register install rules for its `ssentencepiece_core` target.
#
# A local copy of the release tarball is preferred over the network when one
# is found in any of the well-known locations listed below; otherwise the
# archive is downloaded from GitHub, with a mirror as a second URL. In both
# cases the archive is verified against the pinned SHA256.
function(download_simple_sentencepiece)
  include(FetchContent)

  # Force these subproject cache options off before its CMakeLists.txt is
  # processed (presumably they gate the upstream unit tests and Python
  # bindings -- confirm against simple-sentencepiece's own CMakeLists.txt).
  set(SBPE_ENABLE_TESTS OFF CACHE BOOL "" FORCE)
  set(SBPE_BUILD_PYTHON OFF CACHE BOOL "" FORCE)

  set(simple-sentencepiece_URL "https://github.com/pkufool/simple-sentencepiece/archive/refs/tags/v0.7.tar.gz")
  set(simple-sentencepiece_URL2 "https://hub.nauu.cf/pkufool/simple-sentencepiece/archive/refs/tags/v0.7.tar.gz")
  set(simple-sentencepiece_HASH "SHA256=1748a822060a35baa9f6609f84efc8eb54dc0e74b9ece3d82367b7119fdc75af")

  # Offline builds: drop simple-sentencepiece-0.7.tar.gz into one of these
  # places and it will be used instead of the remote URLs.
  set(local_archive_candidates
    $ENV{HOME}/Downloads/simple-sentencepiece-0.7.tar.gz
    ${CMAKE_SOURCE_DIR}/simple-sentencepiece-0.7.tar.gz
    ${CMAKE_BINARY_DIR}/simple-sentencepiece-0.7.tar.gz
    /tmp/simple-sentencepiece-0.7.tar.gz
    /star-fj/fangjun/download/github/simple-sentencepiece-0.7.tar.gz
  )

  # The first existing candidate wins.
  foreach(candidate IN LISTS local_archive_candidates)
    if(EXISTS "${candidate}")
      # Normalise to a CMake-style path so it can be passed as a URL below.
      file(TO_CMAKE_PATH "${candidate}" simple-sentencepiece_URL)
      message(STATUS "Found local downloaded simple-sentencepiece: ${simple-sentencepiece_URL}")
      # A local archive needs no fallback mirror.
      unset(simple-sentencepiece_URL2)
      break()
    endif()
  endforeach()

  FetchContent_Declare(simple-sentencepiece
    URL      ${simple-sentencepiece_URL} ${simple-sentencepiece_URL2}
    URL_HASH ${simple-sentencepiece_HASH}
  )

  # Populate only once, even if this module ends up being included again.
  FetchContent_GetProperties(simple-sentencepiece)
  if(NOT simple-sentencepiece_POPULATED)
    message(STATUS "Downloading simple-sentencepiece ${simple-sentencepiece_URL}")
    FetchContent_Populate(simple-sentencepiece)
  endif()
  message(STATUS "simple-sentencepiece is downloaded to ${simple-sentencepiece_SOURCE_DIR}")

  # EXCLUDE_FROM_ALL: the subproject's targets are built only when something
  # in this project depends on them.
  add_subdirectory(
    ${simple-sentencepiece_SOURCE_DIR}
    ${simple-sentencepiece_BINARY_DIR}
    EXCLUDE_FROM_ALL
  )

  # Expose the source root as a PUBLIC include directory of the library so
  # that targets linking against it inherit the include path.
  target_include_directories(ssentencepiece_core PUBLIC ${simple-sentencepiece_SOURCE_DIR}/)

  # Python builds on Windows install the library one level above the install
  # prefix (NOTE(review): presumably so it sits next to the extension module
  # -- confirm); every other configuration installs into lib/.
  if(SHERPA_ONNX_ENABLE_PYTHON AND WIN32)
    set(ssentencepiece_install_dir ..)
  else()
    set(ssentencepiece_install_dir lib)
  endif()
  install(TARGETS ssentencepiece_core DESTINATION ${ssentencepiece_install_dir})

  # Shared-library builds on Windows additionally install the target into bin/.
  if(WIN32 AND BUILD_SHARED_LIBS)
    install(TARGETS ssentencepiece_core DESTINATION bin)
  endif()
endfunction()

download_simple_sentencepiece()
2 changes: 1 addition & 1 deletion kotlin-api-examples/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ function testSpeakerEmbeddingExtractor() {
function testOnlineAsr() {
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
git lfs install
git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
GIT_CLONE_PROTECTION_ACTIVE=false git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
fi

if [ ! -f ./sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms/tokens.txt ]; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
piper_phonemize.lib;
espeak-ng.lib;
ucd.lib;
ssentencepiece_core.lib;
</SherpaOnnxLibraries>
</PropertyGroup>
<ItemDefinitionGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
piper_phonemize.lib;
espeak-ng.lib;
ucd.lib;
ssentencepiece_core.lib;
</SherpaOnnxLibraries>
</PropertyGroup>
<ItemDefinitionGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
piper_phonemize.lib;
espeak-ng.lib;
ucd.lib;
ssentencepiece_core.lib;
</SherpaOnnxLibraries>
</PropertyGroup>
<ItemDefinitionGroup>
Expand Down
32 changes: 27 additions & 5 deletions python-api-examples/offline-decode-files.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,9 @@ def get_args():
type=str,
default="",
help="""
The file containing hotwords, one words/phrases per line, and for each
phrase the bpe/cjkchar are separated by a space. For example:
▁HE LL O ▁WORLD
你 好 世 界
The file containing hotwords, one words/phrases per line, like
HELLO WORLD
你好世界
""",
)

Expand All @@ -128,6 +126,28 @@ def get_args():
""",
)

parser.add_argument(
"--modeling-unit",
type=str,
default="",
help="""
The modeling unit of the model, valid values are cjkchar, bpe, cjkchar+bpe.
Used only when hotwords-file is given.
""",
)

parser.add_argument(
"--bpe-vocab",
type=str,
default="",
help="""
The path to the bpe vocabulary, the bpe vocabulary is generated by
sentencepiece, you can also export the bpe vocabulary through a bpe model
by `scripts/export_bpe_vocab.py`. Used only when hotwords-file is given
and modeling-unit is bpe or cjkchar+bpe.
""",
)

parser.add_argument(
"--encoder",
default="",
Expand Down Expand Up @@ -347,6 +367,8 @@ def main():
decoding_method=args.decoding_method,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
modeling_unit=args.modeling_unit,
bpe_vocab=args.bpe_vocab,
blank_penalty=args.blank_penalty,
debug=args.debug,
)
Expand Down
32 changes: 27 additions & 5 deletions python-api-examples/online-decode-files.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,9 @@ def get_args():
type=str,
default="",
help="""
The file containing hotwords, one words/phrases per line, and for each
phrase the bpe/cjkchar are separated by a space. For example:
▁HE LL O ▁WORLD
你 好 世 界
The file containing hotwords, one words/phrases per line, like
HELLO WORLD
你好世界
""",
)

Expand All @@ -216,6 +214,28 @@ def get_args():
""",
)

parser.add_argument(
"--modeling-unit",
type=str,
default="",
help="""
The modeling unit of the model, valid values are cjkchar, bpe, cjkchar+bpe.
Used only when hotwords-file is given.
""",
)

parser.add_argument(
"--bpe-vocab",
type=str,
default="",
help="""
The path to the bpe vocabulary, the bpe vocabulary is generated by
sentencepiece, you can also export the bpe vocabulary through a bpe model
by `scripts/export_bpe_vocab.py`. Used only when hotwords-file is given
and modeling-unit is bpe or cjkchar+bpe.
""",
)

parser.add_argument(
"--blank-penalty",
type=float,
Expand Down Expand Up @@ -302,6 +322,8 @@ def main():
lm_scale=args.lm_scale,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
modeling_unit=args.modeling_unit,
bpe_vocab=args.bpe_vocab,
blank_penalty=args.blank_penalty,
)
elif args.zipformer2_ctc:
Expand Down
Loading

0 comments on commit b012b78

Please sign in to comment.