Skip to content

Commit

Permalink
skip tests that have no test data
Browse files Browse the repository at this point in the history
  • Loading branch information
pkufool committed Sep 14, 2023
1 parent 4ac7de7 commit 195fa8f
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 10 deletions.
1 change: 1 addition & 0 deletions python-api-examples/non_streaming_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ def check_args(args):

if args.hotwords_file != "":
assert args.decoding_method == "modified_beam_search", args.decoding_method
assert Path(args.hotwords_file).is_file(), args.hotwords_file


def get_args():
Expand Down
53 changes: 43 additions & 10 deletions sherpa-onnx/python/tests/test_text2token.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# ctest --verbose -R test_text2token_py

import unittest
from pathlib import Path

import sherpa_onnx

Expand All @@ -18,12 +19,23 @@

class TestText2Token(unittest.TestCase):
def test_bpe(self):
tokens = f"{d}/text2token/tokens_en.txt"
bpe_model = f"{d}/text2token/bpe_en.model"

if not Path(tokens).is_file() or not Path(bpe_model).is_file():
print(
f"No test data found, skipping test_bpe().\n"
f"You can download the test data by: \n"
f"git clone git@github.com:pkufool/sherpa-test-data.git /tmp/sherpa-test-data"
)
return

texts = ["HELLO WORLD", "I LOVE YOU"]
encoded_texts = sherpa_onnx.text2token(
texts,
tokens=f"{d}/text2token/tokens_en.txt",
tokens=tokens,
tokens_type="bpe",
bpe_model=f"{d}/text2token/bpe_en.model",
bpe_model=bpe_model,
)
assert encoded_texts == [
["▁HE", "LL", "O", "▁WORLD"],
Expand All @@ -32,25 +44,35 @@ def test_bpe(self):

encoded_ids = sherpa_onnx.text2token(
texts,
tokens=f"{d}/text2token/tokens_en.txt",
tokens=tokens,
tokens_type="bpe",
bpe_model=f"{d}/text2token/bpe_en.model",
bpe_model=bpe_model,
output_ids=True,
)
assert encoded_ids == [[22, 58, 24, 425], [19, 370, 47]], encoded_ids

def test_cjkchar(self):
tokens = f"{d}/text2token/tokens_cn.txt"

if not Path(tokens).is_file():
print(
f"No test data found, skipping test_cjkchar().\n"
f"You can download the test data by: \n"
f"git clone git@github.com:pkufool/sherpa-test-data.git /tmp/sherpa-test-data"
)
return

texts = ["世界人民大团结", "中国 VS 美国"]
encoded_texts = sherpa_onnx.text2token(
texts, tokens=f"{d}/text2token/tokens_cn.txt", tokens_type="cjkchar"
texts, tokens=tokens, tokens_type="cjkchar"
)
assert encoded_texts == [
["世", "界", "人", "民", "大", "团", "结"],
["中", "国", "V", "S", "美", "国"],
], encoded_texts
encoded_ids = sherpa_onnx.text2token(
texts,
tokens=f"{d}/text2token/tokens_cn.txt",
tokens=tokens,
tokens_type="cjkchar",
output_ids=True,
)
Expand All @@ -60,22 +82,33 @@ def test_cjkchar(self):
], encoded_ids

def test_cjkchar_bpe(self):
tokens = f"{d}/text2token/tokens_mix.txt"
bpe_model = f"{d}/text2token/bpe_mix.model"

if not Path(tokens).is_file() or not Path(bpe_model).is_file():
print(
f"No test data found, skipping test_cjkchar_bpe().\n"
f"You can download the test data by: \n"
f"git clone git@github.com:pkufool/sherpa-test-data.git /tmp/sherpa-test-data"
)
return

texts = ["世界人民 GOES TOGETHER", "中国 GOES WITH 美国"]
encoded_texts = sherpa_onnx.text2token(
texts,
tokens=f"{d}/text2token/tokens_mix.txt",
tokens=tokens,
tokens_type="cjkchar+bpe",
bpe_model=f"{d}/text2token/bpe_mix.model",
bpe_model=bpe_model,
)
assert encoded_texts == [
["世", "界", "人", "民", "▁GO", "ES", "▁TOGETHER"],
["中", "国", "▁GO", "ES", "▁WITH", "美", "国"],
], encoded_texts
encoded_ids = sherpa_onnx.text2token(
texts,
tokens=f"{d}/text2token/tokens_mix.txt",
tokens=tokens,
tokens_type="cjkchar+bpe",
bpe_model=f"{d}/text2token/bpe_mix.model",
bpe_model=bpe_model,
output_ids=True,
)
assert encoded_ids == [
Expand Down

0 comments on commit 195fa8f

Please sign in to comment.