diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index db4e502b3..9b050877c 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -73,7 +73,7 @@ def _get_spm_vocab( dim_str = str(dim) if isinstance(dim, str): # Not sure if power-of-two or just multiple-of-64, but 10240 has more 2s in it (2048*5) than 10048. - dim = {"20k": 20_480, "10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024}[dim] + dim = {"20k": 20_480, "10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024, "512": 512, "128": 128}[dim] assert isinstance(dim, int) and dim >= 10 # https://github.com/google/sentencepiece/blob/master/doc/options.md diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index af4e70f01..d7e24df71 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -397,7 +397,14 @@ def py(): ("spm4k", "spm", 0.7), ("spm4k", "bpe", 0.01), ("spm1k", None, None), # 6.07 + ("spm1k", "spm", 0.7), ("spm1k", "bpe", 0.01), # 6.13 (but dev-clean,test-* are better than no sampling) + ("spm_bpe1k", None, None), + ("spm_bpe1k", "bpe", 0.01), + ("spm512", None, None), + ("spm512", "bpe", 0.01), + ("spm128", None, None), + ("spm128", "bpe", 0.01), ]: train_exp( f"v6-relPosAttDef-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100"