From e65f6ed49842d240693703f8b84b563e0cf61f35 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 8 Jul 2024 23:33:41 +0200 Subject: [PATCH] more --- users/zeyer/datasets/librispeech.py | 2 +- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index db4e502b3..9b050877c 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -73,7 +73,7 @@ def _get_spm_vocab( dim_str = str(dim) if isinstance(dim, str): # Not sure if power-of-two or just multiple-of-64, but 10240 has more 2s in it (2048*5) than 10048. - dim = {"20k": 20_480, "10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024}[dim] + dim = {"20k": 20_480, "10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024, "512": 512, "128": 128}[dim] assert isinstance(dim, int) and dim >= 10 # https://github.com/google/sentencepiece/blob/master/doc/options.md diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index af4e70f01..d7e24df71 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -397,7 +397,14 @@ def py(): ("spm4k", "spm", 0.7), ("spm4k", "bpe", 0.01), ("spm1k", None, None), # 6.07 + ("spm1k", "spm", 0.7), ("spm1k", "bpe", 0.01), # 6.13 (but dev-clean,test-* are better than no sampling) + ("spm_bpe1k", None, None), + ("spm_bpe1k", "bpe", 0.01), + ("spm512", None, None), + ("spm512", "bpe", 0.01), + ("spm128", None, None), + ("spm128", "bpe", 0.01), ]: train_exp( f"v6-relPosAttDef-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100"