-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Using the code for tedlium2_v2 with merged apostrophe #218
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from collections import defaultdict | ||
from typing import Dict | ||
|
||
from i6_experiments.common.datasets.tedlium2.constants import CONCURRENT | ||
from i6_experiments.common.datasets.tedlium2_v2.corpus import get_corpus_object_dict | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this file pushed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am waiting for the referenced PR to be merged. |
||
from i6_experiments.common.datasets.tedlium2_v2.lexicon import ( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this pushed? |
||
get_g2p_augmented_bliss_lexicon, | ||
) | ||
from i6_experiments.common.setups.rasr.util import RasrDataInput | ||
|
||
from i6_experiments.common.setups.rasr.config.lex_config import ( | ||
LexiconRasrConfig, | ||
) | ||
from i6_experiments.common.setups.rasr.config.lm_config import ArpaLmRasrConfig | ||
from i6_experiments.common.baselines.tedlium2_v2.lm.ngram_config import run_tedlium2_ngram_lm | ||
|
||
|
||
def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = False) -> Dict[str, Dict[str, RasrDataInput]]:
    """
    Assemble one ``RasrDataInput`` per corpus returned by ``get_corpus_object_dict``.

    Every corpus shares the same lexicon config (built from
    ``get_g2p_augmented_bliss_lexicon``). The ARPA LM config, taken from the
    ``"dev-pruned"`` / ``"4gram"`` entry of the interpolated LMs, is attached
    only to the ``"dev"`` and ``"test"`` corpora; all other corpora get ``lm=None``.

    :param add_unknown_phoneme_and_mapping: forwarded unchanged to both the
        lexicon creation and the n-gram LM pipeline
    :return: nested mapping ``result[name][name] -> RasrDataInput`` where both
        keys are the corpus name (the object returned is a ``defaultdict(dict)``)
    """
    corpora = get_corpus_object_dict(audio_format="wav", output_prefix="corpora")

    bliss_lexicon = get_g2p_augmented_bliss_lexicon(
        add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping,
        output_prefix="lexicon",
    )
    # NOTE(review): the meaning of the positional ``False`` is defined by
    # LexiconRasrConfig, which is not visible here -- confirm against its signature.
    lexicon_config = LexiconRasrConfig(bliss_lexicon, False)

    ngram_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping)
    selected_lm = ngram_system.interpolated_lms["dev-pruned"]["4gram"]
    lm_config = ArpaLmRasrConfig(lm_path=selected_lm.ngram_lm)

    # Only these corpora receive a language model.
    corpora_with_lm = ("dev", "test")

    data_inputs = defaultdict(dict)
    for corpus_name, corpus_object in corpora.items():
        data_inputs[corpus_name][corpus_name] = RasrDataInput(
            corpus_object=corpus_object,
            lexicon=lexicon_config.get_dict(),
            concurrent=CONCURRENT[corpus_name],
            lm=lm_config.get_dict() if corpus_name in corpora_with_lm else None,
        )

    return data_inputs
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,27 @@ | ||||||||||
""" | ||||||||||
List of default tools and software to be defined as default independent from hashing | ||||||||||
by setting one explicit hash. | ||||||||||
|
||||||||||
In order to use different software paths without hash changes, just use the same explicit hash string as given here. | ||||||||||
|
||||||||||
If you want a stronger guarantee that you get the intended results, please consider using the explicit software | ||||||||||
version listed here. Nevertheless, the most recent "head" should be safe to be used as well. | ||||||||||
""" | ||||||||||
from sisyphus import tk | ||||||||||
from i6_experiments.common.tools.audio import compile_ffmpeg_binary | ||||||||||
from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode | ||||||||||
from i6_experiments.common.tools.sctk import compile_sctk | ||||||||||
|
||||||||||
RASR_BINARY_PATH = compile_rasr_binaries_i6mode( | ||||||||||
branch="apptainer_tf_2_8", configure_options=["--apptainer-patch=2023-05-08_tensorflow-2.8_v1"] | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know this is not your change, but I don't like merging something that will not work, since the branch no longer exists. I think we can just remove this, replace it with None, and then let the assert do the work. |
||||||||||
) # use most recent RASR | ||||||||||
Comment on lines
+15
to
+17
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
# RASR_BINARY_PATH = tk.Path("/work/asr4/rossenbach/neon_test/rasr_versions/rasr_no_tf/arch/linux-x86_64-standard/") | ||||||||||
assert RASR_BINARY_PATH, "Please set a specific RASR_BINARY_PATH before running the pipeline" | ||||||||||
RASR_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" | ||||||||||
|
||||||||||
|
||||||||||
SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version | ||||||||||
SCTK_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SCTK_BINARY_PATH" | ||||||||||
|
||||||||||
SRILM_PATH = tk.Path("/work/tools/users/luescher/srilm-1.7.3/bin/i686-m64/") | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to RASR, I think this needs to be either changed to some u22 path or just asserted as False. |
||||||||||
SRILM_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SRILM_PATH" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While I know this does not change, if we do a "full copy", shouldn't we also copy this for completeness? @JackTemaki, opinions?