-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 20 changed files with 1,329 additions and 301 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
import os | ||
from functools import lru_cache | ||
from typing import Dict, Optional, Any | ||
|
||
from sisyphus import tk | ||
|
||
from i6_core.audio.encoding import BlissChangeEncodingJob | ||
|
||
from i6_core.meta import CorpusObject | ||
|
||
from ..tedlium2.constants import DURATIONS | ||
from .download import download_data_dict | ||
|
||
|
||
@lru_cache()
def get_bliss_corpus_dict(audio_format: str = "wav", output_prefix: str = "datasets") -> Dict[str, tk.Path]:
    """
    Build the mapping from corpus name to bliss xml corpus for the TedLiumV2 dataset.

    The starting point is the ``bliss_nist`` entry returned by ``download_data_dict``.
    For "sph" and "nist" that dictionary is returned unchanged. For every other format one
    ``BlissChangeEncodingJob`` is created per corpus and its ``out_corpus`` is returned instead.

    :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same.
    :param output_prefix: alias prefix, "Ted-Lium-2" is appended to it before it is used
    :return: dictionary mapping each corpus name to its bliss corpus file
    """
    assert audio_format in ("flac", "ogg", "wav", "sph", "nist")

    tedlium_prefix = os.path.join(output_prefix, "Ted-Lium-2")
    nist_corpora = download_data_dict(output_prefix=tedlium_prefix).bliss_nist

    # No conversion job is created for the sphere formats.
    if audio_format in ("sph", "nist"):
        return nist_corpora

    # Encoder settings handed to BlissChangeEncodingJob for each convertible format.
    encoding_settings = {
        "wav": {"output_format": "wav", "codec": "pcm_s16le"},
        "ogg": {"output_format": "ogg", "codec": "libvorbis"},
        "flac": {"output_format": "flac", "codec": "flac"},
    }

    converted_corpora = {}
    for corpus_name, nist_corpus in nist_corpora.items():
        conversion_job = BlissChangeEncodingJob(
            corpus_file=nist_corpus,
            sample_rate=16000,
            recover_duration=False,
            **encoding_settings[audio_format],
        )
        conversion_alias = os.path.join(tedlium_prefix, f"{audio_format}_conversion", corpus_name)
        conversion_job.add_alias(conversion_alias)
        converted_corpora[corpus_name] = conversion_job.out_corpus

    return converted_corpora
|
||
|
||
@lru_cache()
def get_corpus_object_dict(audio_format: str = "flac", output_prefix: str = "datasets") -> Dict[str, CorpusObject]:
    """
    Wrap every TedLiumV2 bliss corpus into a `meta.CorpusObject`.

    The bliss corpora come from ``get_bliss_corpus_dict``, the durations are looked up
    in ``DURATIONS`` by corpus name.

    :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same.
    :param output_prefix: passed on to ``get_bliss_corpus_dict``
    :return: dictionary mapping each corpus name to its corpus object
    """

    def _make_corpus_object(name: str, corpus_file: tk.Path) -> CorpusObject:
        # Fill the attributes of a fresh CorpusObject for a single corpus.
        obj = CorpusObject()
        obj.corpus_file = corpus_file
        obj.audio_format = audio_format
        obj.audio_dir = None
        obj.duration = DURATIONS[name]
        return obj

    bliss_corpora = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)
    return {name: _make_corpus_object(name, corpus_file) for name, corpus_file in bliss_corpora.items()}
|
||
|
||
@lru_cache()
def get_stm_dict(output_prefix: str = "datasets") -> Dict[str, tk.Path]:
    """
    Look up the STM files of the TedLiumV2 dataset.

    :param output_prefix: passed on unchanged to ``download_data_dict``
    :return: the ``stm`` entry of the data returned by ``download_data_dict``
    """
    # NOTE(review): get_bliss_corpus_dict appends "Ted-Lium-2" to the prefix before calling
    # download_data_dict, this function does not - confirm that the differing alias prefix is intended.
    tedlium_data = download_data_dict(output_prefix=output_prefix)
    return tedlium_data.stm
|
||
|
||
def get_ogg_zip_dict(
    subdir_prefix: str = "datasets",
    returnn_python_exe: Optional[tk.Path] = None,
    returnn_root: Optional[tk.Path] = None,
    bliss_to_ogg_job_rqmt: Optional[Dict[str, Any]] = None,
    extra_args: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Dict[str, tk.Path]:
    """
    Create one ``BlissToOggZipJob`` per corpus part and collect the resulting ogg_zip paths.
    The jobs are built from the "wav" bliss corpora. No outputs will be registered.

    :param subdir_prefix: dir name prefix for aliases and outputs
    :param returnn_python_exe: path to returnn python executable
    :param returnn_root: path to returnn root
    :param bliss_to_ogg_job_rqmt: rqmt for bliss to ogg job, only applied when non-empty
    :param extra_args: extra args for each dataset for bliss to ogg job, keyed by corpus name
    :return: dictionary with ogg zip paths for each corpus (train, dev, test)
    """
    from i6_core.returnn.oggzip import BlissToOggZipJob

    per_corpus_args = {} if extra_args is None else extra_args
    wav_corpora = get_bliss_corpus_dict(audio_format="wav", output_prefix=subdir_prefix)

    ogg_zip_dict = {}
    for name, wav_corpus in wav_corpora.items():
        job = BlissToOggZipJob(
            wav_corpus,
            no_conversion=False,  # cannot be used for corpus with multiple segments per recording
            returnn_python_exe=returnn_python_exe,
            returnn_root=returnn_root,
            **per_corpus_args.get(name, {}),
        )
        if bliss_to_ogg_job_rqmt:
            job.rqmt = bliss_to_ogg_job_rqmt
        job.add_alias(os.path.join(subdir_prefix, "Ted-Lium-2", f"{name}_ogg_zip_job"))
        ogg_zip_dict[name] = job.out_ogg_zip

    return ogg_zip_dict
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import os | ||
from dataclasses import dataclass | ||
from functools import lru_cache | ||
from typing import Any, Dict | ||
|
||
from sisyphus import tk | ||
|
||
from i6_core.datasets.tedlium2 import ( | ||
DownloadTEDLIUM2CorpusJob, | ||
CreateTEDLIUM2BlissCorpusJobV2, | ||
) | ||
|
||
|
||
@dataclass(frozen=True)
class TedLium2Data:
    """
    Immutable container for the TedLium2 data, filled by ``download_data_dict``.

    :param data_dir: ``out_corpus_folders`` of the download job
    :param lm_dir: ``out_lm_folder`` of the download job
    :param vocab: ``out_vocab_dict`` of the download job
    :param bliss_nist: ``out_corpus_files`` of the bliss corpus creation job
    :param stm: ``out_stm_files`` of the bliss corpus creation job
    """

    data_dir: Dict[str, tk.Path]
    lm_dir: tk.Path
    vocab: tk.Path
    bliss_nist: Dict[str, tk.Path]
    stm: Dict[str, tk.Path]
|
||
|
||
@lru_cache()
def download_data_dict(output_prefix: str = "datasets") -> TedLium2Data:
    """
    Set up the download job and the bliss corpus creation job for the TedLiumV2 dataset
    and bundle their outputs.
    Uses the fixed job CreateTEDLIUM2BlissCorpusJobV2 from: https://github.com/rwth-i6/i6_core/pull/490

    :param output_prefix: prefix for the aliases of both jobs
    :return: the outputs of both jobs as a ``TedLium2Data`` instance
    """
    raw_corpus_job = DownloadTEDLIUM2CorpusJob()
    raw_corpus_alias = os.path.join(output_prefix, "download", "raw_corpus_job")
    raw_corpus_job.add_alias(raw_corpus_alias)

    # The bliss creation job works on the corpus folders produced by the download job.
    bliss_job = CreateTEDLIUM2BlissCorpusJobV2(raw_corpus_job.out_corpus_folders)
    bliss_alias = os.path.join(output_prefix, "create_bliss", "bliss_corpus_job")
    bliss_job.add_alias(bliss_alias)

    return TedLium2Data(
        data_dir=raw_corpus_job.out_corpus_folders,
        lm_dir=raw_corpus_job.out_lm_folder,
        vocab=raw_corpus_job.out_vocab_dict,
        bliss_nist=bliss_job.out_corpus_files,
        stm=bliss_job.out_stm_files,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import os | ||
|
||
from sisyphus import tk | ||
|
||
from .corpus import get_bliss_corpus_dict, get_stm_dict | ||
from .lexicon import get_bliss_lexicon, get_g2p_augmented_bliss_lexicon | ||
from .textual_data import get_text_data_dict | ||
|
||
TEDLIUM_PREFIX = "Ted-Lium-2" | ||
|
||
|
||
def _export_datasets(output_prefix: str = "datasets"):
    """
    Register the bliss corpora of TedLiumV2 as outputs, once per available audio format.

    :param output_prefix: root of the output paths, also passed on to ``get_bliss_corpus_dict``
    :return:
    """
    corpus_dir = os.path.join(output_prefix, TEDLIUM_PREFIX, "corpus")

    for audio_format in ("flac", "ogg", "wav", "nist", "sph"):
        corpora = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)
        for name, bliss_corpus in corpora.items():
            target = os.path.join(corpus_dir, f"{name}-{audio_format}.xml.gz")
            tk.register_output(target, bliss_corpus)
|
||
|
||
def _export_stms(output_prefix: str = "datasets"):
    """
    Register every STM file of TedLiumV2 as an output.

    :param output_prefix: root of the output paths, also passed on to ``get_stm_dict``
    :return:
    """
    stm_dir = os.path.join(output_prefix, TEDLIUM_PREFIX, "stm")

    for name, stm_file in get_stm_dict(output_prefix=output_prefix).items():
        target = os.path.join(stm_dir, f"{name}.txt")
        tk.register_output(target, stm_file)
|
||
|
||
def _export_text_data(output_prefix: str = "datasets"):
    """
    Register every entry of the TedLiumV2 text data dictionary as an output.

    :param output_prefix: root of the output paths, also passed on to ``get_text_data_dict``
    :return:
    """
    text_data_dir = os.path.join(output_prefix, TEDLIUM_PREFIX, "text_data")

    for name, text_file in get_text_data_dict(output_prefix=output_prefix).items():
        target = os.path.join(text_data_dir, f"{name}.gz")
        tk.register_output(target, text_file)
|
||
|
||
def _export_lexicon(output_prefix: str = "datasets"):
    """
    Register the TedLiumV2 bliss lexicon and its G2P-augmented variant as outputs.

    :param output_prefix: root of the output paths, also passed on to the lexicon helpers
    :return:
    """
    lexicon_dir = os.path.join(output_prefix, TEDLIUM_PREFIX, "lexicon")

    plain_lexicon = get_bliss_lexicon(output_prefix=output_prefix)
    plain_target = os.path.join(lexicon_dir, "tedlium2.lexicon.xml.gz")
    tk.register_output(plain_target, plain_lexicon)

    g2p_lexicon = get_g2p_augmented_bliss_lexicon(
        add_unknown_phoneme_and_mapping=False,
        output_prefix=output_prefix,
    )
    g2p_target = os.path.join(lexicon_dir, "tedlium2.lexicon_with_g2p.xml.gz")
    tk.register_output(g2p_target, g2p_lexicon)
|
||
|
||
def export_all(output_prefix: str = "datasets"):
    """
    Run all TedLiumV2 export helpers: corpora, STMs, text data and lexicon, in that order.

    :param output_prefix: passed on to every export helper
    :return:
    """
    exporters = (_export_datasets, _export_stms, _export_text_data, _export_lexicon)
    for exporter in exporters:
        exporter(output_prefix=output_prefix)
Oops, something went wrong.