Skip to content

Commit

Permalink
adding latest from main
Browse files Browse the repository at this point in the history
  • Loading branch information
Marvin84 committed Jun 4, 2024
1 parent 701e21d commit cf5884a
Show file tree
Hide file tree
Showing 20 changed files with 1,329 additions and 301 deletions.
136 changes: 136 additions & 0 deletions common/datasets/tedlium2_v2/corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import os
from functools import lru_cache
from typing import Dict, Optional, Any

from sisyphus import tk

from i6_core.audio.encoding import BlissChangeEncodingJob

from i6_core.meta import CorpusObject

from ..tedlium2.constants import DURATIONS
from .download import download_data_dict


@lru_cache()
def get_bliss_corpus_dict(audio_format: str = "wav", output_prefix: str = "datasets") -> Dict[str, tk.Path]:
    """
    Build the mapping from corpus name to bliss xml corpus file for the TedLiumV2 dataset.

    The corpora are taken from the ``bliss_nist`` entry of :func:`download_data_dict`. For "sph" / "nist"
    they are returned as they are; for every other format one ``BlissChangeEncodingJob`` per corpus is
    created and its ``out_corpus`` is returned instead.

    :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same.
    :param output_prefix: prefix for the job aliases, "Ted-Lium-2" is appended to it
    :return: dictionary from corpus name to the bliss corpus file in the requested audio format
    """
    assert audio_format in ["flac", "ogg", "wav", "sph", "nist"]

    output_prefix = os.path.join(output_prefix, "Ted-Lium-2")
    nist_corpora = download_data_dict(output_prefix=output_prefix).bliss_nist

    # "sph" and "nist" need no re-encoding, the downloaded corpora are handed out directly
    if audio_format in ["sph", "nist"]:
        return nist_corpora

    # encoder settings passed on to BlissChangeEncodingJob for the requested target format
    encoding_kwargs = {
        "wav": {"output_format": "wav", "codec": "pcm_s16le"},
        "ogg": {"output_format": "ogg", "codec": "libvorbis"},
        "flac": {"output_format": "flac", "codec": "flac"},
    }[audio_format]
    alias_dir = os.path.join(output_prefix, "%s_conversion" % audio_format)

    recoded_corpora = {}
    for corpus_name, sph_corpus in nist_corpora.items():
        recode_job = BlissChangeEncodingJob(
            corpus_file=sph_corpus,
            sample_rate=16000,
            recover_duration=False,
            **encoding_kwargs,
        )
        recode_job.add_alias(os.path.join(alias_dir, corpus_name))
        recoded_corpora[corpus_name] = recode_job.out_corpus

    return recoded_corpora


@lru_cache()
def get_corpus_object_dict(audio_format: str = "flac", output_prefix: str = "datasets") -> Dict[str, CorpusObject]:
    """
    Wrap every corpus of the TedLiumV2 dataset into a `meta.CorpusObject`.

    :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same.
    :param output_prefix: forwarded to :func:`get_bliss_corpus_dict`
    :return: dictionary from corpus name to the corresponding corpus object
    """

    def _make_corpus_object(name: str, corpus_file: tk.Path) -> CorpusObject:
        # fill one CorpusObject; the duration is looked up by corpus name in DURATIONS
        obj = CorpusObject()
        obj.corpus_file = corpus_file
        obj.audio_format = audio_format
        obj.audio_dir = None
        obj.duration = DURATIONS[name]
        return obj

    corpora = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)
    return {name: _make_corpus_object(name, corpus_file) for name, corpus_file in corpora.items()}


@lru_cache()
def get_stm_dict(output_prefix: str = "datasets") -> Dict[str, tk.Path]:
    """
    Return the STM files of the TedLiumV2 dataset.

    :param output_prefix: forwarded to :func:`download_data_dict`
    :return: the ``stm`` dictionary of the downloaded TedLium2 data
    """
    tedlium2_data = download_data_dict(output_prefix=output_prefix)
    return tedlium2_data.stm


def get_ogg_zip_dict(
    subdir_prefix: str = "datasets",
    returnn_python_exe: Optional[tk.Path] = None,
    returnn_root: Optional[tk.Path] = None,
    bliss_to_ogg_job_rqmt: Optional[Dict[str, Any]] = None,
    extra_args: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Dict[str, tk.Path]:
    """
    Create one ``BlissToOggZipJob`` per corpus part, based on the wav bliss corpora, and collect the
    resulting ogg zip paths.
    No outputs will be registered.

    :param subdir_prefix: dir name prefix for aliases and outputs
    :param returnn_python_exe: path to returnn python executable
    :param returnn_root: path to the returnn root
    :param bliss_to_ogg_job_rqmt: rqmt for bliss to ogg job, replaces the job's ``rqmt`` if given and non-empty
    :param extra_args: extra args for each dataset for bliss to ogg job, keyed by corpus name
    :return: dictionary with ogg zip paths for each corpus (train, dev, test)
    """
    from i6_core.returnn.oggzip import BlissToOggZipJob

    per_corpus_args = {} if extra_args is None else extra_args
    wav_corpora = get_bliss_corpus_dict(audio_format="wav", output_prefix=subdir_prefix)
    alias_dir = os.path.join(subdir_prefix, "Ted-Lium-2")

    ogg_zips = {}
    for corpus_name, wav_corpus in wav_corpora.items():
        zip_job = BlissToOggZipJob(
            wav_corpus,
            no_conversion=False,  # cannot be used for corpus with multiple segments per recording
            returnn_python_exe=returnn_python_exe,
            returnn_root=returnn_root,
            **per_corpus_args.get(corpus_name, {}),
        )
        if bliss_to_ogg_job_rqmt:
            zip_job.rqmt = bliss_to_ogg_job_rqmt
        zip_job.add_alias(os.path.join(alias_dir, "%s_ogg_zip_job" % corpus_name))
        ogg_zips[corpus_name] = zip_job.out_ogg_zip

    return ogg_zips
48 changes: 48 additions & 0 deletions common/datasets/tedlium2_v2/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
from dataclasses import dataclass
from functools import lru_cache
from typing import Any, Dict

from sisyphus import tk

from i6_core.datasets.tedlium2 import (
DownloadTEDLIUM2CorpusJob,
CreateTEDLIUM2BlissCorpusJobV2,
)


@dataclass(frozen=True)
class TedLium2Data:
    """
    Immutable container bundling the paths produced by the TedLium2 download and bliss creation jobs.
    """

    # corpus folders, filled from DownloadTEDLIUM2CorpusJob.out_corpus_folders
    data_dir: Dict[str, tk.Path]
    # language model folder, filled from DownloadTEDLIUM2CorpusJob.out_lm_folder
    lm_dir: tk.Path
    # vocabulary, filled from DownloadTEDLIUM2CorpusJob.out_vocab_dict
    vocab: tk.Path
    # bliss corpus files, filled from CreateTEDLIUM2BlissCorpusJobV2.out_corpus_files
    bliss_nist: Dict[str, tk.Path]
    # STM files, filled from CreateTEDLIUM2BlissCorpusJobV2.out_stm_files
    stm: Dict[str, tk.Path]


@lru_cache()
def download_data_dict(output_prefix: str = "datasets") -> TedLium2Data:
    """
    Set up the download of the TedLiumV2 dataset and the creation of its bliss corpora.
    Uses the fixed job CreateTEDLIUM2BlissCorpusJobV2 from: https://github.com/rwth-i6/i6_core/pull/490

    :param output_prefix: prefix for the aliases of the two jobs
    :return: the outputs of the download job and the bliss creation job bundled as `TedLium2Data`
    """
    download_job = DownloadTEDLIUM2CorpusJob()
    download_job.add_alias(os.path.join(output_prefix, "download", "raw_corpus_job"))

    # the bliss corpora (and STM files) are derived from the downloaded corpus folders
    bliss_job = CreateTEDLIUM2BlissCorpusJobV2(download_job.out_corpus_folders)
    bliss_job.add_alias(os.path.join(output_prefix, "create_bliss", "bliss_corpus_job"))

    return TedLium2Data(
        data_dir=download_job.out_corpus_folders,
        lm_dir=download_job.out_lm_folder,
        vocab=download_job.out_vocab_dict,
        bliss_nist=bliss_job.out_corpus_files,
        stm=bliss_job.out_stm_files,
    )
96 changes: 96 additions & 0 deletions common/datasets/tedlium2_v2/export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os

from sisyphus import tk

from .corpus import get_bliss_corpus_dict, get_stm_dict
from .lexicon import get_bliss_lexicon, get_g2p_augmented_bliss_lexicon
from .textual_data import get_text_data_dict

TEDLIUM_PREFIX = "Ted-Lium-2"


def _export_datasets(output_prefix: str = "datasets"):
    """
    Register the bliss corpora of TedLiumV2 as outputs, once for every available audio format.

    Each corpus is registered as ``<output_prefix>/Ted-Lium-2/corpus/<name>-<audio_format>.xml.gz``.

    :param output_prefix: prefix of the registered output paths, also forwarded to the corpus creation
    :return:
    """
    corpus_dir = os.path.join(output_prefix, TEDLIUM_PREFIX, "corpus")

    for audio_format in ("flac", "ogg", "wav", "nist", "sph"):
        corpora = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)
        for name, bliss_corpus in corpora.items():
            target = os.path.join(corpus_dir, f"{name}-{audio_format}.xml.gz")
            tk.register_output(target, bliss_corpus)


def _export_stms(output_prefix: str = "datasets"):
    """
    Register all STM files of TedLiumV2 as outputs.

    Each file is registered as ``<output_prefix>/Ted-Lium-2/stm/<name>.txt``.

    :param output_prefix: prefix of the registered output paths, also forwarded to :func:`get_stm_dict`
    :return:
    """
    stm_dir = os.path.join(output_prefix, TEDLIUM_PREFIX, "stm")

    for name, stm_file in get_stm_dict(output_prefix=output_prefix).items():
        tk.register_output(os.path.join(stm_dir, f"{name}.txt"), stm_file)


def _export_text_data(output_prefix: str = "datasets"):
    """
    Register all textual data of the TedLiumV2 dataset as outputs.

    Each entry is registered as ``<output_prefix>/Ted-Lium-2/text_data/<key>.gz``.

    :param output_prefix: prefix of the registered output paths, also forwarded to `get_text_data_dict`
    :return:
    """
    text_dir = os.path.join(output_prefix, TEDLIUM_PREFIX, "text_data")

    text_files = get_text_data_dict(output_prefix=output_prefix)
    for data_name, text_file in text_files.items():
        tk.register_output(os.path.join(text_dir, f"{data_name}.gz"), text_file)


def _export_lexicon(output_prefix: str = "datasets"):
    """
    Register the TedLiumV2 lexicon and its g2p augmented variant as outputs.

    Both files are placed below ``<output_prefix>/Ted-Lium-2/lexicon``.

    :param output_prefix: prefix of the registered output paths, also forwarded to the lexicon creation
    :return:
    """
    lexicon_dir = os.path.join(output_prefix, TEDLIUM_PREFIX, "lexicon")

    # plain bliss lexicon
    tk.register_output(
        os.path.join(lexicon_dir, "tedlium2.lexicon.xml.gz"),
        get_bliss_lexicon(output_prefix=output_prefix),
    )

    # g2p augmented lexicon, requested without the unknown phoneme and mapping
    tk.register_output(
        os.path.join(lexicon_dir, "tedlium2.lexicon_with_g2p.xml.gz"),
        get_g2p_augmented_bliss_lexicon(add_unknown_phoneme_and_mapping=False, output_prefix=output_prefix),
    )


def export_all(output_prefix: str = "datasets"):
    """
    Run every TedLiumV2 export: corpora, STMs, textual data and lexica, in this order.

    :param output_prefix: forwarded unchanged to each of the individual export functions
    :return:
    """
    exporters = (
        _export_datasets,
        _export_stms,
        _export_text_data,
        _export_lexicon,
    )
    for exporter in exporters:
        exporter(output_prefix=output_prefix)
Loading

0 comments on commit cf5884a

Please sign in to comment.