add the voxpopuli recipe (#1374)
* add the `voxpopuli` recipe

- this is the data preparation
- there is no ASR training and no results

* update PR #1374 (feedback from @csukuangfj)

- fixing .py headers and docstrings
- removing BUT specific parts of `prepare.sh`
- adding assert `num_jobs >= num_workers` to `compute_fbank.py`
- narrowing list of languages
  (let's limit to ASR sets with transcripts for now)
- added links to `README.md`
- extending `text_from_manifest.py`
KarelVesely84 authored Nov 16, 2023
1 parent 6d275dd commit 59c9438
Showing 16 changed files with 1,296 additions and 0 deletions.
38 changes: 38 additions & 0 deletions egs/voxpopuli/ASR/README.md
@@ -0,0 +1,38 @@
# Readme

This recipe contains data preparation for the
[VoxPopuli](https://github.com/facebookresearch/voxpopuli) dataset
([paper](https://aclanthology.org/2021.acl-long.80.pdf)).
At the moment, it does not include model training or results.


## Audio per language

| Language | Size | Hrs. untranscribed | Hrs. transcribed |
|----------|--------|--------------------|------------------|
| bg | 295G | 17.6K | - |
| cs | 308G | 18.7K | 62 |
| da | 233G | 13.6K | - |
| de | 379G | 23.2K | 282 |
| el | 305G | 17.7K | - |
| en | 382G | 24.1K | 543 |
| es | 362G | 21.4K | 166 |
| et | 179G | 10.6K | 3 |
| fi | 236G | 14.2K | 27 |
| fr | 376G | 22.8K | 211 |
| hr | 132G | 8.1K | 43 |
| hu | 297G | 17.7K | 63 |
| it | 361G | 21.9K | 91 |
| lt | 243G | 14.4K | 2 |
| lv | 217G | 13.1K | - |
| mt | 147G | 9.1K | - |
| nl | 322G | 19.0K | 53 |
| pl | 348G | 21.2K | 111 |
| pt | 300G | 17.5K | - |
| ro | 296G | 17.9K | 89 |
| sk | 201G | 12.1K | 35 |
| sl | 190G | 11.3K | 10 |
| sv | 272G | 16.3K | - |
| | | | |
| total | 6.3T | 384K | 1791 |
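
Once the recipe has been run, the prepared cut manifests can be inspected
with lhotse. A minimal sketch (the prefix `voxpopuli-asr-en` is a hypothetical
example; the actual name follows the `--prefix` passed to `local/compute_fbank.py`):

```python
from lhotse import load_manifest_lazy

# hypothetical path following the {prefix}_cuts_{dataset}.jsonl.gz convention
cuts = load_manifest_lazy("data/fbank/voxpopuli-asr-en_cuts_train.jsonl.gz")
cuts.describe()  # duration statistics, as in local/display_manifest_statistics.py
```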

248 changes: 248 additions & 0 deletions egs/voxpopuli/ASR/local/compute_fbank.py
@@ -0,0 +1,248 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
# 2023 Brno University of Technology (authors: Karel Veselý)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This file computes fbank features of VoxPopuli dataset.
Usage example:
python3 ./local/compute_fbank.py \
--src-dir data/fbank --output-dir data/fbank \
--num-jobs 100 --num-workers 25 \
--prefix "voxpopuli-${task}-${lang}" \
--dataset train \
--trim-to-supervisions True \
--speed-perturb True
It looks for raw CutSet in the directory data/fbank
located at: `{src_dir}/{prefix}_cuts_{dataset}_raw.jsonl.gz`.
The generated fbank features are saved in `data/fbank/{prefix}-{dataset}_feats`
and CutSet manifest stored in `data/fbank/{prefix}_cuts_{dataset}.jsonl.gz`.
Typically, the number of workers is smaller than number of jobs
(see --num-jobs 100 --num-workers 25 in the example).
And, the number of jobs should be at least the number of workers (it's checked).
"""

import argparse
import logging
import multiprocessing
import os
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import sentencepiece as spm
import torch
from filter_cuts import filter_cuts
from lhotse import (
CutSet,
Fbank,
FbankConfig,
LilcomChunkyWriter,
is_caching_enabled,
set_caching_enabled,
)

from icefall.utils import str2bool

# Torch's multithreaded behavior needs to be disabled, or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def get_args():
parser = argparse.ArgumentParser()

parser.add_argument(
"--bpe-model",
type=str,
help="""Path to the bpe.model. If not None, we will remove short and
long utterances before extracting features""",
)
parser.add_argument(
"--src-dir",
type=str,
help="""Folder with the input manifest files.""",
default="data/manifests",
)
parser.add_argument(
"--output-dir",
type=str,
help="""Folder with the output manifests (cuts) and feature files.""",
default="data/fbank",
)

parser.add_argument(
"--prefix",
type=str,
help="""Prefix of the manifest files.""",
default="",
)
parser.add_argument(
"--dataset",
type=str,
help="""Dataset parts to compute fbank (train,test,dev).""",
default=None,
)

parser.add_argument(
"--num-jobs",
type=int,
help="""Number of jobs (i.e. files with extracted features)""",
default=50,
)
parser.add_argument(
"--num-workers",
type=int,
help="""Number of parallel workers""",
default=10,
)
parser.add_argument(
"--speed-perturb",
type=str2bool,
default=False,
help="""Enable speed perturbation for the set.""",
)
parser.add_argument(
"--trim-to-supervisions",
type=str2bool,
default=False,
help="""Apply `trim-to-supervision` to cut set.""",
)

return parser.parse_args()


def compute_fbank_features(args: argparse.Namespace):
set_caching_enabled(True) # lhotse

src_dir = Path(args.src_dir)
output_dir = Path(args.output_dir)
num_jobs = args.num_jobs
num_workers = min(args.num_workers, os.cpu_count())
num_mel_bins = 80

bpe_model = args.bpe_model
if bpe_model:
logging.info(f"Loading {bpe_model}")
sp = spm.SentencePieceProcessor()
sp.load(bpe_model)

prefix = args.prefix # e.g. "voxpopuli-${task}-${lang}"
dataset = args.dataset
suffix = "jsonl.gz"

cuts_raw_filename = Path(f"{src_dir}/{prefix}_cuts_{dataset}_raw.{suffix}")
cuts_raw = CutSet.from_file(cuts_raw_filename)

extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

cuts_filename = Path(f"{prefix}_cuts_{dataset}.{suffix}")
if (output_dir / cuts_filename).is_file():
logging.info(f"{output_dir/cuts_filename} already exists - skipping.")
return

logging.info(f"Processing {output_dir/cuts_filename}")
cut_set = cuts_raw

if bpe_model:
cut_set = filter_cuts(cut_set, sp)

if args.speed_perturb:
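# 3-way speed perturbation: keep the original cuts and add 0.9x and 1.1x
# copies, tripling the amount of training data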
cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)

if args.trim_to_supervisions:
logging.info(f"About to `trim_to_supervisions()` {output_dir / cuts_filename}")
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
else:
logging.info(
"Not doing `trim_to_supervisions()`; "
"to enable it, use --trim-to-supervisions=True"
)

cut_set = cut_set.to_eager() # disallow lazy evaluation (sorting requires it)
cut_set = cut_set.sort_by_recording_id() # enhances AudioCache hit rate
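# (sorting groups cuts from the same recording together, so consecutive
#  feature extractions reuse the cached audio instead of re-reading the file)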

# We typically use `num_jobs=100, num_workers=20`
# - this is helpful for large databases
# - both values are configurable externally
assert num_jobs >= num_workers, (num_jobs, num_workers)
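# The "spawn" start method gives each worker a fresh interpreter (no
# inherited torch threading state); the initializer propagates lhotse's
# current caching flag into every worker process.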
executor = ProcessPoolExecutor(
max_workers=num_workers,
mp_context=multiprocessing.get_context("spawn"),
initializer=set_caching_enabled,
initargs=(is_caching_enabled(),),
)

logging.info(
f"executor {executor} : num_workers {num_workers}, num_jobs {num_jobs}"
)

cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir / prefix}-{dataset}_feats",
num_jobs=num_jobs,
executor=executor,
storage_type=LilcomChunkyWriter,
)

# correct small duration deviations caused by speed perturbation
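# (speed perturbation rescales durations by the inverse factor and rounds
#  to whole audio samples, which can shift cut vs. supervision duration
#  by a few milliseconds)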
for cut in cut_set:
assert len(cut.supervisions) == 1, (len(cut.supervisions), cut.id)
duration_difference = abs(cut.supervisions[0].duration - cut.duration)
tolerance = 0.02 # 20ms
if duration_difference == 0.0:
pass
elif duration_difference <= tolerance:
logging.info(
"small mismatch of the supervision duration "
f"(Δt = {duration_difference*1000}ms), "
f"correcting : cut.duration {cut.duration} -> "
f"supervision {cut.supervisions[0].duration}"
)
cut.supervisions[0].duration = cut.duration
else:
logging.error(
"mismatch of cut/supervision duration "
f"(Δt = {duration_difference*1000}ms) : "
f"cut.duration {cut.duration}, "
f"supervision {cut.supervisions[0].duration}"
)
raise ValueError(
"mismatch of cut/supervision duration "
f"(Δt = {duration_difference*1000}ms)"
)

# store the CutSet
logging.info(f"storing CutSet to: `{output_dir / cuts_filename}`")
cut_set.to_file(output_dir / cuts_filename)


if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)

args = get_args()
logging.info(vars(args))

compute_fbank_features(args)
1 change: 1 addition & 0 deletions egs/voxpopuli/ASR/local/compute_fbank_musan.py
56 changes: 56 additions & 0 deletions egs/voxpopuli/ASR/local/display_manifest_statistics.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
# 2023 Brno University of Technology (authors: Karel Veselý)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file displays duration statistics of utterances in a manifest.
You can use the displayed value to choose minimum/maximum duration
to remove short and long utterances during the training.
Usage example:
python3 ./local/display_manifest_statistics.py data/fbank/*_cuts*.jsonl.gz
See the function `remove_short_and_long_utt()` in transducer/train.py
for usage.
"""

import argparse

from lhotse import load_manifest_lazy


def get_args():
parser = argparse.ArgumentParser(
description="Compute statistics for 'cuts' .jsonl.gz manifests"
)

parser.add_argument(
"filenames",
nargs="+",
help="Path(s) to cuts manifests, e.g. data/fbank/*_cuts*.jsonl.gz",
)

return parser.parse_args()


def main():
args = get_args()

for filename in args.filenames:
cuts = load_manifest_lazy(filename)
cuts.describe()


if __name__ == "__main__":
main()