Shrutakeerti patch 4 #3791 (Open)
Wants to merge 11 commits into base: master
54 changes: 54 additions & 0 deletions banglaconversion
@@ -0,0 +1,54 @@
# banglaconversion: train a Bangla KenLM language model for DeepSpeech.
#
# Install necessary dependencies first:
#   pip install -r requirements_eval_tflite.txt
#
# Then evaluate a trained Bangla checkpoint against the language model
# produced below. The flags here follow the pre-0.7 DeepSpeech CLI; in
# 0.7+ a single --scorer_path package replaces --lm_binary_path and
# --lm_trie_path, so the two flag sets must not be mixed:
#
#   python -u DeepSpeech.py \
#     --alphabet_config_path alphabet.txt \
#     --lm_binary_path path_to_your_language_model/lm.binary \
#     --lm_trie_path trie \
#     --checkpoint_dir path_to_your_checkpoint_dir \
#     --test_files test.csv
import argparse
import os
import shutil
import subprocess


def train_bangla_language_model(data_dir, output_dir):
    # Define paths
    alphabet_path = 'alphabet.txt'
    lm_arpa_path = 'lm.arpa'
    lm_binary_path = 'lm.binary'
    lm_trie_path = 'trie'

    # Generate the alphabet file. DeepSpeech expects one label per line,
    # so the characters are written out line by line rather than as a
    # single long string (assumes each entry is a single code point).
    bangla_characters = 'ঀঁংঃঅআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহঽািীুূৃৄেৈোৌ্ৎৗড়ঢ়য়০১২৩৪৫৬৭৮৯'
    with open(alphabet_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(bangla_characters) + '\n')

    # Train the language model. lmplz emits an ARPA text file, which
    # KenLM's build_binary then converts to the binary format DeepSpeech
    # loads. (The original passed the binary path directly to --arpa.)
    print('Training language model...')
    subprocess.check_call(['./kenlm/build/bin/lmplz', '--order', '5',
                           '--text', os.path.join(data_dir, 'text.txt'),
                           '--arpa', lm_arpa_path, '--discount_fallback'])
    subprocess.check_call(['./kenlm/build/bin/build_binary', lm_arpa_path, lm_binary_path])

    # Build the language model trie. Trie generation is done by the
    # generate_trie tool from the pre-0.7 native client, not DeepSpeech.py.
    print('Building language model trie...')
    subprocess.check_call(['./generate_trie', alphabet_path, lm_binary_path, lm_trie_path])

    # Move generated files to the output directory
    os.makedirs(output_dir, exist_ok=True)
    shutil.move(alphabet_path, os.path.join(output_dir, alphabet_path))
    shutil.move(lm_binary_path, os.path.join(output_dir, lm_binary_path))
    shutil.move(lm_trie_path, os.path.join(output_dir, lm_trie_path))

    print('Training completed. Language model files saved to', output_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a Bangla language model for DeepSpeech.')
    parser.add_argument('data_dir', type=str, help='Path to the directory containing audio and text data.')
    parser.add_argument('output_dir', type=str, help='Path to the output directory to save the trained model files.')
    args = parser.parse_args()

    train_bangla_language_model(args.data_dir, args.output_dir)
# Usage:
#   ./train_bangla_language_model.py /path/to/data_dir /path/to/output_dir
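A quick sanity check of the resulting lm.binary is to score a few held-out sentences with the KenLM Python bindings. This is a minimal sketch, not part of the patch; the sample sentence is only a placeholder.

import kenlm

# Load the binary model produced by build_binary above
model = kenlm.Model('lm.binary')

# Lower perplexity on held-out text indicates a better-fitting model
sentence = 'আমি ভাত খাই'
print('log10 probability:', model.score(sentence, bos=True, eos=True))
print('perplexity:', model.perplexity(sentence))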

51 changes: 14 additions & 37 deletions bin/import_aishell.py
@@ -1,21 +1,17 @@
 #!/usr/bin/env python
 import glob
 import os
 import tarfile
 
-import pandas
+import pandas as pd
 
 from deepspeech_training.util.importers import get_importers_parser
 
-COLUMNNAMES = ["wav_filename", "wav_filesize", "transcript"]
+COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
 
 
 def extract(archive_path, target_dir):
-    print("Extracting {} into {}...".format(archive_path, target_dir))
+    print(f"Extracting {archive_path} into {target_dir}...")
     with tarfile.open(archive_path) as tar:
         tar.extractall(target_dir)
 
 
 def preprocess_data(tgz_file, target_dir):
     # First extract main archive and sub-archives
     extract(tgz_file, target_dir)
@@ -25,23 +21,7 @@ def preprocess_data(tgz_file, target_dir):
     for targz in glob.glob(os.path.join(wav_archives_folder, "*.tar.gz")):
         extract(targz, main_folder)
 
-    # Folder structure is now:
-    # - data_aishell/
-    #     - train/S****/*.wav
-    #     - dev/S****/*.wav
-    #     - test/S****/*.wav
-    #     - wav/S****.tar.gz
-    #     - transcript/aishell_transcript_v0.8.txt
-
-    # Transcripts file has one line per WAV file, where each line consists of
-    # the WAV file name without extension followed by a single space followed
-    # by the transcript.
-
-    # Since the transcripts themselves can contain spaces, we split on space but
-    # only once, then build a mapping from file name to transcript
-    transcripts_path = os.path.join(
-        main_folder, "transcript", "aishell_transcript_v0.8.txt"
-    )
+    transcripts_path = os.path.join(main_folder, "transcript", "aishell_transcript_v0.8.txt")
     with open(transcripts_path) as fin:
         transcripts = dict((line.split(" ", maxsplit=1) for line in fin))
 
@@ -52,36 +32,33 @@ def load_set(glob_path):
                 wav_filename = wav
                 wav_filesize = os.path.getsize(wav)
                 transcript_key = os.path.splitext(os.path.basename(wav))[0]
-                transcript = transcripts[transcript_key].strip("\n")
+                transcript = transcripts.get(transcript_key, "").strip("\n")
                 set_files.append((wav_filename, wav_filesize, transcript))
             except KeyError:
-                print("Warning: Missing transcript for WAV file {}.".format(wav))
+                print(f"Warning: Missing transcript for WAV file {wav}.")
         return set_files
 
-    for subset in ("train", "dev", "test"):
-        print("Loading {} set samples...".format(subset))
+    for subset in ["train", "dev", "test"]:
+        print(f"Loading {subset} set samples...")
         subset_files = load_set(os.path.join(main_folder, subset, "S*", "*.wav"))
-        df = pandas.DataFrame(data=subset_files, columns=COLUMNNAMES)
+        df = pd.DataFrame(data=subset_files, columns=COLUMN_NAMES)
 
         # Trim train set to under 10s by removing the last couple hundred samples
         if subset == "train":
             durations = (df["wav_filesize"] - 44) / 16000 / 2
             df = df[durations <= 10.0]
-            print("Trimming {} samples > 10 seconds".format((durations > 10.0).sum()))
+            print(f"Trimming {subset} samples > 10 seconds: {(durations > 10.0).sum()}")
 
-        dest_csv = os.path.join(target_dir, "aishell_{}.csv".format(subset))
-        print("Saving {} set into {}...".format(subset, dest_csv))
+        dest_csv = os.path.join(target_dir, f"aishell_{subset}.csv")
+        print(f"Saving {subset} set into {dest_csv}...")
         df.to_csv(dest_csv, index=False)

 def main():
     # http://www.openslr.org/33/
     parser = get_importers_parser(description="Import AISHELL corpus")
     parser.add_argument("aishell_tgz_file", help="Path to data_aishell.tgz")
     parser.add_argument(
         "--target_dir",
         default="",
-        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
+        help="Target folder to extract files into and put the resulting CSVs. Defaults to the same folder as the main archive.",
     )
     params = parser.parse_args()

@@ -90,6 +67,6 @@ def main():
 
     preprocess_data(params.aishell_tgz_file, params.target_dir)
 
 
 if __name__ == "__main__":
     main()
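For reference, the train-set trim in this importer estimates clip length purely from file size, assuming a canonical 44-byte WAV header and 16 kHz, 16-bit mono PCM (2 bytes per sample). A minimal sketch of that arithmetic (the helper name is ours, not part of the patch):

def estimated_wav_duration_seconds(wav_filesize: int) -> float:
    # 44-byte RIFF/WAV header, then 16000 samples/s * 2 bytes/sample
    return (wav_filesize - 44) / 16000 / 2

# A 10-second clip: 44 + 10 * 16000 * 2 = 320044 bytes
assert estimated_wav_duration_seconds(320044) == 10.0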

9 changes: 5 additions & 4 deletions ci_scripts/cppwin-tests.sh
@@ -2,19 +2,20 @@
 
 set -xe
 
-source $(dirname "$0")/all-vars.sh
-source $(dirname "$0")/all-utils.sh
-source $(dirname "$0")/asserts.sh
+source "$(dirname "$0")/all-vars.sh"
+source "$(dirname "$0")/all-utils.sh"
+source "$(dirname "$0")/asserts.sh"
 
 bitrate=$1
 set_ldc_sample_filename "${bitrate}"
 
 download_material "${CI_TMP_DIR}/ds"
 
-export PATH=${CI_TMP_DIR}/ds/:$PATH
+export PATH="${CI_TMP_DIR}/ds/:$PATH"
 
 check_versions
 
 ensure_cuda_usage "$2"
 
 run_basic_inference_tests
13 changes: 6 additions & 7 deletions ci_scripts/cppwin_tflite-tests.sh
@@ -1,22 +1,21 @@
 #!/bin/bash
 
 set -xe
 
-source $(dirname "$0")/all-vars.sh
-source $(dirname "$0")/all-utils.sh
-source $(dirname "$0")/asserts.sh
+source "$(dirname "$0")/all-vars.sh"
+source "$(dirname "$0")/all-utils.sh"
+source "$(dirname "$0")/asserts.sh"
 
 bitrate=$1
 set_ldc_sample_filename "${bitrate}"
 
-model_source=${DEEPSPEECH_TEST_MODEL//.pb/.tflite}
+model_source="${DEEPSPEECH_TEST_MODEL//.pb/.tflite}"
 model_name=$(basename "${model_source}")
-model_name_mmap=$(basename "${model_source}")
+model_name_mmap="${model_name}.mmap"
 export DATA_TMP_DIR=${CI_TMP_DIR}
 
 download_material "${CI_TMP_DIR}/ds"
 
-export PATH=${CI_TMP_DIR}/ds/:$PATH
+export PATH="${CI_TMP_DIR}/ds/:$PATH"
 
 check_versions
 
49 changes: 19 additions & 30 deletions ci_scripts/tf-package.sh
@@ -2,53 +2,41 @@
 
 set -xe
 
-source $(dirname $0)/tf-vars.sh
+source "$(dirname "$0")/tf-vars.sh"
 
-mkdir -p ${CI_ARTIFACTS_DIR} || true
+mkdir -p "${CI_ARTIFACTS_DIR}" || true
 
-cp ${DS_ROOT_TASK}/tensorflow/bazel_*.log ${CI_ARTIFACTS_DIR} || true
+cp "${DS_ROOT_TASK}"/tensorflow/bazel_*.log "${CI_ARTIFACTS_DIR}" || true
 
 OUTPUT_ROOT="${DS_ROOT_TASK}/tensorflow/bazel-bin"
 
-for output_bin in \
-  tensorflow/lite/experimental/c/libtensorflowlite_c.so \
-  tensorflow/tools/graph_transforms/transform_graph \
-  tensorflow/tools/graph_transforms/summarize_graph \
-  tensorflow/tools/benchmark/benchmark_model \
-  tensorflow/contrib/util/convert_graphdef_memmapped_format \
+for output_bin in \
+    tensorflow/lite/experimental/c/libtensorflowlite_c.so \
+    tensorflow/tools/graph_transforms/transform_graph \
+    tensorflow/tools/graph_transforms/summarize_graph \
+    tensorflow/tools/benchmark/benchmark_model \
+    tensorflow/contrib/util/convert_graphdef_memmapped_format \
     tensorflow/lite/toco/toco;
 do
     if [ -f "${OUTPUT_ROOT}/${output_bin}" ]; then
-        cp ${OUTPUT_ROOT}/${output_bin} ${CI_ARTIFACTS_DIR}/
-    fi;
-done;
+        cp "${OUTPUT_ROOT}/${output_bin}" "${CI_ARTIFACTS_DIR}/"
+    fi
+done
 
 if [ -f "${OUTPUT_ROOT}/tensorflow/lite/tools/benchmark/benchmark_model" ]; then
-    cp ${OUTPUT_ROOT}/tensorflow/lite/tools/benchmark/benchmark_model ${CI_ARTIFACTS_DIR}/lite_benchmark_model
+    cp "${OUTPUT_ROOT}/tensorflow/lite/tools/benchmark/benchmark_model" "${CI_ARTIFACTS_DIR}/lite_benchmark_model"
 fi
 
-# It seems that bsdtar and gnutar are behaving a bit differently on the way
-# they deal with --exclude="./public/*" ; this caused ./DeepSpeech/tensorflow/core/public/
-# to be ditched when we just wanted to get rid of ./public/ on OSX.
-# Switching to gnutar (already needed for the --transform on DeepSpeech tasks)
-# does the trick.
 TAR_EXCLUDE="--exclude=./dls/*"
 if [ "${OS}" = "Darwin" ]; then
     TAR_EXCLUDE="--exclude=./dls/* --exclude=./public/* --exclude=./generic-worker/* --exclude=./homebrew/* --exclude=./homebrew.cache/* --exclude=./homebrew.logs/*"
-fi;
-
-# Make a tar of
-#  - /home/build-user/ (linux)
-#  - /Users/build-user/TaskCluster/HeavyTasks/X/ (OSX)
-#  - C:\builds\tc-workdir\ (windows)
+fi
 
 if [ "${OS}" = "${CI_MSYS_VERSION}" ]; then
     export PATH=$PATH:'/c/Program Files/7-Zip/'
     pushd ${DS_ROOT_TASK}
-    7z a '-xr!.\dls\' '-xr!.\tmp\' '-xr!.\msys64\' -snl -snh -so home.tar . | 7z a -si ${CI_ARTIFACTS_DIR}/home.tar.xz
+    7z a '-xr!.\dls\' '-xr!.\tmp\' '-xr!.\msys64\' -snl -snh -so home.tar . | 7z a -si "${CI_ARTIFACTS_DIR}/home.tar.xz"
     popd
 else
-    ${TAR} -C ${DS_ROOT_TASK} ${TAR_EXCLUDE} -cf - . | ${XZ} > ${CI_ARTIFACTS_DIR}/home.tar.xz
+    tar -C "${DS_ROOT_TASK}" ${TAR_EXCLUDE} -cf - . | xz > "${CI_ARTIFACTS_DIR}/home.tar.xz"
 fi
 
 if [ "${OS}" = "Linux" ]; then
@@ -57,6 +45,7 @@ elif [ "${OS}" = "${CI_MSYS_VERSION}" ]; then
     SHA_SUM_GEN="sha256sum"
 elif [ "${OS}" = "Darwin" ]; then
     SHA_SUM_GEN="shasum -a 256"
-fi;
+fi
 
-${SHA_SUM_GEN} ${CI_ARTIFACTS_DIR}/* > ${CI_ARTIFACTS_DIR}/checksums.txt
+${SHA_SUM_GEN} "${CI_ARTIFACTS_DIR}"/* > "${CI_ARTIFACTS_DIR}/checksums.txt"
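For readers more at home in Python than shell, the non-Windows packaging branch above is roughly equivalent to the following standard-library sketch (the --exclude handling is omitted, and both paths are placeholders, not part of the script):

import tarfile

# Roughly `tar -C "$DS_ROOT_TASK" -cf - . | xz > home.tar.xz`,
# minus the exclude patterns; paths are placeholders
with tarfile.open('/tmp/artifacts/home.tar.xz', 'w:xz') as tar:
    tar.add('/home/build-user', arcname='.')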
4 changes: 0 additions & 4 deletions lm_optimizer.py
@@ -1,7 +1,3 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import, print_function
-
 import absl.app
 import optuna
 import sys
15 changes: 8 additions & 7 deletions tests/test_text.py
@@ -6,20 +6,21 @@
 class TestAlphabetParsing(unittest.TestCase):
 
     def _ending_tester(self, file, expected):
-        alphabet = Alphabet(os.path.join(os.path.dirname(__file__), 'test_data', file))
-        label = ''
-        label_id = -1
+        alphabet_file_path = os.path.join(os.path.dirname(__file__), 'test_data', file)
+        with open(alphabet_file_path, 'r') as f:
+            alphabet_data = f.read().splitlines()
+        alphabet = Alphabet(alphabet_data)
         for expected_label, expected_label_id in expected:
             try:
                 label_id = alphabet.Encode(expected_label)
+                self.assertEqual(label_id, [expected_label_id])
             except KeyError:
-                pass
-            self.assertEqual(label_id, [expected_label_id])
+                self.fail(f"Failed to encode label '{expected_label}'")
             try:
                 label = alphabet.Decode([expected_label_id])
+                self.assertEqual(label, expected_label)
             except KeyError:
-                pass
-            self.assertEqual(label, expected_label)
+                self.fail(f"Failed to decode label '{expected_label_id}'")
 
     def test_macos_ending(self):
         self._ending_tester('alphabet_macos.txt', [('a', 0), ('b', 1), ('c', 2)])
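The change above moves each assertEqual inside its try block and replaces the silent `pass` with an explicit `self.fail(...)`: previously a KeyError was swallowed and the test then failed (or even passed) on a comparison against a stale default. A minimal standalone illustration of the difference (the class and names are ours, not from the test suite):

import unittest

class PatternDemo(unittest.TestCase):
    def test_old_pattern_obscures_the_error(self):
        mapping = {}
        value = -1
        try:
            value = mapping['missing']  # raises KeyError
        except KeyError:
            pass  # swallowed: the assertion below sees the stale default
        self.assertEqual(value, -1)  # passes spuriously here

    def test_new_pattern_reports_the_error(self):
        mapping = {}
        try:
            value = mapping['missing']  # raises KeyError
            self.assertEqual(value, -1)
        except KeyError:
            self.fail("Failed to look up 'missing'")  # reported explicitly

if __name__ == '__main__':
    unittest.main()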