From ec783cfbbb836b59c795e52ef45ea41876ea2760 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 26 Jul 2021 10:00:49 -0700 Subject: [PATCH] Bicleaner support + fixes (#13) SacreBLEU is a regular importer now and evaluation is not limited to sacrebleu datasets. fixes Added bicleaner-ai and bicleaner filtering (one or the other based on available pretrained language packs). fixes Added script to find all datasets based on language pair and importer type, ready to use in config fixes Fixed conda environment activation to be reproducible on GCP Other minor reproducibility fixes --- .gitmodules | 3 + 3rd_party/kenlm | 1 + README.md | 53 ++++++++-------- config.sh | 10 ++-- .../generate-alignment-and-shortlist.sh | 34 ++++++----- pipeline/clean/bicleaner.sh | 60 +++++++++++++++++++ pipeline/clean/ce-filter.sh | 10 +++- pipeline/clean/clean-corpus.sh | 12 +++- pipeline/clean/clean-mono.sh | 4 +- .../clean/tools/download-bicleaner-pack.sh | 60 +++++++++++++++++++ pipeline/data/download-eval.sh | 29 +++++++++ pipeline/data/download-mono.sh | 2 +- pipeline/data/importers/corpus/mtdata.sh | 23 ++++--- pipeline/data/importers/corpus/opus.sh | 20 ++++--- pipeline/data/importers/corpus/sacrebleu.sh | 30 ++++++++++ pipeline/quantize/eval.sh | 13 ++-- pipeline/quantize/quantize.sh | 10 +++- pipeline/setup/activate-python.sh | 13 ++++ pipeline/setup/install-all.sh | 4 +- pipeline/setup/install-kenlm.sh | 31 ++++++++++ pipeline/setup/install-python-packages.sh | 18 ++++++ pipeline/setup/install-python.sh | 12 ++-- pipeline/train/eval.sh | 17 ++++-- pipeline/train/finetune-student.sh | 29 ++++----- pipeline/train/train-s2s.sh | 1 + pipeline/train/train-student.sh | 1 + pipeline/train/train-teacher-ensemble.sh | 1 + pipeline/train/train-teacher.sh | 1 + pipeline/train/train.sh | 6 +- pipeline/translate/translate-corpus.sh | 10 +++- pipeline/translate/translate-mono.sh | 6 ++ pipeline/utils/find-corpus.py | 54 +++++++++++++++++ .../tensorboard/tb_log_parser.py | 0 
.../tensorboard/tensorboard.sh | 9 ++- run.sh | 36 +++++++---- 35 files changed, 500 insertions(+), 123 deletions(-) create mode 160000 3rd_party/kenlm create mode 100644 pipeline/clean/bicleaner.sh create mode 100644 pipeline/clean/tools/download-bicleaner-pack.sh create mode 100644 pipeline/data/download-eval.sh create mode 100644 pipeline/data/importers/corpus/sacrebleu.sh create mode 100644 pipeline/setup/activate-python.sh create mode 100644 pipeline/setup/install-kenlm.sh create mode 100644 pipeline/setup/install-python-packages.sh create mode 100644 pipeline/utils/find-corpus.py rename pipeline/{train => utils}/tensorboard/tb_log_parser.py (100%) rename pipeline/{train => utils}/tensorboard/tensorboard.sh (55%) diff --git a/.gitmodules b/.gitmodules index 4d0e0e985..cc978064e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "marian-dev"] path = 3rd_party/marian-dev url = https://github.com/browsermt/marian-dev +[submodule "3rd_party/kenlm"] + path = 3rd_party/kenlm + url = https://github.com/kpu/kenlm diff --git a/3rd_party/kenlm b/3rd_party/kenlm new file mode 160000 index 000000000..bbf4fc511 --- /dev/null +++ b/3rd_party/kenlm @@ -0,0 +1 @@ +Subproject commit bbf4fc511266c5d4515047055d7bdec659a6e158 diff --git a/README.md b/README.md index 937e710dd..e9e81f005 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ It was tested on relatively high resource language pair `ru-en`. Low resource pa - Ubuntu 18.04 (it can work on other Linux distributions, but might require `setup` scripts fixes; see more details in [marian installation instructions](https://marian-nmt.github.io/quickstart/)). - One or several Nvidia GPUs with CUDA drivers installed and at least 8 GB of memory. - At least 16 CPU cores ( some steps of the pipeline utilize multiple cores pretty well, so the more the better). -- 64GB RAM +- 64 GB RAM (128 GB might be required for bigger datasets) - 200+ GB of disk space ( mostly for datasets and transformations ). 
It depends on chosen datasets and can be significantly higher. @@ -87,9 +87,9 @@ bash ./pipeline/.../