From ec783cfbbb836b59c795e52ef45ea41876ea2760 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Mon, 26 Jul 2021 10:00:49 -0700
Subject: [PATCH] Bicleaner support + fixes (#13)

SacreBLEU is a regular importer now and evaluation is not limited to sacrebleu datasets.
fixes

Added bicleaner-ai and bicleaner filtering (one or another based on available pretrained language packs).
fixes


Added script to find all datasets based on language pair and importer type, ready to use in config
fixes


Fixed conda environment activation to be reproducible on GCP

Other minor reproducibility fixes
---
 .gitmodules                                   |  3 +
 3rd_party/kenlm                               |  1 +
 README.md                                     | 53 ++++++++--------
 config.sh                                     | 10 ++--
 .../generate-alignment-and-shortlist.sh       | 34 ++++++-----
 pipeline/clean/bicleaner.sh                   | 60 +++++++++++++++++++
 pipeline/clean/ce-filter.sh                   | 10 +++-
 pipeline/clean/clean-corpus.sh                | 12 +++-
 pipeline/clean/clean-mono.sh                  |  4 +-
 .../clean/tools/download-bicleaner-pack.sh    | 60 +++++++++++++++++++
 pipeline/data/download-eval.sh                | 29 +++++++++
 pipeline/data/download-mono.sh                |  2 +-
 pipeline/data/importers/corpus/mtdata.sh      | 23 ++++---
 pipeline/data/importers/corpus/opus.sh        | 20 ++++---
 pipeline/data/importers/corpus/sacrebleu.sh   | 30 ++++++++++
 pipeline/quantize/eval.sh                     | 13 ++--
 pipeline/quantize/quantize.sh                 | 10 +++-
 pipeline/setup/activate-python.sh             | 13 ++++
 pipeline/setup/install-all.sh                 |  4 +-
 pipeline/setup/install-kenlm.sh               | 31 ++++++++++
 pipeline/setup/install-python-packages.sh     | 18 ++++++
 pipeline/setup/install-python.sh              | 12 ++--
 pipeline/train/eval.sh                        | 17 ++++--
 pipeline/train/finetune-student.sh            | 29 ++++-----
 pipeline/train/train-s2s.sh                   |  1 +
 pipeline/train/train-student.sh               |  1 +
 pipeline/train/train-teacher-ensemble.sh      |  1 +
 pipeline/train/train-teacher.sh               |  1 +
 pipeline/train/train.sh                       |  6 +-
 pipeline/translate/translate-corpus.sh        | 10 +++-
 pipeline/translate/translate-mono.sh          |  6 ++
 pipeline/utils/find-corpus.py                 | 54 +++++++++++++++++
 .../tensorboard/tb_log_parser.py              |  0
 .../tensorboard/tensorboard.sh                |  9 ++-
 run.sh                                        | 36 +++++++----
 35 files changed, 500 insertions(+), 123 deletions(-)
 create mode 160000 3rd_party/kenlm
 create mode 100644 pipeline/clean/bicleaner.sh
 create mode 100644 pipeline/clean/tools/download-bicleaner-pack.sh
 create mode 100644 pipeline/data/download-eval.sh
 create mode 100644 pipeline/data/importers/corpus/sacrebleu.sh
 create mode 100644 pipeline/setup/activate-python.sh
 create mode 100644 pipeline/setup/install-kenlm.sh
 create mode 100644 pipeline/setup/install-python-packages.sh
 create mode 100644 pipeline/utils/find-corpus.py
 rename pipeline/{train => utils}/tensorboard/tb_log_parser.py (100%)
 rename pipeline/{train => utils}/tensorboard/tensorboard.sh (55%)

diff --git a/.gitmodules b/.gitmodules
index 4d0e0e985..cc978064e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "marian-dev"]
 	path = 3rd_party/marian-dev
 	url = https://github.com/browsermt/marian-dev
+[submodule "3rd_party/kenlm"]
+	path = 3rd_party/kenlm
+	url = https://github.com/kpu/kenlm
diff --git a/3rd_party/kenlm b/3rd_party/kenlm
new file mode 160000
index 000000000..bbf4fc511
--- /dev/null
+++ b/3rd_party/kenlm
@@ -0,0 +1 @@
+Subproject commit bbf4fc511266c5d4515047055d7bdec659a6e158
diff --git a/README.md b/README.md
index 937e710dd..e9e81f005 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ It was tested on relatively high resource language pair `ru-en`. Low resource pa
 - Ubuntu 18.04 (it can work on other Linux distributions, but might require `setup` scripts fixes; see more details in [marian installation instructions](https://marian-nmt.github.io/quickstart/)).
 - One or several Nvidia GPUs with CUDA drivers installed and at least 8 GB of memory.
 - At least 16 CPU cores ( some steps of the pipeline utilize multiple cores pretty well, so the more the better).
-- 64GB RAM
+- 64 GB RAM (128 GB might be required for bigger datasets)
 - 200+ GB of disk space ( mostly for datasets and transformations ). 
   It depends on chosen datasets and can be significantly higher.
   
@@ -87,9 +87,9 @@ bash ./pipeline/.../<script>.sh <args>
 #### To download exported models:
 
 ```
-pit pull home firefox-translations-training/models/ru-en/exported/model.ruen.intgemm.alphas.bin.gz .
-pit pull home firefox-translations-training/models/ru-en/exported/lex.50.50.ruen.s2t.bin.gz .
-pit pull home firefox-translations-training/models/ru-en/exported/vocab.ruen.spm.gz .
+pit pull home firefox-translations-training/models/ru-en/test/exported/model.ruen.intgemm.alphas.bin.gz .
+pit pull home firefox-translations-training/models/ru-en/test/exported/lex.50.50.ruen.s2t.bin.gz .
+pit pull home firefox-translations-training/models/ru-en/test/exported/vocab.ruen.spm.gz .
 ```
 
 ### Tensorboard
@@ -110,14 +110,15 @@ Step | Description | Bottleneck | Comments
 --- | --- | --- | ---
 Installation | Installing dependencies and compiling | CPU | Takes ~1 hour
 Data downloading | Downloads datasets, samples sentences | Network, Disk | Time depends on dataset size, sampling of huge mono datasets (100M+ sentences) is the most intensive operation.
-Data cleaning | Basic preprocessing, language specific, rule based, deduplication and other attempts to clean noisy data | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/clean_parallel.py).
+Data cleaning | Basic preprocessing, language specific, rule based, deduplication, and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/clean_parallel.py).
+Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are none for bicleaner either, this step is skipped. The cleaning threshold is controlled by the `BICLEANER_THRESHOLD` config setting.
 Training s2s | Trains a backward shallow s2s model, which is useful for back-translations and ce-filtering | GPU | Inspired by a [marian example](https://github.com/marian-nmt/marian-examples/tree/master/training-basics-sentencepiece).
 Augmentation with back-translations | Translates mono corpus combined from `MONO_DATASETS_TRG` using shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others.
 Training teacher | Trains one or multiple big transformer models | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on datasets size. Inspired by [transformer](https://github.com/marian-nmt/marian-examples/tree/master/transformer) and [wmt2017-uedin](https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin) marian examples and extended with [SentencePiece](https://github.com/google/sentencepiece).
 Translation by teacher | Translates a corpus and monolingual data combined from `MONO_DATASETS_SRC` using the teacher model (ensemble is not supported yet) | GPU | The slowest part of the pipeline. Can take days. It is possible to speed it up launching the same scripts ([corpus](pipeline/translate/translate-corpus.sh), [mono](pipeline/translate/translate-mono.sh)) in parallel from another machine with access to the same network directory.
 Cross-entropy filtering | Scores translated corpus with backward s2s model and removes a part of the corpus with the lowest scores to reduce noise | GPU, CPU, Disk | At this point we work with huge datasets, so it utilizes copying to a local disk to make things faster.
 Training alignments and shortlist | Trains alignments using [fast_align](https://github.com/clab/fast_align) and extracts lexical shortlist using [extract_lex](https://github.com/marian-nmt/extract-lex) tool | CPU, Disk | Some tools requires uncompressed datasets on disk and they are huge at this point. Data is copied to a local disk to make things faster. Might take 100+GB of local disk depending on a dataset size. Good CPU parallelization.
-Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Run [Tensorboard](pipeline/train/tensorboard/tensorboard.sh) manually to see training visualization.
+Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Run [Tensorboard](pipeline/utils/tensorboard/tensorboard.sh) manually to see training visualization.
 Fine-tuning student | Finetunes the student model by emulating 8bit GEMM during training | GPU | Converges very quickly and then degrades. It's quick but you might want to reduce early stopping threshold.
 Quantizaiton |  Applies 8 bit quantization to the fined-tuned student model and evaluates on CPU | CPU | CPU threads must be set to 1 for this step.
 Export | Exports trained model and shortlist to (bergamot-translator)(https://github.com/mozilla/bergamot-translator) format | |
@@ -129,35 +130,28 @@ Dataset importers can be used in `TRAIN_DATASETS, DEVTEST_DATASETS, MONO_DATASET
 Example:
 ```
 TRAIN_DATASETS="opus_OPUS-ParaCrawl/v7.1 mtdata_newstest2019_ruen"
+TEST_DATASETS="sacrebleu_wmt20 sacrebleu_wmt18"
 ```
 
 Data source | Prefix | Name example | Type | Comments
 --- | --- | --- | ---| ---
 [MTData](https://github.com/thammegowda/mtdata) | mtdata | newstest2017_ruen | corpus | Supports many datasets. Run `mtdata list -l ru-en` to see datasets for a specific language pair.
-[OPUS](opus.nlpl.eu/) | opus | OPUS-ParaCrawl/v7.1 | corpus | Many open source datasets. Go to the website, choose a language pair, check links under Moses column to see what names and version is used in a link.
+[OPUS](https://opus.nlpl.eu/) | opus | ParaCrawl/v7.1 | corpus | Many open source datasets. Go to the website, choose a language pair, check links under Moses column to see which name and version are used in a link.
+[SacreBLEU](https://github.com/mjpost/sacrebleu) | sacrebleu | wmt20 | corpus | Official evaluation datasets available in SacreBLEU tool. Recommended to use in `TEST_DATASETS`. Look up supported datasets and language pairs in `sacrebleu.dataset` python module.
 [Paracrawl](https://paracrawl.eu/) | paracrawl-mono | paracrawl8 | mono | Datasets that are crawled from the web. Only [mono datasets](https://paracrawl.eu/index.php/moredata) are used in this importer. Parallel corpus is available using opus importer.
 [News crawl](http://data.statmt.org/news-crawl) | news-crawl | news.2019 | mono | Some news monolingual datasets from [WMT21](https://www.statmt.org/wmt21/translation-task.html)
 [Common crawl](https://commoncrawl.org/) | commoncrawl | wmt16 | mono | Huge web crawl datasets. The links are posted on [WMT21](https://www.statmt.org/wmt21/translation-task.html)
 
-### Adding a new importer
-
-Just add a shell script to [corpus](pipeline/data/importers/corpus) or [mono]() which is named as `<prefix>.sh` 
-and accepts the same parameters as the other scripts from the same folder.
+You can also use the [find-corpus](pipeline/utils/find-corpus.py) tool to find all datasets for an importer and get them formatted to use in config.
 
+Example:
 
-## Evaluation datasets
-
-Only [SacreBLEU](https://github.com/mjpost/sacrebleu) datasets are supported at the moment.
+`python ./pipeline/utils/find-corpus.py en ru opus`
 
-Example:
-```
-TEST_DATASETS="wmt20 wmt18"
-```
+### Adding a new importer
 
-To see what datasets are available for a language pair (for example, `ru-en`) run:
-```
-sacrebleu --list -l ru-en
-```
+Just add a shell script to [corpus](pipeline/data/importers/corpus) or [mono]() which is named as `<prefix>.sh` 
+and accepts the same parameters as the other scripts from the same folder.
 
 ## Development
 
@@ -217,8 +211,15 @@ At the same time it is possible to run it all locally end to end or to do intera
 - Scripts should automatically inspect resources available for computation and utilize them to make things faster
   (number of cores, memory).
   
-## TODO
 
-1. Add [bicleaner](https://github.com/bitextor/bicleaner/)
-2. Add translation with an ensemble of teacher models
-3. Add more importers
+## References
+
+1. V. M. Sánchez-Cartagena, M. Bañón, S. Ortiz-Rojas and G. Ramírez-Sánchez, 
+"[Prompsit's submission to WMT 2018 Parallel Corpus Filtering shared task](http://www.statmt.org/wmt18/pdf/WMT116.pdf)",
+in *Proceedings of the Third Conference on Machine Translation, Volume 2: Shared Task Papers*.
+Brussels, Belgium: Association for Computational Linguistics, October 2018
+
+2. Gema Ramírez-Sánchez, Jaume Zaragoza-Bernabeu, Marta Bañón and Sergio Ortiz Rojas 
+"[Bifixer and Bicleaner: two open-source tools to clean your parallel data.](https://eamt2020.inesc-id.pt/proceedings-eamt2020.pdf#page=311)",
+in *Proceedings of the 22nd Annual Conference of the European Association for Machine Translation*.
+Lisboa, Portugal: European Association for Machine Translation, November 2020
diff --git a/config.sh b/config.sh
index f6f18e4c8..410f03939 100644
--- a/config.sh
+++ b/config.sh
@@ -16,6 +16,7 @@ MODELS_DIR=${MODELS_DIR:-${WORKDIR}/models}
 MARIAN=${MARIAN:-${WORKDIR}/3rd_party/marian-dev/build}
 CLEAN_TOOLS=${WORKDIR}/pipeline/clean/tools
 BIN=${WORKDIR}/bin
+CONDA_DIR=${HOME}/miniconda3
 TMP=/tmp
 
 EXPERIMENT=test
@@ -23,10 +24,10 @@ SRC=ru
 TRG=en
 
 # parallel corpus
-TRAIN_DATASETS="opus_OPUS-ParaCrawl/v7.1"
+TRAIN_DATASETS="opus_ada83/v1 opus_UN/v20090831 opus_GNOME/v1 opus_wikimedia/v20210402 opus_CCMatrix/v1 opus_Wikipedia/v1.0 opus_tico-19/v2020-10-28 opus_KDE4/v2 opus_OpenSubtitles/v2018 opus_MultiUN/v1 opus_GlobalVoices/v2018q4 opus_ELRC_2922/v1 opus_PHP/v1 opus_Tatoeba/v2021-03-10 opus_Tanzil/v1 opus_XLEnt/v1.1 opus_TildeMODEL/v2018 opus_Ubuntu/v14.10 opus_TED2013/v1.1 opus_infopankki/v1 opus_EUbookshop/v2 opus_ParaCrawl/v8 opus_Books/v1 opus_WMT-News/v2019 opus_bible-uedin/v1 opus_WikiMatrix/v1 opus_QED/v2.0a opus_CCAligned/v1 opus_TED2020/v1 opus_News-Commentary/v16 opus_UNPC/v1.0"\
+" mtdata_cc_aligned mtdata_airbaltic mtdata_GlobalVoices_2018Q4 mtdata_UNv1_test mtdata_neulab_tedtalksv1_train mtdata_neulab_tedtalksv1_dev mtdata_wmt13_commoncrawl mtdata_czechtourism mtdata_paracrawl_bonus mtdata_worldbank mtdata_wiki_titles_v1 mtdata_WikiMatrix_v1 mtdata_wmt18_news_commentary_v13 mtdata_wiki_titles_v2 mtdata_news_commentary_v14 mtdata_UNv1_dev mtdata_neulab_tedtalksv1_test mtdata_JW300"
 DEVTEST_DATASETS="mtdata_newstest2019_ruen mtdata_newstest2017_ruen mtdata_newstest2015_ruen mtdata_newstest2014_ruen"
-# sacrebleu
-TEST_DATASETS="wmt20 wmt18 wmt16 wmt13"
+TEST_DATASETS="sacrebleu_wmt20 sacrebleu_wmt18 sacrebleu_wmt16 sacrebleu_wmt13"
 # monolingual datasets (ex. paracrawl-mono_paracrawl8, commoncrawl_wmt16, news-crawl_news.2020)
 # to be translated by the teacher model
 MONO_DATASETS_SRC="news-crawl_news.2020 news-crawl_news.2019 news-crawl_news.2018 news-crawl_news.2017 "\
@@ -34,10 +35,11 @@ MONO_DATASETS_SRC="news-crawl_news.2020 news-crawl_news.2019 news-crawl_news.201
 "news-crawl_news.2011"
 # to be translated by the shallow s2s model to augment teacher corpus with back-translations
 # leave empty to skip augmentation step (high resource languages)
-MONO_DATASETS_TRG="news-crawl_news.2020"
+MONO_DATASETS_TRG=""
 # limits per downloaded dataset
 MONO_MAX_SENTENCES_SRC=100000000
 MONO_MAX_SENTENCES_TRG=20000000
+BICLEANER_THRESHOLD=0.5
 
 
 # marian --devices parameter for GPUs to use, for example 0 1 2 3
diff --git a/pipeline/alignment/generate-alignment-and-shortlist.sh b/pipeline/alignment/generate-alignment-and-shortlist.sh
index 3ec10ade3..dcb894959 100644
--- a/pipeline/alignment/generate-alignment-and-shortlist.sh
+++ b/pipeline/alignment/generate-alignment-and-shortlist.sh
@@ -19,17 +19,26 @@ corpus_prefix=$1
 vocab_path=$2
 output_dir=$3
 
+if [ -e "${output_dir}/corpus.aln.gz" ] && [ -e "${output_dir}/lex.s2t.pruned.gz" ]; then
+  echo "### Alignments and shortlist already exist, skipping"
+  echo "###### Done: Generating alignments and shortlist"
+  exit 0
+fi
+
+
 test -e "${BIN}/atools" || exit 1
 test -e "${BIN}/extract_lex" || exit 1
 test -e "${BIN}/fast_align" || exit 1
 
 mkdir -p "${output_dir}"
-dir="${TMP}/alignment"
+dir="${output_dir}/tmp"
 mkdir -p "${dir}"
 
 corpus_src="${corpus_prefix}.${SRC}.gz"
 corpus_trg="${corpus_prefix}.${TRG}.gz"
 
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+
 echo "### Subword segmentation with SentencePiece"
 test -s "${dir}/corpus.spm.${SRC}.gz" ||
   pigz -dc "${corpus_src}" |
@@ -41,42 +50,35 @@ test -s "${dir}/corpus.spm.${TRG}.gz" ||
   pigz >"${dir}/corpus.spm.${TRG}.gz"
 
 echo "### Creating merged corpus"
-test -s "${dir}/corpus.aln.gz" || test -s "${dir}/corpus" ||
+test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/corpus" ||
   paste <(pigz -dc "${dir}/corpus.spm.${SRC}.gz") <(pigz -dc "${dir}/corpus.spm.${TRG}.gz") |
   sed 's/\t/ ||| /' >"${dir}/corpus"
 
 echo "### Training alignments"
-test -s "${dir}/corpus.aln.gz" ||
-  test -s "${dir}/align.s2t.gz" ||
+test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/align.s2t.gz" ||
   "${BIN}/fast_align" -vod -i "${dir}/corpus" |
   pigz >"${dir}/align.s2t.gz"
-test -s "${dir}/corpus.aln.gz" ||
-  test -s "${dir}/align.t2s.gz" ||
+test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/align.t2s.gz" ||
   "${BIN}/fast_align" -vodr -i "${dir}/corpus" |
   pigz >"${dir}/align.t2s.gz"
-test -s "${dir}/corpus" && rm "${dir}/corpus"
 
 echo "### Symmetrizing alignments"
-test -s "${dir}/corpus.aln.gz" || pigz -d "${dir}/align.s2t.gz" "${dir}/align.t2s.gz"
-test -s "${dir}/corpus.aln.gz" ||
+test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/align.t2s" ||
+  pigz -d "${dir}/align.s2t.gz" "${dir}/align.t2s.gz"
+test -s "${output_dir}/corpus.aln.gz" ||
   "${BIN}/atools" -i "${dir}/align.s2t" -j "${dir}/align.t2s" -c grow-diag-final-and |
-  pigz >"${dir}/corpus.aln.gz"
-test -s "${dir}/align.s2t" && rm "${dir}"/align.???
+  pigz >"${output_dir}/corpus.aln.gz"
 
 echo "### Creating shortlist"
 test -s "${dir}/lex.s2t.gz" ||
   "${BIN}/extract_lex" \
     "${dir}/corpus.spm.${TRG}.gz" \
     "${dir}/corpus.spm.${SRC}.gz" \
-    "${dir}/corpus.aln.gz" \
+    "${output_dir}/corpus.aln.gz" \
     "${dir}/lex.s2t" \
     "${dir}/lex.t2s"
 test -s "${dir}/lex.s2t" && pigz "${dir}/lex.s2t"
 
-echo "### Cleaning"
-test -s "${output_dir}/corpus.aln.gz" || rsync "${dir}/corpus.aln.gz" "${output_dir}/corpus.aln.gz"
-test -e "${dir}/lex.t2s" && rm "${dir}/lex.t2s"
-
 echo "### Shortlist pruning"
 test -s "${dir}/vocab.txt" ||
   "${MARIAN}/spm_export_vocab" --model="${vocab_path}" --output="${dir}/vocab.txt"
diff --git a/pipeline/clean/bicleaner.sh b/pipeline/clean/bicleaner.sh
new file mode 100644
index 000000000..1c6a4dafd
--- /dev/null
+++ b/pipeline/clean/bicleaner.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+##
+# Cleans corpus using bicleaner-ai or bicleaner
+#
+# Usage:
+#   bash bicleaner.sh corpus_prefix output_prefix
+#
+
+set -x
+set -euo pipefail
+
+echo "###### Bicleaner filtering"
+
+test -v SRC
+test -v TRG
+test -v CLEAN_TOOLS
+test -v BICLEANER_THRESHOLD
+
+corpus_prefix=$1
+output_prefix=$2
+
+output_dir=$(dirname "${output_prefix}")
+tmp_dir="${output_dir}/tmp"
+mkdir -p "${tmp_dir}"
+
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+
+# bicleaner and bicleaner-ai have conflicting dependencies. installing on demand
+if [ ! -e "${output_prefix}.${SRC}.gz" ]; then
+  if bash "${CLEAN_TOOLS}/download-bicleaner-pack.sh" "${tmp_dir}" "bicleaner-ai"; then
+    echo "### Using bicleaner-ai"
+    pip install bicleaner-ai==1.0.1
+    cmd=bicleaner-ai-classify
+  elif bash "${CLEAN_TOOLS}/download-bicleaner-pack.sh" "${tmp_dir}" "bicleaner"; then
+    echo "### Using bicleaner"
+    pip install bicleaner==0.14
+    cmd=bicleaner-classify
+  else
+    echo "### Bicleaner language pack is not supported, skipping."
+    cp "${corpus_prefix}.${SRC}.gz" "${output_prefix}.${SRC}.gz"
+    cp "${corpus_prefix}.${TRG}.gz" "${output_prefix}.${TRG}.gz"
+    exit 0
+  fi
+fi
+
+echo "### Classifying and filtering"
+test -s "${output_prefix}.${SRC}.gz" || test -s "${tmp_dir}/best.gz" ||
+  paste <(pigz -dc "${corpus_prefix}.${SRC}.gz") <(pigz -dc "${corpus_prefix}.${TRG}.gz") |
+  ${cmd} --scol 1 --tcol 1 - - "${tmp_dir}"/*.yaml |
+  awk -v threshold=${BICLEANER_THRESHOLD} '{if ($3>threshold) {print $0}}' |
+  pigz >"${tmp_dir}/best.gz"
+
+echo "### Writing output corpus"
+test -s "${output_prefix}.${SRC}.gz" || pigz -dc "${tmp_dir}/best.gz" | cut -f1 | pigz >"${output_prefix}.${SRC}.gz"
+test -s "${output_prefix}.${TRG}.gz" || pigz -dc "${tmp_dir}/best.gz" | cut -f2 | pigz >"${output_prefix}.${TRG}.gz"
+
+echo "### Cleaning files"
+rm -rf "${tmp_dir}"
+
+echo "###### Done: Bicleaner filtering"
diff --git a/pipeline/clean/ce-filter.sh b/pipeline/clean/ce-filter.sh
index ad43ac2db..27a3e8f62 100644
--- a/pipeline/clean/ce-filter.sh
+++ b/pipeline/clean/ce-filter.sh
@@ -21,15 +21,23 @@ model_dir=$1
 corpus_prefix=$2
 output_prefix=$3
 
+if [ -e "${output_prefix}.${TRG}.gz" ]; then
+  echo "### Dataset already exists, skipping"
+  echo "###### Done: Cross entropy filtering"
+  exit 0
+fi
+
 # Part of the data to be removed (0.05 is 5%)
 remove=0.05
 model="${model_dir}/model.npz.best-ce-mean-words.npz"
 vocab="${model_dir}/vocab.spm"
-dir="${TMP}/scored"
 output_dir=$(dirname "${output_prefix}")
+dir="${output_dir}/scored"
 mkdir -p "${output_dir}"
 mkdir -p "${dir}"
 
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+
 echo "### Decompressing corpus"
 test -s "${dir}/corpus.${TRG}" || pigz -dc "${corpus_prefix}.${TRG}.gz" >"${dir}/corpus.${TRG}"
 test -s "${dir}/corpus.${SRC}" || pigz -dc "${corpus_prefix}.${SRC}.gz" >"${dir}/corpus.${SRC}"
diff --git a/pipeline/clean/clean-corpus.sh b/pipeline/clean/clean-corpus.sh
index 0c01e7298..901447ec5 100755
--- a/pipeline/clean/clean-corpus.sh
+++ b/pipeline/clean/clean-corpus.sh
@@ -19,12 +19,16 @@ test -v CLEAN_TOOLS
 data=$1
 output=$2
 
-mkdir -p "$(dirname "${output}")"
+dir="$(dirname "${output}")"
+tmp="${dir}/tmp"
+mkdir -p "${tmp}"
 
 # Check if files exist
 test -s "${data}.${SRC}.gz" || exit 1
 test -s "${data}.${TRG}.gz" || exit 1
 
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+
 echo "### CLeaning ${data}"
 
 ######################################################################
@@ -41,7 +45,7 @@ done
 echo "### Deduplication"
 test -s "${output}.${SRC}.gz" || test -s "${output}.${SRC}${TRG}.nrm.uniq.gz" ||
   paste <(pigz -dc "${output}.${SRC}.nrm.gz") <(pigz -dc "${output}.${TRG}.nrm.gz") |
-  LC_ALL=C sort -S 10G |
+  LC_ALL=C sort -S 10G -T "${tmp}" |
   uniq |
   pigz >"${output}.${SRC}${TRG}.nrm.uniq.gz"
 
@@ -58,7 +62,8 @@ test -s "${output}.${SRC}.gz" || test -s "${output}.${SRC}${TRG}.rule-based.gz"
 echo "### Language identification"
 test -s "${output}.${SRC}.gz" || test -s "${output}.${SRC}${TRG}.langid.gz" ||
   pigz -dc "${output}.${SRC}${TRG}.rule-based.gz" |
-  parallel --no-notice --pipe -k -j "$(nproc)" --block 50M \
+  # memory intensive: use a quarter of the cores but at least one job (parallel reads -j 0 as "as many as possible")
+  parallel --no-notice --pipe -k -j "$(( $(nproc) / 4 > 0 ? $(nproc) / 4 : 1 ))" --block 50M \
     "python3 -Wi ${CLEAN_TOOLS}/langid_fasttext.py -f 1 | python3 -Wi ${CLEAN_TOOLS}/langid_fasttext.py -f 1" |
   grep -P "^${SRC}\t${TRG}\t" |
   cut -f3,4 |
@@ -84,6 +89,7 @@ test -s "${output}.${TRG}.gz" || exit 1
 
 echo "### Remove ${data} from intermediate steps"
 rm -f "${output}".*.nrm.gz "${output}".*.nrm.uniq.gz "${output}".*.langid.gz "${output}".*.rule-based.gz
+rm -rf "${tmp}"
 
 echo "### Clean data is written to  ${output}"
 
diff --git a/pipeline/clean/clean-mono.sh b/pipeline/clean/clean-mono.sh
index dbb116515..630930a24 100755
--- a/pipeline/clean/clean-mono.sh
+++ b/pipeline/clean/clean-mono.sh
@@ -16,6 +16,7 @@ input=$2
 output=$3
 
 test -v CLEAN_TOOLS
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
 
 echo "### CLeaning ${input}"
 
@@ -42,7 +43,8 @@ test -s "${output}.${lang}.gz" || test -s "${output}.${lang}.nrm.uniq.gz" ||
 echo "### Language identification"
 test -s "${output}.${lang}.gz" || test -s "${output}.${lang}.langid.gz" ||
   pigz -dc "${output}.${lang}.nrm.uniq.gz" |
-  parallel --no-notice --pipe -k -j "$(nproc)" --block 50M "python ${CLEAN_TOOLS}/langid_fasttext.py" |
+  # memory intensive: use a quarter of the cores but at least one job (parallel reads -j 0 as "as many as possible")
+  parallel --no-notice --pipe -k -j "$(( $(nproc) / 4 > 0 ? $(nproc) / 4 : 1 ))" --block 50M "python ${CLEAN_TOOLS}/langid_fasttext.py" |
   grep -P "^${lang}\t" | cut -f2 |
   pigz >"${output}.${lang}.langid.gz"
 
diff --git a/pipeline/clean/tools/download-bicleaner-pack.sh b/pipeline/clean/tools/download-bicleaner-pack.sh
new file mode 100644
index 000000000..17fb7190a
--- /dev/null
+++ b/pipeline/clean/tools/download-bicleaner-pack.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+##
+# Downloads bicleaner-ai or bicleaner language pack
+#
+# Usage:
+#   bash download-bicleaner-pack.sh download_path type
+#
+
+set -x
+# don't use pipefail here because of wget check
+set -eu
+
+test -v SRC
+test -v TRG
+
+download_path=$1
+type=$2
+
+# Succeeds when the server answers "404 Not Found" for the URL passed as $1
+invalid_url() {
+  wget -S --spider -o - "$1" | grep -q '404 Not Found'
+}
+
+if [ "${type}" == 'bicleaner-ai' ]; then
+  url="https://github.com/bitextor/bicleaner-ai-data/releases/latest/download"
+  prefix="full-"
+  extension="tgz"
+elif [ "${type}" == 'bicleaner' ]; then
+  url="https://github.com/bitextor/bicleaner-data/releases/latest/download"
+  prefix=""
+  extension="tar.gz"
+else
+  echo "Unsupported type: ${type}"
+  exit 1
+fi
+
+echo "### Downloading ${type} language pack ${url}"
+
+lang1=$SRC
+lang2=$TRG
+if invalid_url "${url}/${prefix}${SRC}-${TRG}.${extension}"; then
+  echo "### ${SRC}-${TRG} language pack does not exist, trying ${TRG}-${SRC}..."
+  if invalid_url "${url}/${prefix}${TRG}-${SRC}.${extension}"; then
+    echo "### ${TRG}-${SRC} language pack does not exist"
+    exit 1
+  fi
+  lang1=$TRG
+  lang2=$SRC
+fi
+
+# the glob may match several files, which `test -s` cannot take; check the first match only
+yaml_files=("${download_path}"/*.yaml)
+if ! test -s "${yaml_files[0]}"; then
+  wget -P "${download_path}" "${url}/${prefix}${lang1}-${lang2}.${extension}"
+  tar xvf "${download_path}/${prefix}${lang1}-${lang2}.${extension}" -C "${download_path}" --no-same-owner
+  mv "${download_path}/${lang1}-${lang2}"/* "${download_path}/"
+  rm "${download_path}/${prefix}${lang1}-${lang2}.${extension}"
+fi
+
+echo "### ${type} language pack ${url} is downloaded"
diff --git a/pipeline/data/download-eval.sh b/pipeline/data/download-eval.sh
new file mode 100644
index 000000000..73db2f26b
--- /dev/null
+++ b/pipeline/data/download-eval.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+##
+# Downloads evaluation datasets and keeps a decompressed copy next to each archive
+#
+# Usage:
+#   bash download-eval.sh dir [datasets...]
+#
+
+set -x
+set -euo pipefail
+
+echo "###### Downloading evaluation datasets"
+
+test -v WORKDIR
+test -v SRC
+test -v TRG
+
+dir=$1
+
+for dataset in "${@:2}"; do
+  # derive the file prefix from the dataset id by substituting "_" for unwanted characters
+  name="${dataset//[^A-Za-z0-9_- ]/_}"
+  bash "${WORKDIR}/pipeline/data/download-corpus.sh" "${dir}/${name}" "${dataset}"
+  # -k keeps the .gz archive alongside the decompressed file
+  test -e "${dir}/${name}.${SRC}" || pigz -dk "${dir}/${name}.${SRC}.gz"
+  test -e "${dir}/${name}.${TRG}" || pigz -dk "${dir}/${name}.${TRG}.gz"
+done
+
+echo "###### Done: Downloading evaluation datasets"
diff --git a/pipeline/data/download-mono.sh b/pipeline/data/download-mono.sh
index 61353bd1d..18c4fb5a6 100644
--- a/pipeline/data/download-mono.sh
+++ b/pipeline/data/download-mono.sh
@@ -47,7 +47,7 @@ if [ ! -e "${file_name}" ]; then
     rm "${source_prefix}"*
   done
 
-  pigz -dc "${dir}"/*."${lang}".gz | pigz >"${file_name}"
+  pigz -dc "${dir}"/*."${lang}".gz | shuf -n "${max_sent}" | pigz >"${file_name}"
 
 fi
 
diff --git a/pipeline/data/importers/corpus/mtdata.sh b/pipeline/data/importers/corpus/mtdata.sh
index 8ac9b2379..bb1980e7b 100644
--- a/pipeline/data/importers/corpus/mtdata.sh
+++ b/pipeline/data/importers/corpus/mtdata.sh
@@ -16,18 +16,25 @@ trg=$2
 dir=$3
 dataset=$4
 
+test -v WORKDIR
+
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+
 src_iso=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${src}', fail_error=True))")
 trg_iso=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${trg}', fail_error=True))")
 
-mtdata get -l "${src}-${trg}" -tr "${dataset}" -o "${dir}"
+if [ ! -e "${dir}/${dataset}.${TRG}" ]; then
+  mtdata get -l "${src}-${trg}" -tr "${dataset}" -o "${dir}"
+
+  for f in "${dir}"/train-parts/*."${src_iso}"; do
+    mv "${f}" "${dir}/${dataset}.${SRC}"
+  done
+  for f in "${dir}"/train-parts/*."${trg_iso}"; do
+    mv "${f}" "${dir}/${dataset}.${TRG}"
+  done
 
-for f in "${dir}"/train-parts/*."${src_iso}"; do
-  mv "${f}" "${dir}/${dataset}.${SRC}"
-done
-for f in "${dir}"/train-parts/*."${trg_iso}"; do
-  mv "${f}" "${dir}/${dataset}.${TRG}"
-done
+  rm -rf "${dir}/train-parts"
+fi
 
-rm -rf "${dir}/train-parts"
 
 echo "###### Done: Downloading mtdata corpus"
diff --git a/pipeline/data/importers/corpus/opus.sh b/pipeline/data/importers/corpus/opus.sh
index 649e128be..02f4cd407 100644
--- a/pipeline/data/importers/corpus/opus.sh
+++ b/pipeline/data/importers/corpus/opus.sh
@@ -16,14 +16,20 @@ trg=$2
 dir=$3
 dataset=$4
 
-mkdir -p "${dir}/tmp"
+name=${dataset%%/*}
 
-dataset_path=${dir}/tmp/${dataset%\/*}.txt.zip
-test -s "${dataset_path}" ||
-  wget -O "${dataset_path}" "https://object.pouta.csc.fi/${dataset}/moses/${src}-${trg}.txt.zip" ||
-  wget -O "${dataset_path}" "https://object.pouta.csc.fi/${dataset}/moses/${trg}-${src}.txt.zip"
-unzip "${dataset_path}" -d "${dir}"
+if [ ! -s "${dir}/${name}.${src}-${trg}.${trg}" ] && [ ! -s "${dir}/${name}.${trg}-${src}.${trg}" ]; then
+  mkdir -p "${dir}/opus"
 
-rm -rf "${dir}/tmp"
+  name_and_version="${dataset//[^A-Za-z0-9_- ]/_}"
+  archive_path="${dir}/opus/${name_and_version}.txt.zip"
+
+  test -s "${archive_path}" ||
+    wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" ||
+    wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip"
+  unzip -o "${archive_path}" -d "${dir}"
+
+  rm -rf "${dir}/opus"
+fi
 
 echo "###### Done: Downloading opus corpus"
diff --git a/pipeline/data/importers/corpus/sacrebleu.sh b/pipeline/data/importers/corpus/sacrebleu.sh
new file mode 100644
index 000000000..d270475fd
--- /dev/null
+++ b/pipeline/data/importers/corpus/sacrebleu.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+##
+# Downloads corpus using sacrebleu
+#
+# Usage:
+#   bash sacrebleu.sh source target dir dataset
+#
+
+set -x
+set -euo pipefail
+
+echo "###### Downloading sacrebleu corpus"
+
+src=$1
+trg=$2
+dir=$3
+dataset=$4
+
+test -v WORKDIR
+
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+name="${dataset//[^A-Za-z0-9_- ]/_}"
+
+test -s "${dir}/${name}.${src}" ||
+sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo src > "${dir}/${name}.${src}"
+
+test -s "${dir}/${name}.${trg}" ||
+sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo ref > "${dir}/${name}.${trg}"
+
+echo "###### Done: Downloading sacrebleu corpus"
diff --git a/pipeline/quantize/eval.sh b/pipeline/quantize/eval.sh
index 5eb327242..ecf6f4f48 100644
--- a/pipeline/quantize/eval.sh
+++ b/pipeline/quantize/eval.sh
@@ -15,20 +15,25 @@ test -v MARIAN
 test -v TEST_DATASETS
 test -v SRC
 test -v TRG
+test -v WORKDIR
 
 model_dir=$1
 shortlist=$2
+datasets_dir=$3
 
 eval_dir="${model_dir}/eval"
 vocab="${model_dir}/vocab.spm"
 
 mkdir -p "${eval_dir}"
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
 
 echo "### Evaluating a model ${model_dir} on CPU"
-for prefix in ${TEST_DATASETS}; do
+for src_path in "${datasets_dir}"/*."${SRC}"; do
+  prefix=$(basename "${src_path}" ".${SRC}")
   echo "### Evaluating ${prefix} ${SRC}-${TRG}"
-  sacrebleu -t "${prefix}" -l "${SRC}-${TRG}" --echo src |
-    tee "${eval_dir}/${prefix}.${SRC}" |
+
+  test -s "${eval_dir}/${prefix}.${TRG}.bleu" ||
+    tee "${eval_dir}/${prefix}.${SRC}" < "${src_path}" |
     "${MARIAN}"/marian-decoder \
       -m "${model_dir}/model.intgemm.alphas.bin" \
       -v "${vocab}" "${vocab}" \
@@ -39,7 +44,7 @@ for prefix in ${TEST_DATASETS}; do
       --shortlist "${shortlist}" false \
       --int8shiftAlphaAll |
     tee "${eval_dir}/${prefix}.${TRG}" |
-    sacrebleu -d -t "${prefix}" -l "${SRC}-${TRG}" |
+    sacrebleu -d -l "${SRC}-${TRG}" "${datasets_dir}/${prefix}.${TRG}" |
     tee "${eval_dir}/${prefix}.${TRG}.bleu"
 
   test -e "${eval_dir}/${prefix}.${TRG}.bleu" || exit 1
diff --git a/pipeline/quantize/quantize.sh b/pipeline/quantize/quantize.sh
index 81ba0f9cc..f7e57d03c 100644
--- a/pipeline/quantize/quantize.sh
+++ b/pipeline/quantize/quantize.sh
@@ -22,6 +22,15 @@ shortlist=$2
 devtest_src=$3
 output_dir=$4
 
+res_model="${output_dir}/model.intgemm.alphas.bin"
+# -s rather than -e, matching the guard before marian-conv below: an empty file is not a finished model
+if [ -s "${res_model}" ]; then
+  echo "### Converted model already exists, skipping"
+  echo "###### Done: Quantizing a model"
+  exit 0
+fi
+
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
 mkdir -p "${output_dir}"
 
 model="${model_dir}/model.npz.best-bleu-detok.npz"
@@ -52,7 +61,6 @@ test -s "${output_dir}/model.alphas.npz" ||
     "${output_dir}/model.alphas.npz"
 
 echo "### Converting"
-res_model="${output_dir}/model.intgemm.alphas.bin"
 test -s "${res_model}" ||
   "${MARIAN}"/marian-conv \
     -f "${output_dir}/model.alphas.npz" \
diff --git a/pipeline/setup/activate-python.sh b/pipeline/setup/activate-python.sh
new file mode 100644
index 000000000..f250eeea1
--- /dev/null
+++ b/pipeline/setup/activate-python.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+##
+# Activates python conda environment
+#
+# Usage:
+#   source ./activate-python.sh
+#
+# Requires CONDA_DIR. The -e and -u shell options are switched off while
+# conda's shell code runs and switched back on before returning, so this
+# file is meant to be sourced from scripts that already run under `set -eu`.
+#
+
+test -v CONDA_DIR
+
+set +eu
+PATH="${CONDA_DIR}/bin:${PATH}"
+source "${CONDA_DIR}/etc/profile.d/conda.sh"
+# Failures are not fatal while -e is off, so remember the activation status
+# and check it once -e is back on; otherwise a missing conda.sh or
+# environment would go unnoticed by the calling script.
+conda activate bergamot-training-env
+conda_activate_status=$?
+set -eu
+test "${conda_activate_status}" -eq 0
diff --git a/pipeline/setup/install-all.sh b/pipeline/setup/install-all.sh
index 1c341a16c..9575e2543 100644
--- a/pipeline/setup/install-all.sh
+++ b/pipeline/setup/install-all.sh
@@ -20,7 +20,9 @@ echo "### Installing extra dependencies"
 sudo apt-get install -y pigz htop wget unzip parallel
 
 bash "${WORKDIR}/pipeline/setup/compile-marian.sh"
-bash "${WORKDIR}/pipeline/setup/install-python.sh"
 bash "${WORKDIR}/pipeline/setup/compile-alignment.sh"
+bash "${WORKDIR}/pipeline/setup/install-python.sh"
+bash "${WORKDIR}/pipeline/setup/install-kenlm.sh"
+bash "${WORKDIR}/pipeline/setup/install-python-packages.sh"
 
 echo "######### Done: Installing all dependencies"
diff --git a/pipeline/setup/install-kenlm.sh b/pipeline/setup/install-kenlm.sh
new file mode 100644
index 000000000..98d80567e
--- /dev/null
+++ b/pipeline/setup/install-kenlm.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+##
+# Installs and compiles kenlm
+#
+# Usage:
+#   bash install-kenlm.sh
+#
+# Environment:
+#   WORKDIR - kenlm sources are read from ${WORKDIR}/3rd_party/kenlm
+#   BIN     - the cmake install prefix is ${BIN}/kenlm
+#
+
+set -x
+set -euo pipefail
+
+echo "###### Installing kenlm"
+test -v WORKDIR
+test -v BIN
+
+cd "${WORKDIR}/3rd_party/kenlm"
+
+if [ ! -e "${BIN}/kenlm" ]; then
+  mkdir -p build
+  cd build
+  # -p: do not fail when ${BIN} itself does not exist yet
+  mkdir -p "${BIN}/kenlm"
+  # The install prefix doubles as the "already built" marker tested above.
+  # Remove it when configure/build/install fails, so that the next run
+  # retries instead of skipping a build that never finished.
+  trap 'rm -rf "${BIN:?}/kenlm"' ERR
+  cmake .. -DKENLM_MAX_ORDER=7 -DCMAKE_INSTALL_PREFIX:PATH="${BIN}/kenlm"
+  make -j all install
+  trap - ERR
+  cd ..
+fi
+
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+# NOTE(review): --install-option is a legacy pip flag; confirm that the pip
+# version in the conda environment still accepts it.
+python -m pip install . --install-option="--max_order 7"
+cd "${WORKDIR}"
+
+echo "###### Done: Installing kenlm"
diff --git a/pipeline/setup/install-python-packages.sh b/pipeline/setup/install-python-packages.sh
new file mode 100644
index 000000000..6792b847f
--- /dev/null
+++ b/pipeline/setup/install-python-packages.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+##
+# Installs the Python packages listed in pipeline/setup/requirements.txt
+# with pip, after sourcing pipeline/setup/activate-python.sh
+# Usage:
+#   bash install-python-packages.sh
+# Reads WORKDIR to locate both the activation script and requirements.txt
+
+set -x
+set -euo pipefail
+
+echo "###### Installing Python packages"
+
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+pip install -r "${WORKDIR}/pipeline/setup/requirements.txt"
+
+
+echo "###### Done: Installing Python packages"
diff --git a/pipeline/setup/install-python.sh b/pipeline/setup/install-python.sh
index 03f4dd654..26351f412 100644
--- a/pipeline/setup/install-python.sh
+++ b/pipeline/setup/install-python.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 ##
-# Install python and packages
+# Create python conda environment
 #
 # Usage:
 #   bash install-python.sh
@@ -11,17 +11,13 @@ set -euo pipefail
 
 echo "###### Installing Python"
 
-echo "### Installing Python libraries ---"
-if [ ! -e /root/miniconda3/bin/conda ]; then
+if [ ! -e "${CONDA_DIR}/bin/conda" ]; then
   wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-  bash ./Miniconda3-latest-Linux-x86_64.sh -b -u
+  bash ./Miniconda3-latest-Linux-x86_64.sh -b -u -p "${CONDA_DIR}"
   rm -f Miniconda3-latest-Linux-x86_64.sh
 fi
-export PATH="/root/miniconda3/bin:${PATH}"
+export PATH="${CONDA_DIR}/bin:${PATH}"
 conda create -y --name bergamot-training-env python=3.8
-source /root/miniconda3/etc/profile.d/conda.sh
-conda activate bergamot-training-env
-pip install -r "${WORKDIR}/pipeline/setup/requirements.txt"
 
 
 echo "###### Done: Installing Python"
diff --git a/pipeline/train/eval.sh b/pipeline/train/eval.sh
index 5bcf3c76e..10edfcd19 100644
--- a/pipeline/train/eval.sh
+++ b/pipeline/train/eval.sh
@@ -15,10 +15,13 @@ test -v GPUS
 test -v MARIAN
 test -v WORKSPACE
 test -v TEST_DATASETS
+test -v WORKDIR
 
 model_dir=$1
-src="${2:-${SRC}}"
-trg="${3:-${TRG}}"
+datasets_dir=$2
+src="${3:-${SRC}}"
+trg="${4:-${TRG}}"
+
 
 config="${model_dir}/model.npz.best-bleu-detok.npz.decoder.yml"
 eval_dir="${model_dir}/eval"
@@ -26,13 +29,15 @@ eval_dir="${model_dir}/eval"
 echo "### Checking model files"
 test -e "${config}" || exit 1
 mkdir -p "${eval_dir}"
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
 
 echo "### Evaluating a model ${model_dir}"
-for prefix in ${TEST_DATASETS}; do
+for src_path in "${datasets_dir}"/*."${src}"; do
+  prefix=$(basename "${src_path}" ".${src}")
   echo "### Evaluating ${prefix} ${src}-${trg}"
+
   test -s "${eval_dir}/${prefix}.${trg}.bleu" ||
-  sacrebleu -t "${prefix}" -l "${src}-${trg}" --echo src |
-    tee "${eval_dir}/${prefix}.${src}" |
+    tee "${eval_dir}/${prefix}.${src}" < "${src_path}" |
     "${MARIAN}"/marian-decoder \
       -c "${config}" \
       -w "${WORKSPACE}" \
@@ -41,7 +46,7 @@ for prefix in ${TEST_DATASETS}; do
       --log "${eval_dir}/${prefix}.log" \
       -d ${GPUS} |
     tee "${eval_dir}/${prefix}.${trg}" |
-    sacrebleu -d -t "${prefix}" -l "${src}-${trg}" |
+    sacrebleu -d -l "${src}-${trg}" "${datasets_dir}/${prefix}.${trg}"  |
     tee "${eval_dir}/${prefix}.${trg}.bleu"
 
   test -e "${eval_dir}/${prefix}.${trg}.bleu" || exit 1
diff --git a/pipeline/train/finetune-student.sh b/pipeline/train/finetune-student.sh
index af062838d..531528218 100644
--- a/pipeline/train/finetune-student.sh
+++ b/pipeline/train/finetune-student.sh
@@ -21,20 +21,21 @@ test -v SRC
 test -v TRG
 test -v WORKDIR
 
-mkdir -p "${dir}"
-cp "${student}/model.npz.best-bleu-detok.npz" "${dir}/model.npz"
-cp "${student}/vocab.spm" "${dir}/"
-
-bash "${WORKDIR}/pipeline/train/train.sh" \
-  "${WORKDIR}/pipeline/train/configs/model/student.tiny11.yml" \
-  "${WORKDIR}/pipeline/train/configs/training/student.finetune.yml" \
-  "${SRC}" \
-  "${TRG}" \
-  "${corpus}" \
-  "${devset}" \
-  "${dir}" \
-  --guided-alignment "${alignment}/corpus.aln.gz"
-
+if [ ! -s "${dir}/model.npz.best-bleu-detok.npz" ]; then
+  mkdir -p "${dir}"
+  cp "${student}/model.npz.best-bleu-detok.npz" "${dir}/model.npz"
+  cp "${student}/vocab.spm" "${dir}/"
+
+  bash "${WORKDIR}/pipeline/train/train.sh" \
+    "${WORKDIR}/pipeline/train/configs/model/student.tiny11.yml" \
+    "${WORKDIR}/pipeline/train/configs/training/student.finetune.yml" \
+    "${SRC}" \
+    "${TRG}" \
+    "${corpus}" \
+    "${devset}" \
+    "${dir}" \
+    --guided-alignment "${alignment}/corpus.aln.gz"
+fi
 
 echo "###### Done: Finetuning the student model"
 
diff --git a/pipeline/train/train-s2s.sh b/pipeline/train/train-s2s.sh
index d76bc67d8..bec58c953 100644
--- a/pipeline/train/train-s2s.sh
+++ b/pipeline/train/train-s2s.sh
@@ -19,6 +19,7 @@ trg=${5:-${TRG}}
 
 test -v WORKDIR
 
+test -s "${dir}/model.npz.best-bleu-detok.npz" ||
 bash "${WORKDIR}/pipeline/train/train.sh" \
   "${WORKDIR}/pipeline/train/configs/model/s2s.yml" \
   "${WORKDIR}/pipeline/train/configs/training/s2s.train.yml" \
diff --git a/pipeline/train/train-student.sh b/pipeline/train/train-student.sh
index dc9f0fa36..0a20502eb 100644
--- a/pipeline/train/train-student.sh
+++ b/pipeline/train/train-student.sh
@@ -25,6 +25,7 @@ mkdir -p "${dir}"
 # use teacher's vocab, otherwise alignments won't work
 cp   "${teacher}/vocab.spm" "${dir}/"
 
+test -s "${dir}/model.npz.best-bleu-detok.npz" ||
 bash "${WORKDIR}/pipeline/train/train.sh" \
   "${WORKDIR}/pipeline/train/configs/model/student.tiny11.yml" \
   "${WORKDIR}/pipeline/train/configs/training/student.train.yml" \
diff --git a/pipeline/train/train-teacher-ensemble.sh b/pipeline/train/train-teacher-ensemble.sh
index f1abf5fab..8b7404f12 100644
--- a/pipeline/train/train-teacher-ensemble.sh
+++ b/pipeline/train/train-teacher-ensemble.sh
@@ -18,6 +18,7 @@ n=${4}
 
 # This can be parallelized across multiple machines
 for i in $(seq 1 ${n}); do
+  test -s "${dir}${i}/model.npz.best-bleu-detok.npz" ||
   bash "${WORKDIR}/pipeline/train/train.sh" \
     "${WORKDIR}/pipeline/train/configs/model/teacher.transformer.yml" \
     "${WORKDIR}/pipeline/train/configs/training/teacher.transformer-ens.train.yml" \
diff --git a/pipeline/train/train-teacher.sh b/pipeline/train/train-teacher.sh
index f51540f55..32123935e 100644
--- a/pipeline/train/train-teacher.sh
+++ b/pipeline/train/train-teacher.sh
@@ -19,6 +19,7 @@ test -v SRC
 test -v TRG
 test -v WORKDIR
 
+test -s "${dir}/model.npz.best-bleu-detok.npz" ||
 bash "${WORKDIR}/pipeline/train/train.sh" \
   "${WORKDIR}/pipeline/train/configs/model/teacher.transformer.yml" \
   "${WORKDIR}/pipeline/train/configs/training/teacher.transformer.train.yml" \
diff --git a/pipeline/train/train.sh b/pipeline/train/train.sh
index 76a7ac666..9a44a3c3f 100644
--- a/pipeline/train/train.sh
+++ b/pipeline/train/train.sh
@@ -25,15 +25,13 @@ model_dir=$7
 test -v GPUS
 test -v MARIAN
 test -v WORKSPACE
-test -v TMP
 
 test -e "${train_set_prefix}.${src}.gz" || exit 1
 test -e "${train_set_prefix}.${trg}.gz" || exit 1
 test -e "${valid_set_prefix}.${src}.gz" || exit 1
 test -e "${valid_set_prefix}.${trg}.gz" || exit 1
 
-mkdir -p tmp
-mkdir -p "${model_dir}"
+mkdir -p "${model_dir}/tmp"
 
 echo "### Training ${model_dir}"
 
@@ -41,7 +39,7 @@ echo "### Training ${model_dir}"
   --model "${model_dir}/model.npz" \
   -c "${model_config}" "${training_config}" \
   --train-sets "${train_set_prefix}".{"${src}","${trg}"}.gz \
-  -T "${TMP}/train" \
+  -T "${model_dir}/tmp" \
   --shuffle-in-ram \
   --vocabs "${model_dir}/vocab.spm" "${model_dir}/vocab.spm" \
   -w "${WORKSPACE}" \
diff --git a/pipeline/translate/translate-corpus.sh b/pipeline/translate/translate-corpus.sh
index 151056616..1ac612bd6 100755
--- a/pipeline/translate/translate-corpus.sh
+++ b/pipeline/translate/translate-corpus.sh
@@ -21,11 +21,19 @@ corpus_trg=$2
 model_dir=$3
 output_path=$4
 
+if [ -e "${output_path}" ]; then
+  echo "### Corpus already exists, skipping"
+  echo "###### Done: Translating a corpus"
+  exit 0
+fi
+
 config="${model_dir}/model.npz.best-ce-mean-words.npz.decoder.yml"
 decoder_config="${WORKDIR}/pipeline/translate/decoder.yml"
 tmp_dir=$(dirname "${output_path}")/tmp
 mkdir -p "${tmp_dir}"
 
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+
 echo "### Splitting a parallel corpus into smaller chunks"
 test -s "${tmp_dir}/file.00" ||
   pigz -dc "${corpus_src}" |
@@ -59,7 +67,7 @@ test -s "${tmp_dir}/file.00.nbest.out" ||
     2>"${tmp_dir}/debug.txt"
 
 echo "### Collecting translations"
-test -s "${output_path}" || cat "${tmp_dir}"/file.??.nbest.out | pigz >"${output_path}"
+test -s "${output_path}" || cat "${tmp_dir}"/file.*.nbest.out | pigz >"${output_path}"
 
 echo "### Comparing number of sentences ${corpus_src} vs ${output_path}"
 src_len=$(pigz -dc "${corpus_src}" | wc -l)
diff --git a/pipeline/translate/translate-mono.sh b/pipeline/translate/translate-mono.sh
index 3d882faa1..7a836e83e 100755
--- a/pipeline/translate/translate-mono.sh
+++ b/pipeline/translate/translate-mono.sh
@@ -15,6 +15,12 @@ mono_path=$1
 model_dir=$2
 output_path=$3
 
+if [ -e "${output_path}" ]; then
+  echo "### Dataset already exists, skipping"
+  echo "###### Done: Translating monolingual data"
+  exit 0
+fi
+
 config="${model_dir}/model.npz.best-ce-mean-words.npz.decoder.yml"
 decoder_config="${WORKDIR}/pipeline/translate/decoder.yml"
 tmp_dir=$(dirname "${output_path}")/tmp
diff --git a/pipeline/utils/find-corpus.py b/pipeline/utils/find-corpus.py
new file mode 100644
index 000000000..49663012e
--- /dev/null
+++ b/pipeline/utils/find-corpus.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+"""
+Finds all datasets of one importer type for a language pair
+and prints them to set in TRAIN_DATASETS, DEVTEST_DATASETS or TEST_DATASETS config settings.
+
+Usage:
+    python find-corpus.py <src> <trg> <importer>
+
+Params:
+    src - source language code
+    trg - target language code
+    importer - importer type (mtdata, opus, sacrebleu)
+
+Output:
+    dataset names separated by spaces, each prefixed with "<importer>_", in sorted order
+
+"""
+
+import requests
+import sys
+
+if len(sys.argv) < 4:
+    # Print the usage text to stderr and exit with a non-zero status
+    sys.exit(__doc__)
+
+source = sys.argv[1]
+target = sys.argv[2]
+importer = sys.argv[3]
+
+# exclude = ['bible', 'Ubuntu', 'Gnome', 'KDE', 'Multi', 'OPUS100v']
+exclude = []
+names = []
+
+if importer == 'opus':
+    exclude += ['OPUS100v']
+    response = requests.get(f'https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest')
+    # Fail on an HTTP error status instead of trying to parse an error page as JSON
+    response.raise_for_status()
+    datasets = response.json()
+    names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets['corpora']]
+elif importer == 'sacrebleu':
+    import sacrebleu
+    names = [f'sacrebleu_{name}' for name, meta in sacrebleu.DATASETS.items()
+             if f'{source}-{target}' in meta or f'{target}-{source}' in meta]
+elif importer == 'mtdata':
+    from mtdata.main import LangPair
+    from mtdata.data import get_entries
+    exclude += ['opus', 'newstest']
+    entries = get_entries(LangPair(f'{source}-{target}'), None, None)
+    names = [f'mtdata_{entry.name}' for entry in entries]
+else:
+    # Report on stderr with a non-zero status, so a caller capturing stdout
+    # does not mistake the message (or an empty line) for a dataset list
+    sys.exit(f'Importer type {importer} is unsupported. Supported importers: opus, mtdata, sacrebleu')
+
+# Drop names containing any excluded substring (case-insensitive); the set removes duplicates
+cleaned = {name for name in names
+           if not any(ex.lower() in name.lower() for ex in exclude)}
+
+# Sorted, because set iteration order is not stable across runs
+print(' '.join(sorted(cleaned)))
diff --git a/pipeline/train/tensorboard/tb_log_parser.py b/pipeline/utils/tensorboard/tb_log_parser.py
similarity index 100%
rename from pipeline/train/tensorboard/tb_log_parser.py
rename to pipeline/utils/tensorboard/tb_log_parser.py
diff --git a/pipeline/train/tensorboard/tensorboard.sh b/pipeline/utils/tensorboard/tensorboard.sh
similarity index 55%
rename from pipeline/train/tensorboard/tensorboard.sh
rename to pipeline/utils/tensorboard/tensorboard.sh
index 8dfc358f6..91c1c8556 100644
--- a/pipeline/train/tensorboard/tensorboard.sh
+++ b/pipeline/utils/tensorboard/tensorboard.sh
@@ -4,7 +4,7 @@
 #
 # Usage:
 #   Run from current directory
-#   MODELS=<absolute_path_to_models_directory> bash tensorboard.sh
+#   WORKDIR=<repo-root-dir> MODELS=<absolute_path_to_models_directory> bash tensorboard.sh
 #
 
 set -x
@@ -13,12 +13,11 @@ set -euo pipefail
 echo "###### Running tensorboard"
 
 test -v MODELS
+test -v WORKDIR
 
-PATH="/root/miniconda3/bin:${PATH}"
-source /root/miniconda3/etc/profile.d/conda.sh
-conda activate bergamot-training-env
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
 
-ls -d "${MODELS}"/*/* > tb-monitored-jobs
+ls -d "${MODELS}"/*/*/* > tb-monitored-jobs
 tensorboard --logdir="${MODELS}" --host=0.0.0.0 &
 python ./tb_log_parser.py --prefix=
 
diff --git a/run.sh b/run.sh
index e4c811fc7..978a6fc4e 100644
--- a/run.sh
+++ b/run.sh
@@ -13,6 +13,9 @@ set -euo pipefail
 # Directories structure
 #
 #├ data
+#│   ├ cache TODO
+#│   │  ├ opus_wmt20.ru.gz
+#│   │  └ sacrebleu_wmt20.en.gz
 #│   └ ru-en
 #│      └ test
 #│        ├ original
@@ -22,11 +25,19 @@ set -euo pipefail
 #│        │   ├ mono.en.gz
 #│        │   ├ devset.ru.gz
 #│        │   └ devset.en.gz
+#│        ├ evaluation
+#│        │   ├ wmt12.ru
+#│        │   ├ wmt12.en
+#│        │   ├ wmt20.ru
+#│        │   └ wmt20.en
 #│        ├ clean
 #│        │   ├ corpus.ru.gz
 #│        │   ├ corpus.en.gz
 #│        │   ├ mono.ru.gz
 #│        │   └ mono.en.gz
+#│        ├ biclean
+#│        │   ├ corpus.ru.gz
+#│        │   └ corpus.en.gz
 #│        ├ translated
 #│        │   ├ mono.ru.gz
 #│        │   └ mono.en.gz
@@ -59,15 +70,14 @@ source ./config.sh
 
 echo "######  setup"
 bash ./pipeline/setup/install-all.sh
-PATH="/root/miniconda3/bin:${PATH}"
-source /root/miniconda3/etc/profile.d/conda.sh
-conda activate bergamot-training-env
 
 echo "######  set common variables"
 # data
 data_dir="${DATA_DIR}/${SRC}-${TRG}/${EXPERIMENT}"
 original="${data_dir}/original"
+evaluation="${data_dir}/evaluation"
 clean="${data_dir}/clean"
+biclean="${data_dir}/biclean"
 augmented="${data_dir}/augmented"
 translated="${data_dir}/translated"
 merged="${data_dir}/merged"
@@ -85,6 +95,7 @@ exported="${models_dir}/exported"
 echo "######  download data"
 bash ./pipeline/data/download-corpus.sh "${original}/corpus" ${TRAIN_DATASETS}
 bash ./pipeline/data/download-corpus.sh "${original}/devset" ${DEVTEST_DATASETS}
+bash ./pipeline/data/download-eval.sh "${evaluation}" ${TEST_DATASETS}
 test -n "${MONO_DATASETS_SRC}" &&
   bash ./pipeline/data/download-mono.sh "${SRC}" "${MONO_MAX_SENTENCES_SRC}" "${original}/mono" ${MONO_DATASETS_SRC}
 test -n "${MONO_DATASETS_TRG}" &&
@@ -92,36 +103,37 @@ test -n "${MONO_DATASETS_TRG}" &&
 
 echo "######  clean data"
 bash ./pipeline/clean/clean-corpus.sh "${original}/corpus" "${clean}/corpus"
+bash ./pipeline/clean/bicleaner.sh "${clean}/corpus" "${biclean}/corpus"
 test -e "${original}/mono.${SRC}.gz" &&
   bash ./pipeline/clean/clean-mono.sh "${SRC}" "${original}/mono" "${clean}/mono"
 test -e "${original}/mono.${TRG}.gz" &&
   bash ./pipeline/clean/clean-mono.sh "${TRG}" "${original}/mono" "${clean}/mono"
 
 echo "######  train backward model"
-bash ./pipeline/train/train-s2s.sh "${s2s}" "${clean}/corpus" "${original}/devset" "${TRG}" "${SRC}"
-bash ./pipeline/train/eval.sh "${s2s}" "${TRG}" "${SRC}"
+bash ./pipeline/train/train-s2s.sh "${s2s}" "${biclean}/corpus" "${original}/devset" "${TRG}" "${SRC}"
+bash ./pipeline/train/eval.sh "${s2s}" "${evaluation}" "${TRG}" "${SRC}"
 
 if [ -e "${clean}/mono.${TRG}.gz" ]; then
   echo "######  augment corpus with back translations"
   bash ./pipeline/translate/translate-mono.sh "${clean}/mono.${TRG}.gz" "${s2s}" "${translated}/mono.${SRC}.gz"
   bash ./pipeline/utils/merge-corpus.sh \
     "${translated}/mono.${SRC}.gz" \
-    "${clean}/corpus.${SRC}.gz" \
+    "${biclean}/corpus.${SRC}.gz" \
     "${clean}/mono.${TRG}.gz" \
-    "${clean}/corpus.${TRG}.gz" \
+    "${biclean}/corpus.${TRG}.gz" \
     "${augmented}/corpus.${SRC}.gz" \
     "${augmented}/corpus.${TRG}.gz"
   teacher_corpus="${augmented}/corpus"
 else
   echo "###### skipping augmentation"
-  teacher_corpus="${clean}/corpus"
+  teacher_corpus="${biclean}/corpus"
 fi
 
 echo "######  train teacher"
 bash ./pipeline/train/train-teacher.sh "${teacher_dir}" "${teacher_corpus}" "${original}/devset"
 
 echo "######  evaluate teacher"
-bash ./pipeline/train/eval.sh "${teacher_dir}"
+bash ./pipeline/train/eval.sh "${teacher_dir}" "${evaluation}"
 
 echo "######  translate with teacher"
 bash ./pipeline/translate/translate-corpus.sh "${clean}/corpus.${SRC}.gz" \
@@ -153,7 +165,7 @@ bash ./pipeline/train/train-student.sh \
   "${original}/devset" \
   "${teacher_dir}" \
   "${align_dir}"
-bash ./pipeline/train/eval.sh "${student_dir}"
+bash ./pipeline/train/eval.sh "${student_dir}" "${evaluation}"
 
 echo "######  finetune student"
 bash ./pipeline/train/finetune-student.sh \
@@ -162,7 +174,7 @@ bash ./pipeline/train/finetune-student.sh \
   "${original}/devset" \
   "${student_dir}" \
   "${align_dir}"
-bash ./pipeline/train/eval.sh "${student_finetuned_dir}"
+bash ./pipeline/train/eval.sh "${student_finetuned_dir}" "${evaluation}"
 
 echo "######   quantize"
 bash ./pipeline/quantize/quantize.sh \
@@ -170,7 +182,7 @@ bash ./pipeline/quantize/quantize.sh \
   "${align_dir}/lex.s2t.pruned.gz" \
   "${original}/devset.${SRC}.gz" \
   "${speed}"
-bash ./pipeline/quantize/eval.sh "${speed}" "${align_dir}/lex.s2t.pruned.gz"
+bash ./pipeline/quantize/eval.sh "${speed}" "${align_dir}/lex.s2t.pruned.gz" "${evaluation}"
 
 echo "######  export"
 bash ./pipeline/quantize/export.sh "${speed}" "${align_dir}/lex.s2t.pruned.gz" "${exported}"