From 58511a44d6286e9f787c72bb5b39f547af06f3e3 Mon Sep 17 00:00:00 2001 From: Ivan Belonogov Date: Wed, 7 Aug 2019 10:15:27 +0300 Subject: [PATCH 1/5] added source code for speed test --- benchmark.md | 3 +- tests/README.md | 12 +++ tests/speed_test.py | 200 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 tests/README.md create mode 100644 tests/speed_test.py diff --git a/benchmark.md b/benchmark.md index 5128fae..7929a01 100644 --- a/benchmark.md +++ b/benchmark.md @@ -13,7 +13,8 @@ In this benchmark, `YouTokenToMe` used 4 threads for training and tokenization. doesn't support multithreading for **BPE** at all. `fastBPE` doesn't support multithreading for training. For tokenization, it also used 4 threads. - The results of the experiments are below. The time is measured in seconds. +Source code for benchmark can be found [here](tests/speed_test.py). +The results of the experiments are below. The time is measured in seconds. diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..e7331f8 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,12 @@ +# Running benchmark + +* Install [YouTokenToMe](https://github.com/vkcom/youtokentome) +* Install [SentencePiece](https://github.com/google/sentencepiece) +* Compile [fastBPE](https://github.com/glample/fastBPE) and specify path to binary file in variable + `PATH_TO_FASTBPE` in `speed_test.py` +* `python speed_test.py` + + **Warning**: this script downloads several GB of data. + It use Wikipedia monolingual corpora for training and tokenization. +[Here](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/) + you can find more details about the data. diff --git a/tests/speed_test.py b/tests/speed_test.py new file mode 100644 index 0000000..db88138 --- /dev/null +++ b/tests/speed_test.py @@ -0,0 +1,200 @@ +import argparse +import os +from pathlib import Path +from time import time + +from tabulate import tabulate + +MODEL_FILE_NAME = "bpe.model" +MODEL_SUFFIX = ".model" + +YOU_TOKEN_TO_ME = "YouTokenToMe" +SENTENCE_PIECE = "SentencePiece" +FAST_BPE = "fastBPE" + +PATH_TO_FASTBPE = './fastBPE' + + +class SentencePieceInterface: + def train_from_file(self, train_file, vocab_size, model_file, _): + tmp = model_file.split(".") + assert len(tmp) == 2 + assert tmp[1] == "model" + train_command = f"spm_train " + train_command += f" --input={str(train_file)} " + train_command += f" --model_prefix={tmp[0]} " + train_command += f" --vocab_size={vocab_size} " + train_command += f" --character_coverage=1.0 " + train_command += f" --model_type=bpe " + assert os.system(train_command) == 0 + + def encode_file(self, model_path, path_in, path_out, _): + encode_command = f"spm_encode " + encode_command += f" --model={model_path} " + encode_command += f" --output_format=piece " + encode_command += f" < {path_in} > {path_out} " + assert os.system(encode_command) == 0 + + +class FastBPEInterface: + def train_from_file(self, file_path, vocab_size, model_file, _): + train_command = f"{PATH_TO_FASTBPE} learnbpe" + train_command += f" {vocab_size} {str(file_path)} > {model_file}" + assert os.system(train_command) == 0 + + def encode_file(self, model_path, path_in, path_out, _): + encode_command = f"{PATH_TO_FASTBPE} applybpe {path_out} {path_in} {model_path}" + assert os.system(encode_command) == 0 + + +class YouTokenToMeInterface: + def train_from_file(self, file_path, vocab_size, model_path, n_threads): + train_command = f"yttm bpe " + train_command += f" 
--data={file_path} --model={model_path} " + train_command += f" --vocab_size={vocab_size} --n_threads={n_threads} " + assert os.system(train_command) == 0 + + def encode_file(self, model_path, path_in, path_out, n_threads): + encode_command = "yttm encode " + encode_command += f" --model={model_path} --output_type=id " + encode_command += f" --n_threads={n_threads} " + encode_command += f" < {str(path_in)} > {str(path_out)}" + assert os.system(encode_command) == 0 + + +def get_bpe(impl_name): + if impl_name == YOU_TOKEN_TO_ME: + return YouTokenToMeInterface() + if impl_name == SENTENCE_PIECE: + return SentencePieceInterface() + if impl_name == FAST_BPE: + return FastBPEInterface() + assert False + + +def check_train(algorithm, vocab_size, corpus_path, use_multithreading): + bpe = get_bpe(algorithm) + start_time = time() + bpe.train_from_file(corpus_path, vocab_size, MODEL_FILE_NAME, use_multithreading) + return time() - start_time + + +def check_inference_file(algorithm, corpus_path, n_threads): + bpe = get_bpe(algorithm) + start_time = time() + bpe.encode_file(MODEL_FILE_NAME, corpus_path, "rm_it.txt", n_threads) + return time() - start_time + + +def download_xml2txt(): + if not Path("xml2txt.pl").exists(): + print("downloading xml2txt.pl ...") + os.system("wget https://www.dropbox.com/s/p3ta9spzfviovk0/xml2txt.pl") + + +def prepare_data(zip_path, size_mb): + expected_extension = ".xml.bz2" + assert zip_path.endswith(expected_extension) + base_path = Path(zip_path).parent + + unzip_path = base_path / "wiki.xml" + full_text_path = base_path / "wiki.txt" + cutted_text_path = base_path / f"wiki_{size_mb}MB.txt" + + if not Path(unzip_path).exists(): + print(f"unziping file {zip_path} ...") + assert os.system(f"bzip2 -kdc {zip_path} > {unzip_path}") == 0 + + if not Path(full_text_path).exists(): + print(f"converting xml to text {unzip_path} ...") + download_xml2txt() + preprocess_command = f"perl xml2txt.pl " + preprocess_command += f" -nomath -notables " + preprocess_command += f" {unzip_path} {full_text_path}" + assert os.system(preprocess_command) == 0 + + if not Path(cutted_text_path).exists(): + byte_processed = 0 + with open(cutted_text_path, "w") as fout: + with open(full_text_path, "r") as fin: + while byte_processed < size_mb * 1_000_000: + s = fin.readline() + byte_processed += len(s.encode()) + fout.write(s) + + return cutted_text_path + + +def speed_test(corpus_path, vocab_size, algorithms, n_threads): + train_res = {} + infer_res = {} + for algorithm in algorithms: + time_train = check_train(algorithm, vocab_size, corpus_path, n_threads) + time_infer = check_inference_file(algorithm, corpus_path, n_threads) + + train_res[algorithm] = time_train + infer_res[algorithm] = time_infer + + return train_res, infer_res + + +def print_results(cfg, result_name, corpuses, algorithms): + result_table = [ + ["#" for _ in range(len(corpuses) + 1)] for _ in range(len(algorithms)) + ] + table_header = ["#"] + [lang for lang in corpuses] + rev_lang = {lang: i for i, lang in enumerate(table_header)} + rev_algo = {algo: i for i, algo in enumerate(algorithms)} + for i, algo_name in enumerate(algorithms): + result_table[i][0] = algo_name + + for lang, res in cfg.items(): + for algo in res: + j = rev_lang[lang] + i = rev_algo[algo] + result_table[i][j] = res[algo] + + table_header[0] = result_name + print(tabulate(result_table, table_header, tablefmt="grid")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--vocab_size", type=int, default=30000) + 
parser.add_argument("--n_threads", type=int, default=4) + parser.add_argument("--corpus_size", type=int, default=100, help="Size of testing corpus in MB" ) + + args = parser.parse_args() + + links = { + "English": "https://www.dropbox.com/s/cnrhd11zdtc1pic/enwiki-20181001-corpus.xml.bz2?dl=1", + "Russian": "https://www.dropbox.com/s/lpfmyrl7nxn5ugg/ruwiki-20181001-corpus.xml.bz2?dl=1", + "Japanese": "https://www.dropbox.com/s/wf496hlu512z9kc/jawiki-20140807-corpus.xml.bz2?dl=1", + "Chinese": "https://www.dropbox.com/s/czhr6s5jwaljeue/zhwiki-20140804-corpus.xml.bz2?dl=1", + } + corpuses = {} + + Path("data").mkdir(exist_ok=True) + for lang, link in links.items(): + Path(f"data/{lang}").mkdir(exist_ok=True) + zip_file = f"data/{lang}/wiki.xml.bz2" + if not Path(zip_file).exists(): + os.system(f"wget -O {zip_file} {link}") + corpuses[lang] = prepare_data(zip_file, args.corpus_size) + + algorithms = [YOU_TOKEN_TO_ME, SENTENCE_PIECE, FAST_BPE] + + global_train = {} + global_tokenization = {} + + for lang, corpus_path in corpuses.items(): + train_stat, tokenization_stat = speed_test( + corpus_path, args.vocab_size, algorithms, args.n_threads + ) + global_train[lang] = train_stat + global_tokenization[lang] = tokenization_stat + + print_results(global_train, "Train", corpuses, algorithms) + print_results(global_tokenization, "Tokenization", corpuses, algorithms) + From 66c350f124119519a95bdba767afbf7e1b4bcaa8 Mon Sep 17 00:00:00 2001 From: Ivan Belonogov Date: Thu, 8 Aug 2019 13:47:06 +0300 Subject: [PATCH 2/5] added docker --- tests/Dockerfile | 31 +++++++++++++++++++++++++++++++ tests/README.md | 16 +++++++++++++--- tests/speed_test.py | 3 +-- 3 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 tests/Dockerfile diff --git a/tests/Dockerfile b/tests/Dockerfile new file mode 100644 index 0000000..15780fa --- /dev/null +++ b/tests/Dockerfile @@ -0,0 +1,31 @@ +FROM ubuntu:18.04 + +RUN apt update +RUN apt install git -y +RUN apt install python3-pip cmake make g++ vim wget -y +RUN pip3 install tabulate youtokentome + +WORKDIR /repos + +RUN git clone https://github.com/google/sentencepiece.git +RUN git clone https://github.com/glample/fastBPE + +WORKDIR /repos/sentencepiece/build + +RUN cmake .. && make -j $(nproc) && make install && ldconfig -v + +WORKDIR /repos/fastBPE + +RUN g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast + +WORKDIR /test_dir + +COPY ./speed_test.py ./speed_test.py +RUN cp /repos/fastBPE/fast /test_dir/fastBPE + +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 + +CMD python3 speed_test.py + + diff --git a/tests/README.md b/tests/README.md index e7331f8..d9c1389 100644 --- a/tests/README.md +++ b/tests/README.md @@ -6,7 +6,17 @@ `PATH_TO_FASTBPE` in `speed_test.py` * `python speed_test.py` - **Warning**: this script downloads several GB of data. - It use Wikipedia monolingual corpora for training and tokenization. + **Warning**: this script downloads several GBs of data. + It uses Wikipedia monolingual corpora for training and tokenization. [Here](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/) - you can find more details about the data. + you can find more details about the data. + +## Docker + +Alternatively benchmark can be run using docker. + +``` +cd tests +sudo docker build -f Dockerfile -t yttm/speed_test . 
+sudo docker run -it yttm/speed_test:latest +``` diff --git a/tests/speed_test.py b/tests/speed_test.py index db88138..13adaea 100644 --- a/tests/speed_test.py +++ b/tests/speed_test.py @@ -163,7 +163,7 @@ def print_results(cfg, result_name, corpuses, algorithms): parser.add_argument("--vocab_size", type=int, default=30000) parser.add_argument("--n_threads", type=int, default=4) - parser.add_argument("--corpus_size", type=int, default=100, help="Size of testing corpus in MB" ) + parser.add_argument("--corpus_size", type=int, default=100, help="Size of testing corpus in MB") args = parser.parse_args() @@ -197,4 +197,3 @@ def print_results(cfg, result_name, corpuses, algorithms): print_results(global_train, "Train", corpuses, algorithms) print_results(global_tokenization, "Tokenization", corpuses, algorithms) - From b0a49ebe23766f3786481fa5015241864d79ef13 Mon Sep 17 00:00:00 2001 From: Dmitry Yutkin Date: Fri, 9 Aug 2019 10:07:17 +0300 Subject: [PATCH 3/5] Simplify Dockerfile --- tests/Dockerfile | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/Dockerfile b/tests/Dockerfile index 15780fa..3c87fe4 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,14 +1,20 @@ -FROM ubuntu:18.04 +FROM python:3.7 -RUN apt update -RUN apt install git -y -RUN apt install python3-pip cmake make g++ vim wget -y -RUN pip3 install tabulate youtokentome +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + cmake \ + make \ + g++ \ + wget && \ + pip3 install tabulate youtokentome WORKDIR /repos -RUN git clone https://github.com/google/sentencepiece.git -RUN git clone https://github.com/glample/fastBPE +RUN git clone https://github.com/google/sentencepiece.git && \ + git clone https://github.com/glample/fastBPE WORKDIR /repos/sentencepiece/build @@ -23,9 +29,4 @@ WORKDIR /test_dir COPY ./speed_test.py ./speed_test.py RUN cp /repos/fastBPE/fast /test_dir/fastBPE -ENV LC_ALL=C.UTF-8 -ENV LANG=C.UTF-8 - -CMD python3 speed_test.py - - +CMD ["python", "speed_test.py"] From 315db780df6d892ca84444a18f50eb3977639fc3 Mon Sep 17 00:00:00 2001 From: Dmitry Yutkin Date: Fri, 9 Aug 2019 15:09:35 +0300 Subject: [PATCH 4/5] Add info about test to README --- tests/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/README.md b/tests/README.md index d9c1389..90fac53 100644 --- a/tests/README.md +++ b/tests/README.md @@ -6,17 +6,17 @@ `PATH_TO_FASTBPE` in `speed_test.py` * `python speed_test.py` - **Warning**: this script downloads several GBs of data. + **Warning!** This test requires about **80 GBs** of free space on your disk and can take **several hours** for running. It uses Wikipedia monolingual corpora for training and tokenization. [Here](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/) you can find more details about the data. ## Docker -Alternatively benchmark can be run using docker. +Alternatively benchmark can be run using Docker. ``` cd tests -sudo docker build -f Dockerfile -t yttm/speed_test . -sudo docker run -it yttm/speed_test:latest +docker build -t yttm/speed_test . 
+docker run --rm -it yttm/speed_test:latest ``` From baa2681b79381d43434c9e0f988fe4f265adb183 Mon Sep 17 00:00:00 2001 From: Dmitry Yutkin Date: Fri, 9 Aug 2019 15:12:31 +0300 Subject: [PATCH 5/5] Format with Black --- tests/speed_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/speed_test.py b/tests/speed_test.py index 13adaea..07b65c2 100644 --- a/tests/speed_test.py +++ b/tests/speed_test.py @@ -12,7 +12,7 @@ SENTENCE_PIECE = "SentencePiece" FAST_BPE = "fastBPE" -PATH_TO_FASTBPE = './fastBPE' +PATH_TO_FASTBPE = "./fastBPE" class SentencePieceInterface: @@ -163,7 +163,9 @@ def print_results(cfg, result_name, corpuses, algorithms): parser.add_argument("--vocab_size", type=int, default=30000) parser.add_argument("--n_threads", type=int, default=4) - parser.add_argument("--corpus_size", type=int, default=100, help="Size of testing corpus in MB") + parser.add_argument( + "--corpus_size", type=int, default=100, help="Size of testing corpus in MB" + ) args = parser.parse_args()
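
For a quick sanity check of the benchmark harness without downloading the full Wikipedia corpora, the sketch below times YouTokenToMe alone on any local UTF-8 text file, using the same `yttm` CLI invocations that `speed_test.py` issues through `os.system` (`yttm bpe --data ... --model ... --vocab_size ... --n_threads ...` and `yttm encode --model ... --output_type=id`). The corpus path, output file, and vocabulary size below are placeholder values for illustration, not ones taken from the benchmark.

```python
import os
from time import time

# Placeholder inputs: point CORPUS at any local UTF-8 text file.
CORPUS = "small_corpus.txt"
MODEL = "bpe.model"
ENCODED = "encoded.txt"
VOCAB_SIZE = 10000
N_THREADS = 4


def timed(command):
    # Mirror speed_test.py: run the command through the shell and return elapsed seconds.
    start = time()
    assert os.system(command) == 0, f"command failed: {command}"
    return time() - start


train_time = timed(
    f"yttm bpe --data={CORPUS} --model={MODEL} "
    f"--vocab_size={VOCAB_SIZE} --n_threads={N_THREADS}"
)
encode_time = timed(
    f"yttm encode --model={MODEL} --output_type=id "
    f"--n_threads={N_THREADS} < {CORPUS} > {ENCODED}"
)
print(f"train: {train_time:.1f}s  tokenization: {encode_time:.1f}s")
```

The same pattern extends to the other two tools by swapping in the `spm_train`/`spm_encode` and fastBPE `learnbpe`/`applybpe` commands that the corresponding interface classes in `speed_test.py` build.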