From 58511a44d6286e9f787c72bb5b39f547af06f3e3 Mon Sep 17 00:00:00 2001 From: Ivan Belonogov Date: Wed, 7 Aug 2019 10:15:27 +0300 Subject: [PATCH 1/5] added source code for speed test --- benchmark.md | 3 +- tests/README.md | 12 +++ tests/speed_test.py | 200 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 tests/README.md create mode 100644 tests/speed_test.py diff --git a/benchmark.md b/benchmark.md index 5128fae..7929a01 100644 --- a/benchmark.md +++ b/benchmark.md @@ -13,7 +13,8 @@ In this benchmark, `YouTokenToMe` used 4 threads for training and tokenization. doesn't support multithreading for **BPE** at all. `fastBPE` doesn't support multithreading for training. For tokenization, it also used 4 threads. - The results of the experiments are below. The time is measured in seconds. +Source code for benchmark can be found [here](tests/speed_test.py). +The results of the experiments are below. The time is measured in seconds. diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..e7331f8 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,12 @@ +# Running benchmark + +* Install [YouTokenToMe](https://github.com/vkcom/youtokentome) +* Install [SentencePiece](https://github.com/google/sentencepiece) +* Compile [fastBPE](https://github.com/glample/fastBPE) and specify path to binary file in variable + `PATH_TO_FASTBPE` in `speed_test.py` +* `python speed_test.py` + + **Warning**: this script downloads several GB of data. + It use Wikipedia monolingual corpora for training and tokenization. +[Here](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/) + you can find more details about the data. diff --git a/tests/speed_test.py b/tests/speed_test.py new file mode 100644 index 0000000..db88138 --- /dev/null +++ b/tests/speed_test.py @@ -0,0 +1,200 @@ +import argparse +import os +from pathlib import Path +from time import time + +from tabulate import tabulate + +MODEL_FILE_NAME = "bpe.model" +MODEL_SUFFIX = ".model" + +YOU_TOKEN_TO_ME = "YouTokenToMe" +SENTENCE_PIECE = "SentencePiece" +FAST_BPE = "fastBPE" + +PATH_TO_FASTBPE = './fastBPE' + + +class SentencePieceInterface: + def train_from_file(self, train_file, vocab_size, model_file, _): + tmp = model_file.split(".") + assert len(tmp) == 2 + assert tmp[1] == "model" + train_command = f"spm_train " + train_command += f" --input={str(train_file)} " + train_command += f" --model_prefix={tmp[0]} " + train_command += f" --vocab_size={vocab_size} " + train_command += f" --character_coverage=1.0 " + train_command += f" --model_type=bpe " + assert os.system(train_command) == 0 + + def encode_file(self, model_path, path_in, path_out, _): + encode_command = f"spm_encode " + encode_command += f" --model={model_path} " + encode_command += f" --output_format=piece " + encode_command += f" < {path_in} > {path_out} " + assert os.system(encode_command) == 0 + + +class FastBPEInterface: + def train_from_file(self, file_path, vocab_size, model_file, _): + train_command = f"{PATH_TO_FASTBPE} learnbpe" + train_command += f" {vocab_size} {str(file_path)} > {model_file}" + assert os.system(train_command) == 0 + + def encode_file(self, model_path, path_in, path_out, _): + encode_command = f"{PATH_TO_FASTBPE} applybpe {path_out} {path_in} {model_path}" + assert os.system(encode_command) == 0 + + +class YouTokenToMeInterface: + def train_from_file(self, file_path, vocab_size, model_path, n_threads): + train_command = f"yttm bpe " + train_command += f" 
--data={file_path} --model={model_path} " + train_command += f" --vocab_size={vocab_size} --n_threads={n_threads} " + assert os.system(train_command) == 0 + + def encode_file(self, model_path, path_in, path_out, n_threads): + encode_command = "yttm encode " + encode_command += f" --model={model_path} --output_type=id " + encode_command += f" --n_threads={n_threads} " + encode_command += f" < {str(path_in)} > {str(path_out)}" + assert os.system(encode_command) == 0 + + +def get_bpe(impl_name): + if impl_name == YOU_TOKEN_TO_ME: + return YouTokenToMeInterface() + if impl_name == SENTENCE_PIECE: + return SentencePieceInterface() + if impl_name == FAST_BPE: + return FastBPEInterface() + assert False + + +def check_train(algorithm, vocab_size, corpus_path, use_multithreading): + bpe = get_bpe(algorithm) + start_time = time() + bpe.train_from_file(corpus_path, vocab_size, MODEL_FILE_NAME, use_multithreading) + return time() - start_time + + +def check_inference_file(algorithm, corpus_path, n_threads): + bpe = get_bpe(algorithm) + start_time = time() + bpe.encode_file(MODEL_FILE_NAME, corpus_path, "rm_it.txt", n_threads) + return time() - start_time + + +def download_xml2txt(): + if not Path("xml2txt.pl").exists(): + print("downloading xml2txt.pl ...") + os.system("wget https://www.dropbox.com/s/p3ta9spzfviovk0/xml2txt.pl") + + +def prepare_data(zip_path, size_mb): + expected_extension = ".xml.bz2" + assert zip_path.endswith(expected_extension) + base_path = Path(zip_path).parent + + unzip_path = base_path / "wiki.xml" + full_text_path = base_path / "wiki.txt" + cutted_text_path = base_path / f"wiki_{size_mb}MB.txt" + + if not Path(unzip_path).exists(): + print(f"unziping file {zip_path} ...") + assert os.system(f"bzip2 -kdc {zip_path} > {unzip_path}") == 0 + + if not Path(full_text_path).exists(): + print(f"converting xml to text {unzip_path} ...") + download_xml2txt() + preprocess_command = f"perl xml2txt.pl " + preprocess_command += f" -nomath -notables " + preprocess_command += f" {unzip_path} {full_text_path}" + assert os.system(preprocess_command) == 0 + + if not Path(cutted_text_path).exists(): + byte_processed = 0 + with open(cutted_text_path, "w") as fout: + with open(full_text_path, "r") as fin: + while byte_processed < size_mb * 1_000_000: + s = fin.readline() + byte_processed += len(s.encode()) + fout.write(s) + + return cutted_text_path + + +def speed_test(corpus_path, vocab_size, algorithms, n_threads): + train_res = {} + infer_res = {} + for algorithm in algorithms: + time_train = check_train(algorithm, vocab_size, corpus_path, n_threads) + time_infer = check_inference_file(algorithm, corpus_path, n_threads) + + train_res[algorithm] = time_train + infer_res[algorithm] = time_infer + + return train_res, infer_res + + +def print_results(cfg, result_name, corpuses, algorithms): + result_table = [ + ["#" for _ in range(len(corpuses) + 1)] for _ in range(len(algorithms)) + ] + table_header = ["#"] + [lang for lang in corpuses] + rev_lang = {lang: i for i, lang in enumerate(table_header)} + rev_algo = {algo: i for i, algo in enumerate(algorithms)} + for i, algo_name in enumerate(algorithms): + result_table[i][0] = algo_name + + for lang, res in cfg.items(): + for algo in res: + j = rev_lang[lang] + i = rev_algo[algo] + result_table[i][j] = res[algo] + + table_header[0] = result_name + print(tabulate(result_table, table_header, tablefmt="grid")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--vocab_size", type=int, default=30000) + 
parser.add_argument("--n_threads", type=int, default=4) + parser.add_argument("--corpus_size", type=int, default=100, help="Size of testing corpus in MB" ) + + args = parser.parse_args() + + links = { + "English": "https://www.dropbox.com/s/cnrhd11zdtc1pic/enwiki-20181001-corpus.xml.bz2?dl=1", + "Russian": "https://www.dropbox.com/s/lpfmyrl7nxn5ugg/ruwiki-20181001-corpus.xml.bz2?dl=1", + "Japanese": "https://www.dropbox.com/s/wf496hlu512z9kc/jawiki-20140807-corpus.xml.bz2?dl=1", + "Chinese": "https://www.dropbox.com/s/czhr6s5jwaljeue/zhwiki-20140804-corpus.xml.bz2?dl=1", + } + corpuses = {} + + Path("data").mkdir(exist_ok=True) + for lang, link in links.items(): + Path(f"data/{lang}").mkdir(exist_ok=True) + zip_file = f"data/{lang}/wiki.xml.bz2" + if not Path(zip_file).exists(): + os.system(f"wget -O {zip_file} {link}") + corpuses[lang] = prepare_data(zip_file, args.corpus_size) + + algorithms = [YOU_TOKEN_TO_ME, SENTENCE_PIECE, FAST_BPE] + + global_train = {} + global_tokenization = {} + + for lang, corpus_path in corpuses.items(): + train_stat, tokenization_stat = speed_test( + corpus_path, args.vocab_size, algorithms, args.n_threads + ) + global_train[lang] = train_stat + global_tokenization[lang] = tokenization_stat + + print_results(global_train, "Train", corpuses, algorithms) + print_results(global_tokenization, "Tokenization", corpuses, algorithms) + From 66c350f124119519a95bdba767afbf7e1b4bcaa8 Mon Sep 17 00:00:00 2001 From: Ivan Belonogov Date: Thu, 8 Aug 2019 13:47:06 +0300 Subject: [PATCH 2/5] added docker --- tests/Dockerfile | 31 +++++++++++++++++++++++++++++++ tests/README.md | 16 +++++++++++++--- tests/speed_test.py | 3 +-- 3 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 tests/Dockerfile diff --git a/tests/Dockerfile b/tests/Dockerfile new file mode 100644 index 0000000..15780fa --- /dev/null +++ b/tests/Dockerfile @@ -0,0 +1,31 @@ +FROM ubuntu:18.04 + +RUN apt update +RUN apt install git -y +RUN apt install python3-pip cmake make g++ vim wget -y +RUN pip3 install tabulate youtokentome + +WORKDIR /repos + +RUN git clone https://github.com/google/sentencepiece.git +RUN git clone https://github.com/glample/fastBPE + +WORKDIR /repos/sentencepiece/build + +RUN cmake .. && make -j $(nproc) && make install && ldconfig -v + +WORKDIR /repos/fastBPE + +RUN g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast + +WORKDIR /test_dir + +COPY ./speed_test.py ./speed_test.py +RUN cp /repos/fastBPE/fast /test_dir/fastBPE + +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 + +CMD python3 speed_test.py + + diff --git a/tests/README.md b/tests/README.md index e7331f8..d9c1389 100644 --- a/tests/README.md +++ b/tests/README.md @@ -6,7 +6,17 @@ `PATH_TO_FASTBPE` in `speed_test.py` * `python speed_test.py` - **Warning**: this script downloads several GB of data. - It use Wikipedia monolingual corpora for training and tokenization. + **Warning**: this script downloads several GBs of data. + It uses Wikipedia monolingual corpora for training and tokenization. [Here](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/) - you can find more details about the data. + you can find more details about the data. + +## Docker + +Alternatively benchmark can be run using docker. + +``` +cd tests +sudo docker build -f Dockerfile -t yttm/speed_test . 
+sudo docker run -it yttm/speed_test:latest +``` diff --git a/tests/speed_test.py b/tests/speed_test.py index db88138..13adaea 100644 --- a/tests/speed_test.py +++ b/tests/speed_test.py @@ -163,7 +163,7 @@ def print_results(cfg, result_name, corpuses, algorithms): parser.add_argument("--vocab_size", type=int, default=30000) parser.add_argument("--n_threads", type=int, default=4) - parser.add_argument("--corpus_size", type=int, default=100, help="Size of testing corpus in MB" ) + parser.add_argument("--corpus_size", type=int, default=100, help="Size of testing corpus in MB") args = parser.parse_args() @@ -197,4 +197,3 @@ def print_results(cfg, result_name, corpuses, algorithms): print_results(global_train, "Train", corpuses, algorithms) print_results(global_tokenization, "Tokenization", corpuses, algorithms) - From b0a49ebe23766f3786481fa5015241864d79ef13 Mon Sep 17 00:00:00 2001 From: Dmitry Yutkin Date: Fri, 9 Aug 2019 10:07:17 +0300 Subject: [PATCH 3/5] Simplify Dockerfile --- tests/Dockerfile | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/Dockerfile b/tests/Dockerfile index 15780fa..3c87fe4 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,14 +1,20 @@ -FROM ubuntu:18.04 +FROM python:3.7 -RUN apt update -RUN apt install git -y -RUN apt install python3-pip cmake make g++ vim wget -y -RUN pip3 install tabulate youtokentome +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + cmake \ + make \ + g++ \ + wget && \ + pip3 install tabulate youtokentome WORKDIR /repos -RUN git clone https://github.com/google/sentencepiece.git -RUN git clone https://github.com/glample/fastBPE +RUN git clone https://github.com/google/sentencepiece.git && \ + git clone https://github.com/glample/fastBPE WORKDIR /repos/sentencepiece/build @@ -23,9 +29,4 @@ WORKDIR /test_dir COPY ./speed_test.py ./speed_test.py RUN cp /repos/fastBPE/fast /test_dir/fastBPE -ENV LC_ALL=C.UTF-8 -ENV LANG=C.UTF-8 - -CMD python3 speed_test.py - - +CMD ["python", "speed_test.py"] From 315db780df6d892ca84444a18f50eb3977639fc3 Mon Sep 17 00:00:00 2001 From: Dmitry Yutkin Date: Fri, 9 Aug 2019 15:09:35 +0300 Subject: [PATCH 4/5] Add info about test to README --- tests/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/README.md b/tests/README.md index d9c1389..90fac53 100644 --- a/tests/README.md +++ b/tests/README.md @@ -6,17 +6,17 @@ `PATH_TO_FASTBPE` in `speed_test.py` * `python speed_test.py` - **Warning**: this script downloads several GBs of data. + **Warning!** This test requires about **80 GBs** of free space on your disk and can take **several hours** for running. It uses Wikipedia monolingual corpora for training and tokenization. [Here](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/) you can find more details about the data. ## Docker -Alternatively benchmark can be run using docker. +Alternatively benchmark can be run using Docker. ``` cd tests -sudo docker build -f Dockerfile -t yttm/speed_test . -sudo docker run -it yttm/speed_test:latest +docker build -t yttm/speed_test . 
+docker run --rm -it yttm/speed_test:latest ``` From baa2681b79381d43434c9e0f988fe4f265adb183 Mon Sep 17 00:00:00 2001 From: Dmitry Yutkin Date: Fri, 9 Aug 2019 15:12:31 +0300 Subject: [PATCH 5/5] Format with Black --- tests/speed_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/speed_test.py b/tests/speed_test.py index 13adaea..07b65c2 100644 --- a/tests/speed_test.py +++ b/tests/speed_test.py @@ -12,7 +12,7 @@ SENTENCE_PIECE = "SentencePiece" FAST_BPE = "fastBPE" -PATH_TO_FASTBPE = './fastBPE' +PATH_TO_FASTBPE = "./fastBPE" class SentencePieceInterface: @@ -163,7 +163,9 @@ def print_results(cfg, result_name, corpuses, algorithms): parser.add_argument("--vocab_size", type=int, default=30000) parser.add_argument("--n_threads", type=int, default=4) - parser.add_argument("--corpus_size", type=int, default=100, help="Size of testing corpus in MB") + parser.add_argument( + "--corpus_size", type=int, default=100, help="Size of testing corpus in MB" + ) args = parser.parse_args()
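
For a quick sanity check of the benchmark harness without downloading the full Wikipedia corpora, the sketch below times YouTokenToMe alone on any local UTF-8 text file, using the same `yttm` CLI invocations that `speed_test.py` issues through `os.system` (`yttm bpe --data ... --model ... --vocab_size ... --n_threads ...` and `yttm encode --model ... --output_type=id`). The corpus path, output file, and vocabulary size below are placeholder values for illustration, not ones taken from the benchmark.

```python
import os
from time import time

# Placeholder inputs: point CORPUS at any local UTF-8 text file.
CORPUS = "small_corpus.txt"
MODEL = "bpe.model"
ENCODED = "encoded.txt"
VOCAB_SIZE = 10000
N_THREADS = 4


def timed(command):
    # Mirror speed_test.py: run the command through the shell and return elapsed seconds.
    start = time()
    assert os.system(command) == 0, f"command failed: {command}"
    return time() - start


train_time = timed(
    f"yttm bpe --data={CORPUS} --model={MODEL} "
    f"--vocab_size={VOCAB_SIZE} --n_threads={N_THREADS}"
)
encode_time = timed(
    f"yttm encode --model={MODEL} --output_type=id "
    f"--n_threads={N_THREADS} < {CORPUS} > {ENCODED}"
)
print(f"train: {train_time:.1f}s  tokenization: {encode_time:.1f}s")
```

The same pattern extends to the other two tools by swapping in the `spm_train`/`spm_encode` and fastBPE `learnbpe`/`applybpe` commands that the corresponding interface classes in `speed_test.py` build.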