Merge pull request #6 from HephaestusProject/feature/#3

Feature/#3
HephaestusProject · Sep 30, 2020 · 554644b · 554644b
2 parents fb765b0 + 368f918
commit 554644b
Show file tree

Hide file tree

Showing 5 changed files with 29,876 additions and 3 deletions.
diff --git a/build_tokenizer.py b/build_tokenizer.py
@@ -0,0 +1,53 @@
+import random
+from argparse import ArgumentParser
+from pathlib import Path
+
+from tokenizers import ByteLevelBPETokenizer
+
+
+def main(args):
+    """Sample the comment corpus and train a byte-level BPE tokenizer on it.
+
+    Reads `sample_rate`, `vocab_size` and `min_freq` from `args`; writes the
+    sampled lines to corpus/comment/sample.txt and the model to tokenizers/.
+    """
+    random.seed(42)  # fixed seed so the sampled subset is reproducible
+    proj_dir = Path()
+    tokenizers_dir = proj_dir / "tokenizers"
+    tokenizers_dir.mkdir(parents=True, exist_ok=True)
+
+    comment_dir = proj_dir / "corpus" / "comment"
+    source_path = comment_dir / "20190101_20200611_v2.txt"
+    sample_path = comment_dir / "sample.txt"
+
+    # Keep each source line with probability `sample_rate`. The `with` block
+    # closes both files even if reading or writing raises part-way through.
+    with open(source_path, mode="r", encoding="utf-8") as source_io, open(
+        sample_path, mode="w", encoding="utf-8"
+    ) as sample_io:
+        for line in source_io:
+            if random.random() > (1 - args.sample_rate):
+                sample_io.write(line)
+
+    # Initialize a tokenizer
+    tokenizer = ByteLevelBPETokenizer(add_prefix_space=False)
+
+    # Customize training
+    tokenizer.train(
+        files=str(sample_path),
+        vocab_size=args.vocab_size,
+        min_frequency=args.min_freq,
+        show_progress=True,
+        special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
+    )
+    # Persist the trained model files under the tokenizers/ directory.
+    tokenizer.save_model(directory=str(tokenizers_dir))
+
+
+if __name__ == "__main__":
+    # Command-line entry point: collect the sampling/training options, then run.
+    cli = ArgumentParser()
+    cli.add_argument("--sample_rate", type=float, default=0.1)
+    cli.add_argument("--vocab_size", type=int, default=30000)
+    cli.add_argument("--min_freq", type=int, default=5)
+    main(cli.parse_args())
diff --git a/dev.Dockerfile b/dev.Dockerfile
@@ -0,0 +1,52 @@
+FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
+# MAINTAINER is deprecated; the maintainer label is its documented replacement.
+LABEL maintainer="[email protected]"
+
+# Build toolchain and -dev libraries (used when pyenv compiles Python below),
+# plus JDK and SSH server. The apt lists are removed in the same layer.
+RUN apt-get update && \
+      apt-get install -y sudo apt-utils make build-essential \
+      libssl-dev zlib1g-dev libbz2-dev libreadline-dev \
+      libsqlite3-dev wget curl git libffi-dev liblzma-dev locales \
+      g++ openjdk-8-jdk openssh-server && \
+      rm -rf /var/lib/apt/lists/*
+
+RUN locale-gen en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+
+# Create an unprivileged "user" account (UID overridable at build time) with
+# passwordless sudo.
+ARG UID
+RUN adduser --disabled-password --gecos "" user --uid ${UID:-1000}
+RUN adduser user sudo
+RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+
+USER user
+WORKDIR /home/user
+
+# Install and configure pyenv
+ENV HOME=/home/user
+ENV PYENV_ROOT=$HOME/.pyenv
+ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
+RUN git clone https://github.com/pyenv/pyenv.git .pyenv
+
+# Install Python
+SHELL ["/bin/bash", "-c"]
+RUN pyenv install 3.7.8 && \
+	pyenv global 3.7.8 && \
+	pyenv rehash
+
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+
+# Set the working directory
+WORKDIR /home/user/workspace
+
+# Configure openssh-server
+# NOTE(review): the fallback password is public and build args are visible in
+# `docker history`; always pass a private PASSWD and avoid exposing the SSH port.
+ARG PASSWD
+RUN echo "user:${PASSWD:-hephaestus}" | sudo chpasswd
+RUN sudo sed -i 's_/usr/lib/openssh/sftp-server_internal-sftp_g' /etc/ssh/sshd_config
+# Append (not overwrite) so the default .bashrc created by adduser is kept.
+RUN echo "sudo service ssh start" >> /home/user/.bashrc
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,39 @@
-torch==1.5.0
-torchvision==0.6.0 
-pytorch-lightning==0.8.5
+absl-py==0.10.0
+cachetools==4.1.1
+certifi==2020.6.20
+chardet==3.0.4
+click==7.1.2
+filelock==3.0.12
+future==0.18.2
+google-auth==1.21.3
+google-auth-oauthlib==0.4.1
+grpcio==1.32.0
+idna==2.10
+importlib-metadata==2.0.0
+joblib==0.16.0
+Markdown==3.2.2
+numpy==1.19.2
+oauthlib==3.1.0
+packaging==20.4
+protobuf==3.13.0
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pyparsing==2.4.7
+pytorch-lightning==0.9.0
+PyYAML==5.3.1
+regex==2020.7.14
+requests==2.24.0
+requests-oauthlib==1.3.0
+rsa==4.6
+sacremoses==0.0.43
+sentencepiece==0.1.91
+six==1.15.0
+tensorboard==2.2.0
+tensorboard-plugin-wit==1.7.0
+tokenizers==0.8.1rc2
+torch==1.6.0
+tqdm==4.49.0
+transformers==3.3.0
+urllib3==1.25.10
+Werkzeug==1.0.1
+zipp==3.2.0