Skip to content

Commit

Permalink
Merge pull request #6 from HephaestusProject/feature/#3
Browse files Browse the repository at this point in the history
Feature/#3
  • Loading branch information
seopbo authored Sep 30, 2020
2 parents fb765b0 + 368f918 commit 554644b
Show file tree
Hide file tree
Showing 5 changed files with 29,876 additions and 3 deletions.
53 changes: 53 additions & 0 deletions build_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import random
from argparse import ArgumentParser
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer


def main(args):
    """Sample the comment corpus and train a byte-level BPE tokenizer on it.

    Reads ``corpus/comment/20190101_20200611_v2.txt`` (relative to the current
    working directory), writes a random subset of its lines to
    ``corpus/comment/sample.txt``, trains a ``ByteLevelBPETokenizer`` on that
    sample and saves the resulting model files into ``tokenizers/``.

    Args:
        args: Parsed command-line namespace providing ``sample_rate`` (each
            line is kept when ``random.random() > 1 - sample_rate``),
            ``vocab_size`` and ``min_freq`` (both forwarded to
            ``tokenizer.train``).
    """
    # Fixed seed so the same lines are sampled on every run.
    random.seed(42)
    proj_dir = Path()
    tokenizers_dir = proj_dir / "tokenizers"

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    tokenizers_dir.mkdir(parents=True, exist_ok=True)

    corpus_dir = proj_dir / "corpus"
    comment_dir = corpus_dir / "comment"
    source_path = comment_dir / "20190101_20200611_v2.txt"
    sample_path = comment_dir / "sample.txt"

    # Sample the source line by line. The context managers guarantee both
    # files are flushed and closed even if reading or writing raises.
    with open(source_path, mode="r", encoding="utf-8") as source_io, open(
        sample_path, mode="w", encoding="utf-8"
    ) as sample_io:
        for line in source_io:
            if random.random() > (1 - args.sample_rate):
                sample_io.write(line)

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer(add_prefix_space=False)

    # Customize training
    tokenizer.train(
        files=str(sample_path),
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
    )
    tokenizer.save_model(directory=str(tokenizers_dir))


if __name__ == "__main__":
    # Command-line options as (flag, default, type) triples.
    cli = ArgumentParser()
    for flag, default, kind in (
        ("--sample_rate", 0.1, float),
        ("--vocab_size", 30000, int),
        ("--min_freq", 5, int),
    ):
        cli.add_argument(flag, default=default, type=kind)
    main(cli.parse_args())
43 changes: 43 additions & 0 deletions dev.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Development image built on the CUDA 10.2 / cuDNN 7 devel base for Ubuntu 18.04.
# It creates a passwordless-sudo account named "user", installs Python 3.7.8
# through pyenv, and prepares openssh-server for that account.
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
MAINTAINER [email protected]

# System packages: compiler toolchain and -dev libraries (presumably the build
# dependencies pyenv needs to compile Python - confirm), plus git, a JDK,
# locales and openssh-server.
RUN apt-get update && \
apt-get install -y sudo apt-utils make build-essential \
libssl-dev zlib1g-dev libbz2-dev libreadline-dev \
libsqlite3-dev wget curl git libffi-dev liblzma-dev locales \
g++ openjdk-8-jdk openssh-server

# Generate and select the en_US.UTF-8 locale.
RUN locale-gen en_US.UTF-8
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8

# Create the account "user"; its uid comes from the UID build argument and
# falls back to 1000 when the argument is not supplied.
ARG UID
RUN adduser --disabled-password --gecos "" user --uid ${UID:-1000}
# Add "user" to the sudo group and let that group run sudo without a password.
RUN adduser user sudo
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Every following instruction runs as "user" from its home directory.
USER user
WORKDIR /home/user

# Install and configure pyenv
ENV HOME /home/user
ENV PYENV_ROOT $HOME/.pyenv
ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
# Cloned relative to WORKDIR, i.e. into /home/user/.pyenv, which is PYENV_ROOT.
RUN git clone https://github.com/pyenv/pyenv.git .pyenv

# Install Python
# Subsequent RUN instructions are executed with bash instead of the default shell.
SHELL ["/bin/bash", "-c"]
RUN pyenv install 3.7.8 && \
pyenv global 3.7.8 && \
pyenv rehash

# Upgrade the packaging tools of the interpreter found first on PATH
# (the pyenv shims are listed first in PATH above).
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Set WORKDIR
WORKDIR /home/user/workspace

# Configure openssh-server
# The account password comes from the PASSWD build argument and falls back to
# "hephaestus" when the argument is not supplied.
# NOTE(review): the fallback password is hard-coded in this file - confirm that
# is acceptable wherever this image is exposed.
ARG PASSWD
RUN echo user:${PASSWD:-hephaestus} | sudo chpasswd
# Replace the external sftp-server path in sshd_config with internal-sftp.
RUN sudo sed -i 's_/usr/lib/openssh/sftp-server_internal-sftp_g' /etc/ssh/sshd_config
# Write a .bashrc whose only content starts the ssh service.
# NOTE(review): ">" replaces any existing /home/user/.bashrc rather than
# appending to it - confirm that discarding the default file is intended.
RUN echo "sudo service ssh start" > /home/user/.bashrc
42 changes: 39 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,39 @@
torch==1.5.0
torchvision==0.6.0
pytorch-lightning==0.8.5
absl-py==0.10.0
cachetools==4.1.1
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
filelock==3.0.12
future==0.18.2
google-auth==1.21.3
google-auth-oauthlib==0.4.1
grpcio==1.32.0
idna==2.10
importlib-metadata==2.0.0
joblib==0.16.0
Markdown==3.2.2
numpy==1.19.2
oauthlib==3.1.0
packaging==20.4
protobuf==3.13.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pyparsing==2.4.7
pytorch-lightning==0.9.0
PyYAML==5.3.1
regex==2020.7.14
requests==2.24.0
requests-oauthlib==1.3.0
rsa==4.6
sacremoses==0.0.43
sentencepiece==0.1.91
six==1.15.0
tensorboard==2.2.0
tensorboard-plugin-wit==1.7.0
tokenizers==0.8.1rc2
torch==1.6.0
tqdm==4.49.0
transformers==3.3.0
urllib3==1.25.10
Werkzeug==1.0.1
zipp==3.2.0
Loading

0 comments on commit 554644b

Please sign in to comment.