generated from HephaestusProject/template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Feature/#3
- Loading branch information
Showing
5 changed files
with
29,876 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import random | ||
from argparse import ArgumentParser | ||
from pathlib import Path | ||
|
||
from tokenizers import ByteLevelBPETokenizer | ||
|
||
|
||
def main(args):
    """Sample the comment corpus and train a byte-level BPE tokenizer on it.

    Reads ``corpus/comment/20190101_20200611_v2.txt`` line by line, writes a
    random subset of those lines to ``corpus/comment/sample.txt``, trains a
    ``ByteLevelBPETokenizer`` on the sample, and saves the trained model into
    ``tokenizers/``. All paths are relative to the current working directory.

    Args:
        args: Parsed command-line namespace providing ``sample_rate``,
            ``vocab_size`` and ``min_freq``.
    """
    # Fixed seed so that every run samples the same lines.
    random.seed(42)
    proj_dir = Path()
    tokenizers_dir = proj_dir / "tokenizers"
    tokenizers_dir.mkdir(parents=True, exist_ok=True)

    corpus_dir = proj_dir / "corpus"
    comment_dir = corpus_dir / "comment"
    source_path = comment_dir / "20190101_20200611_v2.txt"
    sample_path = comment_dir / "sample.txt"

    # Sample the source: a line is kept when the random draw exceeds the
    # threshold. Context managers close both files even if reading or
    # writing fails part-way through.
    keep_threshold = 1 - args.sample_rate
    with open(source_path, mode="r", encoding="utf-8") as source_io:
        with open(sample_path, mode="w", encoding="utf-8") as sample_io:
            for line in source_io:
                if random.random() > keep_threshold:
                    sample_io.write(line)

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer(add_prefix_space=False)

    # Customize training
    tokenizer.train(
        files=str(sample_path),
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
    )
    tokenizer.save_model(directory=str(tokenizers_dir))
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the sampling and training options and
    # hand them to main().
    parser = ArgumentParser(
        description="Sample the comment corpus and train a byte-level BPE tokenizer."
    )
    parser.add_argument(
        "--sample_rate",
        default=0.1,
        type=float,
        help="probability of keeping each source line in the sample "
        "(default: %(default)s)",
    )
    parser.add_argument(
        "--vocab_size",
        default=30000,
        type=int,
        help="vocabulary size passed to the tokenizer trainer "
        "(default: %(default)s)",
    )
    parser.add_argument(
        "--min_freq",
        default=5,
        type=int,
        help="minimum frequency passed to the tokenizer trainer "
        "(default: %(default)s)",
    )
    args = parser.parse_args()
    main(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
# MAINTAINER is deprecated; a LABEL carries the same metadata.
LABEL maintainer="[email protected]"

# System packages: build toolchain and development libraries, JDK 8 and an
# SSH server. DEBIAN_FRONTEND=noninteractive keeps package configuration from
# waiting on a prompt during the build; the apt lists are removed afterwards
# so they are not stored in the image layer.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y sudo apt-utils make build-essential \
    libssl-dev zlib1g-dev libbz2-dev libreadline-dev \
    libsqlite3-dev wget curl git libffi-dev liblzma-dev locales \
    g++ openjdk-8-jdk openssh-server && \
    rm -rf /var/lib/apt/lists/*

RUN locale-gen en_US.UTF-8
ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8

# Create an unprivileged "user" account (UID overridable with --build-arg UID,
# default 1000) that has passwordless sudo.
ARG UID
RUN adduser --disabled-password --gecos "" user --uid ${UID:-1000}
RUN adduser user sudo
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

USER user
WORKDIR /home/user

# Install / configure pyenv
ENV HOME=/home/user
ENV PYENV_ROOT=$HOME/.pyenv
ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
RUN git clone https://github.com/pyenv/pyenv.git .pyenv

# Install Python
SHELL ["/bin/bash", "-c"]
RUN pyenv install 3.7.8 && \
    pyenv global 3.7.8 && \
    pyenv rehash

RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Set WORKDIR
WORKDIR /home/user/workspace

# Configure openssh-server
# NOTE(review): the password comes from a build argument (with a hard-coded
# fallback) and therefore remains visible in the image history; consider
# injecting it at runtime instead.
ARG PASSWD
RUN echo user:${PASSWD:-hephaestus} | sudo chpasswd
RUN sudo sed -i 's_/usr/lib/openssh/sftp-server_internal-sftp_g' /etc/ssh/sshd_config
# NOTE(review): '>' replaces any existing ~/.bashrc with this single line;
# switch to '>>' if the default file should be kept.
RUN echo "sudo service ssh start" > /home/user/.bashrc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,39 @@ | ||
# Each package is pinned exactly once. The older torch==1.5.0 and
# pytorch-lightning==0.8.5 pins were dropped in favour of the newer pins
# below, because pip refuses a file that pins the same package twice.
# torchvision==0.6.0 was dropped with them: it is built against torch 1.5.0
# and conflicts with torch==1.6.0 (torchvision==0.7.0 is the matching release
# if torchvision is needed).
absl-py==0.10.0
cachetools==4.1.1
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
filelock==3.0.12
future==0.18.2
google-auth==1.21.3
google-auth-oauthlib==0.4.1
grpcio==1.32.0
idna==2.10
importlib-metadata==2.0.0
joblib==0.16.0
Markdown==3.2.2
numpy==1.19.2
oauthlib==3.1.0
packaging==20.4
protobuf==3.13.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pyparsing==2.4.7
pytorch-lightning==0.9.0
PyYAML==5.3.1
regex==2020.7.14
requests==2.24.0
requests-oauthlib==1.3.0
rsa==4.6
sacremoses==0.0.43
sentencepiece==0.1.91
six==1.15.0
tensorboard==2.2.0
tensorboard-plugin-wit==1.7.0
tokenizers==0.8.1rc2
torch==1.6.0
tqdm==4.49.0
transformers==3.3.0
urllib3==1.25.10
Werkzeug==1.0.1
zipp==3.2.0
Oops, something went wrong.