Skip to content

Commit

Permalink
Merge pull request #75 from amjltc295/write_epochs_summary
Browse files Browse the repository at this point in the history
Write epochs summary into a csv file; Link resumed checkpoint
  • Loading branch information
amjltc295 authored Jan 14, 2020
2 parents d4f6598 + a0353e6 commit 49d3fcf
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 72 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ bash Miniconda3-latest-Linux-x86_64.sh
3. Build conda environment from file
```
cd pytorch-golden-template
conda env create -f environment.yml
conda create -n PytorchGoldenTemplate python">=3.6" pytorch">=1.0" torchvision tensorboard pillow">=6.1" pandas coloredlogs imageio scipy -c pytorch
source activate PytorchGoldenTemplate
pip install attrdict tensorboardX
# Or, alternatively, recreate the pinned environment from the provided file:
conda env create -f environment.yaml
```

4. Update submodules
Expand Down
65 changes: 65 additions & 0 deletions environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
name: PytorchGoldenTemplate
channels:
- pytorch
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- absl-py=0.8.1=py37_0
- blas=1.0=mkl
- c-ares=1.15.0=h7b6447c_1001
- ca-certificates=2019.11.27=0
- certifi=2019.11.28=py37_0
- cffi=1.13.2=py37h2e261b9_0
- coloredlogs=10.0=py37_0
- cudatoolkit=10.0.130=0
- freetype=2.9.1=h8a8886c_1
- grpcio=1.16.1=py37hf8bcb03_1
- humanfriendly=4.18=py37_0
- imageio=2.6.1=py37_0
- intel-openmp=2019.4=243
- jpeg=9b=h024ee3a_2
- ld_impl_linux-64=2.33.1=h53a641e_7
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=9.1.0=hdf63c60_0
- libgfortran-ng=7.3.0=hdf63c60_0
- libpng=1.6.37=hbc83047_0
- libprotobuf=3.11.2=hd408876_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- libtiff=4.1.0=h2733197_0
- markdown=3.1.1=py37_0
- mkl=2019.4=243
- mkl-service=2.3.0=py37he904b0f_0
- mkl_fft=1.0.15=py37ha843d7b_0
- mkl_random=1.1.0=py37hd6b4f25_0
- ncurses=6.1=he6710b0_1
- ninja=1.9.0=py37hfd86e86_0
- numpy=1.17.4=py37hc1035e2_0
- numpy-base=1.17.4=py37hde5b4d6_0
- olefile=0.46=py_0
- openssl=1.1.1d=h7b6447c_3
- pandas=0.25.3=py37he6710b0_0
- pillow=6.1.0=py37h34e0f95_0
- pip=19.3.1=py37_0
- protobuf=3.11.2=py37he6710b0_0
- pycparser=2.19=py_0
- python=3.7.6=h0371630_2
- python-dateutil=2.8.1=py_0
- pytorch=1.0.0=py3.7_cuda9.0.176_cudnn7.4.1_1
- pytz=2019.3=py_0
- readline=7.0=h7b6447c_5
- scipy=1.3.2=py37h7c811a0_0
- setuptools=44.0.0=py37_0
- six=1.13.0=py37_0
- sqlite=3.30.1=h7b6447c_0
- tensorboard=2.0.0=pyhb38c66f_1
- tk=8.6.8=hbc83047_0
- torchvision=0.2.2=py_3
- werkzeug=0.16.0=py_0
- wheel=0.33.6=py37_0
- xz=5.2.4=h14c3975_4
- zlib=1.2.11=h7b6447c_3
- zstd=1.3.7=h0b5b093_0
- pip:
- attrdict==2.0.1
- tensorboardx==2.0
70 changes: 0 additions & 70 deletions environment.yml

This file was deleted.

13 changes: 12 additions & 1 deletion src/pipeline/base_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from abc import ABC, abstractmethod

import torch
import pandas as pd

from utils.util import get_instance
from utils.visualization import WriterTensorboard
Expand Down Expand Up @@ -235,10 +236,12 @@ def _resume_model_params(self, resumed_checkpoint):
model.load_state_dict(resumed_checkpoint['state_dict'])

def _print_and_write_log(self, epoch, worker_outputs, write=True):
# print common worker logged info
# Print the epoch summary of each worker
# and append the summary values to the summary csv file.
if write:
self.writer.set_step(epoch, 'epoch_average') # TODO: See if we can use tree-structured tensorboard logging
logger.info(f' epoch: {epoch:d}')
epoch_record = {'epoch': epoch}
# print the logged info for each loader (corresponding to each worker)
for loader_name, output in worker_outputs.items():
log = output['log']
Expand All @@ -248,5 +251,13 @@ def _print_and_write_log(self, epoch, worker_outputs, write=True):
if global_config.verbosity >= 1:
logger.info(f' {str(key):20s}: {value:.4f}')
if 'elapsed_time' not in key and write:
value = value.item() if isinstance(value, torch.Tensor) else value
epoch_record[f'{loader_name}_{key}'] = [value]
# TODO: See if we can use tree-structured tensorboard logging
self.writer.add_scalar(f'{loader_name}_{key}', value)

# append this epoch's summary row to 'epochs_summary.csv' (the file is created if it does not exist yet)
new_df = pd.DataFrame(epoch_record)
csv_file = os.path.join(self.saving_dir, 'epochs_summary.csv')
df = pd.concat([pd.read_csv(csv_file), new_df]) if os.path.exists(csv_file) else new_df
df.to_csv(csv_file, index=False)
6 changes: 6 additions & 0 deletions src/pipeline/training_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ def _create_saving_dir(self, args):
saving_dir = os.path.join(global_config['trainer']['save_dir'], args.ckpts_subdir,
global_config['name'], self.start_time)
ensure_dir(saving_dir)

# create a link to the resumed checkpoint as a reference
if args.resume is not None:
link = os.path.join(saving_dir, 'resumed_ckpt.pth')
os.symlink(os.path.abspath(args.resume), link)

return saving_dir

def _setup_loss_functions(self):
Expand Down

0 comments on commit 49d3fcf

Please sign in to comment.