From ec9be6c30e9cfce451dd9449c772dbdda820b53f Mon Sep 17 00:00:00 2001 From: Aayush Garg Date: Fri, 6 May 2022 12:59:19 +0530 Subject: [PATCH] add pl---hydra implementation --- pl-hydra/configs/callbacks/default.yaml | 28 +++ pl-hydra/configs/callbacks/none.yaml | 0 pl-hydra/configs/datamodule/cifar10.yaml | 11 + pl-hydra/configs/debug/default.yaml | 28 +++ pl-hydra/configs/debug/limit_batches.yaml | 12 ++ pl-hydra/configs/debug/overfit.yaml | 10 + pl-hydra/configs/debug/profiler.yaml | 12 ++ pl-hydra/configs/debug/step.yaml | 9 + pl-hydra/configs/debug/test_only.yaml | 9 + pl-hydra/configs/experiment/example.yaml | 38 ++++ .../configs/hparams_search/mnist_optuna.yaml | 60 ++++++ pl-hydra/configs/local/.gitkeep | 0 pl-hydra/configs/log_dir/debug.yaml | 8 + pl-hydra/configs/log_dir/default.yaml | 15 ++ pl-hydra/configs/log_dir/evaluation.yaml | 8 + pl-hydra/configs/logger/comet.yaml | 7 + pl-hydra/configs/logger/csv.yaml | 7 + pl-hydra/configs/logger/many_loggers.yaml | 9 + pl-hydra/configs/logger/mlflow.yaml | 9 + pl-hydra/configs/logger/neptune.yaml | 11 + pl-hydra/configs/logger/tensorboard.yaml | 11 + pl-hydra/configs/logger/wandb.yaml | 15 ++ pl-hydra/configs/model/cifar10_densenet.yaml | 10 + pl-hydra/configs/model/cifar10_googlenet.yaml | 6 + pl-hydra/configs/model/cifar10_resnet.yaml | 8 + pl-hydra/configs/model/cifar10_vgg11.yaml | 6 + pl-hydra/configs/model/cifar10_vit.yaml | 13 ++ pl-hydra/configs/optim/optim_adam.yaml | 11 + pl-hydra/configs/optim/optim_adam_vit.yaml | 10 + pl-hydra/configs/optim/optim_sgd.yaml | 12 ++ pl-hydra/configs/test.yaml | 32 +++ pl-hydra/configs/train.yaml | 71 +++++++ pl-hydra/configs/trainer/ddp.yaml | 6 + pl-hydra/configs/trainer/default.yaml | 12 ++ pl-hydra/notebooks/.gitkeep | 0 pl-hydra/notebooks/make_vgg11.ipynb | 196 +++++++++++++++++ pl-hydra/scripts/schedule.sh | 7 + pl-hydra/setup.cfg | 36 ++++ pl-hydra/src/__init__.py | 0 pl-hydra/src/datamodules/__init__.py | 0 .../src/datamodules/cifar10_datamodule.py | 128 +++++++++++ .../src/datamodules/components/__init__.py | 0 pl-hydra/src/datamodules/mnist_datamodule.py | 106 ++++++++++ pl-hydra/src/models/__init__.py | 0 pl-hydra/src/models/cifar10_module.py | 198 ++++++++++++++++++ pl-hydra/src/models/components/__init__.py | 0 pl-hydra/src/models/components/densenet.py | 153 ++++++++++++++ pl-hydra/src/models/components/googlenet.py | 132 ++++++++++++ pl-hydra/src/models/components/resnet.py | 135 ++++++++++++ pl-hydra/src/models/components/vgg.py | 111 ++++++++++ pl-hydra/src/models/components/vit.py | 128 +++++++++++ pl-hydra/src/testing_pipeline.py | 57 +++++ pl-hydra/src/training_pipeline.py | 126 +++++++++++ pl-hydra/src/utils/__init__.py | 164 +++++++++++++++ pl-hydra/src/utils/plotter.py | 37 ++++ pl-hydra/src/vendor/__init__.py | 1 + pl-hydra/test.py | 26 +++ pl-hydra/tests/__init__.py | 0 pl-hydra/tests/helpers/__init__.py | 0 pl-hydra/tests/helpers/module_available.py | 28 +++ pl-hydra/tests/helpers/run_command.py | 15 ++ pl-hydra/tests/helpers/runif.py | 104 +++++++++ pl-hydra/tests/shell/__init__.py | 0 pl-hydra/tests/shell/test_basic_commands.py | 58 +++++ pl-hydra/tests/shell/test_debug_configs.py | 35 ++++ pl-hydra/tests/shell/test_sweeps.py | 44 ++++ pl-hydra/tests/unit/__init__.py | 0 pl-hydra/tests/unit/test_mnist_datamodule.py | 36 ++++ pl-hydra/train.py | 33 +++ 69 files changed, 2608 insertions(+) create mode 100644 pl-hydra/configs/callbacks/default.yaml create mode 100644 pl-hydra/configs/callbacks/none.yaml create mode 100644 pl-hydra/configs/datamodule/cifar10.yaml create 
mode 100644 pl-hydra/configs/debug/default.yaml create mode 100644 pl-hydra/configs/debug/limit_batches.yaml create mode 100644 pl-hydra/configs/debug/overfit.yaml create mode 100644 pl-hydra/configs/debug/profiler.yaml create mode 100644 pl-hydra/configs/debug/step.yaml create mode 100644 pl-hydra/configs/debug/test_only.yaml create mode 100644 pl-hydra/configs/experiment/example.yaml create mode 100644 pl-hydra/configs/hparams_search/mnist_optuna.yaml create mode 100644 pl-hydra/configs/local/.gitkeep create mode 100644 pl-hydra/configs/log_dir/debug.yaml create mode 100644 pl-hydra/configs/log_dir/default.yaml create mode 100644 pl-hydra/configs/log_dir/evaluation.yaml create mode 100644 pl-hydra/configs/logger/comet.yaml create mode 100644 pl-hydra/configs/logger/csv.yaml create mode 100644 pl-hydra/configs/logger/many_loggers.yaml create mode 100644 pl-hydra/configs/logger/mlflow.yaml create mode 100644 pl-hydra/configs/logger/neptune.yaml create mode 100644 pl-hydra/configs/logger/tensorboard.yaml create mode 100644 pl-hydra/configs/logger/wandb.yaml create mode 100644 pl-hydra/configs/model/cifar10_densenet.yaml create mode 100644 pl-hydra/configs/model/cifar10_googlenet.yaml create mode 100644 pl-hydra/configs/model/cifar10_resnet.yaml create mode 100644 pl-hydra/configs/model/cifar10_vgg11.yaml create mode 100644 pl-hydra/configs/model/cifar10_vit.yaml create mode 100644 pl-hydra/configs/optim/optim_adam.yaml create mode 100644 pl-hydra/configs/optim/optim_adam_vit.yaml create mode 100644 pl-hydra/configs/optim/optim_sgd.yaml create mode 100644 pl-hydra/configs/test.yaml create mode 100644 pl-hydra/configs/train.yaml create mode 100644 pl-hydra/configs/trainer/ddp.yaml create mode 100644 pl-hydra/configs/trainer/default.yaml create mode 100644 pl-hydra/notebooks/.gitkeep create mode 100644 pl-hydra/notebooks/make_vgg11.ipynb create mode 100644 pl-hydra/scripts/schedule.sh create mode 100644 pl-hydra/setup.cfg create mode 100644 pl-hydra/src/__init__.py create mode 100644 pl-hydra/src/datamodules/__init__.py create mode 100644 pl-hydra/src/datamodules/cifar10_datamodule.py create mode 100644 pl-hydra/src/datamodules/components/__init__.py create mode 100644 pl-hydra/src/datamodules/mnist_datamodule.py create mode 100644 pl-hydra/src/models/__init__.py create mode 100644 pl-hydra/src/models/cifar10_module.py create mode 100644 pl-hydra/src/models/components/__init__.py create mode 100644 pl-hydra/src/models/components/densenet.py create mode 100644 pl-hydra/src/models/components/googlenet.py create mode 100644 pl-hydra/src/models/components/resnet.py create mode 100644 pl-hydra/src/models/components/vgg.py create mode 100644 pl-hydra/src/models/components/vit.py create mode 100644 pl-hydra/src/testing_pipeline.py create mode 100644 pl-hydra/src/training_pipeline.py create mode 100644 pl-hydra/src/utils/__init__.py create mode 100644 pl-hydra/src/utils/plotter.py create mode 100644 pl-hydra/src/vendor/__init__.py create mode 100644 pl-hydra/test.py create mode 100644 pl-hydra/tests/__init__.py create mode 100644 pl-hydra/tests/helpers/__init__.py create mode 100644 pl-hydra/tests/helpers/module_available.py create mode 100644 pl-hydra/tests/helpers/run_command.py create mode 100644 pl-hydra/tests/helpers/runif.py create mode 100644 pl-hydra/tests/shell/__init__.py create mode 100644 pl-hydra/tests/shell/test_basic_commands.py create mode 100644 pl-hydra/tests/shell/test_debug_configs.py create mode 100644 pl-hydra/tests/shell/test_sweeps.py create mode 100644 
pl-hydra/tests/unit/__init__.py create mode 100644 pl-hydra/tests/unit/test_mnist_datamodule.py create mode 100644 pl-hydra/train.py diff --git a/pl-hydra/configs/callbacks/default.yaml b/pl-hydra/configs/callbacks/default.yaml new file mode 100644 index 0000000..29fa956 --- /dev/null +++ b/pl-hydra/configs/callbacks/default.yaml @@ -0,0 +1,28 @@ +model_checkpoint: + _target_: pytorch_lightning.callbacks.ModelCheckpoint + monitor: "val/acc" # name of the logged metric which determines when model is improving + mode: "max" # "max" means higher metric value is better, can be also "min" + save_top_k: 3 # save k best models (determined by above metric) + save_last: True # additionaly always save model from last epoch + verbose: False + dirpath: "checkpoints/" + filename: "epoch_{epoch:03d}" + auto_insert_metric_name: False + +early_stopping: + _target_: pytorch_lightning.callbacks.EarlyStopping + monitor: "val/acc" # name of the logged metric which determines when model is improving + mode: "max" # "max" means higher metric value is better, can be also "min" + patience: 100 # how many validation epochs of not improving until training stops + min_delta: 0 # minimum change in the monitored metric needed to qualify as an improvement + +model_summary: + _target_: pytorch_lightning.callbacks.RichModelSummary + max_depth: -1 + +rich_progress_bar: + _target_: pytorch_lightning.callbacks.RichProgressBar + +learning_rate_monitor: + _target_: pytorch_lightning.callbacks.LearningRateMonitor + logging_interval: epoch diff --git a/pl-hydra/configs/callbacks/none.yaml b/pl-hydra/configs/callbacks/none.yaml new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/configs/datamodule/cifar10.yaml b/pl-hydra/configs/datamodule/cifar10.yaml new file mode 100644 index 0000000..f5528b5 --- /dev/null +++ b/pl-hydra/configs/datamodule/cifar10.yaml @@ -0,0 +1,11 @@ +_target_: src.datamodules.cifar10_datamodule.CIFAR10DataModule + +data_dir: ${data_dir} # data_dir is specified in config.yaml +batch_size: 128 +num_workers: 4 +pin_memory: True +data_mean: [0.49421428, 0.48513139, 0.45040909] +data_std: [0.24665252, 0.24289226, 0.26159238] +image_size: [32, 32] +scale_bounds: [0.8, 1.0] +aspect_bounds: [0.9, 1.1] diff --git a/pl-hydra/configs/debug/default.yaml b/pl-hydra/configs/debug/default.yaml new file mode 100644 index 0000000..8dfb104 --- /dev/null +++ b/pl-hydra/configs/debug/default.yaml @@ -0,0 +1,28 @@ +# @package _global_ + +# default debugging setup, runs 1 full epoch +# other debugging configs can inherit from this one + +defaults: + - override /log_dir: debug.yaml + +trainer: + max_epochs: 1 + gpus: 0 # debuggers don't like gpus + detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor + track_grad_norm: 2 # track gradient norm with loggers + +datamodule: + num_workers: 0 # debuggers don't like multiprocessing + pin_memory: False # disable gpu memory pin + +# sets level of all command line loggers to 'DEBUG' +# https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ +hydra: + verbose: True + + # use this to set level of only chosen command line loggers to 'DEBUG': + # verbose: [src.train, src.utils] + +# config is already printed by hydra when `hydra/verbose: True` +print_config: False diff --git a/pl-hydra/configs/debug/limit_batches.yaml b/pl-hydra/configs/debug/limit_batches.yaml new file mode 100644 index 0000000..cc28852 --- /dev/null +++ b/pl-hydra/configs/debug/limit_batches.yaml @@ -0,0 +1,12 @@ +# @package _global_ + +# uses only 1% of the training data 
and 5% of validation/test data + +defaults: + - default.yaml + +trainer: + max_epochs: 3 + limit_train_batches: 0.01 + limit_val_batches: 0.05 + limit_test_batches: 0.05 diff --git a/pl-hydra/configs/debug/overfit.yaml b/pl-hydra/configs/debug/overfit.yaml new file mode 100644 index 0000000..2ce654b --- /dev/null +++ b/pl-hydra/configs/debug/overfit.yaml @@ -0,0 +1,10 @@ +# @package _global_ + +# overfits to 3 batches + +defaults: + - default.yaml + +trainer: + max_epochs: 20 + overfit_batches: 3 diff --git a/pl-hydra/configs/debug/profiler.yaml b/pl-hydra/configs/debug/profiler.yaml new file mode 100644 index 0000000..e18df1c --- /dev/null +++ b/pl-hydra/configs/debug/profiler.yaml @@ -0,0 +1,12 @@ +# @package _global_ + +# runs with execution time profiling + +defaults: + - default.yaml + +trainer: + max_epochs: 1 + profiler: "simple" + # profiler: "advanced" + # profiler: "pytorch" diff --git a/pl-hydra/configs/debug/step.yaml b/pl-hydra/configs/debug/step.yaml new file mode 100644 index 0000000..98eba22 --- /dev/null +++ b/pl-hydra/configs/debug/step.yaml @@ -0,0 +1,9 @@ +# @package _global_ + +# runs 1 train, 1 validation and 1 test step + +defaults: + - default.yaml + +trainer: + fast_dev_run: true diff --git a/pl-hydra/configs/debug/test_only.yaml b/pl-hydra/configs/debug/test_only.yaml new file mode 100644 index 0000000..79dc34a --- /dev/null +++ b/pl-hydra/configs/debug/test_only.yaml @@ -0,0 +1,9 @@ +# @package _global_ + +# runs only test epoch + +defaults: + - default.yaml + +train: False +test: True diff --git a/pl-hydra/configs/experiment/example.yaml b/pl-hydra/configs/experiment/example.yaml new file mode 100644 index 0000000..305d96c --- /dev/null +++ b/pl-hydra/configs/experiment/example.yaml @@ -0,0 +1,38 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=example + +defaults: + - override /datamodule: mnist.yaml + - override /model: mnist.yaml + - override /callbacks: default.yaml + - override /logger: null + - override /trainer: default.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +# name of the run determines folder name in logs +name: "simple_dense_net" + +seed: 12345 + +trainer: + min_epochs: 10 + max_epochs: 10 + gradient_clip_val: 0.5 + +model: + lr: 0.002 + net: + lin1_size: 128 + lin2_size: 256 + lin3_size: 64 + +datamodule: + batch_size: 64 + +logger: + wandb: + tags: ["mnist", "${name}"] diff --git a/pl-hydra/configs/hparams_search/mnist_optuna.yaml b/pl-hydra/configs/hparams_search/mnist_optuna.yaml new file mode 100644 index 0000000..7de2b44 --- /dev/null +++ b/pl-hydra/configs/hparams_search/mnist_optuna.yaml @@ -0,0 +1,60 @@ +# @package _global_ + +# example hyperparameter optimization of some experiment with Optuna: +# python train.py -m hparams_search=mnist_optuna experiment=example + +defaults: + - override /hydra/sweeper: optuna + +# choose metric which will be optimized by Optuna +# make sure this is the correct name of some metric logged in lightning module! 
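The `optimized_metric` key that follows names the metric Optuna will optimize, and it has to match a metric the LightningModule actually logs. For context, a minimal sketch of the contract the sweeper relies on: the function decorated with `@hydra.main` returns that metric's value. This snippet is illustrative only; the `config_name` and the `build_from_config` helper are assumptions, not code from this patch.

# Illustrative sketch: how an Optuna sweep consumes the logged metric.
# `build_from_config` is a hypothetical helper; the actual entry point is train.py.
import hydra
from omegaconf import DictConfig

@hydra.main(config_path="configs/", config_name="train")
def main(config: DictConfig) -> float:
    trainer, model, datamodule = build_from_config(config)  # hypothetical: instantiate via hydra.utils
    trainer.fit(model, datamodule=datamodule)
    # Must name a logged metric, e.g. "val/acc_best"; the sweeper optimizes this return value.
    return trainer.callback_metrics[config.optimized_metric].item()

if __name__ == "__main__":
    main()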
+optimized_metric: "val/acc_best" + +# here we define Optuna hyperparameter search +# it optimizes for value returned from function with @hydra.main decorator +# docs: https://hydra.cc/docs/next/plugins/optuna_sweeper +hydra: + sweeper: + _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper + + # storage URL to persist optimization results + # for example, you can use SQLite if you set 'sqlite:///example.db' + storage: null + + # name of the study to persist optimization results + study_name: null + + # number of parallel workers + n_jobs: 1 + + # 'minimize' or 'maximize' the objective + direction: maximize + + # total number of runs that will be executed + n_trials: 25 + + # choose Optuna hyperparameter sampler + # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html + sampler: + _target_: optuna.samplers.TPESampler + seed: 12345 + n_startup_trials: 10 # number of random sampling runs before optimization starts + + # define range of hyperparameters + search_space: + datamodule.batch_size: + type: categorical + choices: [32, 64, 128] + model.lr: + type: float + low: 0.0001 + high: 0.2 + model.net.lin1_size: + type: categorical + choices: [32, 64, 128, 256, 512] + model.net.lin2_size: + type: categorical + choices: [32, 64, 128, 256, 512] + model.net.lin3_size: + type: categorical + choices: [32, 64, 128, 256, 512] diff --git a/pl-hydra/configs/local/.gitkeep b/pl-hydra/configs/local/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/configs/log_dir/debug.yaml b/pl-hydra/configs/log_dir/debug.yaml new file mode 100644 index 0000000..83db732 --- /dev/null +++ b/pl-hydra/configs/log_dir/debug.yaml @@ -0,0 +1,8 @@ +# @package _global_ + +hydra: + run: + dir: logs/debugs/runs/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} + sweep: + dir: logs/debugs/multiruns/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} + subdir: ${hydra.job.num} diff --git a/pl-hydra/configs/log_dir/default.yaml b/pl-hydra/configs/log_dir/default.yaml new file mode 100644 index 0000000..3868729 --- /dev/null +++ b/pl-hydra/configs/log_dir/default.yaml @@ -0,0 +1,15 @@ +# @package _global_ + +hydra: + run: + dir: logs/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} + sweep: + dir: logs/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} + subdir: ${hydra.job.num} + +# hydra: +# run: +# dir: logs/experiments/runs/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +# sweep: +# dir: logs/experiments/multiruns/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +# subdir: ${hydra.job.num} diff --git a/pl-hydra/configs/log_dir/evaluation.yaml b/pl-hydra/configs/log_dir/evaluation.yaml new file mode 100644 index 0000000..a8de069 --- /dev/null +++ b/pl-hydra/configs/log_dir/evaluation.yaml @@ -0,0 +1,8 @@ +# @package _global_ + +hydra: + run: + dir: logs/evaluations/runs/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} + sweep: + dir: logs/evaluations/multiruns/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} + subdir: ${hydra.job.num} diff --git a/pl-hydra/configs/logger/comet.yaml b/pl-hydra/configs/logger/comet.yaml new file mode 100644 index 0000000..6ac99f4 --- /dev/null +++ b/pl-hydra/configs/logger/comet.yaml @@ -0,0 +1,7 @@ +# https://www.comet.ml + +comet: + _target_: pytorch_lightning.loggers.comet.CometLogger + api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable + project_name: "template-tests" + experiment_name: ${name} diff --git a/pl-hydra/configs/logger/csv.yaml b/pl-hydra/configs/logger/csv.yaml new file mode 100644 index 0000000..aaec6d7 --- /dev/null +++ b/pl-hydra/configs/logger/csv.yaml @@ -0,0 
+1,7 @@ +# csv logger built in lightning + +csv: + _target_: pytorch_lightning.loggers.csv_logs.CSVLogger + save_dir: "." + name: "csv/" + prefix: "" diff --git a/pl-hydra/configs/logger/many_loggers.yaml b/pl-hydra/configs/logger/many_loggers.yaml new file mode 100644 index 0000000..801444d --- /dev/null +++ b/pl-hydra/configs/logger/many_loggers.yaml @@ -0,0 +1,9 @@ +# train with many loggers at once + +defaults: + # - comet.yaml + - csv.yaml + # - mlflow.yaml + # - neptune.yaml + - tensorboard.yaml + - wandb.yaml diff --git a/pl-hydra/configs/logger/mlflow.yaml b/pl-hydra/configs/logger/mlflow.yaml new file mode 100644 index 0000000..130d3de --- /dev/null +++ b/pl-hydra/configs/logger/mlflow.yaml @@ -0,0 +1,9 @@ +# https://mlflow.org + +mlflow: + _target_: pytorch_lightning.loggers.mlflow.MLFlowLogger + experiment_name: ${name} + tracking_uri: ${original_work_dir}/logs/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI + tags: null + prefix: "" + artifact_location: null diff --git a/pl-hydra/configs/logger/neptune.yaml b/pl-hydra/configs/logger/neptune.yaml new file mode 100644 index 0000000..117af93 --- /dev/null +++ b/pl-hydra/configs/logger/neptune.yaml @@ -0,0 +1,11 @@ +# https://neptune.ai + +neptune: + _target_: pytorch_lightning.loggers.neptune.NeptuneLogger + api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable + project_name: your_name/template-tests + close_after_fit: True + offline_mode: False + experiment_name: ${name} + experiment_id: null + prefix: "" diff --git a/pl-hydra/configs/logger/tensorboard.yaml b/pl-hydra/configs/logger/tensorboard.yaml new file mode 100644 index 0000000..730c8e9 --- /dev/null +++ b/pl-hydra/configs/logger/tensorboard.yaml @@ -0,0 +1,11 @@ +# https://www.tensorflow.org/tensorboard/ + +tensorboard: + _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger + save_dir: "tensorboard/" + name: null + version: ${name} + log_graph: True + default_hp_metric: True + prefix: "" + \ No newline at end of file diff --git a/pl-hydra/configs/logger/wandb.yaml b/pl-hydra/configs/logger/wandb.yaml new file mode 100644 index 0000000..df6024b --- /dev/null +++ b/pl-hydra/configs/logger/wandb.yaml @@ -0,0 +1,15 @@ +# https://wandb.ai + +wandb: + _target_: pytorch_lightning.loggers.wandb.WandbLogger + project: "template-tests" + # name: ${name} + save_dir: "." + offline: False # set True to store all logs only locally + id: null # pass correct id to resume experiment! 
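Each logger config in this group maps a short name (csv, wandb, tensorboard, ...) to a `_target_` class plus its constructor arguments. A minimal sketch of the pattern, assuming the composed `logger` group arrives as a DictConfig; this is not the utility code shipped in src/utils, just an illustration.

# Sketch: turn a composed `logger` config group into PyTorch Lightning logger objects.
# Assumes a mapping like {"wandb": {...}, "tensorboard": {...}}; not taken from this patch.
import hydra
from omegaconf import DictConfig

def instantiate_loggers(logger_cfg: DictConfig) -> list:
    loggers = []
    for _, conf in logger_cfg.items():
        if isinstance(conf, DictConfig) and "_target_" in conf:
            # hydra.utils.instantiate builds the class named in `_target_` with the remaining keys
            loggers.append(hydra.utils.instantiate(conf))
    return loggers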
+ # entity: "" # set to name of your wandb team + log_model: False + prefix: "" + job_type: "train" + group: "" + tags: [] diff --git a/pl-hydra/configs/model/cifar10_densenet.yaml b/pl-hydra/configs/model/cifar10_densenet.yaml new file mode 100644 index 0000000..be9f459 --- /dev/null +++ b/pl-hydra/configs/model/cifar10_densenet.yaml @@ -0,0 +1,10 @@ +_target_: src.models.cifar10_module.CIFAR10LitModule + +net: + _target_: src.models.components.densenet.DenseNet + num_classes: 10 + num_layers: [6,6,6,6] + bn_size: 2 + growth_rate: 16 + act_fn_by_name: relu + diff --git a/pl-hydra/configs/model/cifar10_googlenet.yaml b/pl-hydra/configs/model/cifar10_googlenet.yaml new file mode 100644 index 0000000..8efd5c0 --- /dev/null +++ b/pl-hydra/configs/model/cifar10_googlenet.yaml @@ -0,0 +1,6 @@ +_target_: src.models.cifar10_module.CIFAR10LitModule + +net: + _target_: src.models.components.googlenet.GoogleNet + num_classes: 10 + act_fn_by_name: relu diff --git a/pl-hydra/configs/model/cifar10_resnet.yaml b/pl-hydra/configs/model/cifar10_resnet.yaml new file mode 100644 index 0000000..9770ba5 --- /dev/null +++ b/pl-hydra/configs/model/cifar10_resnet.yaml @@ -0,0 +1,8 @@ +_target_: src.models.cifar10_module.CIFAR10LitModule + +net: + _target_: src.models.components.resnet.ResNet + num_classes: 10 + num_blocks: [3,3,3] + c_hidden: [16,32,64] + act_fn_by_name: relu diff --git a/pl-hydra/configs/model/cifar10_vgg11.yaml b/pl-hydra/configs/model/cifar10_vgg11.yaml new file mode 100644 index 0000000..6e1defc --- /dev/null +++ b/pl-hydra/configs/model/cifar10_vgg11.yaml @@ -0,0 +1,6 @@ +_target_: src.models.cifar10_module.CIFAR10LitModule + +net: + _target_: src.models.components.vgg.VGG11 + num_classes: 10 + act_fn_by_name: relu diff --git a/pl-hydra/configs/model/cifar10_vit.yaml b/pl-hydra/configs/model/cifar10_vit.yaml new file mode 100644 index 0000000..24e4bc9 --- /dev/null +++ b/pl-hydra/configs/model/cifar10_vit.yaml @@ -0,0 +1,13 @@ +_target_: src.models.cifar10_module.CIFAR10LitModule + +net: + _target_: src.models.components.vit.VisionTransformer + num_classes: 10 + num_heads: 8 + num_layers: 6 + num_channels: 3 + num_patches: 64 + patch_size: 4 + embed_dim: 256 + hidden_dim: 512 + dropout: 0.2 \ No newline at end of file diff --git a/pl-hydra/configs/optim/optim_adam.yaml b/pl-hydra/configs/optim/optim_adam.yaml new file mode 100644 index 0000000..212af78 --- /dev/null +++ b/pl-hydra/configs/optim/optim_adam.yaml @@ -0,0 +1,11 @@ +optimizer: + _target_: torch.optim.AdamW + lr: 1e-3 + weight_decay: 1e-4 + +use_lr_scheduler: True + +lr_scheduler: + _target_: torch.optim.lr_scheduler.MultiStepLR + milestones: [90,130] + gamma: 0.1 \ No newline at end of file diff --git a/pl-hydra/configs/optim/optim_adam_vit.yaml b/pl-hydra/configs/optim/optim_adam_vit.yaml new file mode 100644 index 0000000..2f3ab04 --- /dev/null +++ b/pl-hydra/configs/optim/optim_adam_vit.yaml @@ -0,0 +1,10 @@ +optimizer: + _target_: torch.optim.AdamW + lr: 3e-4 + +use_lr_scheduler: True + +lr_scheduler: + _target_: torch.optim.lr_scheduler.MultiStepLR + milestones: [90,130] + gamma: 0.1 \ No newline at end of file diff --git a/pl-hydra/configs/optim/optim_sgd.yaml b/pl-hydra/configs/optim/optim_sgd.yaml new file mode 100644 index 0000000..24a3ccf --- /dev/null +++ b/pl-hydra/configs/optim/optim_sgd.yaml @@ -0,0 +1,12 @@ +optimizer: + _target_: torch.optim.SGD + lr: 0.1 + weight_decay: 1e-4 + momentum: 0.9 + +use_lr_scheduler: True + +lr_scheduler: + _target_: torch.optim.lr_scheduler.MultiStepLR + milestones: [90,130] + gamma: 
0.1 diff --git a/pl-hydra/configs/test.yaml b/pl-hydra/configs/test.yaml new file mode 100644 index 0000000..1e10d5c --- /dev/null +++ b/pl-hydra/configs/test.yaml @@ -0,0 +1,32 @@ +# @package _global_ + +# specify here default evaluation configuration +defaults: + - _self_ + - datamodule: mnist.yaml # choose the datamodule for evaluation + - model: mnist.yaml + - callbacks: null + - logger: null + - trainer: default.yaml + - log_dir: evaluation.yaml + + - experiment: null + + # enable color logging + - override hydra/hydra_logging: colorlog + - override hydra/job_logging: colorlog + +original_work_dir: ${hydra:runtime.cwd} + +data_dir: ${original_work_dir}/data/ + +print_config: True + +ignore_warnings: True + +seed: null + +name: "default" + +# passing checkpoint path is necessary +ckpt_path: ??? diff --git a/pl-hydra/configs/train.yaml b/pl-hydra/configs/train.yaml new file mode 100644 index 0000000..1632f5c --- /dev/null +++ b/pl-hydra/configs/train.yaml @@ -0,0 +1,71 @@ +# @package _global_ + +# specify here default training configuration +defaults: + - _self_ + - datamodule: cifar10.yaml + # for resnet + - model : cifar10_resnet.yaml + - optim: optim_sgd.yaml + # # for googlenet + # - model : cifar10_googlenet.yaml + # - optim: optim_adam.yaml + # # for densenet + # - model : cifar10_densenet.yaml + # - optim: optim_adam.yaml + # for vgg11 + # - model : cifar10_vgg11.yaml + # - optim: optim_adam.yaml + # # for Vit + # - model : cifar10_vit.yaml + # - optim: optim_adam_vit.yaml + # - callbacks: default.yaml + - logger: tensorboard.yaml # set logger here or use command line (e.g. `python train.py logger=tensorboard`) + # - trainer: ddp.yaml + - trainer: default.yaml + - log_dir: default.yaml + # experiment configs allow for version control of specific configurations + # e.g. best hyperparameters for each combination of model and datamodule + - experiment: null + + # debugging config (enable through command line, e.g. 
`python train.py debug=default) + - debug: null + + # config for hyperparameter optimization + - hparams_search: null + + # optional local config for machine/user specific settings + # it's optional since it doesn't need to exist and is excluded from version control + - optional local: default.yaml + + # enable color logging + - override hydra/hydra_logging: colorlog + - override hydra/job_logging: colorlog + +# default name for the experiment, determines logging folder path +# (you can overwrite this name in experiment configs) +name: "resnet" + +# path to original working directory +# hydra hijacks working directory by changing it to the new log directory +# https://hydra.cc/docs/next/tutorials/basic/running_your_app/working_directory +original_work_dir: ${hydra:runtime.cwd} + +# path to folder with data +data_dir: ${original_work_dir}/../../data/ + +# pretty print config at the start of the run using Rich library +print_config: True + +# disable python warnings if they annoy you +ignore_warnings: True + +# set False to skip model training +train: True + +# evaluate on test set, using best model weights achieved during training +# lightning chooses best weights based on the metric specified in checkpoint callback +test: True + +# seed for random number generators in pytorch, numpy and python.random +seed: 100 \ No newline at end of file diff --git a/pl-hydra/configs/trainer/ddp.yaml b/pl-hydra/configs/trainer/ddp.yaml new file mode 100644 index 0000000..8a11249 --- /dev/null +++ b/pl-hydra/configs/trainer/ddp.yaml @@ -0,0 +1,6 @@ +defaults: + - default.yaml + +gpus: 4 +strategy: ddp +sync_batchnorm: True diff --git a/pl-hydra/configs/trainer/default.yaml b/pl-hydra/configs/trainer/default.yaml new file mode 100644 index 0000000..2a1ee74 --- /dev/null +++ b/pl-hydra/configs/trainer/default.yaml @@ -0,0 +1,12 @@ +_target_: pytorch_lightning.Trainer + +gpus: 1 + +min_epochs: 1 +max_epochs: 150 + +# number of validation steps to execute at the beginning of the training +# num_sanity_val_steps: 0 + +# ckpt path +resume_from_checkpoint: null diff --git a/pl-hydra/notebooks/.gitkeep b/pl-hydra/notebooks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/notebooks/make_vgg11.ipynb b/pl-hydra/notebooks/make_vgg11.ipynb new file mode 100644 index 0000000..3d43ffa --- /dev/null +++ b/pl-hydra/notebooks/make_vgg11.ipynb @@ -0,0 +1,196 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python38264bitpytorchcondad338a0d9609a4ea7a86cafca05238e80", + "display_name": "Python 3.8.2 64-bit ('PYTORCH': conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Standard libraries\n", + "import os\n", + "\n", + "## PyTorch\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.utils.data as data\n", + "import torch.optim as optim\n", + "from types import SimpleNamespace\n", + "\n", + "import math\n", + "import torch.nn.init as init" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Files already downloaded and verified\n" + } + ], + "source": [ + "## 
classes\n", + "name_classes = ['Airplane', 'Automobile', 'Bird', 'Cat', 'Deer', 'Dog', 'Frog', 'Horse', 'Ship', 'Truck']\n", + "\n", + "# Act fns\n", + "act_fn_by_name = {\n", + " \"tanh\": nn.Tanh,\n", + " \"relu\": nn.ReLU,\n", + " \"leakyrelu\": nn.LeakyReLU,\n", + " \"gelu\": nn.GELU\n", + "}\n", + "# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)\n", + "DATASET_PATH = \"../data\"\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "## VGG network\n", + "class CnnBlock(nn.Module):\n", + "\n", + " def __init__(self, c_in, c_out, act_fn):\n", + " \"\"\"\n", + " Inputs:\n", + " c_in - Number of input feature maps from the previous layers\n", + " c_out - Number of output feature maps\n", + " act_fn - Activation class constructor (e.g. nn.ReLU)\n", + " \"\"\"\n", + " super().__init__()\n", + "\n", + " self.conv = nn.Sequential(\n", + " nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),\n", + " nn.BatchNorm2d(c_out),\n", + " act_fn()\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.conv(x)\n", + "\n", + "class VGG11(nn.Module):\n", + " '''\n", + " VGG model \n", + " '''\n", + " def __init__(self,\n", + " num_classes: int=10,\n", + " act_fn_name = \"relu\",\n", + " **kwargs):\n", + " super().__init__()\n", + " self.hparams = SimpleNamespace(num_classes=num_classes,\n", + " act_fn_name=act_fn_name)\n", + " #print(self.hparams)\n", + " self._create_network()\n", + " self._init_params()\n", + "\n", + "\n", + " def _create_network(self):\n", + " \n", + " # Creating the features map\n", + " self.vgg_blocks = nn.Sequential(\n", + " CnnBlock(3, 64, act_fn=act_fn_by_name[self.hparams.act_fn_name]),\n", + " nn.MaxPool2d(kernel_size=2, stride=2),\n", + " CnnBlock(64, 128, act_fn=act_fn_by_name[self.hparams.act_fn_name]),\n", + " nn.MaxPool2d(kernel_size=2, stride=2),\n", + " CnnBlock(128, 256, act_fn=act_fn_by_name[self.hparams.act_fn_name]),\n", + " #CnnBlock(256, 256, act_fn=act_fn_by_name[self.hparams.act_fn_name]),\n", + " nn.MaxPool2d(kernel_size=2, stride=2),\n", + " CnnBlock(256, 512, act_fn=act_fn_by_name[self.hparams.act_fn_name]),\n", + " #CnnBlock(512, 512, act_fn=act_fn_by_name[self.hparams.act_fn_name]),\n", + " nn.MaxPool2d(kernel_size=2, stride=2),\n", + " CnnBlock(512, 512, act_fn=act_fn_by_name[self.hparams.act_fn_name]),\n", + " #CnnBlock(512, 512, act_fn=act_fn_by_name[self.hparams.act_fn_name]),\n", + " nn.MaxPool2d(kernel_size=2, stride=2),\n", + " )\n", + " \n", + " # Mapping to classification output\n", + " self.output_net = nn.Sequential(nn.Flatten(),\n", + " nn.Linear(512, 512),\n", + " act_fn_by_name[self.hparams.act_fn_name](),\n", + " nn.Linear(512, 512),\n", + " act_fn_by_name[self.hparams.act_fn_name](),\n", + " nn.Linear(512, self.hparams.num_classes),\n", + " )\n", + "\n", + " def _init_params(self):\n", + " for m in self.modules():\n", + " if isinstance(m, nn.Conv2d):\n", + " nn.init.kaiming_normal_(\n", + " m.weight, \n", + " nonlinearity=self.hparams.act_fn_name)\n", + " elif isinstance(m, nn.BatchNorm2d):\n", + " nn.init.constant_(m.weight, 1)\n", + " nn.init.constant_(m.bias, 0)\n", + "\n", + "\n", + " def forward(self, x):\n", + " x = self.vgg_blocks(x)\n", + " x = self.output_net(x)\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # PyTorch v0.4.0\n", + "model = VGG11().to(device)" + ] + }, 
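The next cell prints a layer-by-layer summary via `summary(model, (3, 32, 32))`, but no matching import appears in the cells shown above; presumably it comes from the third-party torchsummary package. A hedged equivalent, meant to run after the cells above and assuming torchsummary is installed:

# Assumes the third-party `torchsummary` package; the import is missing from the notebook as shown.
from torchsummary import summary

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VGG11().to(device)
summary(model, input_size=(3, 32, 32))  # prints the per-layer table shown in the next cell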
+ { + "cell_type": "code", + "execution_count": 144, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "----------------------------------------------------------------\n Layer (type) Output Shape Param #\n================================================================\n Conv2d-1 [-1, 64, 32, 32] 1,792\n BatchNorm2d-2 [-1, 64, 32, 32] 128\n ReLU-3 [-1, 64, 32, 32] 0\n CnnBlock-4 [-1, 64, 32, 32] 0\n MaxPool2d-5 [-1, 64, 16, 16] 0\n Conv2d-6 [-1, 128, 16, 16] 73,856\n BatchNorm2d-7 [-1, 128, 16, 16] 256\n ReLU-8 [-1, 128, 16, 16] 0\n CnnBlock-9 [-1, 128, 16, 16] 0\n MaxPool2d-10 [-1, 128, 8, 8] 0\n Conv2d-11 [-1, 256, 8, 8] 295,168\n BatchNorm2d-12 [-1, 256, 8, 8] 512\n ReLU-13 [-1, 256, 8, 8] 0\n CnnBlock-14 [-1, 256, 8, 8] 0\n MaxPool2d-15 [-1, 256, 4, 4] 0\n Conv2d-16 [-1, 512, 4, 4] 1,180,160\n BatchNorm2d-17 [-1, 512, 4, 4] 1,024\n ReLU-18 [-1, 512, 4, 4] 0\n CnnBlock-19 [-1, 512, 4, 4] 0\n MaxPool2d-20 [-1, 512, 2, 2] 0\n Conv2d-21 [-1, 512, 2, 2] 2,359,808\n BatchNorm2d-22 [-1, 512, 2, 2] 1,024\n ReLU-23 [-1, 512, 2, 2] 0\n CnnBlock-24 [-1, 512, 2, 2] 0\n MaxPool2d-25 [-1, 512, 1, 1] 0\n Flatten-26 [-1, 512] 0\n Linear-27 [-1, 512] 262,656\n ReLU-28 [-1, 512] 0\n Linear-29 [-1, 512] 262,656\n ReLU-30 [-1, 512] 0\n Linear-31 [-1, 10] 5,130\n================================================================\nTotal params: 4,444,170\nTrainable params: 4,444,170\nNon-trainable params: 0\n----------------------------------------------------------------\nInput size (MB): 0.01\nForward/backward pass size (MB): 4.07\nParams size (MB): 16.95\nEstimated Total Size (MB): 21.04\n----------------------------------------------------------------\n" + } + ], + "source": [ + "summary(model, (3, 32, 32))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/pl-hydra/scripts/schedule.sh b/pl-hydra/scripts/schedule.sh new file mode 100644 index 0000000..a3fbaaa --- /dev/null +++ b/pl-hydra/scripts/schedule.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Shedule execution of many runs +# Run from root folder with: bash scripts/schedule.sh + +python train.py trainer.max_epochs=5 + +python train.py trainer.max_epochs=10 logger=csv diff --git a/pl-hydra/setup.cfg b/pl-hydra/setup.cfg new file mode 100644 index 0000000..a205e67 --- /dev/null +++ b/pl-hydra/setup.cfg @@ -0,0 +1,36 @@ +[isort] +line_length = 99 +profile = black +filter_files = True + + +[flake8] +max_line_length = 99 +show_source = True +format = pylint +ignore = + F401 # Module imported but unused + W504 # Line break occurred after a binary operator + F841 # Local variable name is assigned to but never used + E501 # Line too long +exclude = + .git + __pycache__ + data/* + tests/* + notebooks/* + logs/* + + +[tool:pytest] +testpaths = tests/ +log_cli = True +markers = + slow +addopts = + --durations=0 + --strict-markers + --doctest-modules +filterwarnings = + ignore::DeprecationWarning + ignore::UserWarning diff --git a/pl-hydra/src/__init__.py b/pl-hydra/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/src/datamodules/__init__.py b/pl-hydra/src/datamodules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/src/datamodules/cifar10_datamodule.py b/pl-hydra/src/datamodules/cifar10_datamodule.py new file mode 100644 index 0000000..6da4c37 --- /dev/null +++ b/pl-hydra/src/datamodules/cifar10_datamodule.py @@ -0,0 +1,128 @@ +from 
typing import Optional, Tuple
+
+import torch
+from pytorch_lightning import LightningDataModule
+from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split
+from torchvision.datasets import CIFAR10
+from torchvision.transforms import transforms
+
+
+class CIFAR10DataModule(LightningDataModule):
+    """Example of LightningDataModule for CIFAR10 dataset.
+
+    A DataModule implements 5 key methods:
+        - prepare_data (things to do on 1 GPU/TPU, not on every GPU/TPU in distributed mode)
+        - setup (things to do on every accelerator in distributed mode)
+        - train_dataloader (the training dataloader)
+        - val_dataloader (the validation dataloader(s))
+        - test_dataloader (the test dataloader(s))
+
+    This allows you to share a full dataset without explaining how to download,
+    split, transform and process the data.
+
+    Read the docs:
+        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
+    """
+
+    def __init__(
+        self,
+        data_dir: str = "data/",
+        train_val_split: Tuple[int, int, int] = (55_000, 5_000, 10_000),
+        batch_size: int = 128,
+        num_workers: int = 4,
+        pin_memory: bool = True,
+        data_mean: Tuple[float, float, float] = (0.0, 0.0, 0.0),
+        data_std: Tuple[float, float, float] = (0.0, 0.0, 0.0),
+        image_size: Tuple[int, int] = (32, 32),
+        scale_bounds: Tuple[float, float] = (0.8, 1.0),
+        aspect_bounds: Tuple[float, float] = (0.9, 1.1)
+    ):
+        super().__init__()
+
+        # this line allows to access init params with 'self.hparams' attribute
+        self.save_hyperparameters(logger=False)
+
+        # data transformations
+        self.train_transforms = transforms.Compose([
+            transforms.RandomHorizontalFlip(),
+            transforms.RandomResizedCrop(image_size, scale=scale_bounds, ratio=aspect_bounds),
+            transforms.ToTensor(),
+            transforms.Normalize(data_mean, data_std)
+        ])
+
+        self.test_transforms = transforms.Compose([transforms.ToTensor(),
+                                                   transforms.Normalize(data_mean, data_std)
+                                                   ])
+
+        self.data_train: Optional[Dataset] = None
+        self.data_val: Optional[Dataset] = None
+        self.data_test: Optional[Dataset] = None
+
+    @property
+    def num_classes(self) -> int:
+        return 10
+
+    def prepare_data(self):
+        """Download data if needed.
+
+        This method is called only from a single GPU.
+        Do not use it to assign state (self.x = y).
+        """
+        CIFAR10(self.hparams.data_dir, train=True, download=True)
+        CIFAR10(self.hparams.data_dir, train=False, download=True)
+
+
+    def setup(self, stage: Optional[str] = None):
+        """Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
+
+        This method is called by lightning when doing `trainer.fit()` and `trainer.test()`,
+        so be careful not to execute the random split twice! The `stage` can be used to
+        differentiate whether it's called before `trainer.fit()` or `trainer.test()`.
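As a usage aside (not part of the patch), a minimal sketch of driving this datamodule directly, without Hydra; the constructor values mirror configs/datamodule/cifar10.yaml, and the printed shapes assume the CIFAR10 download succeeds:

# Standalone sanity check; values mirror configs/datamodule/cifar10.yaml.
from src.datamodules.cifar10_datamodule import CIFAR10DataModule

dm = CIFAR10DataModule(
    data_dir="data/",
    batch_size=128,
    num_workers=4,
    data_mean=(0.49421428, 0.48513139, 0.45040909),
    data_std=(0.24665252, 0.24289226, 0.26159238),
)
dm.prepare_data()                  # downloads CIFAR10 into data_dir if needed
dm.setup()                         # assigns data_train / data_val / data_test
images, labels = next(iter(dm.train_dataloader()))
print(images.shape, labels.shape)  # torch.Size([128, 3, 32, 32]) torch.Size([128])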
+ """ + + # load datasets only if they're not loaded already + if not self.data_train and not self.data_val and not self.data_test: + trainset = CIFAR10(self.hparams.data_dir, + train=True, + transform=self.train_transforms) + valset = CIFAR10(self.hparams.data_dir, + train=False, + transform=self.test_transforms) + testset = CIFAR10(self.hparams.data_dir, + train=False, + transform=self.test_transforms) + + self.data_train = trainset + self.data_val = valset + self.data_test = testset + + #print(type(self.data_train.dataset)) + + def train_dataloader(self): + return DataLoader( + dataset=self.data_train, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + pin_memory=self.hparams.pin_memory, + shuffle=True, + drop_last=True + ) + + def val_dataloader(self): + return DataLoader( + dataset=self.data_val, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + shuffle=False, + drop_last=False + ) + + + def test_dataloader(self): + return DataLoader( + dataset=self.data_test, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + shuffle=False, + drop_last=False + ) diff --git a/pl-hydra/src/datamodules/components/__init__.py b/pl-hydra/src/datamodules/components/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/src/datamodules/mnist_datamodule.py b/pl-hydra/src/datamodules/mnist_datamodule.py new file mode 100644 index 0000000..377c537 --- /dev/null +++ b/pl-hydra/src/datamodules/mnist_datamodule.py @@ -0,0 +1,106 @@ +from typing import Optional, Tuple + +import torch +from pytorch_lightning import LightningDataModule +from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split +from torchvision.datasets import MNIST +from torchvision.transforms import transforms + + +class MNISTDataModule(LightningDataModule): + """Example of LightningDataModule for MNIST dataset. + + A DataModule implements 5 key methods: + - prepare_data (things to do on 1 GPU/TPU, not on every GPU/TPU in distributed mode) + - setup (things to do on every accelerator in distributed mode) + - train_dataloader (the training dataloader) + - val_dataloader (the validation dataloader(s)) + - test_dataloader (the test dataloader(s)) + + This allows you to share a full dataset without explaining how to download, + split, transform and process the data. + + Read the docs: + https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html + """ + + def __init__( + self, + data_dir: str = "data/", + train_val_test_split: Tuple[int, int, int] = (55_000, 5_000, 10_000), + batch_size: int = 64, + num_workers: int = 0, + pin_memory: bool = False, + ): + super().__init__() + + # this line allows to access init params with 'self.hparams' attribute + self.save_hyperparameters(logger=False) + + # data transformations + self.transforms = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + + self.data_train: Optional[Dataset] = None + self.data_val: Optional[Dataset] = None + self.data_test: Optional[Dataset] = None + + @property + def num_classes(self) -> int: + return 10 + + def prepare_data(self): + """Download data if needed. + + This method is called only from a single GPU. + Do not use it to assign state (self.x = y). + """ + MNIST(self.hparams.data_dir, train=True, download=True) + MNIST(self.hparams.data_dir, train=False, download=True) + + def setup(self, stage: Optional[str] = None): + """Load data. 
Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
+
+        This method is called by lightning when doing `trainer.fit()` and `trainer.test()`,
+        so be careful not to execute the random split twice! The `stage` can be used to
+        differentiate whether it's called before `trainer.fit()` or `trainer.test()`.
+        """
+
+        # load datasets only if they're not loaded already
+        if not self.data_train and not self.data_val and not self.data_test:
+            trainset = MNIST(self.hparams.data_dir, train=True, transform=self.transforms)
+            testset = MNIST(self.hparams.data_dir, train=False, transform=self.transforms)
+            dataset = ConcatDataset(datasets=[trainset, testset])
+            self.data_train, self.data_val, self.data_test = random_split(
+                dataset=dataset,
+                lengths=self.hparams.train_val_test_split,
+                generator=torch.Generator().manual_seed(42),
+            )
+
+    def train_dataloader(self):
+        return DataLoader(
+            dataset=self.data_train,
+            batch_size=self.hparams.batch_size,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            shuffle=True,
+        )
+
+    def val_dataloader(self):
+        return DataLoader(
+            dataset=self.data_val,
+            batch_size=self.hparams.batch_size,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            shuffle=False,
+        )
+
+    def test_dataloader(self):
+        return DataLoader(
+            dataset=self.data_test,
+            batch_size=self.hparams.batch_size,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            shuffle=False,
+        )
diff --git a/pl-hydra/src/models/__init__.py b/pl-hydra/src/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pl-hydra/src/models/cifar10_module.py b/pl-hydra/src/models/cifar10_module.py
new file mode 100644
index 0000000..d45fc25
--- /dev/null
+++ b/pl-hydra/src/models/cifar10_module.py
@@ -0,0 +1,198 @@
+from typing import Any, List
+
+import torch
+from pytorch_lightning import LightningModule
+from torchmetrics import MaxMetric
+from torchmetrics.classification.accuracy import Accuracy
+from torchmetrics.functional import confusion_matrix
+
+from src.models.components.resnet import ResNet
+#import torch.optim as optim
+import torch.nn as nn
+
+import seaborn as sns
+import pandas as pd
+import matplotlib.pylab as plt
+import numpy as np
+from src.utils.plotter import plot_cm, plot_preds
+
+import hydra
+
+## classes
+name_classes = ['Airplane', 'Automobile', 'Bird', 'Cat', 'Deer', 'Dog', 'Frog', 'Horse', 'Ship', 'Truck']
+num_classes = len(name_classes)
+data_mean = [0.49421428, 0.48513139, 0.45040909]
+data_std = [0.24665252, 0.24289226, 0.26159238]
+
+class CIFAR10LitModule(LightningModule):
+    """Example of LightningModule for CIFAR10 classification.
+
+    A LightningModule organizes your PyTorch code into 5 sections:
+        - Computations (init).
+ - Train loop (training_step) + - Validation loop (validation_step) + - Test loop (test_step) + - Optimizers (configure_optimizers) + + Read the docs: + https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html + """ + + def __init__(self,*args,**kwargs): + super().__init__() + + # this line allows to access init params with 'self.hparams' attribute + # it also ensures init params will be stored in ckpt + print('here') + + self.save_hyperparameters() + # network + self.net = hydra.utils.instantiate(self.hparams['net']) + + # loss function + self.criterion = nn.CrossEntropyLoss() + + # Example input for visualizing the graph in Tensorboard + self.example_input_array = torch.zeros((1, 3, 32, 32), dtype=torch.float32) + + # use separate metric instance for train, val and test step + # to ensure a proper reduction over the epoch + self.train_acc = Accuracy() + self.val_acc = Accuracy() + self.test_acc = Accuracy() + + # for logging best so far validation accuracy + self.val_acc_best = MaxMetric() + + def forward(self, x: torch.Tensor): + return self.net(x) + + def step(self, batch: Any): + x, y = batch + logits = self.forward(x) + loss = self.criterion(logits, y) + preds = torch.argmax(logits, dim=1) + return loss, preds, y + + def training_step(self, batch: Any, batch_idx: int): + loss, preds, targets = self.step(batch) + + # log train metrics + acc = self.train_acc(preds, targets) + self.log("train/loss", loss, on_step=False, on_epoch=True, prog_bar=False) + self.log("train/acc", acc, on_step=False, on_epoch=True, prog_bar=True) + + # we can return here dict with any tensors + # and then read it in some callback or in `training_epoch_end()`` below + # remember to always return loss from `training_step()` or else backpropagation will fail! 
+ return {"loss": loss, "preds": preds, "targets": targets} + + def training_epoch_end(self, outputs: List[Any]): + # `outputs` is a list of dicts returned from `training_step()` + # plot the confusion matrix at the end of each epoch + preds = torch.cat([tmp['preds'] for tmp in outputs]) + targets = torch.cat([tmp['targets'] for tmp in outputs]) + + # plot confusion matrix + cm = confusion_matrix(targets, preds, num_classes) + fig_ = plot_cm(cm, name_classes) + plt.close(fig_) + self.logger.experiment.add_figure("confusion_matrix_train", fig_, self.current_epoch) + + def validation_step(self, batch: Any, batch_idx: int): + loss, preds, targets = self.step(batch) + + # plot figures + if batch_idx == 0: + images, _ = batch + fig_ = plot_preds(images.cpu().numpy(), + targets.cpu().numpy(), + preds.cpu().numpy(), + name_classes, + nimg=32, + ncols=8, + data_mean=data_mean, + data_std=data_std) + self.logger.experiment.add_figure( + "examples_val_batch_idx_" + str(batch_idx), + fig_, + self.current_epoch) + + # log val metrics + acc = self.val_acc(preds, targets) + self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=False) + self.log("val/acc", acc, on_step=False, on_epoch=True, prog_bar=True) + #print(preds) + return {"loss": loss, "preds": preds, "targets": targets} + + def validation_epoch_end(self, outputs: List[Any]): + acc = self.val_acc.compute() # get val accuracy from current epoch + self.val_acc_best.update(acc) + self.log("val/acc_best", self.val_acc_best.compute(), on_epoch=True, prog_bar=True) + + # plot the confusion matrix at the end of each epoch + preds = torch.cat([tmp['preds'] for tmp in outputs]) + targets = torch.cat([tmp['targets'] for tmp in outputs]) + cm = confusion_matrix(targets, preds, num_classes) + fig_ = plot_cm(cm, name_classes) + plt.close(fig_) + self.logger.experiment.add_figure("confusion_matrix_val", fig_, self.current_epoch) + + + def test_step(self, batch: Any, batch_idx: int): + loss, preds, targets = self.step(batch) + + # plot figures + if batch_idx == 0: + images, _ = batch + fig_ = plot_preds(images.cpu().numpy(), + targets.cpu().numpy(), + preds.cpu().numpy(), + name_classes, + nimg=32, + ncols=8, + data_mean=data_mean, + data_std=data_std) + self.logger.experiment.add_figure( + "examples_test_batch_idx_" + str(batch_idx), + fig_, + self.current_epoch) + + # log test metrics + acc = self.test_acc(preds, targets) + self.log("test/loss", loss, on_step=False, on_epoch=True) + self.log("test/acc", acc, on_step=False, on_epoch=True) + + return {"loss": loss, "preds": preds, "targets": targets} + + def test_epoch_end(self, outputs: List[Any]): + # plot the confusion matrix at the end of each epoch + preds = torch.cat([tmp['preds'] for tmp in outputs]) + targets = torch.cat([tmp['targets'] for tmp in outputs]) + cm = confusion_matrix(targets, preds, num_classes) + fig_ = plot_cm(cm, name_classes) + plt.close(fig_) + + self.logger.experiment.add_figure("confusion_matrix_test", fig_, self.current_epoch) + + def on_epoch_end(self): + # reset metrics at the end of every epoch + self.train_acc.reset() + self.test_acc.reset() + self.val_acc.reset() + + + def configure_optimizers(self): + optimizer=hydra.utils.instantiate( + self.hparams.optim["optimizer"], + params=self.net.parameters() + ) + + if(self.hparams.optim['use_lr_scheduler']==True): + scheduler=hydra.utils.instantiate( + self.hparams.optim['lr_scheduler'], + optimizer=optimizer + ) + return [optimizer],[scheduler] + else: + return optimizer \ No newline at end of file diff --git 
a/pl-hydra/src/models/components/__init__.py b/pl-hydra/src/models/components/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/src/models/components/densenet.py b/pl-hydra/src/models/components/densenet.py new file mode 100644 index 0000000..43b3670 --- /dev/null +++ b/pl-hydra/src/models/components/densenet.py @@ -0,0 +1,153 @@ +''' +Modified from https://github.com/phlippe/uvadlc_notebooks.git +''' + +## Standard libraries +import os + +## PyTorch +import torch +import torch.nn as nn +import torch.utils.data as data +import torch.optim as optim +from types import SimpleNamespace + + +act_fn_by_name = { + "tanh": nn.Tanh, + "relu": nn.ReLU, + "leakyrelu": nn.LeakyReLU, + "gelu": nn.GELU +} + +class DenseLayer(nn.Module): + + def __init__(self, c_in, bn_size, growth_rate, act_fn): + """ + Inputs: + c_in - Number of input channels + bn_size - Bottleneck size (factor of growth rate) for the output of the 1x1 convolution. Typically between 2 and 4. + growth_rate - Number of output channels of the 3x3 convolution + act_fn - Activation class constructor (e.g. nn.ReLU) + """ + super().__init__() + self.net = nn.Sequential( + nn.BatchNorm2d(c_in), + act_fn(), + nn.Conv2d(c_in, bn_size * growth_rate, kernel_size=1, bias=False), + nn.BatchNorm2d(bn_size * growth_rate), + act_fn(), + nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False) + ) + + def forward(self, x): + out = self.net(x) + out = torch.cat([out, x], dim=1) + return out + +# Block +class DenseBlock(nn.Module): + + def __init__(self, c_in, num_layers, bn_size, growth_rate, act_fn): + """ + Inputs: + c_in - Number of input channels + num_layers - Number of dense layers to apply in the block + bn_size - Bottleneck size to use in the dense layers + growth_rate - Growth rate to use in the dense layers + act_fn - Activation function to use in the dense layers + """ + super().__init__() + layers = [] + for layer_idx in range(num_layers): + layers.append( + DenseLayer(c_in=c_in + layer_idx * growth_rate, # Input channels are original plus the feature maps from previous layers + bn_size=bn_size, + growth_rate=growth_rate, + act_fn=act_fn) + ) + self.block = nn.Sequential(*layers) + + def forward(self, x): + out = self.block(x) + return out + +class TransitionLayer(nn.Module): + + def __init__(self, c_in, c_out, act_fn): + super().__init__() + self.transition = nn.Sequential( + nn.BatchNorm2d(c_in), + act_fn(), + nn.Conv2d(c_in, c_out, kernel_size=1, bias=False), + nn.AvgPool2d(kernel_size=2, stride=2) # Average the output for each 2x2 pixel group + ) + + def forward(self, x): + return self.transition(x) + +class DenseNet(nn.Module): + + def __init__(self, num_classes=10, num_layers=[6,6,6,6], bn_size=2, growth_rate=16, act_fn_name="relu", **kwargs): + super().__init__() + self.hparams = SimpleNamespace(num_classes=num_classes, + num_layers=num_layers, + bn_size=bn_size, + growth_rate=growth_rate, + act_fn_name=act_fn_name, + ) + self._create_network() + self._init_params() + + def _create_network(self): + c_hidden = self.hparams.growth_rate * self.hparams.bn_size # The start number of hidden channels + + # A first convolution on the original image to scale up the channel size + self.input_net = nn.Sequential( + nn.Conv2d(3, c_hidden, kernel_size=3, padding=1) # No batch norm or activation function as done inside the Dense layers + ) + + # Creating the dense blocks, eventually including transition layers + blocks = [] + for block_idx, num_layers in enumerate(self.hparams.num_layers): + 
blocks.append( + DenseBlock(c_in=c_hidden, + num_layers=num_layers, + bn_size=self.hparams.bn_size, + growth_rate=self.hparams.growth_rate, + act_fn=act_fn_by_name[self.hparams.act_fn_name]) + ) + c_hidden = c_hidden + num_layers * self.hparams.growth_rate # Overall output of the dense block + if block_idx < len(self.hparams.num_layers)-1: # Don't apply transition layer on last block + blocks.append( + TransitionLayer(c_in=c_hidden, + c_out=c_hidden // 2, + act_fn=act_fn_by_name[self.hparams.act_fn_name])) + c_hidden = c_hidden // 2 + + self.blocks = nn.Sequential(*blocks) + + # Mapping to classification output + self.output_net = nn.Sequential( + nn.BatchNorm2d(c_hidden), # The features have not passed a non-linearity until here. + act_fn_by_name[self.hparams.act_fn_name](), + nn.AdaptiveAvgPool2d((1,1)), + nn.Flatten(), + nn.Linear(c_hidden, self.hparams.num_classes) + ) + + def _init_params(self): + # Based on our discussion in Tutorial 4, we should initialize the convolutions according to the activation function + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, nonlinearity=self.hparams.act_fn_name) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.input_net(x) + x = self.blocks(x) + x = self.output_net(x) + return x + diff --git a/pl-hydra/src/models/components/googlenet.py b/pl-hydra/src/models/components/googlenet.py new file mode 100644 index 0000000..0745736 --- /dev/null +++ b/pl-hydra/src/models/components/googlenet.py @@ -0,0 +1,132 @@ +''' +Modified from https://github.com/phlippe/uvadlc_notebooks.git +''' + +## Standard libraries +import os + +## PyTorch +import torch +import torch.nn as nn +import torch.utils.data as data +import torch.optim as optim +from types import SimpleNamespace + + +act_fn_by_name = { + "tanh": nn.Tanh, + "relu": nn.ReLU, + "leakyrelu": nn.LeakyReLU, + "gelu": nn.GELU +} + + +# Block +class InceptionBlock(nn.Module): + + def __init__(self, c_in, c_red : dict, c_out : dict, act_fn): + """ + Inputs: + c_in - Number of input feature maps from the previous layers + c_red - Dictionary with keys "3x3" and "5x5" specifying the output of the dimensionality reducing 1x1 convolutions + c_out - Dictionary with keys "1x1", "3x3", "5x5", and "max" + act_fn - Activation class constructor (e.g. 
nn.ReLU) + """ + super().__init__() + + # 1x1 convolution branch + self.conv_1x1 = nn.Sequential( + nn.Conv2d(c_in, c_out["1x1"], kernel_size=1), + nn.BatchNorm2d(c_out["1x1"]), + act_fn() + ) + + # 3x3 convolution branch + self.conv_3x3 = nn.Sequential( + nn.Conv2d(c_in, c_red["3x3"], kernel_size=1), + nn.BatchNorm2d(c_red["3x3"]), + act_fn(), + nn.Conv2d(c_red["3x3"], c_out["3x3"], kernel_size=3, padding=1), + nn.BatchNorm2d(c_out["3x3"]), + act_fn() + ) + + # 5x5 convolution branch + self.conv_5x5 = nn.Sequential( + nn.Conv2d(c_in, c_red["5x5"], kernel_size=1), + nn.BatchNorm2d(c_red["5x5"]), + act_fn(), + nn.Conv2d(c_red["5x5"], c_out["5x5"], kernel_size=5, padding=2), + nn.BatchNorm2d(c_out["5x5"]), + act_fn() + ) + + # Max-pool branch + self.max_pool = nn.Sequential( + nn.MaxPool2d(kernel_size=3, padding=1, stride=1), + nn.Conv2d(c_in, c_out["max"], kernel_size=1), + nn.BatchNorm2d(c_out["max"]), + act_fn() + ) + + def forward(self, x): + x_1x1 = self.conv_1x1(x) + x_3x3 = self.conv_3x3(x) + x_5x5 = self.conv_5x5(x) + x_max = self.max_pool(x) + x_out = torch.cat([x_1x1, x_3x3, x_5x5, x_max], dim=1) + return x_out + + +class GoogleNet(nn.Module): + + def __init__(self, num_classes=10, act_fn_name="relu", **kwargs): + super().__init__() + self.hparams = SimpleNamespace(num_classes=num_classes, + act_fn_name=act_fn_name,) + self._create_network() + self._init_params() + + def _create_network(self): + # A first convolution on the original image to scale up the channel size + self.input_net = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, padding=1), + nn.BatchNorm2d(64), + act_fn_by_name[self.hparams.act_fn_name]() + ) + # Stacking inception blocks + self.inception_blocks = nn.Sequential( + InceptionBlock(64, c_red={"3x3": 32, "5x5": 16}, c_out={"1x1": 16, "3x3": 32, "5x5": 8, "max": 8}, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + InceptionBlock(64, c_red={"3x3": 32, "5x5": 16}, c_out={"1x1": 24, "3x3": 48, "5x5": 12, "max": 12}, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + nn.MaxPool2d(3, stride=2, padding=1), # 32x32 => 16x16 + InceptionBlock(96, c_red={"3x3": 32, "5x5": 16}, c_out={"1x1": 24, "3x3": 48, "5x5": 12, "max": 12}, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + InceptionBlock(96, c_red={"3x3": 32, "5x5": 16}, c_out={"1x1": 16, "3x3": 48, "5x5": 16, "max": 16}, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + InceptionBlock(96, c_red={"3x3": 32, "5x5": 16}, c_out={"1x1": 16, "3x3": 48, "5x5": 16, "max": 16}, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + InceptionBlock(96, c_red={"3x3": 32, "5x5": 16}, c_out={"1x1": 32, "3x3": 48, "5x5": 24, "max": 24}, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + nn.MaxPool2d(3, stride=2, padding=1), # 16x16 => 8x8 + InceptionBlock(128, c_red={"3x3": 48, "5x5": 16}, c_out={"1x1": 32, "3x3": 64, "5x5": 16, "max": 16}, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + InceptionBlock(128, c_red={"3x3": 48, "5x5": 16}, c_out={"1x1": 32, "3x3": 64, "5x5": 16, "max": 16}, act_fn=act_fn_by_name[self.hparams.act_fn_name]) + ) + # Mapping to classification output + self.output_net = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + nn.Flatten(), + nn.Linear(128, self.hparams.num_classes) + ) + + def _init_params(self): + # Based on our discussion in Tutorial 4, we should initialize the convolutions according to the activation function + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, + nonlinearity=self.hparams.act_fn_name) + elif isinstance(m, nn.BatchNorm2d): + 
nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.input_net(x) + x = self.inception_blocks(x) + x = self.output_net(x) + return x diff --git a/pl-hydra/src/models/components/resnet.py b/pl-hydra/src/models/components/resnet.py new file mode 100644 index 0000000..5004e2a --- /dev/null +++ b/pl-hydra/src/models/components/resnet.py @@ -0,0 +1,135 @@ +''' +Modified from https://github.com/phlippe/uvadlc_notebooks.git +''' + +## Standard libraries +import os + +## PyTorch +import torch +import torch.nn as nn +import torch.utils.data as data +import torch.optim as optim +from types import SimpleNamespace + + +act_fn_by_name = { + "tanh": nn.Tanh, + "relu": nn.ReLU, + "leakyrelu": nn.LeakyReLU, + "gelu": nn.GELU +} + + +# Block +class ResNetBlock(nn.Module): + + def __init__(self, c_in, act_fn, subsample=False, c_out=-1): + """ + Inputs: + c_in - Number of input features + act_fn - Activation class constructor (e.g. nn.ReLU) + subsample - If True, we want to apply a stride inside the block and reduce the output shape by 2 in height and width + c_out - Number of output features. Note that this is only relevant if subsample is True, as otherwise, c_out = c_in + """ + super().__init__() + if not subsample: + c_out = c_in + + # Network representing F + self.net = nn.Sequential( + nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, stride=1 if not subsample else 2, bias=False), # No bias needed as the Batch Norm handles it + nn.BatchNorm2d(c_out), + act_fn(), + nn.Conv2d(c_out, c_out, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(c_out) + ) + + # 1x1 convolution with stride 2 means we take the upper left value, and transform it to new output size + self.downsample = nn.Conv2d(c_in, c_out, kernel_size=1, stride=2) if subsample else None + self.act_fn = act_fn() + #print(self.act_fn) + + def forward(self, x): + z = self.net(x) + if self.downsample is not None: + x = self.downsample(x) + out = z + x + out = self.act_fn(out) + return out + + +# Residual Network +class ResNet(nn.Module): + + def __init__(self, + num_classes: int=10, + num_blocks: list=[3,3,3], + c_hidden: list=[16,32,64], + act_fn_name = "relu", + **kwargs): + """ + Inputs: + num_classes - Number of classification outputs (10 for CIFAR10) + num_blocks - List with the number of ResNet blocks to use. The first block of each group uses downsampling, except the first. + c_hidden - List with the hidden dimensionalities in the different blocks. Usually multiplied by 2 the deeper we go. + act_fn_name - Name of the activation function to use, looked up in "act_fn_by_name" + block_name - Name of the ResNet block, looked up in "resnet_blocks_by_name" + """ + super().__init__() + self.hparams = SimpleNamespace(num_classes=num_classes, + c_hidden=c_hidden, + num_blocks=num_blocks, + act_fn_name=act_fn_name) + #print(self.hparams) + self._create_network() + self._init_params() + + def _create_network(self): + c_hidden = self.hparams.c_hidden + + # A first convolution on the original image to scale up the channel size + self.input_net = nn.Sequential( + nn.Conv2d(3, c_hidden[0], kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(c_hidden[0]), + act_fn_by_name[self.hparams.act_fn_name]() + ) + #print(self.input_net) + + # Creating the ResNet blocks + blocks = [] + for block_idx, block_count in enumerate(self.hparams.num_blocks): + for bc in range(block_count): + subsample = (bc == 0 and block_idx > 0) # Subsample the first block of each group, except the very first one. 
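+ # a subsampling block still receives the previous group's channel count, hence block_idx-1 below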
+ blocks.append( + ResNetBlock(c_in=c_hidden[block_idx if not subsample else (block_idx-1)], + act_fn=act_fn_by_name[self.hparams.act_fn_name], + subsample=subsample, + c_out=c_hidden[block_idx]) + ) + self.blocks = nn.Sequential(*blocks) + + # Mapping to classification output + self.output_net = nn.Sequential( + nn.AdaptiveAvgPool2d((1,1)), + nn.Flatten(), + nn.Linear(c_hidden[-1], self.hparams.num_classes) + ) + + def _init_params(self): + # Based on our discussion in Tutorial 4, we should initialize the convolutions according to the activation function + # Fan-out focuses on the gradient distribution, and is commonly used in ResNets + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, + mode='fan_out', + nonlinearity=self.hparams.act_fn_name) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.input_net(x) + x = self.blocks(x) + x = self.output_net(x) + return x diff --git a/pl-hydra/src/models/components/vgg.py b/pl-hydra/src/models/components/vgg.py new file mode 100644 index 0000000..ccd0926 --- /dev/null +++ b/pl-hydra/src/models/components/vgg.py @@ -0,0 +1,111 @@ +''' +Modified from https://github.com/chengyangfu/pytorch-vgg-cifar10.git +''' + +## Standard libraries +import os + +## PyTorch +import torch +import torch.nn as nn +import torch.utils.data as data +import torch.optim as optim +from types import SimpleNamespace + + +act_fn_by_name = { + "tanh": nn.Tanh, + "relu": nn.ReLU, + "leakyrelu": nn.LeakyReLU, + "gelu": nn.GELU +} + + +import math + +import torch.nn as nn +import torch.nn.init as init + + +class CnnBlock(nn.Module): + + def __init__(self, c_in, c_out, act_fn): + """ + Inputs: + c_in - Number of input feature maps from the previous layers + c_out - Number of output feature maps + act_fn - Activation class constructor (e.g. 
nn.ReLU) + """ + super().__init__() + + self.conv = nn.Sequential( + nn.Conv2d(c_in, c_out, kernel_size=3, padding=1), + nn.BatchNorm2d(c_out), + act_fn() + ) + + def forward(self, x): + return self.conv(x) + +class VGG11(nn.Module): + ''' + VGG model + ''' + def __init__(self, + num_classes: int=10, + act_fn_name = "relu", + **kwargs): + super().__init__() + self.hparams = SimpleNamespace(num_classes=num_classes, + act_fn_name=act_fn_name) + #print(self.hparams) + self._create_network() + self._init_params() + + + def _create_network(self): + + # Creating the features map + self.vgg_blocks = nn.Sequential( + CnnBlock(3, 64, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + nn.MaxPool2d(kernel_size=2, stride=2), + CnnBlock(64, 128, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + nn.MaxPool2d(kernel_size=2, stride=2), + CnnBlock(128, 256, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + CnnBlock(256, 256, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + nn.MaxPool2d(kernel_size=2, stride=2), + CnnBlock(256, 512, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + CnnBlock(512, 512, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + nn.MaxPool2d(kernel_size=2, stride=2), + CnnBlock(512, 512, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + CnnBlock(512, 512, act_fn=act_fn_by_name[self.hparams.act_fn_name]), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + + # Mapping to classification output + self.output_net = nn.Sequential(nn.Flatten(), + nn.Dropout(0.4), + nn.Linear(512, 512), + act_fn_by_name[self.hparams.act_fn_name](), + nn.Dropout(0.4), + nn.Linear(512, 512), + act_fn_by_name[self.hparams.act_fn_name](), + nn.Linear(512, self.hparams.num_classes), + ) + + def _init_params(self): + # Based on our discussion in Tutorial 4, we should initialize the convolutions according to the activation function + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, + nonlinearity=self.hparams.act_fn_name) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + + def forward(self, x): + x = self.vgg_blocks(x) + x = self.output_net(x) + return x \ No newline at end of file diff --git a/pl-hydra/src/models/components/vit.py b/pl-hydra/src/models/components/vit.py new file mode 100644 index 0000000..1d365b2 --- /dev/null +++ b/pl-hydra/src/models/components/vit.py @@ -0,0 +1,128 @@ +''' +Modified from https://github.com/phlippe/uvadlc_notebooks.git +''' + +## Standard libraries +import os + +## PyTorch +import torch +import torch.nn as nn +import torch.utils.data as data +import torch.optim as optim +from types import SimpleNamespace + + +act_fn_by_name = { + "tanh": nn.Tanh, + "relu": nn.ReLU, + "leakyrelu": nn.LeakyReLU, + "gelu": nn.GELU +} + +# helper function +def img_to_patch(x, patch_size, flatten_channels=True): + """ + Inputs: + x - torch.Tensor representing the image of shape [B, C, H, W] + patch_size - Number of pixels per dimension of the patches (integer) + flatten_channels - If True, the patches will be returned in a flattened format + as a feature vector instead of a image grid. 
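+ Output shape: [B, H'*W', C*p_H*p_W] if flatten_channels is True, otherwise [B, H'*W', C, p_H, p_W].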
+ """ + B, C, H, W = x.shape + x = x.reshape(B, C, H//patch_size, patch_size, W//patch_size, patch_size) + x = x.permute(0, 2, 4, 1, 3, 5) # [B, H', W', C, p_H, p_W] + x = x.flatten(1,2) # [B, H'*W', C, p_H, p_W] + if flatten_channels: + x = x.flatten(2,4) # [B, H'*W', C*p_H*p_W] + return x + + +# Attention block +class AttentionBlock(nn.Module): + + def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super().__init__() + + self.layer_norm_1 = nn.LayerNorm(embed_dim) + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.layer_norm_2 = nn.LayerNorm(embed_dim) + self.linear = nn.Sequential( + nn.Linear(embed_dim, hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, embed_dim), + nn.Dropout(dropout) + ) + + + def forward(self, x): + inp_x = self.layer_norm_1(x) + x = x + self.attn(inp_x, inp_x, inp_x)[0] + x = x + self.linear(self.layer_norm_2(x)) + return x + + +class VisionTransformer(nn.Module): + + def __init__(self, embed_dim, hidden_dim, num_channels, num_heads, num_layers, num_classes, patch_size, num_patches, dropout=0.0): + """ + Inputs: + embed_dim - Dimensionality of the input feature vectors to the Transformer + hidden_dim - Dimensionality of the hidden layer in the feed-forward networks + within the Transformer + num_channels - Number of channels of the input (3 for RGB) + num_heads - Number of heads to use in the Multi-Head Attention block + num_layers - Number of layers to use in the Transformer + num_classes - Number of classes to predict + patch_size - Number of pixels that the patches have per dimension + num_patches - Maximum number of patches an image can have + dropout - Amount of dropout to apply in the feed-forward network and + on the input encoding + """ + super().__init__() + + self.patch_size = patch_size + + # Layers/Networks + self.input_layer = nn.Linear(num_channels*(patch_size**2), embed_dim) + self.transformer = nn.Sequential(*[AttentionBlock(embed_dim, hidden_dim, num_heads, dropout=dropout) for _ in range(num_layers)]) + self.mlp_head = nn.Sequential( + nn.LayerNorm(embed_dim), + nn.Linear(embed_dim, num_classes) + ) + self.dropout = nn.Dropout(dropout) + + # Parameters/Embeddings + self.cls_token = nn.Parameter(torch.randn(1,1,embed_dim)) + self.pos_embedding = nn.Parameter(torch.randn(1,1+num_patches,embed_dim)) + + + def forward(self, x): + # Preprocess input + x = img_to_patch(x, self.patch_size) + B, T, _ = x.shape + x = self.input_layer(x) + + # Add CLS token and positional encoding + cls_token = self.cls_token.repeat(B, 1, 1) + x = torch.cat([cls_token, x], dim=1) + x = x + self.pos_embedding[:,:T+1] + + # Apply Transforrmer + x = self.dropout(x) + x = x.transpose(0, 1) + x = self.transformer(x) + + # Perform classification prediction + cls = x[0] + out = self.mlp_head(cls) + return out \ No newline at end of file diff --git a/pl-hydra/src/testing_pipeline.py b/pl-hydra/src/testing_pipeline.py new file mode 100644 index 0000000..abd030a --- /dev/null +++ b/pl-hydra/src/testing_pipeline.py @@ -0,0 +1,57 @@ +import os +from typing import List + +import hydra +from omegaconf import DictConfig +from pytorch_lightning import LightningDataModule, LightningModule, Trainer, 
seed_everything +from pytorch_lightning.loggers import LightningLoggerBase + +from src import utils + +log = utils.get_logger(__name__) + + +def test(config: DictConfig) -> None: + """Contains minimal example of the testing pipeline. Evaluates given checkpoint on a testset. + + Args: + config (DictConfig): Configuration composed by Hydra. + + Returns: + None + """ + + # Set seed for random number generators in pytorch, numpy and python.random + if config.get("seed"): + seed_everything(config.seed, workers=True) + + # Convert relative ckpt path to absolute path if necessary + if not os.path.isabs(config.ckpt_path): + config.ckpt_path = os.path.join(hydra.utils.get_original_cwd(), config.ckpt_path) + + # Init lightning datamodule + log.info(f"Instantiating datamodule <{config.datamodule._target_}>") + datamodule: LightningDataModule = hydra.utils.instantiate(config.datamodule) + + # Init lightning model + log.info(f"Instantiating model <{config.model._target_}>") + model: LightningModule = hydra.utils.instantiate(config.model) + + # Init lightning loggers + logger: List[LightningLoggerBase] = [] + if "logger" in config: + for _, lg_conf in config.logger.items(): + if "_target_" in lg_conf: + log.info(f"Instantiating logger <{lg_conf._target_}>") + logger.append(hydra.utils.instantiate(lg_conf)) + + # Init lightning trainer + log.info(f"Instantiating trainer <{config.trainer._target_}>") + trainer: Trainer = hydra.utils.instantiate(config.trainer, logger=logger) + + # Log hyperparameters + if trainer.logger: + trainer.logger.log_hyperparams({"ckpt_path": config.ckpt_path}) + + log.info("Starting testing!") + trainer.test(model=model, datamodule=datamodule, ckpt_path=config.ckpt_path) diff --git a/pl-hydra/src/training_pipeline.py b/pl-hydra/src/training_pipeline.py new file mode 100644 index 0000000..fea28c1 --- /dev/null +++ b/pl-hydra/src/training_pipeline.py @@ -0,0 +1,126 @@ +import os +from typing import List, Optional + +import hydra +from omegaconf import DictConfig, OmegaConf +from pytorch_lightning import ( + Callback, + LightningDataModule, + LightningModule, + Trainer, + seed_everything, +) +from pytorch_lightning.loggers import LightningLoggerBase + +from src import utils + +log = utils.get_logger(__name__) + + +def train(config: DictConfig) -> Optional[float]: + """Contains the training pipeline. Can additionally evaluate model on a testset, using best + weights achieved during training. + + Args: + config (DictConfig): Configuration composed by Hydra. + + Returns: + Optional[float]: Metric score for hyperparameter optimization. 
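+ The score is read from trainer.callback_metrics via the `optimized_metric` key; None is returned when no metric is configured.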
+ """ + + # Set seed for random number generators in pytorch, numpy and python.random + if config.get("seed"): + seed_everything(config.seed, workers=True) + + # Convert relative ckpt path to absolute path if necessary + ckpt_path = config.trainer.get("resume_from_checkpoint") + if ckpt_path and not os.path.isabs(ckpt_path): + config.trainer.resume_from_checkpoint = os.path.join( + hydra.utils.get_original_cwd(), ckpt_path + ) + + # Init lightning datamodule + log.info(f"Instantiating datamodule <{config.datamodule._target_}>") + datamodule: LightningDataModule = hydra.utils.instantiate(config.datamodule) + + # Init lightning model + log.info(f"Instantiating model <{config.model._target_}>") + model: LightningModule = hydra.utils.instantiate(config.model, + optim=config.optim, + _recursive_=False, + _convert_ = "partial") + + # Init lightning callbacks + callbacks: List[Callback] = [] + if "callbacks" in config: + for _, cb_conf in config.callbacks.items(): + if "_target_" in cb_conf: + log.info(f"Instantiating callback <{cb_conf._target_}>") + callbacks.append(hydra.utils.instantiate(cb_conf)) + + # Init lightning loggers + logger: List[LightningLoggerBase] = [] + if "logger" in config: + for _, lg_conf in config.logger.items(): + if "_target_" in lg_conf: + log.info(f"Instantiating logger <{lg_conf._target_}>") + logger.append(hydra.utils.instantiate(lg_conf)) + + # Init lightning trainer + log.info(f"Instantiating trainer <{config.trainer._target_}>") + trainer: Trainer = hydra.utils.instantiate( + config.trainer, + callbacks=callbacks, + logger=logger, + _convert_="partial") + + # Send some parameters from config to all lightning loggers + log.info("Logging hyperparameters!") + utils.log_hyperparameters( + config=config, + model=model, + datamodule=datamodule, + trainer=trainer, + callbacks=callbacks, + logger=logger, + ) + + # Train the model + if config.get("train"): + log.info("Starting training!") + trainer.fit(model=model, datamodule=datamodule) + + # Get metric score for hyperparameter optimization + optimized_metric = config.get("optimized_metric") + if optimized_metric and optimized_metric not in trainer.callback_metrics: + raise Exception( + "Metric for hyperparameter optimization not found! " + "Make sure the `optimized_metric` in `hparams_search` config is correct!" 
+ ) + score = trainer.callback_metrics.get(optimized_metric) + + # Test the model + if config.get("test"): + ckpt_path = "best" + if not config.get("train") or config.trainer.get("fast_dev_run"): + ckpt_path = None + log.info("Starting testing!") + trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path) + + # Make sure everything closed properly + log.info("Finalizing!") + utils.finish( + config=config, + model=model, + datamodule=datamodule, + trainer=trainer, + callbacks=callbacks, + logger=logger, + ) + + # Print path to best checkpoint + if not config.trainer.get("fast_dev_run") and config.get("train"): + log.info(f"Best model ckpt at {trainer.checkpoint_callback.best_model_path}") + + # Return metric score for hyperparameter optimization + return score diff --git a/pl-hydra/src/utils/__init__.py b/pl-hydra/src/utils/__init__.py new file mode 100644 index 0000000..b31f39a --- /dev/null +++ b/pl-hydra/src/utils/__init__.py @@ -0,0 +1,164 @@ +import logging +import warnings +from typing import List, Sequence + +import pytorch_lightning as pl +import rich.syntax +import rich.tree +from omegaconf import DictConfig, OmegaConf +from pytorch_lightning.utilities import rank_zero_only + + +def get_logger(name=__name__) -> logging.Logger: + """Initializes multi-GPU-friendly python command line logger.""" + + logger = logging.getLogger(name) + + # this ensures all logging levels get marked with the rank zero decorator + # otherwise logs would get multiplied for each GPU process in multi-GPU setup + for level in ( + "debug", + "info", + "warning", + "error", + "exception", + "fatal", + "critical", + ): + setattr(logger, level, rank_zero_only(getattr(logger, level))) + + return logger + + +log = get_logger(__name__) + + +def extras(config: DictConfig) -> None: + """Applies optional utilities, controlled by config flags. + + Utilities: + - Ignoring python warnings + - Rich config printing + """ + + # disable python warnings if + if config.get("ignore_warnings"): + log.info("Disabling python warnings! ") + warnings.filterwarnings("ignore") + + # pretty print config tree using Rich library if + if config.get("print_config"): + log.info("Printing config tree with Rich! ") + print_config(config, resolve=True) + + +@rank_zero_only +def print_config( + config: DictConfig, + print_order: Sequence[str] = ( + "datamodule", + "model", + "callbacks", + "logger", + "trainer", + ), + resolve: bool = True, +) -> None: + """Prints content of DictConfig using Rich library and its tree structure. + + Args: + config (DictConfig): Configuration composed by Hydra. + print_order (Sequence[str], optional): Determines in what order config components are printed. + resolve (bool, optional): Whether to resolve reference fields of DictConfig. 
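+ The rendered tree is also written to `config_tree.log` in the current working directory (the Hydra run directory).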
+ """ + + style = "dim" + tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) + + quee = [] + + for field in print_order: + quee.append(field) if field in config else log.info(f"Field '{field}' not found in config") + + for field in config: + if field not in quee: + quee.append(field) + + for field in quee: + branch = tree.add(field, style=style, guide_style=style) + + config_group = config[field] + if isinstance(config_group, DictConfig): + branch_content = OmegaConf.to_yaml(config_group, resolve=resolve) + else: + branch_content = str(config_group) + + branch.add(rich.syntax.Syntax(branch_content, "yaml")) + + rich.print(tree) + + with open("config_tree.log", "w") as file: + rich.print(tree, file=file) + + +@rank_zero_only +def log_hyperparameters( + config: DictConfig, + model: pl.LightningModule, + datamodule: pl.LightningDataModule, + trainer: pl.Trainer, + callbacks: List[pl.Callback], + logger: List[pl.loggers.LightningLoggerBase], +) -> None: + """Controls which config parts are saved by Lightning loggers. + + Additionaly saves: + - number of model parameters + """ + + if not trainer.logger: + return + + hparams = {} + + # choose which parts of hydra config will be saved to loggers + hparams["model"] = config["model"] + + # save number of model parameters + hparams["model/params/total"] = sum(p.numel() for p in model.parameters()) + hparams["model/params/trainable"] = sum( + p.numel() for p in model.parameters() if p.requires_grad + ) + hparams["model/params/non_trainable"] = sum( + p.numel() for p in model.parameters() if not p.requires_grad + ) + + hparams["datamodule"] = config["datamodule"] + hparams["trainer"] = config["trainer"] + + if "seed" in config: + hparams["seed"] = config["seed"] + if "callbacks" in config: + hparams["callbacks"] = config["callbacks"] + + hparams["optim"] = config["optim"] + # send hparams to all loggers + trainer.logger.log_hyperparams(hparams) + + +def finish( + config: DictConfig, + model: pl.LightningModule, + datamodule: pl.LightningDataModule, + trainer: pl.Trainer, + callbacks: List[pl.Callback], + logger: List[pl.loggers.LightningLoggerBase], +) -> None: + """Makes sure everything closed properly.""" + + # without this sweeps with wandb logger might crash! 
+ for lg in logger: + if isinstance(lg, pl.loggers.wandb.WandbLogger): + import wandb + + wandb.finish() diff --git a/pl-hydra/src/utils/plotter.py b/pl-hydra/src/utils/plotter.py new file mode 100644 index 0000000..8ef3541 --- /dev/null +++ b/pl-hydra/src/utils/plotter.py @@ -0,0 +1,37 @@ +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt + +def plot_cm(cm, name_classes): + cm_np = cm.cpu().numpy() + df_cm = pd.DataFrame((cm_np/np.sum(cm_np))*10, + index = [i for i in name_classes], + columns= [i for i in name_classes]) + + plt.figure(figsize = (10,7)) + fig_ = sns.heatmap(df_cm, annot=True, cmap=None).get_figure() + return fig_ + +def plot_preds(images, labels, preds, name_classes, nimg=32, ncols=8, + data_mean=[], data_std=[]): + nrows = nimg//ncols + # define figure + fig_, axes=plt.subplots(nrows, ncols, figsize=(12, 8)) + axes = axes.ravel() + + #print(np.min(images), np.max(images)) + for i in range(nimg): + label_name = name_classes[labels[i]] + pred_name = name_classes[preds[i]] + image = images[i] + image[0] = image[0]*data_std[0] + data_mean[0] + image[1] = image[1]*data_std[1] + data_mean[1] + image[2] = image[2]*data_std[2] + data_mean[2] + #print(np.min(image), np.max(image)) + image = np.transpose((image*255).astype('uint8'), (1,2,0)) + axes[i].imshow(image) + axes[i].set_title(f'label: {label_name} \n pred: {pred_name}', fontsize=8) + axes[i].axis('off') + plt.subplots_adjust(hspace=0.2) + return fig_ \ No newline at end of file diff --git a/pl-hydra/src/vendor/__init__.py b/pl-hydra/src/vendor/__init__.py new file mode 100644 index 0000000..203cb9c --- /dev/null +++ b/pl-hydra/src/vendor/__init__.py @@ -0,0 +1 @@ +# use this folder for storing third party code that cannot be installed using pip/conda diff --git a/pl-hydra/test.py b/pl-hydra/test.py new file mode 100644 index 0000000..ee02d04 --- /dev/null +++ b/pl-hydra/test.py @@ -0,0 +1,26 @@ +import dotenv +import hydra +from omegaconf import DictConfig + +# load environment variables from `.env` file if it exists +# recursively searches for `.env` in all folders starting from work dir +dotenv.load_dotenv(override=True) + + +@hydra.main(config_path="configs/", config_name="test.yaml") +def main(config: DictConfig): + + # Imports can be nested inside @hydra.main to optimize tab completion + # https://github.com/facebookresearch/hydra/issues/934 + from src import utils + from src.testing_pipeline import test + + # Applies optional utilities + utils.extras(config) + + # Evaluate model + return test(config) + + +if __name__ == "__main__": + main() diff --git a/pl-hydra/tests/__init__.py b/pl-hydra/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/tests/helpers/__init__.py b/pl-hydra/tests/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/tests/helpers/module_available.py b/pl-hydra/tests/helpers/module_available.py new file mode 100644 index 0000000..d3137f3 --- /dev/null +++ b/pl-hydra/tests/helpers/module_available.py @@ -0,0 +1,28 @@ +import platform +from importlib.util import find_spec + +""" +Adapted from: + https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/utilities/imports.py +""" + + +def _module_available(module_path: str) -> bool: + """Check if a path is available in your environment.
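+ Returns False instead of raising if any package along the dotted path is missing.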
+ + >>> _module_available('os') + True + >>> _module_available('bla.bla') + False + """ + try: + return find_spec(module_path) is not None + except ModuleNotFoundError: + # Python 3.7+ + return False + + +_IS_WINDOWS = platform.system() == "Windows" +_DEEPSPEED_AVAILABLE = not _IS_WINDOWS and _module_available("deepspeed") +_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") +_RPC_AVAILABLE = not _IS_WINDOWS and _module_available("torch.distributed.rpc") diff --git a/pl-hydra/tests/helpers/run_command.py b/pl-hydra/tests/helpers/run_command.py new file mode 100644 index 0000000..1670988 --- /dev/null +++ b/pl-hydra/tests/helpers/run_command.py @@ -0,0 +1,15 @@ +from typing import List + +import pytest +import sh + + +def run_command(command: List[str]): + """Default method for executing shell commands with pytest.""" + msg = None + try: + sh.python(command) + except sh.ErrorReturnCode as e: + msg = e.stderr.decode() + if msg: + pytest.fail(msg=msg) diff --git a/pl-hydra/tests/helpers/runif.py b/pl-hydra/tests/helpers/runif.py new file mode 100644 index 0000000..36d73e1 --- /dev/null +++ b/pl-hydra/tests/helpers/runif.py @@ -0,0 +1,104 @@ +import sys +from typing import Optional + +import pytest +import torch +from packaging.version import Version +from pkg_resources import get_distribution + +""" +Adapted from: + https://github.com/PyTorchLightning/pytorch-lightning/blob/master/tests/helpers/runif.py +""" + +from tests.helpers.module_available import ( + _DEEPSPEED_AVAILABLE, + _FAIRSCALE_AVAILABLE, + _IS_WINDOWS, + _RPC_AVAILABLE, +) + + +class RunIf: + """RunIf wrapper for conditional skipping of tests. + + Fully compatible with `@pytest.mark`. + + Example: + + @RunIf(min_torch="1.8") + @pytest.mark.parametrize("arg1", [1.0, 2.0]) + def test_wrapper(arg1): + assert arg1 > 0 + """ + + def __new__( + self, + min_gpus: int = 0, + min_torch: Optional[str] = None, + max_torch: Optional[str] = None, + min_python: Optional[str] = None, + skip_windows: bool = False, + rpc: bool = False, + fairscale: bool = False, + deepspeed: bool = False, + **kwargs, + ): + """ + Args: + min_gpus: min number of gpus required to run test + min_torch: minimum pytorch version to run test + max_torch: maximum pytorch version to run test + min_python: minimum python version required to run test + skip_windows: skip test for Windows platform + rpc: requires Remote Procedure Call (RPC) + fairscale: if `fairscale` module is required to run the test + deepspeed: if `deepspeed` module is required to run the test + kwargs: native pytest.mark.skipif keyword arguments + """ + conditions = [] + reasons = [] + + if min_gpus: + conditions.append(torch.cuda.device_count() < min_gpus) + reasons.append(f"GPUs>={min_gpus}") + + if min_torch: + torch_version = get_distribution("torch").version + conditions.append(Version(torch_version) < Version(min_torch)) + reasons.append(f"torch>={min_torch}") + + if max_torch: + torch_version = get_distribution("torch").version + conditions.append(Version(torch_version) >= Version(max_torch)) + reasons.append(f"torch<{max_torch}") + + if min_python: + py_version = ( + f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ) + conditions.append(Version(py_version) < Version(min_python)) + reasons.append(f"python>={min_python}") + + if skip_windows: + conditions.append(_IS_WINDOWS) + reasons.append("does not run on Windows") + + if rpc: + conditions.append(not _RPC_AVAILABLE) + reasons.append("RPC") + + if fairscale: + conditions.append(not 
_FAIRSCALE_AVAILABLE) + reasons.append("Fairscale") + + if deepspeed: + conditions.append(not _DEEPSPEED_AVAILABLE) + reasons.append("Deepspeed") + + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] + return pytest.mark.skipif( + condition=any(conditions), + reason=f"Requires: [{' + '.join(reasons)}]", + **kwargs, + ) diff --git a/pl-hydra/tests/shell/__init__.py b/pl-hydra/tests/shell/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/tests/shell/test_basic_commands.py b/pl-hydra/tests/shell/test_basic_commands.py new file mode 100644 index 0000000..f708ede --- /dev/null +++ b/pl-hydra/tests/shell/test_basic_commands.py @@ -0,0 +1,58 @@ +import pytest + +from tests.helpers.run_command import run_command +from tests.helpers.runif import RunIf + +""" +A couple of sanity checks to make sure the model doesn't crash with different running options. +""" + + +def test_fast_dev_run(): + """Test running for 1 train, val and test batch.""" + command = ["train.py", "++trainer.fast_dev_run=true"] + run_command(command) + + +@pytest.mark.slow +def test_cpu(): + """Test running 1 epoch on CPU.""" + command = ["train.py", "++trainer.max_epochs=1", "++trainer.gpus=0"] + run_command(command) + + +# use RunIf to skip execution of some tests, e.g. when no gpus are available +@RunIf(min_gpus=1) +@pytest.mark.slow +def test_gpu(): + """Test running 1 epoch on GPU.""" + command = [ + "train.py", + "++trainer.max_epochs=1", + "++trainer.gpus=1", + ] + run_command(command) + + +@RunIf(min_gpus=1) +@pytest.mark.slow +def test_mixed_precision(): + """Test running 1 epoch with pytorch native automatic mixed precision (AMP).""" + command = [ + "train.py", + "++trainer.max_epochs=1", + "++trainer.gpus=1", + "++trainer.precision=16", + ] + run_command(command) + + +@pytest.mark.slow +def test_double_validation_loop(): + """Test running 1 epoch with validation loop twice per epoch.""" + command = [ + "train.py", + "++trainer.max_epochs=1", + "++trainer.val_check_interval=0.5", + ] + run_command(command) diff --git a/pl-hydra/tests/shell/test_debug_configs.py b/pl-hydra/tests/shell/test_debug_configs.py new file mode 100644 index 0000000..a73dda8 --- /dev/null +++ b/pl-hydra/tests/shell/test_debug_configs.py @@ -0,0 +1,35 @@ +import pytest + +from tests.helpers.run_command import run_command + + +@pytest.mark.slow +def test_debug_default(): + command = ["train.py", "debug=default"] + run_command(command) + + +def test_debug_limit_batches(): + command = ["train.py", "debug=limit_batches"] + run_command(command) + + +def test_debug_overfit(): + command = ["train.py", "debug=overfit"] + run_command(command) + + +@pytest.mark.slow +def test_debug_profiler(): + command = ["train.py", "debug=profiler"] + run_command(command) + + +def test_debug_step(): + command = ["train.py", "debug=step"] + run_command(command) + + +def test_debug_test_only(): + command = ["train.py", "debug=test_only"] + run_command(command) diff --git a/pl-hydra/tests/shell/test_sweeps.py b/pl-hydra/tests/shell/test_sweeps.py new file mode 100644 index 0000000..10a298d --- /dev/null +++ b/pl-hydra/tests/shell/test_sweeps.py @@ -0,0 +1,44 @@ +import pytest + +from tests.helpers.run_command import run_command + +""" +A couple of tests executing hydra sweeps. 
+ +Use the following command to skip slow tests: + pytest -k "not slow" +""" + + +@pytest.mark.slow +def test_experiments(): + """Test running all available experiment configs for 1 epoch.""" + command = ["train.py", "-m", "experiment=glob(*)", "++trainer.max_epochs=1"] + run_command(command) + + +@pytest.mark.slow +def test_default_sweep(): + """Test default Hydra sweeper.""" + command = [ + "train.py", + "-m", + "datamodule.batch_size=64,128", + "model.lr=0.01,0.02", + "trainer=default", + "++trainer.fast_dev_run=true", + ] + run_command(command) + + +@pytest.mark.slow +def test_optuna_sweep(): + """Test Optuna sweeper.""" + command = [ + "train.py", + "-m", + "hparams_search=mnist_optuna", + "trainer=default", + "++trainer.fast_dev_run=true", + ] + run_command(command) diff --git a/pl-hydra/tests/unit/__init__.py b/pl-hydra/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pl-hydra/tests/unit/test_mnist_datamodule.py b/pl-hydra/tests/unit/test_mnist_datamodule.py new file mode 100644 index 0000000..91e6182 --- /dev/null +++ b/pl-hydra/tests/unit/test_mnist_datamodule.py @@ -0,0 +1,36 @@ +import os + +import pytest +import torch + +from src.datamodules.mnist_datamodule import MNISTDataModule + + +@pytest.mark.parametrize("batch_size", [32, 128]) +def test_mnist_datamodule(batch_size): + datamodule = MNISTDataModule(batch_size=batch_size) + datamodule.prepare_data() + + assert not datamodule.data_train and not datamodule.data_val and not datamodule.data_test + + assert os.path.exists(os.path.join("data", "MNIST")) + assert os.path.exists(os.path.join("data", "MNIST", "raw")) + + datamodule.setup() + + assert datamodule.data_train and datamodule.data_val and datamodule.data_test + assert ( + len(datamodule.data_train) + len(datamodule.data_val) + len(datamodule.data_test) == 70_000 + ) + + assert datamodule.train_dataloader() + assert datamodule.val_dataloader() + assert datamodule.test_dataloader() + + batch = next(iter(datamodule.train_dataloader())) + x, y = batch + + assert len(x) == batch_size + assert len(y) == batch_size + assert x.dtype == torch.float32 + assert y.dtype == torch.int64 diff --git a/pl-hydra/train.py b/pl-hydra/train.py new file mode 100644 index 0000000..05b78f3 --- /dev/null +++ b/pl-hydra/train.py @@ -0,0 +1,33 @@ +''' +Modified from https://github.com/phlippe/uvadlc_notebooks.git +''' + +import dotenv +import hydra +from omegaconf import DictConfig +import os +# Set the visible GPUs (curent machine has 16 GPUS [0-15]) +os.environ["CUDA_VISIBLE_DEVICES"]="11" + +# load environment variables from `.env` file if it exists +# recursively searches for `.env` in all folders starting from work dir +dotenv.load_dotenv(override=True) + + +@hydra.main(config_path="configs/", config_name="train.yaml") +def main(config: DictConfig): + + # Imports can be nested inside @hydra.main to optimize tab completion + # https://github.com/facebookresearch/hydra/issues/934 + from src import utils + from src.training_pipeline import train + + # Applies optional utilities + utils.extras(config) + + # Train model + return train(config) + + +if __name__ == "__main__": + main()
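A minimal usage sketch, assuming the commands are run from the pl-hydra/ directory with the configs added above (checkpoint path below is a placeholder; ckpt_path is assumed to be set in configs/test.yaml or overridden on the command line):

    python train.py ++trainer.fast_dev_run=true                                    # smoke test: one train/val/test batch
    python train.py trainer=default ++trainer.max_epochs=1 ++trainer.gpus=0        # short CPU run with the default trainer
    python train.py -m hparams_search=mnist_optuna trainer=default ++trainer.fast_dev_run=true   # Optuna sweep (multirun)
    python test.py ckpt_path=<path/to/checkpoint.ckpt>                              # evaluate a saved checkpoint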