diff --git a/README.md b/README.md
index 797cacb..e1852ab 100644
--- a/README.md
+++ b/README.md
@@ -23,28 +23,32 @@ going to be overkill. However, if you want to get lost in the world of neural ne
 for months on end then this is probably the right place for you :)
 
 Among other things Pywick includes:
-- State of the art normalization, activation, loss functions and
-  optimizers not included in the standard Pytorch library (Addsign, Eve, Lookahead, Radam, Ralamb, RangerLARS etc).
-- A high-level module for training with callbacks, constraints, metrics,
-  conditions and regularizers.
-- Dozens of popular object classification and semantic segmentation models.
+- State of the art normalization, activation, loss functions and optimizers not included in the standard Pytorch library (AdaBelief, Addsign, Apollo, Eve, Lookahead, Radam, Ralamb, RangerLARS etc.).
+- A high-level module for training with callbacks, constraints, metrics, conditions and regularizers.
+- Hundreds of popular object classification and semantic segmentation models!
 - Comprehensive data loading, augmentation, transforms, and sampling capability.
 - Utility tensor functions.
 - Useful meters.
 - Basic GridSearch (exhaustive and random).
 
 ## Docs
-Hey, [check this out](https://pywick.readthedocs.io/en/latest/), we now
-have [docs](https://pywick.readthedocs.io/en/latest/)! They're still a
-work in progress though so apologies for anything that's broken.
+Hey, [check this out](https://pywick.readthedocs.io/en/latest/), we now have [docs](https://pywick.readthedocs.io/en/latest/)! They're still a work in progress though so apologies for anything that's broken.
 
 ## What's New (highlights)
+
+### v0.6.0 - We thought ya might like YAML!
+So you're saying you like **configuration files**? You're saying you like **examples** too? Well, we've got you covered! Huge release today with a configuration-based training example! All you have to do is:
+  - Get your favorite dataset (or download [17 flowers](https://www.robots.ox.ac.uk/~vgg/data/flowers/17/) to get started and use `pywick/examples/17flowers_split.py` to convert it)
+  - Adjust the `configs/train_classifier.yaml` file to fit your workspace
+  - Then simply run: `python3 train_classifier.py configs/train_classifier.yaml` and watch it train!
+
+### Older Notes
 - **May 6, 2021**
   - Many SoTA classification and segmentation models added: Swin-Transformer variants, NFNet variants (L0, L1), Halo nets, Lambda nets, ECA variants, Rexnet + others
   - Many new loss functions added: RecallLoss, SoftInvDiceLoss, OhemBCEDicePenalizeBorderLoss, RMIBCEDicePenalizeBorderLoss + others
   - Bug fixes
 - **Jun. 15, 2020**
-  - 200+ models added from [rwightman's](https://github.com/rwightman/pytorch-image-models) repo via `torch.hub`! See docs for all the variants!
+  - 700+ models added from [rwightman's](https://github.com/rwightman/pytorch-image-models) repo via `torch.hub`! See docs for all the variants!
   - Some minor bug fixes
 - **Jan. 20, 2020**
   - New release: 0.5.6 (minor fix from 0.5.5 for pypi)
@@ -63,74 +67,23 @@ work in progress though so apologies for anything that's broken.
     - General bug fixes and code improvements 
 
 ## Install
-Pywick requires **pytorch >= 1.0**
+Pywick requires **pytorch >= 1.4**
 
 `pip install pywick`
 
 or specific version from git:
 
-`pip install git+https://github.com/achaiah/pywick.git@v0.5.6`
+`pip install git+https://github.com/achaiah/pywick.git@v0.6.0`
 
 ## ModuleTrainer
-The `ModuleTrainer` class provides a high-level training interface which abstracts
-away the training loop while providing callbacks, constraints, initializers, regularizers,
+The `ModuleTrainer` class provides a high-level training interface which abstracts away the training loop while providing callbacks, constraints, initializers, regularizers,
 and more.
 
-Example:
-```python
-from pywick.modules import ModuleTrainer
-from pywick.initializers import XavierUniform
-from pywick.metrics import CategoricalAccuracySingleInput
-import torch.nn as nn
-import torch.functional as F
-
-# Define your model EXACTLY as normal
-class Network(nn.Module):
-    def __init__(self):
-        super(Network, self).__init__()
-        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
-        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
-        self.fc1 = nn.Linear(1600, 128)
-        self.fc2 = nn.Linear(128, 10)
-
-    def forward(self, x):
-        x = F.relu(F.max_pool2d(self.conv1(x), 2))
-        x = F.relu(F.max_pool2d(self.conv2(x), 2))
-        x = x.view(-1, 1600)
-        x = F.relu(self.fc1(x))
-        x = F.dropout(x, training=self.training)
-        x = self.fc2(x)
-        return F.log_softmax(x)
-
-model = Network()
-trainer = ModuleTrainer(model)   # optionally supply cuda_devices as a parameter
-
-initializers = [XavierUniform(bias=False, module_filter='fc*')]
-
-# initialize metrics with top1 and top5 
-metrics = [CategoricalAccuracySingleInput(top_k=1), CategoricalAccuracySingleInput(top_k=5)]
-
-trainer.compile(loss='cross_entropy',
-                # callbacks=callbacks,          # define your callbacks here (e.g. model saver, LR scheduler)
-                # regularizers=regularizers,    # define regularizers
-                # constraints=constraints,      # define constraints
-                optimizer='sgd',
-                initializers=initializers,
-                metrics=metrics)
-
-trainer.fit_loader(train_dataset_loader, 
-            val_loader=val_dataset_loader,
-            num_epoch=20,
-            verbose=1)
-```
-You also have access to the standard evaluation and prediction functions:
+See the `train_classifier.py` example for a pretty complete configuration example. To get up and running with your own data quickly, simply edit the `configs/train_classifier.yaml` file with your desired parameters and dataset location(s).
 
-```python
-loss = trainer.evaluate(x_train, y_train)
-y_pred = trainer.predict(x_train)
-```
-PyWick provides a wide range of <b>callbacks</b>, generally mimicking the interface
-found in `Keras`:
+Note: <i>The dataset needs to be organized for classification, where each directory name is the name of a class and contains all images pertaining to that class.</i>
+
+PyWick provides a wide range of <b>callbacks</b>, generally mimicking the interface found in `Keras`:
 
 - `CSVLogger` - Logs epoch-level metrics to a CSV file
 - [`CyclicLRScheduler`](https://github.com/bckenstler/CLR) - Cycles through min-max learning rate
@@ -141,9 +94,7 @@ found in `Keras`:
 - `ModelCheckpoint` - Comprehensive model saver
 - `ReduceLROnPlateau` - Reduces learning rate (LR) when a plateau has been reached
 - `SimpleModelCheckpoint` - Simple model saver
-- Additionally, a `TensorboardLogger` is incredibly easy to implement
-  via the [TensorboardX](https://github.com/lanpa/tensorboardX) (now
-  part of pytorch 1.1 release!)
+- Additionally, a `TensorboardLogger` is incredibly easy to implement via tensorboardX (now part of the pytorch 1.1 release!)
 
 
 ```python
@@ -168,7 +119,7 @@ and <b>constraints</b>:
 Both regularizers and constraints can be selectively applied on layers using regular expressions and the `module_filter`
 argument. Constraints can be explicit (hard) constraints applied at an arbitrary batch or
 epoch frequency, or they can be implicit (soft) constraints similar to regularizers
-where the the constraint deviation is added as a penalty to the total model loss.
+where the constraint deviation is added as a penalty to the total model loss.
 
 ```python
 from pywick.constraints import MaxNorm, NonNeg
@@ -239,6 +190,7 @@ trainer.fit_loader(loader, val_loader=val_loader, num_epoch=100)
 - [**TResNet: High Performance GPU-Dedicated Architecture**](https://arxiv.org/abs/2003.13630)
 - [**Wide Resnet**](https://arxiv.org/abs/1605.07146)
 - [**XCeption**](https://arxiv.org/pdf/1610.02357.pdf)
+- All the newest classification models (700+) from [rwightman's repo](https://github.com/rwightman/pytorch-image-models) (ECA-NFNet, GERNet, RegNet, SKResnext, SWIN-Transformer, VIT etc.)
 
 ## Image Segmentation Models
 - **BiSeNet** ([Bilateral Segmentation Network for Real-time Semantic Segmentation](https://arxiv.org/abs/1808.00897))
@@ -254,6 +206,7 @@ trainer.fit_loader(loader, val_loader=val_loader, num_epoch=100)
     and OptDenseNet respectively ([Fully convolutional networks for semantic segmentation](http://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Long_Fully_Convolutional_Networks_2015_CVPR_paper.pdf))
 - **FRRN** ([Full Resolution Residual Networks for Semantic Segmentation in Street Scenes](https://arxiv.org/abs/1611.08323))
 - **FusionNet** ([FusionNet in Tensorflow by Hyungjoo Andrew Cho](https://github.com/NySunShine/fusion-net))
+- **GALDNet** 
 - **GCN** ([Large Kernel Matters](https://arxiv.org/pdf/1703.02719))
 - **LinkNet** ([Link-Net](https://codeac29.github.io/projects/linknet/))
 - **OCNet** ([Object Context Network for Scene Parsing](https://arxiv.org/abs/1809.00916))
diff --git a/docs/source/README.md b/docs/source/README.md
index b7593f2..c725e2b 100644
--- a/docs/source/README.md
+++ b/docs/source/README.md
@@ -23,28 +23,32 @@ going to be overkill. However, if you want to get lost in the world of neural ne
 for months on end then this is probably the right place for you :)
 
 Among other things Pywick includes:
-- State of the art normalization, activation, loss functions and
-  optimizers not included in the standard Pytorch library.
-- A high-level module for training with callbacks, constraints, metrics,
-  conditions and regularizers.
-- Dozens of popular object classification and semantic segmentation models.
+- State of the art normalization, activation, loss functions and optimizers not included in the standard Pytorch library (AdaBelief, Addsign, Apollo, Eve, Lookahead, Radam, Ralamb, RangerLARS etc.).
+- A high-level module for training with callbacks, constraints, metrics, conditions and regularizers.
+- Hundreds of popular object classification and semantic segmentation models!
 - Comprehensive data loading, augmentation, transforms, and sampling capability.
 - Utility tensor functions.
 - Useful meters.
 - Basic GridSearch (exhaustive and random).
 
 ## Docs
-Hey, [check this out](https://pywick.readthedocs.io/en/latest/), we now
-have [docs](https://pywick.readthedocs.io/en/latest/)! They're still a
-work in progress though so apologies for anything that's broken.
+Hey, [check this out](https://pywick.readthedocs.io/en/latest/), we now have [docs](https://pywick.readthedocs.io/en/latest/)! They're still a work in progress though so apologies for anything that's broken.
 
 ## What's New (highlights)
+
+### v0.6.0 - We thought ya might like YAML!
+So you're saying you like **configuration files**? You're saying you like **examples** too? Well, we've got you covered! Huge release today with a configuration-based training example! All you have to do is:
+  - Get your favorite dataset (or download [17 flowers](https://www.robots.ox.ac.uk/~vgg/data/flowers/17/) to get started and use `pywick/examples/17flowers_split.py` to convert it)
+  - Adjust the `configs/train_classifier.yaml` file to fit your workspace
+  - Then simply run: `python3 train_classifier.py configs/train_classifier.yaml` and watch it train!
+
+### Older Notes
 - **May 6, 2021**
-  - Many SoTA classification and segmentation models added: Swin-Transformer variants, NFNets variants (L0, L1), Halo nets, Lambda nets, ECA variants + others
+  - Many SoTA classification and segmentation models added: Swin-Transformer variants, NFNet variants (L0, L1), Halo nets, Lambda nets, ECA variants, Rexnet + others
   - Many new loss functions added: RecallLoss, SoftInvDiceLoss, OhemBCEDicePenalizeBorderLoss, RMIBCEDicePenalizeBorderLoss + others
   - Bug fixes
 - **Jun. 15, 2020**
-  - 200+ models added from [rwightman's](https://github.com/rwightman/pytorch-image-models) repo via `torch.hub`! See docs for all the variants!
+  - 700+ models added from [rwightman's](https://github.com/rwightman/pytorch-image-models) repo via `torch.hub`! See docs for all the variants!
   - Some minor bug fixes
 - **Jan. 20, 2020**
   - New release: 0.5.6 (minor fix from 0.5.5 for pypi)
@@ -63,74 +67,23 @@ work in progress though so apologies for anything that's broken.
     - General bug fixes and code improvements 
 
 ## Install
-Pywick requires **pytorch >= 1.0**
+Pywick requires **pytorch >= 1.4**
 
 `pip install pywick`
 
 or specific version from git:
 
-`pip install git+https://github.com/achaiah/pywick.git@v0.5.6`
+`pip install git+https://github.com/achaiah/pywick.git@v0.6.0`
 
 ## ModuleTrainer
-The `ModuleTrainer` class provides a high-level training interface which abstracts
-away the training loop while providing callbacks, constraints, initializers, regularizers,
+The `ModuleTrainer` class provides a high-level training interface which abstracts away the training loop while providing callbacks, constraints, initializers, regularizers,
 and more.
 
-Example:
-```python
-from pywick.modules import ModuleTrainer
-from pywick.initializers import XavierUniform
-from pywick.metrics import CategoricalAccuracySingleInput
-import torch.nn as nn
-import torch.functional as F
-
-# Define your model EXACTLY as normal
-class Network(nn.Module):
-    def __init__(self):
-        super(Network, self).__init__()
-        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
-        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
-        self.fc1 = nn.Linear(1600, 128)
-        self.fc2 = nn.Linear(128, 10)
-
-    def forward(self, x):
-        x = F.relu(F.max_pool2d(self.conv1(x), 2))
-        x = F.relu(F.max_pool2d(self.conv2(x), 2))
-        x = x.view(-1, 1600)
-        x = F.relu(self.fc1(x))
-        x = F.dropout(x, training=self.training)
-        x = self.fc2(x)
-        return F.log_softmax(x)
-
-model = Network()
-trainer = ModuleTrainer(model)   # optionally supply cuda_devices as a parameter
-
-initializers = [XavierUniform(bias=False, module_filter='fc*')]
-
-# initialize metrics with top1 and top5 
-metrics = [CategoricalAccuracySingleInput(top_k=1), CategoricalAccuracySingleInput(top_k=5)]
-
-trainer.compile(loss='cross_entropy',
-                # callbacks=callbacks,          # define your callbacks here (e.g. model saver, LR scheduler)
-                # regularizers=regularizers,    # define regularizers
-                # constraints=constraints,      # define constraints
-                optimizer='sgd',
-                initializers=initializers,
-                metrics=metrics)
-
-trainer.fit_loader(train_dataset_loader, 
-            val_loader=val_dataset_loader,
-            num_epoch=20,
-            verbose=1)
-```
-You also have access to the standard evaluation and prediction functions:
+See the `train_classifier.py` example for a pretty complete configuration example. To get up and running with your own data quickly, simply edit the `configs/train_classifier.yaml` file with your desired parameters and dataset location(s).
 
-```python
-loss = trainer.evaluate(x_train, y_train)
-y_pred = trainer.predict(x_train)
-```
-PyWick provides a wide range of <b>callbacks</b>, generally mimicking the interface
-found in `Keras`:
+Note: <i>The dataset needs to be organized for classification, where each directory name is the name of a class and contains all images pertaining to that class.</i>
+
+PyWick provides a wide range of <b>callbacks</b>, generally mimicking the interface found in `Keras`:
 
 - `CSVLogger` - Logs epoch-level metrics to a CSV file
 - [`CyclicLRScheduler`](https://github.com/bckenstler/CLR) - Cycles through min-max learning rate
@@ -141,9 +94,7 @@ found in `Keras`:
 - `ModelCheckpoint` - Comprehensive model saver
 - `ReduceLROnPlateau` - Reduces learning rate (LR) when a plateau has been reached
 - `SimpleModelCheckpoint` - Simple model saver
-- Additionally, a `TensorboardLogger` is incredibly easy to implement
-  via the [TensorboardX](https://github.com/lanpa/tensorboardX) (now
-  part of pytorch 1.1 release!)
+- Additionally, a `TensorboardLogger` is incredibly easy to implement via tensorboardX (now part of the pytorch 1.1 release!)
 
 
 ```python
@@ -168,7 +119,7 @@ and <b>constraints</b>:
 Both regularizers and constraints can be selectively applied on layers using regular expressions and the `module_filter`
 argument. Constraints can be explicit (hard) constraints applied at an arbitrary batch or
 epoch frequency, or they can be implicit (soft) constraints similar to regularizers
-where the the constraint deviation is added as a penalty to the total model loss.
+where the constraint deviation is added as a penalty to the total model loss.
 
 ```python
 from pywick.constraints import MaxNorm, NonNeg
@@ -239,6 +190,7 @@ trainer.fit_loader(loader, val_loader=val_loader, num_epoch=100)
 - [**TResNet: High Performance GPU-Dedicated Architecture**](https://arxiv.org/abs/2003.13630)
 - [**Wide Resnet**](https://arxiv.org/abs/1605.07146)
 - [**XCeption**](https://arxiv.org/pdf/1610.02357.pdf)
+- All the newest classification models (700+) from [rwightman's repo](https://github.com/rwightman/pytorch-image-models) (ECA-NFNet, GERNet, RegNet, SKResnext, SWIN-Transformer, VIT etc.)
 
 ## Image Segmentation Models
 - **BiSeNet** ([Bilateral Segmentation Network for Real-time Semantic Segmentation](https://arxiv.org/abs/1808.00897))
@@ -254,6 +206,7 @@ trainer.fit_loader(loader, val_loader=val_loader, num_epoch=100)
     and OptDenseNet respectively ([Fully convolutional networks for semantic segmentation](http://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Long_Fully_Convolutional_Networks_2015_CVPR_paper.pdf))
 - **FRRN** ([Full Resolution Residual Networks for Semantic Segmentation in Street Scenes](https://arxiv.org/abs/1611.08323))
 - **FusionNet** ([FusionNet in Tensorflow by Hyungjoo Andrew Cho](https://github.com/NySunShine/fusion-net))
+- **GALDNet** 
 - **GCN** ([Large Kernel Matters](https://arxiv.org/pdf/1703.02719))
 - **LinkNet** ([Link-Net](https://codeac29.github.io/projects/linknet/))
 - **OCNet** ([Object Context Network for Scene Parsing](https://arxiv.org/abs/1809.00916))
diff --git a/docs/source/api/pywick.callbacks.rst b/docs/source/api/pywick.callbacks.rst
index 59fe3cd..16db055 100644
--- a/docs/source/api/pywick.callbacks.rst
+++ b/docs/source/api/pywick.callbacks.rst
@@ -78,6 +78,14 @@ ModelCheckpoint
     :undoc-members:
     :show-inheritance:
 
+OneCycleLRScheduler
+-----------------------------------
+
+.. automodule:: pywick.callbacks.OneCycleLRScheduler
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 ReduceLROnPlateau
 -----------------------------------------
 
@@ -94,3 +102,11 @@ SimpleModelCheckpoint
     :undoc-members:
     :show-inheritance:
 
+TQDM
+---------------------------------------------
+
+.. automodule:: pywick.callbacks.TQDM
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
diff --git a/docs/source/api/pywick.functions.rst b/docs/source/api/pywick.functions.rst
index 0097ee7..c91d6d1 100644
--- a/docs/source/api/pywick.functions.rst
+++ b/docs/source/api/pywick.functions.rst
@@ -14,7 +14,15 @@ CyclicLR
     :undoc-members:
     :show-inheritance:
 
-Aria + Swish
+Mish
+--------------------------------
+
+.. automodule:: pywick.functions.mish
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+Swish + Aria
 -----------------------------
 
 .. automodule:: pywick.functions.swish
diff --git a/docs/source/api/pywick.models.rwightman.rst b/docs/source/api/pywick.models.rwightman.rst
index c7ad6d2..74ea6ca 100644
--- a/docs/source/api/pywick.models.rwightman.rst
+++ b/docs/source/api/pywick.models.rwightman.rst
@@ -1,16 +1,180 @@
 rwightman Models
 ====================================
 
-`rwightman <https://github.com/rwightman/pytorch-image-models>`_ maintains an awesome (and growing!) collection of models that is published via torch.hub -see the list below!
+`rwightman <https://github.com/rwightman/pytorch-image-models>`_ maintains an awesome (700+ and growing!) collection of models that is published via torch.hub - see the list below!
 Typically, you'll want to load these pre-trained, in order to use them with your own dataset (and your own number of classes). In that
-case you should use Pywick's ``models.model_utils.get_model(...)`` utility function which will do all the dirty work for you and give you a pretrained model but with your custom
-number of classes!
+case you should use Pywick's ``models.model_utils.get_model(...)`` utility function which will do all the dirty work for you and give you a pretrained model but with your custom number of classes!
 
+* DPN
+* dpn68
+* dpn68b
+* dpn98
+* dpn131
+* dpn107
+* BNInception
+* bninception
+* FBResNet
+* FBResNet18
+* FBResNet34
+* FBResNet50
+* FBResNet101
+* fbresnet152
+* InceptionResV2
+* InceptionResNetV2
+* inceptionresnetv2
+* InceptionV4
+* inceptionv4
+* nasnetalarge
+* NASNetALarge
+* nasnetamobile
+* NASNetAMobile
+* pnasnet5large
+* PNASNet5Large
+* PolyNet
+* polynet
+* PyResNet18
+* PyResNet34
+* PyResNet
+* PreactResnet110
+* PreactResnet164_bottleneck
+* ResNet_swish
+* ResNet18_swish
+* ResNet34_swish
+* ResNet50_swish
+* ResNet101_swish
+* ResNet152_swish
+* ResNeXt50_32x4d
+* resnext50_32x4d
+* ResNeXt101_32x4d
+* resnext101_32x4d
+* ResNeXt101_64x4d
+* resnext101_64x4d
+* SENet
+* senet154
+* se_resnet50
+* se_resnet101
+* se_resnet152
+* se_resnext50_32x4d
+* se_resnext101_32x4d
+* WideResNet
+* wideresnet50
+* Xception
+* xception
+* se_densenet121
+* se_densenet161
+* se_densenet169
+* se_densenet201
+* AlexNet
+* ResNet
+* resnet18
+* resnet34
+* resnet50
+* resnet101
+* resnet152
+* resnext50_32x4d
+* resnext101_32x8d
+* wide_resnet50_2
+* wide_resnet101_2
+* VGG
+* vgg11
+* vgg11_bn
+* vgg13
+* vgg13_bn
+* vgg16
+* vgg16_bn
+* vgg19_bn
+* vgg19
+* SqueezeNet
+* squeezenet1_0
+* squeezenet1_1
+* Inception3
+* inception_v3
+* InceptionOutputs
+* _InceptionOutputs
+* DenseNet
+* densenet121
+* densenet169
+* densenet201
+* densenet161
+* GoogLeNet
+* GoogLeNetOutputs
+* _GoogLeNetOutputs
+* MobileNetV2
+* mobilenet_v2
+* MNASNet
+* mnasnet0_5
+* mnasnet0_75
+* mnasnet1_0
+* mnasnet1_3
+* ShuffleNetV2
+* shufflenet_v2_x0_5
+* shufflenet_v2_x1_0
+* shufflenet_v2_x1_5
+* shufflenet_v2_x2_0
 * adv_inception_v3
+* bat_resnext26ts
+* beit_base_patch16_224
+* beit_base_patch16_224_in22k
+* beit_base_patch16_384
+* beit_large_patch16_224
+* beit_large_patch16_224_in22k
+* beit_large_patch16_384
+* beit_large_patch16_512
+* botnet26t_256
+* botnet50ts_256
+* cait_m36_384
+* cait_m48_448
+* cait_s24_224
+* cait_s24_384
+* cait_s36_384
+* cait_xs24_384
+* cait_xxs24_224
+* cait_xxs24_384
+* cait_xxs36_224
+* cait_xxs36_384
+* coat_lite_mini
+* coat_lite_small
+* coat_lite_tiny
+* coat_mini
+* coat_tiny
+* convit_base
+* convit_small
+* convit_tiny
+* crossvit_15_240
+* crossvit_15_dagger_240
+* crossvit_15_dagger_408
+* crossvit_18_240
+* crossvit_18_dagger_240
+* crossvit_18_dagger_408
+* crossvit_9_240
+* crossvit_9_dagger_240
+* crossvit_base_240
+* crossvit_small_240
+* crossvit_tiny_240
+* cspdarknet53
+* cspdarknet53_iabn
+* cspresnet50
+* cspresnet50d
+* cspresnet50w
+* cspresnext50
+* cspresnext50_iabn
+* darknet53
+* deit_base_distilled_patch16_224
+* deit_base_distilled_patch16_384
+* deit_base_patch16_224
+* deit_base_patch16_384
+* deit_small_distilled_patch16_224
+* deit_small_patch16_224
+* deit_tiny_distilled_patch16_224
+* deit_tiny_patch16_224
 * densenet121
+* densenet121d
 * densenet161
 * densenet169
 * densenet201
+* densenet264
+* densenet264d_iabn
+* densenetblur121d
 * dla102
 * dla102x
 * dla102x2
@@ -23,20 +187,39 @@ number of classes!
 * dla60_res2next
 * dla60x
 * dla60x_c
+* dm_nfnet_f0
+* dm_nfnet_f1
+* dm_nfnet_f2
+* dm_nfnet_f3
+* dm_nfnet_f4
+* dm_nfnet_f5
+* dm_nfnet_f6
 * dpn107
 * dpn131
 * dpn68
 * dpn68b
 * dpn92
 * dpn98
+* eca_botnext26ts_256
+* eca_halonext26ts
+* eca_nfnet_l0
+* eca_nfnet_l1
+* eca_nfnet_l2
+* eca_nfnet_l3
+* eca_resnet33ts
+* eca_resnext26ts
+* eca_vovnet39b
 * ecaresnet101d
 * ecaresnet101d_pruned
-* ecaresnet18
-* ecaresnet50
+* ecaresnet200d
+* ecaresnet269d
+* ecaresnet26t
 * ecaresnet50d
 * ecaresnet50d_pruned
+* ecaresnet50t
 * ecaresnetlight
-* ecaresnext26tn_32x4d
+* ecaresnext26t_32x4d
+* ecaresnext50t_32x4d
 * efficientnet_b0
 * efficientnet_b1
 * efficientnet_b1_pruned
@@ -55,33 +238,61 @@ number of classes!
 * efficientnet_cc_b0_8e
 * efficientnet_cc_b1_8e
 * efficientnet_el
+* efficientnet_el_pruned
 * efficientnet_em
 * efficientnet_es
+* efficientnet_es_pruned
 * efficientnet_l2
 * efficientnet_lite0
 * efficientnet_lite1
 * efficientnet_lite2
 * efficientnet_lite3
 * efficientnet_lite4
+* efficientnetv2_l
+* efficientnetv2_m
+* efficientnetv2_rw_m
+* efficientnetv2_rw_s
+* efficientnetv2_rw_t
+* efficientnetv2_s
+* efficientnetv2_xl
 * ens_adv_inception_resnet_v2
+* ese_vovnet19b_dw
+* ese_vovnet19b_slim
+* ese_vovnet19b_slim_dw
+* ese_vovnet39b
+* ese_vovnet39b_evos
+* ese_vovnet57b
+* ese_vovnet99b
+* ese_vovnet99b_iabn
 * fbnetc_100
+* fbnetv3_b
+* fbnetv3_d
+* fbnetv3_g
+* gc_efficientnetv2_rw_t
+* gcresnet33ts
+* gcresnet50t
+* gcresnext26ts
+* gcresnext50ts
+* gernet_l
+* gernet_m
+* gernet_s
+* ghostnet_050
+* ghostnet_100
+* ghostnet_130
 * gluon_inception_v3
 * gluon_resnet101_v1b
 * gluon_resnet101_v1c
 * gluon_resnet101_v1d
-* gluon_resnet101_v1e
 * gluon_resnet101_v1s
 * gluon_resnet152_v1b
 * gluon_resnet152_v1c
 * gluon_resnet152_v1d
-* gluon_resnet152_v1e
 * gluon_resnet152_v1s
 * gluon_resnet18_v1b
 * gluon_resnet34_v1b
 * gluon_resnet50_v1b
 * gluon_resnet50_v1c
 * gluon_resnet50_v1d
-* gluon_resnet50_v1e
 * gluon_resnet50_v1s
 * gluon_resnext101_32x4d
 * gluon_resnext101_64x4d
@@ -91,7 +302,20 @@ number of classes!
 * gluon_seresnext101_64x4d
 * gluon_seresnext50_32x4d
 * gluon_xception65
-* gluon_xception71
+* gmixer_12_224
+* gmixer_24_224
+* gmlp_b16_224
+* gmlp_s16_224
+* gmlp_ti16_224
+* halonet26t
+* halonet50ts
+* halonet_h1
+* hardcorenas_a
+* hardcorenas_b
+* hardcorenas_c
+* hardcorenas_d
+* hardcorenas_e
+* hardcorenas_f
 * hrnet_w18
 * hrnet_w18_small
 * hrnet_w18_small_v2
@@ -108,6 +332,34 @@ number of classes!
 * inception_resnet_v2
 * inception_v3
 * inception_v4
+* jx_nest_base
+* jx_nest_small
+* jx_nest_tiny
+* lambda_resnet26t
+* legacy_senet154
+* legacy_seresnet101
+* legacy_seresnet152
+* legacy_seresnet18
+* legacy_seresnet34
+* legacy_seresnet50
+* legacy_seresnext101_32x4d
+* legacy_seresnext26_32x4d
+* legacy_seresnext50_32x4d
+* levit_128
+* levit_128s
+* levit_192
+* levit_256
+* levit_384
+* mixer_b16_224
+* mixer_b16_224_in21k
+* mixer_b16_224_miil
+* mixer_b16_224_miil_in21k
+* mixer_b32_224
+* mixer_l16_224
+* mixer_l16_224_in21k
+* mixer_l32_224
+* mixer_s16_224
+* mixer_s32_224
 * mixnet_l
 * mixnet_m
 * mixnet_s
@@ -126,10 +378,55 @@ number of classes!
 * mobilenetv2_140
 * mobilenetv3_large_075
 * mobilenetv3_large_100
+* mobilenetv3_large_100_miil
+* mobilenetv3_large_100_miil_in21k
 * mobilenetv3_rw
 * mobilenetv3_small_075
 * mobilenetv3_small_100
 * nasnetalarge
+* nest_base
+* nest_small
+* nest_tiny
+* nf_ecaresnet101
+* nf_ecaresnet26
+* nf_ecaresnet50
+* nf_regnet_b0
+* nf_regnet_b1
+* nf_regnet_b2
+* nf_regnet_b3
+* nf_regnet_b4
+* nf_regnet_b5
+* nf_resnet101
+* nf_resnet26
+* nf_resnet50
+* nf_seresnet101
+* nf_seresnet26
+* nf_seresnet50
+* nfnet_f0
+* nfnet_f0s
+* nfnet_f1
+* nfnet_f1s
+* nfnet_f2
+* nfnet_f2s
+* nfnet_f3
+* nfnet_f3s
+* nfnet_f4
+* nfnet_f4s
+* nfnet_f5
+* nfnet_f5s
+* nfnet_f6
+* nfnet_f6s
+* nfnet_f7
+* nfnet_f7s
+* nfnet_l0
+* pit_b_224
+* pit_b_distilled_224
+* pit_s_224
+* pit_s_distilled_224
+* pit_ti_224
+* pit_ti_distilled_224
+* pit_xs_224
+* pit_xs_distilled_224
 * pnasnet5large
 * regnetx_002
 * regnetx_004
@@ -155,6 +452,14 @@ number of classes!
 * regnety_120
 * regnety_160
 * regnety_320
+* repvgg_a2
+* repvgg_b0
+* repvgg_b1
+* repvgg_b1g4
+* repvgg_b2
+* repvgg_b2g4
+* repvgg_b3
+* repvgg_b3g4
 * res2net101_26w_4s
 * res2net50_14w_8s
 * res2net50_26w_4s
@@ -162,6 +467,15 @@ number of classes!
 * res2net50_26w_8s
 * res2net50_48w_2s
 * res2next50
+* resmlp_12_224
+* resmlp_12_distilled_224
+* resmlp_24_224
+* resmlp_24_distilled_224
+* resmlp_36_224
+* resmlp_36_distilled_224
+* resmlp_big_24_224
+* resmlp_big_24_224_in22ft1k
+* resmlp_big_24_distilled_224
 * resnest101e
 * resnest14d
 * resnest200e
@@ -171,20 +485,71 @@ number of classes!
 * resnest50d_1s4x24d
 * resnest50d_4s2x40d
 * resnet101
+* resnet101d
 * resnet152
+* resnet152d
 * resnet18
+* resnet18d
+* resnet200
+* resnet200d
 * resnet26
 * resnet26d
+* resnet26t
+* resnet32ts
+* resnet33ts
 * resnet34
+* resnet34d
 * resnet50
 * resnet50d
+* resnet50t
+* resnet51q
+* resnet61q
 * resnetblur18
 * resnetblur50
+* resnetrs101
+* resnetrs152
+* resnetrs200
+* resnetrs270
+* resnetrs350
+* resnetrs420
+* resnetrs50
+* resnetv2_101
+* resnetv2_101d
+* resnetv2_101x1_bitm
+* resnetv2_101x1_bitm_in21k
+* resnetv2_101x3_bitm
+* resnetv2_101x3_bitm_in21k
+* resnetv2_152
+* resnetv2_152d
+* resnetv2_152x2_bit_teacher
+* resnetv2_152x2_bit_teacher_384
+* resnetv2_152x2_bitm
+* resnetv2_152x2_bitm_in21k
+* resnetv2_152x4_bitm
+* resnetv2_152x4_bitm_in21k
+* resnetv2_50
+* resnetv2_50d
+* resnetv2_50t
+* resnetv2_50x1_bit_distilled
+* resnetv2_50x1_bitm
+* resnetv2_50x1_bitm_in21k
+* resnetv2_50x3_bitm
+* resnetv2_50x3_bitm_in21k
 * resnext101_32x4d
 * resnext101_32x8d
 * resnext101_64x4d
+* resnext26ts
 * resnext50_32x4d
 * resnext50d_32x4d
+* rexnet_100
+* rexnet_130
+* rexnet_150
+* rexnet_200
+* rexnetr_100
+* rexnetr_130
+* rexnetr_150
+* rexnetr_200
+* sehalonet33ts
 * selecsls42
 * selecsls42b
 * selecsls60
@@ -197,14 +562,20 @@ number of classes!
 * senet154
 * seresnet101
 * seresnet152
+* seresnet152d
 * seresnet18
+* seresnet200d
+* seresnet269d
+* seresnet33ts
 * seresnet34
 * seresnet50
+* seresnet50t
 * seresnext101_32x4d
-* seresnext26_32x4d
+* seresnext101_32x8d
 * seresnext26d_32x4d
 * seresnext26t_32x4d
 * seresnext26tn_32x4d
+* seresnext26ts
 * seresnext50_32x4d
 * skresnet18
 * skresnet34
@@ -218,6 +589,16 @@ number of classes!
 * ssl_resnext101_32x4d
 * ssl_resnext101_32x8d
 * ssl_resnext50_32x4d
+* swin_base_patch4_window12_384
+* swin_base_patch4_window12_384_in22k
+* swin_base_patch4_window7_224
+* swin_base_patch4_window7_224_in22k
+* swin_large_patch4_window12_384
+* swin_large_patch4_window12_384_in22k
+* swin_large_patch4_window7_224
+* swin_large_patch4_window7_224_in22k
+* swin_small_patch4_window7_224
+* swin_tiny_patch4_window7_224
 * swsl_resnet18
 * swsl_resnet50
 * swsl_resnext101_32x16d
@@ -263,6 +644,21 @@ number of classes!
 * tf_efficientnet_lite2
 * tf_efficientnet_lite3
 * tf_efficientnet_lite4
+* tf_efficientnetv2_b0
+* tf_efficientnetv2_b1
+* tf_efficientnetv2_b2
+* tf_efficientnetv2_b3
+* tf_efficientnetv2_l
+* tf_efficientnetv2_l_in21ft1k
+* tf_efficientnetv2_l_in21k
+* tf_efficientnetv2_m
+* tf_efficientnetv2_m_in21ft1k
+* tf_efficientnetv2_m_in21k
+* tf_efficientnetv2_s
+* tf_efficientnetv2_s_in21ft1k
+* tf_efficientnetv2_s_in21k
+* tf_efficientnetv2_xl_in21ft1k
+* tf_efficientnetv2_xl_in21k
 * tf_inception_v3
 * tf_mixnet_l
 * tf_mixnet_m
@@ -273,15 +669,129 @@ number of classes!
 * tf_mobilenetv3_small_075
 * tf_mobilenetv3_small_100
 * tf_mobilenetv3_small_minimal_100
+* tnt_b_patch16_224
+* tnt_s_patch16_224
 * tresnet_l
 * tresnet_l_448
 * tresnet_m
 * tresnet_m_448
+* tresnet_m_miil_in21k
 * tresnet_xl
 * tresnet_xl_448
+* tv_densenet121
+* tv_resnet101
+* tv_resnet152
 * tv_resnet34
 * tv_resnet50
 * tv_resnext50_32x4d
+* twins_pcpvt_base
+* twins_pcpvt_large
+* twins_pcpvt_small
+* twins_svt_base
+* twins_svt_large
+* twins_svt_small
+* vgg11
+* vgg11_bn
+* vgg13
+* vgg13_bn
+* vgg16
+* vgg16_bn
+* vgg19
+* vgg19_bn
+* visformer_small
+* visformer_tiny
+* vit_base_patch16_224
+* vit_base_patch16_224_in21k
+* vit_base_patch16_224_miil
+* vit_base_patch16_224_miil_in21k
+* vit_base_patch16_384
+* vit_base_patch16_sam_224
+* vit_base_patch32_224
+* vit_base_patch32_224_in21k
+* vit_base_patch32_384
+* vit_base_patch32_sam_224
+* vit_base_r26_s32_224
+* vit_base_r50_s16_224
+* vit_base_r50_s16_224_in21k
+* vit_base_r50_s16_384
+* vit_base_resnet26d_224
+* vit_base_resnet50_224_in21k
+* vit_base_resnet50_384
+* vit_base_resnet50d_224
+* vit_huge_patch14_224_in21k
+* vit_large_patch16_224
+* vit_large_patch16_224_in21k
+* vit_large_patch16_384
+* vit_large_patch32_224
+* vit_large_patch32_224_in21k
+* vit_large_patch32_384
+* vit_large_r50_s32_224
+* vit_large_r50_s32_224_in21k
+* vit_large_r50_s32_384
+* vit_small_patch16_224
+* vit_small_patch16_224_in21k
+* vit_small_patch16_384
+* vit_small_patch32_224
+* vit_small_patch32_224_in21k
+* vit_small_patch32_384
+* vit_small_r26_s32_224
+* vit_small_r26_s32_224_in21k
+* vit_small_r26_s32_384
+* vit_small_resnet26d_224
+* vit_small_resnet50d_s16_224
+* vit_tiny_patch16_224
+* vit_tiny_patch16_224_in21k
+* vit_tiny_patch16_384
+* vit_tiny_r_s16_p8_224
+* vit_tiny_r_s16_p8_224_in21k
+* vit_tiny_r_s16_p8_384
+* vovnet39a
+* vovnet57a
 * wide_resnet101_2
 * wide_resnet50_2
 * xception
+* xception41
+* xception65
+* xception71
+* xcit_large_24_p16_224
+* xcit_large_24_p16_224_dist
+* xcit_large_24_p16_384_dist
+* xcit_large_24_p8_224
+* xcit_large_24_p8_224_dist
+* xcit_large_24_p8_384_dist
+* xcit_medium_24_p16_224
+* xcit_medium_24_p16_224_dist
+* xcit_medium_24_p16_384_dist
+* xcit_medium_24_p8_224
+* xcit_medium_24_p8_224_dist
+* xcit_medium_24_p8_384_dist
+* xcit_nano_12_p16_224
+* xcit_nano_12_p16_224_dist
+* xcit_nano_12_p16_384_dist
+* xcit_nano_12_p8_224
+* xcit_nano_12_p8_224_dist
+* xcit_nano_12_p8_384_dist
+* xcit_small_12_p16_224
+* xcit_small_12_p16_224_dist
+* xcit_small_12_p16_384_dist
+* xcit_small_12_p8_224
+* xcit_small_12_p8_224_dist
+* xcit_small_12_p8_384_dist
+* xcit_small_24_p16_224
+* xcit_small_24_p16_224_dist
+* xcit_small_24_p16_384_dist
+* xcit_small_24_p8_224
+* xcit_small_24_p8_224_dist
+* xcit_small_24_p8_384_dist
+* xcit_tiny_12_p16_224
+* xcit_tiny_12_p16_224_dist
+* xcit_tiny_12_p16_384_dist
+* xcit_tiny_12_p8_224
+* xcit_tiny_12_p8_224_dist
+* xcit_tiny_12_p8_384_dist
+* xcit_tiny_24_p16_224
+* xcit_tiny_24_p16_224_dist
+* xcit_tiny_24_p16_384_dist
+* xcit_tiny_24_p8_224
+* xcit_tiny_24_p8_224_dist
+* xcit_tiny_24_p8_384_dist
\ No newline at end of file
diff --git a/docs/source/api/pywick.models.segmentation.rst b/docs/source/api/pywick.models.segmentation.rst
index 7fa7543..7b7ecc0 100644
--- a/docs/source/api/pywick.models.segmentation.rst
+++ b/docs/source/api/pywick.models.segmentation.rst
@@ -22,6 +22,14 @@ DANet
     :undoc-members:
     :exclude-members: forward
 
+EmaNet
+--------------------------------------------------
+
+.. automodule:: pywick.models.segmentation.emanet
+    :members:
+    :undoc-members:
+    :exclude-members: forward
+
 DenseASPP
 --------------------------------------------------
 
@@ -125,6 +133,14 @@ FusionNet
     :undoc-members:
     :exclude-members: features, forward, logits
 
+GALDNet
+-------------------------------------
+
+.. automodule:: pywick.models.segmentation.galdnet
+    :members:
+    :undoc-members:
+    :exclude-members: forward
+
 GCN
 -------------------------------------
 
diff --git a/docs/source/api/pywick.optimizers.rst b/docs/source/api/pywick.optimizers.rst
index 2daa161..3122606 100644
--- a/docs/source/api/pywick.optimizers.rst
+++ b/docs/source/api/pywick.optimizers.rst
@@ -5,6 +5,34 @@ Optimizers
     :members:
     :undoc-members:
 
+A2Grad
+--------------------------------
+
+.. automodule:: pywick.optimizers.a2grad
+    :members: A2GradUni, A2GradInc, A2GradExp
+    :undoc-members:
+
+AdaBelief
+--------------------------------
+
+.. automodule:: pywick.optimizers.adabelief
+    :members: AdaBelief
+    :undoc-members:
+
+AdaHessian
+--------------------------------
+
+.. automodule:: pywick.optimizers.adahessian
+    :members: Adahessian
+    :undoc-members:
+
+AdamP
+--------------------------------
+
+.. automodule:: pywick.optimizers.adamp
+    :members: AdamP
+    :undoc-members:
+
 AdamW
 --------------------------------
 
@@ -19,6 +47,20 @@ AddSign
     :members: AddSign
     :undoc-members:
 
+Apollo
+--------------------------------
+
+.. automodule:: pywick.optimizers.apollo
+    :members: Apollo
+    :undoc-members:
+
+Lars
+----------------------------
+
+.. automodule:: pywick.optimizers.lars
+    :members: Lars
+    :undoc-members:
+
 Eve
 ----------------------------
 
@@ -33,6 +75,20 @@ Lookahead
     :members: Lookahead
     :undoc-members:
 
+LookaheadSGD
+------------------------------
+
+.. automodule:: pywick.optimizers.lookaheadsgd
+    :members:
+    :undoc-members:
+
+MADGrad
+------------------------------
+
+.. automodule:: pywick.optimizers.madgrad
+    :members: MADGRAD
+    :undoc-members:
+
 Nadam
 ------------------------------
 
@@ -47,6 +103,13 @@ PowerSign
     :members: PowerSign
     :undoc-members:
 
+QHAdam
+------------------------------
+
+.. automodule:: pywick.optimizers.qhadam
+    :members: QHAdam
+    :undoc-members:
+
 RAdam
 ------------------------------
 
diff --git a/docs/source/classification_guide.md b/docs/source/classification_guide.md
index 6985a51..3ae5c60 100644
--- a/docs/source/classification_guide.md
+++ b/docs/source/classification_guide.md
@@ -1,6 +1,21 @@
 ## Classification
 
-In a short while we will publish a walk-through that will go into detail
-on how to do classification with Pywick. In the meantime, if you feel
-adventurous feel free to look at our
-[README](https://github.com/achaiah/pywick/blob/master/README.md).
+With Pywick it is incredibly easy to perform classification training on your dataset. In a typical scenario you will not need to write any code but rather provide a configuration yaml file. See [configs/train_classifier.yaml](https://github.com/achaiah/pywick/blob/master/pywick/configs/train_classifier.yaml) for configuration options. Most of them are well-documented inside the configuration file.
+
+Your dataset should be arranged such that each directory under your root dir is named after the corresponding class of images that it contains (e.g. 17flowers/coltsfoot, 17flowers/daisy etc). You can include multiple `dataroots` directories as a list. As an easy starting point, download the [17 flowers](https://www.robots.ox.ac.uk/~vgg/data/flowers/17/) dataset and run [examples/17flowers_split.py](https://github.com/achaiah/pywick/blob/master/examples/17flowers_split.py) to convert it into the appropriate directory structure.
+
+Some options you may want to tweak:
+- `dataroots` - where to find the training data
+- `model_spec` - model to use
+- `num_epochs` - number of epochs to train for
+- `output_root` - where to save outputs (e.g. trained NNs)
+- `use_gpu` - whether to use the GPU(s) for training
+
+Once you are happy with your configuration, simply invoke the pywick training code:
+```bash
+# change to pywick
+cd pywick/pywick
+python3 train_classifier.py configs/train_classifier.yaml
+```
+
+To see how the training code is structured under the hood and to customize it to your liking, see [train_classifier.py](https://github.com/achaiah/pywick/blob/master/pywick/train_classifier.py).
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 33ab00f..0e9de1c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,7 +20,7 @@
 # -- Project information -----------------------------------------------------
 
 project = u'pywick'
-copyright = u'2019, Achaiah'
+copyright = u'2021, Achaiah'
 author = u'Achaiah'
 
 '''
@@ -40,7 +40,7 @@
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = '0.5.3'
+release = '0.6.5'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/requirements.txt b/docs/source/requirements.txt
index 609f4b6..a74f64c 100644
--- a/docs/source/requirements.txt
+++ b/docs/source/requirements.txt
@@ -1,4 +1,20 @@
-hickle
+albumentations
+dill
+#hickle
+h5py
+# inplace_abn
+numpy
+opencv-python-headless
+pandas
 pillow
+prodict
+pycm
+pyyaml
+scipy
+requests
+scikit-image
 six
-tqdm
\ No newline at end of file
+tabulate
+tini
+tqdm
+yacs
\ No newline at end of file
diff --git a/docs/source/segmentation_guide.md b/docs/source/segmentation_guide.md
index b451616..0c883de 100644
--- a/docs/source/segmentation_guide.md
+++ b/docs/source/segmentation_guide.md
@@ -3,4 +3,6 @@
 In a short while we will publish a walk-through that will go into detail
 on how to do segmentation with Pywick. In the meantime, if you feel
 adventurous feel free to look at our
-[README](https://github.com/achaiah/pywick/blob/master/README.md).
\ No newline at end of file
+[README](https://github.com/achaiah/pywick/blob/master/README.md).
+
+You can also take a look at our [Classification guide](https://github.com/achaiah/pywick/blob/master/docs/source/classification_guide.md) to get a good idea of how to get started on your own. The segmentation training process is very similar but involves a more complicated directory structure for the data.
\ No newline at end of file
diff --git a/examples/17flowers_split.py b/examples/17flowers_split.py
new file mode 100644
index 0000000..c25d176
--- /dev/null
+++ b/examples/17flowers_split.py
@@ -0,0 +1,43 @@
+import shutil
+import os
+
+directory = "jpg"
+target_train = "17flowers"
+
+if not os.path.isdir(target_train):
+    os.makedirs(target_train)
+
+classes = [
+    "daffodil",
+    "snowdrop",
+    "lilyvalley",
+    "bluebell",
+    "crocus",
+    "iris",
+    "tigerlily",
+    "tulip",
+    "fritillary",
+    "sunflower",
+    "daisy",
+    "coltsfoot",
+    "dandelion",
+    "cowslip",
+    "buttercup",
+    "windflower",
+    "pansy",
+]
+
+j = 0
+for i in range(1, 1361):
+    label_dir = os.path.join(target_train, classes[j])
+
+    if not os.path.isdir(label_dir):
+        os.makedirs(label_dir)
+
+    filename = "image_" + str(i).zfill(4) + ".jpg"
+    shutil.copy(
+        os.path.join(directory, filename), os.path.join(label_dir, filename)
+    )
+
+    if i % 80 == 0:
+        j += 1
\ No newline at end of file
diff --git a/pywick/__init__.py b/pywick/__init__.py
index dc2f86c..06a36d6 100644
--- a/pywick/__init__.py
+++ b/pywick/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.6.0'
+__version__ = '0.6.5'
 __author__ = 'Achaiah'
 __description__ = 'High-level batteries-included neural network training library for Pytorch'
 
@@ -19,5 +19,6 @@
     optimizers,
     regularizers,
     samplers,
-    transforms
+    transforms,
+    utils
 )
diff --git a/pywick/callbacks/Callback.py b/pywick/callbacks/Callback.py
index 3eea942..76d257b 100644
--- a/pywick/callbacks/Callback.py
+++ b/pywick/callbacks/Callback.py
@@ -1,4 +1,4 @@
-class Callback(object):
+class Callback:
     """
     Abstract base class used to build new callbacks. Extend this class to build your own callbacks and overwrite functions
     that you want to monitor. Functions will be called automatically from the trainer once per relevant training event
diff --git a/pywick/callbacks/CallbackContainer.py b/pywick/callbacks/CallbackContainer.py
index 9bac6f1..d5ce60b 100644
--- a/pywick/callbacks/CallbackContainer.py
+++ b/pywick/callbacks/CallbackContainer.py
@@ -6,7 +6,7 @@ def _get_current_time():
     return time_s, datetime.datetime.fromtimestamp(time_s).strftime("%B %d, %Y - %I:%M%p")
 
 
-class CallbackContainer(object):
+class CallbackContainer:
     """
     Container holding a list of callbacks.
     """
@@ -15,8 +15,7 @@ def __init__(self, callbacks=None, queue_length=10):
         self.initial_epoch = -1
         self.final_epoch = -1
         self.has_val_data = False
-        callbacks = callbacks or []
-        self.callbacks = [c for c in callbacks]
+        self.callbacks = callbacks or []
         self.queue_length = queue_length
 
     def append(self, callback):
diff --git a/pywick/callbacks/CyclicLRScheduler.py b/pywick/callbacks/CyclicLRScheduler.py
index f347360..cba0522 100644
--- a/pywick/callbacks/CyclicLRScheduler.py
+++ b/pywick/callbacks/CyclicLRScheduler.py
@@ -92,14 +92,14 @@ def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
             raise TypeError('{} is not an Optimizer'.format(type(optimizer).__name__))
         self.optimizer = optimizer
 
-        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
+        if isinstance(base_lr, (list, tuple)):
             if len(base_lr) != len(optimizer.param_groups):
                 raise ValueError("expected {} base_lr, got {}".format(len(optimizer.param_groups), len(base_lr)))
             self.base_lrs = list(base_lr)
         else:
             self.base_lrs = [base_lr] * len(optimizer.param_groups)
 
-        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
+        if isinstance(max_lr, (list, tuple)):
             if len(max_lr) != len(optimizer.param_groups):
                 raise ValueError("expected {} max_lr, got {}".format(len(optimizer.param_groups), len(max_lr)))
             self.max_lrs = list(max_lr)
@@ -155,10 +155,12 @@ def on_batch_end(self, batch, logs=None):
 
             self.trainer.history.lrs = computed_lr
 
-    def _triangular_scale_fn(self, x):
+    @staticmethod
+    def _triangular_scale_fn(x):
         return 1.
 
-    def _triangular2_scale_fn(self, x):
+    @staticmethod
+    def _triangular2_scale_fn(x):
         return 1 / (2. ** (x - 1))
 
     def _exp_range_scale_fn(self, x):
diff --git a/pywick/callbacks/EarlyStopping.py b/pywick/callbacks/EarlyStopping.py
index 68efd7c..cc87beb 100644
--- a/pywick/callbacks/EarlyStopping.py
+++ b/pywick/callbacks/EarlyStopping.py
@@ -47,7 +47,6 @@ def on_epoch_end(self, epoch, logs=None):
                     self.trainer._stop_training = True
                 self.wait += 1
 
-    def on_train_end(self, logs):
+    def on_train_end(self, logs=None):
         if self.stopped_epoch > 0:
-            print('\nTerminated Training for Early Stopping at Epoch %04i' %
-                  (self.stopped_epoch))
\ No newline at end of file
+            print(f'\nTerminated Training for Early Stopping at Epoch: {self.stopped_epoch}')
diff --git a/pywick/callbacks/LRScheduler.py b/pywick/callbacks/LRScheduler.py
index 0373882..1426c04 100644
--- a/pywick/callbacks/LRScheduler.py
+++ b/pywick/callbacks/LRScheduler.py
@@ -22,7 +22,7 @@ def __init__(self, schedule):
         if isinstance(schedule, dict):
             schedule = self.schedule_from_dict
             self.schedule_dict = schedule
-            if any([k < 1.0 for k in schedule.keys()]):
+            if any(k < 1.0 for k in schedule.keys()):
                 self.fractional_bounds = False
             else:
                 self.fractional_bounds = True
@@ -30,6 +30,7 @@ def __init__(self, schedule):
         super(LRScheduler, self).__init__()
 
     def schedule_from_dict(self, epoch, logs=None):
+        learn_rate = None
         for epoch_bound, learn_rate in self.schedule_dict.items():
             # epoch_bound is in units of "epochs"
             if not self.fractional_bounds:
diff --git a/pywick/callbacks/ModelCheckpoint.py b/pywick/callbacks/ModelCheckpoint.py
index ec0d68f..ea9611e 100644
--- a/pywick/callbacks/ModelCheckpoint.py
+++ b/pywick/callbacks/ModelCheckpoint.py
@@ -13,7 +13,7 @@
 
 class ModelCheckpoint(Callback):
     """
-    Model Checkpoint to save model weights during training. 'Best' is determined by minimizing the value found under monitored_log_key in the logs
+    Model Checkpoint to save model weights during training. 'Best' is determined by minimizing (or maximizing) the value found under monitored_log_key in the logs
     Saved checkpoints contain these keys by default:
         'run_id'
         'epoch'
@@ -62,8 +62,12 @@ class ModelCheckpoint(Callback):
         Default: False
     """
 
-    def __init__(self, run_id, monitored_log_key, save_dir, addl_k_v=dict(), epoch_log_keys=[], save_interval=5, save_best_only=False, max_saves=5,
+    def __init__(self, run_id, monitored_log_key, save_dir, addl_k_v=None, epoch_log_keys=None, save_interval=5, save_best_only=False, max_saves=5,
                  custom_func=None, do_minimize=True, verbose=False):
+        if addl_k_v is None:
+            addl_k_v = {}
+        if epoch_log_keys is None:
+            epoch_log_keys = []
 
         self.run_id = run_id
         self.addl_k_v = addl_k_v
@@ -73,7 +77,7 @@ def __init__(self, run_id, monitored_log_key, save_dir, addl_k_v=dict(), epoch_l
         self.save_best_only = save_best_only
         self.max_saves = max_saves
         self.custom_func = custom_func
-        self.custom_func_dict = dict()  # this is expected to be filled by the custom_func
+        self.custom_func_dict = {}  # this is expected to be filled by the custom_func
         self.verbose = verbose
         self.monitored_log_key = monitored_log_key  # 'e.g. dice_coeff'
         self.do_minimize = do_minimize
@@ -94,12 +98,10 @@ def __init__(self, run_id, monitored_log_key, save_dir, addl_k_v=dict(), epoch_l
         super().__init__()
 
     def on_epoch_end(self, epoch, logs=None):
-        # import pdb
-        # pdb.set_trace()
         self.last_epoch_logs = logs
         self.last_epoch = epoch
 
-        if ((epoch + 1) % self.save_interval == 0):  # only save with given frequency
+        if (epoch + 1) % self.save_interval == 0:  # only save with given frequency
             current_loss = logs.get(self.monitored_log_key)
 
             if (current_loss < self.best_loss and self.save_best_only) or not self.save_best_only or (not self.do_minimize and current_loss > self.best_loss):
diff --git a/pywick/callbacks/OneCycleLRScheduler.py b/pywick/callbacks/OneCycleLRScheduler.py
index 9be3635..49acf4e 100644
--- a/pywick/callbacks/OneCycleLRScheduler.py
+++ b/pywick/callbacks/OneCycleLRScheduler.py
@@ -129,7 +129,7 @@ def __init__(self,
         # Validate total_steps
         if total_steps is None and epochs is None and steps_per_epoch is None:
             raise ValueError("You must define either total_steps OR (epochs AND steps_per_epoch)")
-        elif total_steps is not None:
+        if total_steps is not None:
             if total_steps <= 0 or not isinstance(total_steps, int):
                 raise ValueError("Expected non-negative integer total_steps, but got {}".format(total_steps))
             self.total_steps = total_steps
@@ -149,7 +149,7 @@ def __init__(self,
         # Validate anneal_strategy
         if anneal_strategy not in ['cos', 'linear']:
             raise ValueError("anneal_strategy must by one of 'cos' or 'linear', instead got {}".format(anneal_strategy))
-        elif anneal_strategy == 'cos':
+        if anneal_strategy == 'cos':
             self.anneal_func = self._annealing_cos
         elif anneal_strategy == 'linear':
             self.anneal_func = self._annealing_linear
@@ -182,7 +182,8 @@ def __init__(self,
 
         super(OneCycleLR, self).__init__(optimizer=optimizer, step_size=1, last_epoch=last_epoch)
 
-    def _format_param(self, name, optimizer, param):
+    @staticmethod
+    def _format_param(name, optimizer, param):
         """Return correctly formatted lr/momentum for each param group."""
         if isinstance(param, (list, tuple)):
             if len(param) != len(optimizer.param_groups):
@@ -192,12 +193,14 @@ def _format_param(self, name, optimizer, param):
         else:
             return [param] * len(optimizer.param_groups)
 
-    def _annealing_cos(self, start, end, pct):
+    @staticmethod
+    def _annealing_cos(start, end, pct):
         "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
         cos_out = math.cos(math.pi * pct) + 1
         return end + (start - end) / 2.0 * cos_out
 
-    def _annealing_linear(self, start, end, pct):
+    @staticmethod
+    def _annealing_linear(start, end, pct):
         "Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0."
         return (end - start) * pct + start
 
diff --git a/pywick/callbacks/TQDM.py b/pywick/callbacks/TQDM.py
index 7338386..beec1ea 100644
--- a/pywick/callbacks/TQDM.py
+++ b/pywick/callbacks/TQDM.py
@@ -14,6 +14,7 @@ def __init__(self):
         every SuperModule if verbose > 0
         """
         self.progbar = None
+        self.train_logs = None
         super(TQDM, self).__init__()
 
     def __enter__(self):
@@ -24,7 +25,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         if self.progbar is not None:
             self.progbar.close()
 
-    def on_train_begin(self, logs):
+    def on_train_begin(self, logs=None):
         self.train_logs = logs
 
     def on_epoch_begin(self, epoch, logs=None):
diff --git a/pywick/conditions.py b/pywick/conditions.py
index f90f291..88d8b13 100644
--- a/pywick/conditions.py
+++ b/pywick/conditions.py
@@ -13,7 +13,7 @@ class CondType(Enum):
     PRE = auto()
     POST = auto()
 
-class ConditionsContainer(object):
+class ConditionsContainer:
     '''
     This container maintains metadata about the execution environment in which the conditions are performed
 
@@ -80,7 +80,7 @@ def __call__(self, cond_type, epoch_num, batch_num, net=None, input_batch=None,
                 logs[self.prefix + condition._name] = logs_out
         return logs
 
-class Condition(object):
+class Condition:
     """
     Default class from which all other Condition implementations inherit.
     """
@@ -118,7 +118,8 @@ class SegmentationInputAsserts(Condition):
     '''
 
     def __call__(self, exec_type, epoch_num, batch_num, net=None, inputs=None, outputs=None, labels=None):
-        assert inputs.size()[2:] == labels.size()[1:]
+        if inputs.size()[2:] != labels.size()[1:]:
+            raise AssertionError
 
     def reset(self):
         pass
@@ -140,8 +141,10 @@ def __call__(self, exec_type, epoch_num, batch_num, net=None, inputs=None, outpu
                 outs, aux = outputs
         else:
             outs = outputs
-        assert outs.size()[2:] == labels.size()[1:]
-        assert outs.size()[1] == self.num_classes
+        if outs.size()[2:] != labels.size()[1:]:
+            raise AssertionError
+        if outs.size()[1] != self.num_classes:
+            raise AssertionError
 
     def reset(self):
         pass
diff --git a/pywick/configs/default.yaml b/pywick/configs/default.yaml
new file mode 100644
index 0000000..eecabae
--- /dev/null
+++ b/pywick/configs/default.yaml
@@ -0,0 +1,56 @@
+# This file provides basic configuration for training neural networks. Many values are unset or provided as dummy defaults.
+# This file is meant to be extended with more specific values (e.g. train_classifier.yaml)
+train:
+  # define general variables to reuse
+  lr: &lr 0.001                   # optimizer learning rate
+  momentum: &momentum 0.9         # optimizer momentum
+  weight_decay: &wd 0.0001        # important to keep FIXED during the entire training. Can be 1e-4 or 1e-5!
+
+  auto_balance_dataset: False     # whether to attempt to fix imbalances in class representation within the dataset
+  batch_size: 12                  # Size of the batch to use when training (per GPU)
+  dataroots: ['/data/17flowers']  # where to find the training data
+  gpu_ids: [0]                    # gpus to use for training (if more than one available)
+#  gpu_ids: [0, 1, 2, 4]          # gpus to use for training (if more than one available)
+  input_size:                     # size of the input image. Networks with atrous convolutions (densenet, fbresnet, inceptionv4) allow flexible image sizes while others do not
+                                  # see table: https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet-a.csv
+
+  mean_std: [[0.485, 0.456, 0.406], [0.229, 0.224, 0.225]]      # imagenet default
+
+  model_spec: resnet18            # model to use (over 200 models available! see: https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet-a.csv)
+
+  num_epochs: &nepochs 15         # number of epochs to train for (use small number if starting from pretrained NN)
+  optimizer:                      # choice of optimizer (sgd is typically slowest but best)
+    name: SGD
+    params:
+      lr: *lr
+      momentum: *momentum
+      weight_decay: *wd
+  output_root: &outroot '/jobs/17flowers'  # where to save outputs (e.g. trained NNs)
+  random_seed: 1337               # the random seed used to initialize various randomness functions (set for reproducibility)
+  save_callback:                  # callback to use for saving the model (if any)
+  scheduler:                      # scheduler configuration
+    name: OnceCycleLRScheduler    # should match the name of an imported scheduler (either from callbacks or torch.optim.lr_scheduler)
+    params:
+      epochs: *nepochs
+      gamma: 0.1
+      max_lr: 0.05
+      pct_start: 0.2
+      step_size: 2
+  train_val_ratio: 0.9            # split ratio between training and validation data (if using a single dataset)
+  use_apex: False                 # whether to use APEX optimization
+  use_gpu: True                   # whether to use the GPU for training
+  val_root:                       # where to find validation data (if separate). Note that typically validation data is simply split off from training data based on train_val_ratio
+  workers: 6                      # number of workers to read training data from disk and feed it to the GPU
+
+eval:
+  batch_size:  1                          # size of batch to run through eval
+  CUDA_VISIBLE_DEVICES: '0'
+  dataroots:  '/data/eval'                # directory containing evaluation data
+  eval_chkpt: '/data/models/best.pth'     # saved checkpoint to use for evaluation
+  gpu_id: [0]
+  has_grnd_truth:  True                   # whether ground truth is provided (as directory names under which images reside)
+#  input_size:  224                       # should be saved with the model but could be overridden here
+  jobroot:  '/jobs/eval_output'           # where to output predictions
+  topK: 5                                 # number of results to return
+  use_gpu: False                          # toggle gpu use for inference
+  workers:  1                             # keep at 1 otherwise statistics may not be accurate
\ No newline at end of file
diff --git a/pywick/configs/eval_classifier.yaml b/pywick/configs/eval_classifier.yaml
new file mode 100644
index 0000000..65a9a44
--- /dev/null
+++ b/pywick/configs/eval_classifier.yaml
@@ -0,0 +1,14 @@
+# This specification extends / overrides default.yaml where necessary
+__include__: default.yaml
+
+eval:
+  batch_size:  1                          # size of batch to run through eval
+  dataroots:  '/data/eval'                # directory containing evaluation data
+  eval_chkpt: '/data/models/best.pth'     # saved checkpoint to use for evaluation
+  gpu_id: 0
+  has_grnd_truth:  True                   # whether ground truth is provided (as directory names under which images reside)
+#  input_size:  224                       # should be saved with the model but could be overridden here
+  jobroot:  '/jobs/eval_output'           # where to output predictions
+  topK: 5                                 # number of results to return
+  use_gpu: False                          # toggle gpu use for inference
+  workers:  1                             # keep at 1 otherwise statistics may not be accurate
\ No newline at end of file
diff --git a/pywick/configs/train_classifier.json b/pywick/configs/train_classifier.json
new file mode 100644
index 0000000..b715fb5
--- /dev/null
+++ b/pywick/configs/train_classifier.json
@@ -0,0 +1,80 @@
+{
+  "train": {
+    "lr": 0.001,
+    "momentum": 0.9,
+    "weight_decay": 0.0001,
+    "auto_balance_dataset": false,
+    "batch_size": 32,
+    "dataroots": [
+      "/data/17flowers"
+    ],
+    "gpu_ids": [
+      0
+    ],
+    "input_size": 224,
+    "mean_std": [
+      [
+        0.485,
+        0.456,
+        0.406
+      ],
+      [
+        0.229,
+        0.224,
+        0.225
+      ]
+    ],
+    "model_spec": "resnet50",
+    "num_epochs": 15,
+    "optimizer": {
+      "name": "SGD",
+      "params": {
+        "lr": 0.001,
+        "momentum": 0.9,
+        "weight_decay": 0.0001
+      }
+    },
+    "output_root": "/jobs/17flowers",
+    "random_seed": 1337,
+    "save_callback": {
+      "name": "ModelCheckpoint",
+      "params": {
+        "do_minimize": true,
+        "max_saves": 5,
+        "monitored_log_key": "val_loss",
+        "save_best_only": false,
+        "save_interval": 1,
+        "save_dir": "/jobs/17flowers",
+        "custom_func": null,
+        "verbose": false
+      }
+    },
+    "scheduler": {
+      "name": "OnceCycleLRScheduler",
+      "params": {
+        "epochs": 15,
+        "max_lr": 0.05,
+        "pct_start": 0.2
+      }
+    },
+    "train_val_ratio": 0.9,
+    "use_apex": false,
+    "use_gpu": true,
+    "val_root": null,
+    "workers": 6
+  },
+  "eval": {
+    "batch_size": 1,
+    "CUDA_VISIBLE_DEVICES": "0",
+    "dataroots": "/data/eval",
+    "eval_chkpt": "/data/models/best.pth",
+    "gpu_id": [
+      0
+    ],
+    "has_grnd_truth": true,
+    "jobroot": "/jobs/eval_output",
+    "topK": 5,
+    "use_gpu": false,
+    "workers": 1
+  }
+}
\ No newline at end of file
diff --git a/pywick/configs/train_classifier.yaml b/pywick/configs/train_classifier.yaml
new file mode 100644
index 0000000..37ff51a
--- /dev/null
+++ b/pywick/configs/train_classifier.yaml
@@ -0,0 +1,61 @@
+# This specification extends / overrides default.yaml where necessary
+__include__: default.yaml
+train:
+  # define general variables to reuse
+  lr: &lr 0.001                   # optimizer learning rate
+  momentum: &momentum 0.9         # optimizer momentum
+  weight_decay: &wd 0.0001        # important to keep FIXED during the entire training. Can be 1e-4 or 1e-5!
+
+  batch_size: 32                  # Size of the batch to use when training (per GPU)
+  dataroots: ['/data/17flowers']  # where to find the training data
+  gpu_ids: [0]                    # gpus to use for training (if more than one available)
+#  gpu_ids: [0, 1, 2, 4]          # gpus to use for training (if more than one available)
+  input_size: 224                 # size of the input image. Networks with atrous convolutions (densenet, fbresnet, inceptionv4) allow flexible image sizes while others do not
+                                  # see table: https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet-a.csv
+  model_spec: resnet50            # model to use (over 200 models available! see: https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet-a.csv)
+
+  num_epochs: &nepochs 15         # number of epochs to train for (use small number if starting from pretrained NN)
+  optimizer:                      # choice of optimizer (sgd is typically slowest but best)
+    name: SGD
+    params:
+      lr: *lr
+      momentum: *momentum
+      weight_decay: *wd
+  output_root: &outroot '/jobs/17flowers'  # where to save outputs (e.g. trained NNs)
+  random_seed: 1337               # the random seed used to initialize various randomness functions (set for reproducibility)
+  save_callback:                  # callback to use for saving the model (if any)
+    name: ModelCheckpoint
+    params:
+      do_minimize:  True            # whether the monitored key is minimized or maximized
+      max_saves:  5                 # maximum number of NNs to keep
+      monitored_log_key: val_loss   # which key is used as loss
+      save_best_only: False         # whether to save only best NN
+      save_interval: 1              # save every N epochs
+      save_dir: *outroot            # where to save output
+      custom_func:                  # name of custom function to execute on key/val dictionary (if any)
+      verbose: False
+  scheduler:                      # scheduler configuration
+    name: OnceCycleLRScheduler    # should match to a name of an imported scheduler (either from callbacks or torch.optim.lr_scheduler)
+    params:
+      epochs: *nepochs
+      max_lr: 0.05
+      pct_start: 0.2
+  train_val_ratio: 0.9            # split ratio between training and validation data (if using a single dataset)
+  use_apex: False                 # whether to use APEX optimization (not yet implemented)
+  use_gpu: True                   # whether to use the GPU for training
+  val_root:                       # where to find validation data (if separate). Note that typically validation data is simply split off from training data based on split_ratio
+  workers: 6                      # number of workers to read training data from disk and feed it to the GPU
+#  workers: 0                      # set workers to 0 if training on CPU (or alternatively must adjust multiprocessing in __main__)
+
+eval:
+  batch_size:  1                          # size of batch to run through eval
+  CUDA_VISIBLE_DEVICES: '0'
+  dataroots:  '/data/eval'                # directory containing evaluation data
+  eval_chkpt: '/data/models/best.pth'     # saved checkpoint to use for evaluation
+  gpu_id: [0]
+  has_grnd_truth:  True                   # whether ground truth is provided (as directory names under which images reside)
+#  input_size:  224                       # should be saved with the model but could be overridden here
+  jobroot:  '/jobs/eval_output'           # where to output predictions
+  topK: 5                                 # number of results to return
+  use_gpu: False                          # toggle gpu use for inference
+  workers:  1                             # keep at 1 otherwise statistics may not be accurate
\ No newline at end of file
diff --git a/pywick/constraints.py b/pywick/constraints.py
index d3d390e..f595afb 100644
--- a/pywick/constraints.py
+++ b/pywick/constraints.py
@@ -10,7 +10,7 @@
 from .callbacks import Callback
 
 
-class ConstraintContainer(object):
+class ConstraintContainer:
 
     def __init__(self, constraints):
         self.constraints = constraints
@@ -64,7 +64,7 @@ def on_epoch_end(self, epoch_idx, logs):
         self.container.apply_epoch_constraints(epoch_idx)
 
 
-class Constraint(object):
+class Constraint:
     """
     Default class from which all Constraint implementations inherit.
     """
diff --git a/pywick/random.py b/pywick/cust_random.py
similarity index 100%
rename from pywick/random.py
rename to pywick/cust_random.py
diff --git a/pywick/data_stats.py b/pywick/data_stats.py
index f349f85..18ff849 100644
--- a/pywick/data_stats.py
+++ b/pywick/data_stats.py
@@ -7,7 +7,7 @@
 
 from pywick.datasets.FolderDataset import FolderDataset, rgb_image_loader
 
-opt = dict()
+opt = {}
 parser = argparse.ArgumentParser()
 parser.add_argument('--root_path', required=False, type=str, help='Path to root directory of the images')
 parser.add_argument('--output_path', required=False, type=str, help='Path to save computed statistics to. If not provided, will save inside root_path')
@@ -15,7 +15,7 @@
 opt = vars(parser.parse_args())
 
 # clean up the dictionary so it doesn't contain 'None' values
-removals = list()
+removals = []
 for key, val in opt.items():
     if val is None:
         removals.append(key)
diff --git a/pywick/datasets/BaseDataset.py b/pywick/datasets/BaseDataset.py
index 9d0e1e5..108ed5b 100644
--- a/pywick/datasets/BaseDataset.py
+++ b/pywick/datasets/BaseDataset.py
@@ -4,7 +4,7 @@
 from .data_utils import is_tuple_or_list
 
 
-class BaseDataset(object):
+class BaseDataset:
     """An abstract class representing a Dataset.
 
     All other datasets should subclass it. All subclasses should override
diff --git a/pywick/datasets/CSVDataset.py b/pywick/datasets/CSVDataset.py
index fd63a2a..cb5ec1e 100644
--- a/pywick/datasets/CSVDataset.py
+++ b/pywick/datasets/CSVDataset.py
@@ -44,7 +44,8 @@ def __init__(self,
                  target_transform=None,
                  co_transform=None,
                  apply_transforms_individually=False):
-        assert(input_cols is not None)
+        if (input_cols is None):
+            raise AssertionError
 
         self.input_cols = _process_cols_argument(input_cols)
         self.target_cols = _process_cols_argument(target_cols)
diff --git a/pywick/datasets/ClonedFolderDataset.py b/pywick/datasets/ClonedFolderDataset.py
index 3cd81be..a7d3b06 100644
--- a/pywick/datasets/ClonedFolderDataset.py
+++ b/pywick/datasets/ClonedFolderDataset.py
@@ -19,8 +19,7 @@ def __init__(self, data, meta_data, **kwargs):
 
         if len(data) == 0:
             raise (RuntimeError('No data provided'))
-        else:
-            print('Initializing with %i data items' % len(data))
+        print('Initializing with %i data items' % len(data))
 
         self.data = data
 
diff --git a/pywick/datasets/FolderDataset.py b/pywick/datasets/FolderDataset.py
index 59a340b..5b8a14e 100644
--- a/pywick/datasets/FolderDataset.py
+++ b/pywick/datasets/FolderDataset.py
@@ -117,8 +117,7 @@ def __init__(self,
 
         if len(data) == 0:
             raise (RuntimeError('Found 0 data items in subfolders of: %s' % root))
-        else:
-            print('Found %i data items' % len(data))
+        print('Found %i data items' % len(data))
 
         self.root = os.path.expanduser(root)
         self.data = data
diff --git a/pywick/datasets/MultiFolderDataset.py b/pywick/datasets/MultiFolderDataset.py
index 34578ab..10e60ae 100644
--- a/pywick/datasets/MultiFolderDataset.py
+++ b/pywick/datasets/MultiFolderDataset.py
@@ -108,7 +108,7 @@ def __init__(self,
             else:
                 self.classes, self.class_to_idx = _find_classes(roots)
 
-            data_list = list()
+            data_list = []
             for root in roots:
                 datai, _ = _finds_inputs_and_targets(root, class_mode=class_mode, class_to_idx=self.class_to_idx, input_regex=input_regex,
                                                      rel_target_root=rel_target_root, target_prefix=target_prefix, target_postfix=target_postfix,
@@ -119,8 +119,7 @@ def __init__(self,
 
             if len(self.data) == 0:
                 raise (RuntimeError('Found 0 data items in subfolders of: {}'.format(roots)))
-            else:
-                print('Found %i data items' % len(self.data))
+            print('Found %i data items' % len(self.data))
 
             self.roots = [os.path.expanduser(x) for x in roots]
             self.transform = transform
diff --git a/pywick/datasets/data_utils.py b/pywick/datasets/data_utils.py
index 6042eab..507ef56 100644
--- a/pywick/datasets/data_utils.py
+++ b/pywick/datasets/data_utils.py
@@ -35,13 +35,12 @@ def pil_loader(path, color_space=''):
             return Image.open(path).convert('RGBA')
         elif color_space.lower() == 'l':
             return Image.open(path).convert('L')
-        elif color_space.lower() == '1' or color_space.lower() == 'binary':
+        elif color_space.lower() in ('1', 'binary'):
             return Image.open(path).convert('1')
         else:
             return Image.open(path)
     except OSError:
-        print("!!!  Could not read path: " + path)
-        exit(2)
+        raise Exception("!!!  Could not read path: " + path)
 
 
 def pil_loader_rgb(path):
@@ -118,10 +117,10 @@ def _multi_arg_pass_through(*x):
 
 
 def _find_classes(dirs):
-    classes = list()
-    for dir in dirs:
-        dir = os.path.expanduser(dir)
-        loc_classes = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))]
+    classes = []
+    for dir_ in dirs:
+        dir_ = os.path.expanduser(dir_)
+        loc_classes = [d for d in os.listdir(dir_) if os.path.isdir(os.path.join(dir_, d))]
         for cls in loc_classes:
             if cls not in classes:
                 classes.append(cls)
@@ -164,7 +163,7 @@ def _finds_inputs_and_targets(root, class_mode, class_to_idx=None, input_regex='
         The list must contain paths relative to the root parameter\n
         each line may include the filename and additional comma-separated metadata, in which case the first item will be considered the path itself and the rest will be ignored
 
-    :return: partition1 (list of (input, target)), partition2 (list of (input, target))
+    :return: partition1 (list of (input_, target)), partition2 (list of (input_, target))
     """
     if class_mode not in ('image', 'label', 'path'):
         raise ValueError('class_mode must be one of: {label, image, path}')
@@ -226,7 +225,8 @@ def _finds_inputs_and_targets(root, class_mode, class_to_idx=None, input_regex='
     if class_mode is None:
         return trainlist_inputs, vallist_inputs
     else:
-        assert len(trainlist_inputs) == len(trainlist_targets) and len(vallist_inputs) == len(vallist_targets)
+        if not (len(trainlist_inputs) == len(trainlist_targets) and len(vallist_inputs) == len(vallist_targets)):
+            raise AssertionError
         print("Total processed: %i    Train-list: %i items   Val-list: %i items    Exclusion-list: %i items" % (icount, len(trainlist_inputs), len(vallist_inputs), len(exclusion_list)))
         return list(zip(trainlist_inputs, trainlist_targets)), list(zip(vallist_inputs, vallist_targets))
 
@@ -271,9 +271,47 @@ def get_dataset_mean_std(data_set, img_size=256, output_div=255.0):
     return total.mean(1) / output_div, total.std(1) / output_div        # return channel-wise mean for the entire dataset
 
 
+def adjust_dset_length(dataset, num_batches: int, num_devices: int, batch_size: int):
+    """
+    Trim trailing elements from ``dataset.data`` (in place) so that the data can be evenly distributed across devices.
+
+    To properly distribute computation across devices (typically GPUs) we need to meet two criteria:
+        1. batch size on each device must be > 1
+        2. dataset must be evenly partitioned across devices in specified batches
+
+    :param dataset:         Dataset to trim. Must support len() and expose a ``data`` attribute that supports ``pop()``
+    :param num_batches:     Number of batches that dset will be partitioned into
+    :param num_devices:     Number of devices dset will be distributed onto
+    :param batch_size:      Size of individual batch
+    :return:                None (elements are popped off the end of ``dataset.data``)
+    """
+
+    # Elements left over once num_batches batches of batch_size have been placed on each of num_devices devices
+    remainder_len = len(dataset) - (num_batches * num_devices * batch_size)
+
+    if remainder_len / num_devices < 2:
+        # Fewer than 2 leftover elements per device: drop the entire remainder (nothing to drop if it is <= 0)
+        num_remove = remainder_len
+    else:
+        # At least 2 leftover elements per device: drop only those that cannot be shared out evenly.
+        # After this the per-device remainder is still >= 2, so no second pass is required.
+        num_remove = remainder_len % num_devices
+
+    if num_remove <= 0:
+        return      # dataset already fits - do not emit misleading warnings
+
+    orig_len = len(dataset)     # capture before trimming so the warning reports the length that did not fit
+    for _ in range(num_remove):
+        last_el = dataset.data.pop()
+        print(f"  ==> WARN: Data element removed: {last_el}.")
+    print(
+        f"  ==> WARN: Length of training set ({orig_len}) did not fit onto num Devices: {num_devices}. Removing {num_remove} data elements to avoid training issues with BatchNorm")
+    print(f"New dataset length is: {len(dataset)}")
+    print('| -------------- |')
+
+
 if __name__ == "__main__":
     from pywick.datasets.FolderDataset import FolderDataset
-    from pywick.datasets.data_utils import pil_loader_rgb
 
     dataset = FolderDataset(root='/home/users/youruser/images', class_mode='label', default_loader=pil_loader_rgb)
     mean, std = get_dataset_mean_std(dataset)
diff --git a/pywick/datasets/tnt/batchdataset.py b/pywick/datasets/tnt/batchdataset.py
index 7560167..4386730 100755
--- a/pywick/datasets/tnt/batchdataset.py
+++ b/pywick/datasets/tnt/batchdataset.py
@@ -80,12 +80,13 @@ def __len__(self):
         elif self.policy == 'skip-last':
             return int(math.floor(float(len(self.dataset) / self.batchsize)))
         elif self.policy == 'divisible-only':
-            assert len(self.dataset) % self.batchsize == 0, \
-                'dataset size is not divisible by batch size'
+            if len(self.dataset) % self.batchsize != 0:
+                raise AssertionError('dataset size is not divisible by batch size')
             return len(self.dataset) / self.batchsize
         else:
-            assert False, 'invalid policy (include-last | skip-last | \
-                divisible-only expected)'
+            if not False:
+                raise AssertionError('invalid policy (include-last | skip-last | \
+                divisible-only expected)')
 
     def __getitem__(self, idx):
         super(BatchDataset, self).__getitem__(idx)
diff --git a/pywick/datasets/tnt/concatdataset.py b/pywick/datasets/tnt/concatdataset.py
index 6ee5471..8605ff7 100644
--- a/pywick/datasets/tnt/concatdataset.py
+++ b/pywick/datasets/tnt/concatdataset.py
@@ -18,7 +18,8 @@ def __init__(self, datasets):
         super(ConcatDataset, self).__init__()
 
         self.datasets = list(datasets)
-        assert len(datasets) > 0, 'datasets should not be an empty iterable'
+        if len(datasets) <= 0:
+            raise AssertionError('datasets should not be an empty iterable')
         self.cum_sizes = np.cumsum([len(x) for x in self.datasets])
 
     def __len__(self):
diff --git a/pywick/datasets/tnt/dataset.py b/pywick/datasets/tnt/dataset.py
index ff3640e..9822fb1 100755
--- a/pywick/datasets/tnt/dataset.py
+++ b/pywick/datasets/tnt/dataset.py
@@ -5,7 +5,7 @@
 from torch.utils.data import DataLoader
 
 
-class Dataset(object):
+class Dataset:
     def __init__(self):
         pass
 
diff --git a/pywick/datasets/tnt/multipartitiondataset.py b/pywick/datasets/tnt/multipartitiondataset.py
index 90b45e2..c97d754 100755
--- a/pywick/datasets/tnt/multipartitiondataset.py
+++ b/pywick/datasets/tnt/multipartitiondataset.py
@@ -33,12 +33,14 @@ def __init__(self, dataset, partitions, initial_partition=None):
         self.partitions = partitions
 
         # A few assertions
-        assert isinstance(partitions, dict), 'partitions must be a dict'
-        assert len(partitions) >= 2, \
-            'MultiPartitionDataset should have at least two partitions'
-        assert min(partitions.values()) >= 0, \
-            'partition sizes cannot be negative'
-        assert max(partitions.values()) > 0, 'all partitions cannot be empty'
+        if not isinstance(partitions, dict):
+            raise AssertionError('partitions must be a dict')
+        if len(partitions) < 2:
+            raise AssertionError('MultiPartitionDataset should have at least two partitions')
+        if min(partitions.values()) < 0:
+            raise AssertionError('partition sizes cannot be negative')
+        if max(partitions.values()) <= 0:
+            raise AssertionError('all partitions cannot be empty')
 
         self.partition_names = list(self.partitions.keys())
         self.partition_index = {partition: i for i, partition in
@@ -52,8 +54,9 @@ def __init__(self, dataset, partitions, initial_partition=None):
                                     self.partition_sizes]
         else:
             for x in self.partition_sizes:
-                assert x == int(x), ('partition sizes should be integer'
-                                     ' numbers, or sum up to <= 1 ')
+                if x != int(x):
+                    raise AssertionError('partition sizes should be integer'
+                                         ' numbers, or sum up to <= 1 ')
 
         self.partition_cum_sizes = np.cumsum(self.partition_sizes)
 
diff --git a/pywick/datasets/tnt/resampledataset.py b/pywick/datasets/tnt/resampledataset.py
index 8a6c62f..5eea9da 100755
--- a/pywick/datasets/tnt/resampledataset.py
+++ b/pywick/datasets/tnt/resampledataset.py
@@ -36,7 +36,7 @@ def __init__(self, dataset, sampler=lambda ds, idx: idx, size=None):
         self.size = size
 
     def __len__(self):
-        return (self.size and self.size > 0) and self.size or len(self.dataset)
+        return self.size if (self.size and self.size > 0) else len(self.dataset)
 
     def __getitem__(self, idx):
         super(ResampleDataset, self).__getitem__(idx)
diff --git a/pywick/datasets/tnt/splitdataset.py b/pywick/datasets/tnt/splitdataset.py
index 9d927a5..530cc55 100644
--- a/pywick/datasets/tnt/splitdataset.py
+++ b/pywick/datasets/tnt/splitdataset.py
@@ -34,12 +34,14 @@ def __init__(self, dataset, partitions, initial_partition=None):
         self.partitions = partitions
 
         # A few assertions
-        assert isinstance(partitions, dict), 'partitions must be a dict'
-        assert len(partitions) >= 2, \
-            'SplitDataset should have at least two partitions'
-        assert min(partitions.values()) >= 0, \
-            'partition sizes cannot be negative'
-        assert max(partitions.values()) > 0, 'all partitions cannot be empty'
+        if not isinstance(partitions, dict):
+            raise AssertionError('partitions must be a dict')
+        if len(partitions) < 2:
+            raise AssertionError('SplitDataset should have at least two partitions')
+        if min(partitions.values()) < 0:
+            raise AssertionError('partition sizes cannot be negative')
+        if max(partitions.values()) <= 0:
+            raise AssertionError('all partitions cannot be empty')
 
         self.partition_names = sorted(list(self.partitions.keys()))
         self.partition_index = {partition: i for i, partition in
@@ -53,8 +55,9 @@ def __init__(self, dataset, partitions, initial_partition=None):
                                     self.partition_sizes]
         else:
             for x in self.partition_sizes:
-                assert x == int(x), ('partition sizes should be integer'
-                                     ' numbers, or sum up to <= 1 ')
+                if x != int(x):
+                    raise AssertionError('partition sizes should be integer'
+                                         ' numbers, or sum up to <= 1 ')
 
         self.partition_cum_sizes = np.cumsum(self.partition_sizes)
 
diff --git a/pywick/datasets/tnt/transform.py b/pywick/datasets/tnt/transform.py
index 0ca72ae..784f910 100755
--- a/pywick/datasets/tnt/transform.py
+++ b/pywick/datasets/tnt/transform.py
@@ -1,12 +1,14 @@
 from six import iteritems
 from pywick.datasets.tnt.table import canmergetensor as canmerge
-from pywick.datasets.tnt.table import mergetensor as mergetensor
+from pywick.datasets.tnt.table import mergetensor
 
 
 def compose(transforms):
-    assert isinstance(transforms, list)
+    if not isinstance(transforms, list):
+        raise AssertionError
     for tr in transforms:
-        assert callable(tr), 'list of functions expected'
+        if not callable(tr):
+            raise AssertionError('list of functions expected')
 
     def composition(z):
         for tr in transforms:
@@ -21,13 +23,13 @@ def mergekeys(tbl):
         if isinstance(tbl, dict):
             for idx, elem in tbl.items():
                 for key, value in elem.items():
-                    if not key in mergetbl:
+                    if key not in mergetbl:
                         mergetbl[key] = {}
                     mergetbl[key][idx] = value
         elif isinstance(tbl, list):
             for elem in tbl:
                 for key, value in elem.items():
-                    if not key in mergetbl:
+                    if key not in mergetbl:
                         mergetbl[key] = []
                     mergetbl[key].append(value)
         return mergetbl
@@ -47,4 +49,4 @@ def makebatch(merge=None):
                        if canmerge(field) else field)
         ])
 
-    return lambda samples: makebatch(samples)
+    return makebatch
diff --git a/pywick/datasets/tnt/transformdataset.py b/pywick/datasets/tnt/transformdataset.py
index d53e0b9..3d60f25 100755
--- a/pywick/datasets/tnt/transformdataset.py
+++ b/pywick/datasets/tnt/transformdataset.py
@@ -29,11 +29,12 @@ class TransformDataset(Dataset):
     def __init__(self, dataset, transforms):
         super(TransformDataset, self).__init__()
 
-        assert isinstance(transforms, dict) or callable(transforms), \
-            'expected a dict of transforms or a function'
+        if not (isinstance(transforms, dict) or callable(transforms)):
+            raise AssertionError('expected a dict of transforms or a function')
         if isinstance(transforms, dict):
             for k, v in transforms.items():
-                assert callable(v), str(k) + ' is not a function'
+                if not callable(v):
+                    raise AssertionError(str(k) + ' is not a function')
 
         self.dataset = dataset
         self.transforms = transforms
diff --git a/pywick/dictmodels/__init__.py b/pywick/dictmodels/__init__.py
index 727a51d..d9420fc 100644
--- a/pywick/dictmodels/__init__.py
+++ b/pywick/dictmodels/__init__.py
@@ -1 +1,2 @@
-from .dict_config import *
\ No newline at end of file
+from .dict_config import *
+from .model_spec import *
diff --git a/pywick/dictmodels/dict_config.py b/pywick/dictmodels/dict_config.py
index 628606c..a857997 100644
--- a/pywick/dictmodels/dict_config.py
+++ b/pywick/dictmodels/dict_config.py
@@ -1,26 +1,48 @@
-from datetime import datetime
+import time
+from typing import List
+
 from prodict import Prodict
 
 
 class ExpConfig(Prodict):
     """
-    Default configuration class to define some static types (based on configs/train_classifier.yml)
+    Default configuration class to define some static types (based on configs/train_classifier.yaml)
     """
 
+    auto_balance_dataset: bool  # whether to attempt to fix imbalances in class representation within the dataset (default: False)
     batch_size          : int   # Size of the batch to use when training (per GPU)
-    dataroots           : str   # where to find the training data
-    exp_id              : str   # id of the experiment
-    gpu_ids             : list  # list of GPUs to use
+    dataroots           : List  # where to find the training data
+    exp_id              : str   # id of the experiment (default: generated from datetime)
+    gpu_ids             : List  # list of GPUs to use
     input_size          : int   # size of the input image. Networks with atrous convolutions (densenet, fbresnet, inceptionv4) allow flexible image sizes while others do not
                                 # see table: https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet-a.csv
-    model_name          : str   # model to use (over 200 models available! see: https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet-a.csv)
+    mean_std            : List  # mean, std to use for image transforms
+    model_spec          : str   # model to use (over 200 models available! see: https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet-a.csv)
     num_epochs          : int   # number of epochs to train for (use small number if starting from pretrained NN)
     optimizer           : dict  # optimizer configuration
     output_root         : str   # where to save outputs (e.g. trained NNs)
+    random_seed         : int   # random seed to use (default: 1337)
     save_callback       : dict  # callback to use for saving the model (if any)
     scheduler           : dict  # scheduler configuration
-    use_gpu             : bool  # whether to use the GPU for training
-    workers             : int   # number of workers to read training data from disk and feed it to the GPU
+    train_val_ratio     : float # ratio of train to val data (if splitting a single dataset)
+    use_apex            : bool  # whether to use APEX optimization (only valid if use_gpu = True)
+    use_gpu             : bool  # whether to use the GPU for training (default: False)
+    val_root            : str   # root dir to use for validation data (if different from dataroots)
+    workers             : int   # number of workers to read training data from disk and feed it to the GPU (default: 8)
+
+    keys_to_verify      : List  # Minimum set of keys that must be set to ensure proper configuration
 
     def init(self):
-        self.exp_id = str(datetime.now())
+        self.auto_balance_dataset = False
+        self.exp_id = str(int(time.time() * 1000))
+        self.mean_std = [[0.485, 0.456, 0.406], [0.229, 0.224, 0.225]]      # imagenet default
+        self.random_seed = 1337
+        self.train_val_ratio = 0.8
+        self.use_gpu = False
+
+        self.keys_to_verify = ['batch_size', 'dataroots', 'input_size', 'model_spec', 'num_epochs', 'optimizer', 'output_root', 'scheduler', 'use_gpu', 'workers']
+
+    def verify_properties(self):
+        mapped_keys = [i in self.keys() for i in self.keys_to_verify]
+        if not all(mapped_keys):
+            raise Exception(f'Property verification failed. Not all required properties have been set: {[i for (i, v) in zip(self.keys_to_verify, mapped_keys) if not v]}')
\ No newline at end of file
diff --git a/pywick/dictmodels/model_spec.py b/pywick/dictmodels/model_spec.py
new file mode 100644
index 0000000..71b366a
--- /dev/null
+++ b/pywick/dictmodels/model_spec.py
@@ -0,0 +1,16 @@
+from typing import Dict
+
+from prodict import Prodict
+
+
+class ModelSpec(Prodict):
+    """
+    Model specification to instantiate. Most models will have pre-configured and pre-trained variants but this gives you more fine-grained control
+    """
+
+    model_name          : str   # name of the model to instantiate (a string such as 'resnet50', so it must not be typed as int)
+    model_params        : Dict  # model parameters (presumably forwarded to the model when it is instantiated - confirm against caller)
+
+    def init(self):
+        # nothing initialized yet but will be expanded in the future
+        pass
diff --git a/pywick/functions/activations_autofn.py b/pywick/functions/activations_autofn.py
index 929d533..b341324 100644
--- a/pywick/functions/activations_autofn.py
+++ b/pywick/functions/activations_autofn.py
@@ -1,7 +1,7 @@
 # Source: https://github.com/rwightman/gen-efficientnet-pytorch/blob/master/geffnet/activations/activations_autofn.py (Apache 2.0)
 
 import torch
-from torch import nn as nn
+from torch import nn
 from torch.nn import functional as F
 
 
@@ -36,7 +36,8 @@ def __init__(self, inplace: bool = False):
         super(SwishAuto, self).__init__()
         self.inplace = inplace
 
-    def forward(self, x):
+    @staticmethod
+    def forward(x):
         return SwishAutoFn.apply(x)
 
 
@@ -69,6 +70,7 @@ def __init__(self, inplace: bool = False):
         super(MishAuto, self).__init__()
         self.inplace = inplace
 
-    def forward(self, x):
+    @staticmethod
+    def forward(x):
         return MishAutoFn.apply(x)
 
diff --git a/pywick/functions/activations_jit.py b/pywick/functions/activations_jit.py
index a22b424..f471696 100644
--- a/pywick/functions/activations_jit.py
+++ b/pywick/functions/activations_jit.py
@@ -1,7 +1,7 @@
 # Source: https://github.com/rwightman/gen-efficientnet-pytorch/blob/master/geffnet/activations/activations_jit.py (Apache 2.0)
 
 import torch
-from torch import nn as nn
+from torch import nn
 from torch.nn import functional as F
 
 
@@ -46,7 +46,8 @@ def __init__(self, inplace: bool = False):
         super(SwishJit, self).__init__()
         self.inplace = inplace
 
-    def forward(self, x):
+    @staticmethod
+    def forward(x):
         return SwishJitAutoFn.apply(x)
 
 
@@ -84,7 +85,8 @@ def __init__(self, inplace: bool = False):
         super(MishJit, self).__init__()
         self.inplace = inplace
 
-    def forward(self, x):
+    @staticmethod
+    def forward(x):
         return MishJitAutoFn.apply(x)
 
 
diff --git a/pywick/functions/affine.py b/pywick/functions/affine.py
index 4b84e33..520b34e 100644
--- a/pywick/functions/affine.py
+++ b/pywick/functions/affine.py
@@ -39,24 +39,24 @@ def F_affine2d(x, matrix, center=True):
     return x_transformed
 
 
-def F_bilinear_interp2d(input, coords):
+def F_bilinear_interp2d(input_, coords):
     """
     bilinear interpolation of 2d torch Tensor
     """
-    x = torch.clamp(coords[:, :, 0], 0, input.size(1) - 2)
+    x = torch.clamp(coords[:, :, 0], 0, input_.size(1) - 2)
     x0 = x.floor()
     x1 = x0 + 1
-    y = torch.clamp(coords[:, :, 1], 0, input.size(2) - 2)
+    y = torch.clamp(coords[:, :, 1], 0, input_.size(2) - 2)
     y0 = y.floor()
     y1 = y0 + 1
 
-    stride = torch.LongTensor(input.stride())
+    stride = torch.LongTensor(input_.stride())
     x0_ix = x0.mul(stride[1]).long()
     x1_ix = x1.mul(stride[1]).long()
     y0_ix = y0.mul(stride[2]).long()
     y1_ix = y1.mul(stride[2]).long()
 
-    input_flat = input.view(input.size(0), -1).contiguous()
+    input_flat = input_.view(input_.size(0), -1).contiguous()
 
     vals_00 = input_flat.gather(1, x0_ix.add(y0_ix).detach())
     vals_10 = input_flat.gather(1, x1_ix.add(y0_ix).detach())
@@ -73,7 +73,7 @@ def F_bilinear_interp2d(input, coords):
                 vals_01.mul(xm).mul(yd) +
                 vals_11.mul(xd).mul(yd))
 
-    return x_mapped.view_as(input)
+    return x_mapped.view_as(input_)
 
 
 def F_batch_affine2d(x, matrix, center=True):
@@ -125,27 +125,27 @@ def F_batch_affine2d(x, matrix, center=True):
     return x_transformed
 
 
-def F_batch_bilinear_interp2d(input, coords):
+def F_batch_bilinear_interp2d(input_, coords):
     """
     input : torch.Tensor
         size = (N,H,W,C)
     coords : torch.Tensor
         size = (N,H*W*C,2)
     """
-    x = torch.clamp(coords[:, :, 0], 0, input.size(2) - 2)
+    x = torch.clamp(coords[:, :, 0], 0, input_.size(2) - 2)
     x0 = x.floor()
     x1 = x0 + 1
-    y = torch.clamp(coords[:, :, 1], 0, input.size(3) - 2)
+    y = torch.clamp(coords[:, :, 1], 0, input_.size(3) - 2)
     y0 = y.floor()
     y1 = y0 + 1
 
-    stride = torch.LongTensor(input.stride())
+    stride = torch.LongTensor(input_.stride())
     x0_ix = x0.mul(stride[2]).long()
     x1_ix = x1.mul(stride[2]).long()
     y0_ix = y0.mul(stride[3]).long()
     y1_ix = y1.mul(stride[3]).long()
 
-    input_flat = input.view(input.size(0), -1).contiguous()
+    input_flat = input_.view(input_.size(0), -1).contiguous()
 
     vals_00 = input_flat.gather(1, x0_ix.add(y0_ix).detach())
     vals_10 = input_flat.gather(1, x1_ix.add(y0_ix).detach())
@@ -162,7 +162,7 @@ def F_batch_bilinear_interp2d(input, coords):
                 vals_01.mul(xm).mul(yd) +
                 vals_11.mul(xd).mul(yd))
 
-    return x_mapped.view_as(input)
+    return x_mapped.view_as(input_)
 
 
 def F_affine3d(x, matrix, center=True):
@@ -194,24 +194,24 @@ def F_affine3d(x, matrix, center=True):
     return x_transformed
 
 
-def F_trilinear_interp3d(input, coords):
+def F_trilinear_interp3d(input_, coords):
     """
     trilinear interpolation of 3D image
     """
     # take clamp then floor/ceil of x coords
-    x = torch.clamp(coords[:, 0], 0, input.size(1) - 2)
+    x = torch.clamp(coords[:, 0], 0, input_.size(1) - 2)
     x0 = x.floor()
     x1 = x0 + 1
     # take clamp then floor/ceil of y coords
-    y = torch.clamp(coords[:, 1], 0, input.size(2) - 2)
+    y = torch.clamp(coords[:, 1], 0, input_.size(2) - 2)
     y0 = y.floor()
     y1 = y0 + 1
     # take clamp then floor/ceil of z coords
-    z = torch.clamp(coords[:, 2], 0, input.size(3) - 2)
+    z = torch.clamp(coords[:, 2], 0, input_.size(3) - 2)
     z0 = z.floor()
     z1 = z0 + 1
 
-    stride = torch.LongTensor(input.stride())[1:]
+    stride = torch.LongTensor(input_.stride())[1:]
     x0_ix = x0.mul(stride[0]).long()
     x1_ix = x1.mul(stride[0]).long()
     y0_ix = y0.mul(stride[1]).long()
@@ -219,7 +219,7 @@ def F_trilinear_interp3d(input, coords):
     z0_ix = z0.mul(stride[2]).long()
     z1_ix = z1.mul(stride[2]).long()
 
-    input_flat = th_flatten(input)
+    input_flat = th_flatten(input_)
 
     vals_000 = input_flat[x0_ix.add(y0_ix).add(z0_ix).detach()]
     vals_100 = input_flat[x1_ix.add(y0_ix).add(z0_ix).detach()]
@@ -246,7 +246,7 @@ def F_trilinear_interp3d(input, coords):
                 vals_110.mul(xd).mul(yd).mul(zm) +
                 vals_111.mul(xd).mul(yd).mul(zd))
 
-    return x_mapped.view_as(input)
+    return x_mapped.view_as(input_)
 
 
 def F_batch_affine3d(x, matrix, center=True):
@@ -300,24 +300,24 @@ def F_batch_affine3d(x, matrix, center=True):
     return x_transformed
 
 
-def F_batch_trilinear_interp3d(input, coords):
+def F_batch_trilinear_interp3d(input_, coords):
     """
     input : torch.Tensor
         size = (N,H,W,C)
     coords : torch.Tensor
         size = (N,H*W*C,2)
     """
-    x = torch.clamp(coords[:, :, 0], 0, input.size(2) - 2)
+    x = torch.clamp(coords[:, :, 0], 0, input_.size(2) - 2)
     x0 = x.floor()
     x1 = x0 + 1
-    y = torch.clamp(coords[:, :, 1], 0, input.size(3) - 2)
+    y = torch.clamp(coords[:, :, 1], 0, input_.size(3) - 2)
     y0 = y.floor()
     y1 = y0 + 1
-    z = torch.clamp(coords[:, :, 2], 0, input.size(4) - 2)
+    z = torch.clamp(coords[:, :, 2], 0, input_.size(4) - 2)
     z0 = z.floor()
     z1 = z0 + 1
 
-    stride = torch.LongTensor(input.stride())
+    stride = torch.LongTensor(input_.stride())
     x0_ix = x0.mul(stride[2]).long()
     x1_ix = x1.mul(stride[2]).long()
     y0_ix = y0.mul(stride[3]).long()
@@ -325,7 +325,7 @@ def F_batch_trilinear_interp3d(input, coords):
     z0_ix = z0.mul(stride[4]).long()
     z1_ix = z1.mul(stride[4]).long()
 
-    input_flat = input.contiguous().view(input.size(0), -1)
+    input_flat = input_.contiguous().view(input_.size(0), -1)
 
     vals_000 = input_flat.gather(1, x0_ix.add(y0_ix).add(z0_ix).detach())
     vals_100 = input_flat.gather(1, x1_ix.add(y0_ix).add(z0_ix).detach())
@@ -352,4 +352,4 @@ def F_batch_trilinear_interp3d(input, coords):
                 vals_110.mul(xd).mul(yd).mul(zm) +
                 vals_111.mul(xd).mul(yd).mul(zd))
 
-    return x_mapped.view_as(input)
+    return x_mapped.view_as(input_)
diff --git a/pywick/functions/batchrenorm.py b/pywick/functions/batchrenorm.py
index 6ff835d..f6ef27c 100644
--- a/pywick/functions/batchrenorm.py
+++ b/pywick/functions/batchrenorm.py
@@ -39,19 +39,19 @@ def reset_parameters(self):
             self.weight.data.uniform_()
             self.bias.data.zero_()
 
-    def _check_input_dim(self, input):
-        if input.size(1) != self.running_mean.nelement():
+    def _check_input_dim(self, input_):
+        if input_.size(1) != self.running_mean.nelement():
             raise ValueError('got {}-feature tensor, expected {}'
-                             .format(input.size(1), self.num_features))
+                             .format(input_.size(1), self.num_features))
 
-    def forward(self, input):
-        self._check_input_dim(input)
-        n = input.size()[0]
+    def forward(self, input_):
+        self._check_input_dim(input_)
+        n = input_.size()[0]
 
         if self.training:
-            mean = torch.mean(input, dim=0)
+            mean = torch.mean(input_, dim=0)
 
-            sum = torch.sum((input - mean.expand_as(input)) ** 2, dim=0)
+            sum = torch.sum((input_ - mean.expand_as(input_)) ** 2, dim=0)
             if sum == 0 and self.eps == 0:
                 invstd = 0.0
             else:
@@ -63,10 +63,10 @@ def forward(self, input):
             self.d = torch.clamp((mean.data - self.running_mean) / torch.sqrt(self.running_var),
                                  -self.dmax, self.dmax)
 
-            r = self.r.expand_as(input)
-            d = self.d.expand_as(input)
+            r = self.r.expand_as(input_)
+            d = self.d.expand_as(input_)
 
-            input_normalized = (input - mean.expand_as(input)) * invstd.expand_as(input)
+            input_normalized = (input_ - mean.expand_as(input_)) * invstd.expand_as(input_)
 
             input_normalized = input_normalized * r + d
 
@@ -76,22 +76,22 @@ def forward(self, input):
             if not self.affine:
                 return input_normalized
 
-            output = input_normalized * self.weight.expand_as(input)
-            output += self.bias.unsqueeze(0).expand_as(input)
+            output = input_normalized * self.weight.expand_as(input_)
+            output += self.bias.unsqueeze(0).expand_as(input_)
 
             return output
 
         else:
-            mean = self.running_mean.expand_as(input)
-            invstd = 1. / torch.sqrt(self.running_var.expand_as(input) + self.eps)
+            mean = self.running_mean.expand_as(input_)
+            invstd = 1. / torch.sqrt(self.running_var.expand_as(input_) + self.eps)
 
-            input_normalized = (input - mean.expand_as(input)) * invstd.expand_as(input)
+            input_normalized = (input_ - mean.expand_as(input_)) * invstd.expand_as(input_)
 
             if not self.affine:
                 return input_normalized
 
-            output = input_normalized * self.weight.expand_as(input)
-            output += self.bias.unsqueeze(0).expand_as(input)
+            output = input_normalized * self.weight.expand_as(input_)
+            output += self.bias.unsqueeze(0).expand_as(input_)
 
             return output
 
diff --git a/pywick/functions/cyclicLR.py b/pywick/functions/cyclicLR.py
index 7671970..ec785ae 100644
--- a/pywick/functions/cyclicLR.py
+++ b/pywick/functions/cyclicLR.py
@@ -9,7 +9,7 @@
 import numpy as np
 
 
-class CyclicLR(object):
+class CyclicLR:
     """Sets the learning rate of each parameter group according to
     cyclical learning rate policy (CLR). The policy cycles the learning
     rate between two boundaries with a constant frequency, as detailed in
@@ -82,14 +82,14 @@ def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
             raise TypeError('{} is not an Optimizer'.format(type(optimizer).__name__))
         self.optimizer = optimizer
 
-        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
+        if isinstance(base_lr, (list, tuple)):
             if len(base_lr) != len(optimizer.param_groups):
                 raise ValueError("expected {} base_lr, got {}".format(len(optimizer.param_groups), len(base_lr)))
             self.base_lrs = list(base_lr)
         else:
             self.base_lrs = [base_lr] * len(optimizer.param_groups)
 
-        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
+        if isinstance(max_lr, (list, tuple)):
             if len(max_lr) != len(optimizer.param_groups):
                 raise ValueError("expected {} max_lr, got {}".format(len(optimizer.param_groups), len(max_lr)))
             self.max_lrs = list(max_lr)
@@ -128,10 +128,12 @@ def batch_step(self, batch_iteration=None):
         for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
             param_group['lr'] = lr
 
-    def _triangular_scale_fn(self, x):
+    @staticmethod
+    def _triangular_scale_fn(x):
         return 1.
 
-    def _triangular2_scale_fn(self, x):
+    @staticmethod
+    def _triangular2_scale_fn(x):
         return 1 / (2. ** (x - 1))
 
     def _exp_range_scale_fn(self, x):
diff --git a/pywick/functions/group_norm.py b/pywick/functions/group_norm.py
index 6dc4377..297f7f9 100644
--- a/pywick/functions/group_norm.py
+++ b/pywick/functions/group_norm.py
@@ -7,8 +7,9 @@
 import torch.nn.functional as F
 from torch.nn.modules.batchnorm import _BatchNorm
 
-def group_norm(input, group, running_mean, running_var, weight=None, bias=None,
-                  use_input_stats=True, momentum=0.1, eps=1e-5):
+
+def group_norm(input_, group, running_mean, running_var, weight=None, bias=None,
+               use_input_stats=True, momentum=0.1, eps=1e-5):
     r"""Applies Group Normalization for channels in the same group in each data sample in a
     batch.
 
@@ -17,14 +18,14 @@ def group_norm(input, group, running_mean, running_var, weight=None, bias=None,
     if not use_input_stats and (running_mean is None or running_var is None):
         raise ValueError('Expected running_mean and running_var to be not None when use_input_stats=False')
 
-    b, c = input.size(0), input.size(1)
+    b, c = input_.size(0), input_.size(1)
     if weight is not None:
         weight = weight.repeat(b)
     if bias is not None:
         bias = bias.repeat(b)
 
-    def _group_norm(input, group, running_mean=None, running_var=None, weight=None,
-                       bias=None, use_input_stats=None, momentum=None, eps=None):
+    def _group_norm(input_, group, running_mean=None, running_var=None, weight=None,
+                    bias=None, use_input_stats=None, momentum=None, eps=None):
         # Repeat stored stats and affine transform params if necessary
         if running_mean is not None:
             running_mean_orig = running_mean
@@ -36,7 +37,7 @@ def _group_norm(input, group, running_mean=None, running_var=None, weight=None,
         #norm_shape = [1, b * c / group, group]
         #print(norm_shape)
         # Apply group norm
-        input_reshaped = input.contiguous().view(1, int(b * c/group), group, *input.size()[2:])
+        input_reshaped = input_.contiguous().view(1, int(b * c / group), group, *input_.size()[2:])
 
         out = F.batch_norm(
             input_reshaped, running_mean, running_var, weight=weight, bias=bias,
@@ -48,11 +49,11 @@ def _group_norm(input, group, running_mean=None, running_var=None, weight=None,
         if running_var is not None:
             running_var_orig.copy_(running_var.view(b, int(c/group)).mean(0, keepdim=False))
 
-        return out.view(b, c, *input.size()[2:])
-    return _group_norm(input, group, running_mean=running_mean,
-                          running_var=running_var, weight=weight, bias=bias,
-                          use_input_stats=use_input_stats, momentum=momentum,
-                          eps=eps)
+        return out.view(b, c, *input_.size()[2:])
+    return _group_norm(input_, group, running_mean=running_mean,
+                       running_var=running_var, weight=weight, bias=bias,
+                       use_input_stats=use_input_stats, momentum=momentum,
+                       eps=eps)
 
 
 class _GroupNorm(_BatchNorm):
@@ -63,14 +64,15 @@ def __init__(self, num_features, num_groups=1, eps=1e-5, momentum=0.1,
         super(_GroupNorm, self).__init__(int(num_features/num_groups), eps,
                                          momentum, affine)
 
-    def _check_input_dim(self, input):
+    @staticmethod
+    def _check_input_dim(input_):
         return NotImplemented
 
-    def forward(self, input):
-        self._check_input_dim(input)
+    def forward(self, input_):
+        self._check_input_dim(input_)
 
         return group_norm(
-            input, self.num_groups, self.running_mean, self.running_var, self.weight, self.bias,
+            input_, self.num_groups, self.running_mean, self.running_var, self.weight, self.bias,
             self.training or not self.track_running_stats, self.momentum, self.eps)
 
 
@@ -103,12 +105,12 @@ class GroupNorm2d(_GroupNorm):
         >>> # With Learnable Parameters
         >>> m = nn.GroupNorm2d(100, 4, affine=True)
-        >>> input = torch.randn(20, 100, 35, 45)
-        >>> output = m(input)
+        >>> input_ = torch.randn(20, 100, 35, 45)
+        >>> output = m(input_)
 
     """
 
-    def _check_input_dim(self, input):
-        if input.dim() != 4:
+    def _check_input_dim(self, input_):
+        if input_.dim() != 4:
             raise ValueError('expected 4D input (got {}D input)'
-                             .format(input.dim()))
+                             .format(input_.dim()))
 
diff --git a/pywick/gridsearch/gridsearch.py b/pywick/gridsearch/gridsearch.py
index 1baa893..132a26a 100644
--- a/pywick/gridsearch/gridsearch.py
+++ b/pywick/gridsearch/gridsearch.py
@@ -1,7 +1,7 @@
 import random
 import collections
 
-class GridSearch(object):
+class GridSearch:
     """
     Simple GridSearch to apply to a generic function
 
@@ -60,7 +60,7 @@ def _execute(self, input_args, available_args):
 
         # get all keys
         keys = available_args.keys()
-        keys_to_remove = list()
+        keys_to_remove = []
 
         for i, key in enumerate(keys):
             values = available_args.get(key)
@@ -78,11 +78,10 @@ def _execute(self, input_args, available_args):
 
                 available_args[key] = values  # replace values so they can be used in the next iterative call
                 break    # don't do any more iterations after we handled the first key with multiple choices
-            else:
-                input_args[key] = values
-                keys_to_remove.append(key)
-                if (i+1) == len(keys):        # we've reached the final item in the available args
-                    self._execute(input_args, dict())
+            input_args[key] = values
+            keys_to_remove.append(key)
+            if (i+1) == len(keys):        # we've reached the final item in the available args
+                self._execute(input_args, {})
 
     def run(self):
         """
@@ -90,5 +89,5 @@ def run(self):
         :return:
         """
 
-        input_args = dict()
+        input_args = {}
         self._execute(input_args, self.args)
diff --git a/pywick/gridsearch/pipeline.py b/pywick/gridsearch/pipeline.py
index f00c836..7436162 100644
--- a/pywick/gridsearch/pipeline.py
+++ b/pywick/gridsearch/pipeline.py
@@ -8,7 +8,7 @@ def merge_dicts(*dict_args):
         result.update(dictionary)
     return result
 
-class Pipeline(object):
+class Pipeline:
     """
     Defines a pipeline for operating on data. Output of first function will be passed to the second and so forth.
 
@@ -25,12 +25,12 @@ def __init__(self, ordered_func_list, func_args=None):
         self.func_args = func_args
         self.output = None
 
-    def call(self, input):
+    def call(self, input_):
         """Apply the functions in current Pipeline to an input.
 
-        :param input: The input to process with the Pipeline.
+        :param input_: The input to process with the Pipeline.
         """
-        out = input
+        out = input_
         for pipe in self.pipes:
             if pipe.__name__ in self.func_args:     # if additional arguments present
                 all_args = self.func_args[pipe.__name__]
diff --git a/pywick/image_utils.py b/pywick/image_utils.py
index 78b5bef..6435786 100644
--- a/pywick/image_utils.py
+++ b/pywick/image_utils.py
@@ -30,12 +30,12 @@ def draw_dice_on_image(label, prob, threshold=125, is_0_255=False):
     results[miss] = np.array([255,255,255])
     results[hit]  = np.array([19,138,249])
     results[fp]   = np.array([246,249,16])
-    results = results.reshape(H,W,3)
+    results = results.reshape((H, W, 3))
 
     return results
 
 
-def draw_mask_on_image(image, mask, bg_color=(19, 138, 249), mask_color=[255, 255, 0], threshold=125, foreground_alpha=[1.0, 1.0, 0.5], is_0_255=False):
+def draw_mask_on_image(image, mask, bg_color=(19, 138, 249), mask_color=None, threshold=125, foreground_alpha=None, is_0_255=False):
     '''
 
     Draws a mask on top of the original image. This is pretty CPU intensive so may want to revise for production environment
@@ -52,6 +52,10 @@ def draw_mask_on_image(image, mask, bg_color=(19, 138, 249), mask_color=[255, 25
     :return: numpy array containing composite image [RGB]
 
     '''
+    if mask_color is None:
+        mask_color = [255, 255, 0]
+    if foreground_alpha is None:
+        foreground_alpha = [1.0, 1.0, 0.5]
     if not is_0_255:
         image = image * 255
         mask = mask * 255
@@ -60,7 +64,8 @@ def draw_mask_on_image(image, mask, bg_color=(19, 138, 249), mask_color=[255, 25
 
     H, W, _ = image.shape
 
-    assert (H,W) == mask.shape, "image size does not equal mask size!"
+    if (H,W) != mask.shape:
+        raise AssertionError("image size does not equal mask size!")
 
     results = np.zeros((H, W, 3), np.uint8)  # create new image and fill with zeros
     results[...] = bg_color     # fill entire image with bg_color at first
diff --git a/pywick/initializers.py b/pywick/initializers.py
index 53bbb39..b900c87 100644
--- a/pywick/initializers.py
+++ b/pywick/initializers.py
@@ -26,7 +26,7 @@ def _validate_initializer_string(init):
         raise ValueError('Invalid loss input')
 
 
-class InitializerContainer(object):
+class InitializerContainer:
 
     def __init__(self, initializers):
         self._initializers = initializers
@@ -36,7 +36,7 @@ def apply(self, model):
             model.apply(initializer)
 
 
-class Initializer(object):
+class Initializer:
     """
     Blank Initializer class from which all other Initializers must inherit
     """
diff --git a/pywick/losses.py b/pywick/losses.py
index f8f1823..b7beae6 100644
--- a/pywick/losses.py
+++ b/pywick/losses.py
@@ -39,13 +39,22 @@
 import numpy as np
 import torch
 import math
-from .models.segmentation.testnets.drnet.drnet import DRCLoss
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd import Function
 from torch.autograd import Variable
 from torch import Tensor
-from typing import Iterable, Set, Any
+from typing import Iterable, Set
+
+
+__all__ = ['ActiveContourLoss', 'ActiveContourLossAlt', 'AngularPenaltySMLoss', 'AsymLoss', 'BCELoss2d', 'BCEDiceLoss',
+           'BCEWithLogitsViewLoss', 'BCEDiceTL1Loss', 'BCEDicePenalizeBorderLoss', 'BCEDiceFocalLoss', 'BinaryFocalLoss',
+           'ComboBCEDiceLoss', 'ComboSemsegLossWeighted', 'EncNetLoss', 'FocalLoss', 'FocalLoss2',
+           'HausdorffERLoss', 'HausdorffDTLoss', 'LovaszSoftmax', 'mIoULoss', 'MixSoftmaxCrossEntropyOHEMLoss',
+           'MSE3D', 'OhemCELoss', 'OhemCrossEntropy2d', 'OhemBCEDicePenalizeBorderLoss', 'PoissonLoss',
+           'PoissonLoss3d', 'RecallLoss', 'RMILoss', 'RMILossAlt', 'RMIBCEDicePenalizeBorderLoss', 'SoftInvDiceLoss',
+           'SoftDiceLoss', 'StableBCELoss', 'TverskyLoss', 'ThresholdedL1Loss', 'WeightedSoftDiceLoss', 'WeightedBCELoss2d',
+           'BDLoss', 'L1Loss3d', 'WingLoss', 'BoundaryLoss']
 
 VOID_LABEL = 255
 N_CLASSES = 1
@@ -55,9 +64,10 @@ class StableBCELoss(nn.Module):
     def __init__(self, **_):
         super(StableBCELoss, self).__init__()
 
-    def forward(self, input, target, **_):
-        neg_abs = - input.abs()
-        loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
+    @staticmethod
+    def forward(input_, target, **_):
+        neg_abs = - input_.abs()
+        loss = input_.clamp(min=0) - input_ * target + (1 + neg_abs.exp()).log()
         return loss.mean()
 
 
@@ -121,7 +131,7 @@ def gamma_fast(gt, permutation):
     return jaccard
 
 # WARN: Only applicable to Binary Segmentation right now (zip function needs to be replaced)!
-def lovaszloss(logits, labels, prox=False, max_steps=20, debug={}):
+def lovaszloss(logits, labels, prox=False, max_steps=20, debug=None):
     """
     `The Lovasz-Softmax loss <https://arxiv.org/abs/1705.08790>`_
 
@@ -132,6 +142,8 @@ def lovaszloss(logits, labels, prox=False, max_steps=20, debug={}):
     :param debug:
     :return:
     """
+    if debug is None:
+        debug = {}
 
     # image-level Lovasz hinge
     if logits.size(0) == 1:
@@ -216,7 +228,9 @@ def project(gam, active, members):
         gam[active + 1:] = 0.
 
 
-def find_proximal(x0, gam, lam, eps=1e-6, max_steps=20, debug={}):
+def find_proximal(x0, gam, lam, eps=1e-6, max_steps=20, debug=None):
+    if debug is None:
+        debug = {}
     # x0: sorted margins data
     # gam: initial gamma_fast(target, perm)
     # regularisation parameter lam
@@ -268,7 +282,9 @@ def find_proximal(x0, gam, lam, eps=1e-6, max_steps=20, debug={}):
     return x, gam
 
 
-def lovasz_binary(margins, label, prox=False, max_steps=20, debug={}):
+def lovasz_binary(margins, label, prox=False, max_steps=20, debug=None):
+    if debug is None:
+        debug = {}
     # 1d vector inputs
     # Workaround: can't sort Variable bug
     # prox: False or lambda regularization value
@@ -284,7 +300,9 @@ def lovasz_binary(margins, label, prox=False, max_steps=20, debug={}):
         return loss
 
 
-def lovasz_single(logit, label, prox=False, max_steps=20, debug={}):
+def lovasz_single(logit, label, prox=False, max_steps=20, debug=None):
+    if debug is None:
+        debug = {}
     # single images
     mask = (label.view(-1) != 255)
     num_preds = mask.long().sum()
@@ -313,7 +331,8 @@ def dice_coefficient(logit, label, isCuda=True):
     A = A.clone()
     B = B.clone()
 
-    assert len(A) == len(B)
+    if len(A) != len(B):
+        raise AssertionError
 
     for i in list(range(len(A))):
         if A[i] > 0.5:
@@ -341,7 +360,8 @@ class WeightedSoftDiceLoss(torch.nn.Module):
     def __init__(self, **_):
         super(WeightedSoftDiceLoss, self).__init__()
 
-    def forward(self, logits, labels, weights, **_):
+    @staticmethod
+    def forward(logits, labels, weights, **_):
         probs = torch.sigmoid(logits)
         num   = labels.size(0)
         w     = weights.view(num,-1)
@@ -465,7 +485,9 @@ class BCEDiceFocalLoss(nn.Module):
         :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch.
         :param weights: (list(), default = [1,1,1]) Optional weighing (0.0-1.0) of the losses in order of [bce, dice, focal]
     '''
-    def __init__(self, focal_param, weights=[1.0,1.0,1.0], **kwargs):
+    def __init__(self, focal_param, weights=None, **kwargs):
+        if weights is None:
+            weights = [1.0,1.0,1.0]
         super(BCEDiceFocalLoss, self).__init__()
         self.bce = BCEWithLogitsViewLoss(weight=None, size_average=True, **kwargs)
         self.dice = SoftDiceLoss(**kwargs)
@@ -490,7 +512,8 @@ class WeightedBCELoss2d(nn.Module):
     def __init__(self, **_):
         super(WeightedBCELoss2d, self).__init__()
 
-    def forward(self, logits, labels, weights, **_):
+    @staticmethod
+    def forward(logits, labels, weights, **_):
         w = weights.view(-1)            # (-1 operation flattens all the dimensions)
         z = logits.view(-1)             # (-1 operation flattens all the dimensions)
         t = labels.view(-1)             # (-1 operation flattens all the dimensions)
@@ -499,24 +522,6 @@ def forward(self, logits, labels, weights, **_):
         return loss
 
 
-class WeightedSoftDiceLoss(nn.Module):
-    def __init__(self, **_):
-        super(WeightedSoftDiceLoss, self).__init__()
-
-    def forward(self, logits, labels, weights, **_):
-        probs = torch.sigmoid(logits)
-        num   = labels.size(0)
-        w     = (weights).view(num,-1)
-        w2    = w*w
-        m1    = (probs).view(num, -1)
-        m2    = (labels).view(num, -1)
-        intersection = (m1 * m2)
-        smooth = 1.
-        score = 2. * ((w2*intersection).sum(1)+smooth) / ((w2*m1).sum(1) + (w2*m2).sum(1)+smooth)
-        score = 1 - score.sum()/num
-        return score
-
-
 class BCEDicePenalizeBorderLoss(nn.Module):
     def __init__(self, kernel_size=55, **_):
         super(BCEDicePenalizeBorderLoss, self).__init__()
@@ -575,7 +580,8 @@ def __init__(self, num_class, alpha=None, gamma=2, balance_index=-1, smooth=None
         if self.alpha is None:
             self.alpha = torch.ones(self.num_class, 1)
         elif isinstance(self.alpha, (list, np.ndarray)):
-            assert len(self.alpha) == self.num_class
+            if len(self.alpha) != self.num_class:
+                raise AssertionError
             self.alpha = torch.FloatTensor(alpha).view(self.num_class, 1)
             self.alpha = self.alpha / self.alpha.sum()
         elif isinstance(self.alpha, float):
@@ -592,7 +598,7 @@ def __init__(self, num_class, alpha=None, gamma=2, balance_index=-1, smooth=None
 
     def forward(self, logits, labels, **_):
 
-        # logit = F.softmax(input, dim=1)
+        # logits = F.softmax(logits, dim=1)
 
         if logits.dim() > 2:
             # N,C,d1,d2 -> N,C,m (m=d1*d2*...)
@@ -746,7 +752,8 @@ def __init__(self, bias=1e-12, **_):
         super().__init__()
         self.bias = bias
 
-    def forward(self, output, target, **_):
+    @staticmethod
+    def forward(output, target, **_):
         # _assert_no_grad(target)
         with torch.no_grad:  # Pytorch 0.4.0 replacement (should be ok to use like this)
             lag = target.size(1) - output.size(1)
@@ -757,7 +764,8 @@ class MSE3D(nn.Module):
     def __init__(self, **_):
         super().__init__()
 
-    def forward(self, output, target, **_):
+    @staticmethod
+    def forward(output, target, **_):
         # _assert_no_grad(target)
         with torch.no_grad:  # Pytorch 0.4.0 replacement (should be ok to use like this)
             lag = target.size(1) - output.size(1)
@@ -772,15 +780,15 @@ class BCEWithLogitsViewLoss(nn.BCEWithLogitsLoss):
     def __init__(self, weight=None, size_average=True, **_):
         super().__init__(weight=weight, size_average=size_average)
 
-    def forward(self, input, target, **_):
+    def forward(self, input_, target, **_):
         '''
-        :param input:
+        :param input_:
         :param target:
         :return:
 
-        Simply passes along input.view(-1), target.view(-1)
+        Simply passes along input_.view(-1), target.view(-1)
         '''
-        return super().forward(input.view(-1), target.view(-1))
+        return super().forward(input_.view(-1), target.view(-1))
 
 
 # ===================== #
@@ -921,7 +929,7 @@ def __init__(self, use_running_mean=False, bce_weight=1, dice_weight=1, eps=1e-6
         self.bce_weight = bce_weight
         self.dice_weight = dice_weight
 
-        if self.use_running_mean == True:
+        if self.use_running_mean:
             self.register_buffer('running_bce_loss', torch.zeros(1))
             self.register_buffer('running_dice_loss', torch.zeros(1))
             self.reset_parameters()
@@ -937,11 +945,15 @@ def reset_parameters(self):
     def forward(self, outputs, labels, **_):
         # inputs and targets are assumed to be BxCxWxH (batch, color, width, height)
         outputs = outputs.squeeze()       # necessary in case we're dealing with binary segmentation (color dim of 1)
-        assert len(outputs.shape) == len(labels.shape)
+        if len(outputs.shape) != len(labels.shape):
+            raise AssertionError("outputs and labels must have the same number of dimensions")
         # assert that B, W and H are the same
-        assert outputs.size(-0) == labels.size(-0)
-        assert outputs.size(-1) == labels.size(-1)
-        assert outputs.size(-2) == labels.size(-2)
+        if outputs.size(-0) != labels.size(-0):
+            raise AssertionError("outputs and labels differ in size along dim 0")
+        if outputs.size(-1) != labels.size(-1):
+            raise AssertionError("outputs and labels differ in size along dim -1")
+        if outputs.size(-2) != labels.size(-2):
+            raise AssertionError("outputs and labels differ in size along dim -2")
 
         bce_loss = self.bce_logits_loss(outputs, labels)
 
@@ -951,7 +963,7 @@ def forward(self, outputs, labels, **_):
         union = dice_output.sum() + dice_target.sum() + self.eps
         dice_loss = (-torch.log(2 * intersection / union))
 
-        if self.use_running_mean == False:
+        if not self.use_running_mean:
             bmw = self.bce_weight
             dmw = self.dice_weight
             # loss += torch.clamp(1 - torch.log(2 * intersection / union),0,100)  * self.dice_weight
@@ -998,7 +1010,7 @@ def __init__(self,
         self.bce_weight = bce_weight
         self.dice_weight = dice_weight
 
-        if self.use_running_mean == True:
+        if self.use_running_mean:
             self.register_buffer('running_bce_loss', torch.zeros(1))
             self.register_buffer('running_dice_loss', torch.zeros(1))
             self.reset_parameters()
@@ -1013,17 +1025,24 @@ def reset_parameters(self):
 
     def forward(self, logits, labels, weights, **_):
         # logits and labels are assumed to be BxCxWxH
-        assert len(logits.shape) == len(labels.shape)
+        if len(logits.shape) != len(labels.shape):
+            raise AssertionError
         # assert that B, W and H are the same
-        assert logits.size(0) == labels.size(0)
-        assert logits.size(2) == labels.size(2)
-        assert logits.size(3) == labels.size(3)
+        if logits.size(0) != labels.size(0):
+            raise AssertionError
+        if logits.size(2) != labels.size(2):
+            raise AssertionError
+        if logits.size(3) != labels.size(3):
+            raise AssertionError
 
         # weights are assumed to be BxWxH
         # assert that B, W and H are the are the same for target and mask
-        assert logits.size(0) == weights.size(0)
-        assert logits.size(2) == weights.size(1)
-        assert logits.size(3) == weights.size(2)
+        if logits.size(0) != weights.size(0):
+            raise AssertionError
+        if logits.size(2) != weights.size(1):
+            raise AssertionError
+        if logits.size(3) != weights.size(2):
+            raise AssertionError
 
         if self.use_weight_mask:
             bce_loss = F.binary_cross_entropy_with_logits(input=logits,
@@ -1039,7 +1058,7 @@ def forward(self, logits, labels, weights, **_):
         union = dice_output.sum() + dice_target.sum() + self.eps
         dice_loss = (-torch.log(2 * intersection / union))
 
-        if self.use_running_mean == False:
+        if self.use_running_mean is False:
             bmw = self.bce_weight
             dmw = self.dice_weight
             # loss += torch.clamp(1 - torch.log(2 * intersection / union),0,100)  * self.dice_weight
@@ -1211,7 +1230,7 @@ def forward(self, *inputs, **_):
         if self.aux:
             return dict(loss=self._aux_forward(*inputs))
         else:
-            return dict(loss=super(MixSoftmaxCrossEntropyOHEMLoss, self).forward(*inputs))
+            return dict(loss=super(MixSoftmaxCrossEntropyOHEMLoss, self).forward(preds, target))
 
 
 # ====================== #
@@ -1330,191 +1349,6 @@ def _scale_target(targets_, scaled_size):
         return targets.squeeze(1).long()
 
 
-# ====================== #
-# Source: https://github.com/Hsuxu/Loss_ToolBox-PyTorch/blob/master/TverskyLoss/binarytverskyloss.py (MIT)
-class FocalBinaryTverskyFunc(Function):
-    """
-        Focal Tversky Loss as defined in `this paper <https://arxiv.org/abs/1810.07842>`_
-
-        `Authors' implementation <https://github.com/nabsabraham/focal-tversky-unet>`_ in Keras.
-
-        Params:
-            :param alpha: controls the penalty for false positives.
-            :param beta: penalty for false negative.
-            :param gamma : focal coefficient range[1,3]
-            :param reduction: return mode
-
-        Notes:
-            alpha = beta = 0.5 => dice coeff
-            alpha = beta = 1 => tanimoto coeff
-            alpha + beta = 1 => F beta coeff
-            add focal index -> loss=(1-T_index)**(1/gamma)
-    """
-
-    def __init__(ctx, alpha=0.5, beta=0.7, gamma=1.0, reduction='mean', **_):
-        """
-        :param alpha: controls the penalty for false positives.
-        :param beta: penalty for false negative.
-        :param gamma : focal coefficient range[1,3]
-        :param reduction: return mode
-        Notes:
-        alpha = beta = 0.5 => dice coeff
-        alpha = beta = 1 => tanimoto coeff
-        alpha + beta = 1 => F beta coeff
-        add focal index -> loss=(1-T_index)**(1/gamma)
-        """
-        ctx.alpha = alpha
-        ctx.beta = beta
-        ctx.epsilon = 1e-6
-        ctx.reduction = reduction
-        ctx.gamma = gamma
-        sum = ctx.beta + ctx.alpha
-        if sum != 1:
-            ctx.beta = ctx.beta / sum
-            ctx.alpha = ctx.alpha / sum
-
-    # @staticmethod
-    def forward(ctx, input, target, **_):
-        batch_size = input.size(0)
-        _, input_label = input.max(1)
-
-        input_label = input_label.float()
-        target_label = target.float()
-
-        ctx.save_for_backward(input, target_label)
-
-        input_label = input_label.view(batch_size, -1)
-        target_label = target_label.view(batch_size, -1)
-
-        ctx.P_G = torch.sum(input_label * target_label, 1)  # TP
-        ctx.P_NG = torch.sum(input_label * (1 - target_label), 1)  # FP
-        ctx.NP_G = torch.sum((1 - input_label) * target_label, 1)  # FN
-
-        index = ctx.P_G / (ctx.P_G + ctx.alpha * ctx.P_NG + ctx.beta * ctx.NP_G + ctx.epsilon)
-        loss = torch.pow((1 - index), 1 / ctx.gamma)
-        # target_area = torch.sum(target_label, 1)
-        # loss[target_area == 0] = 0
-        if ctx.reduction == 'none':
-            loss = loss
-        elif ctx.reduction == 'sum':
-            loss = torch.sum(loss)
-        else:
-            loss = torch.mean(loss)
-        return loss
-
-    # @staticmethod
-    def backward(ctx, grad_out):
-        """
-        :param ctx:
-        :param grad_out:
-        :return:
-        d_loss/dT_loss=(1/gamma)*(T_loss)**(1/gamma-1)
-        (dT_loss/d_P1)  = 2*P_G*[G*(P_G+alpha*P_NG+beta*NP_G)-(G+alpha*NG)]/[(P_G+alpha*P_NG+beta*NP_G)**2]
-                        = 2*P_G
-        (dT_loss/d_p0)=
-        """
-        inputs, target = ctx.saved_tensors
-        inputs = inputs.float()
-        target = target.float()
-        batch_size = inputs.size(0)
-        sum = ctx.P_G + ctx.alpha * ctx.P_NG + ctx.beta * ctx.NP_G + ctx.epsilon
-        P_G = ctx.P_G.view(batch_size, 1, 1, 1, 1)
-        if inputs.dim() == 5:
-            sum = sum.view(batch_size, 1, 1, 1, 1)
-        elif inputs.dim() == 4:
-            sum = sum.view(batch_size, 1, 1, 1)
-            P_G = ctx.P_G.view(batch_size, 1, 1, 1)
-        sub = (ctx.alpha * (1 - target) + target) * P_G
-
-        dL_dT = (1 / ctx.gamma) * torch.pow((P_G / sum), (1 / ctx.gamma - 1))
-        dT_dp0 = -2 * (target / sum - sub / sum / sum)
-        dL_dp0 = dL_dT * dT_dp0
-
-        dT_dp1 = ctx.beta * (1 - target) * P_G / sum / sum
-        dL_dp1 = dL_dT * dT_dp1
-        grad_input = torch.cat((dL_dp1, dL_dp0), dim=1)
-        # grad_input = torch.cat((grad_out.item() * dL_dp0, dL_dp0 * grad_out.item()), dim=1)
-        return grad_input, None
-
-
-class MultiTverskyLoss(nn.Module):
-    """
-    Tversky Loss for segmentation adaptive with multi class segmentation
-
-    Args
-        :param alpha (Tensor, float, optional): controls the penalty for false positives.
-        :param beta (Tensor, float, optional): controls the penalty for false negative.
-        :param gamma (Tensor, float, optional): focal coefficient
-        :param weights (Tensor, optional): a manual rescaling weight given to each class. If given, it has to be a Tensor of size `C`
-    """
-
-    def __init__(self, alpha=0.5, beta=0.5, gamma=1.0, reduction='mean', weights=None, **_):
-        """
-        :param alpha (Tensor, float, optional): controls the penalty for false positives.
-        :param beta (Tensor, float, optional): controls the penalty for false negative.
-        :param gamma (Tensor, float, optional): focal coefficient
-        :param weights (Tensor, optional): a manual rescaling weight given to each
-            class. If given, it has to be a Tensor of size `C`
-        """
-        super(MultiTverskyLoss, self).__init__()
-        self.alpha = alpha
-        self.beta = beta
-        self.gamma = gamma
-        self.reduction = reduction
-        self.weights = weights
-
-    def forward(self, inputs, labels, **_):
-        num_class = inputs.size(1)
-        weight_losses = 0.0
-        if self.weights is not None:
-            assert len(self.weights) == num_class, 'number of classes should be equal to length of weights '
-            weights = self.weights
-        else:
-            weights = [1.0 / num_class] * num_class
-        input_slices = torch.split(inputs, [1] * num_class, dim=1)
-        for idx in range(num_class):
-            input_idx = input_slices[idx]
-            input_idx = torch.cat((1 - input_idx, input_idx), dim=1)
-            target_idx = (labels == idx) * 1
-            loss_func = FocalBinaryTverskyFunc(self.alpha, self.beta, self.gamma, self.reduction)
-            loss_idx = loss_func(input_idx, target_idx)
-            weight_losses+=loss_idx * weights[idx]
-        # loss = torch.Tensor(weight_losses)
-        # loss = loss.to(inputs.device)
-        # loss = torch.sum(loss)
-        return weight_losses
-
-
-class FocalBinaryTverskyLoss(MultiTverskyLoss):
-    """
-            Binary version of Focal Tversky Loss as defined in `this paper <https://arxiv.org/abs/1810.07842>`_
-
-            `Authors' implementation <https://github.com/nabsabraham/focal-tversky-unet>`_ in Keras.
-
-            Params:
-                :param alpha: controls the penalty for false positives.
-                :param beta: penalty for false negative.
-                :param gamma : focal coefficient range[1,3]
-                :param reduction: return mode
-
-            Notes:
-                alpha = beta = 0.5 => dice coeff
-                alpha = beta = 1 => tanimoto coeff
-                alpha + beta = 1 => F beta coeff
-                add focal index -> loss=(1-T_index)**(1/gamma)
-        """
-
-    def __init__(self, alpha=0.5, beta=0.7, gamma=1.0, reduction='mean', **_):
-        """
-        :param alpha (Tensor, float, optional): controls the penalty for false positives.
-        :param beta (Tensor, float, optional): controls the penalty for false negative.
-        :param gamma (Tensor, float, optional): focal coefficient
-        """
-        super().__init__(alpha, beta, gamma, reduction)
-
-    def forward(self, inputs, labels, **_):
-        return super().forward(inputs, labels.unsqueeze(1))
-
 # ===================== #
 # Source: https://github.com/Hsuxu/Loss_ToolBox-PyTorch/blob/master/LovaszSoftmax/lovasz_loss.py
 def lovasz_grad(gt_sorted):
@@ -1537,8 +1371,10 @@ def __init__(self, reduction='mean', **_):
         super(LovaszSoftmax, self).__init__()
         self.reduction = reduction
 
-    def prob_flatten(self, input, target):
-        assert input.dim() in [4, 5]
+    @staticmethod
+    def prob_flatten(input, target):
+        if input.dim() not in [4, 5]:
+            raise AssertionError
         num_class = input.size(1)
         if input.dim() == 4:
             input = input.permute(0, 2, 3, 1).contiguous()
@@ -1747,16 +1583,21 @@ def one_hot(t: Tensor, axis=1) -> bool:
 
 def numpy_haussdorf(pred: np.ndarray, target: np.ndarray) -> float:
     from scipy.spatial.distance import directed_hausdorff
-    assert len(pred.shape) == 2
-    assert pred.shape == target.shape
+    if len(pred.shape) != 2:
+        raise AssertionError
+    if pred.shape != target.shape:
+        raise AssertionError
 
     return max(directed_hausdorff(pred, target)[0], directed_hausdorff(target, pred)[0])
 
 
 def haussdorf(preds: Tensor, target: Tensor) -> Tensor:
-    assert preds.shape == target.shape
-    assert one_hot(preds)
-    assert one_hot(target)
+    if preds.shape != target.shape:
+        raise AssertionError
+    if not one_hot(preds):
+        raise AssertionError
+    if not one_hot(target):
+        raise AssertionError
 
     B, C, _, _ = preds.shape
 
@@ -1853,8 +1694,8 @@ def get_tp_fp_fn(net_output, gt, axes=None, mask=None, square=False):
 def compute_sdf(img_gt, out_shape):
     """
     compute the signed distance map of binary mask
-    input: segmentation, shape = (batch_size, x, y, z)
-    output: the Signed Distance Map (SDM)
+    img_gt: segmentation, shape = (batch_size, x, y, z)
+    returns: the Signed Distance Map (SDM)
     sdf(x) = 0; x in segmentation boundary
              -inf|x-y|; x in segmentation
              +inf|x-y|; x out of segmentation
@@ -2008,7 +1849,8 @@ def __init__(self, in_features, out_features, loss_type='arcface', eps=1e-7, s=N
         '''
         super(AngularPenaltySMLoss, self).__init__()
         loss_type = loss_type.lower()
-        assert loss_type in ['arcface', 'sphereface', 'cosface']
+        if loss_type not in ['arcface', 'sphereface', 'cosface']:
+            raise AssertionError
         if loss_type == 'arcface':
             self.s = 64.0 if not s else s
             self.m = 0.5 if not m else m
@@ -2028,9 +1870,12 @@ def forward(self, x, labels, **_):
         '''
         input shape (N, in_features)
         '''
-        assert len(x) == len(labels)
-        assert torch.min(labels) >= 0
-        assert torch.max(labels) < self.out_features
+        if len(x) != len(labels):
+            raise AssertionError
+        if torch.min(labels) < 0:
+            raise AssertionError
+        if torch.max(labels) >= self.out_features:
+            raise AssertionError
 
         for W in self.fc.parameters():
             W = F.normalize(W, p=2, dim=1)
@@ -2167,13 +2012,16 @@ def __init__(self,
 
         self.num_classes = num_classes
         # radius choices
-        assert rmi_radius in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        if rmi_radius not in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
+            raise AssertionError
         self.rmi_radius = rmi_radius
-        assert rmi_pool_way in [0, 1, 2, 3]
+        if rmi_pool_way not in [0, 1, 2, 3]:
+            raise AssertionError
         self.rmi_pool_way = rmi_pool_way
 
         # set the pool_size = rmi_pool_stride
-        assert rmi_pool_size == rmi_pool_stride
+        if rmi_pool_size != rmi_pool_stride:
+            raise AssertionError
         self.rmi_pool_size = rmi_pool_size
         self.rmi_pool_stride = rmi_pool_stride
         self.weight_lambda = loss_weight_lambda
@@ -2280,7 +2128,8 @@ def rmi_lower_bound(self, labels_4D, probs_4D):
             labels_4D 	:	[N, C, H, W], dtype=float32
             probs_4D 	:	[N, C, H, W], dtype=float32
         """
-        assert labels_4D.size() == probs_4D.size()
+        if labels_4D.size() != probs_4D.size():
+            raise AssertionError
 
         p, s = self.rmi_pool_size, self.rmi_pool_stride
         if self.rmi_pool_stride > 1:
@@ -2474,23 +2323,24 @@ def forward(self, logits, labels, **_):
             logits = torch.sigmoid(logits)
 
         # Calculate RMI loss
-        rmi = self.rmi_loss(input=logits, target=labels)
+        rmi = self.rmi_loss(input_=logits, target=labels)
         rmi = rmi.mean() * (1.0 - self.bce_weight)
         return rmi + bce
 
-    def rmi_loss(self, input, target):
+    def rmi_loss(self, input_, target):
         """
         Calculates the RMI loss between the prediction and target.
         :return:
             RMI loss
         """
 
-        assert input.shape == target.shape
+        if input_.shape != target.shape:
+            raise AssertionError
         vector_size = self.radius * self.radius
 
         # Get region vectors
         y = self.extract_region_vector(target)
-        p = self.extract_region_vector(input)
+        p = self.extract_region_vector(input_)
 
         # Convert to doubles for better precision
         if self.use_double_precision:
@@ -2672,7 +2522,8 @@ def __init__(self, alpha=2.0, **_):
         self.alpha = alpha
 
     @torch.no_grad()
-    def distance_field(self, img: np.ndarray) -> np.ndarray:
+    @staticmethod
+    def distance_field(img: np.ndarray) -> np.ndarray:
         field = np.zeros_like(img)
 
         for batch in range(len(img)):
@@ -2696,8 +2547,10 @@ def forward(self, logits: torch.Tensor, labels: torch.Tensor, debug=False, **_)
         """
         labels = labels.unsqueeze(1)
 
-        assert logits.dim() == 4 or logits.dim() == 5, "Only 2D and 3D supported"
-        assert (logits.dim() == labels.dim()), "Prediction and target need to be of same dimension"
+        if logits.dim() not in (4, 5):
+            raise AssertionError("Only 2D and 3D supported")
+        if (logits.dim() != labels.dim()):
+            raise AssertionError("Prediction and target need to be of same dimension")
 
         # this is necessary for binary loss
         logits = torch.sigmoid(logits)
@@ -2794,8 +2647,10 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor, debug=False) -> torc
         target: (b, 1, x, y, z) or (b, 1, x, y)
         """
         target = target.unsqueeze(1)
-        assert pred.dim() == 4 or pred.dim() == 5, "Only 2D and 3D supported"
-        assert (pred.dim() == target.dim()), "Prediction and target need to be of same dimension"
+        if pred.dim() not in (4, 5):
+            raise AssertionError("Only 2D and 3D supported")
+        if (pred.dim() != target.dim()):
+            raise AssertionError("Prediction and target need to be of same dimension")
 
         pred = torch.sigmoid(pred)
 
@@ -2923,7 +2778,7 @@ def forward(self, logits, labels, **_):
                 raise Exception(f'Non-matching shapes for logits ({logits.shape}) and labels ({labels.shape})')
 
         # Calculate RMI loss
-        rmi = self.rmi_loss(input=torch.sigmoid(logits), target=labels)
+        rmi = self.rmi_loss(input_=torch.sigmoid(logits), target=labels)
         bce = self.bce(logits, labels)
         # rmi = rmi.mean() * (1.0 - self.bce_weight)
         return self.rmi_weight * rmi + self.bce_weight * bce
\ No newline at end of file
diff --git a/pywick/lovasz_losses.py b/pywick/lovasz_losses.py
index 477da2c..053cbbe 100644
--- a/pywick/lovasz_losses.py
+++ b/pywick/lovasz_losses.py
@@ -5,8 +5,6 @@
 Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License)
 """
 
-from __future__ import print_function, division
-
 import torch
 from torch.autograd import Variable
 import torch.nn.functional as F
@@ -81,7 +79,7 @@ def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False):
 def lovasz_hinge(logits, labels, per_image=True, ignore=None):
     """
     Binary Lovasz hinge loss
-      logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
+      logits: [B, H, W] Variable, logits at each pixel (between -inf and +inf)
       labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
       per_image: compute the loss per image instead of per batch
       ignore: void class id
@@ -97,7 +95,7 @@ def lovasz_hinge(logits, labels, per_image=True, ignore=None):
 def lovasz_hinge_flat(logits, labels):
     """
     Binary Lovasz hinge loss
-      logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
+      logits: [P] Variable, logits at each prediction (between -inf and +inf)
       labels: [P] Tensor, binary ground truth labels (0 or 1)
       ignore: label to ignore
     """
@@ -130,27 +128,25 @@ def flatten_binary_scores(scores, labels, ignore=None):
 
 
 class LovaszBinaryLoss(torch.nn.modules.Module):
-    def __init__(self):
-        super(LovaszBinaryLoss, self).__init__()
 
-    def forward(self, input, target):
-        return lovasz_hinge(input, target)
+    @staticmethod
+    def forward(input_, target):
+        return lovasz_hinge(input_, target)
 
 
 class StableBCELoss(torch.nn.modules.Module):
-    def __init__(self):
-        super(StableBCELoss, self).__init__()
 
-    def forward(self, input, target):
-        neg_abs = - input.abs()
-        loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
+    @staticmethod
+    def forward(input_, target):
+        neg_abs = - input_.abs()
+        loss = input_.clamp(min=0) - input_ * target + (1 + neg_abs.exp()).log()
         return loss.mean()
 
 
 def binary_xloss(logits, labels, ignore=None):
     """
     Binary Cross entropy loss
-      logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
+      logits: [B, H, W] Variable, logits at each pixel (between -inf and +inf)
       labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
       ignore: void class id
     """
diff --git a/pywick/meters/apmeter.py b/pywick/meters/apmeter.py
index 56b4d45..57fe690 100755
--- a/pywick/meters/apmeter.py
+++ b/pywick/meters/apmeter.py
@@ -57,25 +57,27 @@ def add(self, output, target, weight=None):
         if output.dim() == 1:
             output = output.view(-1, 1)
         else:
-            assert output.dim() == 2, \
-                'wrong output size (should be 1D or 2D with one column \
-                per class)'
+            if output.dim() != 2:
+                raise AssertionError('wrong output size (should be 1D or 2D with one column \
+                per class)')
         if target.dim() == 1:
             target = target.view(-1, 1)
         else:
-            assert target.dim() == 2, \
-                'wrong target size (should be 1D or 2D with one column \
-                per class)'
+            if target.dim() != 2:
+                raise AssertionError('wrong target size (should be 1D or 2D with one column \
+                per class)')
         if weight is not None:
-            assert weight.dim() == 1, 'Weight dimension should be 1'
-            assert weight.numel() == target.size(0), \
-                'Weight dimension 1 should be the same as that of target'
-            assert torch.min(weight) >= 0, 'Weight should be non-negative only'
-        assert torch.equal(target**2, target), \
-            'targets should be binary (0 or 1)'
+            if weight.dim() != 1:
+                raise AssertionError('Weight dimension should be 1')
+            if weight.numel() != target.size(0):
+                raise AssertionError('Weight dimension 1 should be the same as that of target')
+            if torch.min(weight) < 0:
+                raise AssertionError('Weight should be non-negative only')
+        if not torch.equal(target**2, target):
+            raise AssertionError('targets should be binary (0 or 1)')
         if self.scores.numel() > 0:
-            assert target.size(1) == self.targets.size(1), \
-                'dimensions for output should match previously added examples.'
+            if target.size(1) != self.targets.size(1):
+                raise AssertionError('dimensions for output should match previously added examples.')
 
         # make sure storage is of sufficient size
         if self.scores.storage().size() < self.scores.numel() + output.numel():
diff --git a/pywick/meters/aucmeter.py b/pywick/meters/aucmeter.py
index 13d4be5..04c6427 100755
--- a/pywick/meters/aucmeter.py
+++ b/pywick/meters/aucmeter.py
@@ -34,14 +34,14 @@ def add(self, output, target):
             target = target.cpu().squeeze().numpy()
         elif isinstance(target, numbers.Number):
             target = np.asarray([target])
-        assert np.ndim(output) == 1, \
-            'wrong output size (1D expected)'
-        assert np.ndim(target) == 1, \
-            'wrong target size (1D expected)'
-        assert output.shape[0] == target.shape[0], \
-            'number of outputs and targets does not match'
-        assert np.all(np.add(np.equal(target, 1), np.equal(target, 0))), \
-            'targets should be binary (0, 1)'
+        if np.ndim(output) != 1:
+            raise AssertionError('wrong output size (1D expected)')
+        if np.ndim(target) != 1:
+            raise AssertionError('wrong target size (1D expected)')
+        if output.shape[0] != target.shape[0]:
+            raise AssertionError('number of outputs and targets does not match')
+        if not np.all(np.add(np.equal(target, 1), np.equal(target, 0))):
+            raise AssertionError('targets should be binary (0, 1)')
 
         self.scores = np.append(self.scores, output)
         self.targets = np.append(self.targets, target)
diff --git a/pywick/meters/averagemeter.py b/pywick/meters/averagemeter.py
index 4d2f6d4..0f2a1d5 100644
--- a/pywick/meters/averagemeter.py
+++ b/pywick/meters/averagemeter.py
@@ -1,4 +1,4 @@
-class AverageMeter(object):
+class AverageMeter:
     """Computes and stores the average and current value"""
     def __init__(self):
         self.reset()
diff --git a/pywick/meters/classerrormeter.py b/pywick/meters/classerrormeter.py
index 64b0c5e..eeb5458 100755
--- a/pywick/meters/classerrormeter.py
+++ b/pywick/meters/classerrormeter.py
@@ -5,7 +5,9 @@
 
 
 class ClassErrorMeter(meter.Meter):
-    def __init__(self, topk=[1], accuracy=False):
+    def __init__(self, topk=None, accuracy=False):
+        if topk is None:
+            topk = [1]
         super(ClassErrorMeter, self).__init__()
         self.topk = np.sort(topk)
         self.accuracy = accuracy
@@ -25,12 +27,12 @@ def add(self, output, target):
         if np.ndim(output) == 1:
             output = output[np.newaxis]
         else:
-            assert np.ndim(output) == 2, \
-                'wrong output size (1D or 2D expected)'
-            assert np.ndim(target) == 1, \
-                'target and output do not match'
-        assert target.shape[0] == output.shape[0], \
-            'target and output do not match'
+            if np.ndim(output) != 2:
+                raise AssertionError('wrong output size (1D or 2D expected)')
+            if np.ndim(target) != 1:
+                raise AssertionError('target and output do not match')
+        if target.shape[0] != output.shape[0]:
+            raise AssertionError('target and output do not match')
         topk = self.topk
         maxk = int(topk[-1])  # seems like Python3 wants int and not np.int64
         no = output.shape[0]
@@ -44,8 +46,8 @@ def add(self, output, target):
 
     def value(self, k=-1):
         if k != -1:
-            assert k in self.sum.keys(), \
-                'invalid k (this k was not provided at construction time)'
+            if k not in self.sum.keys():
+                raise AssertionError('invalid k (this k was not provided at construction time)')
             if self.accuracy:
                 return (1. - float(self.sum[k]) / self.n) * 100.0
             else:
diff --git a/pywick/meters/confusionmeter.py b/pywick/meters/confusionmeter.py
index e65c36d..47458fc 100755
--- a/pywick/meters/confusionmeter.py
+++ b/pywick/meters/confusionmeter.py
@@ -38,35 +38,36 @@ def add(self, predicted, target):
         predicted = predicted.cpu().numpy()
         target = target.cpu().numpy()
 
-        assert predicted.shape[0] == target.shape[0], \
-            'number of targets and predicted outputs do not match'
+        if predicted.shape[0] != target.shape[0]:
+            raise AssertionError('number of targets and predicted outputs do not match')
 
         if np.ndim(predicted) != 1:
-            assert predicted.shape[1] == self.k, \
-                'number of predictions does not match size of confusion matrix'
+            if predicted.shape[1] != self.k:
+                raise AssertionError('number of predictions does not match size of confusion matrix')
             predicted = np.argmax(predicted, 1)
         else:
-            assert (predicted.max() < self.k) and (predicted.min() >= 0), \
-                'predicted values are not between 1 and k'
+            if not ((predicted.max() < self.k) and (predicted.min() >= 0)):
+                raise AssertionError('predicted values are not between 1 and k')
 
         onehot_target = np.ndim(target) != 1
         if onehot_target:
-            assert target.shape[1] == self.k, \
-                'Onehot target does not match size of confusion matrix'
-            assert (target >= 0).all() and (target <= 1).all(), \
-                'in one-hot encoding, target values should be 0 or 1'
-            assert (target.sum(1) == 1).all(), \
-                'multi-label setting is not supported'
+            if target.shape[1] != self.k:
+                raise AssertionError('Onehot target does not match size of confusion matrix')
+            if not ((target >= 0).all() and (target <= 1).all()):
+                raise AssertionError('in one-hot encoding, target values should be 0 or 1')
+            if not (target.sum(1) == 1).all():
+                raise AssertionError('multi-label setting is not supported')
             target = np.argmax(target, 1)
         else:
-            assert (predicted.max() < self.k) and (predicted.min() >= 0), \
-                'predicted values are not between 0 and k-1'
+            if not ((predicted.max() < self.k) and (predicted.min() >= 0)):
+                raise AssertionError('predicted values are not between 0 and k-1')
 
         # hack for bincounting 2 arrays together
         x = predicted + self.k * target
         bincount_2d = np.bincount(x.astype(np.int32),
                                   minlength=self.k ** 2)
-        assert bincount_2d.size == self.k ** 2
+        if bincount_2d.size != self.k ** 2:
+            raise AssertionError
         conf = bincount_2d.reshape((self.k, self.k))
 
         self.conf += conf
diff --git a/pywick/meters/meter.py b/pywick/meters/meter.py
index 99e485a..49c7a86 100755
--- a/pywick/meters/meter.py
+++ b/pywick/meters/meter.py
@@ -1,5 +1,5 @@
 
-class Meter(object):
+class Meter:
     """
     Abstract meter class from which all other meters inherit
     """
diff --git a/pywick/metrics.py b/pywick/metrics.py
index 0e88dca..d2aeabc 100644
--- a/pywick/metrics.py
+++ b/pywick/metrics.py
@@ -7,7 +7,7 @@
 def is_iterable(x):
     return isinstance(x, (tuple, list))
 
-class MetricContainer(object):
+class MetricContainer:
     def __init__(self, metrics, prefix=''):
         self.metrics = metrics
         self.helper = None
@@ -29,7 +29,7 @@ def __call__(self, input_batch, output_batch, target_batch, is_val=False):
                 logs[self.prefix + metric._name] = metric_out
         return logs
 
-class Metric(object):
+class Metric:
 
     def __call__(self, inputs, y_pred, y_true, is_val):
         '''
@@ -80,8 +80,6 @@ class CategoricalAccuracySingleInput(CategoricalAccuracy):
     This class is a tiny modification of CategoricalAccuracy to handle the issue when we desire a single output but
     the network outputs multiple y_pred (e.g. inception)
     '''
-    def __init__(self, top_k=1):
-        super().__init__(top_k)
 
     def __call__(self, inputs, y_pred, y_true, is_val=False):
         if is_iterable(y_pred):
diff --git a/pywick/models/classification/__init__.py b/pywick/models/classification/__init__.py
index 5c2e45a..482f25a 100644
--- a/pywick/models/classification/__init__.py
+++ b/pywick/models/classification/__init__.py
@@ -4,6 +4,8 @@
 models with your own number of classes use the ``models.model_utils.get_model(...)`` function and specify the name of the model
 exactly like the pretrained model method name (e.g. if the method name reads ``pywick.models.classification.dpn.dualpath.dpn68`` then use
 `dpn68` as the model name for ``models.model_utils.get_model(...)``.
+
+Note: Since Pywick v0.6.5, we include 200+ models from `rwightman's repo <https://github.com/rwightman/pytorch-image-models>`_, which can be used by simply specifying the appropriate model name (all lowercase) in the yaml configuration file!
 """
 
 from .dpn.dualpath import *                                 # dpnXX = pretrained on imagenet, DPN = not pretrained
diff --git a/pywick/models/classification/bn_inception.py b/pywick/models/classification/bn_inception.py
index b943fe8..0d1c9ec 100644
--- a/pywick/models/classification/bn_inception.py
+++ b/pywick/models/classification/bn_inception.py
@@ -253,8 +253,8 @@ def __init__(self, num_classes=1000):
         self.inception_5b_relu_pool_proj = nn.ReLU (inplace)
         self.last_linear = nn.Linear (1024, num_classes)
 
-    def features(self, input):
-        conv1_7x7_s2_out = self.conv1_7x7_s2(input)
+    def features(self, input_):
+        conv1_7x7_s2_out = self.conv1_7x7_s2(input_)
         conv1_7x7_s2_bn_out = self.conv1_7x7_s2_bn(conv1_7x7_s2_out)
         conv1_relu_7x7_out = self.conv1_relu_7x7(conv1_7x7_s2_bn_out)
         pool1_3x3_s2_out = self.pool1_3x3_s2(conv1_relu_7x7_out)
@@ -492,8 +492,8 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
diff --git a/pywick/models/classification/dpn/adaptive_avgmax_pool.py b/pywick/models/classification/dpn/adaptive_avgmax_pool.py
index cf69d03..739f6d4 100644
--- a/pywick/models/classification/dpn/adaptive_avgmax_pool.py
+++ b/pywick/models/classification/dpn/adaptive_avgmax_pool.py
@@ -52,7 +52,7 @@ def __init__(self, output_size=1, pool_type='avg'):
         super(AdaptiveAvgMaxPool2d, self).__init__()
         self.output_size = output_size
         self.pool_type = pool_type
-        if pool_type == 'avgmaxc' or pool_type == 'avgmax':
+        if pool_type in ('avgmaxc', 'avgmax'):
             self.pool = nn.ModuleList([nn.AdaptiveAvgPool2d(output_size), nn.AdaptiveMaxPool2d(output_size)])
         elif pool_type == 'max':
             self.pool = nn.AdaptiveMaxPool2d(output_size)
diff --git a/pywick/models/classification/dpn/convert_from_mxnet.py b/pywick/models/classification/dpn/convert_from_mxnet.py
index 0c84aa4..039ff57 100644
--- a/pywick/models/classification/dpn/convert_from_mxnet.py
+++ b/pywick/models/classification/dpn/convert_from_mxnet.py
@@ -26,7 +26,8 @@ def _convert_bn(k):
         aux = True
         add = 'moving_var'
     else:
-        assert False, 'Unknown key: %s' % k
+        # No branch above recognised this key, so fail unconditionally.
+        raise AssertionError('Unknown key: %s' % k)
     return aux, add
 
 
@@ -48,20 +49,19 @@ def convert_from_mxnet(model, checkpoint_prefix, debug=False):
                     aux, key_add = _convert_bn(k[3])
                     mxnet_key += key_add
                 else:
-                    assert k[3] == 'weight'
+                    if k[3] != 'weight':
+                        raise AssertionError
                     mxnet_key += 'conv_' + k[3]
             elif k[1] == 'conv5_bn_ac':
                 # bn + ac at end of features block
                 mxnet_key += 'conv5_x_x__relu-sp__bn_'
-                assert k[2] == 'bn'
+                if k[2] != 'bn':
+                    raise AssertionError
                 aux, key_add = _convert_bn(k[3])
                 mxnet_key += key_add
             else:
                 # middle blocks
-                if model.b and 'c1x1_c' in k[2]:
-                    bc_block = True  # b-variant split c-block special treatment
-                else:
-                    bc_block = False
+                bc_block = bool(model.b and 'c1x1_c' in k[2])
                 ck = k[1].split('_')
                 mxnet_key += ck[0] + '_x__' + ck[1] + '_'
                 ck = k[2].split('_')
@@ -75,7 +75,8 @@ def convert_from_mxnet(model, checkpoint_prefix, debug=False):
                     mxnet_key += key_add
                 else:
                     ki = 3 if bc_block else 4
-                    assert k[ki] == 'weight'
+                    if k[ki] != 'weight':
+                        raise AssertionError
                     mxnet_key += 'conv_' + k[ki]
         elif k[0] == 'classifier':
             if 'fc6-1k_weight' in mxnet_weights:
@@ -84,7 +85,8 @@ def convert_from_mxnet(model, checkpoint_prefix, debug=False):
                 mxnet_key += 'fc6_'
             mxnet_key += k[1]
         else:
-            assert False, 'Unexpected token'
+            # k[0] matched none of the handled cases, so fail unconditionally.
+            raise AssertionError('Unexpected token')
 
         if debug:
             print(mxnet_key, '=> ', state_key, end=' ')
@@ -111,11 +113,9 @@ def convert_from_mxnet(model, checkpoint_prefix, debug=False):
 def main():
     args = parser.parse_args()
     if 'dpn' not in args.model:
-        print('Error: Can only convert DPN models.')
-        exit(1)
+        raise Exception('Error: Can only convert DPN models.')
     if not has_mxnet:
-        print('Error: Cannot import MXNet module. Please install.')
-        exit(1)
+        raise Exception('Error: Cannot import MXNet module. Please install.')
 
     model = create_model(args.model, num_classes=1000, pretrained=False)
 
diff --git a/pywick/models/classification/dpn/dualpath.py b/pywick/models/classification/dpn/dualpath.py
index a20a236..412dd5c 100644
--- a/pywick/models/classification/dpn/dualpath.py
+++ b/pywick/models/classification/dpn/dualpath.py
@@ -53,7 +53,8 @@ def dpn68(num_classes=1000, pretrained=False, test_time_pool=True):
         elif has_mxnet and os.path.exists('./pretrained/'):
             convert_from_mxnet(model, checkpoint_prefix='./pretrained/dpn68')
         else:
-            assert False, "Unable to load a pretrained model"
+            # No loading path above applied, so fail unconditionally.
+            raise AssertionError("Unable to load a pretrained model")
     return model
 
 
@@ -74,7 +75,8 @@ def dpn68b(num_classes=1000, pretrained=False, test_time_pool=True):
         elif has_mxnet and os.path.exists('./pretrained/'):
             convert_from_mxnet(model, checkpoint_prefix='./pretrained/dpn68-extra')
         else:
-            assert False, "Unable to load a pretrained model"
+            # No loading path above applied, so fail unconditionally.
+            raise AssertionError("Unable to load a pretrained model")
     return model
 
 
@@ -100,7 +102,8 @@ def dpn92(num_classes=1000, pretrained=False, test_time_pool=True, extra=True):
         elif has_mxnet and os.path.exists('./pretrained/'):
             convert_from_mxnet(model, checkpoint_prefix='./pretrained/' + key)
         else:
-            assert False, "Unable to load a pretrained model"
+            # No loading path above applied, so fail unconditionally.
+            raise AssertionError("Unable to load a pretrained model")
     return model
 
 
@@ -121,7 +124,8 @@ def dpn98(num_classes=1000, pretrained=False, test_time_pool=True):
         elif has_mxnet and os.path.exists('./pretrained/'):
             convert_from_mxnet(model, checkpoint_prefix='./pretrained/dpn98')
         else:
-            assert False, "Unable to load a pretrained model"
+            # No loading path above applied, so fail unconditionally.
+            raise AssertionError("Unable to load a pretrained model")
     return model
 
 
@@ -142,7 +146,8 @@ def dpn131(num_classes=1000, pretrained=False, test_time_pool=True):
         elif has_mxnet and os.path.exists('./pretrained/'):
             convert_from_mxnet(model, checkpoint_prefix='./pretrained/dpn131')
         else:
-            assert False, "Unable to load a pretrained model"
+            # No loading path above applied, so fail unconditionally.
+            raise AssertionError("Unable to load a pretrained model")
     return model
 
 
@@ -163,7 +168,8 @@ def dpn107(num_classes=1000, pretrained=False, test_time_pool=True):
         elif has_mxnet and os.path.exists('./pretrained/'):
             convert_from_mxnet(model, checkpoint_prefix='./pretrained/dpn107-extra')
         else:
-            assert False, "Unable to load a pretrained model"
+            # No loading path above applied, so fail unconditionally.
+            raise AssertionError("Unable to load a pretrained model")
     return model
 
 
@@ -222,7 +228,8 @@ def __init__(
             self.key_stride = 2
             self.has_proj = True
         else:
-            assert block_type is 'normal'
+            if block_type != 'normal':  # compare by value; `is` only tests identity
+                raise AssertionError
             self.key_stride = 1
             self.has_proj = False
 
diff --git a/pywick/models/classification/dpn/model_factory.py b/pywick/models/classification/dpn/model_factory.py
index 79fda71..69d12fe 100644
--- a/pywick/models/classification/dpn/model_factory.py
+++ b/pywick/models/classification/dpn/model_factory.py
@@ -52,11 +52,12 @@ def create_model(model_name, num_classes=1000, pretrained=False, **kwargs):
     elif model_name == 'inception_v3':
         model = inception_v3(num_classes=num_classes, pretrained=pretrained, transform_input=False, **kwargs)
     else:
-        assert False, "Unknown model architecture (%s)" % model_name
+        # model_name matched no known architecture, so fail unconditionally.
+        raise AssertionError("Unknown model architecture (%s)" % model_name)
     return model
 
 
-class LeNormalize(object):
+class LeNormalize:
     """Normalize to -1..1 in Google Inception style
     """
     def __call__(self, tensor):
diff --git a/pywick/models/classification/fbresnet.py b/pywick/models/classification/fbresnet.py
index 250a0c7..ba0178c 100644
--- a/pywick/models/classification/fbresnet.py
+++ b/pywick/models/classification/fbresnet.py
@@ -3,8 +3,6 @@
 # Source: https://github.com/Cadene/pretrained-models.pytorch/blob/0819c4f43a70fcd40234b03ff02f87599cd8ace6/pretrainedmodels/models/fbresnet.py
 # Note this is the version with adaptive capabilities so it can accept differently-sized images
 
-
-from __future__ import print_function, division, absolute_import
 import torch.nn as nn
 import torch.nn.functional as F
 import math
@@ -151,8 +149,8 @@ def _make_layer(self, block, planes, blocks, stride=1):
 
         return nn.Sequential(*layers)
 
-    def features(self, input):
-        x = self.conv1(input)
+    def features(self, input_):
+        x = self.conv1(input_)
         self.conv1_input = x.clone()
         x = self.bn1(x)
         x = self.relu(x)
@@ -171,8 +169,8 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
@@ -226,8 +224,8 @@ def fbresnet152(num_classes=1000, pretrained='imagenet'):
     model = FBResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes)
     if pretrained is not None:
         settings = pretrained_settings['fbresnet152'][pretrained]
-        assert num_classes == settings['num_classes'], \
-            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
+        if num_classes != settings['num_classes']:
+            raise AssertionError("num_classes should be {}, but is {}".format(settings['num_classes'], num_classes))
         model.load_state_dict(model_zoo.load_url(settings['url']))
         model.input_space = settings['input_space']
         model.input_size = settings['input_size']
diff --git a/pywick/models/classification/inception_v4.py b/pywick/models/classification/inception_v4.py
index 45833a0..a7bfc97 100644
--- a/pywick/models/classification/inception_v4.py
+++ b/pywick/models/classification/inception_v4.py
@@ -295,15 +295,15 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
 
-def inceptionv4(pretrained='imagenet'):
+def inceptionv4(num_classes=1001, pretrained='imagenet'):  # 1001 = previously hard-coded head size
     # both 'imagenet'&'imagenet+background' are loaded from same parameters
-    model = InceptionV4(num_classes=1001)
+    model = InceptionV4(num_classes=num_classes)
     if pretrained:
         settings = pretrained_settings['inceptionv4'][pretrained]
         model.load_state_dict(model_zoo.load_url(settings['url']))
@@ -332,12 +332,16 @@ def inceptionv4(pretrained='imagenet'):
 ```
 '''
 if __name__ == '__main__':
-    assert inceptionv4(num_classes=10, pretrained=None)
+    if not inceptionv4(num_classes=10, pretrained=None):
+        raise AssertionError
     print('success')
-    assert inceptionv4(num_classes=1000, pretrained='imagenet')
+    if not inceptionv4(num_classes=1000, pretrained='imagenet'):
+        raise AssertionError
     print('success')
-    assert inceptionv4(num_classes=1001, pretrained='imagenet+background')
+    if not inceptionv4(num_classes=1001, pretrained='imagenet+background'):
+        raise AssertionError
     print('success')
 
     # fail
-    assert inceptionv4(num_classes=1001, pretrained='imagenet')
\ No newline at end of file
+    if not inceptionv4(num_classes=1001, pretrained='imagenet'):
+        raise AssertionError
\ No newline at end of file
diff --git a/pywick/models/classification/inceptionresnet_v2.py b/pywick/models/classification/inceptionresnet_v2.py
index edd933c..a808fa8 100644
--- a/pywick/models/classification/inceptionresnet_v2.py
+++ b/pywick/models/classification/inceptionresnet_v2.py
@@ -297,8 +297,8 @@ def __init__(self, num_classes=1001):
         self.avgpool_1a = nn.AvgPool2d(8, count_include_pad=False)
         self.last_linear = nn.Linear(1536, num_classes)
 
-    def features(self, input):
-        x = self.conv2d_1a(input)
+    def features(self, input_):
+        x = self.conv2d_1a(input_)
         x = self.conv2d_2a(x)
         x = self.conv2d_2b(x)
         x = self.maxpool_3a(x)
@@ -321,16 +321,16 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
 
-def inceptionresnetv2(pretrained='imagenet'):
+def inceptionresnetv2(num_classes=1001, pretrained='imagenet'):  # 1001 = previously hard-coded head size
 
     # both 'imagenet'&'imagenet+background' are loaded from same parameters
-    model = InceptionResNetV2(num_classes=1001)
+    model = InceptionResNetV2(num_classes=num_classes)
 
     if pretrained:
         settings = pretrained_settings['inceptionresnetv2'][pretrained]
@@ -361,12 +361,16 @@ def inceptionresnetv2(pretrained='imagenet'):
 ```
 '''
 if __name__ == '__main__':
-    assert inceptionresnetv2(num_classes=10, pretrained=None)
+    if not inceptionresnetv2(num_classes=10, pretrained=None):
+        raise AssertionError
     print('success')
-    assert inceptionresnetv2(num_classes=1000, pretrained='imagenet')
+    if not inceptionresnetv2(num_classes=1000, pretrained='imagenet'):
+        raise AssertionError
     print('success')
-    assert inceptionresnetv2(num_classes=1001, pretrained='imagenet+background')
+    if not inceptionresnetv2(num_classes=1001, pretrained='imagenet+background'):
+        raise AssertionError
     print('success')
 
     # fail
-    assert inceptionresnetv2(num_classes=1001, pretrained='imagenet')
\ No newline at end of file
+    if not inceptionresnetv2(num_classes=1001, pretrained='imagenet'):
+        raise AssertionError
\ No newline at end of file
diff --git a/pywick/models/classification/nasnet.py b/pywick/models/classification/nasnet.py
index 86098d9..153fe44 100644
--- a/pywick/models/classification/nasnet.py
+++ b/pywick/models/classification/nasnet.py
@@ -554,8 +554,8 @@ def __init__(self, num_classes=1001):
         self.dropout = nn.Dropout()
         self.last_linear = nn.Linear(4032, self.num_classes)
 
-    def features(self, input):
-        x_conv0 = self.conv0(input)
+    def features(self, input_):
+        x_conv0 = self.conv0(input_)
         x_stem_0 = self.cell_stem_0(x_conv0)
         x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)
 
@@ -593,8 +593,8 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
diff --git a/pywick/models/classification/nasnet_mobile.py b/pywick/models/classification/nasnet_mobile.py
index 33c8517..0d73058 100644
--- a/pywick/models/classification/nasnet_mobile.py
+++ b/pywick/models/classification/nasnet_mobile.py
@@ -578,8 +578,8 @@ def __init__(self, num_classes=1001, stem_filters=32, penultimate_filters=1056,
         self.dropout = nn.Dropout()
         self.last_linear = nn.Linear(24*filters, self.num_classes)
 
-    def features(self, input):
-        x_conv0 = self.conv0(input)
+    def features(self, input_):
+        x_conv0 = self.conv0(input_)
         x_stem_0 = self.cell_stem_0(x_conv0)
         x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)
 
@@ -611,8 +611,8 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
@@ -645,7 +645,7 @@ def nasnetamobile(pretrained='imagenet'):
 if __name__ == "__main__":
 
     model = NASNetAMobile()
-    input = torch.randn(2, 3, 224, 224)
-    output = model(input)
+    input_ = torch.randn(2, 3, 224, 224)
+    output = model(input_)
 
     print(output.size())
diff --git a/pywick/models/classification/pnasnet.py b/pywick/models/classification/pnasnet.py
index 6164922..d17562e 100644
--- a/pywick/models/classification/pnasnet.py
+++ b/pywick/models/classification/pnasnet.py
@@ -370,8 +370,8 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
diff --git a/pywick/models/classification/poly_net.py b/pywick/models/classification/poly_net.py
index cd41f15..55df655 100644
--- a/pywick/models/classification/poly_net.py
+++ b/pywick/models/classification/poly_net.py
@@ -243,7 +243,8 @@ class InceptionResNetBPoly(nn.Module):
 
     def __init__(self, scale, num_blocks):
         super(InceptionResNetBPoly, self).__init__()
-        assert num_blocks >= 1, 'num_blocks should be greater or equal to 1'
+        if num_blocks < 1:
+            raise AssertionError('num_blocks should be greater or equal to 1')
         self.scale = scale
         self.num_blocks = num_blocks
         self.path0_1x1 = PolyConv2d(1152, 128, kernel_size=1,
@@ -293,7 +294,8 @@ class InceptionResNetCPoly(nn.Module):
 
     def __init__(self, scale, num_blocks):
         super(InceptionResNetCPoly, self).__init__()
-        assert num_blocks >= 1, 'num_blocks should be greater or equal to 1'
+        if num_blocks < 1:
+            raise AssertionError('num_blocks should be greater or equal to 1')
         self.scale = scale
         self.num_blocks = num_blocks
         self.path0_1x1 = PolyConv2d(2048, 192, kernel_size=1,
@@ -336,7 +338,8 @@ class MultiWay(nn.Module):
 
     def __init__(self, scale, block_cls, num_blocks):
         super(MultiWay, self).__init__()
-        assert num_blocks >= 1, 'num_blocks should be greater or equal to 1'
+        if num_blocks < 1:
+            raise AssertionError('num_blocks should be greater or equal to 1')
         self.scale = scale
         self.blocks = nn.ModuleList([block_cls() for _ in range(num_blocks)])
         self.relu = nn.ReLU()
diff --git a/pywick/models/classification/resnet_preact.py b/pywick/models/classification/resnet_preact.py
index b70cba3..abe2571 100644
--- a/pywick/models/classification/resnet_preact.py
+++ b/pywick/models/classification/resnet_preact.py
@@ -217,15 +217,18 @@ def __init__(self, config):
         depth = config['depth']
         preact_stage = config['preact_stage']
 
-        assert block_type in ['basic', 'bottleneck']
+        if block_type not in ['basic', 'bottleneck']:
+            raise AssertionError
         if block_type == 'basic':
             block = BasicBlock
             n_blocks_per_stage = (depth - 2) // 6
-            assert n_blocks_per_stage * 6 + 2 == depth
+            if n_blocks_per_stage * 6 + 2 != depth:
+                raise AssertionError
         else:
             block = BottleneckBlock
             n_blocks_per_stage = (depth - 2) // 9
-            assert n_blocks_per_stage * 9 + 2 == depth
+            if n_blocks_per_stage * 9 + 2 != depth:
+                raise AssertionError
 
         n_channels = [
             base_channels,
diff --git a/pywick/models/classification/resnext.py b/pywick/models/classification/resnext.py
index ee54d79..5d56b15 100644
--- a/pywick/models/classification/resnext.py
+++ b/pywick/models/classification/resnext.py
@@ -60,14 +60,14 @@ def __init__(self, num_classes=1000):
         self.avg_pool = nn.AvgPool2d((7, 7), (1, 1))
         self.last_linear = nn.Linear(2048, num_classes)
 
-    def logits(self, input):
-        x = self.avg_pool(input)
+    def logits(self, input_):
+        x = self.avg_pool(input_)
         x = x.view(x.size(0), -1)
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
@@ -80,14 +80,14 @@ def __init__(self, num_classes=1000):
         self.avg_pool = nn.AvgPool2d((7, 7), (1, 1))
         self.last_linear = nn.Linear(2048, num_classes)
 
-    def logits(self, input):
-        x = self.avg_pool(input)
+    def logits(self, input_):
+        x = self.avg_pool(input_)
         x = x.view(x.size(0), -1)
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
@@ -101,14 +101,14 @@ def __init__(self, num_classes=1000):
         self.avg_pool = nn.AvgPool2d((7, 7), (1, 1))
         self.last_linear = nn.Linear(2048, num_classes)
 
-    def logits(self, input):
-        x = self.avg_pool(input)
+    def logits(self, input_):
+        x = self.avg_pool(input_)
         x = x.view(x.size(0), -1)
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
@@ -117,8 +117,8 @@ def resnext50_32x4d(num_classes=1000, pretrained='imagenet'):
     model = ResNeXt50_32x4d(num_classes=num_classes)
     if pretrained is not None:
         settings = pretrained_settings['resnext50_32x4d'][pretrained]
-        assert num_classes == settings['num_classes'], \
-            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
+        if num_classes != settings['num_classes']:
+            raise AssertionError("num_classes should be {}, but is {}".format(settings['num_classes'], num_classes))
         model.load_state_dict(model_zoo.load_url(settings['url']))
         model.input_space = settings['input_space']
         model.input_size = settings['input_size']
diff --git a/pywick/models/classification/resnext_features/resnext101_32x4d_features.py b/pywick/models/classification/resnext_features/resnext101_32x4d_features.py
index 6da0d13..3baf5cf 100755
--- a/pywick/models/classification/resnext_features/resnext101_32x4d_features.py
+++ b/pywick/models/classification/resnext_features/resnext101_32x4d_features.py
@@ -1,43 +1,40 @@
 # Source: https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/resnext_features/resnext101_32x4d_features.py
 
-from __future__ import print_function, division, absolute_import
 from functools import reduce
 import torch.nn as nn
 
 
 class LambdaBase(nn.Sequential):
-    def __init__(self, *args):
-        super(LambdaBase, self).__init__(*args)
 
-    def forward_prepare(self, input):
+    def forward_prepare(self, input_):
         output = []
         for module in self._modules.values():
-            output.append(module(input))
-        return output if output else input
+            output.append(module(input_))
+        return output if output else input_
 
 class Lambda(LambdaBase):
     def __init__(self, *args):
         super(Lambda, self).__init__(*args)
         self.lambda_func = identity
 
-    def forward(self, input):
-        return self.lambda_func(self.forward_prepare(input))
+    def forward(self, input_):
+        return self.lambda_func(self.forward_prepare(input_))
 
 class LambdaMap(LambdaBase):
     def __init__(self, *args):
         super(LambdaMap, self).__init__(*args)
         self.lambda_func = identity
 
-    def forward(self, input):
-        return list(map(self.lambda_func,self.forward_prepare(input)))
+    def forward(self, input_):
+        return list(map(self.lambda_func, self.forward_prepare(input_)))
 
 class LambdaReduce(LambdaBase):
     def __init__(self, *args):
         super(LambdaReduce, self).__init__(*args)
         self.lambda_func = add
 
-    def forward(self, input):
-        return reduce(self.lambda_func,self.forward_prepare(input))
+    def forward(self, input_):
+        return reduce(self.lambda_func, self.forward_prepare(input_))
 
 def identity(x): return x
 
diff --git a/pywick/models/classification/resnext_features/resnext101_64x4d_features.py b/pywick/models/classification/resnext_features/resnext101_64x4d_features.py
index ff6dc62..dba4717 100755
--- a/pywick/models/classification/resnext_features/resnext101_64x4d_features.py
+++ b/pywick/models/classification/resnext_features/resnext101_64x4d_features.py
@@ -1,42 +1,39 @@
 # Source: https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/resnext_features/resnext101_64x4d_features.py
 
-from __future__ import print_function, division, absolute_import
 import torch.nn as nn
 from functools import reduce
 
 class LambdaBase(nn.Sequential):
-    def __init__(self, *args):
-        super(LambdaBase, self).__init__(*args)
 
-    def forward_prepare(self, input):
+    def forward_prepare(self, input_):
         output = []
         for module in self._modules.values():
-            output.append(module(input))
-        return output if output else input
+            output.append(module(input_))
+        return output if output else input_
 
 class Lambda(LambdaBase):
     def __init__(self, *args):
         super(Lambda, self).__init__(*args)
         self.lambda_func = identity
 
-    def forward(self, input):
-        return self.lambda_func(self.forward_prepare(input))
+    def forward(self, input_):
+        return self.lambda_func(self.forward_prepare(input_))
 
 class LambdaMap(LambdaBase):
     def __init__(self, *args):
         super(LambdaMap, self).__init__(*args)
         self.lambda_func = identity
 
-    def forward(self, input):
-        return list(map(self.lambda_func,self.forward_prepare(input)))
+    def forward(self, input_):
+        return list(map(self.lambda_func, self.forward_prepare(input_)))
 
 class LambdaReduce(LambdaBase):
     def __init__(self, *args):
         super(LambdaReduce, self).__init__(*args)
         self.lambda_func = add
 
-    def forward(self, input):
-        return reduce(self.lambda_func,self.forward_prepare(input))
+    def forward(self, input_):
+        return reduce(self.lambda_func, self.forward_prepare(input_))
 
 def identity(x): return x
 
diff --git a/pywick/models/classification/resnext_features/resnext50_32x4d_features.py b/pywick/models/classification/resnext_features/resnext50_32x4d_features.py
index ba909a1..43e8ea3 100644
--- a/pywick/models/classification/resnext_features/resnext50_32x4d_features.py
+++ b/pywick/models/classification/resnext_features/resnext50_32x4d_features.py
@@ -1,40 +1,37 @@
-from __future__ import print_function, division, absolute_import
 import torch.nn as nn
 from functools import reduce
 
 class LambdaBase(nn.Sequential):
-    def __init__(self, *args):
-        super(LambdaBase, self).__init__(*args)
 
-    def forward_prepare(self, input):
+    def forward_prepare(self, input_):
         output = []
         for module in self._modules.values():
-            output.append(module(input))
-        return output if output else input
+            output.append(module(input_))
+        return output if output else input_
 
 class Lambda(LambdaBase):
     def __init__(self, *args):
         super(Lambda, self).__init__(*args)
         self.lambda_func = identity
 
-    def forward(self, input):
-        return self.lambda_func(self.forward_prepare(input))
+    def forward(self, input_):
+        return self.lambda_func(self.forward_prepare(input_))
 
 class LambdaMap(LambdaBase):
     def __init__(self, *args):
         super(LambdaMap, self).__init__(*args)
         self.lambda_func = identity
 
-    def forward(self, input):
-        return list(map(self.lambda_func,self.forward_prepare(input)))
+    def forward(self, input_):
+        return list(map(self.lambda_func, self.forward_prepare(input_)))
 
 class LambdaReduce(LambdaBase):
     def __init__(self, *args):
         super(LambdaReduce, self).__init__(*args)
         self.lambda_func = add
 
-    def forward(self, input):
-        return reduce(self.lambda_func,self.forward_prepare(input))
+    def forward(self, input_):
+        return reduce(self.lambda_func, self.forward_prepare(input_))
 
 def identity(x): return x
 
diff --git a/pywick/models/classification/senet.py b/pywick/models/classification/senet.py
index 0e57b8a..abef975 100644
--- a/pywick/models/classification/senet.py
+++ b/pywick/models/classification/senet.py
@@ -352,7 +352,8 @@ def forward(self, x):
 
 
 def initialize_pretrained_model(model, num_classes, settings):
-    assert num_classes == settings['num_classes'], 'num_classes should be {}, but is {}'.format(settings['num_classes'], num_classes)
+    if num_classes != settings['num_classes']:
+        raise AssertionError('num_classes should be {}, but is {}'.format(settings['num_classes'], num_classes))
     model.load_state_dict(model_zoo.load_url(settings['url']))
     model.input_space = settings['input_space']
     model.input_size = settings['input_size']
diff --git a/pywick/models/classification/testnets/large_densenet.py b/pywick/models/classification/testnets/large_densenet.py
index 9eb4e63..b298a6c 100644
--- a/pywick/models/classification/testnets/large_densenet.py
+++ b/pywick/models/classification/testnets/large_densenet.py
@@ -23,12 +23,12 @@ def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
 
         super(_DenseLayer, self).__init__()
 
-        self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
-        self.add_module('relu1', nn.ReLU(inplace=True)),
-        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
-        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
-        self.add_module('relu2', nn.ReLU(inplace=True)),
-        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)),
+        self.add_module('norm1', nn.BatchNorm2d(num_input_features))
+        self.add_module('relu1', nn.ReLU(inplace=True))
+        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False))
+        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
+        self.add_module('relu2', nn.ReLU(inplace=True))
+        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False))
         self.drop_rate = drop_rate
 
     def forward(self, x):
diff --git a/pywick/models/classification/testnets/opt_densenset.py b/pywick/models/classification/testnets/opt_densenset.py
index 96280b0..a7fd4fc 100644
--- a/pywick/models/classification/testnets/opt_densenset.py
+++ b/pywick/models/classification/testnets/opt_densenset.py
@@ -47,14 +47,12 @@ def opt_densenet264(pretrained=False, **kwargs):
 class _DenseLayer(nn.Sequential):
     def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
         super(_DenseLayer, self).__init__()
-        self.add_module('norm_1', nn.BatchNorm2d(num_input_features)),
-        self.add_module('relu_1', nn.ReLU(inplace=True)),
-        self.add_module('conv_1', nn.Conv2d(num_input_features, bn_size *
-                        growth_rate, kernel_size=1, stride=1, bias=False)),
-        self.add_module('norm_2', nn.BatchNorm2d(bn_size * growth_rate)),
-        self.add_module('relu_2', nn.ReLU(inplace=True)),
-        self.add_module('conv_2', nn.Conv2d(bn_size * growth_rate, growth_rate,
-                        kernel_size=3, stride=1, padding=1, bias=False)),
+        self.add_module('norm_1', nn.BatchNorm2d(num_input_features))
+        self.add_module('relu_1', nn.ReLU(inplace=True))
+        self.add_module('conv_1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False))
+        self.add_module('norm_2', nn.BatchNorm2d(bn_size * growth_rate))
+        self.add_module('relu_2', nn.ReLU(inplace=True))
+        self.add_module('conv_2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False))
         self.drop_rate = drop_rate
 
     def forward(self, x):
diff --git a/pywick/models/classification/testnets/pnn.py b/pywick/models/classification/testnets/pnn.py
index b82c4f4..f4ec59f 100644
--- a/pywick/models/classification/testnets/pnn.py
+++ b/pywick/models/classification/testnets/pnn.py
@@ -52,7 +52,7 @@ def __init__(self, in_channels=None, out_channels=None, nmasks=None, level=None,
         if filter_size == 1:
             padding = 0
             bias = True
-        elif filter_size == 3 or filter_size == 5:
+        elif filter_size in (3, 5):
             padding = 1
             bias = False
         elif filter_size == 7:
@@ -139,7 +139,7 @@ def __init__(self, in_channels=None, out_channels=None, nmasks=None, level=None,
         if filter_size == 1:
             padding = 0
             bias = True
-        elif filter_size == 3 or filter_size == 5:
+        elif filter_size in (3, 5):
             padding = 1
             bias = False
         elif filter_size == 7:
diff --git a/pywick/models/classification/testnets/se_densenet_full.py b/pywick/models/classification/testnets/se_densenet_full.py
index a84bd34..5b57f67 100644
--- a/pywick/models/classification/testnets/se_densenet_full.py
+++ b/pywick/models/classification/testnets/se_densenet_full.py
@@ -132,16 +132,14 @@ class _DenseLayer(nn.Sequential):
     def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
         super(_DenseLayer, self).__init__()
         # Add SELayer at here, like SE-PRE block in original paper illustrates
-        self.add_module("selayer", SELayer(channel=num_input_features)),
-
-        self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
-        self.add_module('relu1', nn.ReLU(inplace=True)),
-        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
-                        growth_rate, kernel_size=1, stride=1, bias=False)),
-        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
-        self.add_module('relu2', nn.ReLU(inplace=True)),
-        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
-                        kernel_size=3, stride=1, padding=1, bias=False)),
+        self.add_module("selayer", SELayer(channel=num_input_features))
+
+        self.add_module('norm1', nn.BatchNorm2d(num_input_features))
+        self.add_module('relu1', nn.ReLU(inplace=True))
+        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False))
+        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
+        self.add_module('relu2', nn.ReLU(inplace=True))
+        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False))
         self.drop_rate = drop_rate
 
     def forward(self, x):
@@ -253,8 +251,7 @@ def test_se_densenet(pretrained=False):
 
     if pretrained:
         model = se_densenet121(pretrained=pretrained)
-        net_state_dict = {key: value for key, value in model_zoo.load_url("https://download.pytorch.org/models/densenet121-a639ec97.pth").items()}
-        model.load_state_dict(net_state_dict, strict=False)
+        model.load_state_dict(model_zoo.load_url("https://download.pytorch.org/models/densenet121-a639ec97.pth"), strict=False)
 
     else:
         model = se_densenet121(pretrained=pretrained)
diff --git a/pywick/models/classification/testnets/se_efficient_densenet.py b/pywick/models/classification/testnets/se_efficient_densenet.py
index 6dac392..3f417d7 100644
--- a/pywick/models/classification/testnets/se_efficient_densenet.py
+++ b/pywick/models/classification/testnets/se_efficient_densenet.py
@@ -147,16 +147,14 @@ def __init__(self, num_input_features, growth_rate, bn_size, drop_rate,
                  efficient=False):
         super(_DenseLayer, self).__init__()
         # Add SELayer at here, like SE-PRE block in original paper illustrates
-        self.add_module("selayer", SELayer(channel=num_input_features)),
-
-        self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
-        self.add_module('relu1', nn.ReLU(inplace=True)),
-        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
-                                           growth_rate, kernel_size=1, stride=1, bias=False)),
-        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
-        self.add_module('relu2', nn.ReLU(inplace=True)),
-        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
-                                           kernel_size=3, stride=1, padding=1, bias=False)),
+        self.add_module("selayer", SELayer(channel=num_input_features))
+
+        self.add_module('norm1', nn.BatchNorm2d(num_input_features))
+        self.add_module('relu1', nn.ReLU(inplace=True))
+        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False))
+        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
+        self.add_module('relu2', nn.ReLU(inplace=True))
+        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False))
         self.drop_rate = drop_rate
         self.efficient = efficient
 
@@ -241,7 +239,8 @@ def __init__(self, growth_rate=12, block_config=(16, 16, 16), compression=0.5,
                  num_classes=4096, efficient=True):
 
         super(DenseNet, self).__init__()
-        assert 0 < compression <= 1, 'compression of densenet should be between 0 and 1'
+        if not 0 < compression <= 1:
+            raise AssertionError('compression of densenet should be between 0 and 1')
 
         # First convolution
         self.features = nn.Sequential(OrderedDict([
diff --git a/pywick/models/classification/testnets/se_module.py b/pywick/models/classification/testnets/se_module.py
index dd4c59a..56e20ad 100644
--- a/pywick/models/classification/testnets/se_module.py
+++ b/pywick/models/classification/testnets/se_module.py
@@ -5,7 +5,8 @@
 
 class SELayer(nn.Module):
     def __init__(self, channel, reduction=16):
-        assert channel > reduction, "Make sure your input channel bigger than reduction which equals to {}".format(reduction)
+        if channel <= reduction:
+            raise AssertionError("Make sure your input channel bigger than reduction which equals to {}".format(reduction))
         super(SELayer, self).__init__()
         self.avg_pool = nn.AdaptiveAvgPool2d(1)
         self.fc = nn.Sequential(
diff --git a/pywick/models/classification/wideresnet.py b/pywick/models/classification/wideresnet.py
index 1b73427..98aa21d 100644
--- a/pywick/models/classification/wideresnet.py
+++ b/pywick/models/classification/wideresnet.py
@@ -4,11 +4,10 @@
 Implementation of WideResNet as described in: `Wide Residual Networks <https://arxiv.org/abs/1605.07146>`_.
 """
 
-from __future__ import print_function, division, absolute_import
 import re
 import os
 from os.path import expanduser
-import hickle as hkl
+# import hickle as hkl
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -21,12 +20,12 @@
 }
 
 def define_model(params):
-    def conv2d(input, params, base, stride=1, pad=0):
-        return F.conv2d(input, params[base + '.weight'],
+    def conv2d(input_, params, base, stride=1, pad=0):
+        return F.conv2d(input_, params[base + '.weight'],
                         params[base + '.bias'], stride, pad)
 
-    def group(input, params, base, stride, n):
-        o = input
+    def group(input_, params, base, stride, n):
+        o = input_
         for i in range(0,n):
             b_base = ('%s.block%d.conv') % (base, i)
             x = o
@@ -46,8 +45,8 @@ def group(input, params, base, stride, n):
     blocks = [sum([re.match('group%d.block\d+.conv0.weight'%j, k) is not None
                    for k in params.keys()]) for j in range(4)]
 
-    def f(input, params, pooling_classif=True):
-        o = F.conv2d(input, params['conv0.weight'], params['conv0.bias'], 2, 3)
+    def f(input_, params, pooling_classif=True):
+        o = F.conv2d(input_, params['conv0.weight'], params['conv0.bias'], 2, 3)
         o = F.relu(o)
         o = F.max_pool2d(o, 3, 2, 1)
         o_g0 = group(o, params, 'group0', 1, blocks[0])
@@ -77,20 +76,21 @@ def forward(self, x):
 
 
 def wideresnet50(pooling):
+    pass
     """Pretrained WideResnet50 model"""
-    dir_models = os.path.join(expanduser("~"), '.torch/wideresnet')
-    path_hkl = os.path.join(dir_models, 'wideresnet50.hkl')
-    if os.path.isfile(path_hkl):
-        params = hkl.load(path_hkl)
-        # convert numpy arrays to torch Variables
-        for k,v in sorted(params.items()):
-            print(k, v.shape)
-            params[k] = Variable(torch.from_numpy(v), requires_grad=True)
-    else:
-        os.system('mkdir -p ' + dir_models)
-        os.system('wget {} -O {}'.format(model_urls['wideresnet50'], path_hkl))
-    f = define_model(params)
-    model = WideResNet(pooling, f, params)
-    return model
+    # dir_models = os.path.join(expanduser("~"), '.torch/wideresnet')
+    # path_hkl = os.path.join(dir_models, 'wideresnet50.hkl')
+    # if os.path.isfile(path_hkl):
+    #     params = hkl.load(path_hkl)
+    #     # convert numpy arrays to torch Variables
+    #     for k,v in sorted(params.items()):
+    #         print(k, v.shape)
+    #         params[k] = Variable(torch.from_numpy(v), requires_grad=True)
+    # else:
+    #     os.system('mkdir -p ' + dir_models)
+    #     os.system('wget {} -O {}'.format(model_urls['wideresnet50'], path_hkl))
+    # f = define_model(params)
+    # model = WideResNet(pooling, f, params)
+    # return model
 
 
diff --git a/pywick/models/classification/xception1.py b/pywick/models/classification/xception1.py
index 0c7f0f0..dec8fea 100644
--- a/pywick/models/classification/xception1.py
+++ b/pywick/models/classification/xception1.py
@@ -24,7 +24,6 @@
 #
 # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
 
-from __future__ import print_function, division, absolute_import
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils.model_zoo as model_zoo
@@ -165,8 +164,8 @@ def __init__(self, num_classes=1000):
         #         m.bias.data.zero_()
         # #-----------------------------
 
-    def features(self, input):
-        x = self.conv1(input)
+    def features(self, input_):
+        x = self.conv1(input_)
         x = self.bn1(x)
         x = self.relu1(x)
 
@@ -203,8 +202,8 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
diff --git a/pywick/models/localization/fpn.py b/pywick/models/localization/fpn.py
index 337341c..7d64d1e 100644
--- a/pywick/models/localization/fpn.py
+++ b/pywick/models/localization/fpn.py
@@ -72,7 +72,8 @@ def _make_layer(self, block, planes, num_blocks, stride):
             self.in_planes = planes * block.expansion
         return nn.Sequential(*layers)
 
-    def _upsample_add(self, x, y):
+    @staticmethod
+    def _upsample_add(x, y):
         '''Upsample and add two feature maps.
 
         Args:
diff --git a/pywick/models/localization/retina_fpn.py b/pywick/models/localization/retina_fpn.py
index d9d8e50..de9f917 100644
--- a/pywick/models/localization/retina_fpn.py
+++ b/pywick/models/localization/retina_fpn.py
@@ -74,7 +74,8 @@ def _make_layer(self, block, planes, num_blocks, stride):
             self.in_planes = planes * block.expansion
         return nn.Sequential(*layers)
 
-    def _upsample_add(self, x, y):
+    @staticmethod
+    def _upsample_add(x, y):
         '''Upsample and add two feature maps.
 
         Args:
diff --git a/pywick/models/model_locations.py b/pywick/models/model_locations.py
index d83a6a5..3779d2b 100644
--- a/pywick/models/model_locations.py
+++ b/pywick/models/model_locations.py
@@ -27,7 +27,6 @@
     'fbresnet152': cadeneroot + 'fbresnet152-2e20f6b4.pth',
     'inception_v3': torchroot + 'inception_v3_google-1a9a5a14.pth',
     'inceptionv4': cadeneroot + 'inceptionv4-8e4777a0.pth',
-    'bninception': cadeneroot + 'bn_inception-52deb4733.pth',
     'inceptionresnetv2': cadeneroot + 'inceptionresnetv2-520b38e4.pth',
     'nasnetalarge': cadeneroot + 'nasnetalarge-a1897284.pth',
     'nasnetamobile': cadeneroot + 'nasnetamobile-7e03cead.pth',
diff --git a/pywick/models/model_utils.py b/pywick/models/model_utils.py
index f43408f..3913431 100644
--- a/pywick/models/model_utils.py
+++ b/pywick/models/model_utils.py
@@ -1,5 +1,8 @@
+from functools import partial
 from typing import Callable
 
+from torchvision.models.resnet import Bottleneck
+
 from . import classification
 from .segmentation import *
 from . import segmentation
@@ -247,20 +250,20 @@ def get_supported_models(type: ModelType):
 
     import pkgutil
     if type == ModelType.SEGMENTATION:
-        excludes = list()  # <-- exclude non-model names
+        excludes = []  # <-- exclude non-model names
         for importer, modname, ispkg in pkgutil.walk_packages(path=segmentation.__path__, prefix=segmentation.__name__+".", onerror=lambda x: None):
             excludes.append(modname.split('.')[-1])
         return [x for x in segmentation.__dict__.keys() if ('__' not in x and x not in excludes)]  # filter out hidden object attributes and module names
     elif type == ModelType.CLASSIFICATION:
-        pywick_excludes = list()
+        pywick_excludes = []
         for importer, modname, ispkg in pkgutil.walk_packages(path=classification.__path__, prefix=classification.__name__+".", onerror=lambda x: None):
             pywick_excludes.append(modname.split('.')[-1])
         pywick_names = [x for x in classification.__dict__.keys() if '__' not in x and x not in pywick_excludes]     # includes directory and filenames
 
-        pt_excludes = list()
+        pt_excludes = []
         for importer, modname, ispkg in pkgutil.walk_packages(path=torch_models.__path__, prefix=torch_models.__name__+".", onerror=lambda x: None):
             pt_excludes.append(modname.split('.')[-1])
-        pt_names = [x for x in torch_models.__dict__.keys() if '__' not in x and x not in pt_excludes]  # includes directory and filenames
+        pt_names = [x for x in torch_models.__dict__ if '__' not in x and x not in pt_excludes]  # includes directory and filenames
 
         torch_hub_names = torch.hub.list(rwightman_repo, force_reload=True)
 
@@ -303,7 +306,7 @@ def _get_untrained_model(model_name, num_classes):
     elif model_name.startswith('pyresnet'):
         return classification.PyResNet(num_classes=num_classes)
     elif model_name.startswith('resnet'):
-        return torch_models.ResNet(num_classes=num_classes)
+        return torch_models.ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes)
     elif model_name.startswith('resnext101_32x4d'):
         return classification.ResNeXt101_32x4d(num_classes=num_classes)
     elif model_name.startswith('resnext101_64x4d'):
@@ -344,12 +347,15 @@ def diff_states(dict_canonical, dict_subset):
     # for every pretrained model
     not_in_1 = [n for n in names1 if n not in names2]
     not_in_2 = [n for n in names2 if n not in names1]
-    assert len(not_in_1) == 0
-    assert len(not_in_2) == 0
+    if len(not_in_1) != 0:
+        raise AssertionError
+    if len(not_in_2) != 0:
+        raise AssertionError
 
     for name, v1 in dict_canonical.items():
         v2 = dict_subset[name]
-        assert hasattr(v2, 'size')
+        if not hasattr(v2, 'size'):
+            raise AssertionError
         if v1.size() != v2.size():
             yield (name, v1)
 
@@ -428,3 +434,20 @@ def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, bac
             print('INFO: => Attempting to load checkpoint data onto model. Device: {}    Strict: {}'.format(device, strict))
             model.load_state_dict(checkpoint['state_dict'], strict=strict)
     return checkpoint
+
+
+def load_model(model_type: ModelType, model_name: str, num_classes: int, pretrained: bool = True, **kwargs):
+    """
+    Certain timm models may exist without being listed in torch.hub, so this uses a custom partial function (``torch.hub.load`` bound to ``rwightman_repo``) to bypass the model check in pywick.
+
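+    :param model_type: (ModelType) type of model to load; forwarded to ``get_model``
+    :param model_name: (str) name of the model to load; forwarded to ``get_model``
+    :param num_classes: (int) number of classes; forwarded to ``get_model``
+    :param pretrained: (bool) forwarded to ``get_model`` (default: True)
+    :param kwargs: additional keyword arguments forwarded unchanged to ``get_model``
+    :return: the model returned by ``get_model``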
+    """
+    custom_func = partial(torch.hub.load, github=rwightman_repo)
+    model = get_model(model_type=model_type, model_name=model_name, num_classes=num_classes, pretrained=pretrained, custom_load_fn=custom_func, **kwargs)
+
+    return model
diff --git a/pywick/models/segmentation/carvana_unet.py b/pywick/models/segmentation/carvana_unet.py
index 55f5a68..7025baf 100644
--- a/pywick/models/segmentation/carvana_unet.py
+++ b/pywick/models/segmentation/carvana_unet.py
@@ -55,9 +55,10 @@ def forward(self,x):
 
 
     def merge_bn(self):
-        if self.bn == None: return
+        if self.bn is None: return
 
-        assert(self.conv.bias==None)
+        if (self.conv.bias is not None):
+            raise AssertionError
         conv_weight     = self.conv.weight.data
         bn_weight       = self.bn.weight.data
         bn_bias         = self.bn.bias.data
diff --git a/pywick/models/segmentation/da_basenets/basic.py b/pywick/models/segmentation/da_basenets/basic.py
index c02cab5..200833d 100644
--- a/pywick/models/segmentation/da_basenets/basic.py
+++ b/pywick/models/segmentation/da_basenets/basic.py
@@ -104,10 +104,11 @@ def forward(self, x):
 class InvertedResidual(nn.Module):
     def __init__(self, in_channels, out_channels, stride, expand_ratio, norm_layer=nn.BatchNorm2d, **kwargs):
         super(InvertedResidual, self).__init__()
-        assert stride in [1, 2]
+        if stride not in [1, 2]:
+            raise AssertionError
         self.use_res_connect = stride == 1 and in_channels == out_channels
 
-        layers = list()
+        layers = []
         inter_channels = int(round(in_channels * expand_ratio))
         if expand_ratio != 1:
             # pw
diff --git a/pywick/models/segmentation/da_basenets/densenet.py b/pywick/models/segmentation/da_basenets/densenet.py
index fda0dcb..4c4aba7 100644
--- a/pywick/models/segmentation/da_basenets/densenet.py
+++ b/pywick/models/segmentation/da_basenets/densenet.py
@@ -20,12 +20,12 @@
 class _DenseLayer(nn.Sequential):
     def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1, norm_layer=nn.BatchNorm2d):
         super(_DenseLayer, self).__init__()
-        self.add_module('norm1', norm_layer(num_input_features)),
-        self.add_module('relu1', nn.ReLU(True)),
-        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, 1, 1, bias=False)),
-        self.add_module('norm2', norm_layer(bn_size * growth_rate)),
-        self.add_module('relu2', nn.ReLU(True)),
-        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, 3, 1, dilation, dilation, bias=False)),
+        self.add_module('norm1', norm_layer(num_input_features))
+        self.add_module('relu1', nn.ReLU(True))
+        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, 1, 1, bias=False))
+        self.add_module('norm2', norm_layer(bn_size * growth_rate))
+        self.add_module('relu2', nn.ReLU(True))
+        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, 3, 1, dilation, dilation, bias=False))
         self.drop_rate = drop_rate
 
     def forward(self, x):
@@ -110,7 +110,8 @@ def __init__(self, growth_rate=12, block_config=(6, 12, 24, 16), num_init_featur
                  bn_size=4, drop_rate=0, num_classes=1000, dilate_scale=8, norm_layer=nn.BatchNorm2d, **kwargs):
         super(DilatedDenseNet, self).__init__(growth_rate, block_config, num_init_features,
                                               bn_size, drop_rate, num_classes, norm_layer)
-        assert (dilate_scale == 8 or dilate_scale == 16), "dilate_scale can only set as 8 or 16"
+        if dilate_scale not in (8, 16):
+            raise AssertionError("dilate_scale can only set as 8 or 16")
         from functools import partial
         if dilate_scale == 8:
             self.features.denseblock3.apply(partial(self._conv_dilate, dilate=2))
@@ -121,7 +122,8 @@ def __init__(self, growth_rate=12, block_config=(6, 12, 24, 16), num_init_featur
             self.features.denseblock4.apply(partial(self._conv_dilate, dilate=2))
             del self.features.transition3.pool
 
-    def _conv_dilate(self, m, dilate):
+    @staticmethod
+    def _conv_dilate(m, dilate):
         classname = m.__class__.__name__
         if classname.find('Conv') != -1:
             if m.kernel_size == (3, 3):
diff --git a/pywick/models/segmentation/da_basenets/model_store.py b/pywick/models/segmentation/da_basenets/model_store.py
index 99ceaa4..d9d462b 100644
--- a/pywick/models/segmentation/da_basenets/model_store.py
+++ b/pywick/models/segmentation/da_basenets/model_store.py
@@ -1,6 +1,4 @@
 """Model store which provides pretrained models."""
-from __future__ import print_function
-
 import os
 import zipfile
 
@@ -34,8 +32,7 @@ def get_resnet_file(name, root='~/.torch/models'):
         if check_sha1(file_path, sha1_hash):
             return file_path
         else:
-            print('Mismatch in the content of model file {} detected.' +
-                  ' Downloading again.'.format(file_path))
+            print('Mismatch in the content of model file {} detected. Downloading again.'.format(file_path))
     else:
         print('Model file {} is not found. Downloading.'.format(file_path))
 
diff --git a/pywick/models/segmentation/da_basenets/resnet.py b/pywick/models/segmentation/da_basenets/resnet.py
index ffd8552..730b24e 100644
--- a/pywick/models/segmentation/da_basenets/resnet.py
+++ b/pywick/models/segmentation/da_basenets/resnet.py
@@ -220,7 +220,6 @@ def resnet152(pretrained=False, **kwargs):
 
 
 if __name__ == '__main__':
-    import torch
     img = torch.randn(4, 3, 224, 224)
     model = resnet50(True)
     output = model(img)
\ No newline at end of file
diff --git a/pywick/models/segmentation/da_basenets/resnetv1b.py b/pywick/models/segmentation/da_basenets/resnetv1b.py
index de27904..c6ea65e 100644
--- a/pywick/models/segmentation/da_basenets/resnetv1b.py
+++ b/pywick/models/segmentation/da_basenets/resnetv1b.py
@@ -259,7 +259,6 @@ def resnet152_v1s(pretrained=False, root='~/.torch/models', **kwargs):
 
 
 if __name__ == '__main__':
-    import torch
 
     img = torch.randn(4, 3, 224, 224)
     model = resnet50_v1b(True)
diff --git a/pywick/models/segmentation/deeplab_v3.py b/pywick/models/segmentation/deeplab_v3.py
index 407c50e..f6bdb54 100644
--- a/pywick/models/segmentation/deeplab_v3.py
+++ b/pywick/models/segmentation/deeplab_v3.py
@@ -102,7 +102,9 @@ def _make_layer(self, block, planes, blocks, stride=1, rate=1):
 
         return nn.Sequential(*layers)
 
-    def _make_MG_unit(self, block, planes, blocks=[1, 2, 4], stride=1, rate=1):
+    def _make_MG_unit(self, block, planes, blocks=None, stride=1, rate=1):
+        if blocks is None:
+            blocks = [1, 2, 4]
         downsample = None
         if stride != 1 or self.inplanes != planes * block.expansion:
             downsample = nn.Sequential(
diff --git a/pywick/models/segmentation/deeplab_v3_plus.py b/pywick/models/segmentation/deeplab_v3_plus.py
index ea109f5..718c878 100644
--- a/pywick/models/segmentation/deeplab_v3_plus.py
+++ b/pywick/models/segmentation/deeplab_v3_plus.py
@@ -103,7 +103,9 @@ def _make_layer(self, block, planes, blocks, stride=1, rate=1):
 
         return nn.Sequential(*layers)
 
-    def _make_MG_unit(self, block, planes, blocks=[1, 2, 4], stride=1, rate=1):
+    def _make_MG_unit(self, block, planes, blocks=None, stride=1, rate=1):
+        if blocks is None:
+            blocks = [1, 2, 4]
         downsample = None
         if stride != 1 or self.inplanes != planes * block.expansion:
             downsample = nn.Sequential(
diff --git a/pywick/models/segmentation/denseaspp.py b/pywick/models/segmentation/denseaspp.py
index 2716482..bcb9062 100644
--- a/pywick/models/segmentation/denseaspp.py
+++ b/pywick/models/segmentation/denseaspp.py
@@ -76,12 +76,12 @@ class _DenseASPPConv(nn.Sequential):
     def __init__(self, in_channels, inter_channels, out_channels, atrous_rate,
                  drop_rate=0.1, norm_layer=nn.BatchNorm2d, norm_kwargs=None):
         super(_DenseASPPConv, self).__init__()
-        self.add_module('conv1', nn.Conv2d(in_channels, inter_channels, 1)),
-        self.add_module('bn1', norm_layer(inter_channels, **({} if norm_kwargs is None else norm_kwargs))),
-        self.add_module('relu1', nn.ReLU(True)),
-        self.add_module('conv2', nn.Conv2d(inter_channels, out_channels, 3, dilation=atrous_rate, padding=atrous_rate)),
-        self.add_module('bn2', norm_layer(out_channels, **({} if norm_kwargs is None else norm_kwargs))),
-        self.add_module('relu2', nn.ReLU(True)),
+        self.add_module('conv1', nn.Conv2d(in_channels, inter_channels, 1))
+        self.add_module('bn1', norm_layer(inter_channels, **({} if norm_kwargs is None else norm_kwargs)))
+        self.add_module('relu1', nn.ReLU(True))
+        self.add_module('conv2', nn.Conv2d(inter_channels, out_channels, 3, dilation=atrous_rate, padding=atrous_rate))
+        self.add_module('bn2', norm_layer(out_channels, **({} if norm_kwargs is None else norm_kwargs)))
+        self.add_module('relu2', nn.ReLU(True))
         self.drop_rate = drop_rate
 
     def forward(self, x):
diff --git a/pywick/models/segmentation/drn.py b/pywick/models/segmentation/drn.py
index d2d4039..08d709b 100644
--- a/pywick/models/segmentation/drn.py
+++ b/pywick/models/segmentation/drn.py
@@ -180,7 +180,8 @@ def __init__(self, block, layers, num_classes=1000,
 
     def _make_layer(self, block, planes, blocks, stride=1, dilation=1,
                     new_level=True, residual=True):
-        assert dilation == 1 or dilation % 2 == 0
+        if not (dilation == 1 or dilation % 2 == 0):
+            raise AssertionError
         downsample = None
         if stride != 1 or self.inplanes != planes * block.expansion:
             downsample = nn.Sequential(
@@ -189,7 +190,7 @@ def _make_layer(self, block, planes, blocks, stride=1, dilation=1,
                 BatchNorm(planes * block.expansion),
             )
 
-        layers = list()
+        layers = []
         layers.append(block(
             self.inplanes, planes, stride, downsample,
             dilation=(1, 1) if dilation == 1 else (
@@ -215,7 +216,7 @@ def _make_conv_layers(self, channels, convs, stride=1, dilation=1):
         return nn.Sequential(*modules)
 
     def forward(self, x):
-        y = list()
+        y = []
 
         if self.arch == 'C':
             x = self.conv1(x)
diff --git a/pywick/models/segmentation/emanet/emanet.py b/pywick/models/segmentation/emanet/emanet.py
index aa918e7..3687917 100644
--- a/pywick/models/segmentation/emanet/emanet.py
+++ b/pywick/models/segmentation/emanet/emanet.py
@@ -117,7 +117,7 @@ def _make_layer(self, block, planes, blocks, stride=1, dilation=1,
         if grids is None:
             grids = [1] * blocks
 
-        if dilation == 1 or dilation == 2:
+        if dilation in (1, 2):
             layers.append(block(self.inplanes, planes, stride, dilation=1,
                                 downsample=downsample,
                                 previous_dilation=dilation))
@@ -266,7 +266,8 @@ def forward(self, x):
 
         return x, mu
 
-    def _l2norm(self, inp, dim):
+    @staticmethod
+    def _l2norm(inp, dim):
         '''Normlize the inp tensor with l2-norm.
 
         Returns a tensor where each sub-tensor of input along the given dim is
diff --git a/pywick/models/segmentation/fcn_utils.py b/pywick/models/segmentation/fcn_utils.py
index c6bb82a..11ce1b1 100644
--- a/pywick/models/segmentation/fcn_utils.py
+++ b/pywick/models/segmentation/fcn_utils.py
@@ -17,7 +17,7 @@ def check_mkdir(dir_name):
 def initialize_weights(*models):
     for model in models:
         for module in model.modules():
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.Linear)):
                 nn.init.kaiming_normal_(module.weight)
                 if module.bias is not None:
                     module.bias.data.zero_()
@@ -81,7 +81,7 @@ def evaluate(predictions, gts, num_classes):
     return acc, acc_cls, mean_iu, fwavacc
 
 
-class AverageMeter(object):
+class AverageMeter:
     def __init__(self):
         self.reset()
 
@@ -98,7 +98,7 @@ def update(self, val, n=1):
         self.avg = self.sum / self.count
 
 
-class PolyLR(object):
+class PolyLR:
     def __init__(self, optimizer, curr_iter, max_iter, lr_decay):
         self.max_iter = float(max_iter)
         self.init_lr_groups = []
@@ -117,7 +117,8 @@ def step(self):
 class Conv2dDeformable(nn.Module):
     def __init__(self, regular_filter, cuda=True):
         super(Conv2dDeformable, self).__init__()
-        assert isinstance(regular_filter, nn.Conv2d)
+        if not isinstance(regular_filter, nn.Conv2d):
+            raise AssertionError
         self.regular_filter = regular_filter
         self.offset_filter = nn.Conv2d(regular_filter.in_channels, 2 * regular_filter.in_channels, kernel_size=3,
                                        padding=1, bias=False)
diff --git a/pywick/models/segmentation/fusionnet.py b/pywick/models/segmentation/fusionnet.py
index b058e29..1282052 100644
--- a/pywick/models/segmentation/fusionnet.py
+++ b/pywick/models/segmentation/fusionnet.py
@@ -15,7 +15,7 @@ def initialize_weights(method='kaiming', *models):
     for model in models:
         for module in model.modules():
 
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.ConvTranspose2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.ConvTranspose2d, nn.Linear)):
                 if method == 'kaiming':
                     init.kaiming_normal_(module.weight.data, np.sqrt(2.0))
                 elif method == 'xavier':
@@ -108,5 +108,6 @@ def forward(self,x):
         output = self.final(dec4)
         return self.activation(output)
 
-    def _do_downsample(self, x, kernel_size=2, stride=2):
+    @staticmethod
+    def _do_downsample(x, kernel_size=2, stride=2):
         return F.max_pool2d(x, kernel_size=kernel_size, stride=stride)
\ No newline at end of file
diff --git a/pywick/models/segmentation/gcnnets/gcn.py b/pywick/models/segmentation/gcnnets/gcn.py
index abafb1c..0f5c415 100644
--- a/pywick/models/segmentation/gcnnets/gcn.py
+++ b/pywick/models/segmentation/gcnnets/gcn.py
@@ -102,7 +102,7 @@ def forward(self, x):
 def initialize_weights(*models):
     for model in models:
         for module in model.modules():
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.Linear)):
                 nn.init.kaiming_normal_(module.weight)
                 if module.bias is not None:
                     module.bias.data.zero_()
diff --git a/pywick/models/segmentation/gcnnets/gcn_densenet.py b/pywick/models/segmentation/gcnnets/gcn_densenet.py
index 589a66f..ba59dd8 100644
--- a/pywick/models/segmentation/gcnnets/gcn_densenet.py
+++ b/pywick/models/segmentation/gcnnets/gcn_densenet.py
@@ -125,7 +125,7 @@ def forward(self, x):
 def initialize_weights(*models):
     for model in models:
         for module in model.modules():
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.Linear)):
                 nn.init.kaiming_normal_(module.weight)
                 if module.bias is not None:
                     module.bias.data.zero_()
diff --git a/pywick/models/segmentation/gcnnets/gcn_nasnet.py b/pywick/models/segmentation/gcnnets/gcn_nasnet.py
index 13f117f..7d6afa8 100644
--- a/pywick/models/segmentation/gcnnets/gcn_nasnet.py
+++ b/pywick/models/segmentation/gcnnets/gcn_nasnet.py
@@ -61,7 +61,8 @@ def forward(self, x):
         out = self.deconv(x)
         return out
 
-    def make_bilinear_weights(self, size, num_channels):
+    @staticmethod
+    def make_bilinear_weights(size, num_channels):
         factor = (size + 1) // 2
         if size % 2 == 1:
             center = factor - 1
@@ -151,7 +152,7 @@ def forward(self, x):
 def initialize_weights(*models):
     for model in models:
         for module in model.modules():
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.Linear)):
                 nn.init.kaiming_normal_(module.weight)
                 if module.bias is not None:
                     module.bias.data.zero_()
@@ -693,8 +694,8 @@ def __init__(self, num_classes=1001, stem_filters=96, penultimate_filters=4032,
         self.dropout = nn.Dropout()
         self.last_linear = nn.Linear(24*filters, self.num_classes)
 
-    def features(self, input):
-        x_conv0 = self.conv0(input)
+    def features(self, input_):
+        x_conv0 = self.conv0(input_)
         x_stem_0 = self.cell_stem_0(x_conv0)
         x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)
 
@@ -732,7 +733,7 @@ def logits(self, features):
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
diff --git a/pywick/models/segmentation/gcnnets/gcn_psp.py b/pywick/models/segmentation/gcnnets/gcn_psp.py
index 1e7ac54..e9d61e8 100644
--- a/pywick/models/segmentation/gcnnets/gcn_psp.py
+++ b/pywick/models/segmentation/gcnnets/gcn_psp.py
@@ -142,7 +142,7 @@ def forward(self, x):
 def initialize_weights(*models):
     for model in models:
         for module in model.modules():
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.Linear)):
                 nn.init.kaiming_normal_(module.weight)
                 if module.bias is not None:
                     module.bias.data.zero_()
diff --git a/pywick/models/segmentation/gcnnets/gcn_resnext.py b/pywick/models/segmentation/gcnnets/gcn_resnext.py
index bb05a77..2c9380e 100644
--- a/pywick/models/segmentation/gcnnets/gcn_resnext.py
+++ b/pywick/models/segmentation/gcnnets/gcn_resnext.py
@@ -63,7 +63,8 @@ def forward(self, x):
         out = self.deconv(x)
         return out
 
-    def make_bilinear_weights(self, size, num_channels):
+    @staticmethod
+    def make_bilinear_weights(size, num_channels):
         factor = (size + 1) // 2
         if size % 2 == 1:
             center = factor - 1
@@ -108,15 +109,15 @@ def __init__(self, fn, *args):
 		super(LambdaBase, self).__init__(*args)
 		self.lambda_func = fn
 
-	def forward_prepare(self, input):
+	def forward_prepare(self, input_):
 		output = []
 		for module in self._modules.values():
-			output.append(module(input))
-		return output if output else input
+			output.append(module(input_))
+		return output if output else input_
 
 class Lambda(LambdaBase):
-    def forward(self, input):
-        return self.lambda_func(self.forward_prepare(input))
+    def forward(self, input_):
+        return self.lambda_func(self.forward_prepare(input_))
 
 class ResNeXt(nn.Module):
 
@@ -223,10 +224,11 @@ def forward(self, x):
 
         return out
 
-    def initialize_weights(self, *models):
+    @staticmethod
+    def initialize_weights(*models):
         for model in models:
             for module in model.modules():
-                if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+                if isinstance(module, (nn.Conv2d, nn.Linear)):
                     nn.init.kaiming_normal_(module.weight)
                     if module.bias is not None:
                         module.bias.data.zero_()
diff --git a/pywick/models/segmentation/lex_extractors.py b/pywick/models/segmentation/lex_extractors.py
index e125962..1aaff38 100644
--- a/pywick/models/segmentation/lex_extractors.py
+++ b/pywick/models/segmentation/lex_extractors.py
@@ -192,14 +192,12 @@ def densenet121(pretrained=False, **kwargs):
 class _DenseLayer(nn.Sequential):
     def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
         super(_DenseLayer, self).__init__()
-        self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
-        self.add_module('relu1', nn.ReLU(inplace=True)),
-        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
-                                            growth_rate, kernel_size=1, stride=1, bias=False)),
-        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
-        self.add_module('relu2', nn.ReLU(inplace=True)),
-        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
-                                            kernel_size=3, stride=1, padding=1, bias=False)),
+        self.add_module('norm1', nn.BatchNorm2d(num_input_features))
+        self.add_module('relu1', nn.ReLU(inplace=True))
+        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False))
+        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
+        self.add_module('relu2', nn.ReLU(inplace=True))
+        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False))
         self.drop_rate = drop_rate
 
     def forward(self, x):
diff --git a/pywick/models/segmentation/lexpsp.py b/pywick/models/segmentation/lexpsp.py
index 4f6325c..9155f55 100644
--- a/pywick/models/segmentation/lexpsp.py
+++ b/pywick/models/segmentation/lexpsp.py
@@ -25,7 +25,8 @@ def __init__(self, features, out_features=1024, sizes=(1, 2, 3, 6)):
         self.bottleneck = nn.Conv2d(features * (len(sizes) + 1), out_features, kernel_size=1)
         self.relu = nn.ReLU()
 
-    def _make_stage(self, features, size):
+    @staticmethod
+    def _make_stage(features, size):
         prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
         conv = nn.Conv2d(features, features, kernel_size=1, bias=False)
         return nn.Sequential(prior, conv)
diff --git a/pywick/models/segmentation/mnas_linknets/decoder.py b/pywick/models/segmentation/mnas_linknets/decoder.py
index cad2a5f..35078f7 100755
--- a/pywick/models/segmentation/mnas_linknets/decoder.py
+++ b/pywick/models/segmentation/mnas_linknets/decoder.py
@@ -84,7 +84,7 @@ def __init__(self,
         self.relu1 = nonlinearity(inplace=True)
 
         # B, C/4, H, W -> B, C/4, H, W
-        if is_deconv == True:
+        if is_deconv:  # truthiness: 1 / np.bool_ flags still select deconv, as they did with '== True'
             self.deconv2 = nn.ConvTranspose2d(in_channels // 4,
                                               in_channels // 4,
                                               3,
@@ -146,7 +146,7 @@ def __init__(self,
         self.relu1 = nonlinearity(inplace=True)
 
         # B, C/4, H, W -> B, C/4, H, W
-        if is_deconv == True:
+        if is_deconv:  # truthiness: 1 / np.bool_ flags still select deconv, as they did with '== True'
             self.deconv2 = nn.ConvTranspose2d(in_channels // 4,
                                               in_channels // 4,
                                               kernel_size,
@@ -216,7 +216,7 @@ def __init__(self,
         self.relu1 = nonlinearity(inplace=True)
 
         # B, out_channels, H, W -> B, out_channels, H, W
-        if is_deconv == True:
+        if is_deconv:  # truthiness: 1 / np.bool_ flags still select deconv, as they did with '== True'
             self.deconv2 = nn.ConvTranspose2d(out_channels,
                                               out_channels,
                                               kernel_size,
diff --git a/pywick/models/segmentation/mnas_linknets/inception4.py b/pywick/models/segmentation/mnas_linknets/inception4.py
index 9c66e72..bd92fae 100755
--- a/pywick/models/segmentation/mnas_linknets/inception4.py
+++ b/pywick/models/segmentation/mnas_linknets/inception4.py
@@ -306,8 +306,8 @@ def logits(self, features):
         x = self.last_linear(x) 
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
diff --git a/pywick/models/segmentation/mnas_linknets/inception_resnet.py b/pywick/models/segmentation/mnas_linknets/inception_resnet.py
index 8148d20..d157cd2 100755
--- a/pywick/models/segmentation/mnas_linknets/inception_resnet.py
+++ b/pywick/models/segmentation/mnas_linknets/inception_resnet.py
@@ -298,8 +298,8 @@ def __init__(self, num_classes=1001):
         self.avgpool_1a = nn.AvgPool2d(8, count_include_pad=False)
         self.last_linear = nn.Linear(1536, num_classes)
 
-    def features(self, input):
-        x = self.conv2d_1a(input)
+    def features(self, input_):
+        x = self.conv2d_1a(input_)
         x = self.conv2d_2a(x)
         x = self.conv2d_2b(x)
         x = self.maxpool_3a(x)
@@ -322,8 +322,8 @@ def logits(self, features):
         x = self.last_linear(x) 
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
@@ -333,8 +333,8 @@ def inceptionresnetv2(num_classes=1001, pretrained='imagenet'):
     """
     if pretrained:
         settings = pretrained_settings['inceptionresnetv2'][pretrained]
-        assert num_classes == settings['num_classes'], \
-            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
+        if num_classes != settings['num_classes']:
+            raise AssertionError("num_classes should be {}, but is {}".format(settings['num_classes'], num_classes))
 
         # both 'imagenet'&'imagenet+background' are loaded from same parameters
         model = InceptionResNetV2(num_classes=1001)
diff --git a/pywick/models/segmentation/mnas_linknets/resnext.py b/pywick/models/segmentation/mnas_linknets/resnext.py
index 6885c97..3968d46 100755
--- a/pywick/models/segmentation/mnas_linknets/resnext.py
+++ b/pywick/models/segmentation/mnas_linknets/resnext.py
@@ -41,14 +41,14 @@ def __init__(self, num_classes=1000):
         self.avg_pool = nn.AvgPool2d((7, 7), (1, 1))
         self.last_linear = nn.Linear(2048, num_classes)
 
-    def logits(self, input):
-        x = self.avg_pool(input)
+    def logits(self, input_):
+        x = self.avg_pool(input_)
         x = x.view(x.size(0), -1)
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.features(input)
+    def forward(self, input_):
+        x = self.features(input_)
         x = self.logits(x)
         return x
 
@@ -69,14 +69,14 @@ def __init__(self, num_classes=1000):
         self.avg_pool = nn.AvgPool2d((7, 7), (1, 1))
         self.last_linear = nn.Linear(2048, num_classes)
 
-    def logits(self, input):
-        x = self.avg_pool(input)
+    def logits(self, input_):
+        x = self.avg_pool(input_)
         x = x.view(x.size(0), -1)
         x = self.last_linear(x)
         return x
 
-    def forward(self, input):
-        x = self.stem(input)
+    def forward(self, input_):
+        x = self.stem(input_)
         x = self.layer1(x)
         x = self.layer2(x)       
         x = self.layer3(x)       
@@ -89,8 +89,8 @@ def resnext101_32x4d(num_classes=1000, pretrained='imagenet'):
     model_blob = ResNeXt101_32x4d_blob(num_classes=num_classes)
     if pretrained is not None:
         settings = pretrained_settings['resnext101_32x4d'][pretrained]
-        assert num_classes == settings['num_classes'], \
-            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
+        if num_classes != settings['num_classes']:
+            raise AssertionError("num_classes should be {}, but is {}".format(settings['num_classes'], num_classes))
         model_blob.load_state_dict(model_zoo.load_url(settings['url']))
         
         model.stem = nn.Sequential( 
diff --git a/pywick/models/segmentation/mnas_linknets/resnext101_32x4d_features.py b/pywick/models/segmentation/mnas_linknets/resnext101_32x4d_features.py
index 906b1e6..2fbd2e4 100755
--- a/pywick/models/segmentation/mnas_linknets/resnext101_32x4d_features.py
+++ b/pywick/models/segmentation/mnas_linknets/resnext101_32x4d_features.py
@@ -8,23 +8,23 @@ def __init__(self, fn, *args):
         super(LambdaBase, self).__init__(*args)
         self.lambda_func = fn
 
-    def forward_prepare(self, input):
+    def forward_prepare(self, input_):
         output = []
         for module in self._modules.values():
-            output.append(module(input))
-        return output if output else input
+            output.append(module(input_))
+        return output if output else input_
 
 class Lambda(LambdaBase):
-    def forward(self, input):
-        return self.lambda_func(self.forward_prepare(input))
+    def forward(self, input_):
+        return self.lambda_func(self.forward_prepare(input_))
 
 class LambdaMap(LambdaBase):
-    def forward(self, input):
-        return list(map(self.lambda_func,self.forward_prepare(input)))
+    def forward(self, input_):
+        return list(map(self.lambda_func, self.forward_prepare(input_)))
 
 class LambdaReduce(LambdaBase):
-    def forward(self, input):
-        return reduce(self.lambda_func,self.forward_prepare(input))
+    def forward(self, input_):
+        return reduce(self.lambda_func, self.forward_prepare(input_))
 
 class resnext101_32x4d_features_blob(nn.Module):
 
@@ -685,8 +685,8 @@ def __init__(self):
             )
         )
         
-    def forward(self, input):
-        x = self.resnext101_32x4d_features(input)
+    def forward(self, input_):
+        x = self.resnext101_32x4d_features(input_)
         return x        
     
 class resnext101_32x4d_features(nn.Module):
@@ -1361,8 +1361,8 @@ def __init__(self):
         )        
 
         
-    def forward(self, input):
-        x = self.resnext101_32x4d_stem(input)
+    def forward(self, input_):
+        x = self.resnext101_32x4d_stem(input_)
         x = self.resnext101_32x4d_layer1(x)
         x = self.resnext101_32x4d_layer2(x)
         x = self.resnext101_32x4d_layer3(x)
diff --git a/pywick/models/segmentation/ocnet.py b/pywick/models/segmentation/ocnet.py
index 266ceab..c390a27 100644
--- a/pywick/models/segmentation/ocnet.py
+++ b/pywick/models/segmentation/ocnet.py
@@ -134,7 +134,9 @@ class BaseOCModule(nn.Module):
     """Base-OC"""
 
     def __init__(self, in_channels, out_channels, key_channels, value_channels,
-                 scales=([1]), norm_layer=nn.BatchNorm2d, concat=True, **kwargs):
+                 scales=None, norm_layer=nn.BatchNorm2d, concat=True, **kwargs):
+        if scales is None:
+            scales = [1]  # fresh list per call; parentheses dropped (it was never a tuple)
         super(BaseOCModule, self).__init__()
         self.stages = nn.ModuleList([
             BaseAttentionBlock(in_channels, out_channels, key_channels, value_channels, scale, norm_layer, **kwargs)
@@ -183,8 +185,8 @@ def __init__(self, in_channels, out_channels, key_channels, value_channels,
     def forward(self, x):
         batch_size, c, w, h = x.size()
 
-        local_x = list()
-        local_y = list()
+        local_x = []
+        local_y = []
         step_w, step_h = w // self.scale, h // self.scale
         for i in range(self.scale):
             for j in range(self.scale):
@@ -201,7 +203,7 @@ def forward(self, x):
         query = self.f_query(x)
         key = self.f_key(x)
 
-        local_list = list()
+        local_list = []
         local_block_cnt = (self.scale ** 2) * 2
         for i in range(0, local_block_cnt, 2):
             value_local = value[:, :, local_x[i]:local_x[i + 1], local_y[i]:local_y[i + 1]]
@@ -220,9 +222,9 @@ def forward(self, x):
             context_local = context_local.view(batch_size, self.value_channels, w_local, h_local)
             local_list.append(context_local)
 
-        context_list = list()
+        context_list = []
         for i in range(0, self.scale):
-            row_tmp = list()
+            row_tmp = []
             for j in range(self.scale):
                 row_tmp.append(local_list[j + i * self.scale])
             context_list.append(torch.cat(row_tmp, 3))
@@ -237,7 +239,9 @@ class PyramidOCModule(nn.Module):
     """Pyramid-OC"""
 
     def __init__(self, in_channels, out_channels, key_channels, value_channels,
-                 scales=([1]), norm_layer=nn.BatchNorm2d, **kwargs):
+                 scales=None, norm_layer=nn.BatchNorm2d, **kwargs):
+        if scales is None:
+            scales = [1]  # fresh list per call; parentheses dropped (it was never a tuple)
         super(PyramidOCModule, self).__init__()
         self.stages = nn.ModuleList([
             PyramidAttentionBlock(in_channels, out_channels, key_channels, value_channels, scale, norm_layer, **kwargs)
diff --git a/pywick/models/segmentation/resnet_gcn.py b/pywick/models/segmentation/resnet_gcn.py
index f2ac081..f41e777 100644
--- a/pywick/models/segmentation/resnet_gcn.py
+++ b/pywick/models/segmentation/resnet_gcn.py
@@ -17,7 +17,7 @@ def initialize_weights(method='kaiming', *models):
     for model in models:
         for module in model.modules():
 
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.ConvTranspose2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.ConvTranspose2d, nn.Linear)):
                 if method == 'kaiming':
                     init.kaiming_normal_(module.weight.data, np.sqrt(2.0))
                 elif method == 'xavier':
@@ -127,5 +127,6 @@ def forward(self,x):
 
 
 
-    def _do_upsample(self, num_classes=1, kernel_size=2, stride=2):
+    @staticmethod
+    def _do_upsample(num_classes=1, kernel_size=2, stride=2):
         return nn.ConvTranspose2d(num_classes, num_classes, kernel_size=kernel_size, stride=stride)
diff --git a/pywick/models/segmentation/testnets/Unet_nested_layers.py b/pywick/models/segmentation/testnets/Unet_nested_layers.py
index a344e31..ec63661 100644
--- a/pywick/models/segmentation/testnets/Unet_nested_layers.py
+++ b/pywick/models/segmentation/testnets/Unet_nested_layers.py
@@ -155,12 +155,12 @@ def __init__(self, in_size, out_size, is_deconv, n_concat=2):
             if m.__class__.__name__.find('unetConv2') != -1: continue
             init_weights(m, init_type='kaiming')
 
-    def forward(self, inputs0, *input):
+    def forward(self, inputs0, *input_):
         # print(self.n_concat)
-        # print(input)
+        # print(input_)
         outputs0 = self.up(inputs0)
-        for i in range(len(input)):
-            outputs0 = torch.cat([outputs0, input[i]], 1)
+        for skip in input_:  # concatenate each extra input along dim 1, in call order
+            outputs0 = torch.cat([outputs0, skip], 1)
         return self.conv(outputs0)
 
 
@@ -185,8 +185,8 @@ def __init__(self, in_size, out_size, is_batchnorm, kernel_size=(3, 3, 3), paddi
         for m in self.children():
             init_weights(m, init_type='kaiming')
 
-    def forward(self, inputs):
-        outputs = self.conv1(inputs)
+    def forward(self, input_):
+        outputs = self.conv1(input_)
         outputs = self.conv2(outputs)
         return outputs
 
@@ -324,13 +324,13 @@ def forward(self, inputs):
         output = inputs
         # print(output.shape)
         x_0 = inputs
-        conv = getattr(self, 'conv1')
+        conv = self.conv1
         x_1 = conv(x_0)
-        conv = getattr(self, 'conv2')
+        conv = self.conv2
         x_2 = conv(x_1)
-        conv = getattr(self, 'conv3')
+        conv = self.conv3
         x_3 = conv(x_2)
-        conv = getattr(self, 'conv4')
+        conv = self.conv4
         x_4 = conv(x_3)
 
         return x_0 + x_1 + x_2 + x_3 + x_4
@@ -365,11 +365,11 @@ def forward(self, inputs):
         output = inputs
         # print(output.shape)
         x_0 = inputs
-        conv = getattr(self, 'conv1')
+        conv = self.conv1
         x_1 = conv(x_0)
-        conv = getattr(self, 'conv2')
+        conv = self.conv2
         x_2 = conv(x_1)
-        conv = getattr(self, 'conv3')
+        conv = self.conv3
         x_3 = conv(x_2)
 
         return x_0 + x_1 + x_2 + x_3
diff --git a/pywick/models/segmentation/testnets/autofocusNN.py b/pywick/models/segmentation/testnets/autofocusNN.py
index e624473..f384ff6 100644
--- a/pywick/models/segmentation/testnets/autofocusNN.py
+++ b/pywick/models/segmentation/testnets/autofocusNN.py
@@ -14,7 +14,12 @@
 
 
 class ModelBuilder():
-    def build_net(self, arch='AFN1', num_input=4, num_classes=5, num_branches=4, padding_list=[0, 4, 8, 12], dilation_list=[2, 6, 10, 14], **kwargs):
+    @staticmethod
+    def build_net(arch='AFN1', num_input=4, num_classes=5, num_branches=4, padding_list=None, dilation_list=None, **kwargs):
+        if padding_list is None:
+            padding_list = [0, 4, 8, 12]
+        if dilation_list is None:
+            dilation_list = [2, 6, 10, 14]
         # parameters in the architecture
         channels = [num_input - 1, 30, 30, 40, 40, 40, 40, 50, 50, num_classes]
         kernel_size = 3
diff --git a/pywick/models/segmentation/testnets/axial_deeplab/axial_deeplab.py b/pywick/models/segmentation/testnets/axial_deeplab/axial_deeplab.py
index 61435cb..c6a9873 100644
--- a/pywick/models/segmentation/testnets/axial_deeplab/axial_deeplab.py
+++ b/pywick/models/segmentation/testnets/axial_deeplab/axial_deeplab.py
@@ -21,7 +21,8 @@ def conv1x1(in_planes, out_planes, stride=1):
 class AxialAttention(nn.Module):
     def __init__(self, in_planes, out_planes, groups=8, kernel_size=56,
                  stride=1, bias=False, width=False):
-        assert (in_planes % groups == 0) and (out_planes % groups == 0)
+        if in_planes % groups != 0 or out_planes % groups != 0:
+            raise AssertionError('in_planes and out_planes must both be divisible by groups')
         super(AxialAttention, self).__init__()
         self.in_planes = in_planes
         self.out_planes = out_planes
diff --git a/pywick/models/segmentation/testnets/dabnet.py b/pywick/models/segmentation/testnets/dabnet.py
index ebc6d89..fa0b142 100644
--- a/pywick/models/segmentation/testnets/dabnet.py
+++ b/pywick/models/segmentation/testnets/dabnet.py
@@ -24,8 +24,8 @@ def __init__(self, nIn, nOut, kSize, stride, padding, dilation=(1, 1), groups=1,
         if self.bn_acti:
             self.bn_prelu = BNPReLU(nOut)
 
-    def forward(self, input):
-        output = self.conv(input)
+    def forward(self, input_):
+        output = self.conv(input_)
 
         if self.bn_acti:
             output = self.bn_prelu(output)
@@ -39,8 +39,8 @@ def __init__(self, nIn):
         self.bn = nn.BatchNorm2d(nIn, eps=1e-3)
         self.acti = nn.PReLU(nIn)
 
-    def forward(self, input):
-        output = self.bn(input)
+    def forward(self, input_):
+        output = self.bn(input_)
         output = self.acti(output)
 
         return output
@@ -65,8 +65,8 @@ def __init__(self, nIn, d=1, kSize=3, dkSize=3):
         self.bn_relu_2 = BNPReLU(nIn // 2)
         self.conv1x1 = Conv(nIn // 2, nIn, 1, 1, padding=0, bn_acti=False)
 
-    def forward(self, input):
-        output = self.bn_relu_1(input)
+    def forward(self, input_):
+        output = self.bn_relu_1(input_)
         output = self.conv3x3(output)
 
         br1 = self.dconv3x1(output)
@@ -78,7 +78,7 @@ def forward(self, input):
         output = self.bn_relu_2(output)
         output = self.conv1x1(output)
 
-        return output + input
+        return output + input_
 
 
 class DownSamplingBlock(nn.Module):
@@ -96,11 +96,11 @@ def __init__(self, nIn, nOut):
         self.max_pool = nn.MaxPool2d(2, stride=2)
         self.bn_prelu = BNPReLU(nOut)
 
-    def forward(self, input):
-        output = self.conv3x3(input)
+    def forward(self, input_):
+        output = self.conv3x3(input_)
 
         if self.nIn < self.nOut:
-            max_pool = self.max_pool(input)
+            max_pool = self.max_pool(input_)
             output = torch.cat([output, max_pool], 1)
 
         output = self.bn_prelu(output)
@@ -115,11 +115,11 @@ def __init__(self, ratio):
         for i in range(0, ratio):
             self.pool.append(nn.AvgPool2d(3, stride=2, padding=1))
 
-    def forward(self, input):
+    def forward(self, input_):
         for pool in self.pool:
-            input = pool(input)
+            input_ = pool(input_)
 
-        return input
+        return input_
 
 
 class DABNet(nn.Module):
@@ -155,13 +155,13 @@ def __init__(self, num_classes=19, block_1=3, block_2=6, **kwargs):
 
         self.classifier = nn.Sequential(Conv(259, num_classes, 1, 1, padding=0))
 
-    def forward(self, input):
+    def forward(self, input_):
 
-        output0 = self.init_conv(input)
+        output0 = self.init_conv(input_)
 
-        down_1 = self.down_1(input)
-        down_2 = self.down_2(input)
-        down_3 = self.down_3(input)
+        down_1 = self.down_1(input_)
+        down_2 = self.down_2(input_)
+        down_3 = self.down_3(input_)
 
         output0_cat = self.bn_prelu_1(torch.cat([output0, down_1], 1))
 
@@ -176,6 +176,6 @@ def forward(self, input):
         output2_cat = self.bn_prelu_3(torch.cat([output2, output2_0, down_3], 1))
 
         out = self.classifier(output2_cat)
-        out = F.interpolate(out, input.size()[2:], mode='bilinear', align_corners=False)
+        out = F.interpolate(out, input_.size()[2:], mode='bilinear', align_corners=False)
 
         return out
diff --git a/pywick/models/segmentation/testnets/deeplabv3.py b/pywick/models/segmentation/testnets/deeplabv3.py
index 8946e85..e96354f 100644
--- a/pywick/models/segmentation/testnets/deeplabv3.py
+++ b/pywick/models/segmentation/testnets/deeplabv3.py
@@ -39,7 +39,7 @@ def DeepLabV3_ResNet101_MSC(n_classes, output_stride):
     elif output_stride == 8:
         pyramids = [12, 24, 36]
     else:
-        NotImplementedError
+        raise NotImplementedError('unsupported output_stride: {}'.format(output_stride))
 
     return MSC(
         scale=DeepLabV3(
diff --git a/pywick/models/segmentation/testnets/deeplabv3_resnet.py b/pywick/models/segmentation/testnets/deeplabv3_resnet.py
index 5ff0a49..0bc0193 100644
--- a/pywick/models/segmentation/testnets/deeplabv3_resnet.py
+++ b/pywick/models/segmentation/testnets/deeplabv3_resnet.py
@@ -93,7 +93,9 @@ def _make_layer(self, block, planes, blocks, stride=1, rate=1):
 
         return nn.Sequential(*layers)
 
-    def _make_MG_unit(self, block, planes, blocks=[1,2,4], stride=1, rate=1):
+    def _make_MG_unit(self, block, planes, blocks=None, stride=1, rate=1):
+        if blocks is None:
+            blocks = [1,2,4]
         downsample = None
         if stride != 1 or self.inplanes != planes * block.expansion:
             downsample = nn.Sequential(
@@ -110,8 +112,8 @@ def _make_MG_unit(self, block, planes, blocks=[1,2,4], stride=1, rate=1):
 
         return nn.Sequential(*layers)
 
-    def forward(self, input):
-        x = self.conv1(input)
+    def forward(self, input_):
+        x = self.conv1(input_)
         x = self.bn1(x)
         x = self.relu(x)
         x = self.maxpool(x)
@@ -228,8 +230,8 @@ def __init__(self, num_classes=21, pretrained=False, nInputChannels=3, os=16, _p
                                          nn.ReLU(),
                                          nn.Conv2d(256, num_classes, kernel_size=1, stride=1))
 
-    def forward(self, input):
-        x, low_level_features = self.resnet_features(input)
+    def forward(self, input_):
+        x, low_level_features = self.resnet_features(input_)
         x1 = self.aspp1(x)
         x2 = self.aspp2(x)
         x3 = self.aspp3(x)
@@ -242,8 +244,8 @@ def forward(self, input):
         x = self.conv1(x)
         x = self.bn1(x)
         x = self.relu(x)
-        x = F.upsample(x, size=(int(math.ceil(input.size()[-2]/4)),
-                                int(math.ceil(input.size()[-1]/4))), mode='bilinear', align_corners=True)
+        x = F.interpolate(x, size=(int(math.ceil(input_.size()[-2] / 4)),
+                                   int(math.ceil(input_.size()[-1] / 4))), mode='bilinear', align_corners=True)
 
         low_level_features = self.conv2(low_level_features)
         low_level_features = self.bn2(low_level_features)
@@ -252,7 +254,7 @@ def forward(self, input):
 
         x = torch.cat((x, low_level_features), dim=1)
         x = self.last_linear(x)
-        x = F.upsample(x, size=input.size()[2:], mode='bilinear', align_corners=True)
+        x = F.interpolate(x, size=input_.size()[2:], mode='bilinear', align_corners=True)
 
         return x
 
diff --git a/pywick/models/segmentation/testnets/deeplabv3_xception.py b/pywick/models/segmentation/testnets/deeplabv3_xception.py
index 62d5207..0892db6 100644
--- a/pywick/models/segmentation/testnets/deeplabv3_xception.py
+++ b/pywick/models/segmentation/testnets/deeplabv3_xception.py
@@ -345,8 +345,8 @@ def __init__(self, num_classes=21, pretrained=False, nInputChannels=3, os=16, _p
                                          nn.ReLU(),
                                          nn.Conv2d(256, num_classes, kernel_size=1, stride=1))
 
-    def forward(self, input):
-        x, low_level_features = self.xception_features(input)
+    def forward(self, input_):
+        x, low_level_features = self.xception_features(input_)
         x1 = self.aspp1(x)
         x2 = self.aspp2(x)
         x3 = self.aspp3(x)
@@ -359,8 +359,8 @@ def forward(self, input):
         x = self.conv1(x)
         x = self.bn1(x)
         x = self.relu(x)
-        x = F.interpolate(x, size=(int(math.ceil(input.size()[-2]/4)),
-                                int(math.ceil(input.size()[-1]/4))), mode='bilinear', align_corners=True)
+        x = F.interpolate(x, size=(int(math.ceil(input_.size()[-2] / 4)),
+                                int(math.ceil(input_.size()[-1] / 4))), mode='bilinear', align_corners=True)
 
         low_level_features = self.conv2(low_level_features)
         low_level_features = self.bn2(low_level_features)
@@ -369,7 +369,7 @@ def forward(self, input):
 
         x = torch.cat((x, low_level_features), dim=1)
         x = self.last_linear(x)
-        x = F.interpolate(x, size=input.size()[2:], mode='bilinear', align_corners=True)
+        x = F.interpolate(x, size=input_.size()[2:], mode='bilinear', align_corners=True)
 
         return x
 
diff --git a/pywick/models/segmentation/testnets/difnet.py b/pywick/models/segmentation/testnets/difnet.py
index cd30fbe..3d0b494 100644
--- a/pywick/models/segmentation/testnets/difnet.py
+++ b/pywick/models/segmentation/testnets/difnet.py
@@ -205,7 +205,8 @@ def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
 
         return nn.Sequential(*layers)
 
-    def _make_pred_layer(self, block, dilation_series, padding_series, num_classes, inplane):
+    @staticmethod
+    def _make_pred_layer(block, dilation_series, padding_series, num_classes, inplane):
         return block(dilation_series, padding_series, num_classes, inplane)
 
     def forward(self, x):
@@ -261,8 +262,7 @@ def __init__(self, num_classes, layers, **kwargs):
             self.model_sed = ResNet(BasicBlock, [3, 2, 2, 2], num_classes)
             self.model_dif = ResNet(Bottleneck, [3, 4, 6, 3], num_classes, isseed=False)
         else:
-            print('unsupport layer number: {}'.format(layers))
-            exit()
+            raise ValueError('unsupport layer number: {}'.format(layers))  # invalid `layers` argument; ValueError is still caught by `except Exception`
         self.mask_layer = Mask(inplanes=num_classes)
 
     def get_alpha(self):
diff --git a/pywick/models/segmentation/testnets/dilated_resnet.py b/pywick/models/segmentation/testnets/dilated_resnet.py
index eccaea9..292bf96 100644
--- a/pywick/models/segmentation/testnets/dilated_resnet.py
+++ b/pywick/models/segmentation/testnets/dilated_resnet.py
@@ -79,8 +79,10 @@ def __init__(self, inplanes, planes, stride=1, dilation=1,
         self.dilation = dilation
         self.stride = stride
 
-    def _sum_each(self, x, y):
-        assert(len(x) == len(y))
+    @staticmethod
+    def _sum_each(x, y):
+        if len(x) != len(y):
+            raise AssertionError
         z = []
         for i in range(len(x)):
             z.append(x[i]+y[i])
@@ -175,7 +177,7 @@ def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=No
             )
 
         layers = []
-        if dilation == 1 or dilation == 2:
+        if dilation in (1, 2):
             layers.append(block(self.inplanes, planes, stride, dilation=1,
                                 downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         elif dilation == 4:
diff --git a/pywick/models/segmentation/testnets/drnet/backbone.py b/pywick/models/segmentation/testnets/drnet/backbone.py
index 4ce295b..b47badd 100644
--- a/pywick/models/segmentation/testnets/drnet/backbone.py
+++ b/pywick/models/segmentation/testnets/drnet/backbone.py
@@ -26,6 +26,7 @@ def conv1x1(in_planes, out_planes, stride=1):
     """1x1 convolution"""
     return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
 
+
 class Bottleneck(nn.Module):
     expansion = 4
 
diff --git a/pywick/models/segmentation/testnets/encnet.py b/pywick/models/segmentation/testnets/encnet.py
index 4bdcf8e..1f104f0 100644
--- a/pywick/models/segmentation/testnets/encnet.py
+++ b/pywick/models/segmentation/testnets/encnet.py
@@ -131,7 +131,8 @@ def reset_params(self):
 
     def forward(self, X):
         # input X is a 4D tensor
-        assert (X.size(1) == self.D)
+        if X.size(1) != self.D:
+            raise AssertionError
         B, D = X.size(0), self.D
         if X.dim() == 3:
             # BxDxN -> BxNxD
@@ -177,8 +178,8 @@ def __init__(self, dim, keep_dim=False):
         self.dim = dim
         self.keep_dim = keep_dim
 
-    def forward(self, input):
-        return input.mean(self.dim, self.keep_dim)
+    def forward(self, input_):
+        return input_.mean(self.dim, self.keep_dim)
 
 
 def get_encnet(num_classes=1, backbone='resnet50', pretrained=True, **kwargs):
diff --git a/pywick/models/segmentation/testnets/esp_net.py b/pywick/models/segmentation/testnets/esp_net.py
index a5a9ab1..967f94b 100644
--- a/pywick/models/segmentation/testnets/esp_net.py
+++ b/pywick/models/segmentation/testnets/esp_net.py
@@ -28,12 +28,12 @@ def __init__(self, nIn, nOut, kSize, stride=1):
         self.bn = nn.BatchNorm2d(nOut, eps=1e-03)
         self.act = nn.PReLU(nOut)
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
-        :param input: input feature map
+        :param input_: input feature map
         :return: transformed feature map
         '''
-        output = self.conv(input)
+        output = self.conv(input_)
         # output = self.conv1(output)
         output = self.bn(output)
         output = self.act(output)
@@ -53,12 +53,12 @@ def __init__(self, nOut):
         self.bn = nn.BatchNorm2d(nOut, eps=1e-03)
         self.act = nn.PReLU(nOut)
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
-        :param input: input feature map
+        :param input_: input feature map
         :return: normalized and thresholded feature map
         '''
-        output = self.bn(input)
+        output = self.bn(input_)
         output = self.act(output)
         return output
 
@@ -80,13 +80,13 @@ def __init__(self, nIn, nOut, kSize, stride=1):
         self.conv = nn.Conv2d(nIn, nOut, (kSize, kSize), stride=stride, padding=(padding, padding), bias=False)
         self.bn = nn.BatchNorm2d(nOut, eps=1e-03)
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
 
-        :param input: input feature map
+        :param input_: input feature map
         :return: transformed feature map
         '''
-        output = self.conv(input)
+        output = self.conv(input_)
         output = self.bn(output)
         return output
 
@@ -108,12 +108,12 @@ def __init__(self, nIn, nOut, kSize, stride=1):
         padding = int((kSize - 1) / 2)
         self.conv = nn.Conv2d(nIn, nOut, (kSize, kSize), stride=stride, padding=(padding, padding), bias=False)
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
-        :param input: input feature map
+        :param input_: input feature map
         :return: transformed feature map
         '''
-        output = self.conv(input)
+        output = self.conv(input_)
         return output
 
 
@@ -134,12 +134,12 @@ def __init__(self, nIn, nOut, kSize, stride=1, d=1):
         padding = int((kSize - 1) / 2) * d
         self.conv = nn.Conv2d(nIn, nOut, (kSize, kSize), stride=stride, padding=(padding, padding), bias=False, dilation=d)
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
-        :param input: input feature map
+        :param input_: input feature map
         :return: transformed feature map
         '''
-        output = self.conv(input)
+        output = self.conv(input_)
         return output
 
 
@@ -157,8 +157,8 @@ def __init__(self, nIn, nOut):
         self.bn = nn.BatchNorm2d(nOut, eps=1e-3)
         self.act = nn.PReLU(nOut)
 
-    def forward(self, input):
-        output1 = self.c1(input)
+    def forward(self, input_):
+        output1 = self.c1(input_)
         d1 = self.d1(output1)
         d2 = self.d2(output1)
         d4 = self.d4(output1)
@@ -203,13 +203,13 @@ def __init__(self, nIn, nOut, add=True):
         self.bn = BR(nOut)
         self.add = add
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
-        :param input: input feature map
+        :param input_: input feature map
         :return: transformed feature map
         '''
         # reduce
-        output1 = self.c1(input)
+        output1 = self.c1(input_)
         # split and transform
         d1 = self.d1(output1)
         d2 = self.d2(output1)
@@ -228,7 +228,7 @@ def forward(self, input):
 
         # if residual version
         if self.add:
-            combine = input + combine
+            combine = input_ + combine
         output = self.bn(combine)
         return output
 
@@ -250,14 +250,14 @@ def __init__(self, samplingTimes):
             # pyramid-based approach for down-sampling
             self.pool.append(nn.AvgPool2d(3, stride=2, padding=1))
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
-        :param input: Input RGB Image
+        :param input_: Input RGB Image
         :return: down-sampled image (pyramid-based approach)
         '''
         for pool in self.pool:
-            input = pool(input)
-        return input
+            input_ = pool(input_)
+        return input_
 
 
 class ESPNet_Encoder(nn.Module):
@@ -292,14 +292,14 @@ def __init__(self, classes=20, p=5, q=3):
 
         self.classifier = C(256, classes, 1, 1)
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
-        :param input: Receives the input RGB image
+        :param input_: Receives the input RGB image
         :return: the transformed feature map with spatial dimensions 1/8th of the input image
         '''
-        output0 = self.level1(input)
-        inp1 = self.sample1(input)
-        inp2 = self.sample2(input)
+        output0 = self.level1(input_)
+        inp1 = self.sample1(input_)
+        inp2 = self.sample2(input_)
 
         output0_cat = self.b1(torch.cat([output0, inp1], 1))
         output1_0 = self.level2_0(output0_cat)  # down-sampled
@@ -341,7 +341,7 @@ def __init__(self, num_classes=20, p=2, q=3, encoderFile=None):
         '''
         super().__init__()
         self.encoder = ESPNet_Encoder(num_classes, p, q)
-        if encoderFile != None:
+        if encoderFile is not None:
             self.encoder.load_state_dict(torch.load(encoderFile))
             print('Encoder loaded!')
         # load the encoder modules
@@ -361,14 +361,14 @@ def __init__(self, num_classes=20, p=2, q=3, encoderFile=None):
 
         self.classifier = nn.ConvTranspose2d(num_classes, num_classes, 2, stride=2, padding=0, output_padding=0, bias=False)
 
-    def forward(self, input):
+    def forward(self, input_):
         '''
-        :param input: RGB image
+        :param input_: RGB image
         :return: transformed feature map
         '''
-        output0 = self.modules[0](input)
-        inp1 = self.modules[1](input)
-        inp2 = self.modules[2](input)
+        output0 = self.modules[0](input_)
+        inp1 = self.modules[1](input_)
+        inp2 = self.modules[2](input_)
 
         output0_cat = self.modules[3](torch.cat([output0, inp1], 1))
         output1_0 = self.modules[4](output0_cat)  # down-sampled
diff --git a/pywick/models/segmentation/testnets/fc_densenet.py b/pywick/models/segmentation/testnets/fc_densenet.py
index 182e03b..c73d91f 100755
--- a/pywick/models/segmentation/testnets/fc_densenet.py
+++ b/pywick/models/segmentation/testnets/fc_densenet.py
@@ -43,20 +43,20 @@ def __init__(self,
         self.in_channels = in_channels
         self.out_channels = out_channels
 
-        if type(down_dense_growth_rates) == int:
+        if type(down_dense_growth_rates) is int:
             down_dense_growth_rates = (down_dense_growth_rates,) * 5
-        if down_dense_bottleneck_ratios is None or type(down_dense_bottleneck_ratios) == int:
+        if down_dense_bottleneck_ratios is None or type(down_dense_bottleneck_ratios) is int:
             down_dense_bottleneck_ratios = (down_dense_bottleneck_ratios,) * 5
-        if type(down_dense_num_layers) == int:
+        if type(down_dense_num_layers) is int:
             down_dense_num_layers = (down_dense_num_layers,) * 5
-        if type(down_transition_compression_factors) == float:
+        if type(down_transition_compression_factors) is float:
             down_transition_compression_factors = (down_transition_compression_factors,) * 5
 
-        if type(up_dense_growth_rates) == int:
+        if type(up_dense_growth_rates) is int:
             up_dense_growth_rates = (up_dense_growth_rates,) * 5
-        if up_dense_bottleneck_ratios is None or type(up_dense_bottleneck_ratios) == int:
+        if up_dense_bottleneck_ratios is None or type(up_dense_bottleneck_ratios) is int:
             up_dense_bottleneck_ratios = (up_dense_bottleneck_ratios,) * 5
-        if type(up_dense_num_layers) == int:
+        if type(up_dense_num_layers) is int:
             up_dense_num_layers = (up_dense_num_layers,) * 5
         # endregion
 
diff --git a/pywick/models/segmentation/testnets/flatten.py b/pywick/models/segmentation/testnets/flatten.py
index beea412..25a8ce6 100755
--- a/pywick/models/segmentation/testnets/flatten.py
+++ b/pywick/models/segmentation/testnets/flatten.py
@@ -2,5 +2,6 @@
 
 
 class Flatten(Module):
-    def forward(self, x):
+    @staticmethod
+    def forward(x):
         return x.view(x.size(0), -1)
diff --git a/pywick/models/segmentation/testnets/gscnn/SEresnext.py b/pywick/models/segmentation/testnets/gscnn/SEresnext.py
index b750987..21f4985 100755
--- a/pywick/models/segmentation/testnets/gscnn/SEresnext.py
+++ b/pywick/models/segmentation/testnets/gscnn/SEresnext.py
@@ -357,9 +357,9 @@ def forward(self, x):
 
 
 def initialize_pretrained_model(model, num_classes, settings):
-    assert num_classes == settings['num_classes'], \
-        'num_classes should be {}, but is {}'.format(
-            settings['num_classes'], num_classes)
+    if num_classes != settings['num_classes']:
+        raise AssertionError('num_classes should be {}, but is {}'.format(
+                settings['num_classes'], num_classes))
     weights = model_zoo.load_url(settings['url'])
     model.load_state_dict(weights)
     model.input_space = settings['input_space']
diff --git a/pywick/models/segmentation/testnets/gscnn/config.py b/pywick/models/segmentation/testnets/gscnn/config.py
index 49b8153..0971cde 100755
--- a/pywick/models/segmentation/testnets/gscnn/config.py
+++ b/pywick/models/segmentation/testnets/gscnn/config.py
@@ -30,26 +30,10 @@
 # --------------------------------------------------------
 """
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import copy
-import six
-import os.path as osp
-
-from ast import literal_eval
-import numpy as np
-import yaml
 import torch
-import torch.nn as nn
-from torch.nn import init
-
 
 from .utils.AttrDict import AttrDict
 
-
 __C = AttrDict()
 # Consumers can get config by:
 # from fast_rcnn_config import cfg
@@ -70,6 +54,7 @@
 __C.MODEL.BNFUNC = torch.nn.BatchNorm2d
 __C.MODEL.BIGMEMORY = False
 
+
 def assert_and_infer_cfg(args, make_immutable=True):
     """Call this function in your script after you have finished setting all cfg
     values that are necessary (e.g., merging a config from a file, merging
diff --git a/pywick/models/segmentation/testnets/gscnn/gscnn.py b/pywick/models/segmentation/testnets/gscnn/gscnn.py
index dda7195..808a0d7 100644
--- a/pywick/models/segmentation/testnets/gscnn/gscnn.py
+++ b/pywick/models/segmentation/testnets/gscnn/gscnn.py
@@ -71,7 +71,8 @@ def __init__(self, axis, offset):
         self.axis = axis
         self.offset = offset
 
-    def forward(self, x, ref):
+    @staticmethod
+    def forward(x, ref):
         """
 
         :param x: input layer
@@ -124,7 +125,9 @@ class _AtrousSpatialPyramidPoolingModule(nn.Module):
       Final 1x1 conv
     '''
 
-    def __init__(self, in_dim, reduction_dim=256, output_stride=16, rates=[6, 12, 18]):
+    def __init__(self, in_dim, reduction_dim=256, output_stride=16, rates=None):
+        if rates is None:
+            rates = [6, 12, 18]
         super(_AtrousSpatialPyramidPoolingModule, self).__init__()
 
         # Check if we are using distributed BN and use the nn from encoding.nn
@@ -160,7 +163,6 @@ def __init__(self, in_dim, reduction_dim=256, output_stride=16, rates=[6, 12, 18
         self.edge_conv = nn.Sequential(
             nn.Conv2d(1, reduction_dim, kernel_size=1, bias=False),
             Norm2d(reduction_dim), nn.ReLU(inplace=True))
-         
 
     def forward(self, x, edge):
         x_size = x.size()
@@ -196,7 +198,7 @@ class GSCNN(nn.Module):
                   (1024, 2048, 4096)]
     '''
 
-    def __init__(self, num_classes, trunk=None, is_cuda=True, aux=False, **_):
+    def __init__(self, num_classes, is_cuda=True, aux=False, **_):
         
         super(GSCNN, self).__init__()
         # self.criterion = criterion
@@ -219,7 +221,6 @@ def __init__(self, num_classes, trunk=None, is_cuda=True, aux=False, **_):
         self.pool2 = wide_resnet.pool2
         self.pool3 = wide_resnet.pool3
         self.interpolate = F.interpolate
-        del wide_resnet
 
         self.dsn1 = nn.Conv2d(64, 1, 1)
         self.dsn3 = nn.Conv2d(256, 1, 1)
@@ -258,7 +259,7 @@ def __init__(self, num_classes, trunk=None, is_cuda=True, aux=False, **_):
         self.sigmoid = nn.Sigmoid()
         initialize_weights(self.final_seg)
 
-    def forward(self, inp, gts=None):
+    def forward(self, inp):
 
         x_size = inp.size() 
 
diff --git a/pywick/models/segmentation/testnets/gscnn/my_functionals/DualTaskLoss.py b/pywick/models/segmentation/testnets/gscnn/my_functionals/DualTaskLoss.py
index 99ec850..b4c532f 100644
--- a/pywick/models/segmentation/testnets/gscnn/my_functionals/DualTaskLoss.py
+++ b/pywick/models/segmentation/testnets/gscnn/my_functionals/DualTaskLoss.py
@@ -35,17 +35,18 @@
 import numpy as np
 from .custom_functional import compute_grad_mag
 
-def perturbate_input_(input, n_elements=200):
-    N, C, H, W = input.shape
-    assert N == 1
+def perturbate_input_(input_, n_elements=200):
+    N, C, H, W = input_.shape
+    if N != 1:
+        raise AssertionError
     c_ = np.random.random_integers(0, C - 1, n_elements)
     h_ = np.random.random_integers(0, H - 1, n_elements)
     w_ = np.random.random_integers(0, W - 1, n_elements)
     for c_idx in c_:
         for h_idx in h_:
             for w_idx in w_:
-                input[0, c_idx, h_idx, w_idx] = 1
-    return input
+                input_[0, c_idx, h_idx, w_idx] = 1
+    return input_
 
 def _sample_gumbel(shape, eps=1e-10):
     """
@@ -67,7 +68,8 @@ def _gumbel_softmax_sample(logits, tau=1, eps=1e-10):
     https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb
     (MIT license)
     """
-    assert logits.dim() == 3
+    if logits.dim() != 3:
+        raise AssertionError
     gumbel_noise = _sample_gumbel(logits.size(), eps=eps)
     y = logits + gumbel_noise
     return F.softmax(y / tau, 1)
@@ -91,7 +93,6 @@ class DualTaskLoss(nn.Module):
     def __init__(self, cuda=False):
         super(DualTaskLoss, self).__init__()
         self._cuda = cuda
-        return
 
     def forward(self, input_logits, gts, ignore_pixel=255):
         """
diff --git a/pywick/models/segmentation/testnets/gscnn/my_functionals/GatedSpatialConv.py b/pywick/models/segmentation/testnets/gscnn/my_functionals/GatedSpatialConv.py
index a894561..6919b0c 100644
--- a/pywick/models/segmentation/testnets/gscnn/my_functionals/GatedSpatialConv.py
+++ b/pywick/models/segmentation/testnets/gscnn/my_functionals/GatedSpatialConv.py
@@ -64,8 +64,8 @@ def reset_parameters(self):
 
 
 class Conv2dPad(nn.Conv2d):
-    def forward(self, input):
-        return conv2d_same_myF(input,self.weight,self.groups)
+    def forward(self, input_):
+        return conv2d_same_myF(input_, self.weight, self.groups)
 
 class HighFrequencyGatedSpatialConv2d(_ConvNd):
     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
diff --git a/pywick/models/segmentation/testnets/gscnn/my_functionals/custom_functional.py b/pywick/models/segmentation/testnets/gscnn/my_functionals/custom_functional.py
index 7b4dad6..de71f54 100644
--- a/pywick/models/segmentation/testnets/gscnn/my_functionals/custom_functional.py
+++ b/pywick/models/segmentation/testnets/gscnn/my_functionals/custom_functional.py
@@ -19,8 +19,8 @@ def calc_pad_same(in_siz, out_siz, stride, ksize):
     return (out_siz - 1) * stride + ksize - in_siz
 
 
-def conv2d_same(input, kernel, groups,bias=None,stride=1,padding=0,dilation=1):
-    n, c, h, w = input.shape
+def conv2d_same(input_, kernel, groups, bias=None, stride=1, padding=0, dilation=1):
+    n, c, h, w = input_.shape
     kout, ki_c_g, kh, kw = kernel.shape
     pw = calc_pad_same(w, w, 1, kw)
     ph = calc_pad_same(h, h, 1, kh)
@@ -29,104 +29,64 @@ def conv2d_same(input, kernel, groups,bias=None,stride=1,padding=0,dilation=1):
     ph_t = ph // 2
     ph_b = ph - ph_t
 
-    input_ = F.pad(input, (pw_l, pw_r, ph_t, ph_b))
+    input_ = F.pad(input_, (pw_l, pw_r, ph_t, ph_b))
     result = F.conv2d(input_, kernel, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
-    assert result.shape == input.shape
+    if result.shape != (n, c, h, w):  # input_ now names the padded tensor, so compare against the pre-padding shape
+        raise AssertionError('conv2d_same must preserve the input shape')
     return result
 
 
-def gradient_central_diff(input, cuda):
-    return input, input
-    kernel = [[1, 0, -1]]
-    kernel_t = 0.5 * torch.Tensor(kernel) * -1.  # pytorch implements correlation instead of conv
-    if type(cuda) is int:
-        if cuda != -1:
-            kernel_t = kernel_t.cuda(device=cuda)
-    else:
-        if cuda is True:
-            kernel_t = kernel_t.cuda()
-    n, c, h, w = input.shape
-
-    x = conv2d_same(input, kernel_t.unsqueeze(0).unsqueeze(0).repeat([c, 1, 1, 1]), c)
-    y = conv2d_same(input, kernel_t.t().unsqueeze(0).unsqueeze(0).repeat([c, 1, 1, 1]), c)
-    return x, y
+def gradient_central_diff(input_):
+    return input_, input_
 
 
-def compute_single_sided_diferences(o_x, o_y, input):
+def compute_single_sided_diferences(o_x, o_y, input_):
     # n,c,h,w
     #input = input.clone()
-    o_y[:, :, 0, :] = input[:, :, 1, :].clone() - input[:, :, 0, :].clone()
-    o_x[:, :, :, 0] = input[:, :, :, 1].clone() - input[:, :, :, 0].clone()
+    o_y[:, :, 0, :] = input_[:, :, 1, :].clone() - input_[:, :, 0, :].clone()
+    o_x[:, :, :, 0] = input_[:, :, :, 1].clone() - input_[:, :, :, 0].clone()
     # --
-    o_y[:, :, -1, :] = input[:, :, -1, :].clone() - input[:, :, -2, :].clone()
-    o_x[:, :, :, -1] = input[:, :, :, -1].clone() - input[:, :, :, -2].clone()
+    o_y[:, :, -1, :] = input_[:, :, -1, :].clone() - input_[:, :, -2, :].clone()
+    o_x[:, :, :, -1] = input_[:, :, :, -1].clone() - input_[:, :, :, -2].clone()
     return o_x, o_y
 
 
-def numerical_gradients_2d(input, cuda=False):
+def numerical_gradients_2d(input_, cuda=False):
     """
     numerical gradients implementation over batches using torch group conv operator.
     the single sided differences are re-computed later.
     it matches np.gradient(image) with the difference than here output=x,y for an image while there output=y,x
-    :param input: N,C,H,W
+    :param input_: N,C,H,W
     :param cuda: whether or not use cuda
     :return: X,Y
     """
-    n, c, h, w = input.shape
-    assert h > 1 and w > 1
-    x, y = gradient_central_diff(input, cuda)
+    n, c, h, w = input_.shape
+    if h <= 1 or w <= 1:
+        raise AssertionError
+    x, y = gradient_central_diff(input_)
     return x, y
 
 
-def convTri(input, r, cuda=False):
+def convTri(input_, r, cuda=False):
     """
     Convolves an image by a 2D triangle filter (the 1D triangle filter f is
     [1:r r+1 r:-1:1]/(r+1)^2, the 2D version is simply conv2(f,f'))
-    :param input:
+    :param input_:
     :param r: integer filter radius
     :param cuda: move the kernel to gpu
     :return:
     """
     if (r <= 1):
         raise ValueError()
-    n, c, h, w = input.shape
-    return input
-    f = list(range(1, r + 1)) + [r + 1] + list(reversed(range(1, r + 1)))
-    kernel = torch.Tensor([f]) / (r + 1) ** 2
-    if type(cuda) is int:
-        if cuda != -1:
-            kernel = kernel.cuda(device=cuda)
-    else:
-        if cuda is True:
-            kernel = kernel.cuda()
-
-    # padding w
-    input_ = F.pad(input, (1, 1, 0, 0), mode='replicate')
-    input_ = F.pad(input_, (r, r, 0, 0), mode='reflect')
-    input_ = [input_[:, :, :, :r], input, input_[:, :, :, -r:]]
-    input_ = torch.cat(input_, 3)
-    t = input_
-
-    # padding h
-    input_ = F.pad(input_, (0, 0, 1, 1), mode='replicate')
-    input_ = F.pad(input_, (0, 0, r, r), mode='reflect')
-    input_ = [input_[:, :, :r, :], t, input_[:, :, -r:, :]]
-    input_ = torch.cat(input_, 2)
-
-    output = F.conv2d(input_,
-                      kernel.unsqueeze(0).unsqueeze(0).repeat([c, 1, 1, 1]),
-                      padding=0, groups=c)
-    output = F.conv2d(output,
-                      kernel.t().unsqueeze(0).unsqueeze(0).repeat([c, 1, 1, 1]),
-                      padding=0, groups=c)
-    return output
+    n, c, h, w = input_.shape
+    return input_
 
 
 def compute_normal(E, cuda=False):
     if torch.sum(torch.isnan(E)) != 0:
         print('nans found here')
-        import ipdb;
-        ipdb.set_trace()
+        # import ipdb;
+        # ipdb.set_trace()
     E_ = convTri(E, 4, cuda)
     Ox, Oy = numerical_gradients_2d(E_, cuda)
     Oxx, _ = numerical_gradients_2d(Ox, cuda)
@@ -138,8 +98,8 @@ def compute_normal(E, cuda=False):
 
     if torch.sum(torch.isnan(O)) != 0:
         print('nans found here')
-        import ipdb;
-        ipdb.set_trace()
+        # import ipdb;
+        # ipdb.set_trace()
 
     return O
 
@@ -147,8 +107,8 @@ def compute_normal(E, cuda=False):
 def compute_normal_2(E, cuda=False):
     if torch.sum(torch.isnan(E)) != 0:
         print('nans found here')
-        import ipdb;
-        ipdb.set_trace()
+        # import ipdb;
+        # ipdb.set_trace()
     E_ = convTri(E, 4, cuda)
     Ox, Oy = numerical_gradients_2d(E_, cuda)
     Oxx, _ = numerical_gradients_2d(Ox, cuda)
@@ -160,8 +120,8 @@ def compute_normal_2(E, cuda=False):
 
     if torch.sum(torch.isnan(O)) != 0:
         print('nans found here')
-        import ipdb;
-        ipdb.set_trace()
+        # import ipdb;
+        # ipdb.set_trace()
 
     return O, (Oyy, Oxx)
 
diff --git a/pywick/models/segmentation/testnets/gscnn/mynn.py b/pywick/models/segmentation/testnets/gscnn/mynn.py
index fa0bce0..80bc1fb 100755
--- a/pywick/models/segmentation/testnets/gscnn/mynn.py
+++ b/pywick/models/segmentation/testnets/gscnn/mynn.py
@@ -11,7 +11,7 @@ def Norm2d(in_channels):
     """
     Custom Norm Function to allow flexible switching
     """
-    layer = getattr(cfg.MODEL, 'BNFUNC')
+    layer = cfg.MODEL.BNFUNC
     normalizationLayer = layer(in_channels)
     return normalizationLayer
 
@@ -19,7 +19,7 @@ def Norm2d(in_channels):
 def initialize_weights(*models):
    for model in models:
         for module in model.modules():
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.Linear)):  # isinstance(obj, classinfo): the tuple matches either layer type
                 nn.init.kaiming_normal(module.weight)
                 if module.bias is not None:
                     module.bias.data.zero_()
diff --git a/pywick/models/segmentation/testnets/gscnn/wider_resnet.py b/pywick/models/segmentation/testnets/gscnn/wider_resnet.py
index e7441f2..33a06ef 100755
--- a/pywick/models/segmentation/testnets/gscnn/wider_resnet.py
+++ b/pywick/models/segmentation/testnets/gscnn/wider_resnet.py
@@ -48,11 +48,8 @@ def bnrelu(channels):
 
 class GlobalAvgPool2d(nn.Module):
 
-    def __init__(self):
-        """Global average pooling over the input's spatial dimensions"""
-        super(GlobalAvgPool2d, self).__init__()
-
-    def forward(self, inputs):
+    @staticmethod
+    def forward(inputs):
         in_size = inputs.size()
         return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
 
@@ -104,7 +101,7 @@ def __init__(self,
 
 
         # Check parameters for inconsistencies
-        if len(channels) != 2 and len(channels) != 3:
+        if len(channels) not in (2, 3):
             raise ValueError("channels must contain either two or three values")
         if len(channels) == 2 and groups != 1:
             raise ValueError("groups > 1 are only valid if len(channels) == 3")
@@ -176,8 +173,6 @@ def forward(self, x):
         return out
 
 
-
-
 class WiderResNet(nn.Module):
 
     def __init__(self,
@@ -194,7 +189,7 @@ def __init__(self,
         norm_act : callable
             Function to create normalization / activation Module.
         classes : int
-            If not `0` also include global average pooling and \
+            If not `0` also include global average pooling and
             a fully-connected layer with `classes` outputs at the end
             of the network.
         """
@@ -266,7 +261,7 @@ def __init__(self,
                  **_):
         """Wider ResNet with pre-activation (identity mapping) blocks
 
-        This variant uses down-sampling by max-pooling in the first two blocks and \
+        This variant uses down-sampling by max-pooling in the first two blocks and
          by strided convolution in the others.
 
         Parameters
@@ -277,11 +272,10 @@ def __init__(self,
             Function to create normalization / activation Module.
         classes : int
             If not `0` also include global average pooling and a fully-connected layer
-            \with `classes` outputs at the end
-            of the network.
+            with `classes` outputs at the end of the network.
         dilation : bool
             If `True` apply dilation to the last three modules and change the
-            \down-sampling factor from 32 to 8.
+            down-sampling factor from 32 to 8.
         """
         super(WiderResNetA2, self).__init__()
         self.dist_bn = dist_bn
diff --git a/pywick/models/segmentation/testnets/hrnetv2/hrnet.py b/pywick/models/segmentation/testnets/hrnetv2/hrnet.py
index bde087d..04f76a1 100644
--- a/pywick/models/segmentation/testnets/hrnetv2/hrnet.py
+++ b/pywick/models/segmentation/testnets/hrnetv2/hrnet.py
@@ -153,7 +153,8 @@ def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
         self.fuse_layers = self._make_fuse_layers()
         self.relu = nn.ReLU(inplace=True)
 
-    def _check_branches(self, num_branches, blocks, num_blocks,
+    @staticmethod
+    def _check_branches(num_branches, blocks, num_blocks,
                         num_inchannels, num_channels):
         if num_branches != len(num_blocks):
             error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
diff --git a/pywick/models/segmentation/testnets/hrnetv2/hrnet_config.py b/pywick/models/segmentation/testnets/hrnetv2/hrnet_config.py
index 6edfcc5..0f7d88f 100644
--- a/pywick/models/segmentation/testnets/hrnetv2/hrnet_config.py
+++ b/pywick/models/segmentation/testnets/hrnetv2/hrnet_config.py
@@ -5,10 +5,6 @@
 # Modified by Ke Sun (sunk@mail.ustc.edu.cn), Rainbowsecret (yuyua@microsoft.com)
 # ------------------------------------------------------------------------------
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 from yacs.config import CfgNode as CN
 
 
diff --git a/pywick/models/segmentation/testnets/hrnetv2/seg_hrnet_ocr.py b/pywick/models/segmentation/testnets/hrnetv2/seg_hrnet_ocr.py
index b4bbf41..b2f3ebe 100644
--- a/pywick/models/segmentation/testnets/hrnetv2/seg_hrnet_ocr.py
+++ b/pywick/models/segmentation/testnets/hrnetv2/seg_hrnet_ocr.py
@@ -3,14 +3,8 @@
 # Licensed under the MIT License.
 # Written by Ke Sun (sunk@mail.ustc.edu.cn), Jingyi Xie (hsfzxjy@gmail.com)
 # ------------------------------------------------------------------------------
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import os
 import logging
-import functools
 
 import numpy as np
 
@@ -279,7 +273,8 @@ def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
         self.fuse_layers = self._make_fuse_layers()
         self.relu = nn.ReLU(inplace=relu_inplace)
 
-    def _check_branches(self, num_branches, blocks, num_blocks,
+    @staticmethod
+    def _check_branches(num_branches, blocks, num_blocks,
                         num_inchannels, num_channels):
         if num_branches != len(num_blocks):
             error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
@@ -496,8 +491,9 @@ def __init__(self, config, **kwargs):
                       kernel_size=1, stride=1, padding=0, bias=True)
         )
 
+    @staticmethod
     def _make_transition_layer(
-            self, num_channels_pre_layer, num_channels_cur_layer):
+            num_channels_pre_layer, num_channels_cur_layer):
         num_branches_cur = len(num_channels_cur_layer)
         num_branches_pre = len(num_channels_pre_layer)
 
@@ -532,7 +528,8 @@ def _make_transition_layer(
 
         return nn.ModuleList(transition_layers)
 
-    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
+    @staticmethod
+    def _make_layer(block, inplanes, planes, blocks, stride=1):
         downsample = None
         if stride != 1 or inplanes != planes * block.expansion:
             downsample = nn.Sequential(
@@ -549,7 +546,8 @@ def _make_layer(self, block, inplanes, planes, blocks, stride=1):
 
         return nn.Sequential(*layers)
 
-    def _make_stage(self, layer_config, num_inchannels,
+    @staticmethod
+    def _make_stage(layer_config, num_inchannels,
                     multi_scale_output=True):
         num_modules = layer_config['NUM_MODULES']
         num_branches = layer_config['NUM_BRANCHES']
diff --git a/pywick/models/segmentation/testnets/lg_kernel_exfuse/deeplab_resnet.py b/pywick/models/segmentation/testnets/lg_kernel_exfuse/deeplab_resnet.py
index 61bceb7..ed68154 100755
--- a/pywick/models/segmentation/testnets/lg_kernel_exfuse/deeplab_resnet.py
+++ b/pywick/models/segmentation/testnets/lg_kernel_exfuse/deeplab_resnet.py
@@ -78,7 +78,8 @@ def __init__(self, orig_resnet, dilate_scale=8, **kwargs):
         self.layer3 = orig_resnet.layer3
         self.layer4 = orig_resnet.layer4
 
-    def _nostride_dilate(self, m, dilate):
+    @staticmethod
+    def _nostride_dilate(m, dilate):
         classname = m.__class__.__name__
         if classname.find('Conv') != -1:
             # the convolution with stride
@@ -386,7 +387,8 @@ def conv3x3_bn_relu(in_planes, out_planes, stride=1):
 # this is used to build the different models, both encoder and decoder
 class ModelBuilder():
     # custom weights initialization
-    def weights_init(self, m):
+    @staticmethod
+    def weights_init(m):
         classname = m.__class__.__name__
         if classname.find('Conv') != -1:
             nn.init.kaiming_normal(m.weight.data)
@@ -396,23 +398,16 @@ def weights_init(self, m):
         elif classname.find('Linear') != -1:
             m.weight.data.normal_(0.0, 0.0001)
 
-    def build_encoder(self, arch='resnet50_dilated8', fc_dim=512, weights='', **kwargs):
-        pretrained = True if len(weights) == 0 else False
+    @staticmethod
+    def build_encoder(arch='resnet50_dilated8', fc_dim=512, weights='', **kwargs):
+        pretrained = len(weights) == 0
         if arch == 'resnet34':
             raise NotImplementedError
-            orig_resnet = resnet.__dict__['resnet34'](pretrained=pretrained)
-            net_encoder = Resnet(orig_resnet)
-        elif arch == 'resnet34_dilated8':
+        if arch == 'resnet34_dilated8':
             raise NotImplementedError
-            orig_resnet = resnet.__dict__['resnet34'](pretrained=pretrained)
-            net_encoder = ResnetDilated(orig_resnet,
-                                        dilate_scale=8)
-        elif arch == 'resnet34_dilated16':
+        if arch == 'resnet34_dilated16':
             raise NotImplementedError
-            orig_resnet = resnet.__dict__['resnet34'](pretrained=pretrained)
-            net_encoder = ResnetDilated(orig_resnet,
-                                        dilate_scale=16)
-        elif arch == 'resnet50':
+        if arch == 'resnet50':
             orig_resnet = resnet.resnet50(**kwargs)
             net_encoder = Resnet(orig_resnet)
         elif arch == 'resnet50_dilated8':
@@ -497,20 +492,18 @@ def __init__(self, inplanes, outplanes, kernel_size, stride, padding, dilation,
 		self.add_module("bn", nn.BatchNorm2d(num_features=outplanes, momentum=0.999, affine=True))
 		if relu:
 			self.add_module("relu", nn.ReLU())
-	def forward(self, x):
-		return super(_ConvBatchNormReluBlock, self).forward(x)
 
 class _ResidualBlockMulGrid(nn.Sequential):
 	"""
 		Residual Block with multi-grid , note: best model-> (1, 2, 1)
 	"""
-	def __init__(self, layers, inplanes, midplanes, outplanes, stride, dilation, mulgrid=[1,2,1]):
-		super(_ResidualBlockMulGrid, self).__init__()
-		self.add_module("block1", _Bottleneck(inplanes, midplanes, outplanes, stride, dilation * mulgrid[0], True))
-		self.add_module("block2", _Bottleneck(outplanes, midplanes, outplanes, stride, dilation * mulgrid[1], False))
-		self.add_module("block3", _Bottleneck(outplanes, midplanes, outplanes, stride, dilation * mulgrid[2], False))
-	def forward(self, x):
-		return super(_ResidualBlockMulGrid, self).forward(x)
+	def __init__(self, layers, inplanes, midplanes, outplanes, stride, dilation, mulgrid=None):
+		if mulgrid is None:
+			mulgrid = [1,2,1]
+		super(_ResidualBlockMulGrid, self).__init__()
+		self.add_module("block1", _Bottleneck(inplanes, midplanes, outplanes, stride, dilation * mulgrid[0], True))
+		self.add_module("block2", _Bottleneck(outplanes, midplanes, outplanes, stride, dilation * mulgrid[1], False))
+		self.add_module("block3", _Bottleneck(outplanes, midplanes, outplanes, stride, dilation * mulgrid[2], False))
 
 class _Bottleneck(nn.Sequential):
 	def __init__(self, inplanes, midplanes, outplanes, stride, dilation, downsample):
diff --git a/pywick/models/segmentation/testnets/lg_kernel_exfuse/seg_resnext.py b/pywick/models/segmentation/testnets/lg_kernel_exfuse/seg_resnext.py
index 68b2708..e917b8d 100755
--- a/pywick/models/segmentation/testnets/lg_kernel_exfuse/seg_resnext.py
+++ b/pywick/models/segmentation/testnets/lg_kernel_exfuse/seg_resnext.py
@@ -11,7 +11,7 @@
     from urllib.request import urlretrieve
 
 
-__all__ = ['ResNeXt', 'resnext101'] # support resnext 101
+__all__ = ['ResNeXt', 'resnext101']  # support resnext 101
 
 # can not used for now
 model_urls = {
diff --git a/pywick/models/segmentation/testnets/mixnet/layers.py b/pywick/models/segmentation/testnets/mixnet/layers.py
index 3df3e2c..b0b291b 100755
--- a/pywick/models/segmentation/testnets/mixnet/layers.py
+++ b/pywick/models/segmentation/testnets/mixnet/layers.py
@@ -3,12 +3,14 @@
 
 
 class Swish(nn.Module):
-    def forward(self, x):
+    @staticmethod
+    def forward(x):
         return x * torch.sigmoid(x)
 
 
 class Flatten(nn.Module):
-    def forward(self, x):
+    @staticmethod
+    def forward(x):
         return x.view(x.shape[0], -1)
 
 
diff --git a/pywick/models/segmentation/testnets/mixnet/utils.py b/pywick/models/segmentation/testnets/mixnet/utils.py
index 5dc4a44..fb52a3b 100755
--- a/pywick/models/segmentation/testnets/mixnet/utils.py
+++ b/pywick/models/segmentation/testnets/mixnet/utils.py
@@ -43,7 +43,8 @@ def _decode_block_string(block_string, depth_multiplier, depth_divisor, min_dept
         Raises:
         ValueError: if the strides option is not correctly specified.
         """
-        assert isinstance(block_string, str)
+        if not isinstance(block_string, str):
+            raise AssertionError('block_string must be a str, got {}'.format(type(block_string).__name__))
 
         ops = block_string.split('_')
         options = {}
@@ -113,7 +114,8 @@ def decode(string_list, depth_multiplier, depth_divisor, min_depth):
         Returns:
         A list of namedtuples to represent Mixnet blocks arguments.
         """
-        assert isinstance(string_list, list)
+        if not isinstance(string_list, list):
+            raise AssertionError('string_list must be a list, got {}'.format(type(string_list).__name__))
         blocks_args = []
         for block_string in string_list:
             blocks_args.append(MixnetDecoder._decode_block_string(block_string, depth_multiplier, depth_divisor, min_depth))
diff --git a/pywick/models/segmentation/testnets/msc.py b/pywick/models/segmentation/testnets/msc.py
index e4c2b4a..d30b038 100644
--- a/pywick/models/segmentation/testnets/msc.py
+++ b/pywick/models/segmentation/testnets/msc.py
@@ -15,7 +15,9 @@
 class MSC(nn.Module):
     """Multi-scale inputs"""
 
-    def __init__(self, scale, pyramids=[0.5, 0.75]):
+    def __init__(self, scale, pyramids=None):
+        if pyramids is None:
+            pyramids = [0.5, 0.75]
         super(MSC, self).__init__()
         self.scale = scale
         self.pyramids = pyramids
diff --git a/pywick/models/segmentation/testnets/psanet.py b/pywick/models/segmentation/testnets/psanet.py
index 020ca35..d8f9a02 100644
--- a/pywick/models/segmentation/testnets/psanet.py
+++ b/pywick/models/segmentation/testnets/psanet.py
@@ -41,7 +41,7 @@ def __init__(self, num_classes, pretrained=True, backbone='resnet101', aux=False
     def forward(self, x):
         size = x.size()[2:]
         _, _, c3, c4 = self.base_forward(x)
-        outputs = list()
+        outputs = []
         x = self.head(c4)
         x = F.interpolate(x, size, mode='bilinear', align_corners=True)
         outputs.append(x)
diff --git a/pywick/models/segmentation/testnets/psp_saeed.py b/pywick/models/segmentation/testnets/psp_saeed.py
index 03699b2..578b4fb 100644
--- a/pywick/models/segmentation/testnets/psp_saeed.py
+++ b/pywick/models/segmentation/testnets/psp_saeed.py
@@ -13,7 +13,7 @@ def initialize_weights(method='kaiming', *models):
     for model in models:
         for module in model.modules():
 
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.ConvTranspose2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.ConvTranspose2d, nn.Linear)):
                 if method == 'kaiming':
                     init.kaiming_normal_(module.weight.data, np.sqrt(2.0))
                 elif method == 'xavier':
diff --git a/pywick/models/segmentation/testnets/resnet.py b/pywick/models/segmentation/testnets/resnet.py
index 81a5ec4..2e14bfe 100644
--- a/pywick/models/segmentation/testnets/resnet.py
+++ b/pywick/models/segmentation/testnets/resnet.py
@@ -47,9 +47,6 @@ def __init__(
         if relu:
             self.add_module("relu", nn.ReLU())
 
-    def forward(self, x):
-        return super(_ConvBatchNormReLU, self).forward(x)
-
 
 class _Bottleneck(nn.Sequential):
     """Bottleneck Unit"""
@@ -100,9 +97,10 @@ def __init__(
         if mg is None:
             mg = [1 for _ in range(n_layers)]
         else:
-            assert n_layers == len(mg), "{} values expected, but got: mg={}".format(
-                n_layers, mg
-            )
+            if n_layers != len(mg):
+                raise AssertionError("{} values expected, but got: mg={}".format(
+                    n_layers, mg
+                ))
 
         self.add_module(
             "block1",
diff --git a/pywick/models/segmentation/testnets/tiramisu_test.py b/pywick/models/segmentation/testnets/tiramisu_test.py
index 6d55a01..6659913 100644
--- a/pywick/models/segmentation/testnets/tiramisu_test.py
+++ b/pywick/models/segmentation/testnets/tiramisu_test.py
@@ -13,9 +13,6 @@ def __init__(self, in_channels, growth_rate):
                                           stride=1, padding=1, bias=True))
         self.add_module('drop', nn.Dropout2d(0.2))
 
-    def forward(self, x):
-        return super().forward(x)
-
 
 class DenseBlock(nn.Module):
     def __init__(self, in_channels, growth_rate, n_layers, upsample=False):
@@ -53,9 +50,6 @@ def __init__(self, in_channels):
         self.add_module('drop', nn.Dropout2d(0.2))
         self.add_module('maxpool', nn.MaxPool2d(2))
 
-    def forward(self, x):
-        return super().forward(x)
-
 
 class TransitionUp(nn.Module):
     def __init__(self, in_channels, out_channels):
@@ -77,9 +71,6 @@ def __init__(self, in_channels, growth_rate, n_layers):
         self.add_module('bottleneck', DenseBlock(
             in_channels, growth_rate, n_layers, upsample=True))
 
-    def forward(self, x):
-        return super().forward(x)
-
 
 def center_crop(layer, max_height, max_width):
     _, _, h, w = layer.size()
diff --git a/pywick/models/segmentation/testnets/tkcnet/base.py b/pywick/models/segmentation/testnets/tkcnet/base.py
index f5fcc7b..c7c1aae 100644
--- a/pywick/models/segmentation/testnets/tkcnet/base.py
+++ b/pywick/models/segmentation/testnets/tkcnet/base.py
@@ -17,9 +17,13 @@
 
 class BaseNet(nn.Module):
     def __init__(self, nclass, backbone, aux, se_loss, dilated=True, norm_layer=None,
-                 base_size=576, crop_size=608, mean=[.485, .456, .406],
-                 std=[.229, .224, .225], root='./pretrain_models',
+                 base_size=576, crop_size=608, mean=None,
+                 std=None, root='./pretrain_models',
                  multi_grid=False, multi_dilation=None, **kwargs):
+        if mean is None:
+            mean = [.485, .456, .406]
+        if std is None:
+            std = [.229, .224, .225]
         super(BaseNet, self).__init__()
         self.nclass = nclass
         self.aux = aux
@@ -66,7 +70,8 @@ def resize_image(img, h, w, **up_kwargs):
 
 def pad_image(img, mean, std, crop_size):
     b,c,h,w = img.size()
-    assert(c==3)
+    if c != 3:
+        raise AssertionError('pad_image expects a 3-channel image, got {} channels'.format(c))
     padh = crop_size - h if h < crop_size else 0
     padw = crop_size - w if w < crop_size else 0
     pad_values = -np.array(mean) / np.array(std)
@@ -74,14 +79,16 @@ def pad_image(img, mean, std, crop_size):
     for i in range(c):
         # note that pytorch pad params is in reversed orders
         img_pad[:,i,:,:] = F.pad(img[:,i,:,:], (0, padw, 0, padh), value=pad_values[i])
-    assert(img_pad.size(2)>=crop_size and img_pad.size(3)>=crop_size)
+    if not (img_pad.size(2)>=crop_size and img_pad.size(3)>=crop_size):
+        raise AssertionError('padded image {}x{} is smaller than crop_size={}'.format(img_pad.size(2), img_pad.size(3), crop_size))
     return img_pad
 
 def crop_image(img, h0, h1, w0, w1):
     return img[:,:,h0:h1,w0:w1]
 
 def flip_image(img):
-    assert(img.dim()==4)
+    if img.dim() != 4:
+        raise AssertionError('flip_image expects a 4-D tensor, got {}-D'.format(img.dim()))
     with torch.cuda.device_of(img):
         idx = torch.arange(img.size(3)-1, -1, -1).type_as(img).long()
     return img.index_select(3, idx)
diff --git a/pywick/models/segmentation/testnets/tkcnet/model_store.py b/pywick/models/segmentation/testnets/tkcnet/model_store.py
index b051c52..4faa9aa 100644
--- a/pywick/models/segmentation/testnets/tkcnet/model_store.py
+++ b/pywick/models/segmentation/testnets/tkcnet/model_store.py
@@ -1,5 +1,4 @@
 """Model store which provides pretrained models."""
-from __future__ import print_function
 __all__ = ['get_model_file', 'purge']
 import os
 import zipfile
diff --git a/pywick/models/segmentation/testnets/tkcnet/resnet.py b/pywick/models/segmentation/testnets/tkcnet/resnet.py
index d5428bc..121ed66 100644
--- a/pywick/models/segmentation/testnets/tkcnet/resnet.py
+++ b/pywick/models/segmentation/testnets/tkcnet/resnet.py
@@ -79,8 +79,10 @@ def __init__(self, inplanes, planes, stride=1, dilation=1,
         self.dilation = dilation
         self.stride = stride
 
-    def _sum_each(self, x, y):
-        assert(len(x) == len(y))
+    @staticmethod
+    def _sum_each(x, y):
+        if len(x) != len(y):
+            raise AssertionError('length mismatch: len(x)={}, len(y)={}'.format(len(x), len(y)))
         z = []
         for i in range(len(x)):
             z.append(x[i]+y[i])
@@ -176,7 +178,7 @@ def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=No
             )
 
         layers = []
-        if dilation == 1 or dilation == 2:
+        if dilation in (1, 2):
             layers.append(block(self.inplanes, planes, stride, dilation=1,
                             downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         elif dilation == 4:
diff --git a/pywick/models/segmentation/testnets/tkcnet/tkcnet.py b/pywick/models/segmentation/testnets/tkcnet/tkcnet.py
index eac2136..5f5e619 100644
--- a/pywick/models/segmentation/testnets/tkcnet/tkcnet.py
+++ b/pywick/models/segmentation/testnets/tkcnet/tkcnet.py
@@ -5,7 +5,6 @@
 # Email: wutianyi@ict.ac.cn
 # Copyright (c) 2018
 ###########################################################################
-from __future__ import division
 import torch
 import torch.nn as nn
 from torch.nn.functional import upsample
@@ -14,6 +13,7 @@
 
 __all__ = ['TKCNet', 'get_tkcnet', 'TKCNet_Resnet101']
 
+
 class TKCNet(BaseNet):
     """Tree-structured Kronecker Convolutional Networks for Semantic Segmentation, 
       Note that:
@@ -69,7 +69,8 @@ def __init__(self, in_channels, out_channels, norm_layer, r1, r2):
                                         nn.ReLU())
         self.conv6 = nn.Sequential(nn.Dropout2d(0.1, False), nn.Conv2d(inter_channels, out_channels, 1))
     
-    def _make_level(self, inChannel, outChannel, r1, r2, norm_layer):
+    @staticmethod
+    def _make_level(inChannel, outChannel, r1, r2, norm_layer):
         avg_agg = nn.AvgPool2d(r2, stride =1, padding= r2 // 2)
         conv = nn.Sequential( nn.Conv2d(inChannel, outChannel, kernel_size= 3, stride= 1, padding = r1, dilation = r1 ),
                               norm_layer(outChannel),
diff --git a/pywick/models/segmentation/tiramisu.py b/pywick/models/segmentation/tiramisu.py
index b7e05dc..be8a403 100644
--- a/pywick/models/segmentation/tiramisu.py
+++ b/pywick/models/segmentation/tiramisu.py
@@ -17,9 +17,6 @@ def __init__(self, in_channels, growth_rate):
         self.add_module('conv', nn.Conv2d(in_channels, growth_rate, kernel_size=3, stride=1, padding=1, bias=True))
         self.add_module('drop', nn.Dropout2d(0.2))
 
-    def forward(self, x):
-        return super().forward(x)
-
 
 class DenseBlock(nn.Module):
     def __init__(self, in_channels, growth_rate, n_layers, upsample=False):
@@ -55,9 +52,6 @@ def __init__(self, in_channels):
         self.add_module('drop', nn.Dropout2d(0.2))
         self.add_module('maxpool', nn.MaxPool2d(2))
 
-    def forward(self, x):
-        return super().forward(x)
-
 
 class TransitionUp(nn.Module):
     def __init__(self, in_channels, out_channels):
@@ -76,9 +70,6 @@ def __init__(self, in_channels, growth_rate, n_layers):
         super().__init__()
         self.add_module('bottleneck', DenseBlock(in_channels, growth_rate, n_layers, upsample=True))
 
-    def forward(self, x):
-        return super().forward(x)
-
 
 def center_crop(layer, max_height, max_width):
     _, _, h, w = layer.size()
diff --git a/pywick/models/segmentation/unet_dilated.py b/pywick/models/segmentation/unet_dilated.py
index 97682c0..3d1cbd2 100644
--- a/pywick/models/segmentation/unet_dilated.py
+++ b/pywick/models/segmentation/unet_dilated.py
@@ -117,7 +117,7 @@ class UNetDilated(nn.Module):
     """
     Unet utilizing dilation
     """
-    def __init__(self, num_classes, **kwargs):
+    def __init__(self, num_classes, **_):
         super(UNetDilated, self).__init__()
         self.Conv0 = self._transition(3, 8)  # 1918
         self.down1 = self._down_block(8, 16, 16)  # 959
@@ -143,7 +143,7 @@ def __init__(self, num_classes, **kwargs):
         self.conv3 = nn.Conv2d(32 * 2, 32, 3, stride=1, padding=1)
         self.bn3 = nn.BatchNorm2d(32)
 
-        self.up4 = self._up_block(32, 16, 16) #, output_padding=(1, 0))  # 959
+        self.up4 = self._up_block(32, 16, 16)       # ,output_padding=(1, 0))  # 959
         self.db4 = self._dense_block(16, 8)
         self.conv4 = nn.Conv2d(16 * 2, 16, 3, stride=1, padding=1)
         self.bn4 = nn.BatchNorm2d(16)
@@ -172,16 +172,12 @@ def forward(self, x):
         # down4.data.shape              =>  torch.Size([2, 96, 64, 43])
 
         up1 = self.act(self.bn1(self.conv1(torch.cat([self.db1(self.up1(down5)), down4], dim=1))))
-        del down5, down4
 
         up2 = self.act(self.bn2(self.conv2(torch.cat([self.db2(self.up2(up1)), down3], dim=1))))
-        del down3
 
         up3 = self.act(self.bn3(self.conv3(torch.cat([self.db3(self.up3(up2)), down2], dim=1))))
-        del down2
 
         up4 = self.act(self.bn4(self.conv4(torch.cat([self.db4(self.up4(up3)), down1], dim=1))))
-        del down1
 
         up5 = self.up5(up4)
         # up5=self.conv5(up5)
@@ -189,23 +185,26 @@ def forward(self, x):
         # return self.clss(self.conv5(up5))
         return self.conv5(up5)
 
-
-    def _transition(self, in_channels, out_channels):
+    @staticmethod
+    def _transition(in_channels, out_channels):
         layers = []
         layers.append(Conv_transition([1, 3, 5], in_channels, out_channels))
         return nn.Sequential(*layers)
 
-    def _down_block(self, in_channels, inner_channels, out_channels):
+    @staticmethod
+    def _down_block(in_channels, inner_channels, out_channels):
         layers = []
         layers.append(Fire_Down(3, in_channels, inner_channels, out_channels))
         return nn.Sequential(*layers)
 
-    def _up_block(self, in_channels, inner_channels, out_channels, output_padding=(1, 1)):
+    @staticmethod
+    def _up_block(in_channels, inner_channels, out_channels, output_padding=(1, 1)):
         layers = []
         layers.append(Fire_Up(3, in_channels, inner_channels, out_channels, output_padding))
         return nn.Sequential(*layers)
 
-    def _dense_block(self, in_channels, growth_rate):
+    @staticmethod
+    def _dense_block(in_channels, growth_rate):
         layers = []
         layers.append(Dense_layer(in_channels, growth_rate))
-        return nn.Sequential(*layers)
\ No newline at end of file
+        return nn.Sequential(*layers)
diff --git a/pywick/models/segmentation/unet_res.py b/pywick/models/segmentation/unet_res.py
index 2a966e2..9db427f 100644
--- a/pywick/models/segmentation/unet_res.py
+++ b/pywick/models/segmentation/unet_res.py
@@ -16,7 +16,7 @@ def initialize_weights(method='kaiming', *models):
     for model in models:
         for module in model.modules():
 
-            if isinstance(module, nn.Conv2d) or isinstance(module, nn.ConvTranspose2d) or isinstance(module, nn.Linear):
+            if isinstance(module, (nn.Conv2d, nn.ConvTranspose2d, nn.Linear)):
                 if method == 'kaiming':
                     init.kaiming_normal_(module.weight.data, np.sqrt(2.0))
                 elif method == 'xavier':
diff --git a/pywick/models/segmentation/unet_stack.py b/pywick/models/segmentation/unet_stack.py
index ae8b37a..71030d4 100644
--- a/pywick/models/segmentation/unet_stack.py
+++ b/pywick/models/segmentation/unet_stack.py
@@ -88,7 +88,9 @@ def forward(self, inputs_, down):
 
 
 class UNet_stack(nn.Module):
-    def get_n_stacks(self, input_size, **kwargs):
+
+    @staticmethod
+    def get_n_stacks(input_size, **_):
         n_stacks = 0
         width, height = input_size, input_size
         while width % 2 == 0 and height % 2 == 0:
@@ -98,7 +100,7 @@ def get_n_stacks(self, input_size, **kwargs):
 
         return n_stacks
 
-    def __init__(self, input_size=512, filters=12, kernel_size=3, max_stacks=6, **kwargs):
+    def __init__(self, input_size=512, filters=12, kernel_size=3, max_stacks=6, **_):
         super(UNet_stack, self).__init__()
         self.n_stacks = min(self.get_n_stacks((input_size, input_size)), max_stacks)
 
@@ -155,7 +157,7 @@ def forward(self, inputs_):
 
 
 class UNet960(nn.Module):
-    def __init__(self, filters=12, kernel_size=3, **kwargs):
+    def __init__(self, filters=12, kernel_size=3, **_):
         super(UNet960, self).__init__()
 
         # 960
diff --git a/pywick/models/segmentation/upernet.py b/pywick/models/segmentation/upernet.py
index b9d6f94..e568ee4 100644
--- a/pywick/models/segmentation/upernet.py
+++ b/pywick/models/segmentation/upernet.py
@@ -36,13 +36,13 @@ def summary(model, input_shape, batch_size=-1, intputshow=True):
     """
 
     def register_hook(module):
-        def hook(module, input, output=None):
+        def hook(module, input_, output=None):
             class_name = str(module.__class__).split(".")[-1].split("'")[0]
             module_idx = len(summary)
 
             m_key = "%s-%i" % (class_name, module_idx + 1)
             summary[m_key] = OrderedDict()
-            summary[m_key]["input_shape"] = list(input[0].size())
+            summary[m_key]["input_shape"] = list(input_[0].size())
             summary[m_key]["input_shape"][0] = batch_size
 
             params = 0
@@ -95,7 +95,7 @@ def hook(module, input, output=None):
         else:
             total_output += np.prod(summary[layer]["output_shape"])
         if "trainable" in summary[layer]:
-            if summary[layer]["trainable"] == True:
+            if summary[layer]["trainable"] is True:
                 trainable_params += summary[layer]["nb_params"]
 
         model_info += line_new + '\n'
@@ -132,7 +132,7 @@ def __init__(self):
         super(BaseModel, self).__init__()
         self.logger = logging.getLogger(self.__class__.__name__)
 
-    def forward(self):
+    def forward(self, x):
         raise NotImplementedError
 
     def summary(self):
@@ -150,7 +150,9 @@ def __str__(self):
 class PSPModule(nn.Module):
     # In the original inmplementation they use precise RoI pooling
     # Instead of using adaptative average pooling
-    def __init__(self, in_channels, bin_sizes=[1, 2, 4, 6]):
+    def __init__(self, in_channels, bin_sizes=None):
+        if bin_sizes is None:
+            bin_sizes = [1, 2, 4, 6]
         super(PSPModule, self).__init__()
         out_channels = in_channels // len(bin_sizes)
         self.stages = nn.ModuleList([self._make_stages(in_channels, out_channels, b_s)
@@ -163,7 +165,8 @@ def __init__(self, in_channels, bin_sizes=[1, 2, 4, 6]):
             nn.Dropout2d(0.1)
         )
 
-    def _make_stages(self, in_channels, out_channels, bin_sz):
+    @staticmethod
+    def _make_stages(in_channels, out_channels, bin_sz):
         prior = nn.AdaptiveAvgPool2d(output_size=bin_sz)
         conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
         bn = nn.BatchNorm2d(out_channels)
@@ -173,11 +176,11 @@ def _make_stages(self, in_channels, out_channels, bin_sz):
     def forward(self, features):
         h, w = features.size()[2], features.size()[3]
         pyramids = [features]
-        pyramids.extend([F.interpolate(stage(features), size=(h, w), mode='bilinear',
-                                        align_corners=True) for stage in self.stages])
+        pyramids.extend([F.interpolate(stage(features), size=(h, w), mode='bilinear', align_corners=True) for stage in self.stages])
         output = self.bottleneck(torch.cat(pyramids, dim=1))
         return output
 
+
 class ResNet(nn.Module):
     def __init__(self, in_channels=3, output_stride=16, backbone='resnet101', pretrained=True):
         super(ResNet, self).__init__()
@@ -203,7 +206,7 @@ def __init__(self, in_channels=3, output_stride=16, backbone='resnet101', pretra
 
         if output_stride == 8:
             for n, m in self.layer3.named_modules():
-                if 'conv1' in n and (backbone == 'resnet34' or backbone == 'resnet18'):
+                if 'conv1' in n and backbone in ('resnet34', 'resnet18'):
                     m.dilation, m.padding, m.stride = (d3,d3), (d3,d3), (s3,s3)
                 elif 'conv2' in n:
                     m.dilation, m.padding, m.stride = (d3,d3), (d3,d3), (s3,s3)
@@ -211,7 +214,7 @@ def __init__(self, in_channels=3, output_stride=16, backbone='resnet101', pretra
                     m.stride = (s3, s3)
 
         for n, m in self.layer4.named_modules():
-            if 'conv1' in n and (backbone == 'resnet34' or backbone == 'resnet18'):
+            if 'conv1' in n and backbone in ('resnet34', 'resnet18'):
                 m.dilation, m.padding, m.stride = (d4,d4), (d4,d4), (s4,s4)
             elif 'conv2' in n:
                 m.dilation, m.padding, m.stride = (d4,d4), (d4,d4), (s4,s4)
@@ -231,9 +234,12 @@ def up_and_add(x, y):
     return F.interpolate(x, size=(y.size(2), y.size(3)), mode='bilinear', align_corners=True) + y
 
 class FPN_fuse(nn.Module):
-    def __init__(self, feature_channels=[256, 512, 1024, 2048], fpn_out=256):
+    def __init__(self, feature_channels=None, fpn_out=256):
+        if feature_channels is None:
+            feature_channels = [256, 512, 1024, 2048]
         super(FPN_fuse, self).__init__()
-        assert feature_channels[0] == fpn_out
+        if feature_channels[0] != fpn_out:
+            raise AssertionError('feature_channels[0] ({}) must equal fpn_out ({})'.format(feature_channels[0], fpn_out))
         self.conv1x1 = nn.ModuleList([nn.Conv2d(ft_size, fpn_out, kernel_size=1)
                                     for ft_size in feature_channels[1:]])
         self.smooth_conv =  nn.ModuleList([nn.Conv2d(fpn_out, fpn_out, kernel_size=3, padding=1)]
@@ -263,7 +269,7 @@ class UperNet(BaseModel):
     def __init__(self, num_classes, in_channels=3, backbone='resnet101', pretrained=True, fpn_out=256, freeze_bn=False, freeze_backbone=False, **_):
         super(UperNet, self).__init__()
 
-        if backbone == 'resnet34' or backbone == 'resnet18':
+        if backbone in ('resnet34', 'resnet18'):
             feature_channels = [64, 128, 256, 512]
         else:
             feature_channels = [256, 512, 1024, 2048]
diff --git a/pywick/modules/_utils.py b/pywick/modules/_utils.py
index 6134f9a..467e329 100644
--- a/pywick/modules/_utils.py
+++ b/pywick/modules/_utils.py
@@ -53,7 +53,7 @@ def _standardize_user_data(inputs, targets=None):
 
 def _validate_metric_input(metric):
     if isinstance(metric, str):
-        if metric.upper() == 'CATEGORICAL_ACCURACY' or metric.upper() == 'ACCURACY':
+        if metric.upper() in ('CATEGORICAL_ACCURACY', 'ACCURACY'):
             return CategoricalAccuracy()
         elif metric.upper() == 'BINARY_ACCURACY':
             return BinaryAccuracy()
diff --git a/pywick/modules/module_trainer.py b/pywick/modules/module_trainer.py
index 6772702..714633e 100644
--- a/pywick/modules/module_trainer.py
+++ b/pywick/modules/module_trainer.py
@@ -27,9 +27,9 @@
 from tqdm import tqdm
 
 
-class ModuleTrainer(object):
+class ModuleTrainer:
 
-    def __init__(self, model, cuda_devices=[]):
+    def __init__(self, model, cuda_devices=None):
         """
         ModelTrainer for high-level training of Pytorch models
 
@@ -47,6 +47,8 @@ def __init__(self, model, cuda_devices=[]):
         - metrics
         - callbacks
         """
+        if cuda_devices is None:
+            cuda_devices = []
         if not isinstance(model, nn.Module):
             raise ValueError('model argument must inherit from torch.nn.Module')
         self.model = model
@@ -56,7 +58,7 @@ def __init__(self, model, cuda_devices=[]):
         self._loss_multipliers = None
 
         # custom fit helpers
-        self._named_helpers = dict()       # custom trainers that can be initialized during compilation time
+        self._named_helpers = {}       # custom trainers that can be initialized during compilation time
 
         # preconditions
         self._preconditions = []
@@ -113,10 +115,7 @@ def set_criterion(self, criterion):
 
     def set_optimizer(self, optimizer, **kwargs):
         if type(optimizer) is type or isinstance(optimizer, str):
-            if 'parameters' in kwargs:
-                parameters = kwargs['parameters']
-            else:
-                parameters = self.model.parameters()
+            parameters = kwargs.pop('parameters', self.model.parameters())
 
             optimizer = _validate_optimizer_input(optimizer)
             self._optimizer = optimizer(parameters, **kwargs)
@@ -338,7 +337,8 @@ def fit(self,
                         self._optimizer.zero_grad()
                         output_batch = fit_forward_fn(input_batch)
                         loss = fit_loss_fn(output_batch, target_batch)
-                        assert not math.isnan(loss), 'Assertion failed: Loss is not NaN.'
+                        if math.isnan(loss):
+                            raise AssertionError('Assertion failed: Loss is not NaN.')
                         loss.backward()
                         self._optimizer.step()
                         # ---------------------------------------------
@@ -458,7 +458,8 @@ def fit_loader(self,
                         output_batch = fit_forward_fn(input_batch)
 
                         loss = fit_loss_fn(output_batch, target_batch)
-                        assert not math.isnan(loss), 'Assertion failed: Loss is not NaN.'
+                        if math.isnan(loss):
+                            raise AssertionError('Assertion failed: Loss is not NaN.')
                         loss.backward()
                         self._optimizer.step()
                         # ---------------------------------------------
@@ -614,7 +615,8 @@ def evaluate(self,
                 self._optimizer.zero_grad()
                 output_batch = eval_forward_fn(input_batch)
                 loss = eval_loss_fn(output_batch, target_batch)
-                assert not math.isnan(loss), 'Assertion failed: Loss is not NaN.'
+                if math.isnan(loss):
+                    raise AssertionError('Assertion failed: Loss is not NaN.')
 
                 if conditions_container:
                     cond_logs = conditions_container(CondType.POST, epoch_num=None, batch_num=batch_idx, net=self.model, input_batch=input_batch, output_batch=output_batch, target_batch=target_batch)
@@ -671,7 +673,8 @@ def evaluate_loader(self, loader, eval_helper_name=None, verbose=1):
                 self._optimizer.zero_grad()
                 output_batch = eval_forward_fn(input_batch)
                 loss = eval_loss_fn(output_batch, target_batch)
-                assert not math.isnan(loss), 'Assertion failed: Loss is not NaN.'
+                if math.isnan(loss):
+                    raise AssertionError('Assertion failed: Loss is not NaN.')
 
                 if conditions_container:
                     cond_logs = conditions_container(CondType.POST, epoch_num=None, batch_num=batch_idx, net=self.model, input_batch=input_batch, output_batch=output_batch, target_batch=target_batch)
@@ -780,7 +783,7 @@ def _get_helper(trainer, num_inputs, num_targets, helper_name=None):
 
     return helper
 
-class SingleInput_SingleTarget_Helper(object):
+class SingleInput_SingleTarget_Helper:
 
     def __init__(self, loss_multipliers=None):
         '''
@@ -790,30 +793,36 @@ def __init__(self, loss_multipliers=None):
         '''
         self.loss_multipliers = loss_multipliers
 
-    def move_to_device(self, device, inputs, targets):
+    @staticmethod
+    def move_to_device(device, inputs, targets):
         return inputs.to(device), targets.to(device)
 
-    def shuffle_arrays(self, inputs, targets):
+    @staticmethod
+    def shuffle_arrays(inputs, targets):
         rand_indices = th.randperm(len(inputs))
         inputs = inputs[rand_indices]
         targets = targets[rand_indices]
         return inputs, targets
 
-    def grab_batch(self, batch_idx, batch_size, inputs, targets):
+    @staticmethod
+    def grab_batch(batch_idx, batch_size, inputs, targets):
         input_batch = inputs[batch_idx*batch_size:(batch_idx+1)*batch_size]
         target_batch = targets[batch_idx*batch_size:(batch_idx+1)*batch_size]
         return input_batch, target_batch
 
-    def grab_batch_from_loader(self, loader_iter):
+    @staticmethod
+    def grab_batch_from_loader(loader_iter):
         return next(loader_iter)        # input_batch, target_batch
 
-    def apply_transforms(self, tforms, input_batch, target_batch):
+    @staticmethod
+    def apply_transforms(tforms, input_batch, target_batch):
         input_batch = tforms[0](input_batch)
         target_batch = tforms[1](target_batch)
         input_batch, target_batch = tforms[2](input_batch, target_batch)
         return input_batch, target_batch
 
-    def forward_pass(self, input_batch, model):
+    @staticmethod
+    def forward_pass(input_batch, model):
         return model(input_batch)
 
     def get_partial_forward_fn(self, model):
@@ -823,7 +832,8 @@ def calculate_loss(self, output_batch, target_batch, loss_fn):
         total_loss = 0.
         if is_tuple_or_list(output_batch):     # some networks output multiple results (to compute separate losses)
             if self.loss_multipliers:
-                assert len(output_batch) == len(self.loss_multipliers)
+                if len(output_batch) != len(self.loss_multipliers):
+                    raise AssertionError
 
             for i, output in enumerate(output_batch):
                 if self.loss_multipliers:
@@ -839,37 +849,44 @@ def get_partial_loss_fn(self, loss_fn):
         return functools.partial(self.calculate_loss, loss_fn=loss_fn)
 
 
-class SingleInput_MultiTarget_Helper(object):
+class SingleInput_MultiTarget_Helper:
 
-    def move_to_device(self, device, inputs, targets):
+    @staticmethod
+    def move_to_device(device, inputs, targets):
         return inputs.to(device), [target_.to(device) for target_ in targets]
 
-    def shuffle_arrays(self, inputs, targets):
+    @staticmethod
+    def shuffle_arrays(inputs, targets):
         rand_indices = th.randperm(len(inputs))
         inputs = inputs[rand_indices]
         targets = [target_[rand_indices] for target_ in targets]
         return inputs, targets
 
-    def grab_batch(self, batch_idx, batch_size, inputs, targets):
+    @staticmethod
+    def grab_batch(batch_idx, batch_size, inputs, targets):
         input_batch = inputs[batch_idx*batch_size:(batch_idx+1)*batch_size]
         target_batch = [target_[batch_idx*batch_size:(batch_idx+1)*batch_size] for target_ in targets]
         return input_batch, target_batch
 
-    def grab_batch_from_loader(self, loader_iter):
+    @staticmethod
+    def grab_batch_from_loader(loader_iter):
         return next(loader_iter)        # OLD: # input_batch, [target_ for target_ in target_batch]
 
-    def apply_transforms(self, tforms, input_batch, target_batch):
+    @staticmethod
+    def apply_transforms(tforms, input_batch, target_batch):
         input_batch = tforms[0](input_batch)
         target_batch = [tforms[1](target_) for target_ in target_batch]
         return input_batch, target_batch
 
-    def forward_pass(self, input_batch, model):
+    @staticmethod
+    def forward_pass(input_batch, model):
         return model(input_batch)
 
     def get_partial_forward_fn(self, model):
         return functools.partial(self.forward_pass, model=model)
 
-    def calculate_loss(self, output_batch, target_batch, loss_fn):
+    @staticmethod
+    def calculate_loss(output_batch, target_batch, loss_fn):
         return sum([loss_fn[idx](output_batch[idx], target_batch[idx])
                     for idx in range(len(output_batch))])
 
@@ -877,142 +894,172 @@ def get_partial_loss_fn(self, loss_fn):
         return functools.partial(self.calculate_loss, loss_fn=loss_fn)
 
 
-class MultiInput_SingleTarget_Helper(object):
-    def move_to_device(self, device, inputs, targets):
+class MultiInput_SingleTarget_Helper:
+
+    @staticmethod
+    def move_to_device(device, inputs, targets):
         return [input_.to(device) for input_ in inputs], targets.to(device)
 
-    def shuffle_arrays(self, inputs, targets):
+    @staticmethod
+    def shuffle_arrays(inputs, targets):
-        rand_indices = th.randperm(len(inputs))
+        rand_indices = th.randperm(len(targets))  # one index per sample; len(inputs) only counts the input tensors
         inputs = [input_[rand_indices] for input_ in inputs]
         targets = targets[rand_indices]
         return inputs, targets
 
-    def grab_batch(self, batch_idx, batch_size, inputs, targets):
+    @staticmethod
+    def grab_batch(batch_idx, batch_size, inputs, targets):
         input_batch = [input_[batch_idx*batch_size:(batch_idx+1)*batch_size] for input_ in inputs]
         target_batch = targets[batch_idx*batch_size:(batch_idx+1)*batch_size]
         return input_batch, target_batch
 
-    def grab_batch_from_loader(self, loader_iter):
+    @staticmethod
+    def grab_batch_from_loader(loader_iter):
         return next(loader_iter)        # OLD: # [input_ for input_ in input_batch], target_batch
 
-    def apply_transforms(self, tforms, input_batch, target_batch):
+    @staticmethod
+    def apply_transforms(tforms, input_batch, target_batch):
         input_batch = [tforms[0](input_) for input_ in input_batch]
         target_batch = tforms[1](target_batch)
         return input_batch, target_batch
 
-    def forward_pass(self, input_batch, model):
+    @staticmethod
+    def forward_pass(input_batch, model):
         return model(*input_batch)
 
     def get_partial_forward_fn(self, model):
         return functools.partial(self.forward_pass, model=model)
 
-    def calculate_loss(self, output_batch, target_batch, loss_fn):
+    @staticmethod
+    def calculate_loss(output_batch, target_batch, loss_fn):
         return loss_fn(output_batch, target_batch)
 
     def get_partial_loss_fn(self, loss_fn):
         return functools.partial(self.calculate_loss, loss_fn=loss_fn)
 
 
-class MultiInput_MultiTarget_Helper(object):
+class MultiInput_MultiTarget_Helper:
 
-    def move_to_device(self, device, inputs, targets):
+    @staticmethod
+    def move_to_device(device, inputs, targets):
         return [input_.to(device) for input_ in inputs], [target_.to(device) for target_ in targets]
 
-    def shuffle_arrays(self, inputs, targets):
+    @staticmethod
+    def shuffle_arrays(inputs, targets):
-        rand_indices = th.randperm(len(inputs))
+        rand_indices = th.randperm(len(inputs[0]))  # one index per sample; len(inputs) only counts the input tensors
         inputs = [input_[rand_indices] for input_ in inputs]
-        targets = [input_[rand_indices] for input_ in inputs]
+        targets = [target_[rand_indices] for target_ in targets]  # reorder the targets themselves, in step with inputs
         return inputs, targets
 
-    def grab_batch(self, batch_idx, batch_size, inputs, targets):
+    @staticmethod
+    def grab_batch(batch_idx, batch_size, inputs, targets):
         input_batch = [input_[batch_idx*batch_size:(batch_idx+1)*batch_size] for input_ in inputs]
         target_batch = [target_[batch_idx*batch_size:(batch_idx+1)*batch_size] for target_ in targets]
         return input_batch, target_batch
 
-    def grab_batch_from_loader(self, loader_iter):
+    @staticmethod
+    def grab_batch_from_loader(loader_iter):
         return next(loader_iter)        # OLD: # [input_ for input_ in input_batch], [target_ for target_ in target_batch]
 
-    def apply_transforms(self, tforms, input_batch, target_batch):
+    @staticmethod
+    def apply_transforms(tforms, input_batch, target_batch):
         input_batch = [tforms[0](input_) for input_ in input_batch]
         target_batch = [tforms[1](target_) for target_ in target_batch]
         return input_batch, target_batch
 
-    def forward_pass(self, input_batch, model):
+    @staticmethod
+    def forward_pass(input_batch, model):
         return model(*input_batch)
 
     def get_partial_forward_fn(self, model):
         return functools.partial(self.forward_pass, model=model)
 
-    def calculate_loss(self, output_batch, target_batch, loss_fn):
+    @staticmethod
+    def calculate_loss(output_batch, target_batch, loss_fn):
         return sum([loss_fn[idx](output_batch[idx], target_batch[idx]) for idx in range(len(output_batch))])
 
     def get_partial_loss_fn(self, loss_fn):
         return functools.partial(self.calculate_loss, loss_fn=loss_fn)
 
 
-class SingleInput_NoTarget_Helper(object):
-    def move_to_device(self, device, inputs, targets=None):
+class SingleInput_NoTarget_Helper:
+
+    @staticmethod
+    def move_to_device(device, inputs, targets=None, **_):  # targets is ignored but may still be passed positionally
         return inputs.to(device), None
 
-    def shuffle_arrays(self, inputs, targets=None):
+    @staticmethod
+    def shuffle_arrays(inputs, targets=None, **_):  # targets is ignored but may still be passed positionally
         rand_indices = th.randperm(len(inputs))
         inputs = inputs[rand_indices]
         return inputs, None
 
-    def grab_batch(self, batch_idx, batch_size, inputs, targets=None):
+    @staticmethod
+    def grab_batch(batch_idx, batch_size, inputs, targets=None, **_):  # targets is ignored but may still be passed positionally
         input_batch = inputs[batch_idx*batch_size:(batch_idx+1)*batch_size]
         return input_batch, None
 
-    def grab_batch_from_loader(self, loader_iter):
+    @staticmethod
+    def grab_batch_from_loader(loader_iter):
         input_batch = next(loader_iter)
         return input_batch, None
 
-    def apply_transforms(self, tforms, input_batch, target_batch=None):
+    @staticmethod
+    def apply_transforms(tforms, input_batch, target_batch=None, **_):  # target_batch is ignored but may still be passed positionally
         input_batch = tforms[0](input_batch)
         return input_batch, None
 
-    def forward_pass(self, input_batch, model):
+    @staticmethod
+    def forward_pass(input_batch, model):
         return model(input_batch)
 
     def get_partial_forward_fn(self, model):
         return functools.partial(self.forward_pass, model=model)
 
-    def calculate_loss(self, output_batch, target_batch, loss_fn):
+    @staticmethod
+    def calculate_loss(output_batch, target_batch, loss_fn):
         return loss_fn(output_batch)
 
     def get_partial_loss_fn(self, loss_fn):
         return functools.partial(self.calculate_loss, loss_fn=loss_fn)
 
 
-class MultiInput_NoTarget_Helper(object):
+class MultiInput_NoTarget_Helper:
 
-    def move_to_device(self, device, inputs, targets=None):
+    @staticmethod
+    def move_to_device(device, inputs, targets=None, **_):  # targets is ignored but may still be passed positionally
         return [input_.to(device) for input_ in inputs], None
 
-    def shuffle_arrays(self, inputs, targets=None):
+    @staticmethod
+    def shuffle_arrays(inputs, targets=None, **_):  # targets is ignored but may still be passed positionally
-        rand_indices = th.randperm(len(inputs))
+        rand_indices = th.randperm(len(inputs[0]))  # one index per sample; len(inputs) only counts the input tensors
         inputs = [input_[rand_indices] for input_ in inputs]
         return inputs, None
 
-    def grab_batch(self, batch_idx, batch_size, inputs, targets=None):
+    @staticmethod
+    def grab_batch(batch_idx, batch_size, inputs, targets=None, **_):  # targets is ignored but may still be passed positionally
         input_batch = [input_[batch_idx*batch_size:(batch_idx+1)*batch_size] for input_ in inputs]
         return input_batch, None
 
-    def grab_batch_from_loader(self, loader_iter):
+    @staticmethod
+    def grab_batch_from_loader(loader_iter):
         input_batch = next(loader_iter)
         return input_batch, None
 
-    def apply_transforms(self, tforms, input_batch, target_batch=None):
+    @staticmethod
+    def apply_transforms(tforms, input_batch, target_batch=None, **_):  # target_batch is ignored but may still be passed positionally
         input_batch = [tforms[0](input_) for input_ in input_batch]
         return input_batch, None
 
-    def forward_pass(self, input_batch, model):
+    @staticmethod
+    def forward_pass(input_batch, model):
         return model(*input_batch)
 
     def get_partial_forward_fn(self, model):
         return functools.partial(self.forward_pass, model=model)
 
-    def calculate_loss(self, output_batch, target_batch, loss_fn):
+    @staticmethod
+    def calculate_loss(output_batch, target_batch, loss_fn):
         return loss_fn(output_batch)
 
     def get_partial_loss_fn(self, loss_fn):
diff --git a/pywick/optimizers/__init__.py b/pywick/optimizers/__init__.py
index b31e989..fdb40b4 100644
--- a/pywick/optimizers/__init__.py
+++ b/pywick/optimizers/__init__.py
@@ -8,10 +8,12 @@
 from .a2grad import A2GradInc, A2GradExp, A2GradUni
 from .adabelief import AdaBelief
 from .adahessian import Adahessian
+from .adamp import AdamP
 from .adamw import AdamW
 from .addsign import AddSign
 from .apollo import Apollo
 from .eve import Eve
+from .lars import Lars
 from .lookahead import Lookahead
 from .lookaheadsgd import LookaheadSGD
 from .madgrad import MADGRAD
@@ -23,3 +25,4 @@
 from .rangerlars import RangerLars
 from .sgdw import SGDW
 from .swa import SWA
+from torch.optim import *
diff --git a/pywick/optimizers/a2grad.py b/pywick/optimizers/a2grad.py
index 6ef47b8..2410588 100644
--- a/pywick/optimizers/a2grad.py
+++ b/pywick/optimizers/a2grad.py
@@ -2,14 +2,24 @@
 
 import copy
 import math
-from typing import Optional
+from typing import Optional, Tuple, Dict, Any, Callable, Union, Iterable
 
 import torch
 from torch.optim.optimizer import Optimizer
 
-from .madgrad import OptFloat, OptLossClosure, Params
+from torch import Tensor
 
-__all__ = ('A2GradUni', 'A2GradInc', 'A2GradExp')
+Params = Union[Iterable[Tensor], Iterable[Dict[str, Any]]]
+
+LossClosure = Callable[[], float]
+OptLossClosure = Optional[LossClosure]
+Betas2 = Tuple[float, float]
+State = Dict[str, Any]
+OptFloat = Optional[float]
+Nus2 = Tuple[float, float]
+
+
+__all__ = ('A2GradUni', 'A2GradInc', 'A2GradExp', 'Betas2', 'OptFloat', 'OptLossClosure', 'Params', 'State', 'Nus2')
 
 
 class A2GradUni(Optimizer):
diff --git a/pywick/optimizers/adabelief.py b/pywick/optimizers/adabelief.py
index fb91feb..dd301fe 100644
--- a/pywick/optimizers/adabelief.py
+++ b/pywick/optimizers/adabelief.py
@@ -5,9 +5,9 @@
 import torch
 from torch.optim.optimizer import Optimizer
 
-from .madgrad import Betas2, OptFloat, OptLossClosure, Params
+from .a2grad import Betas2, OptFloat, OptLossClosure, Params
 
-__all__ = ('AdaBelief',)
+__all__ = ('AdaBelief',)  # must be a sequence of names; a bare string is iterated per character by `import *`
 
 
 class AdaBelief(Optimizer):
diff --git a/pywick/optimizers/adahessian.py b/pywick/optimizers/adahessian.py
index 940c5d3..cb56fbb 100644
--- a/pywick/optimizers/adahessian.py
+++ b/pywick/optimizers/adahessian.py
@@ -1,200 +1,156 @@
-# Source: https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/adahessian.py (apache 2.0)
-
-import math
-from typing import List, Optional
+""" AdaHessian Optimizer
 
+Lifted from https://github.com/davda54/ada-hessian/blob/master/ada_hessian.py
+Originally licensed MIT, Copyright 2020, David Samuel
+"""
 import torch
-from torch.optim.optimizer import Optimizer
-
-from .madgrad import Betas2, OptFloat, OptLossClosure, Params
-
-Grads = Params
 
-__all__ = ('Adahessian',)
 
-
-class Adahessian(Optimizer):
-    r"""Implements Adahessian Algorithm.
-    It has been proposed in `ADAHESSIAN: An Adaptive Second Order Optimizer
-    for Machine Learning`.
+class Adahessian(torch.optim.Optimizer):
+    """
+    Implements the AdaHessian algorithm from "ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning"
 
     Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 0.15)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-4)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        hessian_power (float, optional): Hessian power (default: 0.5)
-        seed (int, optional): Random number generator seed (default: None)
-
-        __ https://arxiv.org/abs/2006.00719
-
-        Note:
-            Reference code: https://github.com/amirgholami/adahessian
+        params (iterable): iterable of parameters to optimize or dicts defining parameter groups
+        lr (float, optional): learning rate (default: 0.1)
+        betas ((float, float), optional): coefficients used for computing running averages of gradient and the
+            squared hessian trace (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0.0)
+        hessian_power (float, optional): exponent of the hessian trace (default: 1.0)
+        update_each (int, optional): compute the hessian trace approximation only after *this* number of steps
+            (to save time) (default: 1)
+        n_samples (int, optional): how many times to sample `z` for the approximation of the hessian trace (default: 1)
     """
 
-    def __init__(
-        self,
-        params: Params,
-        lr: float = 0.15,
-        betas: Betas2 = (0.9, 0.999),
-        eps: float = 1e-4,
-        weight_decay: float = 0,
-        hessian_power: float = 0.5,
-        seed: Optional[int] = None,
-    ) -> None:
-        if lr <= 0.0:
-            raise ValueError('Invalid learning rate: {}'.format(lr))
-        if eps <= 0.0:
-            raise ValueError('Invalid epsilon value: {}'.format(eps))
+    def __init__(self, params, lr=0.1, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0,
+                 hessian_power=1.0, update_each=1, n_samples=1, avg_conv_kernel=False):
+        if 0.0 > lr:
+            raise ValueError(f"Invalid learning rate: {lr}")
+        if 0.0 > eps:
+            raise ValueError(f"Invalid epsilon value: {eps}")
         if not 0.0 <= betas[0] < 1.0:
-            raise ValueError(
-                'Invalid beta parameter at index 0: {}'.format(betas[0])
-            )
+            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
         if not 0.0 <= betas[1] < 1.0:
-            raise ValueError(
-                'Invalid beta parameter at index 1: {}'.format(betas[1])
-            )
+            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
         if not 0.0 <= hessian_power <= 1.0:
-            raise ValueError(
-                'Invalid Hessian power value: {}'.format(hessian_power)
-            )
-        if seed is not None:
-            torch.manual_seed(seed)
-        defaults = dict(
-            lr=lr,
-            betas=betas,
-            eps=eps,
-            weight_decay=weight_decay,
-            hessian_power=hessian_power,
-        )
+            raise ValueError(f"Invalid Hessian power value: {hessian_power}")
+
+        self.n_samples = n_samples
+        self.update_each = update_each
+        self.avg_conv_kernel = avg_conv_kernel
+
+        # use a separate generator that deterministically generates the same `z`s across all GPUs in case of distributed training
+        self.seed = 2147483647
+        self.generator = torch.Generator().manual_seed(self.seed)
+
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, hessian_power=hessian_power)
         super(Adahessian, self).__init__(params, defaults)
 
-    def get_trace(self, params: Params, grads: Grads) -> List[torch.Tensor]:
-        """Get an estimate of Hessian Trace.
-        This is done by computing the Hessian vector product with a random
-        vector v at the current gradient point, to estimate Hessian trace by
-        computing the gradient of <gradsH,v>.
-        :param gradsH: a list of torch variables
-        :return: a list of torch tensors
+        for p in self.get_params():
+            p.hess = 0.0
+            self.state[p]["hessian step"] = 0
+
+    @property
+    def is_second_order(self):
+        return True
+
+    def get_params(self):
+        """
+        Gets all parameters in all param_groups with gradients
+        """
+
+        return (p for group in self.param_groups for p in group['params'] if p.requires_grad)
+
+    def zero_hessian(self):
+        """
+        Zeros out the accumulated hessian traces.
+        """
+
+        for p in self.get_params():
+            if not isinstance(p.hess, float) and self.state[p]["hessian step"] % self.update_each == 0:
+                p.hess.zero_()
+
+    @torch.no_grad()
+    def set_hessian(self):
         """
+        Computes the Hutchinson approximation of the hessian trace and accumulates it for each trainable parameter.
+        """
+
+        params = []
+        for p in filter(lambda p: p.grad is not None, self.get_params()):
+            if self.state[p]["hessian step"] % self.update_each == 0:  # compute the trace only each `update_each` step
+                params.append(p)
+            self.state[p]["hessian step"] += 1
+
+        if len(params) == 0:
+            return
+
+        if self.generator.device != params[0].device:  # hackish way of casting the generator to the right device
+            self.generator = torch.Generator(params[0].device).manual_seed(self.seed)
 
-        # Check backward was called with create_graph set to True
-        for i, grad in enumerate(grads):
-            if grad.grad_fn is None:
-                msg = (
-                    'Gradient tensor {:} does not have grad_fn. When '
-                    'calling loss.backward(), make sure the option '
-                    'create_graph is set to True.'
-                )
-                raise RuntimeError(msg.format(i))
-
-        v = [
-            2
-            * torch.randint_like(
-                p, high=2, memory_format=torch.preserve_format
-            )
-            - 1
-            for p in params
-        ]
-
-        # this is for distributed setting with single node and multi-gpus,
-        # for multi nodes setting, we have not support it yet.
-        hvs = torch.autograd.grad(
-            grads, params, grad_outputs=v, only_inputs=True, retain_graph=True
-        )
-
-        hutchinson_trace = []
-        for hv in hvs:
-            param_size = hv.size()
-            if len(param_size) <= 2:  # for 0/1/2D tensor
-                # Hessian diagonal block size is 1 here.
-                # We use that torch.abs(hv * vi) = hv.abs()
-                tmp_output = hv.abs()
-
-            elif len(param_size) == 4:  # Conv kernel
-                # Hessian diagonal block size is 9 here: torch.sum() reduces
-                # the dim 2/3.
-                # We use that torch.abs(hv * vi) = hv.abs()
-                tmp_output = torch.mean(hv.abs(), dim=[2, 3], keepdim=True)
-            hutchinson_trace.append(tmp_output)
-
-        return hutchinson_trace
-
-    def step(self, closure: OptLossClosure = None) -> OptFloat:
-        """Perform a single optimization step.
+        grads = [p.grad for p in params]
 
+        for i in range(self.n_samples):
+            # Rademacher distribution {-1.0, 1.0}
+            zs = [torch.randint(0, 2, p.size(), generator=self.generator, device=p.device) * 2.0 - 1.0 for p in params]
+            h_zs = torch.autograd.grad(
+                grads, params, grad_outputs=zs, only_inputs=True, retain_graph=i < self.n_samples - 1)
+            for h_z, z, p in zip(h_zs, zs, params):
+                p.hess += h_z * z / self.n_samples  # approximate the expected values of z*(H@z)
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """
+        Performs a single optimization step.
         Arguments:
-            closure: A closure that reevaluates the model and returns the loss.
+            closure (callable, optional) -- a closure that reevaluates the model and returns the loss (default: None)
         """
+
         loss = None
         if closure is not None:
             loss = closure()
 
-        params = []
-        groups = []
-        grads = []
+        self.zero_hessian()
+        self.set_hessian()
 
-        # Flatten groups into lists, so that
-        #  hut_traces can be called with lists of parameters
-        #  and grads
         for group in self.param_groups:
             for p in group['params']:
-                if p.grad is not None:
-                    params.append(p)
-                    groups.append(group)
-                    grads.append(p.grad)
-
-        # get the Hessian diagonal
-
-        hut_traces = self.get_trace(params, grads)
-
-        for (p, group, grad, hut_trace) in zip(
-            params, groups, grads, hut_traces
-        ):
-
-            state = self.state[p]
-
-            # State initialization
-            if len(state) == 0:
-                state['step'] = 0
-                # Exponential moving average of gradient values
-                state['exp_avg'] = torch.zeros_like(p.data)
-                # Exponential moving average of Hessian diagonal square values
-                state['exp_hessian_diag_sq'] = torch.zeros_like(p.data)
-
-            exp_avg, exp_hessian_diag_sq = (
-                state['exp_avg'],
-                state['exp_hessian_diag_sq'],
-            )
-
-            beta1, beta2 = group['betas']
-
-            state['step'] += 1
-
-            # Decay the first and second moment running average coefficient
-            exp_avg.mul_(beta1).add_(grad.detach_(), alpha=1 - beta1)
-            exp_hessian_diag_sq.mul_(beta2).addcmul_(
-                hut_trace, hut_trace, value=1 - beta2
-            )
-
-            bias_correction1 = 1 - beta1 ** state['step']
-            bias_correction2 = 1 - beta2 ** state['step']
-
-            # make the square root, and the Hessian power
-            k = group['hessian_power']
-            denom = (
-                (exp_hessian_diag_sq.sqrt() ** k)
-                / math.sqrt(bias_correction2) ** k
-            ).add_(group['eps'])
-
-            # make update
-            p.data = p.data - group['lr'] * (
-                exp_avg / bias_correction1 / denom
-                + group['weight_decay'] * p.data
-            )
+                if p.grad is None or p.hess is None:
+                    continue
+
+                if self.avg_conv_kernel and p.dim() == 4:
+                    p.hess = torch.abs(p.hess).mean(dim=[2, 3], keepdim=True).expand_as(p.hess).clone()
+
+                # Apply decoupled weight decay (as in AdamW) before the gradient-based update
+                p.mul_(1 - group['lr'] * group['weight_decay'])
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 1:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p)
+                    # Exponential moving average of Hessian diagonal square values
+                    state['exp_hessian_diag_sq'] = torch.zeros_like(p)
+
+                exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq']
+                beta1, beta2 = group['betas']
+                state['step'] += 1
+
+                # Decay the first and second moment running average coefficient
+                exp_avg.mul_(beta1).add_(p.grad, alpha=1 - beta1)
+                exp_hessian_diag_sq.mul_(beta2).addcmul_(p.hess, p.hess, value=1 - beta2)
+
+                bias_correction1 = 1 - beta1 ** state['step']
+                bias_correction2 = 1 - beta2 ** state['step']
+
+                k = group['hessian_power']
+                denom = (exp_hessian_diag_sq / bias_correction2).pow_(k / 2).add_(group['eps'])
+
+                # make update
+                step_size = group['lr'] / bias_correction1
+                p.addcdiv_(exp_avg, denom, value=-step_size)
 
         return loss
diff --git a/pywick/optimizers/adamp.py b/pywick/optimizers/adamp.py
new file mode 100644
index 0000000..44b0ca9
--- /dev/null
+++ b/pywick/optimizers/adamp.py
@@ -0,0 +1,103 @@
+"""
+AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py
+Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217
+Code: https://github.com/clovaai/AdamP
+Copyright (c) 2020-present NAVER Corp.
+MIT license
+"""
+
+import torch
+import torch.nn.functional as F
+from torch.optim.optimizer import Optimizer
+import math
+
+
+def _channel_view(x) -> torch.Tensor:
+    return x.reshape(x.size(0), -1)
+
+
+def _layer_view(x) -> torch.Tensor:
+    return x.reshape(1, -1)
+
+
+def projection(p, grad, perturb, delta: float, wd_ratio: float, eps: float):
+    wd = 1.
+    expand_size = (-1,) + (1,) * (len(p.shape) - 1)
+    for view_func in [_channel_view, _layer_view]:
+        param_view = view_func(p)
+        grad_view = view_func(grad)
+        cosine_sim = F.cosine_similarity(grad_view, param_view, dim=1, eps=eps).abs_()
+
+        # FIXME this is a problem for PyTorch XLA
+        if cosine_sim.max() < delta / math.sqrt(param_view.size(1)):
+            p_n = p / param_view.norm(p=2, dim=1).add_(eps).reshape(expand_size)
+            perturb -= p_n * view_func(p_n * perturb).sum(dim=1).reshape(expand_size)
+            wd = wd_ratio
+            return perturb, wd
+
+    return perturb, wd
+
+
+class AdamP(Optimizer):
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
+                 weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False):
+        defaults = dict(
+            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
+            delta=delta, wd_ratio=wd_ratio, nesterov=nesterov)
+        super(AdamP, self).__init__(params, defaults)
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                grad = p.grad
+                beta1, beta2 = group['betas']
+                nesterov = group['nesterov']
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    state['exp_avg'] = torch.zeros_like(p)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
+
+                # Adam
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+
+                state['step'] += 1
+                bias_correction1 = 1 - beta1 ** state['step']
+                bias_correction2 = 1 - beta2 ** state['step']
+
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+
+                denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
+                step_size = group['lr'] / bias_correction1
+
+                if nesterov:
+                    perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom
+                else:
+                    perturb = exp_avg / denom
+
+                # Projection
+                wd_ratio = 1.
+                if len(p.shape) > 1:
+                    perturb, wd_ratio = projection(p, grad, perturb, group['delta'], group['wd_ratio'], group['eps'])
+
+                # Weight decay
+                if group['weight_decay'] > 0:
+                    p.mul_(1. - group['lr'] * group['weight_decay'] * wd_ratio)
+
+                # Step
+                p.add_(perturb, alpha=-step_size)
+
+        return loss
\ No newline at end of file
diff --git a/pywick/optimizers/adamw.py b/pywick/optimizers/adamw.py
index 1237395..ee2c817 100644
--- a/pywick/optimizers/adamw.py
+++ b/pywick/optimizers/adamw.py
@@ -34,9 +34,9 @@ class AdamW(Optimizer):
 
     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                  weight_decay=1e-2, amsgrad=False):
-        if not 0.0 <= lr:
+        if 0.0 > lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
+        if 0.0 > eps:
             raise ValueError("Invalid epsilon value: {}".format(eps))
         if not 0.0 <= betas[0] < 1.0:
             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
diff --git a/pywick/optimizers/apollo.py b/pywick/optimizers/apollo.py
index 4bdd8dd..e54de2b 100644
--- a/pywick/optimizers/apollo.py
+++ b/pywick/optimizers/apollo.py
@@ -3,7 +3,10 @@
 import torch
 from torch.optim.optimizer import Optimizer
 
-from .madgrad import OptFloat, OptLossClosure, Params
+from .a2grad import OptFloat, OptLossClosure, Params
+
+
+__all__ = ('Apollo',)  # must be a sequence of names; a bare string is iterated per character by `import *`
 
 
 class Apollo(Optimizer):
@@ -46,11 +49,11 @@ def __init__(
             raise ValueError('Invalid epsilon value: {}'.format(eps))
         if not 0.0 <= beta < 1.0:
             raise ValueError('Invalid beta parameter: {}'.format(beta))
-        if not 0.0 <= weight_decay:
+        if 0.0 > weight_decay:
             raise ValueError(
                 'Invalid weight_decay value: {}'.format(weight_decay)
             )
-        if not 0.0 <= warmup:
+        if 0.0 > warmup:
             raise ValueError('Invalid warmup updates: {}'.format(warmup))
         if not 0.0 <= init_lr <= 1.0:
             raise ValueError(
diff --git a/pywick/optimizers/lars.py b/pywick/optimizers/lars.py
new file mode 100644
index 0000000..e9de426
--- /dev/null
+++ b/pywick/optimizers/lars.py
@@ -0,0 +1,129 @@
+""" PyTorch LARS / LARC Optimizer
+An implementation of LARS (SGD) + LARC in PyTorch
+Based on:
+  * PyTorch SGD: https://github.com/pytorch/pytorch/blob/1.7/torch/optim/sgd.py#L100
+  * NVIDIA APEX LARC: https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py
+Additional cleanup and modifications to properly support PyTorch XLA.
+Copyright 2021 Ross Wightman
+"""
+import torch
+from torch.optim.optimizer import Optimizer
+
+
+class Lars(Optimizer):
+    """ LARS for PyTorch: SGD in which each parameter's gradient is rescaled by a trust ratio (optional LARC clipping).
+
+    Paper: `Large batch training of Convolutional Networks` - https://arxiv.org/pdf/1708.03888.pdf
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
+        lr (float, optional): learning rate, must be non-negative (default: 1.0).
+        momentum (float, optional): momentum factor, must be non-negative (default: 0)
+        weight_decay (float, optional): weight decay (L2 penalty), added to the gradient before trust scaling (default: 0)
+        dampening (float, optional): dampening for momentum (default: 0)
+        nesterov (bool, optional): enables Nesterov momentum; requires momentum > 0 and dampening == 0 (default: False)
+        trust_coeff (float): trust coefficient for computing adaptive lr / trust_ratio (default: 0.001)
+        eps (float): eps for division denominator (default: 1e-8)
+        trust_clip (bool): enable LARC trust ratio clipping, i.e. min(trust_ratio / lr, 1) (default: False)
+        always_adapt (bool): always apply LARS LR adapt, otherwise only when group weight_decay != 0 (default: False)
+    """
+
+    def __init__(
+            self,
+            params,
+            lr=1.0,
+            momentum=0,
+            dampening=0,
+            weight_decay=0,
+            nesterov=False,
+            trust_coeff=0.001,
+            eps=1e-8,
+            trust_clip=False,
+            always_adapt=False,
+    ):
+        if lr < 0.0:
+            raise ValueError(f"Invalid learning rate: {lr}")
+        if momentum < 0.0:
+            raise ValueError(f"Invalid momentum value: {momentum}")
+        if weight_decay < 0.0:
+            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
+        if nesterov and (momentum <= 0 or dampening != 0):
+            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
+
+        defaults = dict(
+            lr=lr,
+            momentum=momentum,
+            dampening=dampening,
+            weight_decay=weight_decay,
+            nesterov=nesterov,
+            trust_coeff=trust_coeff,
+            eps=eps,
+            trust_clip=trust_clip,
+            always_adapt=always_adapt,
+        )
+        super().__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault("nesterov", False)  # restored groups lacking the key fall back to no Nesterov
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step; returns the closure's loss, or None when no closure is given.
+        Args:
+            closure (callable, optional): A closure that reevaluates the model and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        device = self.param_groups[0]['params'][0].device  # requires the first group to hold at least one param
+        one_tensor = torch.tensor(1.0, device=device)  # because torch.where doesn't handle scalars correctly
+
+        for group in self.param_groups:
+            weight_decay = group['weight_decay']
+            momentum = group['momentum']
+            dampening = group['dampening']
+            nesterov = group['nesterov']
+            trust_coeff = group['trust_coeff']
+            eps = group['eps']
+
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad
+                # NOTE: grad aliases p.grad, so the in-place ops below modify the stored gradient itself
+                # apply LARS LR adaptation, LARC clipping, weight decay
+                # ref: https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py
+                if weight_decay != 0 or group['always_adapt']:
+                    w_norm = p.norm(2.0)
+                    g_norm = grad.norm(2.0)
+                    trust_ratio = trust_coeff * w_norm / (g_norm + w_norm * weight_decay + eps)
+                    # FIXME nested where required since logical and/or not working in PT XLA
+                    trust_ratio = torch.where(
+                        w_norm > 0,
+                        torch.where(g_norm > 0, trust_ratio, one_tensor),
+                        one_tensor,
+                    )
+                    if group['trust_clip']:
+                        trust_ratio = torch.minimum(trust_ratio / group['lr'], one_tensor)
+                    grad.add_(p, alpha=weight_decay)  # in-place: an out-of-place add() would discard the decay term
+                    grad.mul_(trust_ratio)
+
+                # apply SGD update https://github.com/pytorch/pytorch/blob/1.7/torch/optim/sgd.py#L100
+                if momentum != 0:
+                    param_state = self.state[p]
+                    if 'momentum_buffer' not in param_state:
+                        buf = param_state['momentum_buffer'] = torch.clone(grad).detach()
+                    else:
+                        buf = param_state['momentum_buffer']
+                        buf.mul_(momentum).add_(grad, alpha=1. - dampening)
+                    if nesterov:
+                        grad = grad.add(buf, alpha=momentum)
+                    else:
+                        grad = buf
+
+                p.add_(grad, alpha=-group['lr'])
+
+        return loss
\ No newline at end of file
diff --git a/pywick/optimizers/madgrad.py b/pywick/optimizers/madgrad.py
index 36dd11d..0358846 100644
--- a/pywick/optimizers/madgrad.py
+++ b/pywick/optimizers/madgrad.py
@@ -1,36 +1,40 @@
-# Source: https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/madgrad.py (apache 2.0)
+""" PyTorch MADGRAD optimizer
+MADGRAD: https://arxiv.org/abs/2101.11075
+Code from: https://github.com/facebookresearch/madgrad
+"""
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
 
 import math
-from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union
-
-from torch import Tensor
-
-Params = Union[Iterable[Tensor], Iterable[Dict[str, Any]]]
-
-LossClosure = Callable[[], float]
-OptLossClosure = Optional[LossClosure]
-Betas2 = Tuple[float, float]
-State = Dict[str, Any]
-OptFloat = Optional[float]
-Nus2 = Tuple[float, float]
+from typing import TYPE_CHECKING, Any, Callable, Optional
 
 import torch
 import torch.optim
 
-
-__all__ = ('MADGRAD', 'Betas2', 'OptFloat', 'OptLossClosure', 'Params', 'State', 'Nus2')
+if TYPE_CHECKING:
+    from torch.optim.optimizer import _params_t
+else:
+    _params_t = Any
 
 
 class MADGRAD(torch.optim.Optimizer):
-    r"""Implements MADGRAD algorithm.
-
-    It has been proposed in `Adaptivity without Compromise: A Momentumized,
-    Adaptive, Dual Averaged Gradient Method for Stochastic Optimization`__
-
+    """
+    MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic
+    Optimization.
+    .. _MADGRAD: https://arxiv.org/abs/2101.11075
+    MADGRAD is a general purpose optimizer that can be used in place of SGD or
+    Adam; it may converge faster and generalize better. Currently GPU-only.
+    Typically, the same learning rate schedule that is used for SGD or Adam may
+    be used. The overall learning rate is not comparable to either method and
+    should be determined by a hyper-parameter sweep.
+    MADGRAD requires less weight decay than other methods, often as little as
+    zero. Momentum values used for SGD or Adam's beta1 should work here also.
+    On sparse problems both weight_decay and momentum should be set to 0.
     Arguments:
         params (iterable):
-            Iterable of parameters to optimize
-            or dicts defining parameter groups.
+            Iterable of parameters to optimize or dicts defining parameter groups.
         lr (float):
             Learning rate (default: 1e-2).
         momentum (float):
@@ -38,94 +42,85 @@ class MADGRAD(torch.optim.Optimizer):
         weight_decay (float):
             Weight decay, i.e. a L2 penalty (default: 0).
         eps (float):
-            Term added to the denominator outside of the root operation
-            to improve numerical stability. (default: 1e-6).
-
-    __ https://arxiv.org/abs/2101.11075
-
-    Note:
-        Reference code: https://github.com/facebookresearch/madgrad
+            Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6).
     """
 
     def __init__(
-        self,
-        params: Params,
-        lr: float = 1e-2,
-        momentum: float = 0.9,
-        weight_decay: float = 0.0,
-        eps: float = 1e-6,
+            self,
+            params: _params_t,
+            lr: float = 1e-2,
+            momentum: float = 0.9,
+            weight_decay: float = 0,
+            eps: float = 1e-6,
+            decoupled_decay: bool = False,
     ):
         if momentum < 0 or momentum >= 1:
-            raise ValueError('Invalid momentum value: {}'.format(momentum))
-        if lr <= 0.0:
-            raise ValueError('Invalid learning rate: {}'.format(lr))
+            raise ValueError(f"Momentum {momentum} must be in the range [0,1]")
+        if lr <= 0:
+            raise ValueError(f"Learning rate {lr} must be positive")
         if weight_decay < 0:
-            raise ValueError(
-                'Invalid weight_decay value: {}'.format(weight_decay)
-            )
-        if eps < 0.0:
-            raise ValueError('Invalid epsilon value: {}'.format(eps))
+            raise ValueError(f"Weight decay {weight_decay} must be non-negative")
+        if eps < 0:
+            raise ValueError(f"Eps must be non-negative")
 
         defaults = dict(
-            lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay, k=0
-        )
+            lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay, decoupled_decay=decoupled_decay)
         super().__init__(params, defaults)
 
-        for group in self.param_groups:
-            for p in group['params']:
-                state = self.state[p]
-
-                state['grad_sum_sq'] = torch.zeros_like(p.data).detach()
-                state['s'] = torch.zeros_like(p.data).detach()
-                if momentum != 0:
-                    state['x0'] = torch.clone(p.data).detach()
+    @property
+    def supports_memory_efficient_fp16(self) -> bool:
+        return False
 
-    def step(
-        self, closure: Optional[Callable[[], float]] = None
-    ) -> Optional[float]:
-        r"""Performs a single optimization step.
+    @property
+    def supports_flat_params(self) -> bool:
+        return True
 
+    @torch.no_grad()
+    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
+        """Performs a single optimization step.
         Arguments:
-            closure: A closure that reevaluates the model and returns the loss.
+            closure (callable, optional): A closure that reevaluates the model and returns the loss.
         """
         loss = None
         if closure is not None:
-            loss = closure()
+            with torch.enable_grad():
+                loss = closure()
 
         for group in self.param_groups:
             eps = group['eps']
-            k = group['k']
             lr = group['lr'] + eps
-            decay = group['weight_decay']
+            weight_decay = group['weight_decay']
             momentum = group['momentum']
-
             ck = 1 - momentum
-            lamb = lr * math.pow(k + 1, 0.5)
 
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is None:
                     continue
-                grad = p.grad.data
-                state = self.state[p]
-
+                grad = p.grad
                 if momentum != 0.0 and grad.is_sparse:
-                    raise RuntimeError(
-                        'momentum != 0 is not compatible with '
-                        'sparse gradients'
-                    )
+                    raise RuntimeError("momentum != 0 is not compatible with sparse gradients")
 
+                state = self.state[p]
+                if len(state) == 0:
+                    state['step'] = 0
+                    state['grad_sum_sq'] = torch.zeros_like(p)
+                    state['s'] = torch.zeros_like(p)
+                    if momentum != 0:
+                        state['x0'] = torch.clone(p).detach()
+
+                state['step'] += 1
                 grad_sum_sq = state['grad_sum_sq']
                 s = state['s']
+                lamb = lr * math.sqrt(state['step'])
 
                 # Apply weight decay
-                if decay != 0:
-                    if grad.is_sparse:
-                        raise RuntimeError(
-                            'weight_decay option is not '
-                            'compatible with sparse gradients'
-                        )
-
-                    grad.add_(p.data, alpha=decay)
+                if weight_decay != 0:
+                    if group['decoupled_decay']:
+                        p.mul_(1.0 - group['lr'] * weight_decay)
+                    else:
+                        if grad.is_sparse:
+                            raise RuntimeError("weight_decay option is not compatible with sparse gradients")
+                        grad.add_(p, alpha=weight_decay)
 
                 if grad.is_sparse:
                     grad = grad.coalesce()
@@ -136,37 +131,29 @@ def step(
                     s_masked = s.sparse_mask(grad)
 
                     # Compute x_0 from other known quantities
-                    rms_masked_vals = (
-                        grad_sum_sq_masked._values().pow(1 / 3).add_(eps)
-                    )
-                    x0_masked_vals = p_masked._values().addcdiv(
-                        s_masked._values(), rms_masked_vals, value=1
-                    )
+                    rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps)
+                    x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1)
 
                     # Dense + sparse op
                     grad_sq = grad * grad
                     grad_sum_sq.add_(grad_sq, alpha=lamb)
                     grad_sum_sq_masked.add_(grad_sq, alpha=lamb)
 
-                    rms_masked_vals = (
-                        grad_sum_sq_masked._values().pow_(1 / 3).add_(eps)
-                    )
+                    rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps)
 
                     s.add_(grad, alpha=lamb)
                     s_masked._values().add_(grad_val, alpha=lamb)
 
                     # update masked copy of p
-                    p_kp1_masked_vals = x0_masked_vals.addcdiv(
-                        s_masked._values(), rms_masked_vals, value=-1
-                    )
+                    p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1)
                     # Copy updated masked p to dense p using an add operation
                     p_masked._values().add_(p_kp1_masked_vals, alpha=-1)
-                    p.data.add_(p_masked, alpha=-1)
+                    p.add_(p_masked, alpha=-1)
                 else:
                     if momentum == 0:
                         # Compute x_0 from other known quantities
                         rms = grad_sum_sq.pow(1 / 3).add_(eps)
-                        x0 = p.data.addcdiv(s, rms, value=1)
+                        x0 = p.addcdiv(s, rms, value=1)
                     else:
                         x0 = state['x0']
 
@@ -175,16 +162,15 @@ def step(
                     rms = grad_sum_sq.pow(1 / 3).add_(eps)
 
                     # Update s
-                    s.data.add_(grad, alpha=lamb)
+                    s.add_(grad, alpha=lamb)
 
                     # Step
                     if momentum == 0:
-                        p.data.copy_(x0.addcdiv(s, rms, value=-1))
+                        p.copy_(x0.addcdiv(s, rms, value=-1))
                     else:
                         z = x0.addcdiv(s, rms, value=-1)
 
                         # p is a moving average of z
-                        p.data.mul_(1 - ck).add_(z, alpha=ck)
+                        p.mul_(1 - ck).add_(z, alpha=ck)
 
-            group['k'] = group['k'] + 1
-        return loss
+        return loss
\ No newline at end of file
diff --git a/pywick/optimizers/nadam.py b/pywick/optimizers/nadam.py
index 31bf159..58e575b 100644
--- a/pywick/optimizers/nadam.py
+++ b/pywick/optimizers/nadam.py
@@ -1,4 +1,3 @@
-import torch
 from torch.optim.optimizer import Optimizer
 
 
diff --git a/pywick/optimizers/qhadam.py b/pywick/optimizers/qhadam.py
index e703ce4..3d36a97 100644
--- a/pywick/optimizers/qhadam.py
+++ b/pywick/optimizers/qhadam.py
@@ -3,7 +3,7 @@
 import torch
 from torch.optim.optimizer import Optimizer
 
-from .madgrad import Betas2, Nus2, OptFloat, OptLossClosure, Params
+from .a2grad import Betas2, Nus2, OptFloat, OptLossClosure, Params
 
 __all__ = ('QHAdam',)
 
diff --git a/pywick/optimizers/radam.py b/pywick/optimizers/radam.py
index 62f009e..3ba71a8 100644
--- a/pywick/optimizers/radam.py
+++ b/pywick/optimizers/radam.py
@@ -31,9 +31,6 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1
         self.buffer = [[None, None, None] for ind in range(10)]
         super(RAdam, self).__init__(params, defaults)
 
-    def __setstate__(self, state):
-        super(RAdam, self).__setstate__(state)
-
     def step(self, closure=None):
 
         loss = None
diff --git a/pywick/optimizers/ralamb.py b/pywick/optimizers/ralamb.py
index 9bee570..790d008 100644
--- a/pywick/optimizers/ralamb.py
+++ b/pywick/optimizers/ralamb.py
@@ -12,9 +12,6 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1
         self.buffer = [[None, None, None] for ind in range(10)]
         super(Ralamb, self).__init__(params, defaults)
 
-    def __setstate__(self, state):
-        super(Ralamb, self).__setstate__(state)
-
     def step(self, closure=None):
 
         loss = None
@@ -83,7 +80,7 @@ def step(self, closure=None):
 
                 radam_norm = radam_step.pow(2).sum().sqrt()
                 weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10)
-                if weight_norm == 0 or radam_norm == 0:
+                if 0 in (weight_norm, radam_norm):
                     trust_ratio = 1
                 else:
                     trust_ratio = weight_norm / radam_norm
diff --git a/pywick/optimizers/sgdw.py b/pywick/optimizers/sgdw.py
index 099ca96..bb065e6 100644
--- a/pywick/optimizers/sgdw.py
+++ b/pywick/optimizers/sgdw.py
@@ -23,7 +23,7 @@ class SGDW(Optimizer):
     Example:
         >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
         >>> optimizer.zero_grad()
-        >>> loss_fn(model(input), target).backward()
+        >>> loss_fn(model(input_), target).backward()
         >>> optimizer.step()
 
     .. note::
diff --git a/pywick/optimizers/sign_internal_decay.py b/pywick/optimizers/sign_internal_decay.py
index c6331a9..2514eea 100644
--- a/pywick/optimizers/sign_internal_decay.py
+++ b/pywick/optimizers/sign_internal_decay.py
@@ -2,7 +2,7 @@
 
 import math
 
-class _SignInternalDecay(object):
+class _SignInternalDecay:
     """Base class for internal decays for PowerSign and AddSign optimizers.
 
     Arguments:
@@ -28,8 +28,6 @@ class LinearInternalDecay(_SignInternalDecay):
     .. _Neural Optimizer Search with Reinforcement Learning:
         https://arxiv.org/abs/1709.07417
     """
-    def __init__(self, T_max):
-        super(LinearInternalDecay, self).__init__(T_max)
 
     def __call__(self, step):
         """Returns a linear decay at the current training step:
diff --git a/pywick/optimizers/swa.py b/pywick/optimizers/swa.py
index d98c423..8560439 100644
--- a/pywick/optimizers/swa.py
+++ b/pywick/optimizers/swa.py
@@ -48,14 +48,14 @@ class SWA(Optimizer):
         >>> opt = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
         >>> for _ in range(100):
         >>>     opt.zero_grad()
-        >>>     loss_fn(model(input), target).backward()
+        >>>     loss_fn(model(input_), target).backward()
         >>>     opt.step()
         >>> opt.swap_swa_sgd()
         >>> # manual mode
         >>> opt = SWA(base_opt)
         >>> for i in range(100):
         >>>     opt.zero_grad()
-        >>>     loss_fn(model(input), target).backward()
+        >>>     loss_fn(model(input_), target).backward()
         >>>     opt.step()
         >>>     if i > 10 and i % 5 == 0:
         >>>         opt.update_swa()
@@ -118,7 +118,7 @@ def __init__(self, optimizer, swa_start=None, swa_freq=None, swa_lr=None):
             group['step_counter'] = 0
 
     @staticmethod
-    def _check_params(self, swa_start, swa_freq):
+    def _check_params(swa_start, swa_freq):
         params = [swa_start, swa_freq]
         params_none = [param is None for param in params]
         if not all(params_none) and any(params_none):
@@ -150,7 +150,7 @@ def update_swa_group(self, group):
             >>> opt = torchcontrib.optim.SWA(base_opt)
             >>> for i in range(100):
             >>>     opt.zero_grad()
-            >>>     loss_fn(model(input), target).backward()
+            >>>     loss_fn(model(input_), target).backward()
             >>>     opt.step()
             >>>     if i > 10 and i % 5 == 0:
             >>>         # Update SWA for the second parameter group
@@ -283,19 +283,19 @@ def bn_update(loader, model, device=None):
         model.apply(_reset_bn)
         model.apply(lambda module: _get_momenta(module, momenta))
         n = 0
-        for input in loader:
-            if isinstance(input, (list, tuple)):
-                input = input[0]
-            b = input.size(0)
+        for input_ in loader:
+            if isinstance(input_, (list, tuple)):
+                input_ = input_[0]
+            b = input_.size(0)
 
             momentum = b / float(n + b)
-            for module in momenta.keys():
+            for module in momenta:
                 module.momentum = momentum
 
             if device is not None:
-                input = input.to(device)
+                input_ = input_.to(device)
 
-            model(input)
+            model(input_)
             n += b
 
         model.apply(lambda module: _set_momenta(module, momenta))
diff --git a/pywick/regularizers.py b/pywick/regularizers.py
index 352d1bb..2c93d01 100644
--- a/pywick/regularizers.py
+++ b/pywick/regularizers.py
@@ -4,7 +4,7 @@
 
 from .callbacks import Callback
 
-class RegularizerContainer(object):
+class RegularizerContainer:
 
     def __init__(self, regularizers):
         self.regularizers = regularizers
@@ -47,7 +47,7 @@ def on_batch_end(self, batch, logs=None):
         self.container.reset()
 
 
-class Regularizer(object):
+class Regularizer:
 
     def reset(self):
         raise NotImplementedError('subclass must implement this method')
diff --git a/pywick/samplers.py b/pywick/samplers.py
index 204c310..f67a4dd 100644
--- a/pywick/samplers.py
+++ b/pywick/samplers.py
@@ -129,7 +129,8 @@ def __init__(self, dataset, indices=None, num_samples=None):
         weights = [1.0 / label_to_count[self._get_label(dataset, idx)] for idx in self.indices]
         self.weights = torch.DoubleTensor(weights)
 
-    def _get_label(self, dataset, idx):
+    @staticmethod
+    def _get_label(dataset, idx):
         dataset_type = type(dataset)
         if dataset_type is torchvision.datasets.MNIST:
             return dataset.train_labels[idx].item()
diff --git a/pywick/train_classifier.py b/pywick/train_classifier.py
new file mode 100644
index 0000000..17e3e92
--- /dev/null
+++ b/pywick/train_classifier.py
@@ -0,0 +1,204 @@
+"""
+This code trains a neural network with parameters provided by configs/train_classifier.yaml. Feel free to tweak parameters and train on your own data.
+
+To run: >>> python3 train_classifier.py configs/train_classifier.yaml
+"""
+import datetime
+import json
+import sys
+import time
+from datetime import timedelta
+
+import albumentations as Album
+import cv2
+import torch
+import torch.utils.data as data
+import yaml
+from albumentations.pytorch import ToTensorV2
+from pywick import optimizers as optims
+from pywick.datasets.ClonedFolderDataset import random_split_dataset
+from pywick.datasets.FolderDataset import FolderDataset
+from pywick.datasets.MultiFolderDataset import MultiFolderDataset
+from pywick.datasets.data_utils import adjust_dset_length
+from pywick.dictmodels import ExpConfig
+from pywick.initializers import XavierUniform
+from pywick.metrics import CategoricalAccuracySingleInput
+from pywick.models import load_model, ModelType
+from pywick.modules import ModuleTrainer
+from pywick.utils import class_factory
+from pywick.samplers import ImbalancedDatasetSampler
+from pywick.transforms import read_cv2_as_rgb
+from pywick.cust_random import set_seed
+
+
+def load_image(path: str):  # image loader handed to the datasets in main() as `default_loader`
+    return read_cv2_as_rgb(path)  # thin delegate to pywick.transforms (presumably yields an RGB image — confirm)
+
+
+def main(config: ExpConfig):
+    """
+    Run training based on the loaded parameters
+
+    :param config:     Configuration to execute
+    :return:
+    """
+
+    dsets = {}
+    dset_loaders = {}
+    if not config.val_root:                     # if no validation root provided, we use a part of the full dataset instead
+        total_set = MultiFolderDataset(roots=config.dataroots, class_mode='label', default_loader=load_image)
+        dsets['train'], dsets['val'] = random_split_dataset(orig_dataset=total_set, splitRatio=config.train_val_ratio, random_seed=config.random_seed)
+    else:
+        dsets['train'] = MultiFolderDataset(roots=config.dataroots, class_mode='label', default_loader=load_image)
+        dsets['val'] = FolderDataset(root=config.val_root, class_mode='label', default_loader=load_image)
+
+    # Trim the datasets to fit correctly onto N devices in batches of size B
+    num_devices = 1 if len(config.get('gpu_ids', 0)) == 0 or not config.use_gpu else len(config.gpu_ids)
+    batch_size = config.batch_size
+    adjust_dset_length(dataset=dsets['train'],
+                       num_batches=len(dsets['train']) // (num_devices * batch_size),
+                       num_devices=num_devices,
+                       batch_size=batch_size)
+    adjust_dset_length(dataset=dsets['val'],
+                       num_batches=len(dsets['val']) // (num_devices * batch_size),
+                       num_devices=num_devices,
+                       batch_size=batch_size)
+
+    # we may want to balance the data representation
+    if config.auto_balance_dataset:
+        dset_loaders['train'] = data.DataLoader(dsets['train'],
+                                                sampler=ImbalancedDatasetSampler(dsets['train']),
+                                                batch_size=config.batch_size,
+                                                num_workers=config.workers,
+                                                shuffle=False,
+                                                pin_memory=True)
+    else:
+        dset_loaders['train'] = data.DataLoader(dsets['train'],
+                                                batch_size=config.batch_size,
+                                                num_workers=config.workers,
+                                                shuffle=True,
+                                                pin_memory=True)
+    dset_loaders['val'] = data.DataLoader(dsets['val'],
+                                          batch_size=config.batch_size,
+                                          num_workers=config.workers,
+                                          shuffle=False,
+                                          pin_memory=True)
+
+    dset_sizes = {x: len(dsets[x]) for x in ['train', 'val']}
+
+    device = 'cpu'              # CPU is default but if all checks pass, GPU will be enabled
+    if config.use_gpu and torch.cuda.is_available():
+        device = 'cuda:{}'.format(config.gpu_ids[0])
+
+    # load appropriate model from pywick's model store
+    model = load_model(model_type=ModelType.CLASSIFICATION,
+                       model_name=config.model_spec,
+                       num_classes=len(dsets['train'].class_to_idx),
+                       input_size=None,
+                       pretrained=True,
+                       force_reload=True)
+
+    mean, std = config.mean_std
+    class_to_idx = dsets['train'].class_to_idx
+
+    # Create augmentation and normalization transforms for training + normalization for validation
+    data_transforms = {
+            'train': Album.Compose([
+                # Apply image transforms
+                Album.RandomCrop(height=config.input_size+50, width=config.input_size+50, always_apply=True, p=1),
+                Album.Resize(height=config.input_size, width=config.input_size, interpolation=cv2.INTER_LINEAR, always_apply=True, p=1),
+                Album.RandomBrightnessContrast(brightness_limit=(-0.1, 0.3), contrast_limit=0.4, p=0.6),
+                Album.ShiftScaleRotate(shift_limit=0.2, scale_limit=(-0.4, 0.2), rotate_limit=270, p=0.9, border_mode=cv2.BORDER_REPLICATE),
+                Album.CoarseDropout(max_holes=14, max_height=12, max_width=12, p=0.5),
+                # normalize and convert to tensor
+                Album.Compose([Album.Normalize(mean=mean, std=std, always_apply=True, p=1), ToTensorV2()])
+            ]),
+            'val': Album.Compose([
+                Album.Resize(height=config.input_size, width=config.input_size, interpolation=cv2.INTER_LINEAR, always_apply=True, p=1),
+                Album.ShiftScaleRotate(shift_limit=0.2, scale_limit=(-0.2, 0.2), rotate_limit=270, p=1, border_mode=cv2.BORDER_REPLICATE),
+                Album.Compose([Album.Normalize(mean=mean, std=std, always_apply=True, p=1), ToTensorV2()])
+            ])
+        }
+
+    # Set transforms for each dataset
+    dsets['train'].transform = lambda in_dict: data_transforms['train'](**in_dict)['image']
+    dsets['val'].transform = lambda in_dict: data_transforms['val'](**in_dict)['image']
+    
+    print(f"Configuration Params: \n{config}")
+    print('--------------------------------')
+    print(f"Dataset Stats:")
+    print(f"Num classes: {len(dsets['train'].classes)}")
+    print(f"Train Set Size: {dset_sizes['train']}")
+    print(f"Val Set Size: {dset_sizes['val']}")
+
+    # load desired optimizer (torch.optim and pywick.optimizer types supported)
+    # optimizer = class_factory(classname=config.optimizer['name'], params_dict=config.optimizer.get('params'))
+
+    optimizer = optims.__dict__[config.optimizer['name']](model.parameters(), **(config.optimizer.get('params').to_dict()))
+
+    if device != 'cpu':
+        trainer = ModuleTrainer(model, cuda_devices=config.gpu_ids)
+    else:
+        trainer = ModuleTrainer(model)
+
+    # set up desired callbacks
+    callbacks = []
+    if config.save_callback is not None:
+        if config.save_callback.name == 'ModelCheckpoint':
+            config.save_callback.params['run_id'] = config.exp_id
+            config.save_callback.params['addl_k_v'] = {'num_classes': len(dsets['train'].class_to_idx),
+                                                       'mean_std': config.mean_std,
+                                                       'model_name': config.model_spec,
+                                                       'optimizer': config.optimizer.get('name')}
+            config.save_callback.params['epoch_log_keys'] = ['val_top_1:acc_metric', 'val_top_5:acc_metric']
+
+        checkpt_callback = class_factory(classname=config.save_callback.name, params_dict=config.save_callback.get('params').to_dict())
+        callbacks.append(checkpt_callback)
+
+    # create a scheduler
+    if config.scheduler.name == 'OnceCycleLRScheduler':
+        config.scheduler['params']['steps_per_epoch'] = len(dset_loaders['train'])
+    config.scheduler['params']['epochs'] = config.num_epochs
+    config.scheduler['params']['optimizer'] = optimizer
+    scheduler = class_factory(classname=config.scheduler['name'], params_dict=config.scheduler.get('params'))
+    callbacks.append(scheduler)
+
+    trainer.compile(criterion='cross_entropy',
+                    callbacks=[checkpt_callback] if checkpt_callback is not None else None,
+                    optimizer=optimizer,
+                    # regularizers=regularizers,                # <-- not included in example but can add regularizers
+                    # constraints=constraints,                  #     ... and constraints
+                    initializers=[XavierUniform(bias=False, module_filter='fc*')],
+                    metrics=[CategoricalAccuracySingleInput(top_k=1), CategoricalAccuracySingleInput(top_k=5)])
+
+    start_time = time.time()
+    print(f'''Starting Training: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}''')
+
+    trainer.fit_loader(dset_loaders['train'],
+                       val_loader=dset_loaders['val'],
+                       num_epoch=config.num_epochs,
+                       verbose=1)
+
+    print(f'Training Complete (time: {timedelta(seconds=int(time.time() - start_time))})')
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        raise AssertionError("Only one argument is expected: config_path")
+    config_path = sys.argv[1]
+    # Build the experiment configuration from the 'train' section of the YAML/JSON file given on the command line
+    with open(config_path, 'r') as f:
+        if config_path.endswith(('.yml', '.yaml')):
+            config = ExpConfig.from_dict(yaml.safe_load(f)['train'])  # loads the 'train' configuration from yaml
+        elif config_path.endswith('.json'):
+            config = ExpConfig.from_dict(json.load(f)['train'])  # loads the 'train' configuration from json
+        else:
+            raise ValueError('Configuration file extension must be either .yaml/.yml or .json')
+    config.verify_properties()  # make sure all properties have been set (file handle is already closed here)
+    set_seed(config.random_seed)  # seed before main() so the run uses the configured random_seed
+
+    # if not config.use_gpu or not torch.cuda.is_available():  # this is a known problem / limitation of the multiprocessing module.
+    #     import multiprocessing
+    #     multiprocessing.set_start_method('fork')  # must set multiprocessing to 'fork' from 'spawn' because the dataloader fails to pickle lambda
+
+    main(config)
diff --git a/pywick/transforms/__init__.py b/pywick/transforms/__init__.py
index 9f3d7e6..b03eafd 100644
--- a/pywick/transforms/__init__.py
+++ b/pywick/transforms/__init__.py
@@ -1,3 +1,6 @@
+"""
+Along with custom transforms provided by Pywick, we fully support integration of `Albumentations <https://github.com/albumentations-team/albumentations/>`_, which contains a great number of useful transform functions. See train_classifier.py for an example of how to incorporate Albumentations into training.
+"""
 
 from .affine_transforms import *
 from .distortion_transforms import *
diff --git a/pywick/transforms/affine_transforms.py b/pywick/transforms/affine_transforms.py
index a3d88f4..de19d37 100644
--- a/pywick/transforms/affine_transforms.py
+++ b/pywick/transforms/affine_transforms.py
@@ -10,7 +10,7 @@
 from ..utils import th_affine2d, th_random_choice
 
 
-class RandomAffine(object):
+class RandomAffine:
 
     def __init__(self, 
                  rotation_range=None, 
@@ -87,13 +87,12 @@ def __call__(self, *inputs):
 
         if self.lazy:
             return tform_matrix
-        else:
-            outputs = Affine(tform_matrix,
-                             interp=self.interp)(*inputs)
-            return outputs
+
+        outputs = Affine(tform_matrix, interp=self.interp)(*inputs)
+        return outputs
 
 
-class Affine(object):
+class Affine:
 
     def __init__(self, 
                  tform_matrix,
@@ -124,15 +123,14 @@ def __call__(self, *inputs):
             interp = self.interp
 
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
-            input_tf = th_affine2d(_input,
-                                   self.tform_matrix,
-                                   mode=interp[idx])
+            input_tf = th_affine2d(_input, self.tform_matrix, mode=interp[idx])
             outputs.append(input_tf)
         return outputs if idx >= 1 else outputs[0]
 
 
-class AffineCompose(object):
+class AffineCompose:
 
     def __init__(self, 
                  transforms,
@@ -175,6 +173,7 @@ def __call__(self, *inputs):
             interp = self.interp
 
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             input_tf = th_affine2d(_input,
                                    tform_matrix,
@@ -183,7 +182,7 @@ def __call__(self, *inputs):
         return outputs if idx >= 1 else outputs[0]
 
 
-class RandomRotate(object):
+class RandomRotate:
 
     def __init__(self, 
                  rotation_range,
@@ -216,13 +215,12 @@ def __call__(self, *inputs):
 
         if self.lazy:
             return Rotate(degree, lazy=True)(inputs[0])
-        else:
-            outputs = Rotate(degree,
-                             interp=self.interp)(*inputs)
-            return outputs
+
+        outputs = Rotate(degree, interp=self.interp)(*inputs)
+        return outputs
 
 
-class RandomChoiceRotate(object):
+class RandomChoiceRotate:
 
     def __init__(self, 
                  values,
@@ -267,13 +265,12 @@ def __call__(self, *inputs):
 
         if self.lazy:
             return Rotate(degree, lazy=True)(inputs[0])
-        else:
-            outputs = Rotate(degree,
-                             interp=self.interp)(*inputs)
-            return outputs
+
+        outputs = Rotate(degree, interp=self.interp)(*inputs)
+        return outputs
 
 
-class Rotate(object):
+class Rotate:
 
     def __init__(self, 
                  value,
@@ -313,18 +310,16 @@ def __call__(self, *inputs):
                                           [0, 0, 1]])
         if self.lazy:
             return rotation_matrix
-        else:
-            outputs = []
-            for idx, _input in enumerate(inputs):
-                input_tf = th_affine2d(_input,
-                                       rotation_matrix,
-                                       mode=interp[idx],
-                                       center=True)
-                outputs.append(input_tf)
-            return outputs if idx >= 1 else outputs[0]
+
+        outputs = []
+        idx = None
+        for idx, _input in enumerate(inputs):
+            input_tf = th_affine2d(_input, rotation_matrix, mode=interp[idx], center=True)
+            outputs.append(input_tf)
+        return outputs if idx >= 1 else outputs[0]
 
 
-class RandomTranslate(object):
+class RandomTranslate:
 
     def __init__(self, 
                  translation_range,
@@ -370,15 +365,13 @@ def __call__(self, *inputs):
         random_width = random.uniform(-self.width_range, self.width_range)
 
         if self.lazy:
-            return Translate([random_height, random_width], 
-                             lazy=True)(inputs[0])
-        else:
-            outputs = Translate([random_height, random_width],
-                                 interp=self.interp)(*inputs)
-            return outputs
+            return Translate([random_height, random_width], lazy=True)(inputs[0])
+
+        outputs = Translate([random_height, random_width], interp=self.interp)(*inputs)
+        return outputs
 
 
-class RandomChoiceTranslate(object):
+class RandomChoiceTranslate:
 
     def __init__(self,
                  values,
@@ -427,13 +420,12 @@ def __call__(self, *inputs):
         if self.lazy:
             return Translate([random_height, random_width],
                              lazy=True)(inputs[0])
-        else:
-            outputs = Translate([random_height, random_width],
-                                interp=self.interp)(*inputs)
-            return outputs
+
+        outputs = Translate([random_height, random_width], interp=self.interp)(*inputs)
+        return outputs
 
 
-class Translate(object):
+class Translate:
 
     def __init__(self, 
                  value, 
@@ -480,18 +472,16 @@ def __call__(self, *inputs):
                                              [0, 0, 1]])
         if self.lazy:
             return translation_matrix
-        else:
-            outputs = []
-            for idx, _input in enumerate(inputs):
-                input_tf = th_affine2d(_input,
-                                       translation_matrix,
-                                       mode=interp[idx],
-                                       center=True)
-                outputs.append(input_tf)
-            return outputs if idx >= 1 else outputs[0]
+
+        outputs = []
+        idx = None
+        for idx, _input in enumerate(inputs):
+            input_tf = th_affine2d(_input, translation_matrix, mode=interp[idx], center=True)
+            outputs.append(input_tf)
+        return outputs if idx >= 1 else outputs[0]
 
 
-class RandomShear(object):
+class RandomShear:
 
     def __init__(self, 
                  shear_range,
@@ -521,15 +511,13 @@ def __init__(self,
     def __call__(self, *inputs):
         shear = random.uniform(-self.shear_range, self.shear_range)
         if self.lazy:
-            return Shear(shear, 
-                         lazy=True)(inputs[0])
-        else:
-            outputs = Shear(shear,
-                            interp=self.interp)(*inputs)
-            return outputs
+            return Shear(shear, lazy=True)(inputs[0])
+
+        outputs = Shear(shear, interp=self.interp)(*inputs)
+        return outputs
 
 
-class RandomChoiceShear(object):
+class RandomChoiceShear:
 
     def __init__(self,
                  values,
@@ -574,13 +562,12 @@ def __call__(self, *inputs):
         if self.lazy:
             return Shear(shear, 
                          lazy=True)(inputs[0])
-        else:
-            outputs = Shear(shear,
-                            interp=self.interp)(*inputs)
-            return outputs 
+
+        outputs = Shear(shear, interp=self.interp)(*inputs)
+        return outputs
 
 
-class Shear(object):
+class Shear:
 
     def __init__(self,
                  value,
@@ -602,18 +589,16 @@ def __call__(self, *inputs):
                                         [0, 0, 1]])
         if self.lazy:
             return shear_matrix
-        else:
-            outputs = []
-            for idx, _input in enumerate(inputs):
-                input_tf = th_affine2d(_input,
-                                       shear_matrix,
-                                       mode=interp[idx],
-                                       center=True)
-                outputs.append(input_tf)
-            return outputs if idx >= 1 else outputs[0]
+
+        outputs = []
+        idx = None
+        for idx, _input in enumerate(inputs):
+            input_tf = th_affine2d(_input, shear_matrix, mode=interp[idx], center=True)
+            outputs.append(input_tf)
+        return outputs if idx >= 1 else outputs[0]
 
 
-class RandomSquareZoom(object):
+class RandomSquareZoom:
 
     def __init__(self,
                  zoom_range,
@@ -649,12 +634,9 @@ def __call__(self, *inputs):
         zy = zx
         if self.lazy:
             return Zoom([zx, zy], lazy=True)(inputs[0])
-        else:
-            outputs = Zoom([zx, zy],
-                           interp=self.interp)(*inputs)
-        return outputs
+        return Zoom([zx, zy], interp=self.interp)(*inputs)
 
-class RandomZoom(object):
+class RandomZoom:
 
     def __init__(self, 
                  zoom_range,
@@ -694,13 +676,11 @@ def __call__(self, *inputs):
 
         if self.lazy:
             return Zoom([zx, zy], lazy=True)(inputs[0])
-        else:
-            outputs = Zoom([zx, zy], 
-                           interp=self.interp)(*inputs)
-            return outputs
+
+        return Zoom([zx, zy],  interp=self.interp)(*inputs)
 
 
-class RandomChoiceZoom(object):
+class RandomChoiceZoom:
 
     def __init__(self, 
                  values,
@@ -746,13 +726,11 @@ def __call__(self, *inputs):
 
         if self.lazy:
             return Zoom([zx, zy], lazy=True)(inputs[0])
-        else:
-            outputs = Zoom([zx, zy], 
-                           interp=self.interp)(*inputs)
-            return outputs
 
+        return Zoom([zx, zy], interp=self.interp)(*inputs)
 
-class Zoom(object):
+
+class Zoom:
 
     def __init__(self,
                  value,
@@ -795,14 +773,12 @@ def __call__(self, *inputs):
 
         if self.lazy:
             return zoom_matrix
-        else:
-            outputs = []
-            for idx, _input in enumerate(inputs):
-                input_tf = th_affine2d(_input,
-                                       zoom_matrix,
-                                       mode=interp[idx],
-                                       center=True)
-                outputs.append(input_tf)
-            return outputs if idx >= 1 else outputs[0]
+
+        outputs = []
+        idx = None
+        for idx, _input in enumerate(inputs):
+            input_tf = th_affine2d(_input, zoom_matrix, mode=interp[idx], center=True)
+            outputs.append(input_tf)
+        return outputs if idx >= 1 else outputs[0]
 
 
diff --git a/pywick/transforms/distortion_transforms.py b/pywick/transforms/distortion_transforms.py
index abbe988..d6bab27 100644
--- a/pywick/transforms/distortion_transforms.py
+++ b/pywick/transforms/distortion_transforms.py
@@ -8,7 +8,7 @@
 import random
 
 
-class Scramble(object):
+class Scramble:
     """
     Create blocks of an image and scramble them
     """
@@ -17,6 +17,7 @@ def __init__(self, blocksize):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             size = _input.size()
             img_height = size[1]
@@ -39,7 +40,7 @@ def __call__(self, *inputs):
         return outputs if idx >= 1 else outputs[0]
  
 
-class RandomChoiceScramble(object):
+class RandomChoiceScramble:
 
     def __init__(self, blocksizes):
         self.blocksizes = blocksizes
@@ -101,7 +102,7 @@ def _butterworth_filter(rows, cols, thresh, order):
     return f
 
 
-class Blur(object):
+class Blur:
     """
     Blur an image with a Butterworth filter with a frequency
     cutoff matching local block size
@@ -122,6 +123,7 @@ def __call__(self, *inputs):
         inputs should have values between 0 and 255
         """
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             rows = _input.size(1)
             cols = _input.size(2)
@@ -137,7 +139,7 @@ def __call__(self, *inputs):
         return outputs if idx >= 1 else outputs[0]
 
 
-class RandomChoiceBlur(object):
+class RandomChoiceBlur:
 
     def __init__(self, thresholds, order=5):
         """
diff --git a/pywick/transforms/image_transforms.py b/pywick/transforms/image_transforms.py
index 3225586..fa81b7b 100644
--- a/pywick/transforms/image_transforms.py
+++ b/pywick/transforms/image_transforms.py
@@ -15,7 +15,7 @@
 from ..utils import th_random_choice
 
 
-class DeNormalize(object):
+class DeNormalize:
     """
     Denormalizes a tensor using provided mean, std
     """
@@ -29,7 +29,7 @@ def __call__(self, tensor):
         return tensor
 
 
-class MaskToSqueezedTensor(object):
+class MaskToSqueezedTensor:
     """
     Removes empty dimensions from the mask and converts to a torch.float32 tensor.
     Typically used with B/W masks to remove the "channel" dimension
@@ -44,7 +44,7 @@ def __call__(self, img):
         return self.to_tensor(img).squeeze()
 
 
-class MaskPixelsToMap(object):
+class MaskPixelsToMap:
     """
     Replaces the pixel values in range [0-255] with class values from supplied value_map.
 
@@ -68,7 +68,7 @@ def __call__(self, mask):
         return mask.astype(np.uint8)    # make sure it's in UINT8 format
 
 
-class MaskToTensor(object):
+class MaskToTensor:
     """
     Converts a PIL, numpy or CV image to a torch.long representation
     """
@@ -76,7 +76,7 @@ def __call__(self, img):
         return th.from_numpy(np.array(img, dtype=np.int32)).long()
 
 
-class MaskToFloatTensor(object):
+class MaskToFloatTensor:
     """
     Converts a PIL, numpy or CV image to a torch.float32 representation
     """
@@ -108,7 +108,7 @@ def _blend(img1, img2, alpha):
     return img1.mul(alpha).add(1 - alpha, img2)
 
 
-class Grayscale(object):
+class Grayscale:
 
     def __init__(self, keep_channels=False):
         """
@@ -128,6 +128,7 @@ def __init__(self, keep_channels=False):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input_dst = _input[0]*0.299 + _input[1]*0.587 + _input[2]*0.114
             _input_gs = _input_dst.repeat(self.channels,1,1)
@@ -135,7 +136,7 @@ def __call__(self, *inputs):
         return outputs if idx >= 1 else outputs[0]
 
 
-class RandomGrayscale(object):
+class RandomGrayscale:
 
     def __init__(self, p=0.5):
         """
@@ -158,7 +159,7 @@ def __call__(self, *inputs):
 # ----------------------------------------------------
 # ----------------------------------------------------
 
-class Gamma(object):
+class Gamma:
 
     def __init__(self, value):
         """
@@ -179,12 +180,13 @@ def __init__(self, value):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = th.pow(_input, self.value)
             outputs.append(_input)
         return outputs if idx >= 1 else outputs[0]
 
-class RandomGamma(object):
+class RandomGamma:
 
     def __init__(self, min_val, max_val):
         """
@@ -214,7 +216,7 @@ def __call__(self, *inputs):
         outputs = Gamma(value)(*inputs)
         return outputs
 
-class RandomChoiceGamma(object):
+class RandomChoiceGamma:
 
     def __init__(self, values, p=None):
         """
@@ -249,7 +251,7 @@ def __call__(self, *inputs):
 # ----------------------------------------------------
 # ----------------------------------------------------
 
-class Brightness(object):
+class Brightness:
     def __init__(self, value):
         """
         Alter the Brightness of an image
@@ -267,12 +269,13 @@ def __init__(self, value):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = th.clamp(_input.float().add(self.value).type(_input.type()), 0, 1)
             outputs.append(_input)
         return outputs if idx >= 1 else outputs[0]
 
-class RandomBrightness(object):
+class RandomBrightness:
 
     def __init__(self, min_val, max_val):
         """
@@ -293,7 +296,7 @@ def __call__(self, *inputs):
         outputs = Brightness(value)(*inputs)
         return outputs
 
-class RandomChoiceBrightness(object):
+class RandomChoiceBrightness:
 
     def __init__(self, values, p=None):
         """
@@ -319,7 +322,7 @@ def __call__(self, *inputs):
 # ----------------------------------------------------
 # ----------------------------------------------------
 
-class Saturation(object):
+class Saturation:
 
     def __init__(self, value):
         """
@@ -338,6 +341,7 @@ def __init__(self, value):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _in_gs = Grayscale(keep_channels=True)(_input)
             alpha = 1.0 + self.value
@@ -345,7 +349,7 @@ def __call__(self, *inputs):
             outputs.append(_in)
         return outputs if idx >= 1 else outputs[0]
 
-class RandomSaturation(object):
+class RandomSaturation:
 
     def __init__(self, min_val, max_val):
         """
@@ -366,7 +370,7 @@ def __call__(self, *inputs):
         outputs = Saturation(value)(*inputs)
         return outputs
 
-class RandomChoiceSaturation(object):
+class RandomChoiceSaturation:
 
     def __init__(self, values, p=None):
         """
@@ -393,7 +397,7 @@ def __call__(self, *inputs):
 # ----------------------------------------------------
 # ----------------------------------------------------
 
-class Contrast(object):
+class Contrast:
     """
 
     """
@@ -419,6 +423,7 @@ def __init__(self, value):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             channel_means = _input.mean(1, keepdim=True).mean(2, keepdim=True)
             channel_means = channel_means.expand_as(_input)
@@ -426,7 +431,7 @@ def __call__(self, *inputs):
             outputs.append(_input)
         return outputs if idx >= 1 else outputs[0]
 
-class RandomContrast(object):
+class RandomContrast:
 
     def __init__(self, min_val, max_val):
         """
@@ -447,7 +452,7 @@ def __call__(self, *inputs):
         outputs = Contrast(value)(*inputs)
         return outputs
 
-class RandomChoiceContrast(object):
+class RandomChoiceContrast:
 
     def __init__(self, values, p=None):
         """
diff --git a/pywick/transforms/tensor_transforms.py b/pywick/transforms/tensor_transforms.py
index a12a3f0..07129be 100644
--- a/pywick/transforms/tensor_transforms.py
+++ b/pywick/transforms/tensor_transforms.py
@@ -7,7 +7,7 @@
 import torch as th
 
 
-class Compose(object):
+class Compose:
     """
     Composes (chains) several transforms together.
 
@@ -24,7 +24,7 @@ def __call__(self, *inputs):
         return inputs
 
 
-class RandomChoiceCompose(object):
+class RandomChoiceCompose:
     """
     Randomly choose to apply one transform from a collection of transforms
 
@@ -44,19 +44,20 @@ def __call__(self, *inputs):
         return outputs
 
 
-class ToTensor(object):
+class ToTensor:
     """
     Converts a numpy array to torch.Tensor
     """
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = th.from_numpy(_input)
             outputs.append(_input)
         return outputs if idx >= 1 else outputs[0]
 
 
-class ToFile(object):
+class ToFile:
     """
     Saves an image to file. Useful as a pass-through transform
     when wanting to observe how augmentation affects the data
@@ -80,7 +81,7 @@ def __call__(self, *inputs):
         return inputs
 
 
-class ToNumpyType(object):
+class ToNumpyType:
     """
     Converts an object to a specific numpy type (with the idea to be passed to ToTensor() next)
 
@@ -90,14 +91,13 @@ class ToNumpyType(object):
     def __init__(self, type):
         self.type = type
 
-    def __call__(self, input):
-        if isinstance(input, list):     # handle a simple list
-            return np.array(input, dtype=self.type)
-        else:                           # handle ndarray (that is of a different type than desired)
-            return input.astype(self.type)
+    def __call__(self, input_):
+        if isinstance(input_, list):     # handle a simple list
+            return np.array(input_, dtype=self.type)
+        return input_.astype(self.type)
 
 
-class ChannelsLast(object):
+class ChannelsLast:
     """
     Transposes a tensor so that the channel dim is last
     `HWC` and `DHWC` are aliases for this transform.
@@ -115,9 +115,10 @@ def __call__(self, *inputs):
             # check if channels are already last
             if inputs[0].size(-1) < inputs[0].size(0):
                 return inputs
-        plist = list(range(1,ndim))+[0]
+        plist = list(range(1, ndim))+[0]
 
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = _input.permute(*plist)
             outputs.append(_input)
@@ -127,7 +128,7 @@ def __call__(self, *inputs):
 DHWC = ChannelsLast
 
 
-class ChannelsFirst(object):
+class ChannelsFirst:
     """
     Transposes a tensor so that the channel dim is first.
     `CHW` and `CDHW` are aliases for this transform.
@@ -148,6 +149,7 @@ def __call__(self, *inputs):
         plist = [ndim-1] + list(range(0,ndim-1))
 
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = _input.permute(*plist)
             outputs.append(_input)
@@ -157,7 +159,7 @@ def __call__(self, *inputs):
 CDHW = ChannelsFirst
 
 
-class TypeCast(object):
+class TypeCast:
     """
     Cast a torch.Tensor to a different type
     param dtype: (string or torch.*Tensor literal or list) of such
@@ -206,13 +208,14 @@ def __call__(self, *inputs):
             dtypes = self.dtype
         
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = _input.type(dtypes[idx])
             outputs.append(_input)
         return outputs if idx >= 1 else outputs[0]
 
 
-class AddChannel(object):
+class AddChannel:
     """Adds a dummy channel to an image, also known as expanding an axis or unsqueezing a dim
     This will make an image of size (28, 28) to now be
     of size (1, 28, 28), for example.
@@ -224,6 +227,7 @@ def __init__(self, axis=0):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = _input.unsqueeze(self.axis)
             outputs.append(_input)
@@ -233,7 +237,7 @@ def __call__(self, *inputs):
 Unsqueeze = AddChannel
 
 
-class Transpose(object):
+class Transpose:
     """
     Swaps two dimensions of a tensor
 
@@ -250,13 +254,14 @@ def __init__(self, dim1, dim2):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = th.transpose(_input, self.dim1, self.dim2)
             outputs.append(_input)
         return outputs if idx >= 1 else outputs[0]
 
 
-class RangeNormalize(object):
+class RangeNormalize:
     """
     Given min_val: (R, G, B) and max_val: (R,G,B),
     will normalize each channel of the th.*Tensor to
@@ -293,6 +298,7 @@ def __init__(self, min_val, max_val):
 
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _min_val = _input.min()
             _max_val = _input.max()
@@ -303,19 +309,20 @@ def __call__(self, *inputs):
         return outputs if idx >= 1 else outputs[0]
 
 
-class StdNormalize(object):
+class StdNormalize:
     """
     Normalize torch tensor to have zero mean and unit std deviation
     """
     def __call__(self, *inputs):
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = _input.sub(_input.mean()).div(_input.std())
             outputs.append(_input)
         return outputs if idx >= 1 else outputs[0]
 
 
-class Slice2D(object):
+class Slice2D:
     """
     Take a random 2D slice from a 3D image along
     a given axis. This image should not have a 4th channel dim.
@@ -334,34 +341,32 @@ def __init__(self, axis=0, reject_zeros=False):
 
     def __call__(self, x, y=None):
         while True:
-            keep_slice  = random.randint(0,x.size(self.axis)-1)
+            keep_slice = random.randint(0, x.size(self.axis) - 1)
             if self.axis == 0:
-                slice_x = x[keep_slice,:,:]
+                slice_x = x[keep_slice, :, :]
                 if y is not None:
-                    slice_y = y[keep_slice,:,:]
+                    slice_y = y[keep_slice, :, :]
             elif self.axis == 1:
-                slice_x = x[:,keep_slice,:]
+                slice_x = x[:, keep_slice, :]
                 if y is not None:
-                    slice_y = y[:,keep_slice,:]
+                    slice_y = y[:, keep_slice, :]
             elif self.axis == 2:
-                slice_x = x[:,:,keep_slice]
+                slice_x = x[:, :, keep_slice]
                 if y is not None:
-                    slice_y = y[:,:,keep_slice]
+                    slice_y = y[:, :, keep_slice]
 
             if not self.reject_zeros:
                 break
-            else:
-                if y is not None and th.sum(slice_y) > 0:
-                        break
-                elif th.sum(slice_x) > 0:
-                        break
+            if y is not None and th.sum(slice_y) > 0:
+                break
+            if th.sum(slice_x) > 0:
+                break
         if y is not None:
             return slice_x, slice_y
-        else:
-            return slice_x
+        return slice_x
 
 
-class RandomCrop(object):
+class RandomCrop:
     """
     Randomly crop a torch tensor
 
@@ -375,13 +380,14 @@ def __call__(self, *inputs):
         h_idx = random.randint(0,inputs[0].size(1)-self.size[0])
         w_idx = random.randint(0,inputs[1].size(2)-self.size[1])
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = _input[:, h_idx:(h_idx+self.size[0]),w_idx:(w_idx+self.size[1])]
             outputs.append(_input)
         return outputs if idx >= 1 else outputs[0]
 
 
-class SpecialCrop(object):
+class SpecialCrop:
     """
     Perform a special crop - one of the four corners or center crop
 
@@ -435,11 +441,10 @@ def __call__(self, x, y=None):
         if y is not None:
             y = y[:,indices[0][0]:indices[0][1],indices[1][0]:indices[1][1]]
             return x, y
-        else:
-            return x
+        return x
 
 
-class Pad(object):
+class Pad:
 
     """
     Pads an image to the given size
@@ -462,11 +467,10 @@ def __call__(self, x, y=None):
             y = y.numpy()
             y = np.pad(y, pad_sizes, mode='constant')
             return th.from_numpy(x), th.from_numpy(y)
-        else:
-            return th.from_numpy(x)
+        return th.from_numpy(x)
 
 
-class PadNumpy(object):
+class PadNumpy:
 
     """
     Pads a Numpy image to the given size
@@ -487,11 +491,10 @@ def __call__(self, x, y=None):
         if y is not None:
             y = np.pad(y, pad_sizes, mode='constant')
             return x, y
-        else:
-            return x
+        return x
 
 
-class RandomFlip(object):
+class RandomFlip:
 
     """
     Randomly flip an image horizontally and/or vertically with
@@ -536,17 +539,17 @@ def __call__(self, x, y=None):
         if y is None:
             # must copy because torch doesnt current support neg strides
             return th.from_numpy(x.copy())
-        else:
-            return th.from_numpy(x.copy()),th.from_numpy(y.copy())
+        return th.from_numpy(x.copy()),th.from_numpy(y.copy())
 
 
-class RandomOrder(object):
+class RandomOrder:
     """
     Randomly permute the channels of an image
     """
     def __call__(self, *inputs):
         order = th.randperm(inputs[0].dim())
         outputs = []
+        idx = None
         for idx, _input in enumerate(inputs):
             _input = _input.index_select(0, order)
             outputs.append(_input)
diff --git a/pywick/utils.py b/pywick/utils.py
index e7bff08..68332cc 100644
--- a/pywick/utils.py
+++ b/pywick/utils.py
@@ -1,10 +1,11 @@
 """
-Utility functions for th.Tensors
+Utility functions
 """
 
 import pickle
 import random
-import numpy as np
+from .optimizers import *
+from .callbacks import *
 
 import torch as th
 
@@ -133,43 +134,43 @@ def th_affine2d(x, matrix, mode='bilinear', center=True):
     return x_transformed
 
 
-def th_nearest_interp2d(input, coords):
+def th_nearest_interp2d(input_, coords):
     """
     2d nearest neighbor interpolation th.Tensor
     """
     # take clamp of coords so they're in the image bounds
-    x = th.clamp(coords[:,:,0], 0, input.size(1)-1).round()
-    y = th.clamp(coords[:,:,1], 0, input.size(2)-1).round()
+    x = th.clamp(coords[:,:,0], 0, input_.size(1) - 1).round()
+    y = th.clamp(coords[:,:,1], 0, input_.size(2) - 1).round()
 
-    stride = th.LongTensor(input.stride())
+    stride = th.LongTensor(input_.stride())
     x_ix = x.mul(stride[1]).long()
     y_ix = y.mul(stride[2]).long()
 
-    input_flat = input.view(input.size(0),-1)
+    input_flat = input_.view(input_.size(0), -1)
 
     mapped_vals = input_flat.gather(1, x_ix.add(y_ix))
 
-    return mapped_vals.view_as(input)
+    return mapped_vals.view_as(input_)
 
 
-def th_bilinear_interp2d(input, coords):
+def th_bilinear_interp2d(input_, coords):
     """
     bilinear interpolation in 2d
     """
-    x = th.clamp(coords[:,:,0], 0, input.size(1)-2)
+    x = th.clamp(coords[:,:,0], 0, input_.size(1) - 2)
     x0 = x.floor()
     x1 = x0 + 1
-    y = th.clamp(coords[:,:,1], 0, input.size(2)-2)
+    y = th.clamp(coords[:,:,1], 0, input_.size(2) - 2)
     y0 = y.floor()
     y1 = y0 + 1
 
-    stride = th.LongTensor(input.stride())
+    stride = th.LongTensor(input_.stride())
     x0_ix = x0.mul(stride[1]).long()
     x1_ix = x1.mul(stride[1]).long()
     y0_ix = y0.mul(stride[2]).long()
     y1_ix = y1.mul(stride[2]).long()
 
-    input_flat = input.view(input.size(0),-1)
+    input_flat = input_.view(input_.size(0), -1)
 
     vals_00 = input_flat.gather(1, x0_ix.add(y0_ix))
     vals_10 = input_flat.gather(1, x1_ix.add(y0_ix))
@@ -186,7 +187,7 @@ def th_bilinear_interp2d(input, coords):
                 vals_01.mul(xm).mul(yd) +
                 vals_11.mul(xd).mul(yd))
 
-    return x_mapped.view_as(input)
+    return x_mapped.view_as(input_)
 
 
 def th_affine3d(x, matrix, mode='trilinear', center=True):
@@ -227,43 +228,43 @@ def th_affine3d(x, matrix, mode='trilinear', center=True):
     return x_transformed
 
 
-def th_nearest_interp3d(input, coords):
+def th_nearest_interp3d(input_, coords):
     """
     2d nearest neighbor interpolation th.Tensor
     """
     # take clamp of coords so they're in the image bounds
-    coords[:,0] = th.clamp(coords[:,0], 0, input.size(1)-1).round()
-    coords[:,1] = th.clamp(coords[:,1], 0, input.size(2)-1).round()
-    coords[:,2] = th.clamp(coords[:,2], 0, input.size(3)-1).round()
+    coords[:,0] = th.clamp(coords[:,0], 0, input_.size(1) - 1).round()
+    coords[:,1] = th.clamp(coords[:,1], 0, input_.size(2) - 1).round()
+    coords[:,2] = th.clamp(coords[:,2], 0, input_.size(3) - 1).round()
 
-    stride = th.LongTensor(input.stride())[1:].float()
+    stride = th.LongTensor(input_.stride())[1:].float()
     idx = coords.mv(stride).long()
 
-    input_flat = th_flatten(input)
+    input_flat = th_flatten(input_)
 
     mapped_vals = input_flat[idx]
 
-    return mapped_vals.view_as(input)
+    return mapped_vals.view_as(input_)
 
 
-def th_trilinear_interp3d(input, coords):
+def th_trilinear_interp3d(input_, coords):
     """
     trilinear interpolation of 3D th.Tensor image
     """
     # take clamp then floor/ceil of x coords
-    x = th.clamp(coords[:,0], 0, input.size(1)-2)
+    x = th.clamp(coords[:,0], 0, input_.size(1) - 2)
     x0 = x.floor()
     x1 = x0 + 1
     # take clamp then floor/ceil of y coords
-    y = th.clamp(coords[:,1], 0, input.size(2)-2)
+    y = th.clamp(coords[:,1], 0, input_.size(2) - 2)
     y0 = y.floor()
     y1 = y0 + 1
     # take clamp then floor/ceil of z coords
-    z = th.clamp(coords[:,2], 0, input.size(3)-2)
+    z = th.clamp(coords[:,2], 0, input_.size(3) - 2)
     z0 = z.floor()
     z1 = z0 + 1
 
-    stride = th.LongTensor(input.stride())[1:]
+    stride = th.LongTensor(input_.stride())[1:]
     x0_ix = x0.mul(stride[0]).long()
     x1_ix = x1.mul(stride[0]).long()
     y0_ix = y0.mul(stride[1]).long()
@@ -271,7 +272,7 @@ def th_trilinear_interp3d(input, coords):
     z0_ix = z0.mul(stride[2]).long()
     z1_ix = z1.mul(stride[2]).long()
 
-    input_flat = th_flatten(input)
+    input_flat = th_flatten(input_)
 
     vals_000 = input_flat[x0_ix+y0_ix+z0_ix]
     vals_100 = input_flat[x1_ix+y0_ix+z0_ix]
@@ -298,7 +299,7 @@ def th_trilinear_interp3d(input, coords):
                 vals_110.mul(xd).mul(yd).mul(zm1) +
                 vals_111.mul(xd).mul(yd).mul(zd))
 
-    return x_mapped.view_as(input)
+    return x_mapped.view_as(input_)
 
 
 def th_pearsonr(x, y):
@@ -402,24 +403,36 @@ def th_random_choice(a, n_samples=1, replace=True, p=None):
         selection = selection[0]
     return selection
 
+## save_transform / load_transform REMOVED due to security concerns with pickle (loading an untrusted pickle file can execute arbitrary code)
+# def save_transform(file, transform):
+#     """
+#     Save a transform object
+#     """
+#     with open(file, 'wb') as output_file:
+#         pickler = pickle.Pickler(output_file, -1)
+#         pickler.dump(transform)
+#
+#
+# def load_transform(file):
+#     """
+#     Load a transform object
+#     """
+#     with open(file, 'rb') as input_file:
+#         transform = pickle.load(input_file)
+#     return transform
+    
 
-def save_transform(file, transform):
-    """
-    Save a transform object
-    """
-    with open(file, 'wb') as output_file:
-        pickler = pickle.Pickler(output_file, -1)
-        pickler.dump(transform)
+from pywick.callbacks import *
 
 
-def load_transform(file):
+def class_factory(classname: str, params_dict: dict = None):
     """
-    Load a transform object
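+    Instantiate a class, looked up by name, with the given keyword parameters
+    :param classname:       Name of the class; it must be present in this module's globals() (KeyError otherwise)
+    :param params_dict:     Dict of keyword arguments passed to the class constructor (None means no arguments)
+    :return:                New instance of the requested class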
     """
-    with open(file, 'rb') as input_file:
-        transform = pickle.load(input_file)
-    return transform
-    
-
-
-    
+    if params_dict is None:
+        params_dict = {}
+    cls = globals()[classname]
+    return cls(**params_dict)
diff --git a/requirements.txt b/requirements.txt
index 017ac49..a74f64c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,20 @@
-git+https://github.com/albumentations-team/albumentations.git@1.0.3   # commit: 929cbd84cd06b68ca352d6ab81dae27cfe97d6ad
+albumentations
 dill
-hickle
-inplace_abn
+#hickle
+h5py
+# inplace_abn
 numpy
 opencv-python-headless
 pandas
 pillow
 prodict
+pycm
 pyyaml
 scipy
+requests
 scikit-image
 six
-torch >= 1.6.0
-torchvision
+tabulate
+tini
 tqdm
 yacs
\ No newline at end of file
diff --git a/setup.py b/setup.py
index a843723..7cd8650 100644
--- a/setup.py
+++ b/setup.py
@@ -16,23 +16,27 @@
       long_description_content_type='text/markdown',
       author=author,
       install_requires=[
-            'albumentations >= 1.0.3',
+            'albumentations',
             'dill',
-            'hickle',
-            'inplace_abn',
+            'h5py',
             'numpy',
             'opencv-python-headless',
             'pandas',
             'pillow',
             'prodict',
+            'pycm',
             'pyyaml',
             'scipy',
+            'requests',
             'scikit-image',
+            'setuptools',
             'six',
-            'torch >= 1.7.0',
+            'tabulate',
+            'torch >= 1.6.0',
             'torchvision',
             'tqdm',
-            'yacs'
+            'yacs',
+            'wheel'
             ],
       packages=find_packages(),
       url='https://github.com/achaiah/pywick',
diff --git a/tests/integration/fit_complex/multi_input_multi_target.py b/tests/integration/fit_complex/multi_input_multi_target.py
index 9debb38..ec6c1d8 100644
--- a/tests/integration/fit_complex/multi_input_multi_target.py
+++ b/tests/integration/fit_complex/multi_input_multi_target.py
@@ -59,7 +59,7 @@ def forward(self, x, y, z):
                cons.MaxNorm(5, 0, 'batch', 'conv*')]
 callbacks = [cbks.ReduceLROnPlateau(monitor='loss', verbose=1)]
 
-trainer.compile(loss='nll_loss',
+trainer.compile(criterion='nll_loss',
                 optimizer='adadelta',
                 regularizers=regularizers,
                 constraints=constraints,
@@ -82,7 +82,7 @@ def forward(self, x, y, z):
 model = Network()
 trainer = ModuleTrainer(model)
 
-trainer.compile(loss=['nll_loss', 'nll_loss', 'nll_loss'],
+trainer.compile(criterion=['nll_loss', 'nll_loss', 'nll_loss'],
                 optimizer='adadelta',
                 regularizers=regularizers,
                 constraints=constraints,
@@ -100,7 +100,7 @@ def forward(self, x, y, z):
     model = Network()
     trainer = ModuleTrainer(model)
 
-    trainer.compile(loss=['nll_loss', 'nll_loss'],
+    trainer.compile(criterion=['nll_loss', 'nll_loss'],
                     optimizer='adadelta',
                     regularizers=regularizers,
                     constraints=constraints,
diff --git a/tests/integration/fit_loader_simple/single_input_multi_target.py b/tests/integration/fit_loader_simple/single_input_multi_target.py
index 6a5d680..ba0da3d 100644
--- a/tests/integration/fit_loader_simple/single_input_multi_target.py
+++ b/tests/integration/fit_loader_simple/single_input_multi_target.py
@@ -55,7 +55,7 @@ def forward(self, x):
 # one loss function for multiple targets
 model = Network()
 trainer = ModuleTrainer(model)
-trainer.compile(loss='nll_loss',
+trainer.compile(criterion='nll_loss',
                 optimizer='adadelta')
 
 trainer.fit_loader(train_loader,
@@ -69,7 +69,7 @@ def forward(self, x):
 # multiple loss functions
 model = Network()
 trainer = ModuleTrainer(model)
-trainer.compile(loss=['nll_loss', 'nll_loss'],
+trainer.compile(criterion=['nll_loss', 'nll_loss'],
                 optimizer='adadelta')
 trainer.fit_loader(train_loader,
                    num_epoch=3, 
diff --git a/tests/integration/fit_loader_simple/single_input_single_target.py b/tests/integration/fit_loader_simple/single_input_single_target.py
index d58f6e6..2952c10 100644
--- a/tests/integration/fit_loader_simple/single_input_single_target.py
+++ b/tests/integration/fit_loader_simple/single_input_single_target.py
@@ -55,7 +55,7 @@ def forward(self, x):
 model = Network()
 trainer = ModuleTrainer(model)
 
-trainer.compile(loss='nll_loss',
+trainer.compile(criterion='nll_loss',
                 optimizer='adadelta')
 
 trainer.fit_loader(train_loader,
diff --git a/tests/integration/fit_simple/simple_multi_input_multi_target.py b/tests/integration/fit_simple/simple_multi_input_multi_target.py
index 4b5f88f..408f4b0 100644
--- a/tests/integration/fit_simple/simple_multi_input_multi_target.py
+++ b/tests/integration/fit_simple/simple_multi_input_multi_target.py
@@ -51,7 +51,7 @@ def forward(self, x, y, z):
 model = Network()
 trainer = ModuleTrainer(model)
 
-trainer.compile(loss='nll_loss',
+trainer.compile(criterion='nll_loss',
                 optimizer='adadelta')
 
 trainer.fit([x_train, x_train, x_train], 
@@ -71,7 +71,7 @@ def forward(self, x, y, z):
 model = Network()
 trainer = ModuleTrainer(model)
 
-trainer.compile(loss=['nll_loss', 'nll_loss', 'nll_loss'],
+trainer.compile(criterion=['nll_loss', 'nll_loss', 'nll_loss'],
                 optimizer='adadelta')
 
 trainer.fit([x_train, x_train, x_train], 
@@ -86,7 +86,7 @@ def forward(self, x, y, z):
     model = Network()
     trainer = ModuleTrainer(model)
 
-    trainer.compile(loss=['nll_loss', 'nll_loss'],
+    trainer.compile(criterion=['nll_loss', 'nll_loss'],
                     optimizer='adadelta')
 
     trainer.fit([x_train, x_train, x_train], 
diff --git a/tests/integration/fit_simple/simple_multi_input_no_target.py b/tests/integration/fit_simple/simple_multi_input_no_target.py
index 578b10d..3f433fb 100644
--- a/tests/integration/fit_simple/simple_multi_input_no_target.py
+++ b/tests/integration/fit_simple/simple_multi_input_no_target.py
@@ -51,7 +51,7 @@ def forward(self, x, y, z):
 model = Network()
 trainer = ModuleTrainer(model)
 
-trainer.compile(loss='unconstrained_sum',
+trainer.compile(criterion='unconstrained_sum',
                 optimizer='adadelta')
 
 trainer.fit([x_train, x_train, x_train],
diff --git a/tests/integration/fit_simple/simple_multi_input_single_target.py b/tests/integration/fit_simple/simple_multi_input_single_target.py
index 27a9b2b..a8eea4c 100644
--- a/tests/integration/fit_simple/simple_multi_input_single_target.py
+++ b/tests/integration/fit_simple/simple_multi_input_single_target.py
@@ -51,7 +51,7 @@ def forward(self, x, y, z):
 model = Network()
 trainer = ModuleTrainer(model)
 
-trainer.compile(loss='nll_loss',
+trainer.compile(criterion='nll_loss',
                 optimizer='adadelta')
 
 trainer.fit([x_train, x_train, x_train], y_train,
diff --git a/tests/integration/fit_simple/single_input_multi_target.py b/tests/integration/fit_simple/single_input_multi_target.py
index 6e488d1..a14f452 100644
--- a/tests/integration/fit_simple/single_input_multi_target.py
+++ b/tests/integration/fit_simple/single_input_multi_target.py
@@ -51,7 +51,7 @@ def forward(self, x):
 # one loss function for multiple targets
 model = Network()
 trainer = ModuleTrainer(model)
-trainer.compile(loss='nll_loss',
+trainer.compile(criterion='nll_loss',
                 optimizer='adadelta')
 
 trainer.fit(x_train, 
@@ -67,7 +67,7 @@ def forward(self, x):
 # multiple loss functions
 model = Network()
 trainer = ModuleTrainer(model)
-trainer.compile(loss=['nll_loss', 'nll_loss'],
+trainer.compile(criterion=['nll_loss', 'nll_loss'],
                 optimizer='adadelta')
 trainer.fit(x_train, 
             [y_train, y_train], 
diff --git a/tests/integration/fit_simple/single_input_no_target.py b/tests/integration/fit_simple/single_input_no_target.py
index ee73203..c7a8d26 100644
--- a/tests/integration/fit_simple/single_input_no_target.py
+++ b/tests/integration/fit_simple/single_input_no_target.py
@@ -51,7 +51,7 @@ def forward(self, x):
 model = Network()
 trainer = ModuleTrainer(model)
 
-trainer.compile(loss='unconstrained_sum',
+trainer.compile(criterion='unconstrained_sum',
                 optimizer='adadelta')
 
 trainer.fit(x_train,
diff --git a/tests/integration/fit_simple/single_input_single_target.py b/tests/integration/fit_simple/single_input_single_target.py
index 7e398db..8be424f 100644
--- a/tests/integration/fit_simple/single_input_single_target.py
+++ b/tests/integration/fit_simple/single_input_single_target.py
@@ -52,7 +52,7 @@ def forward(self, x):
 model = Network()
 trainer = ModuleTrainer(model)
 
-trainer.compile(loss='nll_loss',
+trainer.compile(criterion='nll_loss',
                 optimizer='adadelta',
                 regularizers=[reg.L1Regularizer(1e-4)])
 
diff --git a/tests/unit/transforms/test_tensor_transforms.py b/tests/unit/transforms/test_tensor_transforms.py
index e860775..61fd42f 100644
--- a/tests/unit/transforms/test_tensor_transforms.py
+++ b/tests/unit/transforms/test_tensor_transforms.py
@@ -81,10 +81,10 @@ def ToFile_setup():
     tforms = {}
 
     ROOT = '~/desktop/data/'
-    tforms['tofile_npy'] = ToFile(root=ROOT, fmt='npy')
-    tforms['tofile_pth'] = ToFile(root=ROOT, fmt='pth')
-    tforms['tofile_jpg'] = ToFile(root=ROOT, fmt='jpg')
-    tforms['tofile_png'] = ToFile(root=ROOT, fmt='png')
+    # tforms['tofile_npy'] = ToFile(root=ROOT, fmt='npy')
+    # tforms['tofile_pth'] = ToFile(root=ROOT, fmt='pth')
+    # tforms['tofile_jpg'] = ToFile(root=ROOT, fmt='jpg')
+    # tforms['tofile_png'] = ToFile(root=ROOT, fmt='png')
 
     return tforms