diff --git a/.travis.yml b/.travis.yml
index aa1237649..b4a583533 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,9 @@ language: python
 
 # https://docs.travis-ci.com/user/caching/#pip-cache
-cache: pip
+cache:
+  directories:
+    - $HOME/.cache/pip/wheels
 
 addons:
   apt:
@@ -59,7 +61,7 @@ matrix:
 install:
   - |
     if [[ -v _DOC_AND_YAPF_TEST ]]; then
-      pip install tensorflow
+      pip install tensorflow
       pip install yapf
       pip install -e .[doc]
     else
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1d47e777..47a6eff46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -86,6 +86,8 @@ To release a new version, please update the changelog as followed:
 
 ### Changed
 
+- BatchNormLayer: support `data_format`
+
 ### Dependencies Update
 - yapf>=0.22,<0.24 => yapf>=0.22,<0.25 (PR #829)
 - sphinx>=1.7,<1.8 => sphinx>=1.7,<1.9 (PR #842)
@@ -132,7 +134,7 @@ To release a new version, please update the changelog as followed:
 
 ### Contributors
 - @DEKHTIARJonathan: #815 #818 #820 #823
-- @ndiy: #819 
+- @ndiy: #819
 - @zsdonghao: #818
 
diff --git a/README.md b/README.md
index 9ba494e8a..053d4a2e5 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,9 @@ both English and Chinese. Please click the following icons to find the documents
 [![Chinese Documentation](https://img.shields.io/badge/documentation-%E4%B8%AD%E6%96%87-blue.svg)](https://tensorlayercn.readthedocs.io/)
 [![Chinese Book](https://img.shields.io/badge/book-%E4%B8%AD%E6%96%87-blue.svg)](http://www.broadview.com.cn/book/5059/)
 
+If you want to try the experimental features on the master branch, you can find the latest documentation
+[here](https://tensorlayer.readthedocs.io/en/latest/).
+
 # Install
 
 TensorLayer has pre-requisites including TensorFlow, numpy, and others. For GPU support, CUDA and cuDNN are required.
diff --git a/docs/images/affine_transform_comparison.jpg b/docs/images/affine_transform_comparison.jpg
index bd3b9051f..f7da0b829 100644
Binary files a/docs/images/affine_transform_comparison.jpg and b/docs/images/affine_transform_comparison.jpg differ
diff --git a/docs/images/affine_transform_why.jpg b/docs/images/affine_transform_why.jpg
index f7da0b829..bd3b9051f 100644
Binary files a/docs/images/affine_transform_why.jpg and b/docs/images/affine_transform_why.jpg differ
diff --git a/docs/modules/prepro.rst b/docs/modules/prepro.rst
index 5285aeaa3..6423236dd 100644
--- a/docs/modules/prepro.rst
+++ b/docs/modules/prepro.rst
@@ -215,6 +215,14 @@ preserve the content in an image. The following figure illustrates these two ben
    :width: 100 %
    :align: center
 
+The main reason the combined affine transformation is faster is its lower computational complexity.
+Assume we have ``k`` affine transformations ``T1, ..., Tk``, where each ``Ti`` can be represented by a 3x3 matrix.
+The sequential transformation can be represented as ``y = Tk (... T1(x))``,
+and its time complexity is ``O(k N)``, where ``N`` is the cost of applying one transformation to the image ``x``.
+``N`` is linear in the size of ``x``.
+For the combined transformation ``y = (Tk ... T1) (x)``,
+the time complexity is ``O(27(k - 1) + N)``, which is just ``O(N)`` when ``27k << N``; here ``27 = 3^3`` is the cost of multiplying two 3x3 matrices.
+
 Get rotation matrix
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -261,17 +269,13 @@ Apply keypoint transform
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. autofunction:: affine_transform_keypoints
 
-Projective transform by points
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. autofunction:: projective_transform_by_points
-
 Images
 ----------
 
-- These functions only apply on a single image, use ``threading_data`` to apply multiple threading see ``tutorial_image_preprocess.py``.
-- All functions have argument ``is_random``.
-- All functions end with ``*_multi`` process all images together, usually be used for image segmentation i.e. the input and output image should be matched.
+Projective transform by points
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. autofunction:: projective_transform_by_points
 
 Rotation
 ^^^^^^^^^
 
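Illustrative aside (not part of the patch above): the complexity argument added to `docs/modules/prepro.rst` can be made concrete with a small numpy sketch. The helpers `rotation_matrix` and `shift_matrix` below are hypothetical, written only for this illustration; they are not TensorLayer APIs.

```python
import numpy as np

def rotation_matrix(deg):
    """Hypothetical helper: 3x3 affine matrix for a rotation about the origin."""
    rad = np.deg2rad(deg)
    c, s = np.cos(rad), np.sin(rad)
    return np.array([[c, -s, 0.], [s, c, 0.], [0., 0., 1.]])

def shift_matrix(dx, dy):
    """Hypothetical helper: 3x3 affine matrix for a translation."""
    return np.array([[1., 0., dx], [0., 1., dy], [0., 0., 1.]])

# Composing k transforms costs about 27(k - 1) scalar multiplications in total
# (27 per 3x3 matrix product), independent of the image size.
combined = shift_matrix(10, -5) @ rotation_matrix(30)  # right-to-left: rotate, then shift

# Applying the combined matrix touches every pixel once (O(N) for N pixels),
# instead of once per transformation (O(kN)) when the transforms are applied in turn.
h, w = 4, 6
ys, xs = np.mgrid[0:h, 0:w]
points = np.stack([xs.ravel(), ys.ravel(), np.ones(h * w)])  # 3 x N homogeneous coordinates
warped = combined @ points
```

Applying `rotation_matrix(30)` and then `shift_matrix(10, -5)` to the points one after the other gives the same result, only with two passes over the data instead of one.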
diff --git a/tensorlayer/layers/normalization.py b/tensorlayer/layers/normalization.py
index 23a7b58e2..0e51d51e9 100644
--- a/tensorlayer/layers/normalization.py
+++ b/tensorlayer/layers/normalization.py
@@ -3,6 +3,8 @@
 import tensorflow as tf
 
 from tensorflow.python.training import moving_averages
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
 
 from tensorlayer.layers.core import Layer
 from tensorlayer.layers.core import LayersConfig
@@ -69,6 +71,52 @@ def __init__(
         self._add_layers(self.outputs)
 
 
+def _to_channel_first_bias(b):
+    """Reshape [c] to [c, 1, 1]."""
+    channel_size = int(b.shape[0])
+    new_shape = (channel_size, 1, 1)
+    # new_shape = [-1, 1, 1]  # doesn't work with tensorRT
+    return tf.reshape(b, new_shape)
+
+
+def _bias_scale(x, b, data_format):
+    """The multiplication counterpart of tf.nn.bias_add."""
+    if data_format == 'NHWC':
+        return x * b
+    elif data_format == 'NCHW':
+        return x * _to_channel_first_bias(b)
+    else:
+        raise ValueError('invalid data_format: %s' % data_format)
+
+
+def _bias_add(x, b, data_format):
+    """Alternative implementation of tf.nn.bias_add which is compatible with tensorRT."""
+    if data_format == 'NHWC':
+        return tf.add(x, b)
+    elif data_format == 'NCHW':
+        return tf.add(x, _to_channel_first_bias(b))
+    else:
+        raise ValueError('invalid data_format: %s' % data_format)
+
+
+def batch_normalization(x, mean, variance, offset, scale, variance_epsilon, data_format, name=None):
+    """Data-format-aware version of tf.nn.batch_normalization."""
+    with ops.name_scope(name, 'batchnorm', [x, mean, variance, scale, offset]):
+        inv = math_ops.rsqrt(variance + variance_epsilon)
+        if scale is not None:
+            inv *= scale
+
+        a = math_ops.cast(inv, x.dtype)
+        b = math_ops.cast(offset - mean * inv if offset is not None else -mean * inv, x.dtype)
+
+        # Return a * x + b with a customized data_format.
+        # Currently TF doesn't have bias_scale, and tensorRT has a bug in converting tf.nn.bias_add,
+        # so we reimplemented them to make the model work with tensorRT.
+        # See https://github.com/tensorlayer/openpose-plus/issues/75 for more details.
+        df = {'channels_first': 'NCHW', 'channels_last': 'NHWC'}
+        return _bias_add(_bias_scale(x, a, df[data_format]), b, df[data_format])
+
+
 class BatchNormLayer(Layer):
     """
     The :class:`BatchNormLayer` is a batch normalization layer for both fully-connected and convolution outputs.
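Illustrative aside (not part of the patch above): the reshape in `_to_channel_first_bias` exists because a per-channel vector of shape `[C]` broadcasts naturally against the trailing axis of an `NHWC` tensor, but must become `[C, 1, 1]` to broadcast against an `NCHW` tensor; the patch uses the concrete channel size rather than `-1` because, per its own comment, the `[-1, 1, 1]` form does not work with tensorRT. A plain numpy sketch of the broadcasting behaviour (the array names are made up for the example):

```python
import numpy as np

n, c, h, w = 2, 3, 4, 5
scale = np.arange(1., c + 1.)             # per-channel scale, shape [C]

x_nhwc = np.ones((n, h, w, c))
y_nhwc = x_nhwc * scale                   # [C] broadcasts against the trailing channel axis

x_nchw = np.ones((n, c, h, w))
y_nchw = x_nchw * scale.reshape(c, 1, 1)  # [C, 1, 1] broadcasts over H and W instead

# Both layouts end up scaled per channel.
assert np.allclose(y_nhwc.transpose(0, 3, 1, 2), y_nchw)
```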
@@ -115,6 +163,7 @@ def __init__(
             beta_init=tf.zeros_initializer,
             gamma_init=tf.random_normal_initializer(mean=1.0, stddev=0.002),
             moving_mean_init=tf.zeros_initializer(),
+            data_format='channels_last',
             name='batchnorm_layer',
     ):
         super(BatchNormLayer, self).__init__(prev_layer=prev_layer, act=act, name=name)
@@ -123,14 +172,21 @@ def __init__(
         super(BatchNormLayer, self).__init__(prev_layer=prev_layer, act=act, name=name)
 
         logging.info(
             "BatchNormLayer %s: decay: %f epsilon: %f act: %s is_train: %s" %
             (self.name, decay, epsilon, self.act.__name__ if self.act is not None else 'No Activation', is_train)
         )
-        if decay > 1:
+        if decay < 0 or 1 < decay:
             raise Exception("decay should be between 0 to 1")
         x_shape = self.inputs.get_shape()
-        params_shape = x_shape[-1:]
+        if data_format == 'channels_last':
+            axis = len(x_shape) - 1
+        elif data_format == 'channels_first':
+            axis = 1
+        else:
+            raise ValueError('data_format should be either %s or %s' % ('channels_last', 'channels_first'))
+        params_shape = x_shape[axis]
         with tf.variable_scope(name):
-            axis = list(range(len(x_shape) - 1))
+            axes = [i for i in range(len(x_shape)) if i != axis]
+
             # 1. beta, gamma
             variables = []
@@ -176,7 +232,7 @@
 
         # 3.
         # These ops will only be preformed when training.
-        mean, variance = tf.nn.moments(self.inputs, axis)
+        mean, variance = tf.nn.moments(self.inputs, axes)
 
         update_moving_mean = moving_averages.assign_moving_average(
             moving_mean, mean, decay, zero_debias=False
@@ -196,7 +252,7 @@ def mean_var_with_update():
             mean, var = moving_mean, moving_variance
 
         self.outputs = self._apply_activation(
-            tf.nn.batch_normalization(self.inputs, mean, var, beta, gamma, epsilon)
+            batch_normalization(self.inputs, mean, var, beta, gamma, epsilon, data_format)
         )
 
         variables.extend([moving_mean, moving_variance])
diff --git a/tensorlayer/package_info.py b/tensorlayer/package_info.py
index 50c471c6d..b0974a888 100644
--- a/tensorlayer/package_info.py
+++ b/tensorlayer/package_info.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-"""Deep learning and Reinforcement learning library for Researchers and Engineers"""
+"""Deep learning and Reinforcement learning library for Researchers and Engineers."""
 
 # Use the following formatting: (major, minor, patch, prerelease)
-VERSION = (1, 10, 1, "")
+VERSION = (1, 11, 0, 'rc0')
 __shortversion__ = '.'.join(map(str, VERSION[:3]))
-__version__ = '.'.join(map(str, VERSION[:3])) + "".join(VERSION[3:])
+__version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:])
 
 __package_name__ = 'tensorlayer'
 __contact_names__ = 'TensorLayer Contributors'
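Illustrative aside (not part of the patch above): a minimal usage sketch of the new `data_format` argument, assuming the TensorFlow 1.x / TensorLayer 1.x API; the placeholder shape and layer names are made up for the example.

```python
import tensorflow as tf
import tensorlayer as tl

# A channels-first feature map: [batch, channels, height, width].
x = tf.placeholder(tf.float32, [None, 32, 56, 56], name='feature_map')
net = tl.layers.InputLayer(x, name='input')

# With this patch, BatchNormLayer computes the moments over the (N, H, W) axes
# and keeps per-channel beta/gamma/moving statistics for axis 1.
net = tl.layers.BatchNormLayer(net, decay=0.9, is_train=True, data_format='channels_first', name='bn_cf')
```

`'channels_last'` remains the default, so existing `NHWC` models are unaffected.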