diff --git a/.gitignore b/.gitignore
index 01f06be1a909f3..900e5a53cbcf3b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
.ipynb_checkpoints
node_modules
/.bazelrc
+/.tf_configure.bazelrc
/bazel-*
/third_party/py/numpy/numpy_include
/tools/bazel.rc
diff --git a/README.md b/README.md
index 84c42aad18731f..d9f05a67e0391f 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
+
-----------------
| **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
@@ -33,12 +34,12 @@ and discussion.**
People who are a little more adventurous can also try our nightly binaries:
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.0.1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.0.1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.1.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.1.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
@@ -59,11 +60,11 @@ Hello, TensorFlow!
>>>
```
-##For more information
+## For more information
* [TensorFlow website](http://tensorflow.org)
* [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf)
* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
-The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/versions/master/resources#community) for an incomplete list.
+The TensorFlow community has created amazing things with TensorFlow; please see the [resources section of tensorflow.org](https://www.tensorflow.org/about/#community) for an incomplete list.
diff --git a/RELEASE.md b/RELEASE.md
index 5f261a4543db80..156cc2e3af507f 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,110 @@
+# Release 1.1.0
+
+## Major Features and Improvements
+* Added Java API support for Windows.
+* Added `tf.spectral` module. Moved existing FFT ops to `tf.spectral` while
+ keeping an alias in the old location (`tf.*`).
+* Added 1D, 2D and 3D Fourier transform ops for real signals to `tf.spectral`.
+* Added a `tf.bincount` function (see the short sketch after this list).
+* Added Keras 2 API to contrib.
+* Added a new lightweight queue-like object, `RecordInput`.
+* Added `tf.contrib.image.compose_transforms` function.
+* Brought `tf.estimator.*` into the API. Non-deprecated functionality from `tf.contrib.learn.Estimator` is moved to `tf.estimator.Estimator` with cosmetic changes.
+* Docker images: TF images on gcr.io and Docker Hub are upgraded to ubuntu:16.04.
+* Added the following features to TensorFlow Debugger (tfdbg):
+ * Ability to inspect Python source file against TF ops and tensors (command `print_source` / `ps`)
+ * New navigation bar in Curses-based UI
+ * NodeStepper (command `invoke_stepper`) now uses intermediate tensor dumps. It also uses `TensorHandles` as direct feeds during successive `cont` calls for improved performance and reduced memory consumption.
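+
+A minimal sketch, for illustration only, of two of the additions above (`tf.spectral`
+and `tf.bincount`), assuming the 1.1 Python API; the values in comments are illustrative:
+
+```python
+import tensorflow as tf
+
+signal = tf.constant([2.0, 1.0, 0.0, 1.0])
+spectrum = tf.spectral.rfft(signal)              # real-input FFT, now under tf.spectral
+counts = tf.bincount(tf.constant([1, 1, 2, 4]))  # expected: [0, 2, 1, 0, 1]
+
+with tf.Session() as sess:
+    print(sess.run([spectrum, counts]))
+```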
+
+## Deprecations
+
+* TensorFlow 1.1.0 will be the last release with binary Mac GPU support. Going forward, we will stop testing on Mac GPU systems. We continue to welcome patches that maintain Mac GPU support, and we will try to keep the Mac GPU build working.
+
+## Changes to contrib APIs
+* The behavior of RNNCells is now stricter due to the transition towards making RNNCells act more like Keras layers.
+ * If an RNNCell is used twice in two different variable scopes, an error is raised describing how to avoid this behavior.
+  * If an RNNCell is used in a variable scope with existing conflicting variables, an error is raised showing that the RNNCell must be constructed with argument `reuse=True` (see the sketch after this list).
+* Deprecated contrib/distributions `pmf`, `pdf`, `log_pmf`, `log_pdf`.
+* Moved `bayesflow.special_math` to distributions.
+* `tf.contrib.tensor_forest.python.tensor_forest.RandomForestDeviceAssigner` removed.
+* Changed some MVN classes and parameters:
+ * `tf.contrib.distributions.MultivariateNormalFull` replaced by `tf.contrib.distributions.MultivariateNormalTriL`.
+ * `tf.contrib.distributions.MultivariateNormalCholesky` replaced by `tf.contrib.distributions.MultivariateNormalTriL`
+ * `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev` replaced
+ by `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusScale`
+  * `tf.contrib.distributions.MultivariateNormalDiag` arguments changed from `mu`, `diag_stddev` to `loc`, `scale_diag`.
+ * `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT` removed.
+ * `tf.contrib.distributions.MultivariateNormalDiagPlusLowRank` added.
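+
+A minimal sketch of the stricter RNNCell reuse behavior described above; the scope
+name, cell size, and input shape here are illustrative, not taken from the release:
+
+```python
+import tensorflow as tf
+
+inputs = tf.placeholder(tf.float32, [None, 5, 3])
+
+with tf.variable_scope("rnn"):
+    cell = tf.contrib.rnn.BasicLSTMCell(8)
+    outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
+
+# Reusing the variables created above now requires an explicit reuse=True cell
+# inside a reusing variable scope.
+with tf.variable_scope("rnn", reuse=True):
+    shared_cell = tf.contrib.rnn.BasicLSTMCell(8, reuse=True)
+    shared_outputs, _ = tf.nn.dynamic_rnn(shared_cell, inputs, dtype=tf.float32)
+```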
+
+## Bug Fixes and Other Changes
+* Java: Support for loading models exported using the SavedModel API (courtesy @EronWright).
+* Go: Added support for incremental graph execution.
+* Fixed a bug in the WALS solver when running single-threaded.
+* Added support for integer sparse feature values in `tf.contrib.layers.sparse_column_with_keys`.
+* Fixed `tf.set_random_seed(0)` to be deterministic for all ops.
+* Stability improvements for the GCS file system support.
+* Improved TensorForest performance.
+* Added support for multiple filename globs in `tf.matching_files`.
+* `LogMessage` now includes a timestamp at the beginning of each message.
+* Added MultiBox person detector example standalone binary.
+* Android demo: Makefile build functionality added to build.gradle to fully support building the TensorFlow Android demo on Windows.
+* Android demo: MultiBox priors are now read from a text file rather than a protobuf.
+* Added colocation constraints to `StagingArea`.
+* `sparse_matmul_op` reenabled for Android builds.
+* Restricted the weights rank to be the same as the broadcast target, to avoid ambiguity in broadcasting rules.
+* Upgraded libxsmm to 1.7.1 and applied other changes for performance and memory usage.
+* Fixed bfloat16 integration of LIBXSMM sparse mat-mul.
+* Improved performance and reduced memory usage by allowing ops to forward input buffers to output buffers and perform computations in-place.
+* Improved the performance of CPU assignment for strings.
+* Sped up matrix * vector multiplication and matrix * matrix multiplication with unknown shapes.
+* C API: Graph imports now support input remapping, control dependencies, and returning imported nodes (see `TF_GraphImportGraphDefWithReturnOutputs()`)
+* Multiple C++ API updates.
+* Multiple TensorBoard updates including:
+ * Users can now view image summaries at various sampled steps (instead of just the last step).
+ * Bugs involving switching runs as well as the image dashboard are fixed.
+ * Removed data download links from TensorBoard.
+ * TensorBoard uses a relative data directory, for easier embedding.
+ * TensorBoard automatically ignores outliers for domain calculation, and formats proportional values consistently.
+* Multiple tfdbg bug fixes:
+ * Fixed Windows compatibility issues.
+ * Command history now persists across runs.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+A. Besir Kurtulmus, Adal Chiriliuc, @akash, Alec-Desouza, Alex Rothberg, Alex
+Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton
+Loss, @Aravind, @Arie, Ashutosh Das, Aurélien Geron, Bairen Yi, @bakunyo, Ben
+Visser, Brady Zhou, Calpa Liu, Changming Sun, Chi Zeng, Chih Cheng Liang,
+Christopher Berner, Clark Zinzow, @Conchylicultor, Courtial Florian, Dan Ellis,
+Dan J, Dan Jarvis, Daniel Ylitalo, Darren Garvey, David Norman, David Truong,
+@DavidNorman, Dimitar Pavlov, Dmitry Persiyanov, @Eddie, @elirex, Erfan
+Noury, Eron Wright, Evgeny Mazovetskiy, Fabrizio (Misto) Milo, @fanlu, Fisher
+Coder, Franck Dernoncourt, Gagan Goel, Gao, Xiang, @Gautam, Gefu Tang,
+@guilherme, @guschmue, Hannah Provenza, Hans Pabst, @hartb, Hsiao Yi, Huazuo
+Gao, Igor Chorążewicz, Ivan Smirnov, Jakub Kolodziejczyk, Jason Gavris, Jason
+Morton, Jay Young, Jayaram Bobba, Jeremy Sawruk, Jiaming Liu, Jihun Choi,
+@jiqiu, Joan Thibault, John C F, Jojy G Varghese, Jon Malmaud, Julian Berman,
+Julian Niedermeier, Junpeng Lao, Kai Sasaki, @Kankroc, Karl Lessard, Kyle
+Bostelmann, @Lezcano, Li Yi, Luo Yun, @lurker, Mahmoud-Abuzaina, Mandeep Singh,
+Marek Kolodziej, Mark Szepieniec, Martial Hue, Medhat Omr, Memo Akten, Michael
+Gharbi, Michaël Defferrard, Milan Straka, @MircoT, @mlucool, Muammar Ibn Faisal,
+Nayana Thorat, @nghiattran, Nicholas Connor, Nikolaas Steenbergen, Niraj Patel,
+Niranjan Hasabnis, @Panmari, Pavel Bulanov, Philip Pries Henningsen, Philipp
+Jund, @polonez, Prayag Verma, Rahul Kavi, Raphael Gontijo Lopes, @rasbt, Raven
+Iqqe, Reid Pryzant, Richard Shin, Rizwan Asif, Russell Kaplan, Ryo Asakura,
+Rüdiger Busche, Saisai Shao, Sam Abrahams, @sanosay, Sean Papay, @seaotterman,
+@selay01, Shaurya Sharma, Sriram Narayanamoorthy, Stefano Probst, @taknevski,
+@tbonza, @teldridge11, Yuan (Terry) Tang, Tim Anglade, Tomas Reimers, Tomer Gafner,
+Valentin Iovene, Vamsi Sripathi, Viktor Malyi, Vit Stepanovs, Vivek Rane, Vlad
+Firoiu, @wangg12, @will, Xiaoyu Tao, Yaroslav Bulatov, Yuan (Terry) Tang,
+@Yufeng, Yuming Wang, Yuxin Wu, Zafar Takhirov, Ziming Dong
+
+We are also grateful to all who filed issues or helped resolve them, asked and
+answered questions, and were part of inspiring discussions.
+
+
# Release 1.0.1
## Bug Fixes and Other Changes
@@ -94,7 +201,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change
* In the C++ API (in tensorflow/cc), Input, Output, etc. have moved
from the tensorflow::ops namespace to tensorflow.
* Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args.
-* tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.1.
+* tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.2.
* `tf.nn.sampled_softmax_loss` and `tf.nn.nce_loss` have both changed their API such that you need to switch the `inputs, labels` to `labels, inputs` parameters.
* The shape keyword argument of the `SparseTensor` constructor changes its name to `dense_shape` between Tensorflow 0.12 and Tensorflow 1.0.
diff --git a/WORKSPACE b/WORKSPACE
index 6ec1a7df3ec5a5..cab8389a55ccfe 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -20,7 +20,9 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
#android_sdk_repository(
# name = "androidsdk",
# api_level = 23,
-# build_tools_version = "25.0.1",
+# # Ensure that you have the build_tools_version below installed in the
+# # SDK manager as it updates periodically.
+# build_tools_version = "25.0.2",
# # Replace with path to Android SDK on your system
# path = "",
#)
@@ -29,7 +31,9 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
#android_ndk_repository(
# name="androidndk",
# path="",
-# api_level=14) # This needs to be 14 or higher to compile TensorFlow.
+# # This needs to be 14 or higher to compile TensorFlow.
+# # Note that the NDK version is not the API level.
+# api_level=14)
# Please add all new TensorFlow dependencies in workspace.bzl.
tf_workspace()
diff --git a/configure b/configure
index e59ee2a925b444..6360641be2ca99 100755
--- a/configure
+++ b/configure
@@ -8,9 +8,6 @@ pushd `dirname $0` > /dev/null
SOURCE_BASE_DIR=`pwd -P`
popd > /dev/null
-# This file contains customized config settings.
-touch .bazelrc
-
PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
function is_linux() {
@@ -38,14 +35,6 @@ function is_windows() {
fi
}
-function bazel_fetch() {
- if [ -z "$TF_BAZEL_TARGETS" ]; then
- bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
- else
- bazel fetch $TF_BAZEL_TARGETS
- fi
-}
-
function sed_hyphen_i() {
if is_macos; then
sed -i '' "$@"
@@ -54,6 +43,21 @@ function sed_hyphen_i() {
fi
}
+function write_to_bazelrc() {
+ echo "$1" >> .tf_configure.bazelrc
+}
+
+function write_action_env_to_bazelrc() {
+ write_to_bazelrc "build --action_env $1=\"$2\""
+}
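+# For example (illustrative): `write_action_env_to_bazelrc "TF_NEED_CUDA" "1"`
+# appends the line `build --action_env TF_NEED_CUDA="1"` to .tf_configure.bazelrc.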
+
+# This file contains customized config settings.
+rm -f .tf_configure.bazelrc
+touch .tf_configure.bazelrc
+touch .bazelrc
+sed_hyphen_i "/tf_configure/d" .bazelrc
+echo "import .tf_configure.bazelrc" >> .bazelrc
+
# Delete any leftover BUILD files from the Makefile build, which would interfere
# with Bazel parsing.
MAKEFILE_DOWNLOAD_DIR=tensorflow/contrib/makefile/downloads
@@ -164,6 +168,7 @@ if is_windows; then
TF_NEED_HDFS=0
TF_NEED_JEMALLOC=0
TF_NEED_OPENCL=0
+ TF_CUDA_CLANG=0
fi
if is_linux; then
@@ -181,9 +186,8 @@ else
TF_NEED_JEMALLOC=0
fi
-sed_hyphen_i -e "/with_jemalloc/d" .bazelrc
if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then
- echo 'build --define with_jemalloc=true' >>.bazelrc
+ write_to_bazelrc 'build --define with_jemalloc=true'
fi
while [[ "$TF_NEED_GCP" == "" ]]; do
@@ -200,9 +204,8 @@ while [[ "$TF_NEED_GCP" == "" ]]; do
esac
done
-sed_hyphen_i -e "/with_gcp_support/d" .bazelrc
if [[ "$TF_NEED_GCP" == "1" ]]; then
- echo 'build --define with_gcp_support=true' >>.bazelrc
+ write_to_bazelrc 'build --define with_gcp_support=true'
fi
while [[ "$TF_NEED_HDFS" == "" ]]; do
@@ -219,9 +222,8 @@ while [[ "$TF_NEED_HDFS" == "" ]]; do
esac
done
-sed_hyphen_i -e "/with_hdfs_support/d" .bazelrc
if [[ "$TF_NEED_HDFS" == "1" ]]; then
- echo 'build --define with_hdfs_support=true' >>.bazelrc
+ write_to_bazelrc 'build --define with_hdfs_support=true'
fi
## Enable XLA.
@@ -235,9 +237,8 @@ while [[ "$TF_ENABLE_XLA" == "" ]]; do
esac
done
-sed_hyphen_i -e "/with_xla_support/d" .bazelrc
if [[ "$TF_ENABLE_XLA" == "1" ]]; then
- echo 'build --define with_xla_support=true' >>.bazelrc
+ write_to_bazelrc 'build --define with_xla_support=true'
fi
@@ -279,23 +280,11 @@ while [ "$TF_NEED_CUDA" == "" ]; do
esac
done
-sed_hyphen_i -e "/--action_env TF_NEED_CUDA/d" .bazelrc
-sed_hyphen_i -e "/--action_env CUD/d" .bazelrc
-sed_hyphen_i -e "/--action_env GCC_HOST/d" .bazelrc
-sed_hyphen_i -e "/--action_env TF_CUD/d" .bazelrc
-sed_hyphen_i -e "/--action_env CLANG_CUDA/d" .bazelrc
-
export TF_NEED_CUDA
-echo "build --action_env TF_NEED_CUDA=$TF_NEED_CUDA" >>.bazelrc
+write_action_env_to_bazelrc "TF_NEED_CUDA" "$TF_NEED_CUDA"
export TF_NEED_OPENCL
-if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then
- echo "Configuration finished"
- bazel_fetch
- exit
-fi
-
if [ "$TF_NEED_CUDA" == "1" ]; then
while [[ "$TF_CUDA_CLANG" == "" ]]; do
read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT
@@ -308,7 +297,7 @@ while [[ "$TF_CUDA_CLANG" == "" ]]; do
done
export TF_CUDA_CLANG
-echo "build --action_env TF_CUDA_CLANG=$TF_CUDA_CLANG" >>.bazelrc
+write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG"
# Set up which gcc nvcc should use as the host compiler
# No need to set this on Windows
@@ -324,7 +313,7 @@ while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
fi
if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
export GCC_HOST_COMPILER_PATH
- echo "build --action_env GCC_HOST_COMPILER_PATH=\"$GCC_HOST_COMPILER_PATH\"" >>.bazelrc
+ write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH"
break
fi
echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
@@ -348,7 +337,7 @@ while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do
fi
if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then
export CLANG_CUDA_COMPILER_PATH
- echo "build --action_env CLANG_CUDA_COMPILER_PATH=\"$CLANG_CUDA_COMPILER_PATH\"" >>.bazelrc
+ write_action_env_to_bazelrc "CLANG_CUDA_COMPILER_PATH" "$CLANG_CUDA_COMPILER_PATH"
break
fi
echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2
@@ -399,10 +388,9 @@ while true; do
if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then
export CUDA_TOOLKIT_PATH
- echo "build --action_env CUDA_TOOLKIT_PATH=\"$CUDA_TOOLKIT_PATH\"" >>.bazelrc
-
+ write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH"
export TF_CUDA_VERSION
- echo "build --action_env TF_CUDA_VERSION=$TF_CUDA_VERSION" >>.bazelrc
+ write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"
break
fi
echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
@@ -417,9 +405,9 @@ done
# Find out where the cuDNN library is installed
while true; do
- # Configure the Cudnn version to use.
+ # Configure the cuDNN version to use.
if [ -z "$TF_CUDNN_VERSION" ]; then
- read -p "Please specify the Cudnn version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION
+ read -p "Please specify the cuDNN version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION
fi
fromuser=""
@@ -454,10 +442,9 @@ while true; do
if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
export TF_CUDNN_VERSION
- echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc
-
+ write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
export CUDNN_INSTALL_PATH
- echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc
+ write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
break
fi
@@ -470,10 +457,9 @@ while true; do
CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
export TF_CUDNN_VERSION
- echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc
-
+ write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
- echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc
+ write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
break
fi
fi
@@ -525,7 +511,7 @@ EOF
fi
else
export TF_CUDA_COMPUTE_CAPABILITIES
- echo "build --action_env TF_CUDA_COMPUTE_CAPABILITIES=$TF_CUDA_COMPUTE_CAPABILITIES" >>.bazelrc
+ write_action_env_to_bazelrc "TF_CUDA_COMPUTE_CAPABILITIES" "$TF_CUDA_COMPUTE_CAPABILITIES"
break
fi
TF_CUDA_COMPUTE_CAPABILITIES=""
@@ -536,9 +522,9 @@ if is_windows; then
export CUDA_PATH="$CUDA_TOOLKIT_PATH"
export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES"
export NO_WHOLE_ARCHIVE_OPTION=1
-
- # Set GCC_HOST_COMPILER_PATH to keep cuda_configure.bzl happy
- export GCC_HOST_COMPILER_PATH="/usr/bin/dummy_compiler"
+ write_action_env_to_bazelrc "CUDA_PATH" "$CUDA_PATH"
+ write_action_env_to_bazelrc "CUDA_COMPUTE_CAPABILITIES" "$CUDA_COMPUTE_CAPABILITIES"
+ write_action_env_to_bazelrc "NO_WHOLE_ARCHIVE_OPTION" "1"
fi
# end of if "$TF_NEED_CUDA" == "1"
@@ -629,6 +615,6 @@ done
# end of if "$TF_NEED_OPENCL" == "1"
fi
-bazel_fetch
-
+# TODO(gunan): Remove once bazel correctly handles changes in remote repositories.
+bazel clean
echo "Configuration finished"
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 6981cb67576dff..98c13958d3729b 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -72,7 +72,7 @@ def tfadd_with_ckpt_saver(out_dir):
saver.save(sess, ckpt_file)
# Without the SaverDef, the restore op won't be named correctly.
saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % out_dir
- with open(saver_file, 'w') as f:
+ with open(saver_file, 'wb') as f:
f.write(saver.as_saver_def().SerializeToString())
@@ -113,7 +113,7 @@ def write_graph(build_graph, out_dir):
with g.as_default():
build_graph(out_dir)
filename = '%s/test_graph_%s.pb' % (out_dir, build_graph.__name__)
- with open(filename, 'w') as f:
+ with open(filename, 'wb') as f:
f.write(g.as_graph_def().SerializeToString())
diff --git a/tensorflow/compiler/tests/nary_ops_test.py b/tensorflow/compiler/tests/nary_ops_test.py
index a1f1e67a9f2d7d..2660e1d5728caf 100644
--- a/tensorflow/compiler/tests/nary_ops_test.py
+++ b/tensorflow/compiler/tests/nary_ops_test.py
@@ -116,12 +116,14 @@ def testStridedSlice(self):
np.array([1, 1], dtype=np.int32)],
expected=np.array([[], []], dtype=np.float32))
- self._testNAry(lambda x: array_ops.strided_slice(*x),
- [np.array([[], [], []], dtype=np.float32),
- np.array([1, 0], dtype=np.int64),
- np.array([3, 0], dtype=np.int64),
- np.array([1, 1], dtype=np.int64)],
- expected=np.array([[], []], dtype=np.float32))
+ if np.int64 in self.int_types:
+ self._testNAry(
+ lambda x: array_ops.strided_slice(*x), [
+ np.array([[], [], []], dtype=np.float32), np.array(
+ [1, 0], dtype=np.int64), np.array([3, 0], dtype=np.int64),
+ np.array([1, 1], dtype=np.int64)
+ ],
+ expected=np.array([[], []], dtype=np.float32))
self._testNAry(lambda x: array_ops.strided_slice(*x),
[np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
diff --git a/tensorflow/compiler/tests/pooling_ops_3d_test.py b/tensorflow/compiler/tests/pooling_ops_3d_test.py
index 4eed903963a34a..eb48fe555a0b18 100644
--- a/tensorflow/compiler/tests/pooling_ops_3d_test.py
+++ b/tensorflow/compiler/tests/pooling_ops_3d_test.py
@@ -33,7 +33,7 @@
# MaxPoolGrad.
def _AvgPoolGrad(inputs, outputs, output_gradients, ksize, strides, padding):
del outputs # Unused by average-pooling gradients.
- return gen_nn_ops.avg_pool3d_grad(
+ return gen_nn_ops._avg_pool3d_grad(
inputs.get_shape().as_list(),
output_gradients,
ksize=ksize,
@@ -263,7 +263,7 @@ def _VerifyGradient(self, pool_func, pool_grad_func, input_sizes, ksize,
def testMaxPoolGradValidPadding1_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[1, 3, 3, 3, 1],
ksize=[1, 1, 1],
strides=[1, 1, 1],
@@ -272,7 +272,7 @@ def testMaxPoolGradValidPadding1_1_3d(self):
def testMaxPoolGradValidPadding2_1_6_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 3, 6, 3],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@@ -281,7 +281,7 @@ def testMaxPoolGradValidPadding2_1_6_3d(self):
def testMaxPoolGradValidPadding2_1_7_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 5, 7, 3],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@@ -290,7 +290,7 @@ def testMaxPoolGradValidPadding2_1_7_3d(self):
def testMaxPoolGradValidPadding2_2_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 2, 2, 2, 3],
ksize=[2, 2, 2],
strides=[2, 2, 2],
@@ -299,7 +299,7 @@ def testMaxPoolGradValidPadding2_2_3d(self):
def testMaxPoolGradSamePadding1_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 2, 4, 1],
ksize=[1, 1, 1],
strides=[1, 1, 1],
@@ -308,7 +308,7 @@ def testMaxPoolGradSamePadding1_1_3d(self):
def testMaxPoolGradSamePadding2_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 2, 4, 1],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@@ -317,7 +317,7 @@ def testMaxPoolGradSamePadding2_1_3d(self):
def testMaxPoolGradSamePadding2_2_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 5, 2, 4, 3],
ksize=[2, 2, 2],
strides=[2, 2, 2],
@@ -326,7 +326,7 @@ def testMaxPoolGradSamePadding2_2_3d(self):
def testMaxPoolGradSamePadding3_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[1, 3, 3, 7, 1],
ksize=[3, 3, 3],
strides=[1, 1, 1],
diff --git a/tensorflow/contrib/android/cmake/build.gradle b/tensorflow/contrib/android/cmake/build.gradle
index fb87de621279a0..17a57b99fd6c9e 100644
--- a/tensorflow/contrib/android/cmake/build.gradle
+++ b/tensorflow/contrib/android/cmake/build.gradle
@@ -5,7 +5,8 @@ def TF_SRC_DIR = projectDir.toString() + "/../../../.."
android {
compileSdkVersion 24
- buildToolsVersion '25.0.1'
+ // Check local build_tools_version as this is liable to change within Android Studio.
+ buildToolsVersion '25.0.2'
// for debugging native code purpose
publishNonDefault true
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 3c8dc869afa0c7..e27df6898e36b5 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -22,6 +22,7 @@ option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
+option(tensorflow_ENABLE_JEMALLOC_SUPPORT "Enable jemalloc support" OFF)
option(tensorflow_BUILD_CC_EXAMPLE "Build the C++ tutorial example" ON)
option(tensorflow_BUILD_PYTHON_BINDINGS "Build the Python bindings" ON)
option(tensorflow_BUILD_ALL_KERNELS "Build all OpKernels" ON)
@@ -29,6 +30,7 @@ option(tensorflow_BUILD_CONTRIB_KERNELS "Build OpKernels from tensorflow/contrib
option(tensorflow_BUILD_CC_TESTS "Build cc unit tests " OFF)
option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
+option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
if (NOT WIN32)
# Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
@@ -81,6 +83,22 @@ if (tensorflow_OPTIMIZE_FOR_NATIVE_ARCH)
endif()
endif()
+# MSVC SIMD instructions
+if (tensorflow_WIN_CPU_SIMD_OPTIONS)
+ if (WIN32)
+ CHECK_CXX_COMPILER_FLAG("${tensorflow_WIN_CPU_SIMD_OPTIONS}" COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
+ if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${tensorflow_WIN_CPU_SIMD_OPTIONS}")
+ else()
+ message(FATAL_ERROR "${tensorflow_WIN_CPU_SIMD_OPTIONS} not supported")
+ endif()
+ endif()
+endif()
+
+if (tensorflow_ENABLE_JEMALLOC_SUPPORT)
+ add_definitions(-DTENSORFLOW_USE_JEMALLOC -DJEMALLOC_EXPORT=)
+endif()
+
# External dependencies
include(zlib)
include(gif)
@@ -148,6 +166,12 @@ if(tensorflow_ENABLE_GRPC_SUPPORT)
list(APPEND tensorflow_EXTERNAL_DEPENDENCIES grpc)
include_directories(${GRPC_INCLUDE_DIRS})
endif()
+if(tensorflow_ENABLE_JEMALLOC_SUPPORT)
+ include(jemalloc)
+ list(APPEND tensorflow_EXTERNAL_LIBRARIES ${jemalloc_STATIC_LIBRARIES})
+ list(APPEND tensorflow_EXTERNAL_DEPENDENCIES jemalloc)
+ include_directories(${jemalloc_INCLUDE_DIRS})
+endif()
if(WIN32)
list(APPEND tensorflow_EXTERNAL_LIBRARIES wsock32 ws2_32 shlwapi)
endif()
@@ -202,7 +226,6 @@ endif()
# Let's get to work!
include(tf_core_framework.cmake)
-include(tf_tools.cmake)
# NOTE: Disabled until issue #3996 is fixed.
# include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
@@ -223,6 +246,7 @@ if(tensorflow_BUILD_CC_EXAMPLE)
include(tf_tutorials.cmake)
include(tf_label_image_example.cmake)
endif()
+include(tf_tools.cmake)
if(tensorflow_BUILD_PYTHON_BINDINGS)
include(tensorboard)
include(tf_python.cmake)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 2641d5292d201e..af949f79fa1aab 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -45,7 +45,7 @@ bindings.
### Pre-requisites
-* CMake version 3.5 up to 3.6
+* CMake version 3.5 or later.
* [Git](http://git-scm.com)
@@ -181,7 +181,11 @@ Step-by-step Windows build
More? -Dtensorflow_ENABLE_GPU=ON ^
More? -DCUDNN_HOME="D:\...\cudnn"
```
-
+   To enable SIMD instructions with MSVC, such as AVX and SSE, define it as follows:
+ ```
+ More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
+ ```
+
Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
configuration that you choose when invoking `msbuild`. The known-good
values are `Release` and `RelWithDebInfo`. The `Debug` build type is
diff --git a/tensorflow/contrib/cmake/external/jemalloc.cmake b/tensorflow/contrib/cmake/external/jemalloc.cmake
new file mode 100644
index 00000000000000..b0b212eeb60987
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/jemalloc.cmake
@@ -0,0 +1,33 @@
+include (ExternalProject)
+
+set(jemalloc_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include)
+set(jemalloc_URL https://github.com/jemalloc/jemalloc-cmake/archive/jemalloc-cmake.4.3.1.tar.gz)
+set(jemalloc_HASH SHA256=f9be9a05fe906deb5c1c8ca818071a7d2e27d66fd87f5ba9a7bf3750bcedeaf0)
+set(jemalloc_BUILD ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc)
+
+if (WIN32)
+ set(jemalloc_INCLUDE_DIRS
+ ${jemalloc_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include/msvc_compat
+ )
+ set(jemalloc_ADDITIONAL_CMAKE_OPTIONS -A x64)
+ set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.lib)
+else()
+ set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.a)
+endif()
+
+ExternalProject_Add(jemalloc
+ PREFIX jemalloc
+ URL ${jemalloc_URL}
+ URL_HASH ${jemalloc_HASH}
+ DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+ BUILD_IN_SOURCE 1
+ CONFIGURE_COMMAND ${CMAKE_COMMAND}
+ -DCMAKE_BUILD_TYPE:STRING=Release
+ -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+ -Dwith-jemalloc-prefix:STRING=jemalloc_
+ -Dwithout-export:BOOL=ON
+ ${jemalloc_ADDITIONAL_CMAKE_OPTIONS}
+ BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target jemalloc
+ INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install step."
+)
diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake
index 5151fdb444f75c..636caf5f3d9605 100644
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@@ -63,7 +63,6 @@ add_executable(${transform_graph}
target_link_libraries(${transform_graph} PUBLIC
tf_protos_cc
- ${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@@ -83,7 +82,6 @@ add_executable(${summarize_graph}
target_link_libraries(${summarize_graph} PUBLIC
tf_protos_cc
- ${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@@ -103,7 +101,6 @@ add_executable(${compare_graphs}
target_link_libraries(${compare_graphs} PUBLIC
tf_protos_cc
- ${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@@ -118,6 +115,8 @@ add_executable(${benchmark_model}
$
$
$
+ $<$:$>
+ $<$:$>
)
target_link_libraries(${benchmark_model} PUBLIC
diff --git a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h
index 0aefbc6eedb0f1..df744428a8ad96 100644
--- a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h
+++ b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h
@@ -29,7 +29,6 @@
dispatch_queue_t videoDataOutputQueue;
AVCaptureStillImageOutput *stillImageOutput;
UIView *flashView;
- UIImage *square;
BOOL isUsingFrontFacingCamera;
AVSpeechSynthesizer *synth;
NSMutableDictionary *oldPredictionValues;
diff --git a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
index e975a25b5e0cf9..20c49d5b6a9d0e 100644
--- a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
@@ -369,13 +369,8 @@ - (IBAction)switchCameras:(id)sender {
isUsingFrontFacingCamera = !isUsingFrontFacingCamera;
}
-- (void)didReceiveMemoryWarning {
- [super didReceiveMemoryWarning];
-}
-
- (void)viewDidLoad {
[super viewDidLoad];
- square = [UIImage imageNamed:@"squarePNG"];
synth = [[AVSpeechSynthesizer alloc] init];
labelLayers = [[NSMutableArray alloc] init];
oldPredictionValues = [[NSMutableDictionary alloc] init];
@@ -399,26 +394,6 @@ - (void)viewDidLoad {
[self setupAVCapture];
}
-- (void)viewDidUnload {
- [super viewDidUnload];
-}
-
-- (void)viewWillAppear:(BOOL)animated {
- [super viewWillAppear:animated];
-}
-
-- (void)viewDidAppear:(BOOL)animated {
- [super viewDidAppear:animated];
-}
-
-- (void)viewWillDisappear:(BOOL)animated {
- [super viewWillDisappear:animated];
-}
-
-- (void)viewDidDisappear:(BOOL)animated {
- [super viewDidDisappear:animated];
-}
-
- (BOOL)shouldAutorotateToInterfaceOrientation:
(UIInterfaceOrientation)interfaceOrientation {
return (interfaceOrientation == UIInterfaceOrientationPortrait);
diff --git a/tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj
index 1134d0e11782bb..e9d783e49da8e1 100644
--- a/tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj
@@ -13,7 +13,6 @@
591D3ECF1CFF7FCE0059011C /* ImageIO.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 591D3ECE1CFF7FCE0059011C /* ImageIO.framework */; };
591D3ED21CFF85C30059011C /* ios_image_load.mm in Sources */ = {isa = PBXBuildFile; fileRef = 591D3ED11CFF85C30059011C /* ios_image_load.mm */; };
591D3ED51CFF85FD0059011C /* tensorflow_utils.mm in Sources */ = {isa = PBXBuildFile; fileRef = 591D3ED31CFF85FD0059011C /* tensorflow_utils.mm */; };
- 591D3EDA1CFFA83A0059011C /* grace_hopper.jpg in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED71CFFA83A0059011C /* grace_hopper.jpg */; };
591D3EDB1CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */; };
591D3EDC1CFFA83A0059011C /* tensorflow_inception_graph.pb in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */; };
591D3EDF1CFFAD230059011C /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 591D3EDD1CFFAD230059011C /* libprotobuf-lite.a */; };
@@ -38,7 +37,6 @@
591D3ED11CFF85C30059011C /* ios_image_load.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ios_image_load.mm; sourceTree = SOURCE_ROOT; };
591D3ED31CFF85FD0059011C /* tensorflow_utils.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = tensorflow_utils.mm; sourceTree = SOURCE_ROOT; };
591D3ED41CFF85FD0059011C /* tensorflow_utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensorflow_utils.h; sourceTree = SOURCE_ROOT; };
- 591D3ED71CFFA83A0059011C /* grace_hopper.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = grace_hopper.jpg; sourceTree = ""; };
591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = imagenet_comp_graph_label_strings.txt; sourceTree = ""; };
591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */ = {isa = PBXFileReference; lastKnownFileType = file; path = tensorflow_inception_graph.pb; sourceTree = ""; };
591D3EDD1CFFAD230059011C /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "../../makefile/gen/protobuf_ios/lib/libprotobuf-lite.a"; sourceTree = ""; };
@@ -79,7 +77,6 @@
591D3ED61CFFA83A0059011C /* data */ = {
isa = PBXGroup;
children = (
- 591D3ED71CFFA83A0059011C /* grace_hopper.jpg */,
591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */,
591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */,
);
@@ -199,7 +196,6 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
- 591D3EDA1CFFA83A0059011C /* grace_hopper.jpg in Resources */,
591D3EDC1CFFA83A0059011C /* tensorflow_inception_graph.pb in Resources */,
592FF90D18EDD0DA00C164F8 /* MainStoryboard_iPhone.storyboard in Resources */,
591D3EDB1CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt in Resources */,
diff --git a/tensorflow/contrib/ios_examples/camera/data/grace_hopper.jpg b/tensorflow/contrib/ios_examples/camera/data/grace_hopper.jpg
deleted file mode 100644
index d2a427810f679d..00000000000000
Binary files a/tensorflow/contrib/ios_examples/camera/data/grace_hopper.jpg and /dev/null differ
diff --git a/tensorflow/contrib/ios_examples/camera/squarePNG.png b/tensorflow/contrib/ios_examples/camera/squarePNG.png
deleted file mode 100644
index e26ff840ed932b..00000000000000
Binary files a/tensorflow/contrib/ios_examples/camera/squarePNG.png and /dev/null differ
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index f0ed31d1d1db6e..b1a7f7ee59a017 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -17,24 +17,31 @@
from __future__ import division
from __future__ import print_function
+from six.moves import xrange # pylint: disable=redefined-builtin
+
from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
+from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import tf_logging as logging
__all__ = [
"safe_embedding_lookup_sparse", "scattered_embedding_lookup",
- "scattered_embedding_lookup_sparse", "embedding_lookup_unique"
+ "scattered_embedding_lookup_sparse", "embedding_lookup_unique",
+ "embedding_lookup_sparse_with_distributed_aggregation"
]
@@ -548,3 +555,351 @@ def _sampled_scattered_embedding_lookup_sparse(params,
return math_ops.unsorted_segment_sum(embeddings, segment_ids,
num_segments=num_segments,
name=name_scope)
+
+
+def embedding_lookup_sparse_with_distributed_aggregation(
+ params,
+ sp_ids,
+ sp_weights,
+ partition_strategy="mod",
+ name=None,
+ combiner=None,
+ max_norm=None):
+ """Computes embeddings for the given ids and weights.
+
+  Embeddings belonging to the same param are aggregated on that device first. This
+ op is intended to decrease data transmission and improve parallelism. See
+ `tf.nn.embedding_lookup_sparse` for the functionality and example of this op.
+
+ Args:
+ params: A single tensor representing the complete embedding tensor,
+ or a list of P tensors all of same shape except for the first dimension,
+ representing sharded embedding tensors. Alternatively, a
+ `PartitionedVariable`, created by partitioning along dimension 0. Each
+ element must be appropriately sized for the given `partition_strategy`.
+ sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
+ where N is typically batch size and M is arbitrary.
+ sp_weights: either a SparseTensor of float / double weights, or None to
+ indicate all weights should be taken to be 1. If specified, sp_weights
+ must have exactly the same shape and indices as sp_ids.
+ partition_strategy: A string specifying the partitioning strategy, relevant
+ if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
+ is `"mod"`. See `tf.nn.embedding_lookup` for more details.
+ name: Optional name for the op.
+ combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
+ and "sum" are supported.
+ "sum" computes the weighted sum of the embedding results for each row.
+ "mean" is the weighted sum divided by the total weight.
+ "sqrtn" is the weighted sum divided by the square root of the sum of the
+ squares of the weights.
+ max_norm: If not None, each embedding is normalized to have l2 norm equal
+ to max_norm before combining.
+
+ Returns:
+ A dense tensor representing the combined embeddings for the
+ sparse ids. For each row in the dense tensor represented by sp_ids, the op
+ looks up the embeddings for all ids in that row, multiplies them by the
+ corresponding weight, and combines these embeddings as specified.
+
+ Raises:
+ TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
+ None nor SparseTensor.
+ ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
+ """
+ if combiner is None:
+ logging.warn("The default value of combiner will change from \"mean\" "
+ "to \"sqrtn\" after 2016/11/01.")
+ combiner = "mean"
+ if combiner not in ("mean", "sqrtn", "sum"):
+ raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")
+ if isinstance(params, variables.PartitionedVariable):
+ params = list(params) # Iterate to get the underlying Variables.
+ if not isinstance(params, list):
+ params = [params]
+ if not isinstance(sp_ids, sparse_tensor.SparseTensor):
+ raise TypeError("sp_ids must be SparseTensor")
+ ignore_weights = sp_weights is None
+ if not ignore_weights:
+ if not isinstance(sp_weights, sparse_tensor.SparseTensor):
+ raise TypeError("sp_weights must be either None or SparseTensor")
+ sp_ids.values.get_shape().assert_is_compatible_with(
+ sp_weights.values.get_shape())
+ sp_ids.indices.get_shape().assert_is_compatible_with(
+ sp_weights.indices.get_shape())
+ sp_ids.dense_shape.get_shape().assert_is_compatible_with(
+ sp_weights.dense_shape.get_shape())
+ # TODO(yleon): Add enhanced node assertions to verify that sp_ids and
+ # sp_weights have equal indices and shapes.
+
+ with ops.name_scope(name, "embedding_lookup_sparse",
+ params + [sp_ids]) as name:
+ segment_ids = sp_ids.indices[:, 0]
+ if segment_ids.dtype != dtypes.int32:
+ segment_ids = math_ops.cast(segment_ids, dtypes.int32)
+
+ ids = sp_ids.values
+ if ignore_weights:
+ ids, idx = array_ops.unique(ids)
+ else:
+ idx = None
+
+ weights = None if ignore_weights else sp_weights.values
+ embeddings = _embedding_lookup_with_distributed_aggregation(
+ params,
+ ids,
+ partition_strategy=partition_strategy,
+ max_norm=max_norm,
+ weights=weights,
+ idx=idx,
+ segment_ids=segment_ids)
+    # Set weights to all ones if ignoring weights.
+ if ignore_weights:
+ weights = array_ops.fill([array_ops.shape(segment_ids)[0]], 1)
+ if weights.dtype != embeddings.dtype:
+ weights = math_ops.cast(weights, embeddings.dtype)
+ # Reshape weights.
+ ones = array_ops.fill(
+ array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
+ bcast_weights_shape = array_ops.concat([array_ops.shape(weights), ones], 0)
+ orig_weights_shape = weights.get_shape()
+ weights = array_ops.reshape(weights, bcast_weights_shape)
+ if embeddings.get_shape().ndims is not None:
+ weights.set_shape(
+ orig_weights_shape.concatenate(
+ [1 for _ in range(embeddings.get_shape().ndims - 1)]))
+
+ if combiner == "mean":
+ weight_sum = math_ops.segment_sum(weights, segment_ids)
+ embeddings = math_ops.div(embeddings, weight_sum)
+ elif combiner == "sqrtn":
+ weights_squared = math_ops.pow(weights, 2)
+ weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
+ weight_sum_sqrt = math_ops.sqrt(weight_sum)
+ embeddings = math_ops.div(embeddings, weight_sum_sqrt)
+ elif combiner != "sum":
+ assert False, "Unrecognized combiner"
+ return embeddings
+
+
+def _do_gather(params, ids, validate_indices=True, name=None):
+ """Deals with doing gather differently for resource variables."""
+ if isinstance(params, resource_variable_ops.ResourceVariable):
+ return params.sparse_read(ids, name=name)
+ return array_ops.gather(
+ params, ids, name=name, validate_indices=validate_indices)
+
+
+def _embedding_lookup_with_distributed_aggregation(params,
+ ids,
+ partition_strategy="mod",
+ name=None,
+ validate_indices=True,
+ max_norm=None,
+ weights=None,
+ idx=None,
+ segment_ids=None):
+ """Lookup helper for embedding_lookup_sparse_with_distributed_aggregation."""
+ if params is None or params == []: # pylint: disable=g-explicit-bool-comparison
+ raise ValueError("Need at least one param")
+ if isinstance(params, variables.PartitionedVariable):
+ params = list(params) # Iterate to get the underlying Variables.
+ if not isinstance(params, list):
+ params = [params]
+
+ def maybe_normalize(x):
+ if max_norm is not None:
+ if x.get_shape().ndims is not None:
+ ndims = x.get_shape().ndims
+ else:
+ ndims = array_ops.size(array_ops.shape(x))
+ return clip_ops.clip_by_norm(x, max_norm, axes=list(range(1, ndims)))
+ return x
+
+ with ops.name_scope(name, "embedding_lookup_with_distributed_aggregation",
+ params + [ids]) as name:
+ np = len(params) # Number of partitions
+ # Preserve the resource variable status to avoid accidental dense reads.
+ if not any(
+ isinstance(p, resource_variable_ops.ResourceVariable) for p in params):
+ params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
+ if np == 1:
+ with ops.colocate_with(params[0]):
+ ret = maybe_normalize(
+ _do_gather(params[0], ids, validate_indices=validate_indices))
+ ignore_weights = weights is None
+ if not ignore_weights:
+ if weights.dtype != ret.dtype:
+ weights = math_ops.cast(weights, ret.dtype)
+ # Reshape to allow broadcast
+ ones = array_ops.fill(
+ array_ops.expand_dims(array_ops.rank(ret) - 1, 0), 1)
+ bcast_weights_shape = array_ops.concat(
+ [array_ops.shape(weights), ones], 0)
+ orig_weights_shape = weights.get_shape()
+ weights = array_ops.reshape(weights, bcast_weights_shape)
+ # Set weights shape after reshape
+ if ret.get_shape().ndims is not None:
+ weights.set_shape(
+ orig_weights_shape.concatenate(
+ [1 for _ in range(ret.get_shape().ndims - 1)]))
+ ret *= weights
+ return math_ops.segment_sum(ret, segment_ids, name=name)
+ else:
+ return math_ops.sparse_segment_sum(ret, idx, segment_ids, name=name)
+ else:
+ ids = ops.convert_to_tensor(ids, name="ids")
+ flat_ids = array_ops.reshape(ids, [-1])
+ original_indices = math_ops.range(array_ops.size(flat_ids))
+
+ # Create p_assignments and set new_ids depending on the strategy.
+ if partition_strategy == "mod":
+ p_assignments = flat_ids % np
+ new_ids = flat_ids // np
+ elif partition_strategy == "div":
+ # Compute num_total_ids as the sum of dim-0 of params, then assign to
+ # partitions based on a constant number of ids per partition. Optimize
+ # if we already know the full shape statically.
+ dim_0_size = params[0].get_shape()[0]
+ for p in xrange(1, np):
+ dim_0_size += params[p].get_shape()[0]
+ if dim_0_size.value:
+ num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
+ else:
+ dim_0_sizes = []
+ for p in xrange(np):
+ if params[p].get_shape()[0].value is not None:
+ dim_0_sizes.append(params[p].get_shape()[0].value)
+ else:
+ with ops.colocate_with(params[p]):
+ dim_0_sizes.append(array_ops.shape(params[p])[0])
+ num_total_ids = math_ops.reduce_sum(
+ math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
+ ids_per_partition = num_total_ids // np
+ extras = num_total_ids % np
+
+ p_assignments = math_ops.maximum(flat_ids // (ids_per_partition + 1), (
+ flat_ids - extras) // ids_per_partition)
+
+ # Emulate a conditional using a boolean indicator tensor
+ is_in_first_extras_partitions = math_ops.cast(p_assignments < extras,
+ flat_ids.dtype)
+ new_ids = (is_in_first_extras_partitions * (flat_ids %
+ (ids_per_partition + 1)) +
+ (1 - is_in_first_extras_partitions) * (
+ (flat_ids - extras) % ids_per_partition))
+ else:
+ raise ValueError("Unrecognized partition strategy: " +
+ partition_strategy)
+
+ # Cast partition assignments to int32 for use in dynamic_partition.
+ # There really should not be more than 2^32 partitions.
+ p_assignments = math_ops.cast(p_assignments, dtypes.int32)
+ # Partition list of ids based on assignments into np separate lists
+ gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np)
+ # Similarly, partition the original indices.
+ pindices = data_flow_ops.dynamic_partition(original_indices,
+ p_assignments, np)
+ # Do np separate lookups, finding embeddings for gather_ids[p] in
+ # params[p].
+ partitioned_result = []
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ partitioned_result.append(
+ _do_gather(
+ params[p], gather_ids[p], validate_indices=validate_indices))
+
+ ignore_weights = weights is None
+ if not ignore_weights:
+ # Partition weights according to pindices.
+ partitioned_weight = []
+ for p in xrange(np):
+ partitioned_weight.append(array_ops.gather(weights, pindices[p]))
+ # Reshape each partition result.
+ element_shape = params[0].get_shape()[1:]
+ for p in params[1:]:
+ element_shape = element_shape.merge_with(p.get_shape()[1:])
+ if element_shape.is_fully_defined():
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = array_ops.reshape(
+ partitioned_result[p],
+ array_ops.concat([array_ops.shape(pindices[p]), element_shape],
+ 0))
+ else:
+ with ops.colocate_with(params[0]):
+ params_shape = array_ops.shape(params[0])
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = array_ops.reshape(
+ partitioned_result[p],
+ array_ops.concat([
+ array_ops.shape(pindices[p]), array_ops.slice(
+ params_shape, [1], [-1])
+ ], 0))
+ # Normalize each partition result.
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = maybe_normalize(partitioned_result[p])
+ if not ignore_weights:
+ # Multiply each partition result with partition weights.
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ if partitioned_weight[p].dtype != partitioned_result[p].dtype:
+ partitioned_weight[p] = math_ops.cast(partitioned_weight[p],
+ partitioned_result[p].dtype)
+ # Reshape partition weights.
+ ones = array_ops.fill(
+ array_ops.expand_dims(
+ array_ops.rank(partitioned_result[p]) - 1, 0), 1)
+ bcast_weights_shape = array_ops.concat(
+ [array_ops.shape(partitioned_weight[p]), ones], 0)
+ orig_weights_shape = partitioned_weight[p].get_shape()
+ partitioned_weight[p] = array_ops.reshape(partitioned_weight[p],
+ bcast_weights_shape)
+ if partitioned_result[p].get_shape().ndims is not None:
+ partitioned_weight[p].set_shape(
+ orig_weights_shape.concatenate([
+ 1
+ for _ in range(partitioned_result[p].get_shape().ndims -
+ 1)
+ ]))
+ partitioned_result[p] *= partitioned_weight[p]
+ partitioned_segment_ids = []
+ for p in xrange(np):
+ if not ignore_weights:
+ # Partition segment_ids according to pindices.
+ p_segment_ids = array_ops.gather(segment_ids, pindices[p])
+ # Number the p_segment_ids to meet segment_sum's requirements. Note
+ # that unique_p_segment_ids contains the unique segment ids of this
+ # partition and that their order is unchanged.
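+ # For example, p_segment_ids of [3, 3, 7] yields unique_p_segment_ids
+ # [3, 7] and unique_p_segment_idx [0, 0, 1].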
+ unique_p_segment_ids, unique_p_segment_idx = array_ops.unique(
+ p_segment_ids)
+ partitioned_segment_ids.append(unique_p_segment_ids)
+ # segment_sum this partition's result.
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = math_ops.segment_sum(
+ partitioned_result[p], unique_p_segment_idx)
+ else:
+ # When ignoring weights, we need the indices of the elements in idx and
+ # segment_ids.
+ _, exclude_idx = array_ops.setdiff1d(idx, pindices[p])
+ all_idx = math_ops.range(array_ops.shape(idx)[0])
+ _, include_idx = array_ops.setdiff1d(all_idx, exclude_idx)
+ # Gather segment_ids and idx according to these indices.
+ p_segment_ids = array_ops.gather(segment_ids, include_idx)
+ p_idx = array_ops.gather(idx, include_idx)
+ # Number the p_segment_ids, as in the weighted case above.
+ unique_p_segment_ids, unique_p_segment_idx = array_ops.unique(
+ p_segment_ids)
+ _, unique_p_idx_idx = array_ops.unique(p_idx)
+ partitioned_segment_ids.append(unique_p_segment_ids)
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = math_ops.sparse_segment_sum(
+ partitioned_result[p], unique_p_idx_idx, unique_p_segment_idx)
+ # Concat each partition's segment_ids and result for final segment_sum.
+ concat_segment_ids = array_ops.concat(partitioned_segment_ids, 0)
+ concat_partitioned_result = array_ops.concat(partitioned_result, 0)
+ return math_ops.unsorted_segment_sum(
+ concat_partitioned_result,
+ concat_segment_ids,
+ math_ops.reduce_max(concat_segment_ids) + 1,
+ name=name)
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
index dfa8067f27a858..bf2514498202e9 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
@@ -31,10 +31,13 @@
from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import random_seed
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.platform import test
+from tensorflow.python.util import compat
class SafeEmbeddingLookupSparseTest(test.TestCase):
@@ -143,8 +146,8 @@ def test_safe_embedding_lookup_sparse_no_weights(self):
self.assertAllClose(
embedding_lookup_result,
[(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4,
- [0] * 4, embedding_weights[0][2],
- (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
+ [0] * 4, embedding_weights[0][2], (
+ embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
def test_safe_embedding_lookup_sparse_partitioned(self):
with self.test_session():
@@ -169,8 +172,8 @@ def test_safe_embedding_lookup_sparse_partitioned_inconsistent_weights(self):
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids)
embedding_weights = [
- constant_op.constant(
- w, dtype=dtypes.float64) for w in embedding_weights
+ constant_op.constant(w, dtype=dtypes.float64)
+ for w in embedding_weights
]
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids, sparse_weights)
@@ -183,11 +186,10 @@ def test_safe_embedding_lookup_sparse_3d_return_zero_vector(self):
embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
embedding_weights, sparse_ids, sparse_weights).eval())
- self.assertAllClose(
- embedding_lookup_result,
- [[(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) /
- 3.0, [0] * 4, [0] * 4],
- [embedding_weights[0][2], [0] * 4, [0] * 4]])
+ self.assertAllClose(embedding_lookup_result, [[
+ (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0,
+ [0] * 4, [0] * 4
+ ], [embedding_weights[0][2], [0] * 4, [0] * 4]])
def test_safe_embedding_lookup_sparse_3d_return_special_vector(self):
with self.test_session():
@@ -213,14 +215,13 @@ def test_safe_embedding_lookup_sparse_3d_no_weights(self):
embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
embedding_weights, sparse_ids, None).eval())
- self.assertAllClose(
- embedding_lookup_result,
- [[(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4,
- [0] * 4], [
- embedding_weights[0][2],
- (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0,
- [0] * 4
- ]])
+ self.assertAllClose(embedding_lookup_result, [[(
+ embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [
+ 0
+ ] * 4], [
+ embedding_weights[0][2],
+ (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4
+ ]])
def test_safe_embedding_lookup_sparse_3d_partitioned(self):
with self.test_session():
@@ -231,13 +232,12 @@ def test_safe_embedding_lookup_sparse_3d_partitioned(self):
embedding_weights, sparse_ids, None).eval())
embedding_weights = list(itertools.chain(*embedding_weights))
- self.assertAllClose(embedding_lookup_result,
- [[(embedding_weights[0] + embedding_weights[1]) / 2.0,
- [0] * 4, [0] * 4], [
- embedding_weights[2],
- (embedding_weights[0] + embedding_weights[1]) /
- 2.0, [0] * 4
- ]])
+ self.assertAllClose(embedding_lookup_result, [[
+ (embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4, [0] * 4
+ ], [
+ embedding_weights[2],
+ (embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4
+ ]])
def test_safe_embedding_lookup_sparse_3d_partitioned_inconsistent_weights(
self):
@@ -249,8 +249,8 @@ def test_safe_embedding_lookup_sparse_3d_partitioned_inconsistent_weights(
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids)
embedding_weights = [
- constant_op.constant(
- w, dtype=dtypes.float64) for w in embedding_weights
+ constant_op.constant(w, dtype=dtypes.float64)
+ for w in embedding_weights
]
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids, sparse_weights)
@@ -299,8 +299,8 @@ def test_scattered_embedding_multiple_partition(self):
self.assertAllEqual(embedding_lookup_result[0],
embedding_lookup_result[1])
# Different embedding expected for different value.
- embedding_diff = np.min((embedding_lookup_result[2] -
- embedding_lookup_result[0])**2)
+ embedding_diff = np.min(
+ (embedding_lookup_result[2] - embedding_lookup_result[0])**2)
self.assertGreater(embedding_diff, 0)
def test_scattered_embedding_coverage(self):
@@ -318,8 +318,8 @@ def test_scattered_embedding_coverage(self):
def test_scattered_embedding_multi_dimension(self):
with self.test_session():
embedding_weights = self._random_weights()
- values = constant_op.constant(
- [["foo", "bar", "bar"], ["bar", "bar", "foo"]])
+ values = constant_op.constant([["foo", "bar", "bar"],
+ ["bar", "bar", "foo"]])
embedding_lookup_result = embedding_ops.scattered_embedding_lookup(
embedding_weights, values, dimension=10).eval()
@@ -338,8 +338,8 @@ def test_scattered_embedding_lookup_sparse(self):
embedding_lookup_result = (
embedding_ops.scattered_embedding_lookup_sparse(
- embedding_weights, sparse_tensor, dimension=5, combiner="mean")
- .eval())
+ embedding_weights, sparse_tensor, dimension=5,
+ combiner="mean").eval())
self.assertAllEqual(embedding_lookup_result.shape, [5, 5])
# Same non-zero embedding for the empty rows filled with a default value.
@@ -431,8 +431,8 @@ def test_hashed_embedding_consistency(self):
def test_hashed_embedding_multi_dimension(self):
with self.test_session():
embedding_weights = self._random_weights()
- values = constant_op.constant(
- [["foo", "bar", "bar"], ["bar", "bar", "foo"]])
+ values = constant_op.constant([["foo", "bar", "bar"],
+ ["bar", "bar", "foo"]])
sampled_candidates = constant_op.constant(
[[[1, 3, 4, 6], [1, 7, 8, 9], [1, 7, 8, 9]],
[[1, 7, 8, 9], [1, 7, 8, 9], [1, 3, 4, 6]]])
@@ -489,8 +489,8 @@ def test_output_values(self):
result = embedding_ops._sampled_scattered_embedding_lookup_sparse(
params, sp_values, dimension=5, hash_key=self._hash_key)
- self.assertAllClose(result.eval(), [[0., 0., 0., 0., 0.],
- [.3, .2, .2, .3, .1],
+ self.assertAllClose(result.eval(), [[0., 0., 0., 0.,
+ 0.], [.3, .2, .2, .3, .1],
[0., 0., 0., 0., 0.]])
def test_output_values_with_sampled_candidates(self):
@@ -563,5 +563,224 @@ def test_distributive_property(self):
self.assertAllClose(result.eval(), result_abc.eval())
+def _PName(param_id):
+ return "p" + str(param_id)
+
+
+def _EmbeddingParams(num_shards,
+ vocab_size,
+ dtype=dtypes.float32,
+ shape=None,
+ use_shapeless_placeholder=False):
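+ """Creates sharded embedding params plus their numpy values and feed dict."""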
+ p = []
+ params = {}
+ feed_dict = {}
+ if not shape:
+ shape = [10]
+ for i in range(num_shards):
+ shard_shape = [vocab_size // num_shards] + shape
+ if i < vocab_size % num_shards: # Excess goes evenly on the first shards
+ shard_shape[0] += 1
+
+ param_name = _PName(i)
+
+ if use_shapeless_placeholder:
+ param = array_ops.placeholder(dtype, shape=None, name=param_name)
+ else:
+ param = constant_op.constant(
+ 1.0, shape=shard_shape, dtype=dtype, name=param_name)
+ p.append(param)
+ np_type = "f" if dtype == dtypes.float32 else "d"
+ val = (np.random.rand(*shard_shape).astype(np_type)) + 1
+ params[param_name + ":0"] = val
+ feed_dict[param.name] = val
+ return p, params, feed_dict
+
+
+def _EmbeddingResult(params,
+ id_vals,
+ num_shards,
+ vocab_size,
+ partition_strategy="mod",
+ weight_vals=None):
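+ """Computes expected embedding, weight, and squared-weight sums in numpy."""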
+ if weight_vals is None:
+ weight_vals = np.copy(id_vals)
+ weight_vals.fill(1)
+ values = []
+ weights = []
+ weights_squared = []
+ for ids, wts in zip(id_vals, weight_vals):
+ value_aggregation = None
+ weight_aggregation = None
+ squared_weight_aggregation = None
+ if isinstance(ids, compat.integral_types):
+ ids = [ids]
+ wts = [wts]
+ for i, weight_value in zip(ids, wts):
+ if partition_strategy == "mod":
+ val = np.copy(params[_PName(i % num_shards) + ":0"][
+ i // num_shards, :]) * weight_value
+ elif partition_strategy == "div":
+ ids_per_partition, extras = divmod(vocab_size, num_shards)
+ threshold = extras * (ids_per_partition + 1)
+ if i < threshold:
+ partition = i // (ids_per_partition + 1)
+ offset = i % (ids_per_partition + 1)
+ else:
+ partition = extras + (i - threshold) // ids_per_partition
+ offset = (i - threshold) % ids_per_partition
+ val = np.copy(
+ params[_PName(partition) + ":0"][offset, :]) * weight_value
+ else:
+ assert False
+ if value_aggregation is None:
+ assert weight_aggregation is None
+ assert squared_weight_aggregation is None
+ value_aggregation = val
+ weight_aggregation = weight_value
+ squared_weight_aggregation = weight_value * weight_value
+ else:
+ assert weight_aggregation is not None
+ assert squared_weight_aggregation is not None
+ value_aggregation += val
+ weight_aggregation += weight_value
+ squared_weight_aggregation += weight_value * weight_value
+ values.append(value_aggregation)
+ weights.append(weight_aggregation)
+ weights_squared.append(squared_weight_aggregation)
+ values = np.array(values).astype(np.float32)
+ weights = np.array(weights).astype(np.float32)
+ weights_squared = np.array(weights_squared).astype(np.float32)
+ return values, weights, weights_squared
+
+
+class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
+
+ def _RandomIdsAndWeights(self, batch_size, vocab_size):
+ max_val_per_entry = 6
+ vals_per_batch_entry = np.random.randint(
+ 1, max_val_per_entry, size=batch_size)
+ num_vals = np.sum(vals_per_batch_entry)
+
+ ids = np.random.randint(vocab_size, size=num_vals)
+ weights = 1 + np.random.rand(num_vals)
+
+ indices = []
+ for batch_entry, num_val in enumerate(vals_per_batch_entry):
+ for val_index in range(num_val):
+ indices.append([batch_entry, val_index])
+
+ shape = [batch_size, max_val_per_entry]
+
+ sp_ids = sparse_tensor_lib.SparseTensor(
+ constant_op.constant(indices, dtypes.int64),
+ constant_op.constant(ids, dtypes.int32),
+ constant_op.constant(shape, dtypes.int64))
+ sp_weights = sparse_tensor_lib.SparseTensor(
+ constant_op.constant(indices, dtypes.int64),
+ constant_op.constant(weights, dtypes.float32),
+ constant_op.constant(shape, dtypes.int64))
+
+ return sp_ids, sp_weights, ids, weights, vals_per_batch_entry
+
+ def _GroupByBatchEntry(self, vals, vals_per_batch_entry):
+ grouped_vals = []
+ index = 0
+ for num_val in vals_per_batch_entry:
+ grouped_vals.append(list(vals[index:(index + num_val)]))
+ index += num_val
+ return grouped_vals
+
+ def testEmbeddingLookupSparse(self):
+ vocab_size = 13
+ batch_size = 10
+ param_shape = [2, 5]
+ expected_lookup_result_shape = [None] + param_shape
+
+ sp_ids, sp_weights, ids, weights, vals_per_batch_entry = (
+ self._RandomIdsAndWeights(batch_size, vocab_size))
+
+ grouped_ids = self._GroupByBatchEntry(ids, vals_per_batch_entry)
+ grouped_weights = self._GroupByBatchEntry(weights, vals_per_batch_entry)
+ grouped_ignored_weights = self._GroupByBatchEntry(
+ np.ones(np.sum(vals_per_batch_entry)), vals_per_batch_entry)
+
+ for num_shards, combiner, dtype, ignore_weights in itertools.product(
+ [1, 5], ["sum", "mean", "sqrtn"], [dtypes.float32,
+ dtypes.float64], [True, False]):
+
+ with self.test_session():
+ p, params, feed_dict = _EmbeddingParams(
+ num_shards, vocab_size, shape=param_shape, dtype=dtype)
+ embedding_sum = \
+ embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
+ p,
+ sp_ids,
+ None if ignore_weights else sp_weights,
+ combiner=combiner)
+
+ self.assertEqual(embedding_sum.get_shape().as_list(),
+ expected_lookup_result_shape)
+
+ tf_embedding_sum = embedding_sum.eval(feed_dict=feed_dict)
+
+ np_embedding_sum, np_weight_sum, np_weight_sq_sum = _EmbeddingResult(
+ params,
+ grouped_ids,
+ num_shards,
+ vocab_size,
+ weight_vals=grouped_ignored_weights
+ if ignore_weights else grouped_weights)
+ if combiner == "mean":
+ np_embedding_sum /= np.reshape(np_weight_sum, (batch_size, 1, 1))
+ if combiner == "sqrtn":
+ np_embedding_sum /= np.reshape(
+ np.sqrt(np_weight_sq_sum), (batch_size, 1, 1))
+ self.assertAllClose(np_embedding_sum, tf_embedding_sum)
+
+ def testGradientsEmbeddingLookupSparse(self):
+ vocab_size = 12
+ batch_size = 4
+ param_shape = [2, 3]
+ sp_ids, sp_weights, _, _, _ = (self._RandomIdsAndWeights(
+ batch_size, vocab_size))
+
+ for num_shards, combiner, dtype, ignore_weights in itertools.product(
+ [1, 3], ["sum", "mean", "sqrtn"], [dtypes.float32,
+ dtypes.float64], [True, False]):
+ with self.test_session():
+ x, params, _ = _EmbeddingParams(
+ num_shards, vocab_size, shape=param_shape, dtype=dtype)
+
+ y = embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
+ x,
+ sp_ids,
+ None if ignore_weights else sp_weights,
+ combiner=combiner)
+ x_name = [_PName(i) for i in range(num_shards)]
+ x_init_value = [params[x_n + ":0"] for x_n in x_name]
+ x_shape = [i.shape for i in x_init_value]
+ y_shape = [batch_size] + list(params[_PName(0) + ":0"].shape[1:])
+ err = gradient_checker.compute_gradient_error(
+ x, x_shape, y, y_shape, x_init_value=x_init_value)
+ self.assertLess(err, 1e-5 if dtype == dtypes.float64 else 2e-3)
+
+ def testIncompatibleShapes(self):
+ with self.test_session():
+ x, _, _ = _EmbeddingParams(1, 10, dtype=dtypes.float32)
+ sp_ids = sparse_tensor_lib.SparseTensor(
+ constant_op.constant([[0, 0], [0, 1], [1, 0]], dtypes.int64),
+ constant_op.constant([0, 1, 2], dtypes.int32),
+ constant_op.constant([2, 2], dtypes.int64))
+ sp_weights = sparse_tensor_lib.SparseTensor(
+ constant_op.constant([[0, 0], [0, 1]], dtypes.int64),
+ constant_op.constant([12.0, 5.0], dtypes.float32),
+ constant_op.constant([1, 2], dtypes.int64))
+
+ with self.assertRaises(ValueError):
+ embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
+ x, sp_ids, sp_weights, combiner="mean")
+
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 282c556424ed5b..32839b251a3838 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -791,9 +791,11 @@ def weighted_sparse_column(sparse_id_column,
weight or value of the corresponding sparse id feature.
dtype: Type of weights, such as `tf.float32`. Only floating and integer
weights are supported.
+
Returns:
A _WeightedSparseColumn composed of two sparse features: one represents id,
the other represents weight (value) of the id feature in that example.
+
Raises:
ValueError: if dtype is not convertible to float.
"""
diff --git a/tensorflow/contrib/learn/python/learn/README.md b/tensorflow/contrib/learn/python/learn/README.md
index 0aae178e9ac6d6..6a7b0ea61417bb 100644
--- a/tensorflow/contrib/learn/python/learn/README.md
+++ b/tensorflow/contrib/learn/python/learn/README.md
@@ -9,7 +9,7 @@ TF Learn is a simplified interface for TensorFlow, to get people started on pred
### Why *TensorFlow Learn*?
-- To smooth the transition from the [scikit-learn](http://scikit-learn.org/stable/) world of one-liner machine learning into the more open world of building different shapes of ML models. You can start by using [fit](../../../../g3doc/api_docs/python/contrib.learn.md#Estimator.fit)/[predict](../../../../g3doc/api_docs/python/contrib.learn.md#Estimator.predict) and slide into TensorFlow APIs as you are getting comfortable.
+- To smooth the transition from the [scikit-learn](http://scikit-learn.org/stable/) world of one-liner machine learning into the more open world of building different shapes of ML models. You can start by using [fit](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#fit)/[predict](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#predict) and slide into TensorFlow APIs as you are getting comfortable.
- To provide a set of reference models that will be easy to integrate with existing code.
## Installation
@@ -43,17 +43,17 @@ Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [
### Existing Estimator Implementations
- [`LinearClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#LinearClassifier))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearClassifier))
- [`LinearRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#LinearRegressor))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearRegressor))
- [`DNNClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNClassifier))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNClassifier))
- [`DNNRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNRegressor))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNRegressor))
- [`DNNLinearCombinedClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNLinearCombinedClassifier))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedClassifier))
- [`DNNLinearCombinedRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNLinearCombinedRegressor))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedRegressor))
- [`SVM`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/svm.py)
([docs](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/g3doc/svm.md))
- [`GMM`](https://www.tensorflow.org/code/tensorflow/contrib/factorization/python/ops/gmm.py)
@@ -67,7 +67,7 @@ Below are a few simple examples of the API. For more examples, please see [examp
General tips:
-- It's useful to rescale a dataset to 0 mean and unit standard deviation before passing it to an [`Estimator`](../../../../g3doc/api_docs/python/contrib.learn.md#estimators). [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) doesn't always do the right thing when variable are at very different scales.
+- It's useful to rescale a dataset to 0 mean and unit standard deviation before passing it to an [`Estimator`](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator). [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) doesn't always do the right thing when variables are at very different scales.
- Categorical variables should be managed before passing input to the estimator.
@@ -219,7 +219,7 @@ INFO:tensorflow:Loss for final step: 0.0162506.
## Summaries
-If you supply a `model_dir` argument to your `Estimator`s, TensorFlow will write summaries for ``loss`` and histograms for variables in this directory. (You can also add custom summaries in your custom model function by calling [Summary](../../../../g3doc/api_docs/python/train.md#summary-operations) operations.)
+If you supply a `model_dir` argument to your `Estimator`s, TensorFlow will write summaries for ``loss`` and histograms for variables in this directory. (You can also add custom summaries in your custom model function by calling [Summary](https://www.tensorflow.org/api_guides/python/summary) operations.)
To view the summaries in TensorBoard, run the following command, where `logdir` is the `model_dir` for your `Estimator`:
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
index b2da9a7cc09338..b891bf23016bdb 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
@@ -22,6 +22,7 @@
# pylint: disable=unused-import
from tensorflow.python.estimator.inputs.queues.feeding_functions import _ArrayFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _enqueue_data as enqueue_data
+from tensorflow.python.estimator.inputs.queues.feeding_functions import _GeneratorFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _OrderedDictNumpyFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _PandasFeedFn
# pylint: enable=unused-import
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 8f8ab3b335a9aa..bc7465bbc22fab 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -200,6 +200,7 @@ class RunConfig(ClusterConfig):
parameter servers), you probably want to use `learn_runner.EstimatorConfig`
instead.
"""
+ _USE_DEFAULT = 0
def __init__(self,
master=None,
@@ -208,7 +209,7 @@ def __init__(self,
gpu_memory_fraction=1,
tf_random_seed=None,
save_summary_steps=100,
- save_checkpoints_secs=600,
+ save_checkpoints_secs=_USE_DEFAULT,
save_checkpoints_steps=None,
keep_checkpoint_max=5,
keep_checkpoint_every_n_hours=10000,
@@ -260,6 +261,11 @@ def __init__(self,
self._tf_random_seed = tf_random_seed
self._save_summary_steps = save_summary_steps
self._save_checkpoints_secs = save_checkpoints_secs
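+ # If save_checkpoints_secs was left at its default, checkpoint every 600
+ # seconds unless the caller asked for step-based checkpointing instead.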
+ if save_checkpoints_secs == RunConfig._USE_DEFAULT:
+ if save_checkpoints_steps is None:
+ self._save_checkpoints_secs = 600
+ else:
+ self._save_checkpoints_secs = None
self._save_checkpoints_steps = save_checkpoints_steps
# TODO(weiho): Remove these after ModelFn refactoring, when users can
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
index 32252cd8e3025a..456792835827f8 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
@@ -35,3 +35,4 @@
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_matrix
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import HAS_PANDAS
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import pandas_input_fn
+from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
new file mode 100644
index 00000000000000..5859bb6b47f830
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
@@ -0,0 +1,134 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Methods to allow generator of dict with numpy arrays."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from types import FunctionType, GeneratorType
+from collections import Container
+
+from tensorflow.contrib.learn.python.learn.dataframe.queues import feeding_functions
+
+
+def generator_input_fn(x,
+ target_key=None,
+ batch_size=128,
+ num_epochs=1,
+ shuffle=True,
+ queue_capacity=1000,
+ num_threads=1):
+ """Returns an input function that feeds dicts of numpy arrays
+ yielded from a generator.
+
+ It is assumed that every dict yielded from the generator represents
+ a single sample. The generator should consume a single epoch of the data.
+
+ This returns a function outputting `features` and `target` based on the dict
+ of numpy arrays. The dict `features` has the same keys as an element yielded
+ from x.
+
+ Example:
+ ```python
+ def generator():
+ for index in range(10):
+ yield {'height': np.random.randint(32,36),
+ 'age': np.random.randint(18, 80),
+ 'label': np.ones(1)}
+
+ with tf.Session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key="label", batch_size=2, shuffle=False,
+ num_epochs=1)
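+ # The returned input_fn can then be called to build the feature and
+ # label tensors: features, label = input_fn()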
+ ```
+
+ Args:
+ x: Generator function that returns a `Generator` yielding the data as
+ `dict`s of numpy arrays.
+ target_key: String or Container of Strings, the key or keys of the numpy
+ arrays in the dicts yielded by `x` to use as target.
+ batch_size: Integer, size of batches to return.
+ num_epochs: Integer, number of epochs to iterate over data. If `None` will
+ run forever.
+ shuffle: Boolean, if True shuffles the queue. Avoid shuffle at prediction
+ time.
+ queue_capacity: Integer, size of queue to accumulate.
+ num_threads: Integer, number of threads used for reading and enqueueing.
+
+ Returns:
+ Function that returns a feature `dict` of `Tensor`s and, if `target_key`
+ is set, a label that is a `Tensor` for a single key or a `dict` of
+ `Tensor`s for multiple keys.
+
+ Raises:
+ TypeError: `x` is not `FunctionType`.
+ TypeError: `x()` is not `GeneratorType`.
+ TypeError: `next(x())` is not `dict`.
+ TypeError: `target_key` is not `str` or `target_key` is not `Container`
+ of `str`.
+ KeyError: `target_key` not a key or `target_key[index]` not in next(`x()`).
+ KeyError: `key` mismatch between dicts emitted from `x()`
+ """
+ if not isinstance(x, FunctionType):
+ raise TypeError(
+ 'x must be generator function; got {}'.format(type(x).__name__))
+ generator = x()
+ if not isinstance(generator, GeneratorType):
+ raise TypeError(
+ 'x() must be generator; got {}'.format(type(generator).__name__))
+ data = next(generator)
+ if not isinstance(data, dict):
+ raise TypeError('x() must yield dict; got {}'.format(type(data).__name__))
+ input_keys = sorted(next(x()).keys())
+ if target_key is not None:
+ if isinstance(target_key, str):
+ target_key = [target_key]
+ elif isinstance(target_key, Container):
+ for item in target_key:
+ if not isinstance(item, str):
+ raise TypeError('target_key must be str or Container of str; got {}'.
+ format(type(item).__name__))
+ if item not in input_keys:
+ raise KeyError(
+ 'target_key not in yielded dict. Expected {} keys; got {}'.format(
+ input_keys, item))
+ else:
+ raise TypeError('target_key must be str or Container of str; got {}'.
+ format(type(target_key).__name__))
+
+ def _generator_input_fn():
+ """generator input function."""
+ queue = feeding_functions.enqueue_data(
+ x,
+ queue_capacity,
+ shuffle=shuffle,
+ num_threads=num_threads,
+ enqueue_size=batch_size,
+ num_epochs=num_epochs)
+
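+ # dequeue_up_to returns whatever remains when the queue closes with fewer
+ # than batch_size elements, where dequeue_many would raise OutOfRangeError.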
+ features = (queue.dequeue_many(batch_size)
+ if num_epochs is None else queue.dequeue_up_to(batch_size))
+ if not isinstance(features, list):
+ features = [features]
+ features = dict(zip(input_keys, features))
+ if target_key is not None:
+ if len(target_key) > 1:
+ target = {key: features.pop(key) for key in target_key}
+ else:
+ target = features.pop(target_key[0])
+ return features, target
+ return features
+
+ return _generator_input_fn
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
new file mode 100644
index 00000000000000..bc767ec18b1fac
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
@@ -0,0 +1,348 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for numpy_io."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+# TODO: #6568 Remove this hack that makes dlopen() not crash.
+if hasattr(sys, 'getdlopenflags') and hasattr(sys, 'setdlopenflags'):
+ import ctypes
+
+ sys.setdlopenflags(sys.getdlopenflags() | ctypes.RTLD_GLOBAL)
+
+import numpy as np
+from tensorflow.contrib.learn.python.learn.learn_io import generator_io
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import queue_runner_impl
+
+
+class GeneratorIoTest(test.TestCase):
+
+ def testGeneratorInputFn(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator,
+ target_key='label',
+ batch_size=2,
+ shuffle=False,
+ num_epochs=1)
+ features, target = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run([features, target])
+ self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
+ self.assertAllEqual(res[0]['b'], np.asarray([32, 33]).reshape(-1, 1))
+ self.assertAllEqual(res[1], np.asarray([-32, -31]).reshape(-1, 1))
+
+ session.run([features])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features, target])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorSingleInputFn(self):
+
+ def generator():
+ for index in range(2):
+ yield {'a': np.ones(1) * index}
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+ features = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run([features])
+ self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
+
+ session.run([features])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnLabelDict(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32,
+ 'label2': np.ones(1) * index - 64,
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator,
+ target_key=['label', 'label2'],
+ batch_size=2,
+ shuffle=False,
+ num_epochs=1)
+ features, target = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run([features, target])
+ self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
+ self.assertAllEqual(res[0]['b'], np.asarray([32, 33]).reshape(-1, 1))
+ self.assertAllEqual(res[1]['label'], np.asarray([-32, -31]).reshape(
+ -1, 1))
+ self.assertAllEqual(res[1]['label2'],
+ np.asarray([-64, -63]).reshape(-1, 1))
+
+ session.run([features])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features, target])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnWithDifferentDimensionsOfFeatures(self):
+
+ def generator():
+ for index in range(100):
+ yield {
+ 'a': np.ones((10, 10)) * index,
+ 'b': np.ones((5, 5)) * index + 32,
+ 'label': np.ones((3, 3)) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator,
+ target_key='label',
+ batch_size=2,
+ shuffle=False,
+ num_epochs=1)
+ features, target = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run([features, target])
+ self.assertAllEqual(res[0]['a'],
+ np.vstack((np.zeros((10, 10)), np.ones(
+ (10, 10)))).reshape(2, 10, 10))
+ self.assertAllEqual(res[0]['b'],
+ np.vstack((np.zeros((5, 5)), np.ones(
+ (5, 5)))).reshape(2, 5, 5) + 32)
+ self.assertAllEqual(res[1],
+ np.vstack((np.zeros((3, 3)), np.ones(
+ (3, 3)))).reshape(2, 3, 3) - 32)
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnWithXAsNonGeneratorFunction(self):
+ x = np.arange(32, 36)
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'x must be generator function'):
+ failing_input_fn = generator_io.generator_input_fn(
+ x, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFnWithXAsNonGenerator(self):
+
+ def generator():
+ return np.arange(32, 36)
+
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'x\(\) must be generator'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFnWithXAsNonGeneratorYieldingDicts(self):
+
+ def generator():
+ yield np.arange(32, 36)
+
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'x\(\) must yield dict'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFNWithTargetLabelNotString(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones((10, 10)) * index,
+ 'b': np.ones((5, 5)) * index + 32,
+ 'label': np.ones((3, 3)) * index - 32
+ }
+
+ y = np.arange(32, 36)
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'target_key must be str or'
+ ' Container of str'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFNWithTargetLabelListNotString(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones((10, 10)) * index,
+ 'b': np.ones((5, 5)) * index + 32,
+ 'label': np.ones((3, 3)) * index - 32
+ }
+
+ y = ['label', np.arange(10)]
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'target_key must be str or'
+ ' Container of str'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFNWithTargetLabelNotInDict(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones((10, 10)) * index,
+ 'b': np.ones((5, 5)) * index + 32,
+ 'label': np.ones((3, 3)) * index - 32
+ }
+
+ y = ['label', 'target']
+ with self.test_session():
+ with self.assertRaisesRegexp(KeyError, 'target_key not in yielded dict'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFnWithNoTargetKey(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+ features = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run(features)
+ self.assertAllEqual(res['a'], np.asarray([0, 1]).reshape(-1, 1))
+ self.assertAllEqual(res['b'], np.asarray([32, 33]).reshape(-1, 1))
+ self.assertAllEqual(res['label'], np.asarray([-32, -31]).reshape(-1, 1))
+
+ session.run([features])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnWithBatchLargerthanData(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key=None, batch_size=4, shuffle=False, num_epochs=1)
+ features = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run(features)
+ self.assertAllEqual(res['a'], np.asarray([0, 1, 0, 1]).reshape(-1, 1))
+ self.assertAllEqual(res['b'], np.asarray([32, 33, 32, 33]).reshape(-1, 1))
+ self.assertAllEqual(res['label'],
+ np.asarray([-32, -31, -32, -31]).reshape(-1, 1))
+
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnWithMismatchinGeneratorKeys(self):
+
+ def generator():
+ index = 0
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+ index = 1
+ yield {
+ 'a': np.ones(1) * index,
+ 'c': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+ features = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features])
+
+ with self.assertRaisesRegex(KeyError, 'key mismatch between dicts emitted'
+ ' by GenFunExpected'):
+ coord.request_stop()
+ coord.join(threads)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 4db818a3a9c059..2b2e885689d63a 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -370,6 +370,7 @@ ifeq ($(TARGET),IOS)
ifeq ($(IOS_ARCH),I386)
CXXFLAGS += -mios-simulator-version-min=$(MIN_SDK_VERSION) \
-arch i386 \
+ -mno-sse \
-fembed-bitcode \
-D__thread= \
-DUSE_GEMM_FOR_CONV \
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index ac10dfc722bc9a..f061b58775e87a 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -75,7 +75,7 @@ To run the executable, use:
```bash
tensorflow/contrib/makefile/gen/bin/benchmark \
- --graph=~/graphs/inception/tensorflow_inception_graph.pb
+ --graph=$HOME/graphs/inception/tensorflow_inception_graph.pb
```
## Android
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index ff80167ff476e3..0909760b383d3b 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -99,8 +99,13 @@ def __init__(self, loss, var_list=None, equalities=None, inequalities=None,
slice(start, end) for start, end in zip(accumulated_dims[:-1],
accumulated_dims[1:])]
- def minimize(self, session=None, feed_dict=None, fetches=None,
- step_callback=None, loss_callback=None):
+ def minimize(self,
+ session=None,
+ feed_dict=None,
+ fetches=None,
+ step_callback=None,
+ loss_callback=None,
+ **run_kwargs):
"""Minimize a scalar `Tensor`.
Variables subject to optimization are updated in-place at the end of
@@ -120,6 +125,7 @@ def minimize(self, session=None, feed_dict=None, fetches=None,
flattened into a single vector.
loss_callback: A function to be called every time the loss and gradients
are computed, with evaluated fetches supplied as positional arguments.
+ **run_kwargs: kwargs to pass to `session.run`.
"""
session = session or ops.get_default_session()
feed_dict = feed_dict or {}
@@ -160,8 +166,10 @@ def minimize(self, session=None, feed_dict=None, fetches=None,
for packing_slice in self._packing_slices]
# Set optimization variables to their new values.
- session.run(self._var_updates,
- feed_dict=dict(zip(self._update_placeholders, var_vals)))
+ session.run(
+ self._var_updates,
+ feed_dict=dict(zip(self._update_placeholders, var_vals)),
+ **run_kwargs)
def _minimize(self, initial_val, loss_grad_func, equality_funcs,
equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops.cc b/tensorflow/contrib/rnn/ops/lstm_ops.cc
index 2de40825c906e1..699cc6c88a4634 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops.cc
@@ -78,7 +78,7 @@ ci = tanh(ci)
cs = ci .* i + cs_prev .* f
cs = clip(cs, cell_clip)
-o = sigmoid(cs * wco + f)
+o = sigmoid(cs * wco + o)
co = tanh(cs)
h = co .* o
```
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index b55e1ff848317d..d01d37511950bf 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -322,9 +322,10 @@ def __call__(self, query):
Args:
query: Tensor of dtype matching `self.values` and shape
`[batch_size, query_depth]`.
+
Returns:
score: Tensor of dtype matching `self.values` and shape
- `[batch_size, self.num_units]`.
+ `[batch_size, max_time]` (`max_time` is memory's `max_time`).
"""
with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
processed_query = self.query_layer(query) if self.query_layer else query
@@ -522,7 +523,8 @@ def __call__(self, inputs, state, scope=None):
- Step 5: Calculate the context vector as the inner product between the
alignments and the attention_mechanism's values (memory).
- Step 6: Calculate the attention output by concatenating the cell output
- and context through the attention layer.
+ and context through the attention layer (a linear layer with
+ `attention_size` outputs).
Args:
inputs: (Possibly nested tuple of) Tensor, the input at this time step.
@@ -531,10 +533,10 @@ def __call__(self, inputs, state, scope=None):
scope: Must be `None`.
Returns:
- A tuple `(attention, next_state)`, where:
+ A tuple `(attention_or_cell_output, next_state)`, where:
- - `attention` is the attention passed to the layer above.
- - `next_state` is an instance of `AttentionWrapperState`
+ - `attention_or_cell_output`: the attention output or the cell output,
+ depending on `output_attention`.
+ - `next_state` is an instance of `DynamicAttentionWrapperState`
containing the state calculated at this time step.
Raises:
diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py
index cfe6ac51346386..39a6d2f58b1407 100644
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-
"""Seq2seq loss operations for use in sequence models.
"""
@@ -28,22 +27,33 @@
__all__ = ["sequence_loss"]
-def sequence_loss(logits, targets, weights,
- average_across_timesteps=True, average_across_batch=True,
- softmax_loss_function=None, name=None):
- """Weighted cross-entropy loss for a sequence of logits (per example).
+def sequence_loss(logits,
+ targets,
+ weights,
+ average_across_timesteps=True,
+ average_across_batch=True,
+ softmax_loss_function=None,
+ name=None):
+ """Weighted cross-entropy loss for a sequence of logits.
+
+ Depending on the values of `average_across_timesteps` and
+ `average_across_batch`, the return Tensor will have rank 0, 1, or 2 as these
+ arguments reduce the cross-entropy at each target, which has shape
+ `[batch_size, sequence_length]`, over their respective dimensions. For
+ example, if `average_across_timesteps` is `True` and `average_across_batch`
+ is `False`, then the return Tensor will have shape `[batch_size]`.
Args:
- logits: A 3D Tensor of shape
- [batch_size x sequence_length x num_decoder_symbols] and dtype float.
+ logits: A Tensor of shape
+ `[batch_size, sequence_length, num_decoder_symbols]` and dtype float.
The logits correspond to the prediction across all classes at each
timestep.
- targets: A 2D Tensor of shape [batch_size x sequence_length] and dtype
+ targets: A Tensor of shape `[batch_size, sequence_length]` and dtype
int. The target represents the true class at each timestep.
- weights: A 2D Tensor of shape [batch_size x sequence_length] and dtype
- float. Weights constitutes the weighting of each prediction in the
- sequence. When using weights as masking set all valid timesteps to 1 and
- all padded timesteps to 0.
+ weights: A Tensor of shape `[batch_size, sequence_length]` and dtype
+ float. `weights` constitutes the weighting of each prediction in the
+ sequence. When using `weights` as masking, set all valid timesteps to 1
+ and all padded timesteps to 0, e.g. a mask returned by `tf.sequence_mask`.
average_across_timesteps: If set, sum the cost across the sequence
dimension and divide the cost by the total label weight across timesteps.
average_across_batch: If set, sum the cost across the batch dimension and
@@ -55,7 +65,10 @@ def sequence_loss(logits, targets, weights,
name: Optional name for this operation, defaults to "sequence_loss".
Returns:
- A scalar float Tensor: The average log-perplexity per symbol (weighted).
+ A float Tensor of rank 0, 1, or 2 depending on the
+ `average_across_timesteps` and `average_across_batch` arguments. By default,
+ it has rank 0 (scalar) and is the weighted average cross-entropy
+ (log-perplexity) per symbol.
Raises:
ValueError: logits does not have 3 dimensions or targets does not have 2
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 4aa39e5202f58e..ba761cd7c6fc12 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -721,7 +721,8 @@ cc_library(
"//tensorflow/core/kernels:quantized_ops",
]) + if_mkl([
"//tensorflow/core/kernels:mkl_conv_op",
- "//tensorflow/core/kernels:mkl_matmul_op",
+ "//tensorflow/core/kernels:mkl_pooling_ops",
+ "//tensorflow/core/kernels:mkl_relu_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
]),
)
@@ -2094,7 +2095,8 @@ tf_cc_test_mkl(
"//tensorflow/cc:scope",
"//tensorflow/cc:sendrecv_ops",
"//tensorflow/core/kernels:mkl_conv_op",
- "//tensorflow/core/kernels:mkl_matmul_op",
+ "//tensorflow/core/kernels:mkl_pooling_ops",
+ "//tensorflow/core/kernels:mkl_relu_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
"//tensorflow/core/kernels:ops_util",
"//third_party/eigen3",
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 9a2e4bcfa0c36c..309c4cd774c71a 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -15,13 +15,15 @@ limitations under the License.
#ifdef INTEL_MKL
+#include
#include
#include
+#include
+#include
#include
#include
#include
#include
-
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
#include "tensorflow/core/framework/node_def_util.h"
@@ -39,68 +41,91 @@ limitations under the License.
namespace tensorflow {
-// This pass implements rewriting of graph for propagating Mkl
-// layout as an additional output tensor (we will loosely call a
-// tensor that carries Mkl layout as Mkl tensor henceforth.)
-// from every Mkl supported NN layer.
+// This pass implements rewriting of graph to support following scenarios:
+// (A) Merging nodes in the graph
+// (B) Rewriting a node in the graph to a new node
+// Rewrite happens under following 2 scenarios:
+// 1) Propagating Mkl layout as an additional output tensor
+// (we will loosely call a tensor that carries Mkl layout as Mkl tensor
+// henceforth.) from every Mkl supported NN layer.
+// 2) Context-based rewrite: This is needed in order to optimize
+// gradient ops of Conv2D+AddBias. The gradient op of both Conv2D and
+// MatMul is BiasAddGrad, and we need to rewrite BiasAddGrad into a
+// Conv2D-specific BiasAddGrad and a MatMul-specific BiasAddGrad.
+// This is a context-specific optimization, where the context is the
+// forward operator that the BiasAddGrad corresponds to.
+//
+// Example of A : Merging nodes in the graph
+// -----------------------------------------
+// Currently, we merge Conv2D+AddBias together. Consider Conv2D and BiasAdd as:
+//
+// O = Conv2D(A, B)
+// P = BiasAdd(O, C)
+//
+// We merge them into Conv2DWithBias as:
+// P = MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
//
-// As a example, consider Relu layer. Current definition of Relu
-// layer looks like:
+// Meaning of A_m, B_m and C_m is explained in B.1.
+//
+// Merge rules:
+// - Merge for Conv2D and BiasAdd happens only when output of Conv2D _only_
+// goes to BiasAdd.
+// - Also, the attributes that both nodes have in common must have the same
+// values.
+// - Both nodes must have been assigned to the same device (if any).
+//
+// Example of B.1 : Rewriting nodes to Mkl nodes
+// ---------------------------------------------
+// Consider Relu layer. Current definition of Relu layer looks like:
//
// O = Relu(A)
//
// Relu has 1 input (A), and 1 output (O).
//
-// This rewrite pass will generate a new graph node for Relu
-// (new node is called MklRelu) as:
+// This rewrite pass will generate a new graph node for Relu (new node is
+// called MklRelu) as:
//
// O, O_m = MklRelu(A, A_m)
//
-// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m).
-// Here A input is same as A input of Relu; O output is same
-// as O output of Relu. O_m is the additional output tensor
-// that will be set by MklRelu, and it represents Mkl tensor
-// corresponding to O -- in other words, O_m is some kind of
-// metadata for O. A_m is additional input of Relu, and it
-// represents metadata for A - as O_m is metadata for O, A_m
-// is metadata for A. MklRelu receives this metadata from
-// previous layer (in the graph).
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here A input is
+// same as A input of Relu; O output is same as O output of Relu. O_m is the
+// additional output tensor that will be set by MklRelu, and it represents
+// Mkl tensor corresponding to O -- in other words, O_m is some kind of
+// metadata for O. A_m is additional input of Relu, and it represents metadata
+// for A - as O_m is metadata for O, A_m is metadata for A. MklRelu receives
+// this metadata from previous layer (in the graph).
//
-// When previous layer in the graph is Mkl layer, A_m will
-// represent a valid Mkl tensor. But when previous Mkl layer
-// is not an Mkl layer, then A_m represents a dummy Mkl tensor.
+// When the previous layer in the graph is an Mkl layer, A_m will represent
+// a valid Mkl tensor. But when the previous layer is not an Mkl layer, A_m
+// represents a dummy Mkl tensor.
//
// Rewriting rules:
-// - Selection of an op for rewriting happens by registering
-// an op with this pass. If an op is not registered, then
-// it is not rewritten.
+// - Selection of an op for rewriting happens by registering an op with this
+// pass. If an op is not registered, then it is not rewritten.
// - Number of inputs after rewriting:
-// Since for every input Tensorflow tensor, the rewritten
-// layer gets Mkl tensor, rewritten op gets 2*N inputs,
-// where N is the number of inputs for original op.
+// Since the rewritten layer gets an Mkl tensor for every input Tensorflow
+// tensor, the rewritten op gets 2*N inputs, where N is the number of
+// inputs of the original op.
// - Number of outputs after rewriting:
-// Since for every output Tensorflow tensor, the rewritten
-// layer generates Mkl tensor, rewritten op generates 2*N
-// outputs, where N is the number of outputs of original op.
+// Since the rewritten layer generates an Mkl tensor for every output
+// Tensorflow tensor, the rewritten op generates 2*N outputs, where N is
+// the number of outputs of the original op.
// - Ordering of Tensorflow tensors and Mkl tensors:
-// Since every op generates twice the number of inputs and
-// outputs, one could imagine different ordering among
-// Tensorflow tensors and Mkl tensors. E.g., let's assume
-// an op 'Conv2D' takes (A, B) as input, then new op
-// 'MklConv2D' can take (A, A_m, B, B_m) as input or it
-// can also take (A, B, A_m, B_m) as input. Among N inputs
-// one can get N! permutations.
-//
-// So the question is: which one do we follow? Currently,
-// we follow an intuitive order where Mkl tensor follows a
-// corresponding Tensorflow tensor immediately. In the
-// context of above example, it will be: (A, A_m, B, B_m).
-// We follow same ordering rule for output tensors.
-//
-// NOTE: Current rewriting approach rewrites an op to Mkl op without
-// any conditions. But in the future, it may be possible to
-// consider conditions such as input shapes and sizes to rewrite
-// an op.
+// Since every op generates twice the number of inputs and outputs, one
+// could imagine different ordering among Tensorflow tensors and Mkl
+// tensors. E.g., let's assume an op 'Conv2D' takes (A, B) as input, then
+// new op 'MklConv2D' can take (A, A_m, B, B_m) as input or it can also
+// take (A, B, A_m, B_m) as input. Among N inputs one can get N!
+// permutations.
+//
+// So the question is: which one do we follow? Currently, we follow an
+// intuitive order where an Mkl tensor follows the corresponding Tensorflow
+// tensor immediately. In the context of the above example, it will be:
+// (A, A_m, B, B_m). We follow the same ordering rule for output tensors.
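+//
+// For outputs this means the nth Tensorflow output of the original op lands
+// at slot 2*n of the Mkl op, with its Mkl tensor at slot 2*n+1 (this mapping
+// is applied via GetTensorDataIndex in RewriteNode below).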
+//
+// NOTE: Current rewriting approach rewrites an op to Mkl op without any
+// conditions. But in the future, it may be possible to consider
+// conditions such as input shapes and sizes to rewrite an op.
//
// Graph rewrite algorithm:
// Algorithm: Graph Rewrite
@@ -147,13 +172,137 @@ namespace tensorflow {
// it is, then we rewrite that node after constructing new inputs to
// the node. If it is not Mkl layer, then we do not rewrite the node.
//
+// Handling workspace propagation for certain ops:
+//
+// Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require the
+// workspace to be passed from their corresponding forward ops. But
+// TensorFlow does not have a notion of workspace and as a result
+// does not allow producing additional outputs from these forward ops.
+// For these ops, we need to add additional edges between the forward
+// and backward ops: one edge carries the workspace tensor value, and
+// another edge carries the Mkl tensor for the workspace tensor.
+//
+// Example:
+//
+// Typical graph for MaxPool and its gradient looks like:
+//
+// A = MaxPool(T)
+// B = MaxPoolGrad(X, A, Y)
+//
+// We will transform this graph to propagate workspace as:
+//
+// A, A_m, W, W_m = MklMaxPool(T, T_m)
+// B, B_m = MklMaxPoolGrad(X, X_m, A, A_m, Y, Y_m, W, W_m)
+//
+// Here W is the workspace tensor. Transformed tensors with name
+// suffix _m are Mkl tensors and this transformation has been done
+// using the algorithm discussed earlier. The transformation for
+// workspace only adds extra outputs (W, W_m) for forward op and
+// connects them to corresponding backward ops.
+//
+// Terms:
+//
+// Forward op name = name of the op in the forward pass
+// where workspace originates (MaxPool in this example)
+// Backward op name = name of the op in the backward pass that receives
+// workspace from forward op (MaxPoolGrad in the example)
+// Slot = Number of the output or input slot that will be
+// used by the workspace (2 for MklMaxPool as W is its 3rd
+// output, counting from 0; 6 for MklMaxPoolGrad)
+//
+// Question:
+//
+// How do we associate a backward op with its forward op? There can be more
+// than one op of the exact same type in the graph.
+//
+// In this example we associate MaxPoolGrad with MaxPool. But there
+// could be more than one MaxPool op. To solve this problem, we look
+// for a _direct_ edge between the forward op and the backward op (tensor A
+// flows along this edge in the example).
+//
+// How do we transform the forward and backward ops when there is no direct
+// edge between them? In such a case, we generate dummy tensors as
+// workspace tensors. For the example, the transformation of MaxPool is
+// exactly the same --- it is just that MaxPool won't generate any
+// workspace tensor. For MaxPoolGrad, the transformation is also the same,
+// but instead of connecting W and W_m with the outputs of MaxPool, we
+// produce dummy tensors for them, and we set the workspace_enabled
+// attribute to false.
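+//
+// This dummy-tensor fallback is implemented in AddWorkSpaceEdgeIfNeeded
+// below, which generates the dummy workspace tensor via
+// GetDummyWorkspaceTensorNode and its Mkl counterpart via
+// GetDummyMklTensorNode.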
+//
+// Example of B.2 : Context-based node rewrite
+// -------------------------------------------
+// Consider BiasAddGrad op as:
+//
+// O = MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
+// P = BiasAddGrad(O)
+//
+// Then we rewrite it as:
+//
+// P = MklConv2DWithBiasBackpropBias(O, O_m)
+//
+// The 'distance' between the input of BiasAddGrad and MklConv2DWithBias, in
+// terms of hops, is the context matching depth. If MklConv2DWithBias is not
+// within the context matching depth, then we do not rewrite BiasAddGrad.
+
+// How many hops do we search for a matching node in the backward dataflow
+// graph? We use a maxhop of 10 based on empirical observations. Also, these
+// are maxhops in the backward data-flow graph. Since the input of forward
+// nodes (Conv2D) directly goes to the backward nodes, we do not expect the
+// hop-distance to be more than a few nodes.
+static size_t kNodeMergeContextMaxDepth = 10;
+
class MklLayoutRewritePass : public GraphOptimizationPass {
public:
MklLayoutRewritePass() {
csinfo_.conv2d = "Conv2D";
-
- ninfo_.push_back(
- {csinfo_.conv2d, GetMklOpName(csinfo_.conv2d), 2, CopyAttrsConv2D});
+ csinfo_.mklconv2d = "MklConv2D";
+ csinfo_.mklconv2dwithbias = "MklConv2DWithBias";
+ csinfo_.mklconv2dwithbiasbackpropbias = "MklConv2DWithBiasBackpropBias";
+ csinfo_.biasadd = "BiasAdd";
+ csinfo_.matmul = "MatMul";
+ csinfo_.biasaddgrad = "BiasAddGrad";
+ csinfo_.relu = "Relu";
+ csinfo_.relugrad = "ReluGrad";
+ csinfo_.maxpool = "MaxPool";
+ csinfo_.maxpoolgrad = "MaxPoolGrad";
+ csinfo_.avgpool = "AvgPool";
+ csinfo_.avgpoolgrad = "AvgPoolGrad";
+ csinfo_.conv2dgradinput = "Conv2DBackpropInput";
+ csinfo_.conv2dgradfilter = "Conv2DBackpropFilter";
+
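+  // Each rewrite rule reads {name, newname, numins, copyattrs, rewriterule}:
+  // the original op, the Mkl op it becomes, the number of inputs of the
+  // original op, the attribute-copying function, and the predicate that
+  // decides whether to rewrite.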
+ rinfo_.push_back(
+ {csinfo_.conv2d, csinfo_.mklconv2d, 2, CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2dgradfilter,
+ GetMklOpName(csinfo_.conv2dgradfilter), 3,
+ CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2dgradinput,
+ GetMklOpName(csinfo_.conv2dgradinput), 3, CopyAttrsConv2D,
+ AlwaysRewrite});
+ rinfo_.push_back({csinfo_.relu, GetMklOpName(csinfo_.relu), 1,
+ CopyAttrsRelu, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.maxpool, GetMklOpName(csinfo_.maxpool), 1,
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.maxpoolgrad, GetMklOpName(csinfo_.maxpoolgrad), 3,
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.avgpool, GetMklOpName(csinfo_.avgpool), 1,
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.avgpoolgrad, GetMklOpName(csinfo_.avgpoolgrad), 2,
+ CopyAttrsPooling, AlwaysRewrite});
+
+ // Add info about which ops to add workspace edge to and the slots.
+ wsinfo_.push_back({csinfo_.maxpool, csinfo_.maxpoolgrad, 0, 1, 2, 6});
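+  // The entry reads {fwdop, bwdop, fwdslot, bwdslot, wsfwdslot, wsbwdslot}:
+  // output slot 0 of MaxPool feeds input slot 1 of MaxPoolGrad, and the
+  // workspace is produced at output slot 2 of MklMaxPool and consumed at
+  // input slot 6 of MklMaxPoolGrad.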
+
+ // Add a rule for merging nodes
+ minfo_.push_back(
+ {csinfo_.mklconv2d, csinfo_.biasadd, 0, csinfo_.mklconv2dwithbias});
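+  // Here MklConv2D is the predecessor, BiasAdd is the successor, and 0 means
+  // MklConv2D feeds operand 0 of BiasAdd.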
+
+ // We use maxhop of 10 based on empirical observations. Also, these are
+ // maxhops in backward data-flow graph. Since input of forward nodes
+ // (Conv2D) directly goes to backward nodes, we do not expect the
+ // hop-distance would be more than few nodes.
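+  // The entry reads {node, fwd, maxhop}: BiasAddGrad is rewritten only if
+  // MklConv2DWithBias is found within kNodeMergeContextMaxDepth hops in the
+  // backward direction.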
+ cinfo_.push_back({csinfo_.biasaddgrad, csinfo_.mklconv2dwithbias,
+ kNodeMergeContextMaxDepth});
}
// Standard interface to run pass
@@ -176,20 +325,79 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
string name; // Original name of the op in the graph
string newname; // New name of op in the graph
int numins; // Number of inputs to the original op
-    std::function<void(Node*, NodeBuilder*)>
-        copyattrs;  // Function handler
-                    // to copy attributes from old node to new node.
-  } NodesInfo;
+    // Function handler to copy attributes from old node to new node.
+    std::function<void(const Node*, NodeBuilder*)> copyattrs;
+    std::function<bool(const Node*)> rewriterule;  // Rule under which to
+                                                   // rewrite this node.
+  } RewriteInfo;
+
+ /// Structure to specify forward op, backward op, and the slot numbers
+ /// in forward and backward op where we will add workspace edge.
+ typedef struct {
+ string fwdop; // Name of the forward op in the graph
+ string bwdop; // Name of the backward op in the graph
+ int fwdslot; // Output slot in the forward op node where actual
+ // output tensor resides
+ int bwdslot; // Input slot in the backward op node where actual
+ // input tensor resides
+ int wsfwdslot; // Output slot in the forward op node where workspace
+ // edge is added
+ int wsbwdslot; // Input slot in the backward op node where workspace
+ // edge is added
+ } WorkSpaceInfo;
+
+ /// Structure to specify information used in node merge
+ typedef struct {
+ string pred; // Predecessor node string
+ string succ; // Successor node string
+    int op;          // Which operand number of the successor node the
+                     // predecessor node feeds
+ string newnode; // Name of the node after merge
+ } MergeInfo;
+
+ /// Structure to specify the context information used in node rewrite rule
+ typedef struct {
+ string node; // Name of the node to be rewritten
+ string fwd; // Node name in forward pass that this node
+ // corresponds to
+ size_t maxhop; // Maximum number of hops the fwd is located
+ // from this node. If fwd is farther than maxhop
+ // then we do not rewrite the node.
+ } ContextInfo;
/// Structure to store all constant strings
struct {
string relu;
string relugrad;
+ // Conv ops
string conv2d;
+ string mklconv2d;
+ string conv2dgradinput;
+ string conv2dgradfilter;
+ string mklconv2dwithbias;
+ string mklconv2dwithbiasbackpropbias;
+ // Pooling ops
+ string maxpool;
+ string maxpoolgrad;
+ string avgpool;
+ string avgpoolgrad;
+ // Others
+ string biasadd;
+ string matmul;
+ string biasaddgrad;
} csinfo_;
/// Maintain info about nodes to rewrite
-  std::vector<NodesInfo> ninfo_;
+  std::vector<RewriteInfo> rinfo_;
+
+ /// Maintain info about nodes to add workspace edge
+  std::vector<WorkSpaceInfo> wsinfo_;
+
+ /// Maintain info to be merged
+  std::vector<MergeInfo> minfo_;
+
+  /// Maintain info about nodes to rewrite based on their context
+  static std::vector<ContextInfo> cinfo_;
/// Hash table to maintain nodes visited in the graph.
  std::unordered_set<const Node*> visited_nodes_;
@@ -209,6 +417,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
// Mark the node as rewritten
inline void MarkRewrittenNode(Node* n) { visited_nodes_.insert(n); }
+ // Clear all visited nodes
+ inline void UnMarkRewrittenNodes() { visited_nodes_.clear(); }
+
// Get the name of Mkl op from original TensorFlow op
// We prefix 'Mkl' to the original op to get Mkl op.
// TODO(nhasabni) We should move this to mkl_util.h.
@@ -218,6 +429,71 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
return string(kMklOpPrefix) + name;
}
+ // Return a node that can be merged with input node 'n'
+ //
+ // @return pointer to the node if we can find such a
+ // node. Otherwise, it returns nullptr.
+ Node* CheckForNodeMerge(const Node* n) const;
+
+ // Merge predecessor node with its successor.
+ // Currently, we merge Conv2D with BiasAdd only.
+ //
+  // Input nodes succ and pred may be deleted if the call to
+  // this function is successful. Attempting to use the pointers
+  // after the call may result in undefined behavior.
+ //
+ // @input g - input graph, succ - successor node, pred - predecessor node
+ // @return Status::OK(), if merging is successful and supported.
+ // Returns appropriate Status error code otherwise.
+ // Graph is updated in case nodes are merged. Otherwise, it is
+ // not updated.
+  Status MergeNode(std::unique_ptr<Graph>* g, Node* succ, Node* pred);
+
+ // Check if the node 'n' has any applicable rewrite rule
+ // We check for 2 scenarios for rewrite.
+ //
+ // @return RewriteInfo* for the applicable rewrite rule
+ const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
+
+ // Default rewrite rule to be used in scenario 1 for rewrite.
+ // @return - true (since we want to always rewrite)
+ static bool AlwaysRewrite(const Node* n) { return true; }
+ // Rewrite rule that uses context-information for matching
+ // used in scenario 2.
+ //
+ // @input - Node 'n' for which to search for matching context
+ // @return - true if matching context is found; false otherwise.
+ static bool ContextMatchRewrite(const Node* n);
+
+ // Helper function that searches the matching contextinfo for the node.
+ // Implements depth-first search in the data dependence graph for the
+ // gradient op in the backward direction.
+ //
+ // @input n - Node (gradient op) whose contextinfo is to be searched,
+ // fwdn - pointer to node from the forward pass that this node
+ // belongs to. fwdn cannot be NULL.
+ // @return Matching contextinfo in case a match is found; null otherwise.
+ // Also updates *fwdn with pointer to forward node that this context
+ // matches.
+ static const ContextInfo* SearchMatchingContext(const Node* n,
+ const Node** fwdn);
+
+ // Rewrites input node to a new node specified by its matching rewrite info.
+ //
+ // Method first searches matching rewrite info for input node and then
+ // uses that info to rewrite.
+ //
+  // Input node may be deleted in case of rewrite. Attempting to use the node
+  // after the call can result in undefined behavior.
+ //
+ // @input g - input graph, n - Node to be rewritten,
+ // ri - matching rewriteinfo
+ // @return Status::OK(), if the input node is rewritten;
+ // Returns appropriate Status error code otherwise.
+ // Graph is updated in case the input node is rewritten.
+ // Otherwise, it is not updated.
+  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
+
// Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
// in graph 'g'. Original node is input in 'orign'.
//
@@ -230,28 +506,40 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
NodeBuilder* nb, Node* orign);
- // Rewrite Node 'n' in graph 'g' with rewrite information specified in 'ni'
- // Returns Status::OK() if node rewrite is successful, otherwise returns
- // appropriate error status
-  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const NodesInfo& ni);
+ // Add workspace edge on the input or output side of Node 'orign' by using
+ // NodeBuilder 'nb' for the new node provided. If 'orign' does not dictate
+ // adding workspace edge then do not add it.
+  void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orign,
+ NodeBuilder* nb);
// Functions specific to operators to copy attributes
// We need operator-specific function to copy attributes because the framework
// does not provide any generic function for it.
- static void CopyAttrsConv2D(Node* orign, NodeBuilder* nb);
+ static void CopyAttrsConv2D(const Node* orign, NodeBuilder* nb);
+ static void CopyAttrsBiasAddGrad(const Node* orign, NodeBuilder* nb);
+ static void CopyAttrsPooling(const Node* orign, NodeBuilder* nb);
+ static void CopyAttrsRelu(const Node* orign, NodeBuilder* nb);
// Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
// using node for original node 'orign' and return it in '*out'.
// TODO(nhasabni) We should move this to mkl_util.h
  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
                             Node* orign);
+  void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                                   Node* orign);
};
+std::vector<MklLayoutRewritePass::ContextInfo> MklLayoutRewritePass::cinfo_;
+
// We register Mkl rewrite pass for phase 1 in pre-placement group.
// Do not change the ordering of the Mkl passes.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
MklLayoutRewritePass);
+//////////////////////////////////////////////////////////////////////////
+// Helper functions for creating new node
+//////////////////////////////////////////////////////////////////////////
+
static void FillInputs(const Node* n,
                       gtl::InlinedVector<Node*, 4>* control_edges,
                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
@@ -273,47 +561,6 @@ static void FillInputs(const Node* n,
}
}
-//////////////////////////////////////////////////////////////////////////
-
-// Macros to build new node with different number of inputs.
-// We need this way because we need to specify all the inputs when
-// building a node. Comment at core/graph/node_builder.h, line 85-86.
-
-#define SETUP_INPUTS1(nb, op1) \
- do { \
- nb->Input(op1.node, op1.index); \
- } while (0)
-
-#define SETUP_INPUTS2(nb, op1, op2) \
- do { \
- nb->Input(op1.node, op1.index); \
- nb->Input(op2.node, op2.index); \
- } while (0)
-
-#define SETUP_INPUTS3(nb, op1, op2, op3) \
- do { \
- nb->Input(op1.node, op1.index); \
- nb->Input(op2.node, op2.index); \
- nb->Input(op3.node, op3.index); \
- } while (0)
-
-#define SETUP_INPUTS4(nb, op1, op2, op3, op4) \
- do { \
- nb->Input(op1.node, op1.index); \
- nb->Input(op2.node, op2.index); \
- nb->Input(op3.node, op3.index); \
- nb->Input(op4.node, op4.index); \
- } while (0)
-
-#define SETUP_INPUTS5(nb, op1, op2, op3, op4, op5) \
- do { \
- nb->Input(op1.node, op1.index); \
- nb->Input(op2.node, op2.index); \
- nb->Input(op3.node, op3.index); \
- nb->Input(op4.node, op4.index); \
- nb->Input(op5.node, op5.index); \
- } while (0)
-
// TODO(nhasabni) We should move this to mkl_util.h.
void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
Node** out, Node* orign) {
@@ -335,6 +582,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g,
// device as device of original
// node.
.Finalize(&**g, out));
+ (*out)->set_assigned_device_name(orign->assigned_device_name());
}
Status MklLayoutRewritePass::SetUpInputs(
@@ -359,7 +607,7 @@ Status MklLayoutRewritePass::SetUpInputs(
TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
// If this op has been rewritten, then its name must have been same as
// Mkl op.
- CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string()), true);
+ CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string(), T), true);
// src slot number for Mkl tensor would be the one next to TF tensor
// slot number.
new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second + 1));
@@ -380,38 +628,140 @@ Status MklLayoutRewritePass::SetUpInputs(
// N for Mkl tensors corresponding to each Tensorflow tensors.
CHECK_EQ(new_inputs.size(), inputs.size() * 2);
- // 2. Let's build the node with new inputs.
- switch (new_inputs.size()) {
- case 0: // We don't need to do anything for no input as we have
- // already built node.
- break;
- case 1:
- SETUP_INPUTS1(nb, new_inputs[0]);
- break;
- case 2:
- SETUP_INPUTS2(nb, new_inputs[0], new_inputs[1]);
- break;
- case 3:
- SETUP_INPUTS3(nb, new_inputs[0], new_inputs[1], new_inputs[2]);
- break;
- case 4:
- SETUP_INPUTS4(nb, new_inputs[0], new_inputs[1], new_inputs[2],
- new_inputs[3]);
- break;
- case 5:
- SETUP_INPUTS5(nb, new_inputs[0], new_inputs[1], new_inputs[2],
- new_inputs[3], new_inputs[4]);
- break;
- default: {
- return Status(error::Code::UNIMPLEMENTED,
- "Could not create node with given number of inputs");
- }
+ // 2. Let's add the new inputs.
+ for (auto ni : new_inputs) {
+ nb->Input(ni.node, ni.index);
}
return Status::OK();
}
-void MklLayoutRewritePass::CopyAttrsConv2D(Node* orign, NodeBuilder* nb) {
+//////////////////////////////////////////////////////////////////////////
+// Helper functions related to workspace pass
+//////////////////////////////////////////////////////////////////////////
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
+    std::unique_ptr<Graph>* g, Node** out, Node* orign) {
+ // We use a tensor of shape {1} and value 0 to represent
+ // dummy float tensor. We need this as a dummy workspace tensor.
+ // Workspace tensor has type float.
+  const DataType dt = DataTypeToEnum<float>::v();
+ TensorProto proto;
+ proto.set_dtype(dt);
+ float zero[1] = {0};
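+  // The tensor content is this single zero-valued float (4 bytes).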
+  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+                           4);
+ TensorShape dummy_shape({1});
+ dummy_shape.AsProto(proto.mutable_tensor_shape());
+ TF_CHECK_OK(
+ NodeBuilder((*g)->NewName("DMT"), "Const")
+ .Attr("value", proto)
+ .Attr("dtype", dt)
+ .Device(orign->def().device()) // We place this node on same
+ // device as device of original
+ // node.
+ .Finalize(&**g, out));
+ (*out)->set_assigned_device_name(orign->assigned_device_name());
+}
+
+void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g,
+ Node* orign,
+ NodeBuilder* nb) {
+ bool workspace_edge_added = false;
+ DataType T;
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+ for (auto ws : wsinfo_) {
+ if (orign->type_string() == ws.fwdop &&
+ mkl_layer_registry::IsMklLayer(GetMklOpName(orign->type_string()), T)) {
+ // If this op is a fwd op, then we need to check if there is an
+ // edge from this node's fwdslot to bwdop's bwdslot. If there is
+ // an edge, then we just add an attribute on this node for setting
+ // workspace_passed to true. We don't add actual workspace edge
+ // in this node. Actual workspace edge gets added in the backward
+ // op for this node.
+ for (const Edge* e : orign->out_edges()) {
+ if (e->src_output() == ws.fwdslot &&
+ e->dst()->type_string() == ws.bwdop &&
+ e->dst_input() == ws.bwdslot) {
+ nb->Attr("workspace_enabled", true);
+ VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+ << orign->type_string();
+ workspace_edge_added = true;
+ // We found the edge that we were looking for, so break.
+ break;
+ }
+ }
+
+ if (!workspace_edge_added) {
+ // If we are here, then we did not find backward operator for this
+ // node.
+ nb->Attr("workspace_enabled", false);
+ }
+ } else if (orign->type_string() == ws.bwdop &&
+ mkl_layer_registry::IsMklLayer(
+ GetMklOpName(orign->type_string()), T)) {
+      // If this op is a bwd op, then we need to add a workspace edge and
+      // its Mkl tensor edge between its corresponding fwd op and this
+      // op. The corresponding fwd op is specified in the 'fwdop' field of
+      // the workspace info. fwdslot and bwdslot in the workspace info specify
+      // which slots connect the forward and backward op.
+      // Once all these criteria match, we add a workspace edge between
+      // wsfwdslot and wsbwdslot. Its corresponding Mkl tensor is added
+      // at wsfwdslot+1 and wsbwdslot+1.
+ for (const Edge* e : orign->in_edges()) {
+ if (e->src_output() == ws.fwdslot &&
+ // We would have rewritten the forward op, so we need to use
+ // GetMklOpName call to get its Mkl name.
+ e->src()->type_string() == GetMklOpName(ws.fwdop) &&
+ e->dst_input() == ws.bwdslot) {
+ nb->Attr("workspace_enabled", true);
+ // Add workspace edge between fwd op and bwd op.
+ nb->Input(e->src(), ws.wsfwdslot);
+ // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
+ nb->Input(e->src(), ws.wsfwdslot + 1);
+ // In terms of input ordering, we add these calls to add Input
+ // here because workspace edge (and its Mkl tensor) is the last
+ // edge in the fwdop and bwdop. So all inputs before workspace
+ // tensor have been added by SetUpInputs function.
+ VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+ << orign->type_string();
+ workspace_edge_added = true;
+ // We found the edge that we were looking for, so break.
+ break;
+ }
+ }
+
+      // If we reach here, it means we did not find a fwd op that feeds this
+      // bwd op. In this case, we need to generate dummy tensors for the
+      // workspace input and the Mkl tensor for the workspace, and set
+      // workspace_enabled to false.
+ if (!workspace_edge_added) {
+ nb->Attr("workspace_enabled", false);
+ Node* dmt_ws = nullptr; // Dummy tensor for workspace
+ Node* dmt_mkl_ws = nullptr; // Dummy Mkl tensor for workspace
+ GetDummyWorkspaceTensorNode(g, &dmt_ws, orign);
+ GetDummyMklTensorNode(g, &dmt_mkl_ws, orign);
+ CHECK_NOTNULL(dmt_ws);
+ CHECK_NOTNULL(dmt_mkl_ws);
+ nb->Input(dmt_ws, 0); // We add dummy tensor as workspace tensor.
+ nb->Input(dmt_mkl_ws, 0); // We add dummy tensor as Mkl
+ // tensor for workspace tensor.
+ VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
+ << orign->type_string();
+ }
+ } else {
+ // If this node does not match any workspace info, then we do not
+ // do anything special for workspace propagation for it.
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Op-specific functions to copy attributes from old node to new node
+//////////////////////////////////////////////////////////////////////////
+
+void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orign, NodeBuilder* nb) {
DataType T;
string data_format;
string padding;
@@ -433,19 +783,280 @@ void MklLayoutRewritePass::CopyAttrsConv2D(Node* orign, NodeBuilder* nb) {
nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
}
+void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orign,
+ NodeBuilder* nb) {
+ DataType T;
+ string data_format;
+  std::vector<int32> strides;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("strides", strides);
+ nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsPooling(const Node* orign,
+ NodeBuilder* nb) {
+ DataType T;
+ string data_format;
+ string padding;
+  std::vector<int32> ksize, strides;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "ksize", &ksize));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "padding", &padding));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("ksize", ksize);
+ nb->Attr("strides", strides);
+ nb->Attr("padding", padding);
+ nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsRelu(const Node* orign, NodeBuilder* nb) {
+ DataType T;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Helper functions related to node merge pass
+//////////////////////////////////////////////////////////////////////////
+
+Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
+ // TODO(nhasabni) Add check for type of node similar to CheckForNodeRewrite
+ // once we support BiasAddGrad as Mkl layer.
+
+ // Search for all matching mergeinfo.
+ // We allow more than one match for extensibility.
+  std::vector<const MergeInfo*> matching_mi;
+ for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
+ if (a->type_string() == mi->succ) {
+ matching_mi.push_back(&*mi);
+ }
+ }
+
+ for (const MergeInfo* mi : matching_mi) {
+ const int N_in = a->num_inputs();
+ if (mi->op >= N_in) {
+ continue;
+ }
+
+ // Get the control edges and input of node
+    gtl::InlinedVector<Node*, 4> a_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
+ FillInputs(a, &a_control_edges, &a_in);
+
+ // Get operand op of the operator
+ Node* b = nullptr;
+ b = a_in[mi->op].first;
+ if (b == nullptr || (b->type_string() != mi->pred)) {
+ // NOTE: Should the first check be assert?
+ continue;
+ }
+
+    gtl::InlinedVector<Node*, 4> b_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
+ FillInputs(b, &b_control_edges, &b_in);
+
+ // Shouldn't merge if a and b have different control edges.
+ if (a_control_edges != b_control_edges) {
+ continue;
+ } else {
+ // We found a match.
+ return b;
+ }
+ }
+
+ return nullptr;
+}
+
+Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
+ Node* pred) {
+ CHECK_NOTNULL(succ);
+ CHECK_NOTNULL(pred);
+
+ if (succ->type_string() == csinfo_.biasadd &&
+ pred->type_string() == csinfo_.mklconv2d) {
+ // 1. Get all attributes from input nodes.
+ DataType T_pred, T_succ;
+ string padding;
+    std::vector<int32> strides;
+    string data_format_pred, data_format_succ;
+    bool use_cudnn_on_gpu;
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+ TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
+ TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+ TF_CHECK_OK(
+        GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+    // We check to ensure that the data formats of both succ and pred are the
+    // same. We expect them to be the same, so we could enforce this with an
+    // assert. But an assert can be too strict, so we enforce this as a check.
+    // If the check fails, then we do not merge the two nodes.
+    // We also do the same check for devices.
+ if (data_format_pred != data_format_succ || T_pred != T_succ ||
+ pred->assigned_device_name() != succ->assigned_device_name() ||
+ pred->def().device() != succ->def().device()) {
+ return Status(error::Code::INVALID_ARGUMENT,
+ "data_format or T attribute or devices of Conv2D and "
+ "BiasAdd do not match. Will skip node merge optimization");
+ }
+
+ const int succ_num = succ->num_inputs();
+    gtl::InlinedVector<Node*, 4> succ_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+ FillInputs(succ, &succ_control_edges, &succ_in);
+
+ const int pred_num = pred->num_inputs();
+    gtl::InlinedVector<Node*, 4> pred_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+ FillInputs(pred, &pred_control_edges, &pred_in);
+
+    // We need to ensure that there is only 1 edge between Conv2D and BiasAdd.
+    // Otherwise, merging is semantically incorrect.
+ if (pred->out_edges().size() != 1) {
+ return Status(error::Code::INVALID_ARGUMENT,
+                    "Conv2D has multiple outputs. "
+ "Will skip node merge optimization");
+ }
+
+ for (const Edge* e : pred->out_edges()) {
+ if (e->dst() != succ) {
+ return Status(error::Code::INVALID_ARGUMENT,
+                      "Conv2D does not feed to BiasAdd. "
+ "Will skip node merge optimization");
+ }
+ }
+
+    // 2. Get inputs from both the nodes.
+    // Find the 2 inputs of Conv2D and the bias input of BiasAdd.
+    // Get operands 0 and 1 of Conv2D and their Mkl tensors.
+ CHECK_EQ(pred->in_edges().size(), 4); // MklConv2D must have 4 inputs.
+ // Get operand 1 of add_bias
+ // BiasAdd must have 2 inputs: Conv, bias
+ CHECK_EQ(succ->in_edges().size(), 2);
+ Node* oper3_mkl = nullptr; // Mkl tensor corresponding to oper3
+ int oper3_mkl_slot = 0; // For dummy MKL tensor node, output slot is 0.
+ GetDummyMklTensorNode(g, &oper3_mkl, succ); // Get dummy Mkl tensor node
+ // as BiasAdd does not have Mkl tensor as input.
+ CHECK_NOTNULL(oper3_mkl);
+
+ // We will use the node name of BiasAdd as the name of new node
+ // Build new node. We use same name as original node, but change the op
+ // name.
+ NodeBuilder nb(succ->name(), csinfo_.mklconv2dwithbias);
+ nb.Input(pred_in[0].first, pred_in[0].second); // In1 of Conv2D
+ nb.Input(pred_in[1].first, pred_in[1].second); // Mkl for In1
+ nb.Input(pred_in[2].first, pred_in[2].second); // In2 of Conv2D
+ nb.Input(pred_in[3].first, pred_in[3].second); // Mkl for In2
+ nb.Input(succ_in[1].first, succ_in[1].second); // In2 of BiasAdd
+ nb.Input(oper3_mkl, oper3_mkl_slot); // Mkl for In2 of BiasAdd
+
+ // Copy attributes from Conv2D to Conv2DWithBias.
+    CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
+
+ // Copy the device assigned to old node to new node.
+ nb.Device(succ->def().device());
+
+ // Create node.
+ Node* newn;
+ nb.Finalize(&**g, &newn);
+ CHECK_NOTNULL(newn);
+
+ // Set the Mkl layer label for this op.
+ newn->AddAttr("_kernel", mkl_layer_registry::kMklLayerLabel);
+
+ // Incoming edges are fixed, we will fix the outgoing edges now.
+ for (const Edge* e : succ->out_edges()) {
+ (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+ }
+
+ // Copy device assigned to old node to new node.
+ // It's ok to use pred or succ as we have enforced a check that
+ // both have same device assigned.
+ newn->set_assigned_device_name(pred->assigned_device_name());
+
+ VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+ << ", and node: " << succ->DebugString()
+ << ", into node:" << newn->DebugString();
+
+ (*g)->RemoveNode(succ);
+ (*g)->RemoveNode(pred);
+ MarkRewrittenNode(newn);
+
+ return Status::OK();
+ }
+
+ return Status(error::Code::UNIMPLEMENTED,
+ "Unimplemented case for node merge optimization.");
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Helper functions for node rewrite
+//////////////////////////////////////////////////////////////////////////
+
Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node* orign,
- const NodesInfo& ni) {
- VLOG(1) << "MKLLayoutRewritePass: Original node:" << orign->DebugString();
+ const RewriteInfo* ri) {
+ CHECK_NOTNULL(ri);
+ CHECK_NOTNULL(orign);
+
+ VLOG(1) << "MklLayoutRewritePass: Original node:" << orign->DebugString();
+
+ // Check if this is scenario 2 (context-based rewrite).
+ // Get the matching ContextInfo if it is.
+ const Node* fwdn = nullptr;
+ const ContextInfo* ci = nullptr;
+ bool is_context_based_rewrite = false;
+ if ((ci = SearchMatchingContext(orign, &fwdn)) != nullptr) {
+ CHECK_NOTNULL(fwdn);
+ is_context_based_rewrite = true;
+
+ // Sanity checks for context-based rewrite (if any)
+ if (orign->type_string() == csinfo_.biasaddgrad &&
+ ri->newname == csinfo_.mklconv2dwithbiasbackpropbias) {
+ DataType orig_T, ctx_T;
+ string orig_data_format, ctx_data_format;
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &orig_T));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &orig_data_format));
+ TF_CHECK_OK(GetNodeAttr(fwdn->def(), "T", &ctx_T));
+ TF_CHECK_OK(GetNodeAttr(fwdn->def(), "data_format", &ctx_data_format));
+
+ if (orig_data_format != ctx_data_format || orig_T != ctx_T ||
+ orign->assigned_device_name() != fwdn->assigned_device_name() ||
+ orign->def().device() != fwdn->def().device()) {
+ return Status(
+ error::Code::INVALID_ARGUMENT,
+ "data_format or T attribute or devices of BiasAddGrad and "
+ "Conv2D do not match. Will skip node rewrite optimization");
+ }
+ }
+ }
// Get all inputs.
const int num = orign->num_inputs();
- CHECK_EQ(num, ni.numins);
+ CHECK_EQ(num, ri->numins);
  gtl::InlinedVector<Node*, 4> control_edges;
  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num);
FillInputs(orign, &control_edges, &inputs);
// Build new node. We use same name as original node, but change the op name.
- NodeBuilder nb(orign->name().c_str(), ni.newname.c_str());
+ NodeBuilder nb(orign->name().c_str(), ri->newname.c_str());
// Copy user-specified device assigned to original node to new node.
nb.Device(orign->def().device());
// Set up new inputs to the rewritten node.
@@ -453,20 +1064,48 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, Node* orign,
if (s != Status::OK()) {
return s;
}
- // Copy attributes from original node to new node.
- ni.copyattrs(orign, &nb);
+
+ // Copy attributes from original node to new node (for scenario 1).
+ // For context-based rewrite, we use context to copy the attributes.
+ if (is_context_based_rewrite) {
+ if (orign->type_string() == csinfo_.biasaddgrad &&
+ ri->newname == csinfo_.mklconv2dwithbiasbackpropbias) {
+ CHECK_NOTNULL(fwdn);
+ ri->copyattrs(fwdn, &nb);
+ } else {
+ return Status(error::Code::UNIMPLEMENTED,
+ "Unimplemented case for node rewrite optimization.");
+ }
+ } else {
+    ri->copyattrs(const_cast<const Node*>(orign), &nb);
+ }
// Set the Mkl layer label for this op.
nb.Attr("_kernel", mkl_layer_registry::kMklLayerLabel);
- Node* newn = nullptr;
+
+ // Add workspace edge to this node if needed.
+ // We add workspace edge only for MaxPool, LRN and BatchNorm.
+ AddWorkSpaceEdgeIfNeeded(g, orign, &nb);
// Finalize graph and get new node.
+ Node* newn = nullptr;
TF_CHECK_OK(nb.Finalize(&**g, &newn));
CHECK_NOTNULL(newn);
// Incoming edges from 'orign' node to new 'newn' node are already copied
// in BuildNode. Copy outgoing edges from 'orign' node to new 'newn' node.
+  // Since the output also follows the same ordering among Tensorflow and Mkl
+  // tensors, we need to connect Tensorflow tensors appropriately.
+  // Specifically, the nth output of the original node will become the 2*nth
+  // output of the Mkl node. GetTensorDataIndex provides this mapping function.
for (const Edge* e : orign->out_edges()) {
- (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+ // We need to handle control-edges by using their original slot number.
+ // Generally, -1 is reserved for control slot.
+ if (e->src_output() < 0) {
+ (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+ } else {
+ (*g)->AddEdge(newn, GetTensorDataIndex(e->src_output()), e->dst(),
+ e->dst_input());
+ }
}
// Copy the runtime device assigned from original code to new node.
@@ -476,10 +1115,123 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, Node* orign,
(*g)->RemoveNode(orign);
MarkRewrittenNode(newn);
- VLOG(1) << "MKLLayoutRewritePass: New node:" << newn->DebugString();
+ VLOG(1) << "MklLayoutRewritePass: New node:" << newn->DebugString();
return Status::OK();
}
+const MklLayoutRewritePass::ContextInfo*
+MklLayoutRewritePass::SearchMatchingContext(const Node* n, const Node** fwdn) {
+ CHECK_NOTNULL(n);
+ CHECK_NOTNULL(fwdn);
+ *fwdn = nullptr;
+
+ // Search for matching contextinfo based on node name.
+ // There could be more than one matching contextinfos.
+ bool is_matching_cinfo_found = false;
+  std::vector<const ContextInfo*> mci;
+ for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
+ if (n->type_string() == ci->node) {
+ mci.push_back(&*ci);
+ is_matching_cinfo_found = true;
+ }
+ }
+ // If no matching contextinfo is found, return immediately.
+ if (!is_matching_cinfo_found) {
+ return nullptr;
+ }
+
+ VLOG(1) << "MklLayoutRewritePass: Searching graph for: " << n->type_string()
+ << " in backwards.";
+
+ // Now we will check for forward op name for context info in data
+ // flow graph. Get the max hops we should search for the fwd node.
+ // We are now going to search (breadth-first) backwards in data
+ // dependence graph (for up to max hops) from n for the node
+ // specified in fwd.
+ // queue to maintain nodes to be visited and depth info for
+ // breadth-first search
+  std::queue<std::pair<const Node*, size_t>> nqueue;
+ const Node* curr_node = n;
+ size_t curr_depth = 0;
+ nqueue.push(std::make_pair(curr_node, curr_depth));
+
+ while (curr_depth < kNodeMergeContextMaxDepth && !nqueue.empty()) {
+    std::pair<const Node*, size_t> curr_pair = nqueue.front();
+ nqueue.pop();
+
+    std::set<const Node*> visited_nodes;
+ curr_node = curr_pair.first;
+ curr_depth = curr_pair.second;
+ CHECK_NOTNULL(curr_node);
+
+ VLOG(1) << "MklLayoutRewritePass: Visiting node: "
+ << curr_node->type_string() << " at depth: " << curr_depth
+ << " for node: " << n->type_string();
+
+ // If we find a match, we return immediately.
+ for (const ContextInfo* ci : mci) {
+ if (curr_node->type_string() == ci->fwd) {
+ *fwdn = curr_node;
+ return ci;
+ }
+ }
+
+ // Else we explore backward edges from current node.
+ // Add the source nodes of all incoming edges of the node to the queue.
+ for (const Edge* e : curr_node->in_edges()) {
+ // We do not visit already visited node.
+ if (visited_nodes.find(e->src()) == visited_nodes.end()) {
+ // Depth of these nodes is 1 more than the depth of current node.
+ nqueue.push(std::make_pair(e->src(), curr_depth + 1));
+ visited_nodes.insert(e->src());
+ }
+ }
+ } /* while */
+
+ return nullptr;
+}
+
+bool MklLayoutRewritePass::ContextMatchRewrite(const Node* n) {
+ const Node* fwdn = nullptr;
+ return SearchMatchingContext(n, &fwdn) != nullptr;
+}
+
+const MklLayoutRewritePass::RewriteInfo*
+MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
+ CHECK_NOTNULL(n);
+
+ // First check if node along with its type is supported by MKL layer.
+ // We do not want to rewrite an op into Mkl op if types are not supported.
+ // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
+ // MklRelu if type is INT32.
+ DataType T;
+ if (!GetNodeAttr(n->def(), "T", &T).ok()) {
+ return nullptr;
+ }
+ if (!mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string()), T)) {
+ return nullptr;
+ }
+
+  // We support 2 types of node rewrites:
+  // 1. Rewriting BiasAddGrad depending on its context.
+  // 2. Rewriting an op to an Mkl op always.
+  // We return the matching RewriteInfo if either condition is met.
+
+ // Find matching RewriteInfo and then check that rewrite rule applies.
+ for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
+ if (n->type_string().compare(ri->name) == 0 && ri->rewriterule(n)) {
+ return &*ri;
+ }
+ }
+
+ // Else return not found.
+ return nullptr;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Run function for the pass
+///////////////////////////////////////////////////////////////////////////////
+
bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
bool result = false;
CHECK_NOTNULL(g);
@@ -494,40 +1246,46 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr* g) {
continue;
}
- for (const NodesInfo& ni : ninfo_) {
- DataType dtype = DT_INVALID;
- // An op needs to have data type (T) attribute and its corresponding
- // Mkl op name must be supported.
- if (GetNodeAttr(n->def(), "T", &dtype) == Status::OK() &&
- mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string())) &&
- n->type_string().compare(ni.name) == 0) {
- string node_name = n->name();
- string op_name = n->type_string();
-
- VLOG(1) << "MKLLayoutRewritePass: Scheduled node " << node_name
- << " with op " << op_name << " for rewrite using"
- << " layout optimization.";
-
- if (RewriteNode(g, n, ni) == Status::OK()) {
- VLOG(1) << "MKLLayoutRewritePass: Successfully rewrote node "
- << node_name << " with op " << op_name
- << " for Mkl layout optimization.";
- result = true;
- break; // We found matching nodesinfo so no need to search next.
- }
+ const RewriteInfo* ri = nullptr;
+ Node* predn = nullptr;
+    // We will first check if the node is to be rewritten.
+ if ((ri = CheckForNodeRewrite(n)) != nullptr) {
+ string node_name = n->name();
+ string op_name = n->type_string();
+
+ VLOG(1) << "MklLayoutRewritePass: Scheduled node " << node_name
+ << " with op " << op_name << " for rewrite using"
+ << " layout optimization.";
+
+ if (RewriteNode(g, n, ri) == Status::OK()) {
+ VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
+ << " with op " << op_name << " for Mkl layout optimization.";
+ result = true;
+ }
+ } else if ((predn = CheckForNodeMerge(n)) != nullptr) {
+ // Otherwise, we will check if the node is to be merged.
+ string n1_name = n->name();
+ string n2_name = predn->name();
+
+ VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
+ << n2_name << " for merging";
+
+ if (MergeNode(g, n, predn) == Status::OK()) {
+ VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
+ << n2_name;
+ result = true;
}
}
}
DumpGraph("After running MklLayoutRewritePass", &**g);
+ // Clear marked nodes as the same graph pass may be used multiple times.
+ UnMarkRewrittenNodes();
+
return result;
}
-///////////////////////////////////////////////////////////////////////////////
-// Run function for the pass
-///////////////////////////////////////////////////////////////////////////////
-
bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
return MklLayoutRewritePass().RunPass(g);
}
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 10671ee2e9612d..142d60d61128e3 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -18,7 +18,10 @@ limitations under the License.
#include "tensorflow/core/graph/mkl_layout_pass.h"
#include "tensorflow/core/util/mkl_util.h"
+#include
+#include
#include
+
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
@@ -107,10 +110,345 @@ class MklLayoutPassTest : public ::testing::Test {
};
REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
+REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
+REGISTER_OP("MklInput2").Output("o: uint8").Output("o1: uint8").SetIsStateful();
+
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to node merge optimization
+/////////////////////////////////////////////////////////////////////
+
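+// Sanity check: a graph with no Mkl-eligible ops (Mul only) is left
+// unchanged by the pass.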
+TEST_F(MklLayoutPassTest, Basic) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }"
+ "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Mul);D(Mul)|"
+ "A->C;A->D;B->C:1;B->D:1");
+}
+
+// Test set 1: Conv2D + BiasAdd
+
+// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'Y' op: 'Input'}"
+ "node { name: 'Z' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'Y']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
+ "M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
+ "DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1");
+}
+
+// C=MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Sub(E,Y)
+// Test for correct output slots selected
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput2'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput2'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M:1', 'B', 'N:1']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'Y' op: 'Input'}"
+ "node { name: 'Z' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'Y']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
+ "M(MklInput2);N(MklInput2);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
+ "DMT/_0->E:5;E->Z;M:1->E:1;N:1->E:3;Y->Z:1");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
+// This is a case of node rewrite followed by node merge.
+// We will first rewrite Conv2D to MklConv2D, and then merge MklConv2D
+// with BiasAdd to produce MklConv2DWithBias.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'Y' op: 'Input'}"
+ "node { name: 'Z' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'Y']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);E(MklConv2DWithBias);Y(Input);Z(Sub)|"
+ "A->E;B->E:2;D->E:4;DMT/_0->E:1;DMT/_1->E:3;DMT/_2->E:5;"
+ "E->Z;Y->Z:1");
+}
+
+// Graph contains only MklConv2D, no BiasAdd.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|"
+ "A->C;B->C:2;M->C:1;N->C:3");
+}
+
+// MklConv2D output does not go to BiasAdd.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D', 'E'] }"); // Output of MklConv2D does not go to BiasAdd.
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
+ "M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3");
+}
+
+// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
+// Merge should not be done in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D', 'E'] }" // Conv2D has two outputs.
+ // No merge should happen.
+ "node { name: 'G' op: 'Add'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'E'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
+ "G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;"
+ "E->F:1;E->G:1;M->C:1;N->C:3");
+}
+
+// data_format attribute value mismatch. Merge should not be done
+// in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHCW' } }"
+ " input: ['C', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);"
+ "N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3");
+}
+
+// MklConv2D is in the context of BiasAddGrad, but MklConv2DWithBias is not,
+// so no rewrite of BiasAddGrad should happen.
+// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);D(Sub);E(BiasAddGrad);"
+ "M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;"
+ "M->C:1;N->C:3");
+}
+
+// No Conv2D in the context for BiasAddGrad. No rewrite should happen.
+// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Add'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// No Conv2D in the context for BiasAddGrad, but MatMul in context.
+// Rewrite should happen, but name of BiasAddGrad does not change.
+// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'MatMul'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'transpose_a' value { b: false } }"
+ " attr { key: 'transpose_b' value { b: false } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
+// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'MatMul'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'transpose_a' value { b: false } }"
+ " attr { key: 'transpose_b' value { b: false } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// No MatMul in the context for BiasAddGrad. No rewrite should happen.
+// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Add'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
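The merge tests above all hinge on one question: is the matching forward op (MklConv2DWithBias or MatMul) reachable in the context of the BiasAddGrad node? The snippet below is a minimal, self-contained sketch of such a reachability check; Node, Graph, and InForwardContext are illustration-only names and are not the types used by the actual MklLayoutRewritePass.

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Node {
      std::string name;
      std::string op;
      std::vector<std::string> inputs;  // names of producer nodes
    };

    using Graph = std::unordered_map<std::string, Node>;

    // Walks the producers of 'start' and reports whether any of them has the
    // given op type, i.e. whether 'op' is in the context of 'start'.
    bool InForwardContext(const Graph& g, const std::string& start,
                          const std::string& op) {
      std::vector<std::string> frontier = {start};
      while (!frontier.empty()) {
        std::string n = frontier.back();
        frontier.pop_back();
        auto it = g.find(n);
        if (it == g.end()) continue;
        if (it->second.op == op) return true;
        for (const std::string& in : it->second.inputs) frontier.push_back(in);
      }
      return false;
    }

    int main() {
      // C=MklConv2D(A,B); D=Sub(C,A); E=BiasAddGrad(D).
      Graph g;
      g["A"] = {"A", "Input", {}};
      g["B"] = {"B", "Input", {}};
      g["C"] = {"C", "MklConv2D", {"A", "B"}};
      g["D"] = {"D", "Sub", {"C", "A"}};
      g["E"] = {"E", "BiasAddGrad", {"D"}};
      std::cout << "MklConv2D in context of E: "
                << InForwardContext(g, "E", "MklConv2D") << "\n";  // 1
      std::cout << "MatMul in context of E: "
                << InForwardContext(g, "E", "MatMul") << "\n";     // 0
      return 0;
    }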
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to rewriting node to Mkl node
+/////////////////////////////////////////////////////////////////////
// Single Conv2D Op; No Mkl layer on the input and on the output.
// We will generate dummy Mkl tensor as 2nd input of Conv2D.
-TEST_F(MklLayoutPassTest, Conv2D_Basic) {
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
@@ -130,7 +468,7 @@ TEST_F(MklLayoutPassTest, Conv2D_Basic) {
// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
// have 2 outputs, both of which will be inputs to next Conv2D.
-TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
@@ -156,6 +494,104 @@ TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
"C:1->D:3;D->E:1;DMT/_0->C:1;DMT/_1->C:3;DMT/_2->D:1");
}
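Both rewrite tests above show the input convention of a rewritten node: each original data input is followed by an Mkl-metadata input, which is a dummy DMT/_* constant whenever the producer is not an Mkl op (A->C, DMT/_0->C:1, B->C:2, DMT/_1->C:3 in the expected strings). A sketch of building such an interleaved input list, using a hypothetical InterleaveWithMetadata helper:

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical helper: given the original data inputs and, for each one,
    // either the producer's Mkl-metadata output name or "" when the producer
    // is not an Mkl op, build the interleaved input list of the rewritten node.
    std::vector<std::string> InterleaveWithMetadata(
        const std::vector<std::string>& data_inputs,
        const std::vector<std::string>& mkl_metadata) {
      std::vector<std::string> result;
      int next_dummy = 0;
      for (size_t i = 0; i < data_inputs.size(); ++i) {
        result.push_back(data_inputs[i]);
        if (i < mkl_metadata.size() && !mkl_metadata[i].empty()) {
          result.push_back(mkl_metadata[i]);  // real Mkl metadata
        } else {
          result.push_back("DMT/_" + std::to_string(next_dummy++));  // dummy
        }
      }
      return result;
    }

    int main() {
      // Second Conv2D in NodeRewrite_Conv2D_Positive1: input A comes from a
      // plain Input, input C comes from the already-rewritten first MklConv2D.
      for (const std::string& in :
           InterleaveWithMetadata({"A", "C"}, {"", "C:1"})) {
        std::cout << in << " ";
      }
      std::cout << "\n";  // prints: A DMT/_0 C C:1
      return 0;
    }

In the test above the dummy is named DMT/_2 rather than DMT/_0 only because the pass had already created two dummies for the first Conv2D; the interleaving itself is the same.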
+// Conv2D with an unsupported type (DT_HALF); no rewrite should happen.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
+ InitGraph(
+ "node { name: 'A' op: 'HalfInput'}"
+ "node { name: 'B' op: 'HalfInput'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_HALF } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_HALF } }"
+ " input: ['B', 'C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(HalfInput);B(HalfInput);C(Conv2D);D(Mul)|"
+ "A->C;B->C:1;B->D;C->D:1");
+}
+
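This negative test relies on the rewrite being gated on the element type as well as the op name, the same idea behind the DataType argument added to IsMklSupportedOp in mkl_tfconversion_pass.cc further down in this patch. A rough sketch of a type-aware registry follows; the MklLayerRegistry class here is hypothetical and is not the real mkl_layer_registry API.

    #include <iostream>
    #include <set>
    #include <string>
    #include <utility>

    enum DataType { DT_FLOAT, DT_HALF, DT_INT32 };

    // Hypothetical stand-in for a registry keyed by (op name, dtype).
    class MklLayerRegistry {
     public:
      void Register(const std::string& op, DataType t) {
        layers_.insert({op, t});
      }
      bool IsMklLayer(const std::string& op, DataType t) const {
        return layers_.count({op, t}) > 0;
      }

     private:
      std::set<std::pair<std::string, DataType>> layers_;
    };

    int main() {
      MklLayerRegistry registry;
      registry.Register("Conv2D", DT_FLOAT);  // only float Conv2D has an Mkl kernel
      std::cout << registry.IsMklLayer("Conv2D", DT_FLOAT) << "\n";  // 1: rewritten
      std::cout << registry.IsMklLayer("Conv2D", DT_HALF) << "\n";   // 0: left as-is
      return 0;
    }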
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to rewriting node for workspace edges
+/////////////////////////////////////////////////////////////////////
+
+/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'MaxPoolGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['C', 'B', 'D'] }"
+ "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'E'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);E(MklMaxPoolGrad);F(Mul)|"
+ "A->B;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;C->E;C->F;D->E:4;"
+ "DMT/_0->B:1;DMT/_1->E:1;DMT/_2->E:5;E->F:1");
+}
+
+// Test MaxPool->MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite the MaxPool node, but no workspace edges
+// will be added.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MklMaxPool);C(Mul);DMT/_0(Const)|"
+ "A->B;A->C;B->C:1;DMT/_0->B:1");
+}
+
+// Test MaxPool->MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite the MaxPoolGrad node and generate dummy
+// tensors for the workspace tensor and its Mkl part.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'MaxPoolGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A', 'B', 'C'] }"
+ "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(MklMaxPoolGrad);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
+ "A->D;A->E;B->D:2;C->D:4;D->E:1;DMT/_0->D:1;DMT/_1->D:3;"
+ "DMT/_2->D:5;DMT/_3->D:6;DMT/_4->D:7");
+}
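The three workspace tests reduce to one wiring decision, visible in the expected-graph strings: when the matching counterpart exists, the rewritten MklMaxPool feeds its workspace output and metadata (output slots 2 and 3) into MklMaxPoolGrad input slots 6 and 7; when it does not, dummy DMT/_* constants are wired into those slots instead. A compact sketch of that decision, using hypothetical Edge and WireWorkspace names:

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical illustration types; not the classes used by the real pass.
    struct Edge { std::string src; std::string dst; int dst_slot; };

    // Wires the workspace inputs (slots 6 and 7) of an MklMaxPoolGrad node,
    // either from the matching MklMaxPool or from dummy Const nodes.
    std::vector<Edge> WireWorkspace(bool have_matching_maxpool) {
      if (have_matching_maxpool) {
        return {{"MklMaxPool:2", "MklMaxPoolGrad", 6},   // workspace tensor
                {"MklMaxPool:3", "MklMaxPoolGrad", 7}};  // its Mkl metadata
      }
      return {{"DMT/_0", "MklMaxPoolGrad", 6},           // dummy workspace
              {"DMT/_1", "MklMaxPoolGrad", 7}};          // dummy Mkl metadata
    }

    int main() {
      for (bool have_fwd : {true, false}) {
        std::cout << (have_fwd ? "with MaxPool:" : "without MaxPool:") << "\n";
        for (const Edge& e : WireWorkspace(have_fwd)) {
          std::cout << "  " << e.src << " -> " << e.dst << ":" << e.dst_slot
                    << "\n";
        }
      }
      return 0;
    }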
+
+/////////////////////////////////////////////////////////////////////
+
static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
testing::StopTiming();
string s;
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 8c3adad6f0b8c0..7c3836b30892a0 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -81,9 +81,10 @@ class MklToTfConversionPass : public GraphOptimizationPass {
// Is the input Op supported by Mkl-specific layout?
//
// @input op_name string of the op
+ // @input T Datatype to use for checking input op
// @return true if op is Mkl supported; false, otherwise.
- inline bool IsMklSupportedOp(const string& op_name) const {
- return mkl_layer_registry::IsMklLayer(op_name);
+ inline bool IsMklSupportedOp(const string& op_name, DataType T) const {
+ return mkl_layer_registry::IsMklLayer(op_name, T);
}
// Insert layout conversion node on the edge pointed by 'e' from graph 'g'.
@@ -188,6 +189,13 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
continue;
}
+ // We skip adding MklToTf on an edge between X->MklToTf or
+ // MklToTf->X, where X is any layer.
+ if (src->type_string().compare("MklToTf") == 0 ||
+ dst->type_string().compare("MklToTf") == 0) {
+ continue;
+ }
+
VLOG(1) << "MklToTfConversionPass: InsertConversionNodes: "
<< src->type_string() << " and " << dst->type_string();
@@ -202,8 +210,9 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
GetNodeAttr(dst->def(), "T", &dst_datatype);
// Check if src is Mkl-compliant, while dst is not Mkl-compliant.
- if (IsMklSupportedOp(src->type_string()) &&
- !IsMklSupportedOp(dst->type_string())) {
+
+ if (IsMklSupportedOp(src->type_string(), src_datatype) &&
+ !IsMklSupportedOp(dst->type_string(), dst_datatype)) {
VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name()
<< " and " << dst->name() << " for inserting conversion nodes";
candidate_edges.push_back(const_cast<Edge*>(e));
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
index 0a63cf6ddbfa23..7d9237f845432f 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
@@ -17,7 +17,10 @@ limitations under the License.
#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+#include
+#include
#include
+
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
@@ -146,31 +149,34 @@ TEST_F(MklToTfConversionPass, Positive) {
"C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
}
-// MklConv2D followed by Non-Mkl layer, and MklConv2D uses half type
-// C=MklConv2D(A,M,B,N); E=Sub(C,D)
-// MklToTf node should be inserted.
-TEST_F(MklToTfConversionPass, Positive_Type) {
+// MklConv2D followed by MklToTf op followed by Non-Mkl layer.
+// C=MklConv2D(A,M,B,N); D=MklToTf(C:0, C:1); F=Sub(D,E)
+// MklToTf node should not be inserted again.
+TEST_F(MklToTfConversionPass, Negative_DoubleInsert) {
InitGraph(
- "node { name: 'A' op: 'HalfInput'}"
+ "node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
- "node { name: 'B' op: 'HalfInput'}"
+ "node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
- " attr { key: 'T' value { type: DT_HALF } }"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
- "node { name: 'D' op: 'HalfInput'}"
- "node { name: 'E' op: 'Sub'"
- " attr {key: 'T' value { type: DT_HALF } }"
- " input: ['C', 'D']}");
+ "node { name: 'D' op: 'MklToTf'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C:0', 'C:1']}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['D', 'E']}");
EXPECT_EQ(DoRunMklToTfConversionPass(),
- "A(HalfInput);B(HalfInput);C(MklConv2D);D(HalfInput);"
- "E(Sub);M(MklInput);Mkl2Tf/_0(MklToTf);N(MklInput)|"
- "A->C;B->C:2;C->Mkl2Tf/_0;C:1->Mkl2Tf/_0:1;D->E:1;"
- "M->C:1;Mkl2Tf/_0->E;N->C:3");
+ "A(Input);B(Input);C(MklConv2D);D(MklToTf);E(Input);"
+ "F(Sub);M(MklInput);N(MklInput)|"
+ "A->C;B->C:2;C->D;C:1->D:1;D->F;E->F:1;M->C:1;N->C:3");
}
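This test validates the guard added above in mkl_tfconversion_pass.cc: an MklToTf node must not be inserted on an edge whose source or destination is already an MklToTf node. The check itself is a one-line predicate; a standalone sketch with a hypothetical ShouldSkipConversion helper:

    #include <cassert>
    #include <string>

    // Sketch of the guard: never insert a conversion node on an edge that
    // already touches an MklToTf node.
    bool ShouldSkipConversion(const std::string& src_op,
                              const std::string& dst_op) {
      return src_op == "MklToTf" || dst_op == "MklToTf";
    }

    int main() {
      assert(ShouldSkipConversion("MklConv2D", "MklToTf"));  // edge into MklToTf
      assert(ShouldSkipConversion("MklToTf", "Sub"));        // edge out of MklToTf
      assert(!ShouldSkipConversion("MklConv2D", "Sub"));     // normal edge
      return 0;
    }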
// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 9f516efd71beec..9c47d520d96b3b 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -27,6 +27,7 @@ load(
"tf_copts",
"tf_opts_nortti_if_android",
"tf_kernel_library",
+ "tf_mkl_kernel_library",
"cc_header_only_library",
)
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
@@ -2241,6 +2242,12 @@ tf_kernel_library(
tf_kernel_library(
name = "matmul_op",
+ srcs = [
+ "matmul_op.cc",
+ ] + if_mkl([
+ "mkl_matmul_op.cc",
+ ]),
+ hdrs = ["matmul_op.h"],
defines = select({
":xsmm": [
"TENSORFLOW_USE_LIBXSMM",
@@ -2248,13 +2255,14 @@ tf_kernel_library(
],
"//conditions:default": [],
}),
- prefix = "matmul_op",
deps = MATH_DEPS + select({
":xsmm": [
"@libxsmm_archive//:xsmm_avx",
],
"//conditions:default": [],
- }),
+ }) + if_mkl([
+ "//third_party/mkl:intel_binary_blob",
+ ]),
)
tf_kernel_library(
@@ -2770,6 +2778,7 @@ tf_kernel_library(
"cudnn_pooling_gpu.h",
"fractional_pool_common.h",
"maxpooling_op.h",
+ "pooling_ops_3d.h",
"pooling_ops_common.h",
],
gpu_srcs = [
@@ -2780,6 +2789,8 @@ tf_kernel_library(
"maxpooling_op_gpu.h",
"pooling_ops_common.h",
"pooling_ops_common_gpu.h",
+ "pooling_ops_3d_gpu.h",
+ "pooling_ops_3d_gpu.cu.cc",
],
deps = [
":conv_2d",
@@ -4468,49 +4479,69 @@ tf_cc_test(
],
)
-if_mkl(
- tf_kernel_library(
- name = "mkl_matmul_op",
- prefix = "mkl_matmul",
- deps = [
- ":math",
- "//third_party/mkl:intel_binary_blob",
- ],
- ),
+tf_mkl_kernel_library(
+ name = "mkl_conv_op",
+ prefix = "mkl_conv",
+ deps = [
+ ":bounds_check",
+ ":conv_ops",
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
)
-if_mkl(
- tf_kernel_library(
- name = "mkl_conv_op",
- prefix = "mkl_conv",
- deps = [
- ":bounds_check",
- ":ops_util",
- "//tensorflow/core:core_cpu",
- "//tensorflow/core:framework",
- "//tensorflow/core:lib",
- "//tensorflow/core:lib_internal",
- "//tensorflow/core:nn_ops_op_lib",
- "//third_party/mkl:intel_binary_blob",
- ],
- ),
+tf_mkl_kernel_library(
+ name = "mkl_tfconv_op",
+ prefix = "mkl_tfconv",
+ deps = [
+ ":bounds_check",
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
)
-if_mkl(
- tf_kernel_library(
- name = "mkl_tfconv_op",
- prefix = "mkl_tfconv",
- deps = [
- ":bounds_check",
- ":ops_util",
- "//tensorflow/core:core_cpu",
- "//tensorflow/core:framework",
- "//tensorflow/core:lib",
- "//tensorflow/core:lib_internal",
- "//tensorflow/core:nn_ops_op_lib",
- "//third_party/mkl:intel_binary_blob",
- ],
- ),
+tf_mkl_kernel_library(
+ name = "mkl_pooling_ops",
+ srcs = [
+ "mkl_avgpooling_op.cc",
+ "mkl_maxpooling_op.cc",
+ "mkl_pooling_ops_common.cc",
+ ],
+ hdrs = ["mkl_pooling_ops_common.h"],
+ deps = [
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
+)
+
+tf_mkl_kernel_library(
+ name = "mkl_relu_op",
+ prefix = "mkl_relu",
+ deps = [
+ ":bounds_check",
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
)
# -----------------------------------------------------------------------------
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 2e385f2c55b2c7..f88862bfeb9202 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -30,6 +30,9 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
+#ifdef TENSORFLOW_USE_LIBXSMM
+#include "tensorflow/core/kernels/xsmm_conv2d.h"
+#endif
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
@@ -88,6 +91,75 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_LIBXSMM
+template
+struct LaunchXsmmBackwardFilter {
+ bool operator()(OpKernelContext* context, const Device& d,
+ typename TTypes::ConstTensor input_backward,
+ typename TTypes::Tensor kernel,
+ typename TTypes::ConstTensor output_backward,
+ int input_rows, int input_cols, int row_stride,
+ int col_stride, int pad_h, int pad_w,
+ TensorFormat data_format) const {
+ return false;
+ }
+};
+
+template <>
+struct LaunchXsmmBackwardFilter {
+ bool operator()(OpKernelContext* context, const CPUDevice& d,
+ typename TTypes::ConstTensor input,
+ typename TTypes::Tensor filter,
+ typename TTypes::ConstTensor output, int input_rows,
+ int input_cols, int row_stride, int col_stride, int pad_h,
+ int pad_w, TensorFormat data_format) const {
+ auto batch = input.dimension(0);
+ auto in_depth = input.dimension(3);
+ auto out_depth = output.dimension(3);
+ auto filter_rows = filter.dimension(0);
+ auto filter_cols = filter.dimension(1);
+
+ auto num_threads =
+ context->device()->tensorflow_cpu_worker_threads()->num_threads;
+ // See libxsmm_dnn.h for this struct definition.
+ libxsmm_dnn_conv_desc desc;
+ desc.N = batch;
+ desc.C = in_depth;
+ desc.H = input_rows;
+ desc.W = input_cols;
+ desc.K = out_depth;
+ desc.R = filter_rows;
+ desc.S = filter_cols;
+ desc.u = row_stride;
+ desc.v = col_stride;
+ desc.pad_h = pad_h;
+ desc.pad_w = pad_w;
+ desc.pad_h_in = 0; // pad_rows; // ignored by libxsmm for now.
+ desc.pad_w_in = 0; // pad_cols; // ignored by libxsmm for now.
+ desc.pad_h_out = 0;
+ desc.pad_w_out = 0;
+ desc.threads = num_threads;
+ desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
+ desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+ desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
+ desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
+ desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+ desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
+
+ if (!CanUseXsmmConv2D(desc, data_format)) {
+ return false;
+ }
+
+ auto input_ptr = input.data();
+ auto filter_ptr = filter.data();
+ auto output_ptr = output.data();
+ bool success = functor::XsmmBkwFilterConv2D()(
+ context, desc, input_ptr, filter_ptr, output_ptr);
+ return success;
+ }
+};
+#endif
+
template
class Conv2DFastBackpropFilterOp : public OpKernel {
public:
@@ -135,6 +207,36 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_output(0, filter_shape, &filter_backprop));
+#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+
+ int64 pad_top, pad_bottom;
+ int64 pad_left, pad_right;
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+ dims.spatial_dims[0].stride, padding_,
+ &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+ dims.spatial_dims[1].stride, padding_,
+ &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+
+ if (pad_left == pad_right && pad_top == pad_bottom) {
+ if (LaunchXsmmBackwardFilter()(
+ context, context->eigen_device(), input.tensor(),
+ filter_backprop->tensor(), out_backprop.tensor(),
+ dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
+ (int)dims.spatial_dims[0].stride,
+ (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
+ data_format_)) {
+ return;
+ }
+ }
+#endif
+
functor::SpatialConvolutionBackwardKernel()(
context->eigen_device(), filter_backprop->tensor(),
input.tensor(), out_backprop.tensor(),
@@ -213,6 +315,19 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
dims.spatial_dims[1].stride, padding_,
&dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+ if (pad_left == pad_right && pad_top == pad_bottom) {
+ if (LaunchXsmmBackwardFilter()(
+ context, context->eigen_device(), input.tensor(),
+ filter_backprop->tensor(), out_backprop.tensor(),
+ dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
+ (int)dims.spatial_dims[0].stride,
+ (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
+ data_format_)) {
+ return;
+ }
+ }
+#endif
// The total dimension size of each kernel.
const int filter_total_size = dims.spatial_dims[0].filter_size *
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 8bc79bebd9da75..e79c9465cb4081 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -131,7 +131,8 @@ struct LaunchXsmmBackwardInputConvolution {
typename TTypes::ConstTensor kernel,
typename TTypes::ConstTensor output_backward,
int input_rows, int input_cols, int row_stride,
- int col_stride, TensorFormat data_format) const {
+ int col_stride, int pad_h, int pad_w,
+ TensorFormat data_format) const {
return false;
}
};
@@ -143,7 +144,8 @@ struct LaunchXsmmBackwardInputConvolution {
typename TTypes::ConstTensor kernel,
typename TTypes::ConstTensor output_backward,
int input_rows, int input_cols, int row_stride,
- int col_stride, TensorFormat data_format) const {
+ int col_stride, int pad_h, int pad_w,
+ TensorFormat data_format) const {
auto batch = input_backward.dimension(0);
auto in_depth = input_backward.dimension(3);
auto out_depth = output_backward.dimension(3);
@@ -162,10 +164,10 @@ struct LaunchXsmmBackwardInputConvolution {
desc.S = filter_cols;
desc.u = row_stride;
desc.v = col_stride;
- desc.pad_h = 0;
- desc.pad_w = 0;
- desc.pad_h_in = 0; // pad_rows; // ignored by libxsmm for now.
- desc.pad_w_in = 0; // pad_cols; // ignored by libxsmm for now.
+ desc.pad_h = pad_h;
+ desc.pad_w = pad_w;
+ desc.pad_h_in = 0;
+ desc.pad_w_in = 0;
desc.pad_h_out = 0;
desc.pad_w_out = 0;
desc.threads = num_threads;
@@ -174,7 +176,7 @@ struct LaunchXsmmBackwardInputConvolution {
desc.filter_format =
LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; // LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
- desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+ desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
auto input_ptr = input_backward.data();
@@ -236,13 +238,31 @@ class Conv2DFastBackpropInputOp : public OpKernel {
context->allocate_output(0, input_shape, &in_backprop));
#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
- if (LaunchXsmmBackwardInputConvolution()(
- context, context->eigen_device(),
- in_backprop->tensor(), filter.tensor(),
- out_backprop.tensor(), dims.spatial_dims[0].input_size,
- dims.spatial_dims[1].input_size, dims.spatial_dims[0].stride,
- dims.spatial_dims[1].stride, data_format_)) {
- return;
+ int64 pad_top, pad_bottom;
+ int64 pad_left, pad_right;
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+ dims.spatial_dims[0].stride, padding_,
+ &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+ dims.spatial_dims[1].stride, padding_,
+ &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+
+ if (pad_left == pad_right && pad_top == pad_bottom) {
+ if (LaunchXsmmBackwardInputConvolution()(
+ context, context->eigen_device(),
+ in_backprop->tensor(), filter.tensor(),
+ out_backprop.tensor(), dims.spatial_dims[0].input_size,
+ dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
+ (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
+ data_format_)) {
+ return;
+ }
}
#endif
@@ -309,21 +329,39 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_output(0, input_shape, &in_backprop));
+// TODO(andydavis) Consider moving code shared with
+// Conv2DCustomBackpropFilterOp into a shared helper function.
#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
- if (LaunchXsmmBackwardInputConvolution()(
- context, context->eigen_device(),
- in_backprop->tensor(), filter.tensor(),
- out_backprop.tensor(), dims.spatial_dims[0].input_size,
- dims.spatial_dims[1].input_size, dims.spatial_dims[0].stride,
- dims.spatial_dims[1].stride, data_format_)) {
- return;
- }
-#endif
+ int64 pad_top, pad_bottom;
+ int64 pad_left, pad_right;
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+ dims.spatial_dims[0].stride, padding_,
+ &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+ dims.spatial_dims[1].stride, padding_,
+ &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
- // TODO(andydavis) Consider moving code shared with
- // Conv2DCustomBackpropFilterOp into a shared helper function.
+ if (pad_left == pad_right && pad_top == pad_bottom) {
+ if (LaunchXsmmBackwardInputConvolution()(
+ context, context->eigen_device(),
+ in_backprop->tensor(), filter.tensor(),
+ out_backprop.tensor(), dims.spatial_dims[0].input_size,
+ dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
+ (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
+ data_format_)) {
+ return;
+ }
+ }
+#else
int64 pad_top, pad_bottom;
int64 pad_left, pad_right;
+#endif
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index facfe4467d1dde..8076daf387bac7 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -213,8 +213,8 @@ class LaunchXsmmConvOp {
desc.v = stride_cols;
desc.pad_h = pad_rows;
desc.pad_w = pad_cols;
- desc.pad_h_in = pad_rows; // libxsmm supports only physical padding for now
- desc.pad_w_in = pad_cols; // libxsmm supports only physical padding for now
+ desc.pad_h_in = 0;
+ desc.pad_w_in = 0;
desc.pad_h_out = 0;
desc.pad_w_out = 0;
desc.threads = num_threads;
@@ -222,13 +222,17 @@ class LaunchXsmmConvOp {
desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
- desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+ desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
if (!CanUseXsmmConv2D(desc, data_format)) {
return false;
}
+
auto input_ptr = input.template flat().data();
auto filter_ptr = filter.template flat().data();
auto output_ptr = output->template flat().data();
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index bb99d627a531e6..2307c2de0e63b0 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -548,9 +548,11 @@ template struct functor::TransformFilter;
template struct functor::ReverseTransformFilter;
template struct functor::ReverseTransformFilter;
+template struct functor::NHWCToNCHW;
template struct functor::NHWCToNCHW;
template struct functor::NHWCToNCHW;
+template struct functor::NCHWToNHWC;
template struct functor::NCHWToNHWC;
template struct functor::NCHWToNHWC;
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.cc b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
index 66f92492342777..5939ecdf62bc32 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.cc
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include
+#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_3d.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
@@ -242,8 +243,11 @@ void DnnPooling3dGradOp::Compute(
}
}
-template class DnnPooling3dOp;
-template class DnnPooling3dGradOp;
+#define DEFINE_DNN_OPS(T)               \
+  template class DnnPooling3dOp<T>;     \
+  template class DnnPooling3dGradOp<T>;
+TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
+#undef DEFINE_DNN_OPS
#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 41c6251ac7566b..eb590280c9ec44 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@@ -46,6 +47,7 @@ limitations under the License.
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
const int kInvalidMaxPoolingIndex = -1;
@@ -187,40 +189,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
params.tensor_in_batch, shard_cost, shard);
}
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool").Device(DEVICE_CPU).TypeConstraint("T"),
- MaxPoolingOp);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool").Device(DEVICE_CPU).TypeConstraint("T"),
- MaxPoolingOp);
-
-#if GOOGLE_CUDA
-// Forward declarations for the functor specializations for GPU.
-namespace functor {
-#define DECLARE_GPU_SPEC(T) \
- template <> \
- void SpatialMaxPooling::operator()( \
- const Eigen::GpuDevice& d, typename TTypes::Tensor output, \
- typename TTypes::ConstTensor input, int window_rows, \
- int window_cols, int row_stride, int col_stride, \
- const Eigen::PaddingType& padding); \
- extern template struct SpatialMaxPooling;
-
-DECLARE_GPU_SPEC(float);
-#undef DECLARE_GPU_SPEC
-} // namespace functor
-
-// Note(jiayq): Currently, the Caffe custom implementation is faster than the
-// default Eigen implementation so we are using the custom kernel as the
-// default. However, you can explicitly invoke the eigen version using
-// kernel_label_map.
-REGISTER_KERNEL_BUILDER(Name("MaxPool")
- .Device(DEVICE_GPU)
- .TypeConstraint("T")
- .Label("eigen_tensor"),
- MaxPoolingOp);
-#endif // GOOGLE_CUDA
-
// The operation to compute MaxPool gradients.
// It takes three inputs:
// - The original input tensor
@@ -237,7 +205,7 @@ class MaxPoolingGradOp : public OpKernel {
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES(
context, data_format_ == FORMAT_NHWC,
- errors::InvalidArgument("Default MaxPoolinGradOp only supports NHWC ",
+ errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
"on device type ",
DeviceTypeString(context->device_type())));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
@@ -305,13 +273,6 @@ class MaxPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint("T"),
- MaxPoolingGradOp);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint("T"),
- MaxPoolingGradOp);
-
#ifdef GOOGLE_CUDA
template
@@ -329,13 +290,13 @@ static void MaxPoolingBackwardCustomKernel(
return;
}
- MaxPoolBackwardNoMask(
+ functor::MaxPoolBackwardNoMask()(
tensor_in->flat().data(), params.tensor_in_batch,
params.tensor_in_rows, params.tensor_in_cols, params.depth,
params.out_height, params.out_width, params.window_rows,
params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
- params.pad_cols, out_backprop.flat().data(),
- output->flat().data(), context->eigen_device());
+ params.pad_cols, out_backprop.flat().data(), output->flat().data(),
+ context->eigen_device());
}
template
@@ -403,12 +364,252 @@ class MaxPoolingGradOp : public OpKernel {
bool use_dnn_;
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint("T"),
- MaxPoolingGradOp);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint("T"),
- MaxPoolingGradOp);
+#endif // GOOGLE_CUDA
+
+// The operation to compute gradient of MaxPool gradients.
+// It takes three inputs:
+// - The original input tensor
+// - The original output tensor
+// - Backprop tensor for output gradients
+// It produces one output: backprop tensor for output gradient.
+template
+class MaxPoolingGradGradOp : public OpKernel {
+ public:
+ explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES(
+ context, data_format_ == FORMAT_NHWC,
+ errors::InvalidArgument(
+ "Default MaxPoolingGradGradOp only supports NHWC ",
+ "on device type ", DeviceTypeString(context->device_type())));
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ OP_REQUIRES(
+ context, ksize_[3] == 1 && stride_[3] == 1,
+ errors::Unimplemented(
+ "MaxPoolingGradGrad is not yet supported on the depth dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_grad_backprop = context->input(2);
+
+ // For maxpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 4,
+ errors::InvalidArgument("tensor_out must be 4-dimensional"));
+ // For maxpooling, out_grad_backprop should have 4 dimensions.
+ OP_REQUIRES(
+ context, out_grad_backprop.dims() == 4,
+ errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
+
+ PoolParameters params{context, ksize_, stride_,
+ padding_, FORMAT_NHWC, tensor_in.shape()};
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+ {2}, 0, tensor_out.shape(), &output));
+
+ SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
+ out_grad_backprop, params, padding_);
+ }
+
+ private:
+ void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
+ const Tensor& tensor_in, const Tensor& tensor_out,
+ const Tensor& top_diff,
+ const PoolParameters& params,
+ const Padding& padding) {
+    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+        ConstEigenMatrixMap;
+    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+        EigenMatrixMap;
+
+ ConstEigenMatrixMap in_mat(
+ tensor_in.flat().data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
+ ConstEigenMatrixMap out_mat(
+ tensor_out.flat().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+ ConstEigenMatrixMap top_diff_mat(
+ top_diff.flat().data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
+ EigenMatrixMap bottom_diff_mat(
+ bottom_diff->flat().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+
+ const DeviceBase::CpuWorkerThreads& worker_threads =
+ *(context->device()->tensorflow_cpu_worker_threads());
+
+ // The following code basically does the following:
+ // 1. Flattens the input, output, top_diff and bottom_diff tensors into
+ // two dimensional arrays.
+ // tensor_in_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // tensor_out_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ // top_diff_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // bottom_diff_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ //
+ // 2. Walks through the set of columns in the flattened
+ // tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
+ // and updates the column(s) corresponding to the maximum values in
+ // tensor_out_as_matrix with the corresponding values in
+ // top_diff_as_matrix.
+    auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
+ int64 start, int64 limit) {
+ const int32 depth = params.depth;
+ const int32 in_rows = params.tensor_in_rows;
+ const int32 in_cols = params.tensor_in_cols;
+ const int32 pad_rows = params.pad_rows;
+ const int32 pad_cols = params.pad_cols;
+ const int32 window_rows = params.window_rows;
+ const int32 window_cols = params.window_cols;
+ const int32 row_stride = params.row_stride;
+ const int32 col_stride = params.col_stride;
+ const int32 out_height = params.out_height;
+ const int32 out_width = params.out_width;
+
+ {
+ // Initializes the output grad backprop tensor with 0.
+ const int32 output_image_size = out_height * out_width * params.depth;
+ EigenMatrixMap bottom_diff_shard(
+ bottom_diff_mat.data() + start * output_image_size, 1,
+ (limit - start) * output_image_size);
+ bottom_diff_shard.setZero();
+ }
+
+ for (int b = start; b < limit; ++b) {
+ for (int ph = 0; ph < out_height; ++ph) {
+ for (int pw = 0; pw < out_width; ++pw) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int h_start = ph * row_stride - pad_rows;
+ const int h_end = std::min(h_start + window_rows, in_rows);
+ int w_start = pw * col_stride - pad_cols;
+ const int w_end = std::min(w_start + window_cols, in_cols);
+ h_start = std::max(h_start, 0);
+ w_start = std::max(w_start, 0);
+ const int out_index = (b * out_height + ph) * out_width + pw;
+ // Find value corresponding to the input maximum in top_diff.
+ for (int d = 0; d < depth; ++d) {
+ const T& output_ref = out_mat.coeffRef(d, out_index);
+ bool should_stop = false;
+ for (int h = h_start; h < h_end && !should_stop; ++h) {
+ for (int w = w_start; w < w_end && !should_stop; ++w) {
+ const int in_index = (b * in_rows + h) * in_cols + w;
+ const T& input_ref = in_mat.coeffRef(d, in_index);
+ if (output_ref == input_ref) {
+ T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
+ bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
+ should_stop = true;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ };
+
+ const int64 shard_cost = params.out_width * params.out_height *
+ params.depth * params.window_rows *
+ params.window_cols;
+ Shard(worker_threads.num_threads, worker_threads.workers,
+ params.tensor_in_batch, shard_cost, shard);
+ }
+
+ std::vector ksize_;
+ std::vector stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+};
+
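The CPU kernel above searches each pooling window for the position of the forward maximum and copies the corresponding top_diff value into the pooled output location. A tiny standalone illustration of that rule for one batch, one channel, a 2x2 window and stride 2; the MaxPoolGradGrad2x2 helper is hypothetical and is not the TF kernel.

    #include <iostream>
    #include <vector>

    // For every pooled cell, the second-order gradient is the value of
    // top_diff at the position of the window maximum, as in the shard lambda.
    std::vector<float> MaxPoolGradGrad2x2(const std::vector<float>& in,
                                          const std::vector<float>& top_diff,
                                          int rows, int cols) {
      std::vector<float> grad_out((rows / 2) * (cols / 2), 0.f);
      for (int ph = 0; ph < rows / 2; ++ph) {
        for (int pw = 0; pw < cols / 2; ++pw) {
          int best = (2 * ph) * cols + (2 * pw);  // argmax within the window
          for (int h = 2 * ph; h < 2 * ph + 2; ++h)
            for (int w = 2 * pw; w < 2 * pw + 2; ++w)
              if (in[h * cols + w] > in[best]) best = h * cols + w;
          grad_out[ph * (cols / 2) + pw] = top_diff[best];
        }
      }
      return grad_out;
    }

    int main() {
      // 4x4 input; maxima of the four 2x2 windows sit at positions 5, 2, 12, 15.
      std::vector<float> in = {1, 2, 9, 4,
                               5, 8, 3, 0,
                               2, 1, 4, 6,
                               7, 3, 2, 9};
      std::vector<float> top_diff(16);
      for (int i = 0; i < 16; ++i) top_diff[i] = 0.1f * i;  // arbitrary grads
      for (float v : MaxPoolGradGrad2x2(in, top_diff, 4, 4)) std::cout << v << " ";
      std::cout << "\n";  // prints: 0.5 0.2 1.2 1.5
      return 0;
    }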
+#ifdef GOOGLE_CUDA
+
+template
+class MaxPoolingGradGradOp : public OpKernel {
+ public:
+ typedef Eigen::GpuDevice Device;
+
+ explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
+ const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
+ OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_grad_backprop = context->input(2);
+
+ // For maxpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+                errors::InvalidArgument("tensor_in must be 4-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 4,
+ errors::InvalidArgument("tensor_out must be 4-dimensional"));
+ // For maxpooling, out_grad_backprop should have 4 dimensions.
+ OP_REQUIRES(
+ context, out_grad_backprop.dims() == 4,
+ errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+ {2}, 0, tensor_out.shape(), &output));
+
+ PoolParameters params{context, ksize_, stride_,
+ padding_, data_format_, tensor_in.shape()};
+
+ functor::MaxPoolGradBackwardNoMask()(
+ data_format_, tensor_in.flat().data(), tensor_out.flat().data(),
+ params.tensor_in_batch, params.out_height, params.out_width,
+ params.depth, params.tensor_in_rows, params.tensor_in_cols,
+ params.window_rows, params.window_cols, params.row_stride,
+ params.col_stride, params.pad_rows, params.pad_cols,
+ out_grad_backprop.flat().data(), output->flat().data(),
+ context->eigen_device());
+ }
+
+ private:
+ std::vector ksize_;
+ std::vector stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+ bool use_dnn_;
+};
#endif // GOOGLE_CUDA
@@ -565,6 +766,56 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
Padding padding_;
};
+template
+struct LaunchMaxPoolingGradGradWithArgmax;
+
+template
+class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
+ public:
+ explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& grad_in = context->input(1);
+ const Tensor& argmax = context->input(2);
+
+ PoolParameters params{context, ksize_, stride_,
+ padding_, FORMAT_NHWC, tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ TensorShape out_shape({params.tensor_in_batch, params.out_height,
+ params.out_width, params.depth});
+
+ Tensor* grad_out = nullptr;
+ OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+ {1}, 0, out_shape, &grad_out));
+
+ LaunchMaxPoolingGradGradWithArgmax::launch(
+ context, params, grad_in, argmax, grad_out);
+ }
+
+ private:
+ std::vector ksize_;
+ std::vector stride_;
+ Padding padding_;
+};
+
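When argmax indices from the forward pass are available, the same second-order gradient reduces to a gather: each pooled output location reads grad_in at its stored argmax position, which is what LaunchMaxPoolingGradGradWithArgmax below does with top_offset/bottom_offset bookkeeping per batch. A minimal single-batch sketch with a hypothetical GradGradWithArgmax helper:

    #include <iostream>
    #include <vector>

    // argmax holds flattened indices into the (input-shaped) grad_in tensor,
    // one per pooled output location.
    std::vector<float> GradGradWithArgmax(const std::vector<float>& grad_in,
                                          const std::vector<long long>& argmax) {
      std::vector<float> grad_out(argmax.size());
      for (size_t i = 0; i < argmax.size(); ++i) grad_out[i] = grad_in[argmax[i]];
      return grad_out;
    }

    int main() {
      std::vector<float> grad_in = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f};
      std::vector<long long> argmax = {5, 2};  // window maxima in the input
      for (float v : GradGradWithArgmax(grad_in, argmax)) std::cout << v << " ";
      std::cout << "\n";  // prints: 0.5 0.2
      return 0;
    }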
#if GOOGLE_CUDA
template
class MaxPoolingNoMaskOp : public OpKernel {
@@ -631,7 +882,7 @@ template
struct LaunchMaxPoolingNoMask {
static void launch(OpKernelContext* context, const PoolParameters& params,
const Tensor& input, Tensor* output) {
- bool status = MaxPoolForwardWithOptionalArgmax(
+ bool status = functor::MaxPoolForwardWithOptionalArgmax()(
input.flat().data(), params.tensor_in_batch, params.tensor_in_rows,
params.tensor_in_cols, params.depth, params.out_height,
params.out_width, params.window_rows, params.window_cols,
@@ -644,18 +895,11 @@ struct LaunchMaxPoolingNoMask {
}
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool").Device(DEVICE_GPU).TypeConstraint("T"),
- MaxPoolingNoMaskOp);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool").Device(DEVICE_GPU).TypeConstraint("T"),
- MaxPoolingNoMaskOp);
-
template
struct LaunchMaxPoolingWithArgmax {
static void launch(OpKernelContext* context, const PoolParameters& params,
const Tensor& input, Tensor* output, Tensor* argmax) {
- bool status = MaxPoolForwardWithOptionalArgmax(
+ bool status = functor::MaxPoolForwardWithOptionalArgmax()(
input.flat().data(), params.tensor_in_batch, params.tensor_in_rows,
params.tensor_in_cols, params.depth, params.out_height,
params.out_width, params.window_rows, params.window_cols,
@@ -670,17 +914,6 @@ struct LaunchMaxPoolingWithArgmax {
}
};
-REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
- .Device(DEVICE_GPU)
- .TypeConstraint("Targmax")
- .TypeConstraint("T"),
- MaxPoolingWithArgmaxOp);
-REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
- .Device(DEVICE_GPU)
- .TypeConstraint("Targmax")
- .TypeConstraint("T"),
- MaxPoolingWithArgmaxOp);
-
template
struct LaunchMaxPoolingGradWithArgmax {
static void launch(OpKernelContext* context, const PoolParameters& params,
@@ -693,30 +926,118 @@ struct LaunchMaxPoolingGradWithArgmax {
const int top_offset = params.out_height * params.out_width * params.depth;
const int bottom_offset =
params.tensor_in_rows * params.tensor_in_cols * params.depth;
- bool status = MaxPoolBackwardWithArgmax(
+ bool status = functor::MaxPoolBackwardWithArgmax()(
output_size, input_size, grad_in.flat().data(),
reinterpret_cast(argmax.flat().data()), top_offset,
bottom_offset, grad_out->flat().data(), context->eigen_gpu_device());
if (!status) {
context->SetStatus(
- errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
+ errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
}
}
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGradWithArgmax")
- .Device(DEVICE_GPU)
- .TypeConstraint("T")
- .TypeConstraint("Targmax"),
- MaxPoolingGradWithArgmaxOp);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGradWithArgmax")
- .Device(DEVICE_GPU)
- .TypeConstraint("T")
- .TypeConstraint("Targmax"),
- MaxPoolingGradWithArgmaxOp);
+template
+struct LaunchMaxPoolingGradGradWithArgmax {
+ static void launch(OpKernelContext* context, const PoolParameters& params,
+ const Tensor& grad_in, const Tensor& argmax,
+ Tensor* grad_out) {
+ const int input_size = params.tensor_in_batch * params.tensor_in_rows *
+ params.tensor_in_cols * params.depth;
+ const int output_size = params.tensor_in_batch * params.out_height *
+ params.out_width * params.depth;
+ const int top_offset =
+ params.tensor_in_rows * params.tensor_in_cols * params.depth;
+ const int bottom_offset =
+ params.out_width * params.out_height * params.depth;
+ bool status = functor::MaxPoolGradBackwardWithArgmax()(
+ output_size, input_size, grad_in.flat().data(),
+ reinterpret_cast(argmax.flat().data()), top_offset,
+ bottom_offset, grad_out->flat().data(), context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+ errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
+ }
+ }
+};
#endif // GOOGLE_CUDA
+#define REGISTER_MAX_POOL_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint("T"), \
+ MaxPoolingGradOp