From 694e63c9de2cba52b5cdff189b3d713c5490a0e0 Mon Sep 17 00:00:00 2001 From: sindhu-nervana Date: Wed, 11 Dec 2019 14:30:34 -0800 Subject: [PATCH 01/22] initial commit --- ngraph_bridge/ngraph_utils.cc | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/ngraph_bridge/ngraph_utils.cc b/ngraph_bridge/ngraph_utils.cc index 686799e94..9c883f515 100644 --- a/ngraph_bridge/ngraph_utils.cc +++ b/ngraph_bridge/ngraph_utils.cc @@ -214,6 +214,9 @@ Status TensorToStream(std::ostream& ostream, const Tensor& tensor) { case DT_BOOL: TensorDataToStream(ostream, n_elements, data); break; + case DT_BFLOAT16: + TensorDataToStream(ostream, n_elements, data); + break; default: return errors::Internal("TensorToStream got unsupported data type ", DataType_Name(tensor.dtype())); @@ -263,6 +266,9 @@ Status TFDataTypeToNGraphElementType(DataType tf_dt, break; case DataType::DT_QINT32: *ng_et = ng::element::i32; + break; + case DataType::DT_BFLOAT16: + *ng_et = ng::element::bf16; break; default: return errors::Unimplemented("Unsupported TensorFlow data type: ", @@ -313,15 +319,16 @@ void print_node_histogram(const std::unordered_map& histogram, const gtl::ArraySlice& NGraphDTypes() { static gtl::ArraySlice result{ - DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, - DT_UINT16, DT_UINT32, DT_UINT64, DT_BOOL, DT_QINT8, DT_QUINT8}; + DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, + DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, + DT_BOOL, DT_QINT8, DT_QUINT8, DT_BFLOAT16}; return result; } const gtl::ArraySlice& NGraphNumericDTypes() { static gtl::ArraySlice result{ - DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, - DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64}; + DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, + DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_BFLOAT16}; return result; } @@ -343,7 +350,7 @@ const gtl::ArraySlice& NGraphSupportedQuantizedDTypes() { } const gtl::ArraySlice& NGraphRealDTypes() { - static gtl::ArraySlice result{DT_FLOAT, DT_DOUBLE}; + static gtl::ArraySlice result{DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}; return result; } From 2ab87c77e0c845df460b7819f4a72cc9fb94099e Mon Sep 17 00:00:00 2001 From: sindhu-nervana Date: Mon, 16 Dec 2019 15:56:22 -0800 Subject: [PATCH 02/22] add bfloat16 test --- test/python/test_bfloat16.py | 46 ++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 test/python/test_bfloat16.py diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py new file mode 100644 index 000000000..82197a479 --- /dev/null +++ b/test/python/test_bfloat16.py @@ -0,0 +1,46 @@ +# ============================================================================== +# Copyright 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================== +"""nGraph TensorFlow bridge bfloat16 matmul operation test + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import pytest +import numpy as np + +import tensorflow as tf +import os + +from common import NgraphTest + + +class TestMatmulBfloat16(NgraphTest): + + def test_matmul_bfloat16(self): + a = tf.placeholder(tf.bfloat16, [2, 3], name='a') + x = tf.placeholder(tf.bfloat16, [3, 4], name='x') + a_inp = np.random.rand(2, 3) + x_inp = np.random.rand(3, 4) + out = tf.matmul(a, x) + + def run_test(sess): + return sess.run((out,), feed_dict={a: a_inp, x: x_inp}) + + # import pdb + # pdb.set_trace() + assert self.with_ngraph(run_test) == self.without_ngraph(run_test) From b267ba1f260ee8bf07ba9273d0a896823d816ced Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 20 Dec 2019 10:29:43 -0800 Subject: [PATCH 03/22] Shrestha/var in compute (#388) - Enabled --var build to use parallel executor integrating weights-on-device and data pipelining - moved ngraph_var files outside the var build --- bazel/BUILD | 2 + ngraph_bridge/CMakeLists.txt | 2 +- .../enable_variable_ops/ngraph_assign_op.cc | 4 +- .../ngraph_enter_in_catalog.cc | 15 +- .../ngraph_rewrite_pass.cc | 8 ++ .../ngraph_tracked_variable.cc | 5 +- .../ngraph_variable_modifiers.cc | 2 +- .../ngraph_variable_update_ng_tensor_op.cc | 3 +- ngraph_bridge/ngraph_encapsulate_impl.cc | 2 +- ngraph_bridge/ngraph_encapsulate_op.cc | 79 +++++++---- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 130 ++++++++++++++++-- ngraph_bridge/ngraph_encapsulate_op_utils.h | 38 ++++- .../ngraph_enter_prefetch_in_catalog.h | 4 +- ngraph_bridge/ngraph_executor.cc | 2 +- ngraph_bridge/ngraph_prefetch_dataset_op.cc | 16 ++- ngraph_bridge/ngraph_tensor_manager.cc | 46 ++++++- ngraph_bridge/ngraph_tensor_manager.h | 2 + ngraph_bridge/ngraph_tracked_variable.cc | 4 +- .../{enable_variable_ops => }/ngraph_var.cc | 2 +- .../{enable_variable_ops => }/ngraph_var.h | 0 .../test_ng_var_update_ng_tensor.cc | 2 +- test/python/test_flib.py | 1 + test/test_ng_var_update_ng_tensor_kernel.cc | 2 +- tools/test_utils.py | 5 +- 24 files changed, 304 insertions(+), 72 deletions(-) rename ngraph_bridge/{enable_variable_ops => }/ngraph_var.cc (98%) rename ngraph_bridge/{enable_variable_ops => }/ngraph_var.h (100%) diff --git a/bazel/BUILD b/bazel/BUILD index 7028b6a95..034ff0dec 100644 --- a/bazel/BUILD +++ b/bazel/BUILD @@ -48,6 +48,7 @@ cc_library( "ngraph_bridge/ngraph_tensor_manager.h", "ngraph_bridge/ngraph_timer.h", "ngraph_bridge/ngraph_utils.h", + "ngraph_bridge/ngraph_var.h", "ngraph_bridge/ngraph_version_utils.h", "ngraph_bridge/tf_deadness_analysis.h", "ngraph_bridge/tf_graphcycles.h", @@ -92,6 +93,7 @@ cc_library( "ngraph_bridge/ngraph_tensor_manager.cc", "ngraph_bridge/ngraph_tracked_variable.cc", "ngraph_bridge/ngraph_utils.cc", + "ngraph_bridge/ngraph_var.cc", "ngraph_bridge/tf_deadness_analysis.cc", "ngraph_bridge/tf_graphcycles.cc", "ngraph_bridge/ops/ngraph_ops.cc", diff --git a/ngraph_bridge/CMakeLists.txt b/ngraph_bridge/CMakeLists.txt index 18d218dad..eb104ae3b 100644 --- a/ngraph_bridge/CMakeLists.txt +++ b/ngraph_bridge/CMakeLists.txt @@ -57,6 +57,7 @@ set(SRC ngraph_rewrite_pass.cc ngraph_tensor_manager.cc ngraph_tracked_variable.cc + ngraph_var.cc ngraph_utils.cc tf_graphcycles.cc tf_deadness_analysis.cc @@ -86,7 +87,6 @@ if(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) list(APPEND SRC 
enable_variable_ops/ngraph_tracked_variable.cc) # new files - list(APPEND SRC enable_variable_ops/ngraph_var.cc) list(APPEND SRC enable_variable_ops/ngraph_assign_op.cc) list(APPEND SRC enable_variable_ops/ngraph_enter_in_catalog.cc) list(APPEND SRC enable_variable_ops/ngraph_remove_ngraphassigns.cc) diff --git a/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc b/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc index b9f041e8b..35099bbc7 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc @@ -25,11 +25,11 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; @@ -83,7 +83,7 @@ class NGraphAssignOp : public OpKernel { void Compute(OpKernelContext* context) override { std::ostringstream oss; - oss << "Execute: Assign_" << my_instance_id << ": " << name(); + oss << "NGAssign::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); NGRAPH_VLOG(4) << "NGraphAssign:: Compute called for: " << def().name() diff --git a/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc b/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc index a456ef6e8..c96a4932e 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc @@ -160,15 +160,12 @@ Status EnterInCatalog(Graph* graph, int graph_id) { } } - // are there indexes that need copy - if (op_index_to_copy.size() > 0) { - try { - NGraphCatalog::AddToEncapOutputCopyIndexesMap(graph_id, node->name(), - op_index_to_copy); - } catch (const std::exception& exp) { - return errors::Internal( - "Caught exception while entering in catalog: ", exp.what(), "\n"); - } + try { + NGraphCatalog::AddToEncapOutputCopyIndexesMap(graph_id, node->name(), + op_index_to_copy); + } catch (const std::exception& exp) { + return errors::Internal("Caught exception while entering in catalog: ", + exp.what(), "\n"); } } // end of node is type NGraphEncapsulate diff --git a/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc b/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc index b764713ab..ea97ff417 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc @@ -30,6 +30,7 @@ #include "ngraph_bridge/ngraph_cluster_manager.h" #include "ngraph_bridge/ngraph_deassign_clusters.h" #include "ngraph_bridge/ngraph_encapsulate_clusters.h" +#include "ngraph_bridge/ngraph_enter_prefetch_in_catalog.h" #include "ngraph_bridge/ngraph_mark_for_clustering.h" #include "ngraph_bridge/ngraph_rewrite_for_tracking.h" #include "ngraph_bridge/ngraph_utils.h" @@ -255,6 +256,13 @@ class NGraphEncapsulationPass : public NGraphRewritePass { "Graph with NGraphAssigns Optimized/Removed"); } + // 8. Enter Prefetch in catalog then. 
+ TF_RETURN_IF_ERROR(EnterPrefetchInCatalog(options.graph->get(), idx)); + if (DumpCatalogedGraphs()) { + DumpGraphs(options, idx, "prefetch-cataloged", + "Graph with Prefetched Inputs Entered in Catalog"); + } + return Status::OK(); } diff --git a/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc b/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc index c034d13c7..8b5b81f68 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc @@ -23,11 +23,11 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; @@ -119,7 +119,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { << " ,backend_name " << ng_backend_name_; std::ostringstream oss; - oss << "NGraphVariable: " << my_instance_id << ": " << name(); + oss << "NGVariable::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); bool log_copies = false; @@ -250,6 +250,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { ctx->record_persistent_memory_allocation(var->tensor()->AllocatedBytes()); } var->Unref(); + event_compute.Stop(); ngraph::Event::write_trace(event_compute); } diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc index 5fc190bea..376a596a9 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc @@ -26,12 +26,12 @@ #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc index fdb432f79..8755f6f76 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc @@ -24,10 +24,10 @@ #include "ngraph/event_tracing.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; @@ -67,6 +67,7 @@ NGraphVariableUpdateNGTensorOp::~NGraphVariableUpdateNGTensorOp() { void NGraphVariableUpdateNGTensorOp::Compute(OpKernelContext* context) { std::ostringstream oss; // Start event tracing + oss << "NGVariableUpdateNGTensor::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); bool log_copies = false; OP_REQUIRES_OK(context, diff --git a/ngraph_bridge/ngraph_encapsulate_impl.cc b/ngraph_bridge/ngraph_encapsulate_impl.cc index 7823f0a7d..f2ddf1ecd 100644 --- a/ngraph_bridge/ngraph_encapsulate_impl.cc +++ 
b/ngraph_bridge/ngraph_encapsulate_impl.cc @@ -45,8 +45,8 @@ #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 9a48d8c92..4605757ae 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -49,9 +49,9 @@ #include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif @@ -88,13 +88,8 @@ NGraphEncapsulateOp::NGraphEncapsulateOp(OpKernelConstruction* ctx) ctx, backend != nullptr, errors::Internal("Cannot get the backend object for BE: ", be_name)); -// If we have the VARIABLE capture on then we can't use the -// parallel executor until that support is added. -#if !defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) + // If backend executable can create tensors we use parallel executor m_use_parallel_executor = backend->executable_can_create_tensors(); -#else - m_use_parallel_executor = false; -#endif // Override the switch for debugging/testing if (std::getenv("NGRAPH_TF_USE_LEGACY_EXECUTOR") != nullptr) { @@ -402,7 +397,7 @@ NGraphEncapsulateOp::~NGraphEncapsulateOp() { // OpKernel::Compute //--------------------------------------------------------------------------- void NGraphEncapsulateOp::Compute(OpKernelContext* ctx) { - ngraph::Event event_compute("Compute", "", ""); + ngraph::Event event_compute("NGEncap::Compute::" + name(), name(), ""); if (m_use_parallel_executor) { NGRAPH_VLOG(1) << "NGraphEncapsulateOp::Compute: Using Parallel Executor"; @@ -459,6 +454,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { m_parallel_executor->GetTensorPipelineDepth())); // Get Tensor Manager and some error checking + ngraph::Event event_prepare_ng_tensors("Prepare NG In/Out Tensors", "", ""); auto tensor_manager = m_parallel_executor->GetTensorManager(); int num_of_inputs = tensor_manager->GetNumberOfInputs(); int num_of_outputs = tensor_manager->GetNumberOfOutputs(); @@ -499,14 +495,18 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); - // All inputs and outputs are pipelined. 
- // Of all these pipelined inputs some are prefetched - // TODO: Fit in variables - ng_inputs = get<1>(pipelined_io_tensors); - ng_outputs = get<2>(pipelined_io_tensors); + // Prepare NG Input Output Tensors + // Assemble Variable tensors and pipelined tensors to ng_input and ng_outputs + OP_REQUIRES_OK(ctx, GetIOTensorsReadyForExecution( + ctx, tensor_manager, get<1>(pipelined_io_tensors), + get<2>(pipelined_io_tensors), ng_inputs, ng_outputs)); + event_prepare_ng_tensors.Stop(); + ngraph::Event::write_trace(event_prepare_ng_tensors); // And execute - ngraph::Event event_execute_graph("Execute Graph", "", ""); + ngraph::Event event_execute_graph( + "Execute Graph Pipeline Indx" + to_string(current_iter_pipeline_depth), + "", ""); BackendManager::LockBackend(m_parallel_executor->GetOpBackendName()); NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute call starting for cluster " @@ -540,12 +540,14 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ngraph::Event::write_trace(event_execute_graph); // Now prepare the output - ngraph::Event event_copy_output_tensor("Copy Output Tensor", "", ""); + // Allocate TF Tensors + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Allocating TF Output Tensors " + << m_parallel_executor->GetNgraphClusterId(); - std::vector> output_copy_events; + ngraph::Event event_prepare_tf_output_tensors("Prepare TF Output Tensor", "", + ""); + vector tf_output_tensors; for (auto i = 0; i < ng_exec->get_results().size(); i++) { - std::unique_ptr event_copy_prep( - new ngraph::Event("Copy Prep", "", "")); auto ng_element = ng_exec->get_results()[i]; auto ng_shape = ng_element->get_shape(); auto ng_element_type = ng_element->get_element_type(); @@ -558,7 +560,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { TensorShape tf_shape(dims); Tensor* tf_output_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(i, tf_shape, &tf_output_tensor)); - + tf_output_tensors.push_back(tf_output_tensor); // Make sure the nGraph-inferred element type agrees with what TensorFlow // expected. 
ng::element::Type expected_elem_type; @@ -569,28 +571,45 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ctx, ng_element_type == expected_elem_type, errors::Internal("Element type inferred by nGraph does not match " "the element type expected by TensorFlow")); - event_copy_prep->Stop(); - output_copy_events.push_back(std::move(event_copy_prep)); + } - // Now copy the nGraph Tensor to Host Tensor - std::unique_ptr event_copy_d2h( - new ngraph::Event("Device to Host Copy", "", "")); - void* dst_ptr = DMAHelper::base(tf_output_tensor); + // Copy Tensors that are required + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Read NG Output Tensors " + << m_parallel_executor->GetNgraphClusterId(); - ng_outputs[i]->read( - dst_ptr, ng_outputs[i]->get_element_count() * ng_element_type.size()); + std::vector> output_copy_events; + + auto output_indexes_to_be_copied = + tensor_manager->GetOutputIndexesThatNeedCopy(); + for (auto output_index : output_indexes_to_be_copied) { + // Copy the nGraph Tensor to Host Tensor + std::unique_ptr event_copy_d2h(new ngraph::Event( + "D2H_Output_" + std::to_string(output_index), "", "")); + void* dst_ptr = (void*)DMAHelper::base(tf_output_tensors[output_index]); + ng_outputs[output_index]->read( + dst_ptr, ng_outputs[output_index]->get_element_count() * + ng_outputs[output_index]->get_element_type().size()); event_copy_d2h->Stop(); output_copy_events.push_back(std::move(event_copy_d2h)); } - for (auto& next : output_copy_events) { ngraph::Event::write_trace(*next.get()); } + event_prepare_tf_output_tensors.Stop(); + ngraph::Event::write_trace(event_prepare_tf_output_tensors); - event_copy_output_tensor.Stop(); - ngraph::Event::write_trace(event_copy_output_tensor); + // Synch Var Output Tensors as required + NGRAPH_VLOG(4) + << "NGraphEncapsulateOp::Compute Sync NG Output Variable Tensors " + << m_parallel_executor->GetNgraphClusterId(); + ngraph::Event event_update_ngvar_tensors("Update NGVar Tensors", "", ""); + OP_REQUIRES_OK(ctx, SyncOutputVarTensors(ctx, tensor_manager)); + event_update_ngvar_tensors.Stop(); + ngraph::Event::write_trace(event_update_ngvar_tensors); // Now return them to the cache + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Returning Tensors " + << m_parallel_executor->GetNgraphClusterId(); ngraph::Event event_return_tensor("Return Tensor", "", ""); pipelined_tensor_store->return_tensors(current_iter_pipeline_depth); diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 51eca36de..d12494e45 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -18,17 +18,22 @@ #include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" + using namespace std; namespace tensorflow { namespace ngraph_bridge { +//--------------------------------------------------------------------------- +// GetPipelinedIOTensorsReadyForExecution +//--------------------------------------------------------------------------- Status GetPipelinedIOTensorsReadyForExecution( - OpKernelContext* ctx, std::vector& tf_input_tensors, - shared_ptr& pipelined_tensor_store, - shared_ptr& tensor_manager, - std::tuple& + OpKernelContext* ctx, const vector& tf_input_tensors, + const shared_ptr& pipelined_tensor_store, + const shared_ptr& tensor_manager, + tuple& pipelined_io_tensors) { auto io_tensors = pipelined_tensor_store->get_tensors(); @@ -84,7 +89,7 @@ Status 
GetPipelinedIOTensorsReadyForExecution( tensor_manager->GetInputIndexesForPrefetchSharedObject()); // Get the set of IO tensors for the next iteration - std::tuple + tuple io_tensors_next_iter; io_tensors_next_iter = pipelined_tensor_store->get_tensors(); @@ -154,18 +159,21 @@ Status GetPipelinedIOTensorsReadyForExecution( // Allocate the input/ ngraph::Event event_copy_input_tensor("Copy Pipelined Input Tensors", "", ""); - + std::vector> input_write_events; if (!skip_tf2ng_copy) { // All pipelined inputs are copied for (auto i = 0; i < pipelined_input_indexes.size(); i++) { int tf_index = pipelined_input_indexes[i]; - ng::element::Type ng_element_type; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType( tf_input_tensors[tf_index].dtype(), &ng_element_type)); void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[tf_index]); + + std::unique_ptr event_copy_h2d( + new ngraph::Event("H2D_Input_" + std::to_string(tf_index), "", "")); + try { ng_pipelined_inputs[i]->write( current_src_ptr, ng_pipelined_inputs[i]->get_element_count() * @@ -176,6 +184,8 @@ Status GetPipelinedIOTensorsReadyForExecution( } catch (...) { return errors::Internal("Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + input_write_events.push_back(std::move(event_copy_h2d)); } } else { // All pipelined inputs that are not prefetched are copied @@ -199,19 +209,27 @@ Status GetPipelinedIOTensorsReadyForExecution( tf_input_tensors[tf_index].dtype(), &ng_element_type)); void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[tf_index]); + unique_ptr event_copy_h2d( + new ngraph::Event("H2D_Input_" + to_string(tf_index), "", "")); try { ng_pipelined_inputs[ng_index]->write( current_src_ptr, ng_pipelined_inputs[ng_index]->get_element_count() * ng_element_type.size()); - } catch (const std::exception& exp) { + } catch (const exception& exp) { return errors::Internal("Error copying TF tensor to device tensor: ", exp.what()); } catch (...) 
{ return errors::Internal("Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + input_write_events.push_back(move(event_copy_h2d)); } } + + for (auto& next : input_write_events) { + ngraph::Event::write_trace(*next.get()); + } event_copy_input_tensor.Stop(); ngraph::Event::write_trace(event_copy_input_tensor); @@ -221,5 +239,101 @@ Status GetPipelinedIOTensorsReadyForExecution( return Status::OK(); } +//--------------------------------------------------------------------------- +// GetTensorFromContext +//--------------------------------------------------------------------------- +Status GetTensorFromContext(const OpKernelContext* ctx, + const string& shared_name, + shared_ptr& ng_tensor) { + // Get shared name from tensor manager + NGraphVar* var; + TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup( + ctx->resource_manager()->default_container(), shared_name, &var)); + ng_tensor = var->ng_tensor(); + var->Unref(); + return Status::OK(); +} + +//--------------------------------------------------------------------------- +// GetIOTensorsReadyForExecution +//--------------------------------------------------------------------------- +Status GetIOTensorsReadyForExecution( + OpKernelContext* ctx, const shared_ptr& tensor_manager, + const PipelinedTensorVector& pipelined_in_tensors, + const PipelinedTensorVector& pipelined_out_tensors, + vector>& ng_inputs, + vector>& ng_outputs) { + // Get Variables that are inputs + auto var_input_indexes = tensor_manager->GetInputIndexesFedByVariables(); + for (int input_index : var_input_indexes) { + string shared_name; + TF_RETURN_IF_ERROR( + tensor_manager->GetInputVariableSharedName(input_index, &shared_name)); + TF_RETURN_IF_ERROR( + GetTensorFromContext(ctx, shared_name, ng_inputs[input_index])); + } + + // Get Variables that are outputs + auto var_output_indexes = + tensor_manager->GetOutputIndexesAssigningVariables(); + for (int output_index : var_output_indexes) { + string shared_name; + TF_RETURN_IF_ERROR(tensor_manager->GetOutputVariableSharedName( + output_index, &shared_name)); + TF_RETURN_IF_ERROR( + GetTensorFromContext(ctx, shared_name, ng_outputs[output_index])); + } + + // Fit Pipelined Input Tensors + auto pipelined_input_indexes = tensor_manager->GetPipelinedInputIndexes(); + for (int i = 0; i < pipelined_input_indexes.size(); i++) { + int input_index = pipelined_input_indexes[i]; + ng_inputs[input_index] = pipelined_in_tensors[i]; + } + + // Fit Pipelined Output Tensors + auto pipelined_output_indexes = tensor_manager->GetPipelinedOutputIndexes(); + for (int i = 0; i < pipelined_output_indexes.size(); i++) { + int output_index = pipelined_output_indexes[i]; + ng_outputs[output_index] = pipelined_out_tensors[i]; + } + + return Status::OK(); +} + +//--------------------------------------------------------------------------- +// SyncOutputVarTensors +//--------------------------------------------------------------------------- +Status SyncOutputVarTensors( + const OpKernelContext* ctx, + const shared_ptr& tensor_manager) { + // Get Variables that are outputs + auto var_output_indexes = + tensor_manager->GetOutputIndexesAssigningVariables(); + NGRAPH_VLOG(4) << "output indexes size " << var_output_indexes.size(); + + for (int output_index : var_output_indexes) { + bool copy_to_tf; + TF_RETURN_IF_ERROR( + tensor_manager->GetOutputVariableCopyToTF(output_index, ©_to_tf)); + + if (copy_to_tf) { + NGRAPH_VLOG(4) << "Sync NG Output Variable Tensors " << output_index; + // Get shared name from tensor manager + string 
shared_name; + TF_RETURN_IF_ERROR(tensor_manager->GetOutputVariableSharedName( + output_index, &shared_name)); + NGraphVar* var; + TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup( + ctx->resource_manager()->default_container(), shared_name, &var)); + // update tensor + var->copy_ng_to_tf(); + var->Unref(); + NGRAPH_VLOG(4) << "Sync Completed " << output_index; + } + } + return Status::OK(); +} + } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.h b/ngraph_bridge/ngraph_encapsulate_op_utils.h index 7f48eb09c..1a6df4ede 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.h +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.h @@ -46,12 +46,44 @@ namespace ngraph_bridge { // Status GetPipelinedIOTensorsReadyForExecution( - OpKernelContext* ctx, vector& tf_input_tensors, - shared_ptr& pipelined_tensor_store, - shared_ptr& tensor_manager, + OpKernelContext* ctx, const vector& tf_input_tensors, + const shared_ptr& pipelined_tensor_store, + const shared_ptr& tensor_manager, tuple& pipelined_io_tensors); +// Assembles the different types of input and output tensors +// Variable tensors and pipelined tensors are put together in the right order +// into ng_inputs and ng_outputs +// 1. For input indexes that are fed by variables, get the variable tensors from +// context +// 2. For output indexes that are updating variables, get the variable tensors +// from context +// This enable update-in-place +// 3. For input and output indexes that are pipelined, get the respective tensor +// +Status GetIOTensorsReadyForExecution( + OpKernelContext* ctx, const shared_ptr& tensor_manager, + const PipelinedTensorVector& pipelined_in_tensors, + const PipelinedTensorVector& pipelined_out_tensors, + vector>& ng_inputs, + vector>& ng_outputs); + +// Gets the Tensor from OpKernelContext's Container for the given shared_name +Status GetTensorFromContext(const OpKernelContext* ctx, + const string& shared_name, + shared_ptr& ng_tensor); + +// Encapsulate Op updates the NGVariable's device tensor in-place +// ie. the NGVariable's backend tensor is updated +// Some of these Variables may be required by the TF ops and they will use the +// host tensor +// These were marked as "copy-to-tf" True in the Rewrite Phase +// We will update these tensors here +Status SyncOutputVarTensors( + const OpKernelContext* ctx, + const shared_ptr& tensor_manager); + } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h index d7ab8cc9c..534166aa1 100644 --- a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h +++ b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ -#ifndef NGRAPH_TF_ENTER_IN_CATALOG_H_ -#define NGRAPH_TF_ENTER_IN_CATALOG_H_ +#ifndef NGRAPH_TF_ENTER_PREFETCH_IN_CATALOG_H_ +#define NGRAPH_TF_ENTER_PREFETCH_IN_CATALOG_H_ #pragma once #include "tensorflow/core/graph/graph.h" diff --git a/ngraph_bridge/ngraph_executor.cc b/ngraph_bridge/ngraph_executor.cc index 37e1b8b40..7d4fe2c2a 100644 --- a/ngraph_bridge/ngraph_executor.cc +++ b/ngraph_bridge/ngraph_executor.cc @@ -43,9 +43,9 @@ #include "ngraph_bridge/ngraph_mark_for_clustering.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif diff --git a/ngraph_bridge/ngraph_prefetch_dataset_op.cc b/ngraph_bridge/ngraph_prefetch_dataset_op.cc index 18b946191..7c131bcce 100644 --- a/ngraph_bridge/ngraph_prefetch_dataset_op.cc +++ b/ngraph_bridge/ngraph_prefetch_dataset_op.cc @@ -415,14 +415,15 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { ngraph_bridge::NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); if (s.ok()) { - ngraph::Event evt_dev_cp("Prf Dev Copy", "Copy", ""); shared_data->SetBufferDepth(m_buffer_size); auto ng_input_tensor_bundle = shared_data->GetNextIOTensorBundleForDeviceTransfer(); auto ng_prefetch_input_indexes_map = shared_data->GetPrefetchInputIndexesMap(); - + ngraph::Event evt_dev_cp( + "Prf Dev Copy: Pipe_Ind_" + to_string(ng_input_tensor_bundle.Id), + "Copy", ""); int number_of_buffer_elements = buffer_element.value.size(); if (number_of_buffer_elements != ng_prefetch_input_indexes_map.size()) { @@ -433,7 +434,8 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { "encap " + to_string(ng_prefetch_input_indexes_map.size())); } - + std::vector> + prefetch_input_write_events; // Write to these tensors for (auto itr : ng_prefetch_input_indexes_map) { int ng_index = itr.first; @@ -445,6 +447,8 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { void* current_src_ptr = (void*)DMAHelper::base(&buffer_element.value[tf_index]); + std::unique_ptr event_copy_h2d(new ngraph::Event( + "H2D_PrefetchInput_" + std::to_string(tf_index), "Copy", "")); try { NGRAPH_VLOG(2) << "[PREFETCH] INPUT tensor being written by Prefetch: " @@ -459,6 +463,12 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { throw std::runtime_error( "Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + prefetch_input_write_events.push_back(std::move(event_copy_h2d)); + } + + for (auto& next : prefetch_input_write_events) { + ngraph::Event::write_trace(*next.get()); } // Now add them back to the other queue diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 518ae96fe..116c213ec 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -42,7 +42,6 @@ NGraphTensorManager::NGraphTensorManager(const string ng_encap_node_name, void NGraphTensorManager::Initialize() { #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) - // input variables book-keeping for (int index = 0; index < m_number_of_inputs; index++) { if (NGraphCatalog::ExistsInInputVariableSharedNameMap( @@ -86,6 +85,17 @@ void NGraphTensorManager::Initialize() { m_output_indexes_that_need_copy.push_back(index); } } + + // For graphs that were run through AOT + // Graph rewrite is 
not done, and there is no entry in catalog + // If there is no entry in catalog all outputs need to be copied + if (!NGraphCatalog::EncapOutputNeedsCopy(m_ng_encap_graph_id, + m_ng_encap_node_name)) { + m_output_indexes_that_need_copy.resize(m_number_of_outputs); + iota(begin(m_output_indexes_that_need_copy), + end(m_output_indexes_that_need_copy), 0); + } + #else m_output_indexes_that_need_copy.resize(m_number_of_outputs); iota(begin(m_output_indexes_that_need_copy), @@ -140,6 +150,40 @@ void NGraphTensorManager::Initialize() { FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); } +//--------------------------------------------------------------------------- +// NGraphTensorManager::Print +//--------------------------------------------------------------------------- +void NGraphTensorManager::Print() { + auto PrintVector = [](const vector& input_vector, const string title) { + cout << title << endl; + cout << ng::join(input_vector) << endl; + }; + + cout << "** NGEncapsulate TensorManager:" << m_ng_encap_node_name << " **" + << endl; + + cout << "** Variables Related **" << endl; + PrintVector(m_input_indexes_from_variables, "Input Indexes from Variables"); + PrintVector(m_output_indexes_assigning_variable, + "Output Indexes Referring to Variables"); + PrintVector(m_output_indexes_that_need_copy, "Output Indexes to be Read"); + + cout << "** Pipelined **" << endl; + PrintVector(m_pipelined_input_indexes, "Pipelined Input Indexes"); + PrintVector(m_pipelined_output_indexes, "Pipelined Output Indexes"); + + cout << "** Prefetched **" << endl; + PrintVector(m_prefetched_input_indexes, "Prefetched Input Indexes"); + PrintVector(m_pipelined_not_prefetched_input_indexes, + "Pipelined But Not Prefetched Input Indexes"); + + cout << "** Prefetched wrt pipelined indexes **" << endl; + PrintVector(m_pipelined_input_indexes_that_are_prefetched, + "Prefetched Input Indexes wrt Pipelined Inputs"); + PrintVector(m_pipelined_input_indexes_that_are_not_prefetched, + "Not Prefetched Input Indexes wrt Pipelined Inputs"); +} + //--------------------------------------------------------------------------- // NGraphTensorManager::~NGraphTensorManager //--------------------------------------------------------------------------- diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index 9143241fb..73f2ca9d4 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -109,6 +109,8 @@ class NGraphTensorManager { Status GetOutputVariableCopyToTF(const int& output_index, bool* output_var_copy_to_tf); + void Print(); + private: void Initialize(); string m_ng_encap_node_name; diff --git a/ngraph_bridge/ngraph_tracked_variable.cc b/ngraph_bridge/ngraph_tracked_variable.cc index bf277b6c1..22b1e584e 100644 --- a/ngraph_bridge/ngraph_tracked_variable.cc +++ b/ngraph_bridge/ngraph_tracked_variable.cc @@ -60,7 +60,6 @@ class NGraphVar : public ResourceBase { private: mutex mu_; Tensor tensor_; - ~NGraphVar() override {} }; @@ -108,7 +107,7 @@ NGraphVariableOp::~NGraphVariableOp() { tracker_->Unref(); } void NGraphVariableOp::Compute(OpKernelContext* ctx) { mutex_lock l(init_mu_); std::ostringstream oss; - oss << "NGraphVariable: " << my_instance_id << ": " << name(); + oss << "NGVariable::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); if (!initialized_) { @@ -182,6 +181,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { ctx->record_persistent_memory_allocation(var->tensor()->AllocatedBytes()); 
} var->Unref(); + event_compute.Stop(); ngraph::Event::write_trace(event_compute); } diff --git a/ngraph_bridge/enable_variable_ops/ngraph_var.cc b/ngraph_bridge/ngraph_var.cc similarity index 98% rename from ngraph_bridge/enable_variable_ops/ngraph_var.cc rename to ngraph_bridge/ngraph_var.cc index efab9e7c0..1fa6001bf 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_var.cc +++ b/ngraph_bridge/ngraph_var.cc @@ -24,10 +24,10 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_var.h b/ngraph_bridge/ngraph_var.h similarity index 100% rename from ngraph_bridge/enable_variable_ops/ngraph_var.h rename to ngraph_bridge/ngraph_var.h diff --git a/test/graph_rewrites/test_ng_var_update_ng_tensor.cc b/test/graph_rewrites/test_ng_var_update_ng_tensor.cc index 0af2c7a57..924c54266 100644 --- a/test/graph_rewrites/test_ng_var_update_ng_tensor.cc +++ b/test/graph_rewrites/test_ng_var_update_ng_tensor.cc @@ -23,10 +23,10 @@ #include "tensorflow/core/platform/test.h" #include "logging/tf_graph_writer.h" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_rewrite_for_tracking.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #include "test/test_utilities.h" namespace tensorflow { diff --git a/test/python/test_flib.py b/test/python/test_flib.py index 079e34449..f0c9b5b59 100644 --- a/test/python/test_flib.py +++ b/test/python/test_flib.py @@ -46,6 +46,7 @@ def test_flib_1(self): res1 = self.with_ngraph(sess_fn) res2 = self.without_ngraph(sess_fn) + exp = [np.full((2, 3), 3.0), np.full((2, 3), 0.95257413)] # Note both run on Host (because NgraphEncapsulate can only run on host) assert np.isclose(res1, res2).all() diff --git a/test/test_ng_var_update_ng_tensor_kernel.cc b/test/test_ng_var_update_ng_tensor_kernel.cc index 51742fcc9..4612d156b 100644 --- a/test/test_ng_var_update_ng_tensor_kernel.cc +++ b/test/test_ng_var_update_ng_tensor_kernel.cc @@ -30,9 +30,9 @@ #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #include "test/test_utilities.h" #include "test/tf_fake_input.h" diff --git a/tools/test_utils.py b/tools/test_utils.py index 12f2ead29..e8f5752d3 100755 --- a/tools/test_utils.py +++ b/tools/test_utils.py @@ -108,7 +108,7 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir = os.path.abspath(build_dir) venv_dir = os.path.abspath(venv_dir) mnist_dir = os.path.abspath(build_dir + '/examples/mnist/') - + axpy_dir = os.path.abspath(build_dir + '/examples/') test_dir = os.path.join(build_dir, "test") test_dir = os.path.join(test_dir, "python") @@ -130,7 +130,8 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir) + " --ignore=" + build_dir + "/test/python/bfloat16" env = os.environ.copy() new_paths = venv_dir + '/bin/python3:' + os.path.abspath( - build_dir) + ":" + os.path.abspath(mnist_dir) + build_dir) + ":" + 
os.path.abspath(axpy_dir) + ":" + os.path.abspath( + mnist_dir) if 'PYTHONPATH' in env: env["PYTHONPATH"] = new_paths + ":" + env["PYTHONPATH"] else: From 3ffb02e0197efd33f68db8ff63aa394d61a883d9 Mon Sep 17 00:00:00 2001 From: sindhu-nervana Date: Fri, 20 Dec 2019 12:06:18 -0800 Subject: [PATCH 04/22] disable the test --- test/python/test_bfloat16.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 82197a479..13ec3e4fb 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -28,9 +28,13 @@ from common import NgraphTest +#This test is just a sample test to test bf16 dtype +#This fails, should enable and expand once CPU backend adds bfloat16 support + class TestMatmulBfloat16(NgraphTest): + @pytest.mark.skip(reason="CPU backend does not support dtype bf16") def test_matmul_bfloat16(self): a = tf.placeholder(tf.bfloat16, [2, 3], name='a') x = tf.placeholder(tf.bfloat16, [3, 4], name='x') @@ -41,6 +45,4 @@ def test_matmul_bfloat16(self): def run_test(sess): return sess.run((out,), feed_dict={a: a_inp, x: x_inp}) - # import pdb - # pdb.set_trace() assert self.with_ngraph(run_test) == self.without_ngraph(run_test) From 367d3db4008d1cfe003c6341e1a2e0f96a3e5218 Mon Sep 17 00:00:00 2001 From: kanvi-nervana Date: Fri, 20 Dec 2019 13:27:28 -0800 Subject: [PATCH 05/22] Kanvi/Add asserts in some python tests (#398) --- test/python/test_sigmoid.py | 3 ++- test/python/test_sign.py | 10 ++++++---- test/python/test_softmax.py | 15 +++++++++------ test/python/test_stack.py | 3 ++- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/test/python/test_sigmoid.py b/test/python/test_sigmoid.py index c5f8c1470..91b90f016 100644 --- a/test/python/test_sigmoid.py +++ b/test/python/test_sigmoid.py @@ -52,4 +52,5 @@ def test_sigmoid(self): y: y_np, z: z_np }) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) diff --git a/test/python/test_sign.py b/test/python/test_sign.py index 781674960..c53ba26bf 100644 --- a/test/python/test_sign.py +++ b/test/python/test_sign.py @@ -35,8 +35,9 @@ def test_sign_1d(self, test_input, expected): val = tf.placeholder(tf.float32, shape=(1,)) out = tf.sign(val) sess_fn = lambda sess: sess.run((out,), feed_dict={val: (test_input,)}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) def test_sign_2d(self): test_input = ((1.5, -2.5, -3.5), (-4.5, 5.5, 0)) @@ -44,5 +45,6 @@ def test_sign_2d(self): val = tf.placeholder(tf.float32, shape=(2, 3)) out = tf.sign(val) sess_fn = lambda sess: sess.run((out,), feed_dict={val: test_input}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) diff --git a/test/python/test_softmax.py b/test/python/test_softmax.py index 87fcb0d84..2a8cbae94 100644 --- a/test/python/test_softmax.py +++ b/test/python/test_softmax.py @@ -43,8 +43,9 @@ def test_softmax_2d(self): expected = a_np a = tf.nn.softmax(x) sess_fn = lambda sess: sess.run((a), feed_dict={x: x_np}) - np.allclose(self.with_ngraph(sess_fn), 
self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) def test_softmax_3d(self): x = tf.placeholder(tf.float32, shape=(2, 3, 2)) @@ -59,8 +60,9 @@ def test_softmax_3d(self): expected = a_np a = tf.nn.softmax(x) sess_fn = lambda sess: sess.run((a), feed_dict={x: x_np}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) def test_softmax_4d(self): x = tf.placeholder(tf.float32, shape=(2, 3, 2, 4)) @@ -75,5 +77,6 @@ def test_softmax_4d(self): expected = a_np a = tf.nn.softmax(x) sess_fn = lambda sess: sess.run((a), feed_dict={x: x_np}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) diff --git a/test/python/test_stack.py b/test/python/test_stack.py index ee5f0c8e5..b44ae4d34 100644 --- a/test/python/test_stack.py +++ b/test/python/test_stack.py @@ -50,4 +50,5 @@ def test_stack(self, shapes, axis): a = tf.stack(placeholders, axis) sess_fn = lambda sess: sess.run( [a], feed_dict={p: v for p, v in zip(placeholders, values)}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) From 4cfb27f680dfb0d64715a66e91f9cb8c06fbe288 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 21 Jan 2020 14:02:08 -0800 Subject: [PATCH 06/22] added test --- test/python/test_bfloat16.py | 75 +++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 13ec3e4fb..6c9562f88 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -31,18 +31,73 @@ #This test is just a sample test to test bf16 dtype #This fails, should enable and expand once CPU backend adds bfloat16 support +np.random.seed(5) -class TestMatmulBfloat16(NgraphTest): +class TestBfloat16(NgraphTest): - @pytest.mark.skip(reason="CPU backend does not support dtype bf16") - def test_matmul_bfloat16(self): - a = tf.placeholder(tf.bfloat16, [2, 3], name='a') - x = tf.placeholder(tf.bfloat16, [3, 4], name='x') - a_inp = np.random.rand(2, 3) - x_inp = np.random.rand(3, 4) - out = tf.matmul(a, x) + def test_conv2d_cast_bfloat16(self): + # inputs + input_shape_nhwc = (32, 28, 28, 3) + filter_shape_hwio = (3, 3, 3, 16) + input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp") + filter_shape_pl = tf.placeholder(tf.float32, filter_shape_hwio, name = "out") + input_values = np.random.rand(*input_shape_nhwc) + filter_values = np.random.rand(*filter_shape_hwio) + + # cast to bloat + input_cast = tf.cast(input_pl, dtype=tf.bfloat16) + filter_cast = tf.cast(filter_values, dtype=tf.bfloat16) + padding = "VALID" + strides = [1, 1, 1, 1] + out = tf.nn.conv2d( + input_cast, + filter_cast, + strides, + padding, + data_format='NHWC', + dilations=None, + name=None) def run_test(sess): - return sess.run((out,), feed_dict={a: a_inp, x: x_inp}) + return sess.run((out,), + feed_dict={ + input_pl: input_values, + filter_shape_pl: filter_values + }) - assert 
self.with_ngraph(run_test) == self.without_ngraph(run_test) + out_val = self.with_ngraph(run_test) + print(out_val) + + def test_conv2d_cast_bfloat16(self): + # inputs + input_shape_nhwc = (32, 28, 28, 3) + filter_shape_hwio = (3, 3, 3, 16) + input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp") + filter_shape_pl = tf.placeholder(tf.float32, filter_shape_hwio, name = "out") + input_values = np.random.rand(*input_shape_nhwc) + filter_values = np.random.rand(*filter_shape_hwio) + + # cast to bloat + input_cast = tf.cast(input_pl, dtype=tf.bfloat16) + filter_cast = tf.cast(filter_values, dtype=tf.bfloat16) + padding = "VALID" + strides = [1, 1, 1, 1] + out = tf.nn.conv2d( + input_cast, + filter_cast, + strides, + padding, + data_format='NHWC', + dilations=None, + name=None) + + def run_test(sess): + return sess.run((out,), + feed_dict={ + input_pl: input_values, + filter_shape_pl: filter_values + }) + + out_val = self.with_ngraph(run_test) + print(out_val) + #assert self.with_ngraph(run_test) == self.without_ngraph(run_test) From 266b24a2bdd4c98a816bd322e91300d81b892f30 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 21 Jan 2020 18:00:50 -0800 Subject: [PATCH 07/22] changes --- ngraph_bridge/ngraph_builder.cc | 1 + ngraph_bridge/ngraph_encapsulate_clusters.cc | 2 +- test/python/test_bfloat16.py | 68 ++++++++------------ 3 files changed, 28 insertions(+), 43 deletions(-) diff --git a/ngraph_bridge/ngraph_builder.cc b/ngraph_bridge/ngraph_builder.cc index 0d7ee3eb6..abbbec659 100644 --- a/ngraph_bridge/ngraph_builder.cc +++ b/ngraph_bridge/ngraph_builder.cc @@ -1012,6 +1012,7 @@ static Status TranslateCastOp(const Node* op, const std::vector&, DataType dtype; TF_RETURN_IF_ERROR(GetNodeAttr(op->attrs(), "DstT", &dtype)); + cout << "data type " << DataType_Name(dtype) << endl; ng::element::Type ng_et; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType(dtype, &ng_et)); diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index d9e506894..6fac8c473 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -380,7 +380,7 @@ Status Encapsulator::AnalysisPass() { << " but another node with assigned device " << it->second << " has already been seen in the same cluster"; - return errors::Internal(ss_err.str()); + // return errors::Internal(ss_err.str()); } } else { NGRAPH_VLOG(3) << "setting cluster " << cluster_idx diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 6c9562f88..e80ab7548 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -33,56 +33,40 @@ np.random.seed(5) + class TestBfloat16(NgraphTest): - def test_conv2d_cast_bfloat16(self): - # inputs - input_shape_nhwc = (32, 28, 28, 3) - filter_shape_hwio = (3, 3, 3, 16) - input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp") - filter_shape_pl = tf.placeholder(tf.float32, filter_shape_hwio, name = "out") - input_values = np.random.rand(*input_shape_nhwc) - filter_values = np.random.rand(*filter_shape_hwio) - - # cast to bloat - input_cast = tf.cast(input_pl, dtype=tf.bfloat16) - filter_cast = tf.cast(filter_values, dtype=tf.bfloat16) - padding = "VALID" - strides = [1, 1, 1, 1] - out = tf.nn.conv2d( - input_cast, - filter_cast, - strides, - padding, - data_format='NHWC', - dilations=None, - name=None) + @pytest.mark.skip(reason="CPU backend does not support dtype bf16") + def test_matmul_bfloat16(self): + a = tf.placeholder(tf.bfloat16, [2, 3], name='a') + 
x = tf.placeholder(tf.bfloat16, [3, 4], name='x') + a_inp = np.random.rand(2, 3) + x_inp = np.random.rand(3, 4) + out = tf.matmul(a, x) def run_test(sess): - return sess.run((out,), - feed_dict={ - input_pl: input_values, - filter_shape_pl: filter_values - }) + return sess.run((out,), feed_dict={a: a_inp, x: x_inp}) - out_val = self.with_ngraph(run_test) - print(out_val) + assert self.with_ngraph(run_test) == self.without_ngraph(run_test) def test_conv2d_cast_bfloat16(self): # inputs - input_shape_nhwc = (32, 28, 28, 3) - filter_shape_hwio = (3, 3, 3, 16) - input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp") - filter_shape_pl = tf.placeholder(tf.float32, filter_shape_hwio, name = "out") - input_values = np.random.rand(*input_shape_nhwc) - filter_values = np.random.rand(*filter_shape_hwio) - + input_shape_nhwc = (1, 8, 8, 1) + filter_shape_hwio = (3, 3, 1, 2) + input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") + filter_shape_pl = tf.placeholder( + tf.float32, filter_shape_hwio, name="filter_pl") + input_values = np.arange(64).reshape( + input_shape_nhwc) #np.random.rand(*input_shape_nhwc) + filter_values = np.arange(18).reshape( + filter_shape_hwio) # np.random.rand(*filter_shape_hwio) + print(filter_values) # cast to bloat input_cast = tf.cast(input_pl, dtype=tf.bfloat16) - filter_cast = tf.cast(filter_values, dtype=tf.bfloat16) + filter_cast = tf.cast(filter_shape_pl, dtype=tf.bfloat16) padding = "VALID" strides = [1, 1, 1, 1] - out = tf.nn.conv2d( + conv_op = tf.nn.conv2d( input_cast, filter_cast, strides, @@ -90,14 +74,14 @@ def test_conv2d_cast_bfloat16(self): data_format='NHWC', dilations=None, name=None) + out = tf.cast(conv_op, dtype=tf.float32) def run_test(sess): - return sess.run((out,), + return sess.run((conv_op,), feed_dict={ input_pl: input_values, filter_shape_pl: filter_values }) - out_val = self.with_ngraph(run_test) - print(out_val) - #assert self.with_ngraph(run_test) == self.without_ngraph(run_test) + assert np.allclose( + self.with_ngraph(run_test), self.without_ngraph(run_test)) From 062a3c3adc93c035ba4dee70d9e9eb4997c9975e Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 24 Jan 2020 11:12:09 -0800 Subject: [PATCH 08/22] added another test --- test/python/test_bfloat16.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index e80ab7548..060ba365e 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -60,7 +60,6 @@ def test_conv2d_cast_bfloat16(self): input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) - print(filter_values) # cast to bloat input_cast = tf.cast(input_pl, dtype=tf.bfloat16) filter_cast = tf.cast(filter_shape_pl, dtype=tf.bfloat16) @@ -85,3 +84,36 @@ def run_test(sess): assert np.allclose( self.with_ngraph(run_test), self.without_ngraph(run_test)) + + def test_conv2d_bfloat16(self): + # inputs + input_shape_nhwc = (1, 8, 8, 1) + filter_shape_hwio = (3, 3, 1, 2) + input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") + filter_shape_pl = tf.placeholder( + tf.bfloat16, filter_shape_hwio, name="filter_pl") + input_values = np.arange(64).reshape( + input_shape_nhwc) #np.random.rand(*input_shape_nhwc) + filter_values = np.arange(18).reshape( + filter_shape_hwio) # np.random.rand(*filter_shape_hwio) + + padding = "VALID" + strides = [1, 1, 1, 1] + conv_op = 
tf.nn.conv2d( + input_pl, + filter_shape_pl, + strides, + padding, + data_format='NHWC', + dilations=None, + name=None) + + def run_test(sess): + return sess.run((conv_op,), + feed_dict={ + input_pl: input_values, + filter_shape_pl: filter_values + }) + + assert np.allclose( + self.with_ngraph(run_test), self.without_ngraph(run_test)) From f00e298e8ce39808148b719380eb18a30a159edd Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 24 Jan 2020 11:44:32 -0800 Subject: [PATCH 09/22] added another bfloat test. encapsulate always assigned device CPU --- ngraph_bridge/ngraph_encapsulate_clusters.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index 6fac8c473..d6a89722e 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -717,7 +717,7 @@ Status Encapsulator::RewritePass( } Status status = nb.Finalize(graph, &n); TF_RETURN_IF_ERROR(status); - n->set_assigned_device_name(device_name_map[cluster_idx]); + n->set_assigned_device_name("/job:localhost/replica:0/task:0/device:CPU:0"); cluster_node_map[cluster_idx] = n; } From 0a4ffdd796cf178909bd23f3f2e4bbf9ce3fc4ac Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 24 Jan 2020 12:38:33 -0800 Subject: [PATCH 10/22] removed couts, rearranged the tests --- ngraph_bridge/ngraph_builder.cc | 1 - test/python/test_bfloat16.py | 38 ++++++++++++++++----------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/ngraph_bridge/ngraph_builder.cc b/ngraph_bridge/ngraph_builder.cc index f6ea2c78e..bfecd7c33 100644 --- a/ngraph_bridge/ngraph_builder.cc +++ b/ngraph_bridge/ngraph_builder.cc @@ -1012,7 +1012,6 @@ static Status TranslateCastOp(const Node* op, const std::vector&, DataType dtype; TF_RETURN_IF_ERROR(GetNodeAttr(op->attrs(), "DstT", &dtype)); - cout << "data type " << DataType_Name(dtype) << endl; ng::element::Type ng_et; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType(dtype, &ng_et)); diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 060ba365e..5cc3b8266 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -28,15 +28,13 @@ from common import NgraphTest -#This test is just a sample test to test bf16 dtype -#This fails, should enable and expand once CPU backend adds bfloat16 support - np.random.seed(5) class TestBfloat16(NgraphTest): - @pytest.mark.skip(reason="CPU backend does not support dtype bf16") + @pytest.mark.skip( + reason="CPU backend does not support dtype bf16 for MatMul/Dot Op") def test_matmul_bfloat16(self): a = tf.placeholder(tf.bfloat16, [2, 3], name='a') x = tf.placeholder(tf.bfloat16, [3, 4], name='x') @@ -49,31 +47,28 @@ def run_test(sess): assert self.with_ngraph(run_test) == self.without_ngraph(run_test) - def test_conv2d_cast_bfloat16(self): + def test_conv2d_bfloat16(self): # inputs input_shape_nhwc = (1, 8, 8, 1) filter_shape_hwio = (3, 3, 1, 2) - input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") + input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") filter_shape_pl = tf.placeholder( - tf.float32, filter_shape_hwio, name="filter_pl") + tf.bfloat16, filter_shape_hwio, name="filter_pl") input_values = np.arange(64).reshape( input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) - # cast to bloat - input_cast = tf.cast(input_pl, dtype=tf.bfloat16) - 
filter_cast = tf.cast(filter_shape_pl, dtype=tf.bfloat16) + padding = "VALID" strides = [1, 1, 1, 1] conv_op = tf.nn.conv2d( - input_cast, - filter_cast, + input_pl, + filter_shape_pl, strides, padding, data_format='NHWC', dilations=None, name=None) - out = tf.cast(conv_op, dtype=tf.float32) def run_test(sess): return sess.run((conv_op,), @@ -85,31 +80,34 @@ def run_test(sess): assert np.allclose( self.with_ngraph(run_test), self.without_ngraph(run_test)) - def test_conv2d_bfloat16(self): + def test_conv2d_cast_bfloat16(self): # inputs input_shape_nhwc = (1, 8, 8, 1) filter_shape_hwio = (3, 3, 1, 2) - input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") + input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") filter_shape_pl = tf.placeholder( - tf.bfloat16, filter_shape_hwio, name="filter_pl") + tf.float32, filter_shape_hwio, name="filter_pl") input_values = np.arange(64).reshape( input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) - + # cast to bloat + input_cast = tf.cast(input_pl, dtype=tf.bfloat16) + filter_cast = tf.cast(filter_shape_pl, dtype=tf.bfloat16) padding = "VALID" strides = [1, 1, 1, 1] conv_op = tf.nn.conv2d( - input_pl, - filter_shape_pl, + input_cast, + filter_cast, strides, padding, data_format='NHWC', dilations=None, name=None) + out = tf.cast(conv_op, dtype=tf.float32) def run_test(sess): - return sess.run((conv_op,), + return sess.run((out,), feed_dict={ input_pl: input_values, filter_shape_pl: filter_values From 80c46f8a365122ba30534e64158beef8da373430 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 24 Jan 2020 17:59:50 -0800 Subject: [PATCH 11/22] device checks --- ngraph_bridge/ngraph_encapsulate_clusters.cc | 72 +++++++++++--------- ngraph_bridge/ngraph_encapsulate_clusters.h | 2 + ngraph_bridge/ngraph_encapsulate_op.cc | 4 ++ 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index d6a89722e..276b7ae35 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -345,22 +345,16 @@ Status PerformAOTOnEncapsulates(Graph* graph, const AOTInfo& aot_info) { Encapsulator::Encapsulator(Graph* g) : graph(g), analysis_done(false), rewrite_done(false) {} -Status Encapsulator::AnalysisPass() { - if (rewrite_done) { - return errors::Internal( - "In Encapsulator, AnalysisPass called after RewritePass was already " - "done"); - } +// Finds the Device and Backend that needs to +// be assigned to each cluster (NGraphEncapsulateOp) +// And stores it into the device_name_map and backend_name_map +Status Encapsulator::AssignClusterDeviceAndBackend() { + string DEVICE_CPU = "CPU"; + string DEVICE_XLA_CPU = "XLA_CPU"; + set allowed_device_types = {DEVICE_CPU, DEVICE_XLA_CPU}; - if (analysis_done) { - return errors::Internal( - "In Encapsulator, AnalysisPass called more than once"); - } - // Pass 1: Populate the cluster-index-to-device name map for each existing - // cluster. 
PIGGYBACKING BACKEND TEST HERE, THEY WILL GET COMBINED INTO ONE for (auto node : graph->op_nodes()) { int cluster_idx; - if (GetNodeCluster(node, &cluster_idx) != Status::OK()) { continue; } @@ -370,33 +364,32 @@ Status Encapsulator::AnalysisPass() { continue; } - auto it = device_name_map.find(cluster_idx); - - if (it != device_name_map.end()) { - if (it->second != node->assigned_device_name()) { - std::stringstream ss_err; - ss_err << "Node " << node->name() << " in cluster " << cluster_idx - << " has assigned device " << node->assigned_device_name() - << " but another node with assigned device " << it->second - << " has already been seen in the same cluster"; - - // return errors::Internal(ss_err.str()); - } + DeviceNameUtils::ParsedName parsed; + if (!DeviceNameUtils::ParseFullName(node->assigned_device_name(), + &parsed)) { + return errors::Internal("Could not parse the device name ", + node->assigned_device_name(), + " assigned to node ", node->name()); } else { - NGRAPH_VLOG(3) << "setting cluster " << cluster_idx - << " requested device to '" << node->assigned_device_name() - << "'"; - device_name_map[cluster_idx] = node->assigned_device_name(); + if (allowed_device_types.find(parsed.type) == + allowed_device_types.end()) { + return errors::Internal("Node ", node->name(), " assigned cluster ", + cluster_idx, " has been assigned device ", + node->assigned_device_name(), + " which is not supported."); + } else { + device_name_map[cluster_idx] = node->assigned_device_name(); + } } + // backend auto itr = backend_name_map.find(cluster_idx); - if (itr != backend_name_map.end()) { if (itr->second != node_backend) { std::stringstream ss_err; ss_err << "Node " << node->name() << " in cluster " << cluster_idx << " has assigned backend " << node_backend - << " but another node with assigned backend " << it->second + << " but another node with assigned backend " << itr->second << " has already been seen in the same cluster"; return errors::Internal(ss_err.str()); @@ -407,6 +400,23 @@ Status Encapsulator::AnalysisPass() { backend_name_map[cluster_idx] = node_backend; } } + return Status::OK(); +} + +Status Encapsulator::AnalysisPass() { + if (rewrite_done) { + return errors::Internal( + "In Encapsulator, AnalysisPass called after RewritePass was already " + "done"); + } + + if (analysis_done) { + return errors::Internal( + "In Encapsulator, AnalysisPass called more than once"); + } + // Pass 1: Populate the cluster-index-to-device name map for each existing + // cluster. PIGGYBACKING BACKEND TEST HERE, THEY WILL GET COMBINED INTO ONE + TF_RETURN_IF_ERROR(AssignClusterDeviceAndBackend()); // Pass 2: Find all nodes that are feeding into/out of each cluster, and // add inputs for them to the corresponding FunctionDef(s). 
diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.h b/ngraph_bridge/ngraph_encapsulate_clusters.h index a4fe2adec..fb0ca0fbd 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.h +++ b/ngraph_bridge/ngraph_encapsulate_clusters.h @@ -116,6 +116,8 @@ class Encapsulator { std::set cluster_indices_for_this_graph; static void AddInput(NodeDef* dst, StringPiece src_name, int src_slot); + + Status AssignClusterDeviceAndBackend(); }; // Translates TF subgraph to ng function then compiles it diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 4605757ae..fe8f7b784 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -28,6 +28,8 @@ #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_constructor.h" +// #include "tensorflow/compiler/tf2xla/xla_op_registry.h" //:DEVICE_XLA_CPU #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" @@ -1044,5 +1046,7 @@ int NGraphEncapsulateImpl::s_instance_count = 0; REGISTER_KERNEL_BUILDER(Name("NGraphEncapsulate").Device(DEVICE_CPU), ngraph_bridge::NGraphEncapsulateOp); +REGISTER_KERNEL_BUILDER(Name("NGraphEncapsulate").Device("XLA_CPU"), + ngraph_bridge::NGraphEncapsulateOp); } // namespace tensorflow From eb145c77046de732c0318d39bf1e1cc7d69f18dd Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 13:34:10 -0800 Subject: [PATCH 12/22] fix by registering dummy bfloat kernel --- .../ngraph_variable_modifiers.cc | 31 ++++++++ ngraph_bridge/ngraph_encapsulate_clusters.cc | 78 ++++++++----------- ngraph_bridge/ngraph_encapsulate_clusters.h | 6 +- ngraph_bridge/ngraph_encapsulate_op.cc | 2 - test/python/test_bfloat16.py | 23 +++--- 5 files changed, 81 insertions(+), 59 deletions(-) diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc index 376a596a9..3998c2e5c 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc @@ -161,6 +161,37 @@ class NGraphAssignAddOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("NGraphAssignAdd").Device(DEVICE_CPU), NGraphAssignAddOp); +/* ------------------------------------------------- +// +// NGraphConv2DOp +// +---------------------------------------------------*/ + +class NGConv2DOp : public OpKernel { + public: + explicit NGConv2DOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES(context, false, + errors::Internal("This constructor should not get called", + name(), "\n")); + } + + void Compute(OpKernelContext* context) override { + OP_REQUIRES( + context, false, + errors::Internal("This kernel should not get called", name(), "\n")); + } + + private: + ~NGConv2DOp() override {} +}; + +// REGISTER_KERNEL_BUILDER(Name("Conv2D").Device(DEVICE_CPU), +// NGraphAssignAddOp); + +REGISTER_KERNEL_BUILDER( + Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), + NGConv2DOp); + } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index 276b7ae35..06d4a23d3 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2017-2019 Intel 
Corporation + * Copyright 2017-2020 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -345,16 +345,22 @@ Status PerformAOTOnEncapsulates(Graph* graph, const AOTInfo& aot_info) { Encapsulator::Encapsulator(Graph* g) : graph(g), analysis_done(false), rewrite_done(false) {} -// Finds the Device and Backend that needs to -// be assigned to each cluster (NGraphEncapsulateOp) -// And stores it into the device_name_map and backend_name_map -Status Encapsulator::AssignClusterDeviceAndBackend() { - string DEVICE_CPU = "CPU"; - string DEVICE_XLA_CPU = "XLA_CPU"; - set allowed_device_types = {DEVICE_CPU, DEVICE_XLA_CPU}; +Status Encapsulator::AnalysisPass() { + if (rewrite_done) { + return errors::Internal( + "In Encapsulator, AnalysisPass called after RewritePass was already " + "done"); + } + if (analysis_done) { + return errors::Internal( + "In Encapsulator, AnalysisPass called more than once"); + } + // Pass 1: Populate the cluster-index-to-device name map for each existing + // cluster. PIGGYBACKING BACKEND TEST HERE, THEY WILL GET COMBINED INTO ONE for (auto node : graph->op_nodes()) { int cluster_idx; + if (GetNodeCluster(node, &cluster_idx) != Status::OK()) { continue; } @@ -364,32 +370,33 @@ Status Encapsulator::AssignClusterDeviceAndBackend() { continue; } - DeviceNameUtils::ParsedName parsed; - if (!DeviceNameUtils::ParseFullName(node->assigned_device_name(), - &parsed)) { - return errors::Internal("Could not parse the device name ", - node->assigned_device_name(), - " assigned to node ", node->name()); - } else { - if (allowed_device_types.find(parsed.type) == - allowed_device_types.end()) { - return errors::Internal("Node ", node->name(), " assigned cluster ", - cluster_idx, " has been assigned device ", - node->assigned_device_name(), - " which is not supported."); - } else { - device_name_map[cluster_idx] = node->assigned_device_name(); + auto it = device_name_map.find(cluster_idx); + + if (it != device_name_map.end()) { + if (it->second != node->assigned_device_name()) { + std::stringstream ss_err; + ss_err << "Node " << node->name() << " in cluster " << cluster_idx + << " has assigned device " << node->assigned_device_name() + << " but another node with assigned device " << it->second + << " has already been seen in the same cluster"; + + return errors::Internal(ss_err.str()); } + } else { + NGRAPH_VLOG(3) << "setting cluster " << cluster_idx + << " requested device to '" << node->assigned_device_name() + << "'"; + device_name_map[cluster_idx] = node->assigned_device_name(); } - // backend auto itr = backend_name_map.find(cluster_idx); + if (itr != backend_name_map.end()) { if (itr->second != node_backend) { std::stringstream ss_err; ss_err << "Node " << node->name() << " in cluster " << cluster_idx << " has assigned backend " << node_backend - << " but another node with assigned backend " << itr->second + << " but another node with assigned backend " << it->second << " has already been seen in the same cluster"; return errors::Internal(ss_err.str()); @@ -400,23 +407,6 @@ Status Encapsulator::AssignClusterDeviceAndBackend() { backend_name_map[cluster_idx] = node_backend; } } - return Status::OK(); -} - -Status Encapsulator::AnalysisPass() { - if (rewrite_done) { - return errors::Internal( - "In Encapsulator, AnalysisPass called after RewritePass was already " - "done"); - } - - if (analysis_done) { - return errors::Internal( - "In Encapsulator, AnalysisPass called more than 
once"); - } - // Pass 1: Populate the cluster-index-to-device name map for each existing - // cluster. PIGGYBACKING BACKEND TEST HERE, THEY WILL GET COMBINED INTO ONE - TF_RETURN_IF_ERROR(AssignClusterDeviceAndBackend()); // Pass 2: Find all nodes that are feeding into/out of each cluster, and // add inputs for them to the corresponding FunctionDef(s). @@ -727,7 +717,7 @@ Status Encapsulator::RewritePass( } Status status = nb.Finalize(graph, &n); TF_RETURN_IF_ERROR(status); - n->set_assigned_device_name("/job:localhost/replica:0/task:0/device:CPU:0"); + n->set_assigned_device_name(device_name_map[cluster_idx]); cluster_node_map[cluster_idx] = n; } @@ -1013,4 +1003,4 @@ Status PerformTranslation(Node* node, const std::map>& } // namespace ngraph_bridge -} // namespace tensorflow +} // namespace tensorflow \ No newline at end of file diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.h b/ngraph_bridge/ngraph_encapsulate_clusters.h index fb0ca0fbd..9628848fc 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.h +++ b/ngraph_bridge/ngraph_encapsulate_clusters.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2017-2019 Intel Corporation + * Copyright 2017-2020 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,8 +116,6 @@ class Encapsulator { std::set cluster_indices_for_this_graph; static void AddInput(NodeDef* dst, StringPiece src_name, int src_slot); - - Status AssignClusterDeviceAndBackend(); }; // Translates TF subgraph to ng function then compiles it @@ -149,4 +147,4 @@ Status PerformTranslation(Node* node, } // namespace ngraph_bridge } // namespace tensorflow -#endif // NGRAPH_TF_BRIDGE_ENCAPSULATE_CLUSTERS_H_ +#endif // NGRAPH_TF_BRIDGE_ENCAPSULATE_CLUSTERS_H_ \ No newline at end of file diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index fe8f7b784..328e6c03a 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -1046,7 +1046,5 @@ int NGraphEncapsulateImpl::s_instance_count = 0; REGISTER_KERNEL_BUILDER(Name("NGraphEncapsulate").Device(DEVICE_CPU), ngraph_bridge::NGraphEncapsulateOp); -REGISTER_KERNEL_BUILDER(Name("NGraphEncapsulate").Device("XLA_CPU"), - ngraph_bridge::NGraphEncapsulateOp); } // namespace tensorflow diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 5cc3b8266..078773782 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -25,7 +25,7 @@ import tensorflow as tf import os - +import sys from common import NgraphTest np.random.seed(5) @@ -48,13 +48,14 @@ def run_test(sess): assert self.with_ngraph(run_test) == self.without_ngraph(run_test) def test_conv2d_bfloat16(self): + # inputs - input_shape_nhwc = (1, 8, 8, 1) + input_shape_nhwc = (1, 4, 4, 1) filter_shape_hwio = (3, 3, 1, 2) input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") filter_shape_pl = tf.placeholder( tf.bfloat16, filter_shape_hwio, name="filter_pl") - input_values = np.arange(64).reshape( + input_values = np.arange(16).reshape( input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) @@ -77,17 +78,19 @@ def run_test(sess): filter_shape_pl: filter_values }) - assert np.allclose( - self.with_ngraph(run_test), self.without_ngraph(run_test)) + ng_val = self.with_ngraph(run_test) + 
expected_val = np.reshape( + np.array([516, 560, 588, 640, 804, 884, 876, 968]), (1, 2, 2, 2)) + assert np.allclose(ng_val, expected_val) def test_conv2d_cast_bfloat16(self): # inputs - input_shape_nhwc = (1, 8, 8, 1) + input_shape_nhwc = (1, 4, 4, 1) filter_shape_hwio = (3, 3, 1, 2) input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") filter_shape_pl = tf.placeholder( tf.float32, filter_shape_hwio, name="filter_pl") - input_values = np.arange(64).reshape( + input_values = np.arange(16).reshape( input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) @@ -113,5 +116,7 @@ def run_test(sess): filter_shape_pl: filter_values }) - assert np.allclose( - self.with_ngraph(run_test), self.without_ngraph(run_test)) + ng_val = self.with_ngraph(run_test) + expected_val = np.reshape( + np.array([516, 560, 588, 640, 804, 884, 876, 968]), (1, 2, 2, 2)) + assert np.allclose(ng_val, expected_val) From 5f08083e20ce072b3c74558db57119735257bef0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 13:48:24 -0800 Subject: [PATCH 13/22] hanging include --- ngraph_bridge/ngraph_encapsulate_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index aee7df2a6..977574593 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -29,7 +29,6 @@ #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_constructor.h" -// #include "tensorflow/compiler/tf2xla/xla_op_registry.h" //:DEVICE_XLA_CPU #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" From e50323a2b328127e8a93a33fb5a5339e9f8e158c Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 13:51:37 -0800 Subject: [PATCH 14/22] changes --- ngraph_bridge/ngraph_encapsulate_op.cc | 1 - test/python/test_bfloat16.py | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 977574593..c9e53fac7 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -28,7 +28,6 @@ #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 078773782..f8ab24975 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -48,8 +48,7 @@ def run_test(sess): assert self.with_ngraph(run_test) == self.without_ngraph(run_test) def test_conv2d_bfloat16(self): - - # inputs + # Graph input_shape_nhwc = (1, 4, 4, 1) filter_shape_hwio = (3, 3, 1, 2) input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") @@ -59,7 +58,6 @@ def test_conv2d_bfloat16(self): input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) - padding = "VALID" strides = [1, 1, 1, 1] conv_op = tf.nn.conv2d( @@ -84,7 +82,7 @@ def run_test(sess): assert np.allclose(ng_val, expected_val) def test_conv2d_cast_bfloat16(self): - # inputs + # Graph input_shape_nhwc = (1, 4, 4, 1) filter_shape_hwio = (3, 3, 1, 2) input_pl = 
tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") @@ -107,6 +105,7 @@ def test_conv2d_cast_bfloat16(self): data_format='NHWC', dilations=None, name=None) + # cast to float out = tf.cast(conv_op, dtype=tf.float32) def run_test(sess): From e35892de7187f708c06983499cc8f3fd4839a58f Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 13:53:38 -0800 Subject: [PATCH 15/22] minor --- ngraph_bridge/ngraph_encapsulate_clusters.cc | 2 +- ngraph_bridge/ngraph_encapsulate_clusters.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index 06d4a23d3..178483883 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -1003,4 +1003,4 @@ Status PerformTranslation(Node* node, const std::map>& } // namespace ngraph_bridge -} // namespace tensorflow \ No newline at end of file +} // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.h b/ngraph_bridge/ngraph_encapsulate_clusters.h index 9628848fc..4fb6f00c7 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.h +++ b/ngraph_bridge/ngraph_encapsulate_clusters.h @@ -147,4 +147,4 @@ Status PerformTranslation(Node* node, } // namespace ngraph_bridge } // namespace tensorflow -#endif // NGRAPH_TF_BRIDGE_ENCAPSULATE_CLUSTERS_H_ \ No newline at end of file +#endif // NGRAPH_TF_BRIDGE_ENCAPSULATE_CLUSTERS_H_ From a95c92f04d288dbadbe6708145c58e0cf549a0da Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 17:43:37 -0800 Subject: [PATCH 16/22] Register Stub Kernels --- ngraph_bridge/CMakeLists.txt | 1 + .../ngraph_variable_modifiers.cc | 165 ++---------------- ngraph_bridge/ngraph_register_stub_kernels.cc | 71 ++++++++ ngraph_bridge/ngraph_register_stub_kernels.h | 56 ++++++ 4 files changed, 142 insertions(+), 151 deletions(-) create mode 100644 ngraph_bridge/ngraph_register_stub_kernels.cc create mode 100644 ngraph_bridge/ngraph_register_stub_kernels.h diff --git a/ngraph_bridge/CMakeLists.txt b/ngraph_bridge/CMakeLists.txt index 178d09536..ff85b7843 100644 --- a/ngraph_bridge/CMakeLists.txt +++ b/ngraph_bridge/CMakeLists.txt @@ -53,6 +53,7 @@ set(SRC ngraph_freshness_tracker.cc ngraph_mark_for_clustering.cc ngraph_partial_shapes.cc + ngraph_register_stub_kernels.cc ngraph_rewrite_for_tracking.cc ngraph_rewrite_pass.cc ngraph_tensor_manager.cc diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc index fd9e5ad2c..066f8b18b 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc @@ -33,164 +33,27 @@ #include "ngraph_bridge/ngraph_utils.h" #include "ngraph_bridge/ngraph_var.h" +#include "ngraph_bridge/ngraph_register_stub_kernels.h" + using namespace std; namespace ng = ngraph; namespace tensorflow { namespace ngraph_bridge { -/* ------------------------------------------------- -// -// NGraphApplyMomentumOp -// ----------------------------------------------------*/ - -class NGraphApplyMomentumOp : public OpKernel { - private: - public: - explicit NGraphApplyMomentumOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - //--------------------------------------------------------------------------- - // ~NGraphApplyMomentumOp() - 
//--------------------------------------------------------------------------- - ~NGraphApplyMomentumOp() override {} - - // This will never be called - void Compute(OpKernelContext* context) override { - OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } // end of compute function -}; // end of NGraphApplyGradientDescent class definition - -REGISTER_KERNEL_BUILDER(Name("NGraphApplyMomentum").Device(DEVICE_CPU), - NGraphApplyMomentumOp); -/* ------------------------------------------------- -// -// NGraphApplyGradientDescentOp -// ----------------------------------------------------*/ - -class NGraphApplyGradientDescentOp : public OpKernel { - private: - public: - explicit NGraphApplyGradientDescentOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - //--------------------------------------------------------------------------- - // ~NGraphApplyGradientDescentOp() - //--------------------------------------------------------------------------- - ~NGraphApplyGradientDescentOp() override {} - - // This will never be called - void Compute(OpKernelContext* context) override { - OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } // end of compute function -}; // end of NGraphApplyGradientDescent class definition - -REGISTER_KERNEL_BUILDER(Name("NGraphApplyGradientDescent").Device(DEVICE_CPU), - NGraphApplyGradientDescentOp); - -/* ------------------------------------------------- -// -// NGraphAssignSubOp -// ----------------------------------------------------*/ - -// Computes *input[0] = *input[0] - input[1] -class NGraphAssignSubOp : public OpKernel { - private: - // bool use_exclusive_lock_; //TF op has this - ~NGraphAssignSubOp() override {} - - public: - explicit NGraphAssignSubOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - void Compute(OpKernelContext* context) override { - OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } -}; - -REGISTER_KERNEL_BUILDER(Name("NGraphAssignSub").Device(DEVICE_CPU), - NGraphAssignSubOp); - -/* ------------------------------------------------- -// -// NGraphAssignAddOp -// ----------------------------------------------------*/ - -// Computes *input[0] = *input[0] + input[1] -class NGraphAssignAddOp : public OpKernel { - public: - explicit NGraphAssignAddOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - void Compute(OpKernelContext* context) override { - OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } - - private: - ~NGraphAssignAddOp() override {} -}; - -REGISTER_KERNEL_BUILDER(Name("NGraphAssignAdd").Device(DEVICE_CPU), - NGraphAssignAddOp); - -/* ------------------------------------------------- -// -// NGraphConv2DOp -// ----------------------------------------------------*/ - -class NGConv2DOp : public OpKernel { - public: - explicit NGConv2DOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - void Compute(OpKernelContext* context) override { - 
OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } - - private: - ~NGConv2DOp() override {} -}; - -// REGISTER_KERNEL_BUILDER(Name("Conv2D").Device(DEVICE_CPU), -// NGraphAssignAddOp); -REGISTER_KERNEL_BUILDER( - Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), - NGConv2DOp); +// Register NGraphOptimizers here +// These Optimizer Ops are replaced by a TF computational subgraph +// in ReplaceModifiers Rewrite Pass. Hence, these Stub Kernels/Op will never get +// called + +// Keep them in alphabetical order +REGISTER_NGRAPH_STUB_KERNEL("NGraphApplyGradientDescent"); +REGISTER_NGRAPH_STUB_KERNEL("NGraphApplyMomentum"); +REGISTER_NGRAPH_STUB_KERNEL( + "NGraphAssignAdd"); //*input[0] = *input[0] + input[1] +REGISTER_NGRAPH_STUB_KERNEL( + "NGraphAssignSub"); //*input[0] = *input[0] - input[1] } // namespace ngraph_bridge diff --git a/ngraph_bridge/ngraph_register_stub_kernels.cc b/ngraph_bridge/ngraph_register_stub_kernels.cc new file mode 100644 index 000000000..2fd1f1cd5 --- /dev/null +++ b/ngraph_bridge/ngraph_register_stub_kernels.cc @@ -0,0 +1,71 @@ +/******************************************************************************* + * Copyright 2019-2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "ngraph_bridge/ngraph_register_stub_kernels.h" + +using namespace std; + +namespace tensorflow { + +namespace ngraph_bridge { + +/* ------------------------------------------------- +// +// NGraphStubOp +// +---------------------------------------------------*/ +// Constructor +NGStubOp::NGStubOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES( + context, false, + errors::Internal("The constructor for OpType ", type_string(), + "should not get called. This Op is expected to have " + "been encapsulated or replaced by other ops. Op Name: ", + name(), "\n")); +} +// Compute +void NGStubOp::Compute(OpKernelContext* context) { + OP_REQUIRES( + context, false, + errors::Internal("This kernel for OpType ", type_string(), + "should not get called. This Op is expected to have " + "been encapsulated or replaced by other ops. 
Op Name: ", + name(), "\n")); +} +// Destructor +NGStubOp::~NGStubOp() {} + +/* ------------------------------------------------- */ + +// Register Bfloat Stub Kernels + +// TF Ops that work on bfloat DataType get assigned Device XLA_CPU +// Since nGraph-bridge OPs work on TF DEVICE_CPU we are registering stub float +// kernels here +// The expectation is when we register the stub kernels for bfloat +// TF is going to assign DEVICE_CPU to the respective Ops and we will +// encapsulate them +// These Stub Kernels/Op will never get called + +// Keep them in alphabetical order +REGISTER_NGRAPH_STUB_KERNEL("Conv2D") + +} // namespace ngraph_bridge + +} // namespace tensorflow diff --git a/ngraph_bridge/ngraph_register_stub_kernels.h b/ngraph_bridge/ngraph_register_stub_kernels.h new file mode 100644 index 000000000..12541abb9 --- /dev/null +++ b/ngraph_bridge/ngraph_register_stub_kernels.h @@ -0,0 +1,56 @@ +/******************************************************************************* + * Copyright 2019-2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ +#ifndef NGRAPH_TF_BRIDGE_REGISTER_STUB_KERNELS_H_ +#define NGRAPH_TF_BRIDGE_REGISTER_STUB_KERNELS_H_ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +using namespace std; + +namespace tensorflow { + +namespace ngraph_bridge { + +/* ------------------------------------------------- +// +// NGStubOp +// +---------------------------------------------------*/ + +class NGStubOp : public OpKernel { + public: + explicit NGStubOp(OpKernelConstruction* context); + + void Compute(OpKernelContext* context) override; + + private: + ~NGStubOp() override; +}; + +#define REGISTER_NGRAPH_STUB_KERNEL(optype) \ + REGISTER_KERNEL_BUILDER( \ + Name(optype).Device(DEVICE_CPU).TypeConstraint("T"), \ + NGStubOp); + +#define REGISTER_NGRAPH_STUB_BFLOAT_KERNEL(optype) \ + REGISTER_KERNEL_BUILDER(Name(optype).Device(DEVICE_CPU), NGStubOp); + +} // namespace ngraph_bridge + +} // namespace tensorflow + +#endif // NGRAPH_TF_BRIDGE_REGISTER_STUB_KERNELS_H_ From 5d313e3b7d713086159b26ef159d344ee9ac2d85 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 17:47:38 -0800 Subject: [PATCH 17/22] fix bazel build --- bazel/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bazel/BUILD b/bazel/BUILD index f0674337a..b9e714a52 100644 --- a/bazel/BUILD +++ b/bazel/BUILD @@ -44,6 +44,7 @@ cc_library( "ngraph_bridge/ngraph_partial_shapes.h", "ngraph_bridge/ngraph_prefetch_shared_data.h", "ngraph_bridge/ngraph_pipelined_tensors.h", + "ngraph_bridge/ngraph_register_stub_kernels.h", "ngraph_bridge/ngraph_rewrite_for_tracking.h", "ngraph_bridge/ngraph_tensor_manager.h", "ngraph_bridge/ngraph_timer.h", @@ -89,6 +90,7 @@ cc_library( "ngraph_bridge/ngraph_mark_for_clustering.cc", "ngraph_bridge/ngraph_partial_shapes.cc", "ngraph_bridge/ngraph_pipelined_tensors.cc", + 
"ngraph_bridge/ngraph_register_stub_kernels.cc", "ngraph_bridge/ngraph_rewrite_for_tracking.cc", "ngraph_bridge/ngraph_tensor_manager.cc", "ngraph_bridge/ngraph_tracked_variable.cc", From f6362789a87e3e7357312f7bf8a4bf001096cdd7 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 17:55:39 -0800 Subject: [PATCH 18/22] update comment --- ngraph_bridge/ngraph_register_stub_kernels.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ngraph_bridge/ngraph_register_stub_kernels.cc b/ngraph_bridge/ngraph_register_stub_kernels.cc index 2fd1f1cd5..8f8dde1ca 100644 --- a/ngraph_bridge/ngraph_register_stub_kernels.cc +++ b/ngraph_bridge/ngraph_register_stub_kernels.cc @@ -56,9 +56,10 @@ NGStubOp::~NGStubOp() {} // Register Bfloat Stub Kernels // TF Ops that work on bfloat DataType get assigned Device XLA_CPU -// Since nGraph-bridge OPs work on TF DEVICE_CPU we are registering stub float -// kernels here -// The expectation is when we register the stub kernels for bfloat +// Since nGraph-bridge OPs work on TF DEVICE_CPU we are registering stub +// bfloat16 +// kernels here. The expectation is when we register the stub kernels for +// bfloat16 // TF is going to assign DEVICE_CPU to the respective Ops and we will // encapsulate them // These Stub Kernels/Op will never get called From d2a161fdc6862809dcfc4bccdb66bd770e9bab8b Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 29 Jan 2020 11:04:57 -0800 Subject: [PATCH 19/22] added comments to the test --- test/python/test_bfloat16.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index f8ab24975..de89a27b0 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -47,6 +47,11 @@ def run_test(sess): assert self.with_ngraph(run_test) == self.without_ngraph(run_test) + # For testing, we usually run the same graph on TF by disabling NGraph Rewrites. + # However, in this case as we register CPU bfloat dummy kernels, TF assigns device CPU + # to bfloat ops and hits the asserts in the dummy kernel. + # So, we are testing with expected values. + # For an ideal run on TF, we need to run on vanilla TF w/o importing ngraph-bridge def test_conv2d_bfloat16(self): # Graph input_shape_nhwc = (1, 4, 4, 1) @@ -81,6 +86,11 @@ def run_test(sess): np.array([516, 560, 588, 640, 804, 884, 876, 968]), (1, 2, 2, 2)) assert np.allclose(ng_val, expected_val) + # For testing, we usually run the same graph on TF by disabling NGraph Rewrites. + # However, in this case as we register CPU bfloat dummy kernels, TF assigns device CPU + # to bfloat ops and hits the asserts in the dummy kernel. + # So, we are testing with expected values. 
+ # For an ideal run on TF, we need to run on vanilla TF w/o importing ngraph-bridge def test_conv2d_cast_bfloat16(self): # Graph input_shape_nhwc = (1, 4, 4, 1) From 1e4923c992954d131ce743b53d3348002578cef0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 29 Jan 2020 11:16:30 -0800 Subject: [PATCH 20/22] corrected the macros --- ngraph_bridge/ngraph_register_stub_kernels.cc | 2 +- ngraph_bridge/ngraph_register_stub_kernels.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ngraph_bridge/ngraph_register_stub_kernels.cc b/ngraph_bridge/ngraph_register_stub_kernels.cc index 8f8dde1ca..77d5818c3 100644 --- a/ngraph_bridge/ngraph_register_stub_kernels.cc +++ b/ngraph_bridge/ngraph_register_stub_kernels.cc @@ -65,7 +65,7 @@ NGStubOp::~NGStubOp() {} // These Stub Kernels/Op will never get called // Keep them in alphabetical order -REGISTER_NGRAPH_STUB_KERNEL("Conv2D") +REGISTER_NGRAPH_STUB_BFLOAT_KERNEL("Conv2D") } // namespace ngraph_bridge diff --git a/ngraph_bridge/ngraph_register_stub_kernels.h b/ngraph_bridge/ngraph_register_stub_kernels.h index 12541abb9..543e503d1 100644 --- a/ngraph_bridge/ngraph_register_stub_kernels.h +++ b/ngraph_bridge/ngraph_register_stub_kernels.h @@ -41,14 +41,14 @@ class NGStubOp : public OpKernel { ~NGStubOp() override; }; -#define REGISTER_NGRAPH_STUB_KERNEL(optype) \ +#define REGISTER_NGRAPH_STUB_KERNEL(optype) \ + REGISTER_KERNEL_BUILDER(Name(optype).Device(DEVICE_CPU), NGStubOp); + +#define REGISTER_NGRAPH_STUB_BFLOAT_KERNEL(optype) \ REGISTER_KERNEL_BUILDER( \ Name(optype).Device(DEVICE_CPU).TypeConstraint("T"), \ NGStubOp); -#define REGISTER_NGRAPH_STUB_BFLOAT_KERNEL(optype) \ - REGISTER_KERNEL_BUILDER(Name(optype).Device(DEVICE_CPU), NGStubOp); - } // namespace ngraph_bridge } // namespace tensorflow From 0bb58e0f9204b1867d2ad88b39da5b0982b9869a Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 29 Jan 2020 12:56:33 -0800 Subject: [PATCH 21/22] fix template --- ngraph_bridge/ngraph_utils.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ngraph_bridge/ngraph_utils.cc b/ngraph_bridge/ngraph_utils.cc index 21edd2e97..f2b177267 100644 --- a/ngraph_bridge/ngraph_utils.cc +++ b/ngraph_bridge/ngraph_utils.cc @@ -224,7 +224,9 @@ Status TensorToStream(std::ostream& ostream, const Tensor& tensor) { TensorDataToStream(ostream, n_elements, data); break; case DT_BFLOAT16: - TensorDataToStream(ostream, n_elements, data); + return errors::Internal( + "TensorToStream got data type bfloat16. No compatible standard C++ " + "data type."); break; default: return errors::Internal("TensorToStream got unsupported data type ", From 9fce56ca2c5921040027b6aba61f473b6569ebdd Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 29 Jan 2020 14:00:47 -0800 Subject: [PATCH 22/22] incorporate review comments --- ngraph_bridge/ngraph_register_stub_kernels.cc | 8 +++----- test/python/test_bfloat16.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ngraph_bridge/ngraph_register_stub_kernels.cc b/ngraph_bridge/ngraph_register_stub_kernels.cc index 77d5818c3..aa0dca466 100644 --- a/ngraph_bridge/ngraph_register_stub_kernels.cc +++ b/ngraph_bridge/ngraph_register_stub_kernels.cc @@ -57,11 +57,9 @@ NGStubOp::~NGStubOp() {} // TF Ops that work on bfloat DataType get assigned Device XLA_CPU // Since nGraph-bridge OPs work on TF DEVICE_CPU we are registering stub -// bfloat16 -// kernels here. 
The expectation is when we register the stub kernels for -// bfloat16 -// TF is going to assign DEVICE_CPU to the respective Ops and we will -// encapsulate them +// bfloat16 kernels here. The expectation is when we register the stub kernels +// for bfloat16 TF is going to assign DEVICE_CPU to the respective Ops and +// we will encapsulate them // These Stub Kernels/Op will never get called // Keep them in alphabetical order diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index de89a27b0..61fae7d67 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -1,5 +1,5 @@ # ============================================================================== -# Copyright 2019 Intel Corporation +# Copyright 2019-2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.