From 694e63c9de2cba52b5cdff189b3d713c5490a0e0 Mon Sep 17 00:00:00 2001 From: sindhu-nervana Date: Wed, 11 Dec 2019 14:30:34 -0800 Subject: [PATCH 01/22] initial commit --- ngraph_bridge/ngraph_utils.cc | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/ngraph_bridge/ngraph_utils.cc b/ngraph_bridge/ngraph_utils.cc index 686799e94..9c883f515 100644 --- a/ngraph_bridge/ngraph_utils.cc +++ b/ngraph_bridge/ngraph_utils.cc @@ -214,6 +214,9 @@ Status TensorToStream(std::ostream& ostream, const Tensor& tensor) { case DT_BOOL: TensorDataToStream(ostream, n_elements, data); break; + case DT_BFLOAT16: + TensorDataToStream(ostream, n_elements, data); + break; default: return errors::Internal("TensorToStream got unsupported data type ", DataType_Name(tensor.dtype())); @@ -263,6 +266,9 @@ Status TFDataTypeToNGraphElementType(DataType tf_dt, break; case DataType::DT_QINT32: *ng_et = ng::element::i32; + break; + case DataType::DT_BFLOAT16: + *ng_et = ng::element::bf16; break; default: return errors::Unimplemented("Unsupported TensorFlow data type: ", @@ -313,15 +319,16 @@ void print_node_histogram(const std::unordered_map& histogram, const gtl::ArraySlice& NGraphDTypes() { static gtl::ArraySlice result{ - DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, - DT_UINT16, DT_UINT32, DT_UINT64, DT_BOOL, DT_QINT8, DT_QUINT8}; + DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, + DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, + DT_BOOL, DT_QINT8, DT_QUINT8, DT_BFLOAT16}; return result; } const gtl::ArraySlice& NGraphNumericDTypes() { static gtl::ArraySlice result{ - DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, - DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64}; + DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, + DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_BFLOAT16}; return result; } @@ -343,7 +350,7 @@ const gtl::ArraySlice& NGraphSupportedQuantizedDTypes() { } const gtl::ArraySlice& NGraphRealDTypes() { - static gtl::ArraySlice result{DT_FLOAT, DT_DOUBLE}; + static gtl::ArraySlice result{DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}; return result; } From 2ab87c77e0c845df460b7819f4a72cc9fb94099e Mon Sep 17 00:00:00 2001 From: sindhu-nervana Date: Mon, 16 Dec 2019 15:56:22 -0800 Subject: [PATCH 02/22] add bfloat16 test --- test/python/test_bfloat16.py | 46 ++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 test/python/test_bfloat16.py diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py new file mode 100644 index 000000000..82197a479 --- /dev/null +++ b/test/python/test_bfloat16.py @@ -0,0 +1,46 @@ +# ============================================================================== +# Copyright 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================== +"""nGraph TensorFlow bridge bfloat16 matmul operation test + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import pytest +import numpy as np + +import tensorflow as tf +import os + +from common import NgraphTest + + +class TestMatmulBfloat16(NgraphTest): + + def test_matmul_bfloat16(self): + a = tf.placeholder(tf.bfloat16, [2, 3], name='a') + x = tf.placeholder(tf.bfloat16, [3, 4], name='x') + a_inp = np.random.rand(2, 3) + x_inp = np.random.rand(3, 4) + out = tf.matmul(a, x) + + def run_test(sess): + return sess.run((out,), feed_dict={a: a_inp, x: x_inp}) + + # import pdb + # pdb.set_trace() + assert self.with_ngraph(run_test) == self.without_ngraph(run_test) From b267ba1f260ee8bf07ba9273d0a896823d816ced Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 20 Dec 2019 10:29:43 -0800 Subject: [PATCH 03/22] Shrestha/var in compute (#388) - Enabled --var build to use parallel executor integrating weights-on-device and data pipelining - moved ngraph_var files outside the var build --- bazel/BUILD | 2 + ngraph_bridge/CMakeLists.txt | 2 +- .../enable_variable_ops/ngraph_assign_op.cc | 4 +- .../ngraph_enter_in_catalog.cc | 15 +- .../ngraph_rewrite_pass.cc | 8 ++ .../ngraph_tracked_variable.cc | 5 +- .../ngraph_variable_modifiers.cc | 2 +- .../ngraph_variable_update_ng_tensor_op.cc | 3 +- ngraph_bridge/ngraph_encapsulate_impl.cc | 2 +- ngraph_bridge/ngraph_encapsulate_op.cc | 79 +++++++---- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 130 ++++++++++++++++-- ngraph_bridge/ngraph_encapsulate_op_utils.h | 38 ++++- .../ngraph_enter_prefetch_in_catalog.h | 4 +- ngraph_bridge/ngraph_executor.cc | 2 +- ngraph_bridge/ngraph_prefetch_dataset_op.cc | 16 ++- ngraph_bridge/ngraph_tensor_manager.cc | 46 ++++++- ngraph_bridge/ngraph_tensor_manager.h | 2 + ngraph_bridge/ngraph_tracked_variable.cc | 4 +- .../{enable_variable_ops => }/ngraph_var.cc | 2 +- .../{enable_variable_ops => }/ngraph_var.h | 0 .../test_ng_var_update_ng_tensor.cc | 2 +- test/python/test_flib.py | 1 + test/test_ng_var_update_ng_tensor_kernel.cc | 2 +- tools/test_utils.py | 5 +- 24 files changed, 304 insertions(+), 72 deletions(-) rename ngraph_bridge/{enable_variable_ops => }/ngraph_var.cc (98%) rename ngraph_bridge/{enable_variable_ops => }/ngraph_var.h (100%) diff --git a/bazel/BUILD b/bazel/BUILD index 7028b6a95..034ff0dec 100644 --- a/bazel/BUILD +++ b/bazel/BUILD @@ -48,6 +48,7 @@ cc_library( "ngraph_bridge/ngraph_tensor_manager.h", "ngraph_bridge/ngraph_timer.h", "ngraph_bridge/ngraph_utils.h", + "ngraph_bridge/ngraph_var.h", "ngraph_bridge/ngraph_version_utils.h", "ngraph_bridge/tf_deadness_analysis.h", "ngraph_bridge/tf_graphcycles.h", @@ -92,6 +93,7 @@ cc_library( "ngraph_bridge/ngraph_tensor_manager.cc", "ngraph_bridge/ngraph_tracked_variable.cc", "ngraph_bridge/ngraph_utils.cc", + "ngraph_bridge/ngraph_var.cc", "ngraph_bridge/tf_deadness_analysis.cc", "ngraph_bridge/tf_graphcycles.cc", "ngraph_bridge/ops/ngraph_ops.cc", diff --git a/ngraph_bridge/CMakeLists.txt b/ngraph_bridge/CMakeLists.txt index 18d218dad..eb104ae3b 100644 --- a/ngraph_bridge/CMakeLists.txt +++ b/ngraph_bridge/CMakeLists.txt @@ -57,6 +57,7 @@ set(SRC ngraph_rewrite_pass.cc ngraph_tensor_manager.cc ngraph_tracked_variable.cc + ngraph_var.cc ngraph_utils.cc tf_graphcycles.cc tf_deadness_analysis.cc @@ -86,7 +87,6 @@ if(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) list(APPEND SRC 
enable_variable_ops/ngraph_tracked_variable.cc) # new files - list(APPEND SRC enable_variable_ops/ngraph_var.cc) list(APPEND SRC enable_variable_ops/ngraph_assign_op.cc) list(APPEND SRC enable_variable_ops/ngraph_enter_in_catalog.cc) list(APPEND SRC enable_variable_ops/ngraph_remove_ngraphassigns.cc) diff --git a/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc b/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc index b9f041e8b..35099bbc7 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc @@ -25,11 +25,11 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; @@ -83,7 +83,7 @@ class NGraphAssignOp : public OpKernel { void Compute(OpKernelContext* context) override { std::ostringstream oss; - oss << "Execute: Assign_" << my_instance_id << ": " << name(); + oss << "NGAssign::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); NGRAPH_VLOG(4) << "NGraphAssign:: Compute called for: " << def().name() diff --git a/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc b/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc index a456ef6e8..c96a4932e 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc @@ -160,15 +160,12 @@ Status EnterInCatalog(Graph* graph, int graph_id) { } } - // are there indexes that need copy - if (op_index_to_copy.size() > 0) { - try { - NGraphCatalog::AddToEncapOutputCopyIndexesMap(graph_id, node->name(), - op_index_to_copy); - } catch (const std::exception& exp) { - return errors::Internal( - "Caught exception while entering in catalog: ", exp.what(), "\n"); - } + try { + NGraphCatalog::AddToEncapOutputCopyIndexesMap(graph_id, node->name(), + op_index_to_copy); + } catch (const std::exception& exp) { + return errors::Internal("Caught exception while entering in catalog: ", + exp.what(), "\n"); } } // end of node is type NGraphEncapsulate diff --git a/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc b/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc index b764713ab..ea97ff417 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc @@ -30,6 +30,7 @@ #include "ngraph_bridge/ngraph_cluster_manager.h" #include "ngraph_bridge/ngraph_deassign_clusters.h" #include "ngraph_bridge/ngraph_encapsulate_clusters.h" +#include "ngraph_bridge/ngraph_enter_prefetch_in_catalog.h" #include "ngraph_bridge/ngraph_mark_for_clustering.h" #include "ngraph_bridge/ngraph_rewrite_for_tracking.h" #include "ngraph_bridge/ngraph_utils.h" @@ -255,6 +256,13 @@ class NGraphEncapsulationPass : public NGraphRewritePass { "Graph with NGraphAssigns Optimized/Removed"); } + // 8. Enter Prefetch in catalog then. 
+ TF_RETURN_IF_ERROR(EnterPrefetchInCatalog(options.graph->get(), idx)); + if (DumpCatalogedGraphs()) { + DumpGraphs(options, idx, "prefetch-cataloged", + "Graph with Prefetched Inputs Entered in Catalog"); + } + return Status::OK(); } diff --git a/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc b/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc index c034d13c7..8b5b81f68 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc @@ -23,11 +23,11 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; @@ -119,7 +119,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { << " ,backend_name " << ng_backend_name_; std::ostringstream oss; - oss << "NGraphVariable: " << my_instance_id << ": " << name(); + oss << "NGVariable::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); bool log_copies = false; @@ -250,6 +250,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { ctx->record_persistent_memory_allocation(var->tensor()->AllocatedBytes()); } var->Unref(); + event_compute.Stop(); ngraph::Event::write_trace(event_compute); } diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc index 5fc190bea..376a596a9 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc @@ -26,12 +26,12 @@ #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc index fdb432f79..8755f6f76 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc @@ -24,10 +24,10 @@ #include "ngraph/event_tracing.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; @@ -67,6 +67,7 @@ NGraphVariableUpdateNGTensorOp::~NGraphVariableUpdateNGTensorOp() { void NGraphVariableUpdateNGTensorOp::Compute(OpKernelContext* context) { std::ostringstream oss; // Start event tracing + oss << "NGVariableUpdateNGTensor::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); bool log_copies = false; OP_REQUIRES_OK(context, diff --git a/ngraph_bridge/ngraph_encapsulate_impl.cc b/ngraph_bridge/ngraph_encapsulate_impl.cc index 7823f0a7d..f2ddf1ecd 100644 --- a/ngraph_bridge/ngraph_encapsulate_impl.cc +++ 
b/ngraph_bridge/ngraph_encapsulate_impl.cc @@ -45,8 +45,8 @@ #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 9a48d8c92..4605757ae 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -49,9 +49,9 @@ #include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif @@ -88,13 +88,8 @@ NGraphEncapsulateOp::NGraphEncapsulateOp(OpKernelConstruction* ctx) ctx, backend != nullptr, errors::Internal("Cannot get the backend object for BE: ", be_name)); -// If we have the VARIABLE capture on then we can't use the -// parallel executor until that support is added. -#if !defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) + // If backend executable can create tensors we use parallel executor m_use_parallel_executor = backend->executable_can_create_tensors(); -#else - m_use_parallel_executor = false; -#endif // Override the switch for debugging/testing if (std::getenv("NGRAPH_TF_USE_LEGACY_EXECUTOR") != nullptr) { @@ -402,7 +397,7 @@ NGraphEncapsulateOp::~NGraphEncapsulateOp() { // OpKernel::Compute //--------------------------------------------------------------------------- void NGraphEncapsulateOp::Compute(OpKernelContext* ctx) { - ngraph::Event event_compute("Compute", "", ""); + ngraph::Event event_compute("NGEncap::Compute::" + name(), name(), ""); if (m_use_parallel_executor) { NGRAPH_VLOG(1) << "NGraphEncapsulateOp::Compute: Using Parallel Executor"; @@ -459,6 +454,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { m_parallel_executor->GetTensorPipelineDepth())); // Get Tensor Manager and some error checking + ngraph::Event event_prepare_ng_tensors("Prepare NG In/Out Tensors", "", ""); auto tensor_manager = m_parallel_executor->GetTensorManager(); int num_of_inputs = tensor_manager->GetNumberOfInputs(); int num_of_outputs = tensor_manager->GetNumberOfOutputs(); @@ -499,14 +495,18 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); - // All inputs and outputs are pipelined. 
- // Of all these pipelined inputs some are prefetched - // TODO: Fit in variables - ng_inputs = get<1>(pipelined_io_tensors); - ng_outputs = get<2>(pipelined_io_tensors); + // Prepare NG Input Output Tensors + // Assemble Variable tensors and pipelined tensors to ng_input and ng_outputs + OP_REQUIRES_OK(ctx, GetIOTensorsReadyForExecution( + ctx, tensor_manager, get<1>(pipelined_io_tensors), + get<2>(pipelined_io_tensors), ng_inputs, ng_outputs)); + event_prepare_ng_tensors.Stop(); + ngraph::Event::write_trace(event_prepare_ng_tensors); // And execute - ngraph::Event event_execute_graph("Execute Graph", "", ""); + ngraph::Event event_execute_graph( + "Execute Graph Pipeline Indx" + to_string(current_iter_pipeline_depth), + "", ""); BackendManager::LockBackend(m_parallel_executor->GetOpBackendName()); NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute call starting for cluster " @@ -540,12 +540,14 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ngraph::Event::write_trace(event_execute_graph); // Now prepare the output - ngraph::Event event_copy_output_tensor("Copy Output Tensor", "", ""); + // Allocate TF Tensors + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Allocating TF Output Tensors " + << m_parallel_executor->GetNgraphClusterId(); - std::vector> output_copy_events; + ngraph::Event event_prepare_tf_output_tensors("Prepare TF Output Tensor", "", + ""); + vector tf_output_tensors; for (auto i = 0; i < ng_exec->get_results().size(); i++) { - std::unique_ptr event_copy_prep( - new ngraph::Event("Copy Prep", "", "")); auto ng_element = ng_exec->get_results()[i]; auto ng_shape = ng_element->get_shape(); auto ng_element_type = ng_element->get_element_type(); @@ -558,7 +560,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { TensorShape tf_shape(dims); Tensor* tf_output_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(i, tf_shape, &tf_output_tensor)); - + tf_output_tensors.push_back(tf_output_tensor); // Make sure the nGraph-inferred element type agrees with what TensorFlow // expected. 
ng::element::Type expected_elem_type; @@ -569,28 +571,45 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ctx, ng_element_type == expected_elem_type, errors::Internal("Element type inferred by nGraph does not match " "the element type expected by TensorFlow")); - event_copy_prep->Stop(); - output_copy_events.push_back(std::move(event_copy_prep)); + } - // Now copy the nGraph Tensor to Host Tensor - std::unique_ptr event_copy_d2h( - new ngraph::Event("Device to Host Copy", "", "")); - void* dst_ptr = DMAHelper::base(tf_output_tensor); + // Copy Tensors that are required + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Read NG Output Tensors " + << m_parallel_executor->GetNgraphClusterId(); - ng_outputs[i]->read( - dst_ptr, ng_outputs[i]->get_element_count() * ng_element_type.size()); + std::vector> output_copy_events; + + auto output_indexes_to_be_copied = + tensor_manager->GetOutputIndexesThatNeedCopy(); + for (auto output_index : output_indexes_to_be_copied) { + // Copy the nGraph Tensor to Host Tensor + std::unique_ptr event_copy_d2h(new ngraph::Event( + "D2H_Output_" + std::to_string(output_index), "", "")); + void* dst_ptr = (void*)DMAHelper::base(tf_output_tensors[output_index]); + ng_outputs[output_index]->read( + dst_ptr, ng_outputs[output_index]->get_element_count() * + ng_outputs[output_index]->get_element_type().size()); event_copy_d2h->Stop(); output_copy_events.push_back(std::move(event_copy_d2h)); } - for (auto& next : output_copy_events) { ngraph::Event::write_trace(*next.get()); } + event_prepare_tf_output_tensors.Stop(); + ngraph::Event::write_trace(event_prepare_tf_output_tensors); - event_copy_output_tensor.Stop(); - ngraph::Event::write_trace(event_copy_output_tensor); + // Synch Var Output Tensors as required + NGRAPH_VLOG(4) + << "NGraphEncapsulateOp::Compute Sync NG Output Variable Tensors " + << m_parallel_executor->GetNgraphClusterId(); + ngraph::Event event_update_ngvar_tensors("Update NGVar Tensors", "", ""); + OP_REQUIRES_OK(ctx, SyncOutputVarTensors(ctx, tensor_manager)); + event_update_ngvar_tensors.Stop(); + ngraph::Event::write_trace(event_update_ngvar_tensors); // Now return them to the cache + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Returning Tensors " + << m_parallel_executor->GetNgraphClusterId(); ngraph::Event event_return_tensor("Return Tensor", "", ""); pipelined_tensor_store->return_tensors(current_iter_pipeline_depth); diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 51eca36de..d12494e45 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -18,17 +18,22 @@ #include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" + using namespace std; namespace tensorflow { namespace ngraph_bridge { +//--------------------------------------------------------------------------- +// GetPipelinedIOTensorsReadyForExecution +//--------------------------------------------------------------------------- Status GetPipelinedIOTensorsReadyForExecution( - OpKernelContext* ctx, std::vector& tf_input_tensors, - shared_ptr& pipelined_tensor_store, - shared_ptr& tensor_manager, - std::tuple& + OpKernelContext* ctx, const vector& tf_input_tensors, + const shared_ptr& pipelined_tensor_store, + const shared_ptr& tensor_manager, + tuple& pipelined_io_tensors) { auto io_tensors = pipelined_tensor_store->get_tensors(); @@ -84,7 +89,7 @@ Status 
GetPipelinedIOTensorsReadyForExecution( tensor_manager->GetInputIndexesForPrefetchSharedObject()); // Get the set of IO tensors for the next iteration - std::tuple + tuple io_tensors_next_iter; io_tensors_next_iter = pipelined_tensor_store->get_tensors(); @@ -154,18 +159,21 @@ Status GetPipelinedIOTensorsReadyForExecution( // Allocate the input/ ngraph::Event event_copy_input_tensor("Copy Pipelined Input Tensors", "", ""); - + std::vector> input_write_events; if (!skip_tf2ng_copy) { // All pipelined inputs are copied for (auto i = 0; i < pipelined_input_indexes.size(); i++) { int tf_index = pipelined_input_indexes[i]; - ng::element::Type ng_element_type; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType( tf_input_tensors[tf_index].dtype(), &ng_element_type)); void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[tf_index]); + + std::unique_ptr event_copy_h2d( + new ngraph::Event("H2D_Input_" + std::to_string(tf_index), "", "")); + try { ng_pipelined_inputs[i]->write( current_src_ptr, ng_pipelined_inputs[i]->get_element_count() * @@ -176,6 +184,8 @@ Status GetPipelinedIOTensorsReadyForExecution( } catch (...) { return errors::Internal("Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + input_write_events.push_back(std::move(event_copy_h2d)); } } else { // All pipelined inputs that are not prefetched are copied @@ -199,19 +209,27 @@ Status GetPipelinedIOTensorsReadyForExecution( tf_input_tensors[tf_index].dtype(), &ng_element_type)); void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[tf_index]); + unique_ptr event_copy_h2d( + new ngraph::Event("H2D_Input_" + to_string(tf_index), "", "")); try { ng_pipelined_inputs[ng_index]->write( current_src_ptr, ng_pipelined_inputs[ng_index]->get_element_count() * ng_element_type.size()); - } catch (const std::exception& exp) { + } catch (const exception& exp) { return errors::Internal("Error copying TF tensor to device tensor: ", exp.what()); } catch (...) 
{ return errors::Internal("Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + input_write_events.push_back(move(event_copy_h2d)); } } + + for (auto& next : input_write_events) { + ngraph::Event::write_trace(*next.get()); + } event_copy_input_tensor.Stop(); ngraph::Event::write_trace(event_copy_input_tensor); @@ -221,5 +239,101 @@ Status GetPipelinedIOTensorsReadyForExecution( return Status::OK(); } +//--------------------------------------------------------------------------- +// GetTensorFromContext +//--------------------------------------------------------------------------- +Status GetTensorFromContext(const OpKernelContext* ctx, + const string& shared_name, + shared_ptr& ng_tensor) { + // Get shared name from tensor manager + NGraphVar* var; + TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup( + ctx->resource_manager()->default_container(), shared_name, &var)); + ng_tensor = var->ng_tensor(); + var->Unref(); + return Status::OK(); +} + +//--------------------------------------------------------------------------- +// GetIOTensorsReadyForExecution +//--------------------------------------------------------------------------- +Status GetIOTensorsReadyForExecution( + OpKernelContext* ctx, const shared_ptr& tensor_manager, + const PipelinedTensorVector& pipelined_in_tensors, + const PipelinedTensorVector& pipelined_out_tensors, + vector>& ng_inputs, + vector>& ng_outputs) { + // Get Variables that are inputs + auto var_input_indexes = tensor_manager->GetInputIndexesFedByVariables(); + for (int input_index : var_input_indexes) { + string shared_name; + TF_RETURN_IF_ERROR( + tensor_manager->GetInputVariableSharedName(input_index, &shared_name)); + TF_RETURN_IF_ERROR( + GetTensorFromContext(ctx, shared_name, ng_inputs[input_index])); + } + + // Get Variables that are outputs + auto var_output_indexes = + tensor_manager->GetOutputIndexesAssigningVariables(); + for (int output_index : var_output_indexes) { + string shared_name; + TF_RETURN_IF_ERROR(tensor_manager->GetOutputVariableSharedName( + output_index, &shared_name)); + TF_RETURN_IF_ERROR( + GetTensorFromContext(ctx, shared_name, ng_outputs[output_index])); + } + + // Fit Pipelined Input Tensors + auto pipelined_input_indexes = tensor_manager->GetPipelinedInputIndexes(); + for (int i = 0; i < pipelined_input_indexes.size(); i++) { + int input_index = pipelined_input_indexes[i]; + ng_inputs[input_index] = pipelined_in_tensors[i]; + } + + // Fit Pipelined Output Tensors + auto pipelined_output_indexes = tensor_manager->GetPipelinedOutputIndexes(); + for (int i = 0; i < pipelined_output_indexes.size(); i++) { + int output_index = pipelined_output_indexes[i]; + ng_outputs[output_index] = pipelined_out_tensors[i]; + } + + return Status::OK(); +} + +//--------------------------------------------------------------------------- +// SyncOutputVarTensors +//--------------------------------------------------------------------------- +Status SyncOutputVarTensors( + const OpKernelContext* ctx, + const shared_ptr& tensor_manager) { + // Get Variables that are outputs + auto var_output_indexes = + tensor_manager->GetOutputIndexesAssigningVariables(); + NGRAPH_VLOG(4) << "output indexes size " << var_output_indexes.size(); + + for (int output_index : var_output_indexes) { + bool copy_to_tf; + TF_RETURN_IF_ERROR( + tensor_manager->GetOutputVariableCopyToTF(output_index, ©_to_tf)); + + if (copy_to_tf) { + NGRAPH_VLOG(4) << "Sync NG Output Variable Tensors " << output_index; + // Get shared name from tensor manager + string 
shared_name; + TF_RETURN_IF_ERROR(tensor_manager->GetOutputVariableSharedName( + output_index, &shared_name)); + NGraphVar* var; + TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup( + ctx->resource_manager()->default_container(), shared_name, &var)); + // update tensor + var->copy_ng_to_tf(); + var->Unref(); + NGRAPH_VLOG(4) << "Sync Completed " << output_index; + } + } + return Status::OK(); +} + } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.h b/ngraph_bridge/ngraph_encapsulate_op_utils.h index 7f48eb09c..1a6df4ede 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.h +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.h @@ -46,12 +46,44 @@ namespace ngraph_bridge { // Status GetPipelinedIOTensorsReadyForExecution( - OpKernelContext* ctx, vector& tf_input_tensors, - shared_ptr& pipelined_tensor_store, - shared_ptr& tensor_manager, + OpKernelContext* ctx, const vector& tf_input_tensors, + const shared_ptr& pipelined_tensor_store, + const shared_ptr& tensor_manager, tuple& pipelined_io_tensors); +// Assembles the different types of input and output tensors +// Variable tensors and pipelined tensors are put together in the right order +// into ng_inputs and ng_outputs +// 1. For input indexes that are fed by variables, get the variable tensors from +// context +// 2. For output indexes that are updating variables, get the variable tensors +// from context +// This enable update-in-place +// 3. For input and output indexes that are pipelined, get the respective tensor +// +Status GetIOTensorsReadyForExecution( + OpKernelContext* ctx, const shared_ptr& tensor_manager, + const PipelinedTensorVector& pipelined_in_tensors, + const PipelinedTensorVector& pipelined_out_tensors, + vector>& ng_inputs, + vector>& ng_outputs); + +// Gets the Tensor from OpKernelContext's Container for the given shared_name +Status GetTensorFromContext(const OpKernelContext* ctx, + const string& shared_name, + shared_ptr& ng_tensor); + +// Encapsulate Op updates the NGVariable's device tensor in-place +// ie. the NGVariable's backend tensor is updated +// Some of these Variables may be required by the TF ops and they will use the +// host tensor +// These were marked as "copy-to-tf" True in the Rewrite Phase +// We will update these tensors here +Status SyncOutputVarTensors( + const OpKernelContext* ctx, + const shared_ptr& tensor_manager); + } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h index d7ab8cc9c..534166aa1 100644 --- a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h +++ b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ -#ifndef NGRAPH_TF_ENTER_IN_CATALOG_H_ -#define NGRAPH_TF_ENTER_IN_CATALOG_H_ +#ifndef NGRAPH_TF_ENTER_PREFETCH_IN_CATALOG_H_ +#define NGRAPH_TF_ENTER_PREFETCH_IN_CATALOG_H_ #pragma once #include "tensorflow/core/graph/graph.h" diff --git a/ngraph_bridge/ngraph_executor.cc b/ngraph_bridge/ngraph_executor.cc index 37e1b8b40..7d4fe2c2a 100644 --- a/ngraph_bridge/ngraph_executor.cc +++ b/ngraph_bridge/ngraph_executor.cc @@ -43,9 +43,9 @@ #include "ngraph_bridge/ngraph_mark_for_clustering.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif diff --git a/ngraph_bridge/ngraph_prefetch_dataset_op.cc b/ngraph_bridge/ngraph_prefetch_dataset_op.cc index 18b946191..7c131bcce 100644 --- a/ngraph_bridge/ngraph_prefetch_dataset_op.cc +++ b/ngraph_bridge/ngraph_prefetch_dataset_op.cc @@ -415,14 +415,15 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { ngraph_bridge::NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); if (s.ok()) { - ngraph::Event evt_dev_cp("Prf Dev Copy", "Copy", ""); shared_data->SetBufferDepth(m_buffer_size); auto ng_input_tensor_bundle = shared_data->GetNextIOTensorBundleForDeviceTransfer(); auto ng_prefetch_input_indexes_map = shared_data->GetPrefetchInputIndexesMap(); - + ngraph::Event evt_dev_cp( + "Prf Dev Copy: Pipe_Ind_" + to_string(ng_input_tensor_bundle.Id), + "Copy", ""); int number_of_buffer_elements = buffer_element.value.size(); if (number_of_buffer_elements != ng_prefetch_input_indexes_map.size()) { @@ -433,7 +434,8 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { "encap " + to_string(ng_prefetch_input_indexes_map.size())); } - + std::vector> + prefetch_input_write_events; // Write to these tensors for (auto itr : ng_prefetch_input_indexes_map) { int ng_index = itr.first; @@ -445,6 +447,8 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { void* current_src_ptr = (void*)DMAHelper::base(&buffer_element.value[tf_index]); + std::unique_ptr event_copy_h2d(new ngraph::Event( + "H2D_PrefetchInput_" + std::to_string(tf_index), "Copy", "")); try { NGRAPH_VLOG(2) << "[PREFETCH] INPUT tensor being written by Prefetch: " @@ -459,6 +463,12 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { throw std::runtime_error( "Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + prefetch_input_write_events.push_back(std::move(event_copy_h2d)); + } + + for (auto& next : prefetch_input_write_events) { + ngraph::Event::write_trace(*next.get()); } // Now add them back to the other queue diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 518ae96fe..116c213ec 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -42,7 +42,6 @@ NGraphTensorManager::NGraphTensorManager(const string ng_encap_node_name, void NGraphTensorManager::Initialize() { #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) - // input variables book-keeping for (int index = 0; index < m_number_of_inputs; index++) { if (NGraphCatalog::ExistsInInputVariableSharedNameMap( @@ -86,6 +85,17 @@ void NGraphTensorManager::Initialize() { m_output_indexes_that_need_copy.push_back(index); } } + + // For graphs that were run through AOT + // Graph rewrite is 
not done, and there is no entry in catalog + // If there is no entry in catalog all outputs need to be copied + if (!NGraphCatalog::EncapOutputNeedsCopy(m_ng_encap_graph_id, + m_ng_encap_node_name)) { + m_output_indexes_that_need_copy.resize(m_number_of_outputs); + iota(begin(m_output_indexes_that_need_copy), + end(m_output_indexes_that_need_copy), 0); + } + #else m_output_indexes_that_need_copy.resize(m_number_of_outputs); iota(begin(m_output_indexes_that_need_copy), @@ -140,6 +150,40 @@ void NGraphTensorManager::Initialize() { FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); } +//--------------------------------------------------------------------------- +// NGraphTensorManager::Print +//--------------------------------------------------------------------------- +void NGraphTensorManager::Print() { + auto PrintVector = [](const vector& input_vector, const string title) { + cout << title << endl; + cout << ng::join(input_vector) << endl; + }; + + cout << "** NGEncapsulate TensorManager:" << m_ng_encap_node_name << " **" + << endl; + + cout << "** Variables Related **" << endl; + PrintVector(m_input_indexes_from_variables, "Input Indexes from Variables"); + PrintVector(m_output_indexes_assigning_variable, + "Output Indexes Referring to Variables"); + PrintVector(m_output_indexes_that_need_copy, "Output Indexes to be Read"); + + cout << "** Pipelined **" << endl; + PrintVector(m_pipelined_input_indexes, "Pipelined Input Indexes"); + PrintVector(m_pipelined_output_indexes, "Pipelined Output Indexes"); + + cout << "** Prefetched **" << endl; + PrintVector(m_prefetched_input_indexes, "Prefetched Input Indexes"); + PrintVector(m_pipelined_not_prefetched_input_indexes, + "Pipelined But Not Prefetched Input Indexes"); + + cout << "** Prefetched wrt pipelined indexes **" << endl; + PrintVector(m_pipelined_input_indexes_that_are_prefetched, + "Prefetched Input Indexes wrt Pipelined Inputs"); + PrintVector(m_pipelined_input_indexes_that_are_not_prefetched, + "Not Prefetched Input Indexes wrt Pipelined Inputs"); +} + //--------------------------------------------------------------------------- // NGraphTensorManager::~NGraphTensorManager //--------------------------------------------------------------------------- diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index 9143241fb..73f2ca9d4 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -109,6 +109,8 @@ class NGraphTensorManager { Status GetOutputVariableCopyToTF(const int& output_index, bool* output_var_copy_to_tf); + void Print(); + private: void Initialize(); string m_ng_encap_node_name; diff --git a/ngraph_bridge/ngraph_tracked_variable.cc b/ngraph_bridge/ngraph_tracked_variable.cc index bf277b6c1..22b1e584e 100644 --- a/ngraph_bridge/ngraph_tracked_variable.cc +++ b/ngraph_bridge/ngraph_tracked_variable.cc @@ -60,7 +60,6 @@ class NGraphVar : public ResourceBase { private: mutex mu_; Tensor tensor_; - ~NGraphVar() override {} }; @@ -108,7 +107,7 @@ NGraphVariableOp::~NGraphVariableOp() { tracker_->Unref(); } void NGraphVariableOp::Compute(OpKernelContext* ctx) { mutex_lock l(init_mu_); std::ostringstream oss; - oss << "NGraphVariable: " << my_instance_id << ": " << name(); + oss << "NGVariable::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); if (!initialized_) { @@ -182,6 +181,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { ctx->record_persistent_memory_allocation(var->tensor()->AllocatedBytes()); 
} var->Unref(); + event_compute.Stop(); ngraph::Event::write_trace(event_compute); } diff --git a/ngraph_bridge/enable_variable_ops/ngraph_var.cc b/ngraph_bridge/ngraph_var.cc similarity index 98% rename from ngraph_bridge/enable_variable_ops/ngraph_var.cc rename to ngraph_bridge/ngraph_var.cc index efab9e7c0..1fa6001bf 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_var.cc +++ b/ngraph_bridge/ngraph_var.cc @@ -24,10 +24,10 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_var.h b/ngraph_bridge/ngraph_var.h similarity index 100% rename from ngraph_bridge/enable_variable_ops/ngraph_var.h rename to ngraph_bridge/ngraph_var.h diff --git a/test/graph_rewrites/test_ng_var_update_ng_tensor.cc b/test/graph_rewrites/test_ng_var_update_ng_tensor.cc index 0af2c7a57..924c54266 100644 --- a/test/graph_rewrites/test_ng_var_update_ng_tensor.cc +++ b/test/graph_rewrites/test_ng_var_update_ng_tensor.cc @@ -23,10 +23,10 @@ #include "tensorflow/core/platform/test.h" #include "logging/tf_graph_writer.h" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_rewrite_for_tracking.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #include "test/test_utilities.h" namespace tensorflow { diff --git a/test/python/test_flib.py b/test/python/test_flib.py index 079e34449..f0c9b5b59 100644 --- a/test/python/test_flib.py +++ b/test/python/test_flib.py @@ -46,6 +46,7 @@ def test_flib_1(self): res1 = self.with_ngraph(sess_fn) res2 = self.without_ngraph(sess_fn) + exp = [np.full((2, 3), 3.0), np.full((2, 3), 0.95257413)] # Note both run on Host (because NgraphEncapsulate can only run on host) assert np.isclose(res1, res2).all() diff --git a/test/test_ng_var_update_ng_tensor_kernel.cc b/test/test_ng_var_update_ng_tensor_kernel.cc index 51742fcc9..4612d156b 100644 --- a/test/test_ng_var_update_ng_tensor_kernel.cc +++ b/test/test_ng_var_update_ng_tensor_kernel.cc @@ -30,9 +30,9 @@ #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #include "test/test_utilities.h" #include "test/tf_fake_input.h" diff --git a/tools/test_utils.py b/tools/test_utils.py index 12f2ead29..e8f5752d3 100755 --- a/tools/test_utils.py +++ b/tools/test_utils.py @@ -108,7 +108,7 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir = os.path.abspath(build_dir) venv_dir = os.path.abspath(venv_dir) mnist_dir = os.path.abspath(build_dir + '/examples/mnist/') - + axpy_dir = os.path.abspath(build_dir + '/examples/') test_dir = os.path.join(build_dir, "test") test_dir = os.path.join(test_dir, "python") @@ -130,7 +130,8 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir) + " --ignore=" + build_dir + "/test/python/bfloat16" env = os.environ.copy() new_paths = venv_dir + '/bin/python3:' + os.path.abspath( - build_dir) + ":" + os.path.abspath(mnist_dir) + build_dir) + ":" + 
os.path.abspath(axpy_dir) + ":" + os.path.abspath( + mnist_dir) if 'PYTHONPATH' in env: env["PYTHONPATH"] = new_paths + ":" + env["PYTHONPATH"] else: From 3ffb02e0197efd33f68db8ff63aa394d61a883d9 Mon Sep 17 00:00:00 2001 From: sindhu-nervana Date: Fri, 20 Dec 2019 12:06:18 -0800 Subject: [PATCH 04/22] disable the test --- test/python/test_bfloat16.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 82197a479..13ec3e4fb 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -28,9 +28,13 @@ from common import NgraphTest +#This test is just a sample test to test bf16 dtype +#This fails, should enable and expand once CPU backend adds bfloat16 support + class TestMatmulBfloat16(NgraphTest): + @pytest.mark.skip(reason="CPU backend does not support dtype bf16") def test_matmul_bfloat16(self): a = tf.placeholder(tf.bfloat16, [2, 3], name='a') x = tf.placeholder(tf.bfloat16, [3, 4], name='x') @@ -41,6 +45,4 @@ def test_matmul_bfloat16(self): def run_test(sess): return sess.run((out,), feed_dict={a: a_inp, x: x_inp}) - # import pdb - # pdb.set_trace() assert self.with_ngraph(run_test) == self.without_ngraph(run_test) From 367d3db4008d1cfe003c6341e1a2e0f96a3e5218 Mon Sep 17 00:00:00 2001 From: kanvi-nervana Date: Fri, 20 Dec 2019 13:27:28 -0800 Subject: [PATCH 05/22] Kanvi/Add asserts in some python tests (#398) --- test/python/test_sigmoid.py | 3 ++- test/python/test_sign.py | 10 ++++++---- test/python/test_softmax.py | 15 +++++++++------ test/python/test_stack.py | 3 ++- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/test/python/test_sigmoid.py b/test/python/test_sigmoid.py index c5f8c1470..91b90f016 100644 --- a/test/python/test_sigmoid.py +++ b/test/python/test_sigmoid.py @@ -52,4 +52,5 @@ def test_sigmoid(self): y: y_np, z: z_np }) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) diff --git a/test/python/test_sign.py b/test/python/test_sign.py index 781674960..c53ba26bf 100644 --- a/test/python/test_sign.py +++ b/test/python/test_sign.py @@ -35,8 +35,9 @@ def test_sign_1d(self, test_input, expected): val = tf.placeholder(tf.float32, shape=(1,)) out = tf.sign(val) sess_fn = lambda sess: sess.run((out,), feed_dict={val: (test_input,)}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) def test_sign_2d(self): test_input = ((1.5, -2.5, -3.5), (-4.5, 5.5, 0)) @@ -44,5 +45,6 @@ def test_sign_2d(self): val = tf.placeholder(tf.float32, shape=(2, 3)) out = tf.sign(val) sess_fn = lambda sess: sess.run((out,), feed_dict={val: test_input}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) diff --git a/test/python/test_softmax.py b/test/python/test_softmax.py index 87fcb0d84..2a8cbae94 100644 --- a/test/python/test_softmax.py +++ b/test/python/test_softmax.py @@ -43,8 +43,9 @@ def test_softmax_2d(self): expected = a_np a = tf.nn.softmax(x) sess_fn = lambda sess: sess.run((a), feed_dict={x: x_np}) - np.allclose(self.with_ngraph(sess_fn), 
self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) def test_softmax_3d(self): x = tf.placeholder(tf.float32, shape=(2, 3, 2)) @@ -59,8 +60,9 @@ def test_softmax_3d(self): expected = a_np a = tf.nn.softmax(x) sess_fn = lambda sess: sess.run((a), feed_dict={x: x_np}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) def test_softmax_4d(self): x = tf.placeholder(tf.float32, shape=(2, 3, 2, 4)) @@ -75,5 +77,6 @@ def test_softmax_4d(self): expected = a_np a = tf.nn.softmax(x) sess_fn = lambda sess: sess.run((a), feed_dict={x: x_np}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) - np.allclose(self.with_ngraph(sess_fn), expected) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose(self.with_ngraph(sess_fn), expected) diff --git a/test/python/test_stack.py b/test/python/test_stack.py index ee5f0c8e5..b44ae4d34 100644 --- a/test/python/test_stack.py +++ b/test/python/test_stack.py @@ -50,4 +50,5 @@ def test_stack(self, shapes, axis): a = tf.stack(placeholders, axis) sess_fn = lambda sess: sess.run( [a], feed_dict={p: v for p, v in zip(placeholders, values)}) - np.allclose(self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) + assert np.allclose( + self.with_ngraph(sess_fn), self.without_ngraph(sess_fn)) From 4cfb27f680dfb0d64715a66e91f9cb8c06fbe288 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 21 Jan 2020 14:02:08 -0800 Subject: [PATCH 06/22] added test --- test/python/test_bfloat16.py | 75 +++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 13ec3e4fb..6c9562f88 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -31,18 +31,73 @@ #This test is just a sample test to test bf16 dtype #This fails, should enable and expand once CPU backend adds bfloat16 support +np.random.seed(5) -class TestMatmulBfloat16(NgraphTest): +class TestBfloat16(NgraphTest): - @pytest.mark.skip(reason="CPU backend does not support dtype bf16") - def test_matmul_bfloat16(self): - a = tf.placeholder(tf.bfloat16, [2, 3], name='a') - x = tf.placeholder(tf.bfloat16, [3, 4], name='x') - a_inp = np.random.rand(2, 3) - x_inp = np.random.rand(3, 4) - out = tf.matmul(a, x) + def test_conv2d_cast_bfloat16(self): + # inputs + input_shape_nhwc = (32, 28, 28, 3) + filter_shape_hwio = (3, 3, 3, 16) + input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp") + filter_shape_pl = tf.placeholder(tf.float32, filter_shape_hwio, name = "out") + input_values = np.random.rand(*input_shape_nhwc) + filter_values = np.random.rand(*filter_shape_hwio) + + # cast to bloat + input_cast = tf.cast(input_pl, dtype=tf.bfloat16) + filter_cast = tf.cast(filter_values, dtype=tf.bfloat16) + padding = "VALID" + strides = [1, 1, 1, 1] + out = tf.nn.conv2d( + input_cast, + filter_cast, + strides, + padding, + data_format='NHWC', + dilations=None, + name=None) def run_test(sess): - return sess.run((out,), feed_dict={a: a_inp, x: x_inp}) + return sess.run((out,), + feed_dict={ + input_pl: input_values, + filter_shape_pl: filter_values + }) - assert 
self.with_ngraph(run_test) == self.without_ngraph(run_test) + out_val = self.with_ngraph(run_test) + print(out_val) + + def test_conv2d_cast_bfloat16(self): + # inputs + input_shape_nhwc = (32, 28, 28, 3) + filter_shape_hwio = (3, 3, 3, 16) + input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp") + filter_shape_pl = tf.placeholder(tf.float32, filter_shape_hwio, name = "out") + input_values = np.random.rand(*input_shape_nhwc) + filter_values = np.random.rand(*filter_shape_hwio) + + # cast to bloat + input_cast = tf.cast(input_pl, dtype=tf.bfloat16) + filter_cast = tf.cast(filter_values, dtype=tf.bfloat16) + padding = "VALID" + strides = [1, 1, 1, 1] + out = tf.nn.conv2d( + input_cast, + filter_cast, + strides, + padding, + data_format='NHWC', + dilations=None, + name=None) + + def run_test(sess): + return sess.run((out,), + feed_dict={ + input_pl: input_values, + filter_shape_pl: filter_values + }) + + out_val = self.with_ngraph(run_test) + print(out_val) + #assert self.with_ngraph(run_test) == self.without_ngraph(run_test) From 266b24a2bdd4c98a816bd322e91300d81b892f30 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 21 Jan 2020 18:00:50 -0800 Subject: [PATCH 07/22] changes --- ngraph_bridge/ngraph_builder.cc | 1 + ngraph_bridge/ngraph_encapsulate_clusters.cc | 2 +- test/python/test_bfloat16.py | 68 ++++++++------------ 3 files changed, 28 insertions(+), 43 deletions(-) diff --git a/ngraph_bridge/ngraph_builder.cc b/ngraph_bridge/ngraph_builder.cc index 0d7ee3eb6..abbbec659 100644 --- a/ngraph_bridge/ngraph_builder.cc +++ b/ngraph_bridge/ngraph_builder.cc @@ -1012,6 +1012,7 @@ static Status TranslateCastOp(const Node* op, const std::vector&, DataType dtype; TF_RETURN_IF_ERROR(GetNodeAttr(op->attrs(), "DstT", &dtype)); + cout << "data type " << DataType_Name(dtype) << endl; ng::element::Type ng_et; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType(dtype, &ng_et)); diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index d9e506894..6fac8c473 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -380,7 +380,7 @@ Status Encapsulator::AnalysisPass() { << " but another node with assigned device " << it->second << " has already been seen in the same cluster"; - return errors::Internal(ss_err.str()); + // return errors::Internal(ss_err.str()); } } else { NGRAPH_VLOG(3) << "setting cluster " << cluster_idx diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 6c9562f88..e80ab7548 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -33,56 +33,40 @@ np.random.seed(5) + class TestBfloat16(NgraphTest): - def test_conv2d_cast_bfloat16(self): - # inputs - input_shape_nhwc = (32, 28, 28, 3) - filter_shape_hwio = (3, 3, 3, 16) - input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp") - filter_shape_pl = tf.placeholder(tf.float32, filter_shape_hwio, name = "out") - input_values = np.random.rand(*input_shape_nhwc) - filter_values = np.random.rand(*filter_shape_hwio) - - # cast to bloat - input_cast = tf.cast(input_pl, dtype=tf.bfloat16) - filter_cast = tf.cast(filter_values, dtype=tf.bfloat16) - padding = "VALID" - strides = [1, 1, 1, 1] - out = tf.nn.conv2d( - input_cast, - filter_cast, - strides, - padding, - data_format='NHWC', - dilations=None, - name=None) + @pytest.mark.skip(reason="CPU backend does not support dtype bf16") + def test_matmul_bfloat16(self): + a = tf.placeholder(tf.bfloat16, [2, 3], name='a') + 
x = tf.placeholder(tf.bfloat16, [3, 4], name='x') + a_inp = np.random.rand(2, 3) + x_inp = np.random.rand(3, 4) + out = tf.matmul(a, x) def run_test(sess): - return sess.run((out,), - feed_dict={ - input_pl: input_values, - filter_shape_pl: filter_values - }) + return sess.run((out,), feed_dict={a: a_inp, x: x_inp}) - out_val = self.with_ngraph(run_test) - print(out_val) + assert self.with_ngraph(run_test) == self.without_ngraph(run_test) def test_conv2d_cast_bfloat16(self): # inputs - input_shape_nhwc = (32, 28, 28, 3) - filter_shape_hwio = (3, 3, 3, 16) - input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp") - filter_shape_pl = tf.placeholder(tf.float32, filter_shape_hwio, name = "out") - input_values = np.random.rand(*input_shape_nhwc) - filter_values = np.random.rand(*filter_shape_hwio) - + input_shape_nhwc = (1, 8, 8, 1) + filter_shape_hwio = (3, 3, 1, 2) + input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") + filter_shape_pl = tf.placeholder( + tf.float32, filter_shape_hwio, name="filter_pl") + input_values = np.arange(64).reshape( + input_shape_nhwc) #np.random.rand(*input_shape_nhwc) + filter_values = np.arange(18).reshape( + filter_shape_hwio) # np.random.rand(*filter_shape_hwio) + print(filter_values) # cast to bloat input_cast = tf.cast(input_pl, dtype=tf.bfloat16) - filter_cast = tf.cast(filter_values, dtype=tf.bfloat16) + filter_cast = tf.cast(filter_shape_pl, dtype=tf.bfloat16) padding = "VALID" strides = [1, 1, 1, 1] - out = tf.nn.conv2d( + conv_op = tf.nn.conv2d( input_cast, filter_cast, strides, @@ -90,14 +74,14 @@ def test_conv2d_cast_bfloat16(self): data_format='NHWC', dilations=None, name=None) + out = tf.cast(conv_op, dtype=tf.float32) def run_test(sess): - return sess.run((out,), + return sess.run((conv_op,), feed_dict={ input_pl: input_values, filter_shape_pl: filter_values }) - out_val = self.with_ngraph(run_test) - print(out_val) - #assert self.with_ngraph(run_test) == self.without_ngraph(run_test) + assert np.allclose( + self.with_ngraph(run_test), self.without_ngraph(run_test)) From 062a3c3adc93c035ba4dee70d9e9eb4997c9975e Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 24 Jan 2020 11:12:09 -0800 Subject: [PATCH 08/22] added another test --- test/python/test_bfloat16.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index e80ab7548..060ba365e 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -60,7 +60,6 @@ def test_conv2d_cast_bfloat16(self): input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) - print(filter_values) # cast to bloat input_cast = tf.cast(input_pl, dtype=tf.bfloat16) filter_cast = tf.cast(filter_shape_pl, dtype=tf.bfloat16) @@ -85,3 +84,36 @@ def run_test(sess): assert np.allclose( self.with_ngraph(run_test), self.without_ngraph(run_test)) + + def test_conv2d_bfloat16(self): + # inputs + input_shape_nhwc = (1, 8, 8, 1) + filter_shape_hwio = (3, 3, 1, 2) + input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") + filter_shape_pl = tf.placeholder( + tf.bfloat16, filter_shape_hwio, name="filter_pl") + input_values = np.arange(64).reshape( + input_shape_nhwc) #np.random.rand(*input_shape_nhwc) + filter_values = np.arange(18).reshape( + filter_shape_hwio) # np.random.rand(*filter_shape_hwio) + + padding = "VALID" + strides = [1, 1, 1, 1] + conv_op = 
tf.nn.conv2d( + input_pl, + filter_shape_pl, + strides, + padding, + data_format='NHWC', + dilations=None, + name=None) + + def run_test(sess): + return sess.run((conv_op,), + feed_dict={ + input_pl: input_values, + filter_shape_pl: filter_values + }) + + assert np.allclose( + self.with_ngraph(run_test), self.without_ngraph(run_test)) From f00e298e8ce39808148b719380eb18a30a159edd Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 24 Jan 2020 11:44:32 -0800 Subject: [PATCH 09/22] added another bfloat test. encapsulate always assigned device CPU --- ngraph_bridge/ngraph_encapsulate_clusters.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index 6fac8c473..d6a89722e 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -717,7 +717,7 @@ Status Encapsulator::RewritePass( } Status status = nb.Finalize(graph, &n); TF_RETURN_IF_ERROR(status); - n->set_assigned_device_name(device_name_map[cluster_idx]); + n->set_assigned_device_name("/job:localhost/replica:0/task:0/device:CPU:0"); cluster_node_map[cluster_idx] = n; } From 0a4ffdd796cf178909bd23f3f2e4bbf9ce3fc4ac Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 24 Jan 2020 12:38:33 -0800 Subject: [PATCH 10/22] removed couts, rearranged the tests --- ngraph_bridge/ngraph_builder.cc | 1 - test/python/test_bfloat16.py | 38 ++++++++++++++++----------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/ngraph_bridge/ngraph_builder.cc b/ngraph_bridge/ngraph_builder.cc index f6ea2c78e..bfecd7c33 100644 --- a/ngraph_bridge/ngraph_builder.cc +++ b/ngraph_bridge/ngraph_builder.cc @@ -1012,7 +1012,6 @@ static Status TranslateCastOp(const Node* op, const std::vector&, DataType dtype; TF_RETURN_IF_ERROR(GetNodeAttr(op->attrs(), "DstT", &dtype)); - cout << "data type " << DataType_Name(dtype) << endl; ng::element::Type ng_et; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType(dtype, &ng_et)); diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 060ba365e..5cc3b8266 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -28,15 +28,13 @@ from common import NgraphTest -#This test is just a sample test to test bf16 dtype -#This fails, should enable and expand once CPU backend adds bfloat16 support - np.random.seed(5) class TestBfloat16(NgraphTest): - @pytest.mark.skip(reason="CPU backend does not support dtype bf16") + @pytest.mark.skip( + reason="CPU backend does not support dtype bf16 for MatMul/Dot Op") def test_matmul_bfloat16(self): a = tf.placeholder(tf.bfloat16, [2, 3], name='a') x = tf.placeholder(tf.bfloat16, [3, 4], name='x') @@ -49,31 +47,28 @@ def run_test(sess): assert self.with_ngraph(run_test) == self.without_ngraph(run_test) - def test_conv2d_cast_bfloat16(self): + def test_conv2d_bfloat16(self): # inputs input_shape_nhwc = (1, 8, 8, 1) filter_shape_hwio = (3, 3, 1, 2) - input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") + input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") filter_shape_pl = tf.placeholder( - tf.float32, filter_shape_hwio, name="filter_pl") + tf.bfloat16, filter_shape_hwio, name="filter_pl") input_values = np.arange(64).reshape( input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) - # cast to bloat - input_cast = tf.cast(input_pl, dtype=tf.bfloat16) - 
filter_cast = tf.cast(filter_shape_pl, dtype=tf.bfloat16) + padding = "VALID" strides = [1, 1, 1, 1] conv_op = tf.nn.conv2d( - input_cast, - filter_cast, + input_pl, + filter_shape_pl, strides, padding, data_format='NHWC', dilations=None, name=None) - out = tf.cast(conv_op, dtype=tf.float32) def run_test(sess): return sess.run((conv_op,), @@ -85,31 +80,34 @@ def run_test(sess): assert np.allclose( self.with_ngraph(run_test), self.without_ngraph(run_test)) - def test_conv2d_bfloat16(self): + def test_conv2d_cast_bfloat16(self): # inputs input_shape_nhwc = (1, 8, 8, 1) filter_shape_hwio = (3, 3, 1, 2) - input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") + input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") filter_shape_pl = tf.placeholder( - tf.bfloat16, filter_shape_hwio, name="filter_pl") + tf.float32, filter_shape_hwio, name="filter_pl") input_values = np.arange(64).reshape( input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) - + # cast to bloat + input_cast = tf.cast(input_pl, dtype=tf.bfloat16) + filter_cast = tf.cast(filter_shape_pl, dtype=tf.bfloat16) padding = "VALID" strides = [1, 1, 1, 1] conv_op = tf.nn.conv2d( - input_pl, - filter_shape_pl, + input_cast, + filter_cast, strides, padding, data_format='NHWC', dilations=None, name=None) + out = tf.cast(conv_op, dtype=tf.float32) def run_test(sess): - return sess.run((conv_op,), + return sess.run((out,), feed_dict={ input_pl: input_values, filter_shape_pl: filter_values From 80c46f8a365122ba30534e64158beef8da373430 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 24 Jan 2020 17:59:50 -0800 Subject: [PATCH 11/22] device checks --- ngraph_bridge/ngraph_encapsulate_clusters.cc | 72 +++++++++++--------- ngraph_bridge/ngraph_encapsulate_clusters.h | 2 + ngraph_bridge/ngraph_encapsulate_op.cc | 4 ++ 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index d6a89722e..276b7ae35 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -345,22 +345,16 @@ Status PerformAOTOnEncapsulates(Graph* graph, const AOTInfo& aot_info) { Encapsulator::Encapsulator(Graph* g) : graph(g), analysis_done(false), rewrite_done(false) {} -Status Encapsulator::AnalysisPass() { - if (rewrite_done) { - return errors::Internal( - "In Encapsulator, AnalysisPass called after RewritePass was already " - "done"); - } +// Finds the Device and Backend that needs to +// be assigned to each cluster (NGraphEncapsulateOp) +// And stores it into the device_name_map and backend_name_map +Status Encapsulator::AssignClusterDeviceAndBackend() { + string DEVICE_CPU = "CPU"; + string DEVICE_XLA_CPU = "XLA_CPU"; + set allowed_device_types = {DEVICE_CPU, DEVICE_XLA_CPU}; - if (analysis_done) { - return errors::Internal( - "In Encapsulator, AnalysisPass called more than once"); - } - // Pass 1: Populate the cluster-index-to-device name map for each existing - // cluster. 
PIGGYBACKING BACKEND TEST HERE, THEY WILL GET COMBINED INTO ONE for (auto node : graph->op_nodes()) { int cluster_idx; - if (GetNodeCluster(node, &cluster_idx) != Status::OK()) { continue; } @@ -370,33 +364,32 @@ Status Encapsulator::AnalysisPass() { continue; } - auto it = device_name_map.find(cluster_idx); - - if (it != device_name_map.end()) { - if (it->second != node->assigned_device_name()) { - std::stringstream ss_err; - ss_err << "Node " << node->name() << " in cluster " << cluster_idx - << " has assigned device " << node->assigned_device_name() - << " but another node with assigned device " << it->second - << " has already been seen in the same cluster"; - - // return errors::Internal(ss_err.str()); - } + DeviceNameUtils::ParsedName parsed; + if (!DeviceNameUtils::ParseFullName(node->assigned_device_name(), + &parsed)) { + return errors::Internal("Could not parse the device name ", + node->assigned_device_name(), + " assigned to node ", node->name()); } else { - NGRAPH_VLOG(3) << "setting cluster " << cluster_idx - << " requested device to '" << node->assigned_device_name() - << "'"; - device_name_map[cluster_idx] = node->assigned_device_name(); + if (allowed_device_types.find(parsed.type) == + allowed_device_types.end()) { + return errors::Internal("Node ", node->name(), " assigned cluster ", + cluster_idx, " has been assigned device ", + node->assigned_device_name(), + " which is not supported."); + } else { + device_name_map[cluster_idx] = node->assigned_device_name(); + } } + // backend auto itr = backend_name_map.find(cluster_idx); - if (itr != backend_name_map.end()) { if (itr->second != node_backend) { std::stringstream ss_err; ss_err << "Node " << node->name() << " in cluster " << cluster_idx << " has assigned backend " << node_backend - << " but another node with assigned backend " << it->second + << " but another node with assigned backend " << itr->second << " has already been seen in the same cluster"; return errors::Internal(ss_err.str()); @@ -407,6 +400,23 @@ Status Encapsulator::AnalysisPass() { backend_name_map[cluster_idx] = node_backend; } } + return Status::OK(); +} + +Status Encapsulator::AnalysisPass() { + if (rewrite_done) { + return errors::Internal( + "In Encapsulator, AnalysisPass called after RewritePass was already " + "done"); + } + + if (analysis_done) { + return errors::Internal( + "In Encapsulator, AnalysisPass called more than once"); + } + // Pass 1: Populate the cluster-index-to-device name map for each existing + // cluster. PIGGYBACKING BACKEND TEST HERE, THEY WILL GET COMBINED INTO ONE + TF_RETURN_IF_ERROR(AssignClusterDeviceAndBackend()); // Pass 2: Find all nodes that are feeding into/out of each cluster, and // add inputs for them to the corresponding FunctionDef(s). 
diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.h b/ngraph_bridge/ngraph_encapsulate_clusters.h index a4fe2adec..fb0ca0fbd 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.h +++ b/ngraph_bridge/ngraph_encapsulate_clusters.h @@ -116,6 +116,8 @@ class Encapsulator { std::set cluster_indices_for_this_graph; static void AddInput(NodeDef* dst, StringPiece src_name, int src_slot); + + Status AssignClusterDeviceAndBackend(); }; // Translates TF subgraph to ng function then compiles it diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 4605757ae..fe8f7b784 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -28,6 +28,8 @@ #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_constructor.h" +// #include "tensorflow/compiler/tf2xla/xla_op_registry.h" //:DEVICE_XLA_CPU #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" @@ -1044,5 +1046,7 @@ int NGraphEncapsulateImpl::s_instance_count = 0; REGISTER_KERNEL_BUILDER(Name("NGraphEncapsulate").Device(DEVICE_CPU), ngraph_bridge::NGraphEncapsulateOp); +REGISTER_KERNEL_BUILDER(Name("NGraphEncapsulate").Device("XLA_CPU"), + ngraph_bridge::NGraphEncapsulateOp); } // namespace tensorflow From eb145c77046de732c0318d39bf1e1cc7d69f18dd Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 13:34:10 -0800 Subject: [PATCH 12/22] fix by registering dummy bfloat kernel --- .../ngraph_variable_modifiers.cc | 31 ++++++++ ngraph_bridge/ngraph_encapsulate_clusters.cc | 78 ++++++++----------- ngraph_bridge/ngraph_encapsulate_clusters.h | 6 +- ngraph_bridge/ngraph_encapsulate_op.cc | 2 - test/python/test_bfloat16.py | 23 +++--- 5 files changed, 81 insertions(+), 59 deletions(-) diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc index 376a596a9..3998c2e5c 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc @@ -161,6 +161,37 @@ class NGraphAssignAddOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("NGraphAssignAdd").Device(DEVICE_CPU), NGraphAssignAddOp); +/* ------------------------------------------------- +// +// NGraphConv2DOp +// +---------------------------------------------------*/ + +class NGConv2DOp : public OpKernel { + public: + explicit NGConv2DOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES(context, false, + errors::Internal("This constructor should not get called", + name(), "\n")); + } + + void Compute(OpKernelContext* context) override { + OP_REQUIRES( + context, false, + errors::Internal("This kernel should not get called", name(), "\n")); + } + + private: + ~NGConv2DOp() override {} +}; + +// REGISTER_KERNEL_BUILDER(Name("Conv2D").Device(DEVICE_CPU), +// NGraphAssignAddOp); + +REGISTER_KERNEL_BUILDER( + Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), + NGConv2DOp); + } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index 276b7ae35..06d4a23d3 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2017-2019 Intel 
Corporation + * Copyright 2017-2020 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -345,16 +345,22 @@ Status PerformAOTOnEncapsulates(Graph* graph, const AOTInfo& aot_info) { Encapsulator::Encapsulator(Graph* g) : graph(g), analysis_done(false), rewrite_done(false) {} -// Finds the Device and Backend that needs to -// be assigned to each cluster (NGraphEncapsulateOp) -// And stores it into the device_name_map and backend_name_map -Status Encapsulator::AssignClusterDeviceAndBackend() { - string DEVICE_CPU = "CPU"; - string DEVICE_XLA_CPU = "XLA_CPU"; - set allowed_device_types = {DEVICE_CPU, DEVICE_XLA_CPU}; +Status Encapsulator::AnalysisPass() { + if (rewrite_done) { + return errors::Internal( + "In Encapsulator, AnalysisPass called after RewritePass was already " + "done"); + } + if (analysis_done) { + return errors::Internal( + "In Encapsulator, AnalysisPass called more than once"); + } + // Pass 1: Populate the cluster-index-to-device name map for each existing + // cluster. PIGGYBACKING BACKEND TEST HERE, THEY WILL GET COMBINED INTO ONE for (auto node : graph->op_nodes()) { int cluster_idx; + if (GetNodeCluster(node, &cluster_idx) != Status::OK()) { continue; } @@ -364,32 +370,33 @@ Status Encapsulator::AssignClusterDeviceAndBackend() { continue; } - DeviceNameUtils::ParsedName parsed; - if (!DeviceNameUtils::ParseFullName(node->assigned_device_name(), - &parsed)) { - return errors::Internal("Could not parse the device name ", - node->assigned_device_name(), - " assigned to node ", node->name()); - } else { - if (allowed_device_types.find(parsed.type) == - allowed_device_types.end()) { - return errors::Internal("Node ", node->name(), " assigned cluster ", - cluster_idx, " has been assigned device ", - node->assigned_device_name(), - " which is not supported."); - } else { - device_name_map[cluster_idx] = node->assigned_device_name(); + auto it = device_name_map.find(cluster_idx); + + if (it != device_name_map.end()) { + if (it->second != node->assigned_device_name()) { + std::stringstream ss_err; + ss_err << "Node " << node->name() << " in cluster " << cluster_idx + << " has assigned device " << node->assigned_device_name() + << " but another node with assigned device " << it->second + << " has already been seen in the same cluster"; + + return errors::Internal(ss_err.str()); } + } else { + NGRAPH_VLOG(3) << "setting cluster " << cluster_idx + << " requested device to '" << node->assigned_device_name() + << "'"; + device_name_map[cluster_idx] = node->assigned_device_name(); } - // backend auto itr = backend_name_map.find(cluster_idx); + if (itr != backend_name_map.end()) { if (itr->second != node_backend) { std::stringstream ss_err; ss_err << "Node " << node->name() << " in cluster " << cluster_idx << " has assigned backend " << node_backend - << " but another node with assigned backend " << itr->second + << " but another node with assigned backend " << it->second << " has already been seen in the same cluster"; return errors::Internal(ss_err.str()); @@ -400,23 +407,6 @@ Status Encapsulator::AssignClusterDeviceAndBackend() { backend_name_map[cluster_idx] = node_backend; } } - return Status::OK(); -} - -Status Encapsulator::AnalysisPass() { - if (rewrite_done) { - return errors::Internal( - "In Encapsulator, AnalysisPass called after RewritePass was already " - "done"); - } - - if (analysis_done) { - return errors::Internal( - "In Encapsulator, AnalysisPass called more than 
once"); - } - // Pass 1: Populate the cluster-index-to-device name map for each existing - // cluster. PIGGYBACKING BACKEND TEST HERE, THEY WILL GET COMBINED INTO ONE - TF_RETURN_IF_ERROR(AssignClusterDeviceAndBackend()); // Pass 2: Find all nodes that are feeding into/out of each cluster, and // add inputs for them to the corresponding FunctionDef(s). @@ -727,7 +717,7 @@ Status Encapsulator::RewritePass( } Status status = nb.Finalize(graph, &n); TF_RETURN_IF_ERROR(status); - n->set_assigned_device_name("/job:localhost/replica:0/task:0/device:CPU:0"); + n->set_assigned_device_name(device_name_map[cluster_idx]); cluster_node_map[cluster_idx] = n; } @@ -1013,4 +1003,4 @@ Status PerformTranslation(Node* node, const std::map>& } // namespace ngraph_bridge -} // namespace tensorflow +} // namespace tensorflow \ No newline at end of file diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.h b/ngraph_bridge/ngraph_encapsulate_clusters.h index fb0ca0fbd..9628848fc 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.h +++ b/ngraph_bridge/ngraph_encapsulate_clusters.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2017-2019 Intel Corporation + * Copyright 2017-2020 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,8 +116,6 @@ class Encapsulator { std::set cluster_indices_for_this_graph; static void AddInput(NodeDef* dst, StringPiece src_name, int src_slot); - - Status AssignClusterDeviceAndBackend(); }; // Translates TF subgraph to ng function then compiles it @@ -149,4 +147,4 @@ Status PerformTranslation(Node* node, } // namespace ngraph_bridge } // namespace tensorflow -#endif // NGRAPH_TF_BRIDGE_ENCAPSULATE_CLUSTERS_H_ +#endif // NGRAPH_TF_BRIDGE_ENCAPSULATE_CLUSTERS_H_ \ No newline at end of file diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index fe8f7b784..328e6c03a 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -1046,7 +1046,5 @@ int NGraphEncapsulateImpl::s_instance_count = 0; REGISTER_KERNEL_BUILDER(Name("NGraphEncapsulate").Device(DEVICE_CPU), ngraph_bridge::NGraphEncapsulateOp); -REGISTER_KERNEL_BUILDER(Name("NGraphEncapsulate").Device("XLA_CPU"), - ngraph_bridge::NGraphEncapsulateOp); } // namespace tensorflow diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 5cc3b8266..078773782 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -25,7 +25,7 @@ import tensorflow as tf import os - +import sys from common import NgraphTest np.random.seed(5) @@ -48,13 +48,14 @@ def run_test(sess): assert self.with_ngraph(run_test) == self.without_ngraph(run_test) def test_conv2d_bfloat16(self): + # inputs - input_shape_nhwc = (1, 8, 8, 1) + input_shape_nhwc = (1, 4, 4, 1) filter_shape_hwio = (3, 3, 1, 2) input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") filter_shape_pl = tf.placeholder( tf.bfloat16, filter_shape_hwio, name="filter_pl") - input_values = np.arange(64).reshape( + input_values = np.arange(16).reshape( input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) @@ -77,17 +78,19 @@ def run_test(sess): filter_shape_pl: filter_values }) - assert np.allclose( - self.with_ngraph(run_test), self.without_ngraph(run_test)) + ng_val = self.with_ngraph(run_test) + 
expected_val = np.reshape( + np.array([516, 560, 588, 640, 804, 884, 876, 968]), (1, 2, 2, 2)) + assert np.allclose(ng_val, expected_val) def test_conv2d_cast_bfloat16(self): # inputs - input_shape_nhwc = (1, 8, 8, 1) + input_shape_nhwc = (1, 4, 4, 1) filter_shape_hwio = (3, 3, 1, 2) input_pl = tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") filter_shape_pl = tf.placeholder( tf.float32, filter_shape_hwio, name="filter_pl") - input_values = np.arange(64).reshape( + input_values = np.arange(16).reshape( input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) @@ -113,5 +116,7 @@ def run_test(sess): filter_shape_pl: filter_values }) - assert np.allclose( - self.with_ngraph(run_test), self.without_ngraph(run_test)) + ng_val = self.with_ngraph(run_test) + expected_val = np.reshape( + np.array([516, 560, 588, 640, 804, 884, 876, 968]), (1, 2, 2, 2)) + assert np.allclose(ng_val, expected_val) From 5f08083e20ce072b3c74558db57119735257bef0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 13:48:24 -0800 Subject: [PATCH 13/22] hanging include --- ngraph_bridge/ngraph_encapsulate_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index aee7df2a6..977574593 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -29,7 +29,6 @@ #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_constructor.h" -// #include "tensorflow/compiler/tf2xla/xla_op_registry.h" //:DEVICE_XLA_CPU #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" From e50323a2b328127e8a93a33fb5a5339e9f8e158c Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 13:51:37 -0800 Subject: [PATCH 14/22] changes --- ngraph_bridge/ngraph_encapsulate_op.cc | 1 - test/python/test_bfloat16.py | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 977574593..c9e53fac7 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -28,7 +28,6 @@ #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index 078773782..f8ab24975 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -48,8 +48,7 @@ def run_test(sess): assert self.with_ngraph(run_test) == self.without_ngraph(run_test) def test_conv2d_bfloat16(self): - - # inputs + # Graph input_shape_nhwc = (1, 4, 4, 1) filter_shape_hwio = (3, 3, 1, 2) input_pl = tf.placeholder(tf.bfloat16, input_shape_nhwc, name="inp_pl") @@ -59,7 +58,6 @@ def test_conv2d_bfloat16(self): input_shape_nhwc) #np.random.rand(*input_shape_nhwc) filter_values = np.arange(18).reshape( filter_shape_hwio) # np.random.rand(*filter_shape_hwio) - padding = "VALID" strides = [1, 1, 1, 1] conv_op = tf.nn.conv2d( @@ -84,7 +82,7 @@ def run_test(sess): assert np.allclose(ng_val, expected_val) def test_conv2d_cast_bfloat16(self): - # inputs + # Graph input_shape_nhwc = (1, 4, 4, 1) filter_shape_hwio = (3, 3, 1, 2) input_pl = 
tf.placeholder(tf.float32, input_shape_nhwc, name="inp_pl") @@ -107,6 +105,7 @@ def test_conv2d_cast_bfloat16(self): data_format='NHWC', dilations=None, name=None) + # cast to float out = tf.cast(conv_op, dtype=tf.float32) def run_test(sess): From e35892de7187f708c06983499cc8f3fd4839a58f Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 13:53:38 -0800 Subject: [PATCH 15/22] minor --- ngraph_bridge/ngraph_encapsulate_clusters.cc | 2 +- ngraph_bridge/ngraph_encapsulate_clusters.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.cc b/ngraph_bridge/ngraph_encapsulate_clusters.cc index 06d4a23d3..178483883 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.cc +++ b/ngraph_bridge/ngraph_encapsulate_clusters.cc @@ -1003,4 +1003,4 @@ Status PerformTranslation(Node* node, const std::map>& } // namespace ngraph_bridge -} // namespace tensorflow \ No newline at end of file +} // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_clusters.h b/ngraph_bridge/ngraph_encapsulate_clusters.h index 9628848fc..4fb6f00c7 100644 --- a/ngraph_bridge/ngraph_encapsulate_clusters.h +++ b/ngraph_bridge/ngraph_encapsulate_clusters.h @@ -147,4 +147,4 @@ Status PerformTranslation(Node* node, } // namespace ngraph_bridge } // namespace tensorflow -#endif // NGRAPH_TF_BRIDGE_ENCAPSULATE_CLUSTERS_H_ \ No newline at end of file +#endif // NGRAPH_TF_BRIDGE_ENCAPSULATE_CLUSTERS_H_ From a95c92f04d288dbadbe6708145c58e0cf549a0da Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 17:43:37 -0800 Subject: [PATCH 16/22] Register Stub Kernels --- ngraph_bridge/CMakeLists.txt | 1 + .../ngraph_variable_modifiers.cc | 165 ++---------------- ngraph_bridge/ngraph_register_stub_kernels.cc | 71 ++++++++ ngraph_bridge/ngraph_register_stub_kernels.h | 56 ++++++ 4 files changed, 142 insertions(+), 151 deletions(-) create mode 100644 ngraph_bridge/ngraph_register_stub_kernels.cc create mode 100644 ngraph_bridge/ngraph_register_stub_kernels.h diff --git a/ngraph_bridge/CMakeLists.txt b/ngraph_bridge/CMakeLists.txt index 178d09536..ff85b7843 100644 --- a/ngraph_bridge/CMakeLists.txt +++ b/ngraph_bridge/CMakeLists.txt @@ -53,6 +53,7 @@ set(SRC ngraph_freshness_tracker.cc ngraph_mark_for_clustering.cc ngraph_partial_shapes.cc + ngraph_register_stub_kernels.cc ngraph_rewrite_for_tracking.cc ngraph_rewrite_pass.cc ngraph_tensor_manager.cc diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc index fd9e5ad2c..066f8b18b 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc @@ -33,164 +33,27 @@ #include "ngraph_bridge/ngraph_utils.h" #include "ngraph_bridge/ngraph_var.h" +#include "ngraph_bridge/ngraph_register_stub_kernels.h" + using namespace std; namespace ng = ngraph; namespace tensorflow { namespace ngraph_bridge { -/* ------------------------------------------------- -// -// NGraphApplyMomentumOp -// ----------------------------------------------------*/ - -class NGraphApplyMomentumOp : public OpKernel { - private: - public: - explicit NGraphApplyMomentumOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - //--------------------------------------------------------------------------- - // ~NGraphApplyMomentumOp() - 
//--------------------------------------------------------------------------- - ~NGraphApplyMomentumOp() override {} - - // This will never be called - void Compute(OpKernelContext* context) override { - OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } // end of compute function -}; // end of NGraphApplyGradientDescent class definition - -REGISTER_KERNEL_BUILDER(Name("NGraphApplyMomentum").Device(DEVICE_CPU), - NGraphApplyMomentumOp); -/* ------------------------------------------------- -// -// NGraphApplyGradientDescentOp -// ----------------------------------------------------*/ - -class NGraphApplyGradientDescentOp : public OpKernel { - private: - public: - explicit NGraphApplyGradientDescentOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - //--------------------------------------------------------------------------- - // ~NGraphApplyGradientDescentOp() - //--------------------------------------------------------------------------- - ~NGraphApplyGradientDescentOp() override {} - - // This will never be called - void Compute(OpKernelContext* context) override { - OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } // end of compute function -}; // end of NGraphApplyGradientDescent class definition - -REGISTER_KERNEL_BUILDER(Name("NGraphApplyGradientDescent").Device(DEVICE_CPU), - NGraphApplyGradientDescentOp); - -/* ------------------------------------------------- -// -// NGraphAssignSubOp -// ----------------------------------------------------*/ - -// Computes *input[0] = *input[0] - input[1] -class NGraphAssignSubOp : public OpKernel { - private: - // bool use_exclusive_lock_; //TF op has this - ~NGraphAssignSubOp() override {} - - public: - explicit NGraphAssignSubOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - void Compute(OpKernelContext* context) override { - OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } -}; - -REGISTER_KERNEL_BUILDER(Name("NGraphAssignSub").Device(DEVICE_CPU), - NGraphAssignSubOp); - -/* ------------------------------------------------- -// -// NGraphAssignAddOp -// ----------------------------------------------------*/ - -// Computes *input[0] = *input[0] + input[1] -class NGraphAssignAddOp : public OpKernel { - public: - explicit NGraphAssignAddOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - void Compute(OpKernelContext* context) override { - OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } - - private: - ~NGraphAssignAddOp() override {} -}; - -REGISTER_KERNEL_BUILDER(Name("NGraphAssignAdd").Device(DEVICE_CPU), - NGraphAssignAddOp); - -/* ------------------------------------------------- -// -// NGraphConv2DOp -// ----------------------------------------------------*/ - -class NGConv2DOp : public OpKernel { - public: - explicit NGConv2DOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES(context, false, - errors::Internal("This constructor should not get called", - name(), "\n")); - } - - void Compute(OpKernelContext* context) override { - 
OP_REQUIRES( - context, false, - errors::Internal("This kernel should not get called", name(), "\n")); - } - - private: - ~NGConv2DOp() override {} -}; - -// REGISTER_KERNEL_BUILDER(Name("Conv2D").Device(DEVICE_CPU), -// NGraphAssignAddOp); -REGISTER_KERNEL_BUILDER( - Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), - NGConv2DOp); +// Register NGraphOptimizers here +// These Optimizer Ops are replaced by a TF computational subgraph +// in ReplaceModifiers Rewrite Pass. Hence, these Stub Kernels/Op will never get +// called + +// Keep them in alphabetical order +REGISTER_NGRAPH_STUB_KERNEL("NGraphApplyGradientDescent"); +REGISTER_NGRAPH_STUB_KERNEL("NGraphApplyMomentum"); +REGISTER_NGRAPH_STUB_KERNEL( + "NGraphAssignAdd"); //*input[0] = *input[0] + input[1] +REGISTER_NGRAPH_STUB_KERNEL( + "NGraphAssignSub"); //*input[0] = *input[0] - input[1] } // namespace ngraph_bridge diff --git a/ngraph_bridge/ngraph_register_stub_kernels.cc b/ngraph_bridge/ngraph_register_stub_kernels.cc new file mode 100644 index 000000000..2fd1f1cd5 --- /dev/null +++ b/ngraph_bridge/ngraph_register_stub_kernels.cc @@ -0,0 +1,71 @@ +/******************************************************************************* + * Copyright 2019-2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "ngraph_bridge/ngraph_register_stub_kernels.h" + +using namespace std; + +namespace tensorflow { + +namespace ngraph_bridge { + +/* ------------------------------------------------- +// +// NGraphStubOp +// +---------------------------------------------------*/ +// Constructor +NGStubOp::NGStubOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES( + context, false, + errors::Internal("The constructor for OpType ", type_string(), + "should not get called. This Op is expected to have " + "been encapsulated or replaced by other ops. Op Name: ", + name(), "\n")); +} +// Compute +void NGStubOp::Compute(OpKernelContext* context) { + OP_REQUIRES( + context, false, + errors::Internal("This kernel for OpType ", type_string(), + "should not get called. This Op is expected to have " + "been encapsulated or replaced by other ops. 
Op Name: ", + name(), "\n")); +} +// Destructor +NGStubOp::~NGStubOp() {} + +/* ------------------------------------------------- */ + +// Register Bfloat Stub Kernels + +// TF Ops that work on bfloat DataType get assigned Device XLA_CPU +// Since nGraph-bridge OPs work on TF DEVICE_CPU we are registering stub float +// kernels here +// The expectation is when we register the stub kernels for bfloat +// TF is going to assign DEVICE_CPU to the respective Ops and we will +// encapsulate them +// These Stub Kernels/Op will never get called + +// Keep them in alphabetical order +REGISTER_NGRAPH_STUB_KERNEL("Conv2D") + +} // namespace ngraph_bridge + +} // namespace tensorflow diff --git a/ngraph_bridge/ngraph_register_stub_kernels.h b/ngraph_bridge/ngraph_register_stub_kernels.h new file mode 100644 index 000000000..12541abb9 --- /dev/null +++ b/ngraph_bridge/ngraph_register_stub_kernels.h @@ -0,0 +1,56 @@ +/******************************************************************************* + * Copyright 2019-2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ +#ifndef NGRAPH_TF_BRIDGE_REGISTER_STUB_KERNELS_H_ +#define NGRAPH_TF_BRIDGE_REGISTER_STUB_KERNELS_H_ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +using namespace std; + +namespace tensorflow { + +namespace ngraph_bridge { + +/* ------------------------------------------------- +// +// NGStubOp +// +---------------------------------------------------*/ + +class NGStubOp : public OpKernel { + public: + explicit NGStubOp(OpKernelConstruction* context); + + void Compute(OpKernelContext* context) override; + + private: + ~NGStubOp() override; +}; + +#define REGISTER_NGRAPH_STUB_KERNEL(optype) \ + REGISTER_KERNEL_BUILDER( \ + Name(optype).Device(DEVICE_CPU).TypeConstraint("T"), \ + NGStubOp); + +#define REGISTER_NGRAPH_STUB_BFLOAT_KERNEL(optype) \ + REGISTER_KERNEL_BUILDER(Name(optype).Device(DEVICE_CPU), NGStubOp); + +} // namespace ngraph_bridge + +} // namespace tensorflow + +#endif // NGRAPH_TF_BRIDGE_REGISTER_STUB_KERNELS_H_ From 5d313e3b7d713086159b26ef159d344ee9ac2d85 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 17:47:38 -0800 Subject: [PATCH 17/22] fix bazel build --- bazel/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bazel/BUILD b/bazel/BUILD index f0674337a..b9e714a52 100644 --- a/bazel/BUILD +++ b/bazel/BUILD @@ -44,6 +44,7 @@ cc_library( "ngraph_bridge/ngraph_partial_shapes.h", "ngraph_bridge/ngraph_prefetch_shared_data.h", "ngraph_bridge/ngraph_pipelined_tensors.h", + "ngraph_bridge/ngraph_register_stub_kernels.h", "ngraph_bridge/ngraph_rewrite_for_tracking.h", "ngraph_bridge/ngraph_tensor_manager.h", "ngraph_bridge/ngraph_timer.h", @@ -89,6 +90,7 @@ cc_library( "ngraph_bridge/ngraph_mark_for_clustering.cc", "ngraph_bridge/ngraph_partial_shapes.cc", "ngraph_bridge/ngraph_pipelined_tensors.cc", + 
"ngraph_bridge/ngraph_register_stub_kernels.cc", "ngraph_bridge/ngraph_rewrite_for_tracking.cc", "ngraph_bridge/ngraph_tensor_manager.cc", "ngraph_bridge/ngraph_tracked_variable.cc", From f6362789a87e3e7357312f7bf8a4bf001096cdd7 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 28 Jan 2020 17:55:39 -0800 Subject: [PATCH 18/22] update comment --- ngraph_bridge/ngraph_register_stub_kernels.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ngraph_bridge/ngraph_register_stub_kernels.cc b/ngraph_bridge/ngraph_register_stub_kernels.cc index 2fd1f1cd5..8f8dde1ca 100644 --- a/ngraph_bridge/ngraph_register_stub_kernels.cc +++ b/ngraph_bridge/ngraph_register_stub_kernels.cc @@ -56,9 +56,10 @@ NGStubOp::~NGStubOp() {} // Register Bfloat Stub Kernels // TF Ops that work on bfloat DataType get assigned Device XLA_CPU -// Since nGraph-bridge OPs work on TF DEVICE_CPU we are registering stub float -// kernels here -// The expectation is when we register the stub kernels for bfloat +// Since nGraph-bridge OPs work on TF DEVICE_CPU we are registering stub +// bfloat16 +// kernels here. The expectation is when we register the stub kernels for +// bfloat16 // TF is going to assign DEVICE_CPU to the respective Ops and we will // encapsulate them // These Stub Kernels/Op will never get called From d2a161fdc6862809dcfc4bccdb66bd770e9bab8b Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 29 Jan 2020 11:04:57 -0800 Subject: [PATCH 19/22] added comments to the test --- test/python/test_bfloat16.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index f8ab24975..de89a27b0 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -47,6 +47,11 @@ def run_test(sess): assert self.with_ngraph(run_test) == self.without_ngraph(run_test) + # For testing, we usually run the same graph on TF by disabling NGraph Rewrites. + # However, in this case as we register CPU bfloat dummy kernels, TF assigns device CPU + # to bfloat ops and hits the asserts in the dummy kernel. + # So, we are testing with expected values. + # For an ideal run on TF, we need to run on vanilla TF w/o importing ngraph-bridge def test_conv2d_bfloat16(self): # Graph input_shape_nhwc = (1, 4, 4, 1) @@ -81,6 +86,11 @@ def run_test(sess): np.array([516, 560, 588, 640, 804, 884, 876, 968]), (1, 2, 2, 2)) assert np.allclose(ng_val, expected_val) + # For testing, we usually run the same graph on TF by disabling NGraph Rewrites. + # However, in this case as we register CPU bfloat dummy kernels, TF assigns device CPU + # to bfloat ops and hits the asserts in the dummy kernel. + # So, we are testing with expected values. 
+ # For an ideal run on TF, we need to run on vanilla TF w/o importing ngraph-bridge def test_conv2d_cast_bfloat16(self): # Graph input_shape_nhwc = (1, 4, 4, 1) From 1e4923c992954d131ce743b53d3348002578cef0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 29 Jan 2020 11:16:30 -0800 Subject: [PATCH 20/22] corrected the macros --- ngraph_bridge/ngraph_register_stub_kernels.cc | 2 +- ngraph_bridge/ngraph_register_stub_kernels.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ngraph_bridge/ngraph_register_stub_kernels.cc b/ngraph_bridge/ngraph_register_stub_kernels.cc index 8f8dde1ca..77d5818c3 100644 --- a/ngraph_bridge/ngraph_register_stub_kernels.cc +++ b/ngraph_bridge/ngraph_register_stub_kernels.cc @@ -65,7 +65,7 @@ NGStubOp::~NGStubOp() {} // These Stub Kernels/Op will never get called // Keep them in alphabetical order -REGISTER_NGRAPH_STUB_KERNEL("Conv2D") +REGISTER_NGRAPH_STUB_BFLOAT_KERNEL("Conv2D") } // namespace ngraph_bridge diff --git a/ngraph_bridge/ngraph_register_stub_kernels.h b/ngraph_bridge/ngraph_register_stub_kernels.h index 12541abb9..543e503d1 100644 --- a/ngraph_bridge/ngraph_register_stub_kernels.h +++ b/ngraph_bridge/ngraph_register_stub_kernels.h @@ -41,14 +41,14 @@ class NGStubOp : public OpKernel { ~NGStubOp() override; }; -#define REGISTER_NGRAPH_STUB_KERNEL(optype) \ +#define REGISTER_NGRAPH_STUB_KERNEL(optype) \ + REGISTER_KERNEL_BUILDER(Name(optype).Device(DEVICE_CPU), NGStubOp); + +#define REGISTER_NGRAPH_STUB_BFLOAT_KERNEL(optype) \ REGISTER_KERNEL_BUILDER( \ Name(optype).Device(DEVICE_CPU).TypeConstraint("T"), \ NGStubOp); -#define REGISTER_NGRAPH_STUB_BFLOAT_KERNEL(optype) \ - REGISTER_KERNEL_BUILDER(Name(optype).Device(DEVICE_CPU), NGStubOp); - } // namespace ngraph_bridge } // namespace tensorflow From 0bb58e0f9204b1867d2ad88b39da5b0982b9869a Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 29 Jan 2020 12:56:33 -0800 Subject: [PATCH 21/22] fix template --- ngraph_bridge/ngraph_utils.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ngraph_bridge/ngraph_utils.cc b/ngraph_bridge/ngraph_utils.cc index 21edd2e97..f2b177267 100644 --- a/ngraph_bridge/ngraph_utils.cc +++ b/ngraph_bridge/ngraph_utils.cc @@ -224,7 +224,9 @@ Status TensorToStream(std::ostream& ostream, const Tensor& tensor) { TensorDataToStream(ostream, n_elements, data); break; case DT_BFLOAT16: - TensorDataToStream(ostream, n_elements, data); + return errors::Internal( + "TensorToStream got data type bfloat16. No compatible standard C++ " + "data type."); break; default: return errors::Internal("TensorToStream got unsupported data type ", From 9fce56ca2c5921040027b6aba61f473b6569ebdd Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 29 Jan 2020 14:00:47 -0800 Subject: [PATCH 22/22] incorporate review comments --- ngraph_bridge/ngraph_register_stub_kernels.cc | 8 +++----- test/python/test_bfloat16.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ngraph_bridge/ngraph_register_stub_kernels.cc b/ngraph_bridge/ngraph_register_stub_kernels.cc index 77d5818c3..aa0dca466 100644 --- a/ngraph_bridge/ngraph_register_stub_kernels.cc +++ b/ngraph_bridge/ngraph_register_stub_kernels.cc @@ -57,11 +57,9 @@ NGStubOp::~NGStubOp() {} // TF Ops that work on bfloat DataType get assigned Device XLA_CPU // Since nGraph-bridge OPs work on TF DEVICE_CPU we are registering stub -// bfloat16 -// kernels here. 
The expectation is when we register the stub kernels for -// bfloat16 -// TF is going to assign DEVICE_CPU to the respective Ops and we will -// encapsulate them +// bfloat16 kernels here. The expectation is when we register the stub kernels +// for bfloat16 TF is going to assign DEVICE_CPU to the respective Ops and +// we will encapsulate them // These Stub Kernels/Op will never get called // Keep them in alphabetical order diff --git a/test/python/test_bfloat16.py b/test/python/test_bfloat16.py index de89a27b0..61fae7d67 100644 --- a/test/python/test_bfloat16.py +++ b/test/python/test_bfloat16.py @@ -1,5 +1,5 @@ # ============================================================================== -# Copyright 2019 Intel Corporation +# Copyright 2019-2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.