
Commit

backend: compiler: retain allocator to prevent early release & enable optional_reshape for int8 pattern
yifeizh2 authored and TaoLv committed Dec 28, 2021
1 parent ef99a0c commit 381a0ac
Showing 3 changed files with 83 additions and 2 deletions.
3 changes: 3 additions & 0 deletions src/backend/graph_compiler/compiler_partition_impl.cpp
@@ -319,6 +319,7 @@ compiler_compiled_partition_impl_t::compiler_compiled_partition_impl_t(
, graph_engine_(graph_engine) {
std::lock_guard<std::mutex> lock(mtx_);
partition_count_map[graph_engine_]++;
graph_engine_->allocator_->retain();
}

compiler_compiled_partition_impl_t::~compiler_compiled_partition_impl_t() {
@@ -330,6 +331,8 @@ compiler_compiled_partition_impl_t::~compiler_compiled_partition_impl_t() {
sc::release_runtime_memory(graph_engine_.get());
}
}
jit_func_ = nullptr;
graph_engine_->allocator_->release();
}

impl::status_t compiler_compiled_partition_impl_t::execute(
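
The change above follows a simple reference-counting discipline: the compiled partition retains the engine's allocator in its constructor and releases it in its destructor, so a user who releases the allocator right after compilation cannot have it freed out from under a live partition. A minimal sketch of that idea is shown below; the class names and the main() driver are hypothetical stand-ins, not the oneDNN Graph API.

// Sketch only: hypothetical types illustrating retain-in-ctor / release-in-dtor.
#include <atomic>
#include <cstddef>
#include <cstdio>
#include <new>

class ref_counted_allocator {
public:
    void retain() { refcount_.fetch_add(1, std::memory_order_relaxed); }
    void release() {
        // Drop one reference; the last release destroys the allocator.
        if (refcount_.fetch_sub(1, std::memory_order_acq_rel) == 1) delete this;
    }
    void *allocate(std::size_t n) { return ::operator new(n); }
    void deallocate(void *p) { ::operator delete(p); }

private:
    ~ref_counted_allocator() { std::puts("allocator destroyed"); }
    std::atomic<int> refcount_ {1}; // the creator holds the first reference
};

class compiled_partition_like {
public:
    explicit compiled_partition_like(ref_counted_allocator *alloc) : alloc_(alloc) {
        alloc_->retain(); // keep the allocator alive for our whole lifetime
    }
    ~compiled_partition_like() {
        alloc_->release(); // may be the last reference; the allocator is freed then
    }
    void run() {
        void *buf = alloc_->allocate(64); // still valid after the user's early release
        alloc_->deallocate(buf);
    }

private:
    ref_counted_allocator *alloc_;
};

int main() {
    auto *alloc = new ref_counted_allocator();
    compiled_partition_like cp(alloc);
    alloc->release(); // user drops its handle early, as in the test below
    cp.run();         // safe: cp still holds a reference
}                     // cp is destroyed here and the allocator is freed
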
17 changes: 15 additions & 2 deletions src/backend/graph_compiler/patterns/mha_pattern.hpp
@@ -256,7 +256,7 @@ COMPILER_BACKEND_REGISTER_PASSES_DEF_BEGIN(int8_mha_pattern)
|
Transpose
|
-   Reshape
+   Reshape (optional)
|
Quantize
|
@@ -320,8 +320,21 @@ COMPILER_BACKEND_REGISTER_TRANSFORMATION_PASS(compiler, int8_mha_pattern)
auto transpose_output = pgraph->append_op(
impl::op_kind::StaticTranspose,
{in_edge(0, matmul_v, 0)}, "transpose_output");

auto optional_reshape_subgraph
= std::make_shared<pb_graph_t>(
"optional_reshape_subgraph");
auto optional_reshape
= optional_reshape_subgraph->append_op(
impl::op_kind::StaticReshape,
"optional_reshape");
optional_reshape_subgraph->create_input_port(
0, optional_reshape, 0);
optional_reshape_subgraph->create_output_port(
0, optional_reshape, 0);

auto reshape_output
-   = pgraph->append_op(impl::op_kind::StaticReshape,
+   = pgraph->append_optional(optional_reshape_subgraph,
{in_edge(0, transpose_output, 0)},
"reshape_output");
pgraph->append_op(impl::op_kind::Quantize,
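
The pattern change above nests the StaticReshape inside its own pb_graph_t and appends that subgraph with append_optional, so the int8 MHA pattern now matches graphs both with and without the trailing reshape. The same idiom can make any single op optional. The sketch below applies it to a hypothetical optional transpose, reusing only the builder calls visible in this diff; it is a fragment, not compilable on its own, and `pgraph`, `producer`, and the surrounding pass registration are assumed from context.

// Sketch only: same builder idiom as above, applied to a hypothetical optional transpose.
auto optional_transpose_subgraph
        = std::make_shared<pb_graph_t>("optional_transpose_subgraph");
auto optional_transpose = optional_transpose_subgraph->append_op(
        impl::op_kind::StaticTranspose, "optional_transpose");
// Expose the wrapped op's first input/output as the subgraph's ports so the
// optional block can be spliced into the outer pattern.
optional_transpose_subgraph->create_input_port(0, optional_transpose, 0);
optional_transpose_subgraph->create_output_port(0, optional_transpose, 0);

// Matches whether or not a StaticTranspose follows `producer` in the target graph.
auto maybe_transpose = pgraph->append_optional(optional_transpose_subgraph,
        {in_edge(0, producer, 0)}, "maybe_transpose");
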
65 changes: 65 additions & 0 deletions tests/cpp/unit/backend/graph_compiler/test_compile_execute.cpp
@@ -354,3 +354,68 @@ TEST(GCGraphTest, Fp32MHACompileExecutionMultiThreading) {
workers[t_num].join();
}
}

// test allocator release before compiled partition destruction
TEST(GCGraphTest, AllocatorEarlyRelease) {
REQUIRE_AVX512();
impl::graph_t agraph;
add_MHA_subgraph(&agraph, false);
agraph.build_graph();

auto &compiler_backend_ptr
= impl::compiler_impl::compiler_backend_t::get_singleton();
compiler_backend_ptr.get_partitions(agraph, impl::partition_policy::fusion);
auto partitions = agraph.get_partitions();
ASSERT_EQ(partitions.size(), 1);

impl::partition_t p;
p.init(partitions[0]);
auto partition_inputs = p.get_inputs();
auto partition_outputs = p.get_outputs();

std::vector<const impl::logical_tensor_t *> inputs;
std::vector<const impl::logical_tensor_t *> outputs;
for (auto &lt : partition_inputs) {
inputs.push_back(&lt);
}
for (auto &lt : partition_outputs) {
outputs.push_back(&lt);
}
impl::compiled_partition_t cp(p);
impl::allocator_t *allocator = impl::allocator_t::create();
impl::engine_t eng(impl::engine_kind::cpu,
0); // create a new engine rather than use the test engine here, to
// avoid releasing the default allocator of the test engine
eng.set_allocator(allocator);
ASSERT_EQ(p.compile(&cp, inputs, outputs, &eng), impl::status::success);

allocator->release(); // release the allocator

std::vector<impl::tensor_t> execution_inputs;
std::vector<impl::tensor_t> execution_outputs;
size_t size = 0;
for (auto &lt : partition_inputs) {
size += compiler_backend_ptr.get_mem_size(lt);
}
for (auto &lt : partition_outputs) {
size += compiler_backend_ptr.get_mem_size(lt);
}
test::vector<char> data(size);

size = 0;
for (auto &lt : partition_inputs) {
impl::tensor_t placeholder(lt, &eng, data.data() + size);
execution_inputs.push_back(placeholder);
size += compiler_backend_ptr.get_mem_size(lt);
}
for (auto &lt : partition_outputs) {
impl::tensor_t placeholder(lt, &eng, data.data() + size);
execution_outputs.push_back(placeholder);
size += compiler_backend_ptr.get_mem_size(lt);
}

impl::stream_t &strm = get_stream();
ASSERT_EQ(cp.execute(&strm, execution_inputs, execution_outputs),
impl::status::success);
strm.wait();
}
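
The essential ordering this test pins down, distilled from the code above (the same calls as in the test; the comments describe the guarantee added by this commit):

impl::allocator_t *allocator = impl::allocator_t::create();
eng.set_allocator(allocator);            // engine refers to the user's allocator
p.compile(&cp, inputs, outputs, &eng);   // compiled partition retains the allocator
allocator->release();                    // user's handle dropped before execution...
cp.execute(&strm, execution_inputs, execution_outputs); // ...but cp's reference keeps it valid
// when cp is destroyed it releases the last reference and the allocator is freed
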
