Skip to content

Commit

Permalink
Merge branch 'leca/memcpyToHost' of https://github.com/microsoft/onnx…
Browse files Browse the repository at this point in the history
…runtime into leca/memcpyToHost
  • Loading branch information
Lei Cao committed Feb 11, 2023
2 parents 6ceb297 + 7177c26 commit 049ae80
Showing 1 changed file with 4 additions and 8 deletions.
12 changes: 4 additions & 8 deletions onnxruntime/core/framework/allocation_planner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -572,10 +572,6 @@ class PlannerImpl {
}

Status ComputeReuseCount() {
// Note: for every ml-value, its definition must appear before all its uses in a topological sort of a valid model
using GraphInputsSet = InlinedHashSet<std::string_view>;
const auto& graph_inputs_nodes = graph_viewer_.GetInputsIncludingInitializers();

for (auto graph_input : graph_viewer_.GetInputs()) {
OrtValueIndex index = Index(graph_input->Name());
UseCount(index)++; // Models caller's usage post-inference; ensures it will not be reused.
Expand Down Expand Up @@ -1803,7 +1799,7 @@ class PlannerImpl {
for (size_t i = 0; i < num_logic_streams_; ++i) {
for (auto node_index : stream_nodes_[i]) {
auto* node = graph_viewer_.GetNode(node_index);
// Neither trigger ActivateNotification/WaitOnEPStep for Shape op (whose output is ready for all the EPs), nor
// Neither trigger ActivateNotification/WaitOnEPStep for Shape op (whose output is ready for all the EPs), nor
// upstream is on CPU device (As currently we never invoke RegisterWaitFn(CPU, ...) for all kinds of EP, thus no wait_handle can be retrieved for this case)
if (node->OpType() != "Shape" && execution_plan[i]->device_.Type() != OrtDevice::CPU) {
for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
Expand All @@ -1817,7 +1813,7 @@ class PlannerImpl {
// 2. the consumer is in the same stream(non-cpu device), but it consumes a CPU tensor from a non-shape op.
// for example, a resize cuda kernel consumes a tensor from MemCpyToHost cuda kernel on the same stream.
// in this case, the FIFO can't guarantee the cpu tensor is ready when resize kernel is launching
// TODO(leca): After we separate MemcpyToHost to an extra stream, by default there shouldn't be the case that
// TODO(leca): After we separate MemcpyToHost to an extra stream, by default there shouldn't be the case that
// producer and consumer are both in the same CUDA stream and producer has a CPU output consumed by consumer.
// The only possible way is user explicitly creates this case in the customized partition JSON file (see PlannerTest.MultiStreamCudaEPNodeCPUOutput)
OrtDevice::DeviceType output_arg_device = plan_.allocation_plan[output_arg_idx].location.device.Type();
Expand Down Expand Up @@ -1851,7 +1847,7 @@ class PlannerImpl {
ORT_ENFORCE(execution_plan[node_stream_map_[node_index]]->device_.Type() == node_device_mem_location.device.Type());
}
}

// 4. add commands to logic queue
for (size_t i = 0; i < num_logic_streams_; ++i) {
for (size_t j = 0; j < stream_nodes_[i].size(); ++j) {
Expand Down Expand Up @@ -2387,4 +2383,4 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con

#endif

} // namespace onnxruntime
} // namespace onnxruntime

0 comments on commit 049ae80

Please sign in to comment.