diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc
index 1f127602e1e..c99a7851aa8 100644
--- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc
+++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc
@@ -54,6 +54,7 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice(SSAGraph* graph) {
       "scale",
       "__xpu__resnet50",
       "softmax",
+      "select_input",
   };
 
   auto insert_invalid_op_nodes_for_specific_target =
diff --git a/lite/kernels/host/cast_compute.cc b/lite/kernels/host/cast_compute.cc
index 871a665c614..e21ffa72f61 100644
--- a/lite/kernels/host/cast_compute.cc
+++ b/lite/kernels/host/cast_compute.cc
@@ -136,6 +136,11 @@ void CastCompute::Run() {
     const int32_t* x_data_end = x_data_begin + param.X->numel();
     bool* out_data = param.Out->mutable_data<bool>();
     std::transform(x_data_begin, x_data_end, out_data, TransOp<int32_t, bool>);
+  } else if (param.in_dtype == 3 && param.out_dtype == 0) {  // INT64 -> bool
+    const int64_t* x_data_begin = param.X->data<int64_t>();
+    const int64_t* x_data_end = x_data_begin + param.X->numel();
+    bool* out_data = param.Out->mutable_data<bool>();
+    std::transform(x_data_begin, x_data_end, out_data, TransOp<int64_t, bool>);
   } else if (param.in_dtype == 2 && param.out_dtype == 2) {  // INT32 -> INT32
     const int32_t* x_data_begin = param.X->data<int32_t>();
     const int32_t* x_data_end = x_data_begin + param.X->numel();
diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt
index dfdc738ac87..29dc37e90fc 100644
--- a/lite/kernels/xpu/CMakeLists.txt
+++ b/lite/kernels/xpu/CMakeLists.txt
@@ -64,6 +64,7 @@ add_kernel(where_compute_xpu XPU extra SRCS where_compute.cc)
 add_kernel(gather_nd_compute_xpu XPU extra SRCS gather_nd_compute.cc)
 add_kernel(meshgrid_compute_xpu XPU basic SRCS meshgrid_compute.cc)
 add_kernel(fetch_compute_xpu XPU basic SRCS fetch_compute.cc)
+add_kernel(unbind_compute_xpu XPU basic SRCS unbind_compute.cc)
 
 # extra
 add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc)
diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc
index d3ef3715e7f..cc684111edc 100644
--- a/lite/kernels/xpu/activation_compute.cc
+++ b/lite/kernels/xpu/activation_compute.cc
@@ -75,10 +75,10 @@ void SigmoidCompute<T, PType>::Run() {
   auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<XPUContext>();
 
-  int r = xdnn::fast_sigmoid(ctx.GetRawContext(),
-                             param.X->template data<T>(),
-                             param.Out->template mutable_data<T>(TARGET(kXPU)),
-                             param.X->numel());
+  int r = xdnn::sigmoid(ctx.GetRawContext(),
+                        param.X->template data<T>(),
+                        param.Out->template mutable_data<T>(TARGET(kXPU)),
+                        param.X->numel());
   CHECK_EQ(r, 0);
 }
 
diff --git a/lite/kernels/xpu/compare_compute.cc b/lite/kernels/xpu/compare_compute.cc
index d6744c82654..51450596a78 100644
--- a/lite/kernels/xpu/compare_compute.cc
+++ b/lite/kernels/xpu/compare_compute.cc
@@ -82,6 +82,14 @@ void CompareCompute<PType, T, Functor>::Run() {
   const auto* y = param.Y->template data<T>();
   auto& ctx = this->ctx_->template As<XPUContext>();
 
+
+  if (x_dims.size() == 0) {
+    x_dims.ConstructFrom({1});
+  }
+  if (y_dims.size() == 0) {
+    y_dims.ConstructFrom({1});
+  }
+
   Functor comp_func;
   std::vector<int> xshape;
   std::vector<int> yshape;
diff --git a/lite/kernels/xpu/linspace_compute.cc b/lite/kernels/xpu/linspace_compute.cc
index 829b1cf50d9..e0aa23f2133 100644
--- a/lite/kernels/xpu/linspace_compute.cc
+++ b/lite/kernels/xpu/linspace_compute.cc
@@ -21,6 +21,22 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
+template <typename T>
+T GetValueOfExpectedType(const lite::Tensor* x) {
+  switch (x->precision()) {
+    case PRECISION(kFloat):
+      return static_cast<T>(x->template data<float>()[0]);
+    case PRECISION(kInt32):
+      return static_cast<T>(x->template data<int32_t>()[0]);
+    case PRECISION(kInt64):
+      return static_cast<T>(x->template data<int64_t>()[0]);
+    default:
+      LOG(FATAL) << "data type is not supported: "
+                 << lite_api::PrecisionToStr(x->precision());
+      return static_cast<T>(0);
+  }
+}
+
 template <typename T, PrecisionType PType>
 void LinspaceCompute<T, PType>::Run() {
   auto& param = this->template Param<param_t>();
@@ -31,20 +47,31 @@ void LinspaceCompute<T, PType>::Run() {
   auto* out_tensor = param.Out;
   int64_t num = static_cast<int64_t>(num_tensor->template data<int>()[0]);
   int r = -1;
+
+  T start_val = GetValueOfExpectedType<T>(start_tensor);
+  T stop_val = GetValueOfExpectedType<T>(stop_tensor);
   switch (param.Out->precision()) {
     case PRECISION(kFloat):
       r = xdnn::linspace<float>(ctx.GetRawContext(),
                                 out_tensor->template mutable_data<float>(TARGET(kXPU)),
-                                static_cast<float>(start_tensor->template data<float>()[0]),
-                                static_cast<float>(stop_tensor->template data<float>()[0]),
+                                start_val,
+                                stop_val,
                                 num);
       CHECK_EQ(r, 0);
       break;
     case PRECISION(kInt32):
       r = xdnn::linspace<int>(ctx.GetRawContext(),
                               out_tensor->template mutable_data<int>(TARGET(kXPU)),
-                              static_cast<int>(start_tensor->template data<int>()[0]),
-                              static_cast<int>(stop_tensor->template data<int>()[0]),
+                              start_val,
+                              stop_val,
+                              num);
+      CHECK_EQ(r, 0);
+      break;
+    case PRECISION(kInt64):
+      r = xdnn::linspace<int64_t>(ctx.GetRawContext(),
+                                  out_tensor->template mutable_data<int64_t>(TARGET(kXPU)),
+                                  start_val,
+                                  stop_val,
                               num);
       CHECK_EQ(r, 0);
       break;
diff --git a/lite/kernels/xpu/scale_compute.cc b/lite/kernels/xpu/scale_compute.cc
index 32b7f298a7e..e81718ef3f9 100644
--- a/lite/kernels/xpu/scale_compute.cc
+++ b/lite/kernels/xpu/scale_compute.cc
@@ -21,8 +21,8 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-template <typename T>
-void ScaleCompute<T>::Run() {
+template <typename T, PrecisionType PType>
+void ScaleCompute<T, PType>::Run() {
   auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<XPUContext>();
 
@@ -52,32 +52,30 @@ void ScaleCompute<T, PType>::Run() {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_KERNEL(scale,
-                     kXPU,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::xpu::ScaleCompute<float>,
-                     def)
+using XPUScale_FP32 =
+    paddle::lite::kernels::xpu::ScaleCompute<float, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(scale, kXPU, kFloat, kNCHW, XPUScale_FP32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
     .Finalize();
 
-REGISTER_LITE_KERNEL(scale,
-                     kXPU,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::xpu::ScaleCompute<int>,
-                     int32)
+using XPUScale_FP16 =
+    paddle::lite::kernels::xpu::ScaleCompute<float16, PRECISION(kFP16)>;
+REGISTER_LITE_KERNEL(scale, kXPU, kFP16, kNCHW, XPUScale_FP16, fp16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+using XPUScale_Int32 =
+    paddle::lite::kernels::xpu::ScaleCompute<int, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(scale, kXPU, kFloat, kNCHW, XPUScale_Int32, int32)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .Finalize();
 
-REGISTER_LITE_KERNEL(scale,
-                     kXPU,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::xpu::ScaleCompute<int64_t>,
-                     int64)
+using XPUScale_Int64 =
+    paddle::lite::kernels::xpu::ScaleCompute<int64_t, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(scale, kXPU, kFloat, kNCHW, XPUScale_Int64, int64)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
     .Finalize();
diff --git a/lite/kernels/xpu/scale_compute.h b/lite/kernels/xpu/scale_compute.h
index bf92b943033..01489a55ba3 100644
--- a/lite/kernels/xpu/scale_compute.h
+++ b/lite/kernels/xpu/scale_compute.h
@@ -21,8 +21,8 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-template <typename T>
-class ScaleCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+template <typename T, PrecisionType PType>
+class ScaleCompute : public KernelLite<TARGET(kXPU), PType> {
  public:
   using param_t = operators::ScaleParam;
 
diff --git a/lite/kernels/xpu/set_value_compute.cc b/lite/kernels/xpu/set_value_compute.cc
index c3b68ac1283..60c29524304 100644
--- a/lite/kernels/xpu/set_value_compute.cc
+++ b/lite/kernels/xpu/set_value_compute.cc
@@ -82,8 +82,8 @@ void SetValueCompute::SetValue(const std::vector<int64_t>& starts,
                          __ends__,                          \
                          __steps__,                         \
                          param.axes,                        \
-                         {},                                \
-                         {});                               \
+                         param.decrease_axes,               \
+                         param.none_axes);                  \
     CHECK_EQ(r, 0);                                         \
     return;                                                 \
   }
diff --git a/lite/kernels/xpu/unbind_compute.cc b/lite/kernels/xpu/unbind_compute.cc
new file mode 100644
index 00000000000..c36e899e4b9
--- /dev/null
+++ b/lite/kernels/xpu/unbind_compute.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/unbind_compute.h"
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template <typename T, PrecisionType PType>
+void UnbindCompute<T, PType>::Run() {
+  auto& param = this->template Param<param_t>();
+  auto& ctx = this->ctx_->template As<XPUContext>();
+  auto x = param.x;
+  auto& axis = param.axis;
+
+  auto output = param.output;
+
+  std::vector<T*> y_ptrs;
+  for (size_t j = 0; j < output.size(); ++j) {
+    y_ptrs.emplace_back(output[j]->template mutable_data<T>(TARGET(kXPU)));
+  }
+  auto x_shape = x->dims().Vectorize();
+  int r = xdnn::unbind(
+      ctx.GetRawContext(), x->template data<T>(), y_ptrs, x_shape, axis);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+using unbind_fp32 =
+    paddle::lite::kernels::xpu::UnbindCompute<float, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(unbind, kXPU, kFloat, kNCHW, unbind_fp32, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
+    .Finalize();
+
+using unbind_int64 =
+    paddle::lite::kernels::xpu::UnbindCompute<int64_t, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(unbind, kXPU, kFloat, kNCHW, unbind_int64, int64)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .Finalize();
diff --git a/lite/kernels/xpu/unbind_compute.h b/lite/kernels/xpu/unbind_compute.h
new file mode 100644
index 00000000000..766f83d386e
--- /dev/null
+++ b/lite/kernels/xpu/unbind_compute.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template <typename T, PrecisionType PType>
+class UnbindCompute : public KernelLite<TARGET(kXPU), PType> {
+ public:
+  using param_t = operators::UnbindParam;
+
+  virtual void Run();
+
+  virtual ~UnbindCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/compare_op.cc b/lite/operators/compare_op.cc
index ac4ae07dfa2..0e33466a580 100644
--- a/lite/operators/compare_op.cc
+++ b/lite/operators/compare_op.cc
@@ -35,7 +35,7 @@ static void GetBroadcastDimsArrays(const DDim &x_dims,
   };
 
   CHECK_GE(axis, 0);
-  CHECK_LT(axis, max_dim);
+  CHECK_LE(axis, max_dim);
   if (x_dims.size() > y_dims.size()) {
     std::fill(y_dims_array, y_dims_array + axis, 1);
     if (axis + y_dims.size() < max_dim) {