diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc
index 1f127602e1e..c99a7851aa8 100644
--- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc
+++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc
@@ -54,6 +54,7 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice(SSAGraph* graph) {
       "scale",
       "__xpu__resnet50",
       "softmax",
+      "select_input",
   };
 
   auto insert_invalid_op_nodes_for_specific_target =
diff --git a/lite/kernels/host/cast_compute.cc b/lite/kernels/host/cast_compute.cc
index 871a665c614..e21ffa72f61 100644
--- a/lite/kernels/host/cast_compute.cc
+++ b/lite/kernels/host/cast_compute.cc
@@ -136,6 +136,11 @@ void CastCompute::Run() {
     const int32_t* x_data_end = x_data_begin + param.X->numel();
     bool* out_data = param.Out->mutable_data<bool>();
     std::transform(x_data_begin, x_data_end, out_data, TransOp<int32_t, bool>);
+  } else if (param.in_dtype == 3 && param.out_dtype == 0) {  // INT64 -> bool
+    const int64_t* x_data_begin = param.X->data<int64_t>();
+    const int64_t* x_data_end = x_data_begin + param.X->numel();
+    bool* out_data = param.Out->mutable_data<bool>();
+    std::transform(x_data_begin, x_data_end, out_data, TransOp<int64_t, bool>);
   } else if (param.in_dtype == 2 && param.out_dtype == 2) {  // INT32 -> INT32
     const int32_t* x_data_begin = param.X->data<int32_t>();
     const int32_t* x_data_end = x_data_begin + param.X->numel();
diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt
index dfdc738ac87..29dc37e90fc 100644
--- a/lite/kernels/xpu/CMakeLists.txt
+++ b/lite/kernels/xpu/CMakeLists.txt
@@ -64,6 +64,7 @@ add_kernel(where_compute_xpu XPU extra SRCS where_compute.cc)
 add_kernel(gather_nd_compute_xpu XPU extra SRCS gather_nd_compute.cc)
 add_kernel(meshgrid_compute_xpu XPU basic SRCS meshgrid_compute.cc)
 add_kernel(fetch_compute_xpu XPU basic SRCS fetch_compute.cc)
+add_kernel(unbind_compute_xpu XPU basic SRCS unbind_compute.cc)
 
 # extra
 add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc)
diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc
index d3ef3715e7f..cc684111edc 100644
--- a/lite/kernels/xpu/activation_compute.cc
+++ b/lite/kernels/xpu/activation_compute.cc
@@ -75,10 +75,10 @@ void SigmoidCompute<T, PType>::Run() {
   auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<XPUContext>();
 
-  int r = xdnn::fast_sigmoid(ctx.GetRawContext(),
-                             param.X->template data<T>(),
-                             param.Out->template mutable_data<T>(TARGET(kXPU)),
-                             param.X->numel());
+  int r = xdnn::sigmoid(ctx.GetRawContext(),
+                        param.X->template data<T>(),
+                        param.Out->template mutable_data<T>(TARGET(kXPU)),
+                        param.X->numel());
   CHECK_EQ(r, 0);
 }
 
diff --git a/lite/kernels/xpu/compare_compute.cc b/lite/kernels/xpu/compare_compute.cc
index d6744c82654..51450596a78 100644
--- a/lite/kernels/xpu/compare_compute.cc
+++ b/lite/kernels/xpu/compare_compute.cc
@@ -82,6 +82,14 @@ void CompareCompute<PType, T, Functor>::Run() {
   const auto* y = param.Y->template data<T>();
   auto& ctx = this->ctx_->template As<XPUContext>();
 
+
+  if (x_dims.size() == 0) {
+    x_dims.ConstructFrom({1});
+  }
+  if (y_dims.size() == 0) {
+    y_dims.ConstructFrom({1});
+  }
+
   Functor comp_func;
   std::vector<int> xshape;
   std::vector<int> yshape;
diff --git a/lite/kernels/xpu/linspace_compute.cc b/lite/kernels/xpu/linspace_compute.cc
index 829b1cf50d9..e0aa23f2133 100644
--- a/lite/kernels/xpu/linspace_compute.cc
+++ b/lite/kernels/xpu/linspace_compute.cc
@@ -21,6 +21,22 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
+template <typename T>
+T GetValueOfExpectedType(const lite::Tensor* x) {
+  switch (x->precision()) {
+    case PRECISION(kFloat):
+      return static_cast<T>(x->template data<float>()[0]);
+    case PRECISION(kInt32):
+      return static_cast<T>(x->template data<int32_t>()[0]);
+    case PRECISION(kInt64):
+      return static_cast<T>(x->template data<int64_t>()[0]);
+    default:
+      LOG(FATAL) << "data type is not supported: "
+                 << lite_api::PrecisionToStr(x->precision());
+      return static_cast<T>(0);
+  }
+}
+
 template <typename T, PrecisionType PType>
 void LinspaceCompute<T, PType>::Run() {
   auto& param = this->template Param<param_t>();
@@ -31,20 +47,31 @@ void LinspaceCompute<T, PType>::Run() {
   auto* out_tensor = param.Out;
   int64_t num = static_cast<int64_t>(num_tensor->template data<int>()[0]);
   int r = -1;
+
+  T start_val = GetValueOfExpectedType<T>(start_tensor);
+  T stop_val = GetValueOfExpectedType<T>(stop_tensor);
   switch (param.Out->precision()) {
     case PRECISION(kFloat):
       r = xdnn::linspace<float>(ctx.GetRawContext(),
                                 out_tensor->template mutable_data<float>(TARGET(kXPU)),
-                                static_cast<float>(start_tensor->template data<float>()[0]),
-                                static_cast<float>(stop_tensor->template data<float>()[0]),
+                                start_val,
+                                stop_val,
                                 num);
       CHECK_EQ(r, 0);
       break;
     case PRECISION(kInt32):
       r = xdnn::linspace<int>(ctx.GetRawContext(),
                               out_tensor->template mutable_data<int>(TARGET(kXPU)),
-                              static_cast<int>(start_tensor->template data<int>()[0]),
-                              static_cast<int>(stop_tensor->template data<int>()[0]),
+                              start_val,
+                              stop_val,
+                              num);
+      CHECK_EQ(r, 0);
+      break;
+    case PRECISION(kInt64):
+      r = xdnn::linspace<int64_t>(ctx.GetRawContext(),
+                                  out_tensor->template mutable_data<int64_t>(TARGET(kXPU)),
+                                  start_val,
+                                  stop_val,
                               num);
       CHECK_EQ(r, 0);
       break;
diff --git a/lite/kernels/xpu/scale_compute.cc b/lite/kernels/xpu/scale_compute.cc
index 32b7f298a7e..e81718ef3f9 100644
--- a/lite/kernels/xpu/scale_compute.cc
+++ b/lite/kernels/xpu/scale_compute.cc
@@ -21,8 +21,8 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-template <typename T>
-void ScaleCompute<T>::Run() {
+template <typename T, PrecisionType PType>
+void ScaleCompute<T, PType>::Run() {
   auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<XPUContext>();
 
@@ -52,32 +52,30 @@ void ScaleCompute<T, PType>::Run() {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_KERNEL(scale,
-                     kXPU,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::xpu::ScaleCompute<float>,
-                     def)
+using XPUScale_FP32 =
+    paddle::lite::kernels::xpu::ScaleCompute<float, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(scale, kXPU, kFloat, kNCHW, XPUScale_FP32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
     .Finalize();
 
-REGISTER_LITE_KERNEL(scale,
-                     kXPU,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::xpu::ScaleCompute<int>,
-                     int32)
+using XPUScale_FP16 =
+    paddle::lite::kernels::xpu::ScaleCompute<float16, PRECISION(kFP16)>;
+REGISTER_LITE_KERNEL(scale, kXPU, kFP16, kNCHW, XPUScale_FP16, fp16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+using XPUScale_Int32 =
+    paddle::lite::kernels::xpu::ScaleCompute<int, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(scale, kXPU, kFloat, kNCHW, XPUScale_Int32, int32)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .Finalize();
 
-REGISTER_LITE_KERNEL(scale,
-                     kXPU,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::xpu::ScaleCompute<int64_t>,
-                     int64)
+using XPUScale_Int64 =
+    paddle::lite::kernels::xpu::ScaleCompute<int64_t, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(scale, kXPU, kFloat, kNCHW, XPUScale_Int64, int64)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
     .Finalize();
diff --git a/lite/kernels/xpu/scale_compute.h b/lite/kernels/xpu/scale_compute.h
index bf92b943033..01489a55ba3 100644
--- a/lite/kernels/xpu/scale_compute.h
+++ b/lite/kernels/xpu/scale_compute.h
@@ -21,8 +21,8 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-template <typename T>
-class ScaleCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+template <typename T, PrecisionType PType>
+class ScaleCompute : public KernelLite<TARGET(kXPU), PType> {
  public:
   using param_t = operators::ScaleParam;
 
diff --git a/lite/kernels/xpu/set_value_compute.cc b/lite/kernels/xpu/set_value_compute.cc
index c3b68ac1283..60c29524304 100644
--- a/lite/kernels/xpu/set_value_compute.cc
+++ b/lite/kernels/xpu/set_value_compute.cc
@@ -82,8 +82,8 @@ void SetValueCompute::SetValue(const std::vector<int64_t>& starts,
                          __ends__,                          \
                          __steps__,                         \
                          param.axes,                        \
-                         {},                                \
-                         {});                               \
+                         param.decrease_axes,               \
+                         param.none_axes);                  \
     CHECK_EQ(r, 0);                                         \
     return;                                                 \
   }
diff --git a/lite/kernels/xpu/unbind_compute.cc b/lite/kernels/xpu/unbind_compute.cc
new file mode 100644
index 00000000000..c36e899e4b9
--- /dev/null
+++ b/lite/kernels/xpu/unbind_compute.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/unbind_compute.h"
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template <typename T, PrecisionType PType>
+void UnbindCompute<T, PType>::Run() {
+  auto& param = this->template Param<param_t>();
+  auto& ctx = this->ctx_->template As<XPUContext>();
+  auto x = param.x;
+  auto& axis = param.axis;
+
+  auto output = param.output;
+
+  std::vector<T*> y_ptrs;
+  for (size_t j = 0; j < output.size(); ++j) {
+    y_ptrs.emplace_back(output[j]->template mutable_data<T>(TARGET(kXPU)));
+  }
+  auto x_shape = x->dims().Vectorize();
+  int r = xdnn::unbind(
+      ctx.GetRawContext(), x->template data<T>(), y_ptrs, x_shape, axis);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+using unbind_fp32 =
+    paddle::lite::kernels::xpu::UnbindCompute<float, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(unbind, kXPU, kFloat, kNCHW, unbind_fp32, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
+    .Finalize();
+
+using unbind_int64 =
+    paddle::lite::kernels::xpu::UnbindCompute<int64_t, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(unbind, kXPU, kFloat, kNCHW, unbind_int64, int64)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .Finalize();
diff --git a/lite/kernels/xpu/unbind_compute.h b/lite/kernels/xpu/unbind_compute.h
new file mode 100644
index 00000000000..766f83d386e
--- /dev/null
+++ b/lite/kernels/xpu/unbind_compute.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template <typename T, PrecisionType PType>
+class UnbindCompute : public KernelLite<TARGET(kXPU), PType> {
+ public:
+  using param_t = operators::UnbindParam;
+
+  virtual void Run();
+
+  virtual ~UnbindCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/compare_op.cc b/lite/operators/compare_op.cc
index ac4ae07dfa2..0e33466a580 100644
--- a/lite/operators/compare_op.cc
+++ b/lite/operators/compare_op.cc
@@ -35,7 +35,7 @@ static void GetBroadcastDimsArrays(const DDim &x_dims,
   };
 
   CHECK_GE(axis, 0);
-  CHECK_LT(axis, max_dim);
+  CHECK_LE(axis, max_dim);
   if (x_dims.size() > y_dims.size()) {
     std::fill(y_dims_array, y_dims_array + axis, 1);
     if (axis + y_dims.size() < max_dim) {