From 0bbd3c44620579c156a61a6b23219e9812e8e254 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 14 Feb 2024 13:09:55 -0800 Subject: [PATCH] Fix Dtype --- .../cuda/tensor/resize_antialias_impl.cu | 123 ++++++++++-------- 1 file changed, 66 insertions(+), 57 deletions(-) diff --git a/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu index 53b9c27f3fc2a..f1f1649edb8af 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu @@ -142,7 +142,7 @@ __global__ void ComputeInterpolationAtLevel1( const fast_divmod div_output_width, int32_t window_size, bool use_extrapolation, float extrapolation_value, - const uint8_t* clip8_lookups, + const uint8_t* clip8_table, const int64_t* bound_data, std::tuple outof_bounds_buffers, const AccumType* weight_coefficients, @@ -162,30 +162,30 @@ __global__ void ComputeInterpolationAtLevel1( int output_y, output_x; div_output_width.divmod(id, output_y, output_x); -// if (use_extrapolation) { -// const auto* w_outof_bounds = std::get<1>(outof_bounds_buffers); -// // Extrapolate along the w dimension -// if (w_outof_bounds[static_cast(output_x)] != -1) { -// T* Ydata_offset = Ydata + output_y * output_width; -//#ifdef CPU_TESTING -// assert(w_outof_bounds[static_cast(output_x)] == output_x); -//#endif -// Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); -// return; -// } -// -// // Extrapolate along the y dimension -// const auto* y_outof_bounds = std::get<0>(outof_bounds_buffers); -// if (y_outof_bounds[static_cast(output_y)] != -1) { -//#ifdef CPU_TESTING -// assert(y_outof_bounds[static_cast(output_y)] == output_y); -//#endif -// T* Ydata_offset = Ydata + output_y * output_width; -// Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); -// return; -// } -// /// XXX Add z dimension test -// } + // if (use_extrapolation) { + // const auto* w_outof_bounds = std::get<1>(outof_bounds_buffers); + // // Extrapolate along the w dimension + // if (w_outof_bounds[static_cast(output_x)] != -1) { + // T* Ydata_offset = Ydata + output_y * output_width; + // #ifdef CPU_TESTING + // assert(w_outof_bounds[static_cast(output_x)] == output_x); + // #endif + // Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); + // return; + // } + // + // // Extrapolate along the y dimension + // const auto* y_outof_bounds = std::get<0>(outof_bounds_buffers); + // if (y_outof_bounds[static_cast(output_y)] != -1) { + // #ifdef CPU_TESTING + // assert(y_outof_bounds[static_cast(output_y)] == output_y); + // #endif + // T* Ydata_offset = Ydata + output_y * output_width; + // Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); + // return; + // } + // /// XXX Add z dimension test + // } auto* Ydata_offset = Ydata + output_width * output_y + output_x; const auto* bound = bound_data; @@ -200,13 +200,18 @@ __global__ void ComputeInterpolationAtLevel1( const auto* Xdata_offset = Xdata + input_width * output_y + xmin; for (; xmin < xmax; ++xmin) { - // This cast is needed when we deal with half - output += static_cast((*Xdata_offset++)) * (*weight_coeff++); + if constexpr (std::is_same::value) { + // This cast is needed when we deal with half + output += static_cast((*Xdata_offset++)) * (*weight_coeff++); + } else { + output += (*Xdata_offset++) * (*weight_coeff++); + } } - std::cout << " " << output; + std::cout << output << ' '; if constexpr (onnxruntime::is_8bit_v) { + const uint8_t* clip8_lookups = &clip8_table[640]; *Ydata_offset = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { *Ydata_offset = static_cast(std::round(output)); @@ -225,7 +230,7 @@ __global__ void ComputeInterpolationAtLevel2( const fast_divmod div_output_width, int32_t window_size, bool use_extrapolation, float extrapolation_value, - const uint8_t* clip8_lookups, + const uint8_t* clip8_table, const int64_t* bound_data, std::tuple outof_bounds_buffers, const AccumType* weight_coefficients, @@ -243,29 +248,29 @@ __global__ void ComputeInterpolationAtLevel2( int output_y, output_x; div_output_width.divmod(id, output_y, output_x); -// if (use_extrapolation) { -// const auto* w_outof_bounds = std::get<1>(outof_bounds_buffers); -// // Extrapolate along the w dimension -// if (w_outof_bounds[static_cast(output_x)] != -1) { -// T* Ydata_offset = Ydata + output_y * output_width; -//#ifdef CPU_TESTING -// assert(w_outof_bounds[static_cast(output_x)] == output_x); -//#endif -// Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); -// return; -// } -// -// // Extrapolate along the y dimension -// const auto* y_outof_bounds = std::get<0>(outof_bounds_buffers); -// if (y_outof_bounds[static_cast(output_y)] != -1) { -//#ifdef CPU_TESTING -// assert(y_outof_bounds[static_cast(output_y)] == output_y); -//#endif -// T* Ydata_offset = Ydata + output_y * output_width; -// Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); -// return; -// } -// } + // if (use_extrapolation) { + // const auto* w_outof_bounds = std::get<1>(outof_bounds_buffers); + // // Extrapolate along the w dimension + // if (w_outof_bounds[static_cast(output_x)] != -1) { + // T* Ydata_offset = Ydata + output_y * output_width; + // #ifdef CPU_TESTING + // assert(w_outof_bounds[static_cast(output_x)] == output_x); + // #endif + // Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); + // return; + // } + // + // // Extrapolate along the y dimension + // const auto* y_outof_bounds = std::get<0>(outof_bounds_buffers); + // if (y_outof_bounds[static_cast(output_y)] != -1) { + // #ifdef CPU_TESTING + // assert(y_outof_bounds[static_cast(output_y)] == output_y); + // #endif + // T* Ydata_offset = Ydata + output_y * output_width; + // Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); + // return; + // } + // } auto* Ydata_offset = Ydata + output_width * output_y + output_x; const auto* bound = bound_data; @@ -279,14 +284,19 @@ __global__ void ComputeInterpolationAtLevel2( const auto* Xdata_offset = Xdata + ymin * output_width + output_x; for (; ymin < ymax; ++ymin) { - output += static_cast((*Xdata_offset)) * (*weight_coeff); + if constexpr (std::is_same::value) { + // We cast to AccumType to resolve ambiguous call to operator* for half in CUDA + output += static_cast((*Xdata_offset)) * (*weight_coeff++); + } else { + output += (*Xdata_offset) * (*weight_coeff++); + } Xdata_offset += input_width; - weight_coeff++; } - std::cout << ", " << output; + std::cout << output << ' '; if constexpr (onnxruntime::is_8bit_v) { + const uint8_t* clip8_lookups = &clip8_table[640]; *Ydata_offset = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { *Ydata_offset = static_cast(std::round(output)); @@ -594,7 +604,7 @@ __global__ void _SetupTrilinerarUpsampleFilterAntiAlias( const auto scale = 1.f / inv_scale; const float inv_scale_1 = (scale >= 1.0f) ? 1.0f / scale : 1.0f; - std::cout << "Rscale: " << inv_scale_1 << " Scale: " << scale << " Scaled Support: " << scaled_support << " Window Size: " << window_size + std::cout << "\nRscale: " << inv_scale_1 << " Scale: " << scale << " Scaled Support: " << scaled_support << " Window Size: " << window_size << " Input Size: " << input_size << " Output Size: " << output_size << " Inv Scale: " << inv_scale << " roi_start: " << roi_start << " roi_end " << roi_end; } @@ -855,7 +865,6 @@ void ResizeAntiAliasImpl( std::cout << std::endl; - CUDA_CALL_THROW(cudaMemcpyAsync(output_data, host_output_buffer.get(), N * sizeof(T), cudaMemcpyHostToDevice, stream)); #else