diff --git a/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h b/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h index 50d45efb1a28f..ec813e001413a 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h +++ b/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h @@ -257,6 +257,8 @@ void ComputeInterpolationAtLevel1(int64_t num_channels, int64_t input_height, in concurrency::ThreadPool* tp) { const uint8_t* clip8_lookups = &p.GetClip8LookupTable()[640]; + std::cout << "L1: "; + concurrency::ThreadPool::TrySimpleParallelFor( tp, narrow(num_channels), [&](std::ptrdiff_t c) { @@ -286,6 +288,8 @@ void ComputeInterpolationAtLevel1(int64_t num_channels, int64_t input_height, in output += (*Xdata_offset++) * (*weight_coeff++); } + std::cout << " " << output; + if constexpr (is_8bit_v) { *Ydata_offset++ = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { @@ -296,6 +300,8 @@ void ComputeInterpolationAtLevel1(int64_t num_channels, int64_t input_height, in } } }); + + std::cout << std::endl; } /** @@ -322,6 +328,8 @@ void ComputeInterpolationAtLevel2(int64_t num_channels, int64_t input_height, in const FilterParamsAntiAlias& p, const FilterParamsBaseAntiAlias& p_dim, concurrency::ThreadPool* tp) { + std::cout << "L2: "; + const uint8_t* clip8_lookups = &p.GetClip8LookupTable()[640]; // This condition is set for higher performance. // Observed that TrySimpleParallelFor in dim num_channels is always have higher efficiency, so I would rather @@ -357,6 +365,9 @@ void ComputeInterpolationAtLevel2(int64_t num_channels, int64_t input_height, in output += *Xdata_offset * (*weight_coeff_start++); Xdata_offset += output_width; } + + std::cout << " " << output; + if constexpr (is_8bit_v) { *Ydata_offset++ = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { @@ -403,6 +414,9 @@ void ComputeInterpolationAtLevel2(int64_t num_channels, int64_t input_height, in output += *Xdata_offset * (*weight_coeff_start++); Xdata_offset += output_width; } + + std::cout << " " << output; + if constexpr (is_8bit_v) { *Ydata_offset++ = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { @@ -414,6 +428,7 @@ void ComputeInterpolationAtLevel2(int64_t num_channels, int64_t input_height, in } }); } + std::cout << std::endl; } template diff --git a/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu index 32370930a5d90..53b9c27f3fc2a 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu @@ -132,267 +132,169 @@ struct AccumTypeCaster { } }; -template -FUNC_DEF void ComputeInterpolationAtLevel1(IdType id, - int64_t input_height, int64_t input_width, - int64_t output_height, int64_t output_width, - const fast_divmod& div_output_width, const fast_divmod& div_output_image, - int64_t window_size, - const uint8_t* clip8_lookups, - const int64_t* bound_data, - const AccumulateType* weight_coefficients, - const InputType* Xdata, InputType* Ydata) { +template +__global__ void ComputeInterpolationAtLevel1( +#ifdef CPU_TESTING + IdType id, +#endif + int64_t input_height, int64_t input_width, + int64_t output_height, int64_t output_width, + const fast_divmod div_output_width, + int32_t window_size, + bool use_extrapolation, float extrapolation_value, + const uint8_t* clip8_lookups, + const int64_t* bound_data, + std::tuple outof_bounds_buffers, + const AccumType* weight_coefficients, + const T* Xdata, T* Ydata, + const int N) { + +#ifndef CPU_TESTING + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); +#endif + // No need to do scale if (output_width == input_width) { Ydata[id] = Xdata[id]; return; } - int bxc, output_image_index; - div_output_image.divmod(id, bxc, output_image_index); - int64_t input_index = bxc * input_height * input_width; - int output_y, output_x; - div_output_width.divmod(output_image_index, output_y, output_x); + div_output_width.divmod(id, output_y, output_x); + +// if (use_extrapolation) { +// const auto* w_outof_bounds = std::get<1>(outof_bounds_buffers); +// // Extrapolate along the w dimension +// if (w_outof_bounds[static_cast(output_x)] != -1) { +// T* Ydata_offset = Ydata + output_y * output_width; +//#ifdef CPU_TESTING +// assert(w_outof_bounds[static_cast(output_x)] == output_x); +//#endif +// Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); +// return; +// } +// +// // Extrapolate along the y dimension +// const auto* y_outof_bounds = std::get<0>(outof_bounds_buffers); +// if (y_outof_bounds[static_cast(output_y)] != -1) { +//#ifdef CPU_TESTING +// assert(y_outof_bounds[static_cast(output_y)] == output_y); +//#endif +// T* Ydata_offset = Ydata + output_y * output_width; +// Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); +// return; +// } +// /// XXX Add z dimension test +// } - // Output ptr auto* Ydata_offset = Ydata + output_width * output_y + output_x; const auto* bound = bound_data; - AccumulateType output = onnxruntime::is_8bit_v ? ConstValue::mag_factor : 0; + AccumType output = onnxruntime::is_8bit_v ? ConstValue::mag_factor : 0; const auto* weight_coeff = weight_coefficients + window_size * output_x; - int64_t xmin = *bound++; - int64_t xmax = *bound; + int64_t xmin = bound[static_cast(output_x) * 2]; + int64_t xmax = bound[static_cast(output_x) * 2 + 1]; // Input window - const auto* Xdata_offset = Xdata + input_index + xmin; + const auto* Xdata_offset = Xdata + input_width * output_y + xmin; for (; xmin < xmax; ++xmin) { // This cast is needed when we deal with half - output += static_cast((*Xdata_offset++)) * (*weight_coeff++); + output += static_cast((*Xdata_offset++)) * (*weight_coeff++); } - if constexpr (onnxruntime::is_8bit_v) { - *Ydata_offset = static_cast(clip8_lookups[output >> 22]); - } else if constexpr (std::is_same::value) { + std::cout << " " << output; + + if constexpr (onnxruntime::is_8bit_v) { + *Ydata_offset = static_cast(clip8_lookups[output >> 22]); + } else if constexpr (std::is_same::value) { *Ydata_offset = static_cast(std::round(output)); } else { - *Ydata_offset = static_cast(output); + *Ydata_offset = static_cast(output); } } -template -FUNC_DEF void ComputeInterpolationAtLevel2(IdType id, - int64_t input_height, int64_t input_width, - int64_t output_height, int64_t output_width, - const fast_divmod& div_output_width, const fast_divmod& div_output_image, - int64_t window_size, - const uint8_t* clip8_lookups, - const int64_t* bound_data, - const AccumulateType* weight_coefficients, - const InputType* Xdata, InputType* Ydata) { +template +__global__ void ComputeInterpolationAtLevel2( +#ifdef CPU_TESTING + IdType id, +#endif + int64_t input_height, int64_t input_width, + int64_t output_height, int64_t output_width, + const fast_divmod div_output_width, + int32_t window_size, + bool use_extrapolation, float extrapolation_value, + const uint8_t* clip8_lookups, + const int64_t* bound_data, + std::tuple outof_bounds_buffers, + const AccumType* weight_coefficients, + const T* Xdata, T* Ydata, int N) { +#ifndef CPU_TESTING + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); +#endif + // No need to do scale if (output_height == input_height) { Ydata[id] = Xdata[id]; return; } - // input image index and output image are the same. - int bxc, output_image_index; - div_output_image.divmod(id, bxc, output_image_index); - int64_t input_index = bxc * input_height * input_width; - int output_y, output_x; - div_output_width.divmod(output_image_index, output_y, output_x); + div_output_width.divmod(id, output_y, output_x); + +// if (use_extrapolation) { +// const auto* w_outof_bounds = std::get<1>(outof_bounds_buffers); +// // Extrapolate along the w dimension +// if (w_outof_bounds[static_cast(output_x)] != -1) { +// T* Ydata_offset = Ydata + output_y * output_width; +//#ifdef CPU_TESTING +// assert(w_outof_bounds[static_cast(output_x)] == output_x); +//#endif +// Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); +// return; +// } +// +// // Extrapolate along the y dimension +// const auto* y_outof_bounds = std::get<0>(outof_bounds_buffers); +// if (y_outof_bounds[static_cast(output_y)] != -1) { +//#ifdef CPU_TESTING +// assert(y_outof_bounds[static_cast(output_y)] == output_y); +//#endif +// T* Ydata_offset = Ydata + output_y * output_width; +// Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); +// return; +// } +// } auto* Ydata_offset = Ydata + output_width * output_y + output_x; const auto* bound = bound_data; - AccumulateType output = onnxruntime::is_8bit_v ? ConstValue::mag_factor : 0; + AccumType output = onnxruntime::is_8bit_v ? ConstValue::mag_factor : 0; const auto* weight_coeff = weight_coefficients + window_size * output_y; - int64_t ymin = *bound++; - int64_t ymax = *bound; + int64_t ymin = bound[static_cast(output_y) * 2]; + int64_t ymax = bound[static_cast(output_y) * 2 + 1]; - const auto* Xdata_offset = Xdata + input_index + ymin * output_width + output_x; + const auto* Xdata_offset = Xdata + ymin * output_width + output_x; for (; ymin < ymax; ++ymin) { - output += static_cast((*Xdata_offset)) * (*weight_coeff); + output += static_cast((*Xdata_offset)) * (*weight_coeff); Xdata_offset += input_width; weight_coeff++; } - if constexpr (onnxruntime::is_8bit_v) { - *Ydata_offset = static_cast(clip8_lookups[output >> 22]); - } else if constexpr (std::is_same::value) { + std::cout << ", " << output; + + if constexpr (onnxruntime::is_8bit_v) { + *Ydata_offset = static_cast(clip8_lookups[output >> 22]); + } else if constexpr (std::is_same::value) { *Ydata_offset = static_cast(std::round(output)); } else { *Ydata_offset = output; } } -template -FUNC_DEF void HandleExtrapolation(IdType id, int64_t input_height, int64_t input_width, - int64_t output_depth, int64_t output_height, int64_t output_width, - const float extrapolation_value, InputType* Ydata, - const fast_divmod& div_output_height, const fast_divmod& div_output_width, - const fast_divmod& div_output_image, - const int64_t* z_outof_bounds, const int64_t* y_outof_bounds, - const int64_t* w_outof_bounds) { - int bxc, output_image_index; - div_output_image.divmod(id, bxc, output_image_index); - // CUDA_LONG input_index = bxc * input_height * input_width; - - InputType* Ydata_base = Ydata + output_image_index * (output_depth * output_height * output_width); - - int output_z, output_y, output_x, temp; - div_output_height.divmod(output_image_index, output_z, temp); - div_output_width.divmod(temp, output_y, output_x); - - // Extrapolate along the w dimension - if (w_outof_bounds[static_cast(output_x)] != -1) { - InputType* Ydata_offset = Ydata_base + (output_z * output_height + output_y) * output_width; -#ifdef CPU_TESTING - assert(w_outof_bounds[static_cast(output_x)] == output_x); -#endif - Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); - } - - // Extrapolate along the y dimension - if (y_outof_bounds[static_cast(output_y)] != -1) { -#ifdef CPU_TESTING - assert(y_outof_bounds[static_cast(output_y)] == output_y); -#endif - InputType* Ydata_offset = Ydata_base + (output_z * output_height + output_y) * output_width; - Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); - } - - // Extrapolate along the y dimension - if (z_outof_bounds != nullptr && z_outof_bounds[static_cast(output_z)] != -1) { -#ifdef CPU_TESTING - assert(z_outof_bounds[static_cast(output_z)] == output_z); -#endif - InputType* Ydata_offset = Ydata_base + (output_z * output_height + output_y) * output_width; - Ydata_offset[static_cast(output_x)] = static_cast(extrapolation_value); - } -} - -template -__global__ void _UpsampleBilinearAntiAlias( -#ifdef CPU_TESTING - IdType id, -#endif - const int64_t input_height, - const int64_t input_width, - const int64_t output_depth, - const int64_t output_height, - const int64_t output_width, - fast_divmod div_output_height, fast_divmod div_output_width, fast_divmod div_output_image, - std::tuple window_sizes, // h, w - const bool use_extrapolation, - const float extrapolation_value, - std::tuple bounds_buffers, // h, w - std::tuple outof_bounds_buffers, // z, h, w - std::tuple weight_buffers, // h, w - const uint8_t* clip8_lookups, - T* image_temp_buffer, // We expect this to be input_height * output_width * num_channels - const T* Xdata, - T* Ydata, - const size_t N) { - -#ifndef CPU_TESTING - CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); -#endif - - // horizon interpolate - // This computes only the width direction.Thus height keeps unchanged. - ComputeInterpolationAtLevel1(id, input_height, input_width, input_height, output_width, - div_output_width, div_output_image, std::get<1>(window_sizes), - clip8_lookups, std::get<1>(bounds_buffers), - std::get<1>(weight_buffers), - Xdata, image_temp_buffer); - - // vertical interpolate - ComputeInterpolationAtLevel2(id, input_height, output_width, output_height, output_width, - div_output_width, div_output_image, - std::get<0>(window_sizes), - clip8_lookups, std::get<0>(bounds_buffers), - std::get<0>(weight_buffers), - image_temp_buffer, Ydata); - - if (use_extrapolation) { - HandleExtrapolation(id, input_height, input_width, output_depth, output_height, output_width, - extrapolation_value, Ydata, - div_output_height, div_output_width, div_output_image, - std::get<0>(outof_bounds_buffers), - std::get<1>(outof_bounds_buffers), - std::get<2>(outof_bounds_buffers)); - } -} - -template -__global__ void _UpsampleTrilinearAntiAlias( -#ifdef CPU_TESTING - IdType id, -#endif - const int64_t num_channels, - const int64_t input_depth, - const int64_t input_height, - const int64_t input_width, - const int64_t output_depth, - const int64_t output_height, - const int64_t output_width, - fast_divmod div_output_depth, fast_divmod div_output_height, fast_divmod div_output_width, fast_divmod div_output_image, - int64_t window_size, - const bool use_extrapolation, - const float extrapolation_value, - std::tuple bounds_buffers, // z, h, w - std::tuple outof_bounds_buffers, // z, h, w - std::tuple weight_buffers, // z, h, w - const uint8_t* clip8_lookups, - T* image_temp_buffer, - const T* Xdata, - T* Ydata, - const int N) { - -#ifndef CPU_TESTING - CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); -#endif - - // horizon interpolate - // This computes only the width direction.Thus height keeps unchanged. - ComputeInterpolationAtLevel1(id, input_height, input_width, input_height, output_width, - div_output_width, div_output_image, window_size, - clip8_lookups, std::get<2>(bounds_buffers), - std::get<2>(weight_buffers), - Xdata, Ydata); - - // vertical interpolate - ComputeInterpolationAtLevel2(id, input_height, output_width, output_height, output_width, - div_output_width, div_output_image, - window_size, clip8_lookups, std::get<1>(bounds_buffers), - std::get<1>(weight_buffers), - Xdata, Ydata); - - // depth interpolate - ComputeInterpolationAtLevel2(id, input_depth, output_height * output_width, - output_depth, output_height * output_width, - div_output_width, div_output_image, - window_size, clip8_lookups, std::get<0>(bounds_buffers), - std::get<0>(weight_buffers), - Xdata, Ydata); - - if (use_extrapolation) { - HandleExtrapolation(id, input_height, input_width, output_depth, output_height, output_width, - extrapolation_value, Ydata, - div_output_height, div_output_width, div_output_image, - std::get<0>(outof_bounds_buffers), - std::get<1>(outof_bounds_buffers), - std::get<2>(outof_bounds_buffers)); - } -} - /// /// This function expects the following buffers to be pre-allocated on device /// 1. bounds: int64_t[output_size * 2] @@ -867,17 +769,18 @@ void ResizeAntiAliasImpl( auto bounds_buffer_ptr = AllocateTyped(allocate_temp_space, bounds_buffer_size); auto out_of_bounds_buffer_ptr = AllocateTyped(allocate_temp_space, out_of_bounds_buffer_size); - const int64_t weighted_buffer_size = SafeInt(weighted_y_size) + - weighted_w_size; - auto weighted_buffer = AllocateTyped(allocate_temp_space, weighted_buffer_size); - int64_t* y_bounds_buffer = GetTyped(weighted_buffer); + int64_t* y_bounds_buffer = GetTyped(bounds_buffer_ptr); int64_t* w_bounds_buffer = y_bounds_buffer + output_height * 2; int64_t* y_outof_bounds_buffer = GetTyped(out_of_bounds_buffer_ptr); int64_t* w_outof_bounds_buffer = y_outof_bounds_buffer + output_height; - AccumType* y_weighted_buffer = GetTyped(weighted_buffer); + const int64_t weighted_buffer_size = SafeInt(weighted_y_size) + + weighted_w_size; + auto weighted_buffer_ptr = AllocateTyped(allocate_temp_space, weighted_buffer_size); + + AccumType* y_weighted_buffer = GetTyped(weighted_buffer_ptr); AccumType* w_weighted_buffer = y_weighted_buffer + weighted_y_size; #ifdef CPU_TESTING @@ -904,27 +807,57 @@ void ResizeAntiAliasImpl( auto bounds_buffer_span = gsl::make_span(GetTyped(bounds_buffer_ptr), bounds_buffer_size); auto out_of_bounds_buffer_span = gsl::make_span(GetTyped(out_of_bounds_buffer_ptr), out_of_bounds_buffer_size); - auto weighted_buffer_span = gsl::make_span(GetTyped(weighted_buffer), weighted_buffer_size); + auto weighted_buffer_span = gsl::make_span(GetTyped(weighted_buffer_ptr), weighted_buffer_size); PrintAntiAliasBuffers(std::cout, bounds_buffer_span, out_of_bounds_buffer_span, weighted_buffer_span); + const auto temp_buf_size = input_height * output_width; auto image_temp_buffer = AllocateTyped(allocate_temp_space, - narrow(input_height * output_width * num_channels)); + narrow(temp_buf_size)); + + const int input_size = narrow(input_height * input_width); + auto host_input_buffer = AllocateTyped(allocate_temp_space, input_size); + + CUDA_CALL_THROW(cudaMemcpyAsync(host_input_buffer.get(), input_data, + input_size * sizeof(T), cudaMemcpyDeviceToHost, stream)); + + std::cout << "CL1: "; + + // We feed all the inputs, but the width is reduced at this step. + for (IdType id = 0, lim = narrow(temp_buf_size); id < lim; ++id) { + ComputeInterpolationAtLevel1(id, input_height, input_width, input_height, output_width, + output_div_pitches[rank - 2], + w_window_size, + use_extrapolation, extrapolation_value, clip8_lookups, + w_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + w_weighted_buffer, GetTyped(host_input_buffer), GetTyped(image_temp_buffer), + narrow(temp_buf_size)); + } - for (IdType id = 0, lim = narrow(batch_size * input_height * num_channels * input_width); id < lim; ++id) { - _UpsampleBilinearAntiAlias( - id, - input_height, input_width, output_depth, output_height, output_width, - output_div_pitches[rank - 2], output_div_pitches[rank - 1], div_output_image, - std::make_tuple(h_window_size, w_window_size), - use_extrapolation, extrapolation_value, - std::make_tuple(y_bounds_buffer, w_bounds_buffer), - std::make_tuple(static_cast(nullptr) , y_outof_bounds_buffer, w_outof_bounds_buffer), - std::make_tuple(y_weighted_buffer, w_weighted_buffer), - clip8_lookups, - GetTyped(image_temp_buffer), - input_data, output_data, N); + std::cout << std::endl; + + auto host_output_buffer = AllocateTyped(allocate_temp_space, N); + + std::cout << "CL2: "; + + // Here we feed temp buffer input_height x output_width and the output is N. + for (IdType id = 0, lim = narrow(N); id < lim; ++id) { + ComputeInterpolationAtLevel2(id, input_height, output_width, output_height, output_width, + output_div_pitches[rank - 2], + h_window_size, + use_extrapolation, extrapolation_value, + clip8_lookups, + y_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + y_weighted_buffer, GetTyped(image_temp_buffer), GetTyped(host_output_buffer), + narrow(N)); } + std::cout << std::endl; + + + CUDA_CALL_THROW(cudaMemcpyAsync(output_data, host_output_buffer.get(), + N * sizeof(T), cudaMemcpyHostToDevice, stream)); #else DISPATCH_ANTIALIAS_FILTER_SETUP(coordinate_transform_mode, [&]() { // Data is d, h, w in tuples