diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index 895fd86bb01e5f..a2d3f878e1cc69 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -53,15 +53,31 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive get_internal_buffer_layouts_impl() const override { - // TODO: current implementation is supposed to have the same kernel version for both indirect/default paths, - // considering this, we may assume that both indirect/default kernels have absolutely the same intermediate - // buffers number and its' sizes (since update_dispatch_data is called for both kernels too), and - // do not double memory allocations during reallocate_if_needed() function call + // Look for the first sdpa_opt kernel entry. Currently, it can be used as default sdpa, indirect sdpa, or for both default + // and indirect cases. All sdpa_opt kernels use the same internal buffers, so we can find the first sdpa_opt and + // use its internal buffers configuration. 
The following scenarios are possible: + // 1) _kernels_data[0] - micro_sdpa (default) + // => internal buffers are not needed + // 2) _kernels_data[0] - sdpa_opt (default) + // => use internal buffers from [0] kernel + // 3) _kernels_data[0] - sdpa_opt (default) + // _kernels_data[1] - sdpa_opt (indirect) + // => use internal buffers from [0] kernel + // 4) _kernels_data[0] - micro_sdpa (default) + // _kernels_data[1] - sdpa_opt (indirect) + // => use internal buffers from [1] kernel + size_t kernel_idx = _kernels_data.size(); + if (_kernels_data.size() >= 1 && !_kernels_data[0].internalBufferSizes.empty()) { + kernel_idx = 0; + } else if (_kernels_data.size() >= 2 && !_kernels_data[1].internalBufferSizes.empty()) { + kernel_idx = 1; + } + std::vector layouts; - if (_kernels_data.size() > 0 && !_kernels_data[0].internalBufferSizes.empty()) { - auto dtype = from_data_type(_kernels_data[0].internalBufferDataType); + if (kernel_idx < _kernels_data.size()) { + auto dtype = from_data_type(_kernels_data[kernel_idx].internalBufferDataType); const auto bpp = data_type_traits::size_of(dtype); - for (auto size : _kernels_data[0].internalBufferSizes) { + for (auto size : _kernels_data[kernel_idx].internalBufferSizes) { layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flattern to x channel) {1, 1, 1, (tensor::value_type)(size / bpp)}}; layouts.push_back(inbuf_layout); @@ -300,7 +316,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive{static_cast(b), static_cast(f), static_cast(y), static_cast(x)}; + size_t b_kv = bt_in_ptr[b * kv_shape[concat_axis] + out_idx[concat_axis]]; // bt_idx = b * total_seq_len + seq_len_idx auto in_idx = std::vector{static_cast(b_kv), static_cast(f), static_cast(y), static_cast(x)}; - auto out_idx = std::vector{static_cast(b), static_cast(f), static_cast(y), static_cast(x)}; cldnn::tensor in(cldnn::format::bfyx, in_idx, 0); cldnn::tensor out(cldnn::format::bfyx, out_idx, 0); diff --git 
a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp index dba2bf4132ace6..3c25f7a8ab9c65 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp @@ -124,7 +124,7 @@ void set_arguments_impl(ocl_kernel_type& kernel, switch (scalar.t) { case scalar_t::UINT8: status = kernel.setArg(i, scalar.v.u8); - GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set scalar " << i << " (u8): " << scalar.v.u8 << "\n"; + GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set scalar " << i << " (u8): " << static_cast(scalar.v.u8) << "\n"; break; case scalar_t::UINT16: status = kernel.setArg(i, scalar.v.u16); @@ -140,7 +140,7 @@ void set_arguments_impl(ocl_kernel_type& kernel, break; case scalar_t::INT8: status = kernel.setArg(i, scalar.v.s8); - GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set scalar " << i << " (s8): " << scalar.v.s8 << "\n"; + GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set scalar " << i << " (s8): " << static_cast(scalar.v.s8) << "\n"; break; case scalar_t::INT16: status = kernel.setArg(i, scalar.v.s16); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp index 2563fe535a93d9..fe923135550e5b 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp @@ -343,6 +343,10 @@ std::vector get_test_params() { p.push_back({with_rearrange, with_mask, !with_scale, !causal, !compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); p.push_back({!with_rearrange, with_mask, !with_scale, !causal, !compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); + // Beam search + p.push_back({with_rearrange, !with_mask, !with_scale, !causal, !compressed, 2, 
ov::element::Type_t::f16, 10, 4, 1, {0, 1, 2, 3}}); + p.push_back({with_rearrange, !with_mask, !with_scale, !causal, !compressed, 4, ov::element::Type_t::f16, 5, 16, 1, {0, 2, 1, 3}}); + // Compressed p.push_back({with_rearrange, with_mask, !with_scale, !causal, compressed, 1, ov::element::Type_t::f16, 10, 1, 1, {0, 1, 2, 3}}); p.push_back({with_rearrange, with_mask, !with_scale, !causal, compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}});