From aac546d820f42cb2332100b5adff88d5c9fe24fd Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 11 Nov 2024 13:54:49 -0800 Subject: [PATCH] [ET-VK] Reduced int precision for texture coordinates in conv2d_pw op, to reduce shader register pressure. This diff reduces the precision of texture coordinates in the conv2d_pw op in ExecuTorch's Vulkan backend to reduce shader register pressure. The loop counter z, which is also the x coordinate of the kernel texture reads, is changed from int to uint16_t, and the four kernel texel reads now use texelFetchOffset with constant offsets instead of adding 1, 2 and 3 to z for each fetch. Differential Revision: [D64767415](https://our.internmc.facebook.com/intern/diff/D64767415/) [ghstack-poisoned] --- backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index fedbdb0b5b..b1950f970e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -77,15 +77,16 @@ void main() { sum[i] = sum[0]; } + int z4 = 0; // Since the kernel is 1x1, we only have to loop over the depth dimension. - for (int z = 0, z4 = 0; z < in_group_size; z += 4, ++z4) { + for (uint16_t z = uint16_t(0); z < uint16_t(in_group_size); z += uint16_t(4), ++z4) { // During prepacking, the weight tensor has been permuted so that the // channel (IC) dim is along the x-axis, and the batch (OC) dim is along // the z-axis. 
- const vec4 ktex_0 = texelFetch(t_kernel, u16vec2(z + 0, gpos.z), 0); - const vec4 ktex_1 = texelFetch(t_kernel, u16vec2(z + 1, gpos.z), 0); - const vec4 ktex_2 = texelFetch(t_kernel, u16vec2(z + 2, gpos.z), 0); - const vec4 ktex_3 = texelFetch(t_kernel, u16vec2(z + 3, gpos.z), 0); + const vec4 ktex_0 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(0, 0)); + const vec4 ktex_1 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(1, 0)); + const vec4 ktex_2 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(2, 0)); + const vec4 ktex_3 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(3, 0)); #pragma unroll