shadps4-emu · psucien · Sep 21, 2024 · Sep 11, 2024 · Sep 12, 2024 · Sep 13, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -579,6 +579,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
                src/video_core/renderer_vulkan/vk_master_semaphore.h
                src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
                src/video_core/renderer_vulkan/vk_pipeline_cache.h
+               src/video_core/renderer_vulkan/vk_pipeline_common.cpp
+               src/video_core/renderer_vulkan/vk_pipeline_common.h
                src/video_core/renderer_vulkan/vk_platform.cpp
                src/video_core/renderer_vulkan/vk_platform.h
                src/video_core/renderer_vulkan/vk_rasterizer.cpp

diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp
@@ -157,8 +157,11 @@ Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, const
     ImageOperands operands;
     operands.AddOffset(ctx, offset);
     operands.Add(spv::ImageOperandsMask::Lod, lod);
-    return ctx.OpBitcast(
-        ctx.F32[4], ctx.OpImageFetch(result_type, image, coords, operands.mask, operands.operands));
+    const Id texel =
+        texture.is_storage
+            ? ctx.OpImageRead(result_type, image, coords, operands.mask, operands.operands)
+            : ctx.OpImageFetch(result_type, image, coords, operands.mask, operands.operands);
+    return ctx.OpBitcast(ctx.F32[4], texel);
 }
 
 Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, u32 handle, Id lod, bool skip_mips) {

diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -510,7 +510,8 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) {
     case AmdGpu::ImageType::Color3D:
         return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, sampled, format);
     case AmdGpu::ImageType::Cube:
-        return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, false, false, sampled, format);
+        return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, desc.is_array, false, sampled,
+                             format);
     default:
         break;
     }
@@ -534,6 +535,7 @@ void EmitContext::DefineImagesAndSamplers() {
             .sampled_type = image_desc.is_storage ? sampled_type : TypeSampledImage(image_type),
             .pointer_type = pointer_type,
             .image_type = image_type,
+            .is_storage = image_desc.is_storage,
         });
         interfaces.push_back(id);
         ++binding;

diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -200,6 +200,7 @@ class EmitContext final : public Sirit::Module {
         Id sampled_type;
         Id pointer_type;
         Id image_type;
+        bool is_storage = false;
     };
 
     struct BufferDefinition {
@@ -216,8 +217,8 @@ class EmitContext final : public Sirit::Module {
         u32 binding;
         Id image_type;
         Id result_type;
-        bool is_integer;
-        bool is_storage;
+        bool is_integer = false;
+        bool is_storage = false;
     };
 
     u32& binding;

diff --git a/src/shader_recompiler/frontend/decode.cpp b/src/shader_recompiler/frontend/decode.cpp
@@ -1032,6 +1032,7 @@ void GcnDecodeContext::decodeInstructionMIMG(uint64_t hexInstruction) {
 
     m_instruction.control.mimg = *reinterpret_cast<InstControlMIMG*>(&hexInstruction);
     m_instruction.control.mimg.mod = getMimgModifier(m_instruction.opcode);
+    ASSERT(m_instruction.control.mimg.r128 == 0);
 }
 
 void GcnDecodeContext::decodeInstructionDS(uint64_t hexInstruction) {

diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp
@@ -71,6 +71,9 @@ void Translator::EmitExport(const GcnInst& inst) {
             ir.SetAttribute(attrib, comp, swizzle(i));
         }
     }
+    if (IR::IsMrt(attrib)) {
+        info.mrt_mask |= 1u << u8(attrib);
+    }
 }
 
 } // namespace Shader::Gcn
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -546,6 +546,7 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) {
     info.has_offset.Assign(flags.test(MimgModifier::Offset));
     info.explicit_lod.Assign(explicit_lod);
     info.has_derivatives.Assign(has_derivatives);
+    info.is_array.Assign(mimg.da);
 
     // Issue IR instruction, leaving unknown fields blank to patch later.
     const IR::Value texel = [&]() -> IR::Value {
@@ -630,6 +631,7 @@ void Translator::IMAGE_GATHER(const GcnInst& inst) {
     info.has_offset.Assign(flags.test(MimgModifier::Offset));
     // info.explicit_lod.Assign(explicit_lod);
     info.gather_comp.Assign(std::bit_width(mimg.dmask) - 1);
+    info.is_array.Assign(mimg.da);
 
     // Issue IR instruction, leaving unknown fields blank to patch later.
     const IR::Value texel = [&]() -> IR::Value {

diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
@@ -64,9 +64,10 @@ struct ImageResource {
     u32 dword_offset;
     AmdGpu::ImageType type;
     AmdGpu::NumberFormat nfmt;
-    bool is_storage;
-    bool is_depth;
+    bool is_storage{};
+    bool is_depth{};
     bool is_atomic{};
+    bool is_array{};
 
     constexpr AmdGpu::Image GetSharp(const Info& info) const noexcept;
 };
@@ -171,6 +172,7 @@ struct Info {
     bool uses_fp64{};
     bool uses_step_rates{};
     bool translation_failed{}; // indicates that shader has unsupported instructions
+    u8 mrt_mask{0u};
 
     explicit Info(Stage stage_, ShaderParams params)
         : stage{stage_}, pgm_hash{params.hash}, pgm_base{params.Base()},

diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -200,9 +200,10 @@ class Descriptors {
     u32 Add(const ImageResource& desc) {
         const u32 index{Add(image_resources, desc, [&desc](const auto& existing) {
             return desc.sgpr_base == existing.sgpr_base &&
-                   desc.dword_offset == existing.dword_offset && desc.type == existing.type &&
-                   desc.is_storage == existing.is_storage;
+                   desc.dword_offset == existing.dword_offset;
         })};
+        auto& image = image_resources[index];
+        image.is_storage |= desc.is_storage;
         return index;
     }
 
@@ -441,18 +442,29 @@ void PatchTextureBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
 }
 
 IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& t,
-                         const IR::Value& z, bool is_storage) {
+                         const IR::Value& z, bool is_storage, bool is_array) {
     // When cubemap is written with imageStore it is treated like 2DArray.
     if (is_storage) {
         return ir.CompositeConstruct(s, t, z);
     }
+
+    ASSERT(s.Type() == IR::Type::F32); // in case of fetched image need to adjust the code below
+
     // We need to fix x and y coordinate,
     // because the s and t coordinate will be scaled and plus 1.5 by v_madak_f32.
     // We already force the scale value to be 1.0 when handling v_cubema_f32,
     // here we subtract 1.5 to recover the original value.
     const IR::Value x = ir.FPSub(IR::F32{s}, ir.Imm32(1.5f));
     const IR::Value y = ir.FPSub(IR::F32{t}, ir.Imm32(1.5f));
-    return ir.CompositeConstruct(x, y, z);
+    if (is_array) {
+        const IR::U32 array_index = ir.ConvertFToU(32, IR::F32{z});
+        const IR::U32 face_id = ir.BitwiseAnd(array_index, ir.Imm32(7u));
+        const IR::U32 slice_id = ir.ShiftRightLogical(array_index, ir.Imm32(3u));
+        return ir.CompositeConstruct(x, y, ir.ConvertIToF(32, 32, false, face_id),
+                                     ir.ConvertIToF(32, 32, false, slice_id));
+    } else {
+        return ir.CompositeConstruct(x, y, z);
+    }
 }
 
 void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
@@ -481,14 +493,16 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
     }
     ASSERT(image.GetType() != AmdGpu::ImageType::Invalid);
     const bool is_storage = IsImageStorageInstruction(inst);
+    const auto type = image.IsPartialCubemap() ? AmdGpu::ImageType::Color2DArray : image.GetType();
     u32 image_binding = descriptors.Add(ImageResource{
         .sgpr_base = tsharp.sgpr_base,
         .dword_offset = tsharp.dword_offset,
-        .type = image.GetType(),
+        .type = type,
         .nfmt = static_cast<AmdGpu::NumberFormat>(image.GetNumberFmt()),
         .is_storage = is_storage,
         .is_depth = bool(inst_info.is_depth),
         .is_atomic = IsImageAtomicInstruction(inst),
+        .is_array = bool(inst_info.is_array),
     });
 
     // Read sampler sharp. This doesn't exist for IMAGE_LOAD/IMAGE_STORE instructions
@@ -545,7 +559,8 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
         case AmdGpu::ImageType::Color3D: // x, y, z
             return {ir.CompositeConstruct(body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)};
         case AmdGpu::ImageType::Cube: // x, y, face
-            return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2), is_storage),
+            return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2), is_storage,
+                                   inst_info.is_array),
                     body->Arg(3)};
         default:
             UNREACHABLE_MSG("Unknown image type {}", image.GetType());

diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h
@@ -59,6 +59,7 @@ union TextureInstInfo {
     BitField<5, 1, u32> has_offset;
     BitField<6, 2, u32> gather_comp;
     BitField<8, 1, u32> has_derivatives;
+    BitField<9, 1, u32> is_array;
 };
 
 union BufferInstInfo {

diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h
@@ -62,7 +62,8 @@ struct StageSpecialization {
                      });
         ForEachSharp(binding, images, info->images,
                      [](auto& spec, const auto& desc, AmdGpu::Image sharp) {
-                         spec.type = sharp.GetType();
+                         spec.type = sharp.IsPartialCubemap() ? AmdGpu::ImageType::Color2DArray
+                                                              : sharp.GetType();
                          spec.is_integer = AmdGpu::IsInteger(sharp.GetNumberFmt());
                      });
     }

diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h
@@ -238,10 +238,15 @@ struct Image {
         return pitch + 1;
     }
 
-    u32 NumLayers() const {
+    u32 NumLayers(bool is_array) const {
         u32 slices = GetType() == ImageType::Color3D ? 1 : depth + 1;
         if (GetType() == ImageType::Cube) {
-            slices *= 6;
+            if (is_array) {
+                slices = last_array + 1;
+                ASSERT(slices % 6 == 0);
+            } else {
+                slices = 6;
+            }
         }
         if (pow2pad) {
             slices = std::bit_ceil(slices);
@@ -282,6 +287,11 @@ struct Image {
     bool IsTiled() const {
         return GetTilingMode() != TilingMode::Display_Linear;
     }
+
+    bool IsPartialCubemap() const {
+        const auto viewed_slice = last_array - base_array + 1;
+        return GetType() == ImageType::Cube && viewed_slice < 6;
+    }
 };
 static_assert(sizeof(Image) == 32); // 256bits
 

diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -581,15 +581,23 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
         return false;
     }
     Image& image = texture_cache.GetImage(image_id);
+    ASSERT_MSG(device_addr == image.info.guest_address,
+               "Texel buffer aliases image subresources {:x} : {:x}", device_addr,
+               image.info.guest_address);
     boost::container::small_vector<vk::BufferImageCopy, 8> copies;
     u32 offset = buffer.Offset(image.cpu_addr);
     const u32 num_layers = image.info.resources.layers;
+    u32 total_size = 0;
     for (u32 m = 0; m < image.info.resources.levels; m++) {
         const u32 width = std::max(image.info.size.width >> m, 1u);
         const u32 height = std::max(image.info.size.height >> m, 1u);
         const u32 depth =
             image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
         const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
+        offset += mip_ofs * num_layers;
+        if (offset + (mip_size * num_layers) > buffer.SizeBytes()) {
+            break;
+        }
         copies.push_back({
             .bufferOffset = offset,
             .bufferRowLength = static_cast<u32>(mip_pitch),
@@ -603,11 +611,11 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
             .imageOffset = {0, 0, 0},
             .imageExtent = {width, height, depth},
         });
-        offset += mip_ofs * num_layers;
+        total_size += mip_size * num_layers;
     }
     if (!copies.empty()) {
         scheduler.EndRendering();
-        image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
+        image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {});
         const auto cmdbuf = scheduler.CommandBuffer();
         cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer,
                                  copies);

diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.h b/src/video_core/renderer_vulkan/liverpool_to_vk.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <span>
+#include "common/assert.h"
 #include "video_core/amdgpu/liverpool.h"
 #include "video_core/amdgpu/pixel_format.h"
 #include "video_core/amdgpu/resource.h"
@@ -55,4 +56,13 @@ vk::SampleCountFlagBits NumSamples(u32 num_samples, vk::SampleCountFlags support
 
 void EmitQuadToTriangleListIndices(u8* out_indices, u32 num_vertices);
 
+static inline vk::Format PromoteFormatToDepth(vk::Format fmt) {
+    if (fmt == vk::Format::eR32Sfloat) {
+        return vk::Format::eD32Sfloat;
+    } else if (fmt == vk::Format::eR16Unorm) {
+        return vk::Format::eD16Unorm;
+    }
+    UNREACHABLE();
+}
+
 } // namespace Vulkan::LiverpoolToVK
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -202,7 +202,8 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image, bool is_eop
     scheduler.EndRendering();
     const auto cmdbuf = scheduler.CommandBuffer();
 
-    image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead, cmdbuf);
+    image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {},
+                  cmdbuf);
 
     const std::array pre_barrier{
         vk::ImageMemoryBarrier{
@@ -228,7 +229,7 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image, bool is_eop
 
     // Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image.
     cmdbuf.blitImage(
-        image.image, image.layout, frame->image, vk::ImageLayout::eTransferDstOptimal,
+        image.image, image.last_state.layout, frame->image, vk::ImageLayout::eTransferDstOptimal,
         MakeImageBlit(image.info.size.width, image.info.size.height, frame->width, frame->height),
         vk::Filter::eLinear);
 
@@ -269,6 +270,9 @@ void RendererVulkan::Present(Frame* frame) {
 
     auto& scheduler = present_scheduler;
     const auto cmdbuf = scheduler.CommandBuffer();
+
+    ImGui::Core::Render(cmdbuf, frame);
+
     {
         auto* profiler_ctx = instance.GetProfilerContext();
         TracyVkNamedZoneC(profiler_ctx, renderer_gpu_zone, cmdbuf, "Host frame",
@@ -326,8 +330,6 @@ void RendererVulkan::Present(Frame* frame) {
             },
         };
 
-        ImGui::Core::Render(cmdbuf, frame);
-
         cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eColorAttachmentOutput,
                                vk::PipelineStageFlagBits::eTransfer,
                                vk::DependencyFlagBits::eByRegion, {}, {}, pre_barriers);