From b90c43ba7e26531d07923255c587af2b4e097e61 Mon Sep 17 00:00:00 2001 From: Jozufozu Date: Fri, 18 Oct 2024 13:45:37 -0700 Subject: [PATCH] Directly visible - Don't actually need a framebuffer attachment for visibility - Instead, process everything in pass 2 and write out the visibility bitset directly - Persist visibility bits between frames for use in pass 1 - No need for indirect dispatch! - Also saves some ssbo bindings - Do frustum culling in both passes --- .../backend/compile/IndirectPrograms.java | 2 + .../engine/indirect/BufferBindings.java | 20 ++- .../engine/indirect/IndirectBuffers.java | 39 ++--- .../engine/indirect/IndirectCullingGroup.java | 29 +--- .../engine/indirect/IndirectDrawManager.java | 26 --- .../engine/indirect/VisibilityBuffer.java | 132 --------------- .../internal/indirect/buffer_bindings.glsl | 24 +-- .../flywheel/internal/indirect/cull.glsl | 152 ------------------ .../internal/indirect/early_cull.glsl | 32 +--- .../flywheel/internal/indirect/late_cull.glsl | 92 +++++++---- .../flywheel/internal/indirect/main.frag | 4 - .../flywheel/internal/indirect/main.vert | 6 +- .../internal/indirect/read_visibility.glsl | 64 -------- 13 files changed, 109 insertions(+), 513 deletions(-) delete mode 100644 common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java delete mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/cull.glsl delete mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java b/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java index ad416941f..c34a15147 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java @@ -109,6 +109,8 @@ private static CompilationHarness> createCullingCompiler(ShaderS .nameMapper(instanceType -> name + "/" + ResourceUtil.toDebugFileNameNoExtension(instanceType.cullShader())) .requireExtensions(COMPUTE_EXTENSIONS) .define("_FLW_SUBGROUP_SIZE", GlCompat.SUBGROUP_SIZE) + .enableExtension("GL_KHR_shader_subgroup_basic") + .enableExtension("GL_KHR_shader_subgroup_ballot") .withResource(CULL_SHADER_API_IMPL) .withComponent(InstanceStructComponent::new) .withResource(InstanceType::cullShader) diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/BufferBindings.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/BufferBindings.java index 67b0a9dc6..322cfe531 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/BufferBindings.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/BufferBindings.java @@ -1,18 +1,16 @@ package dev.engine_room.flywheel.backend.engine.indirect; public final class BufferBindings { - public static final int PASS_TWO_DISPATCH = 0; - public static final int PASS_TWO_INSTANCE_INDEX = 1; - public static final int PAGE_FRAME_DESCRIPTOR = 2; - public static final int INSTANCE = 3; - public static final int DRAW_INSTANCE_INDEX = 4; - public static final int MODEL = 5; - public static final int DRAW = 6; + public static final int LAST_FRAME_VISIBILITY = 0; + public static final int PAGE_FRAME_DESCRIPTOR = 1; + public static final int INSTANCE = 2; + public static final int DRAW_INSTANCE_INDEX = 3; + public static final int MODEL = 4; + public static final int DRAW = 5; - public static final int LIGHT_LUT = 7; - public static final int LIGHT_SECTION = 8; - public static final int MATRICES = 9; - public static final int LAST_FRAME_VISIBILITY = 10; + public static final int LIGHT_LUT = 6; + public static final int LIGHT_SECTION = 7; + public static final int MATRICES = 8; private BufferBindings() { } diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectBuffers.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectBuffers.java index 722c89e8f..62f1453fd 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectBuffers.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectBuffers.java @@ -7,11 +7,12 @@ import org.lwjgl.system.Pointer; import dev.engine_room.flywheel.backend.gl.buffer.GlBufferType; +import dev.engine_room.flywheel.lib.math.MoreMath; import dev.engine_room.flywheel.lib.memory.MemoryBlock; public class IndirectBuffers { // Number of vbos created. - public static final int BUFFER_COUNT = 7; + public static final int BUFFER_COUNT = 6; public static final long INT_SIZE = Integer.BYTES; public static final long PTR_SIZE = Pointer.POINTER_SIZE; @@ -30,8 +31,7 @@ public class IndirectBuffers { private static final long BUFFERS_SIZE_BYTES = SIZE_OFFSET + BUFFER_COUNT * PTR_SIZE; // Offsets to the vbos - private static final long PASS_TWO_DISPATCH_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PASS_TWO_DISPATCH * INT_SIZE; - private static final long PASS_TWO_INSTANCE_INDEX_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PASS_TWO_INSTANCE_INDEX * INT_SIZE; + private static final long LAST_FRAME_VISIBILITY_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.LAST_FRAME_VISIBILITY * INT_SIZE; private static final long PAGE_FRAME_DESCRIPTOR_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PAGE_FRAME_DESCRIPTOR * INT_SIZE; private static final long INSTANCE_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.INSTANCE * INT_SIZE; private static final long DRAW_INSTANCE_INDEX_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.DRAW_INSTANCE_INDEX * INT_SIZE; @@ -39,8 +39,7 @@ public class IndirectBuffers { private static final long DRAW_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.DRAW * INT_SIZE; // Offsets to the sizes - private static final long PASS_TWO_DISPATCH_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PASS_TWO_DISPATCH * PTR_SIZE; - private static final long PASS_TWO_INSTANCE_INDEX_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PASS_TWO_INSTANCE_INDEX * PTR_SIZE; + private static final long LAST_FRAME_VISIBILITY_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.LAST_FRAME_VISIBILITY * PTR_SIZE; private static final long PAGE_FRAME_DESCRIPTOR_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PAGE_FRAME_DESCRIPTOR * PTR_SIZE; private static final long INSTANCE_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.INSTANCE * PTR_SIZE; private static final long DRAW_INSTANCE_INDEX_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.DRAW_INSTANCE_INDEX * PTR_SIZE; @@ -66,8 +65,7 @@ public class IndirectBuffers { */ private final MemoryBlock multiBindBlock; - public final ResizableStorageBuffer passTwoDispatch; - public final ResizableStorageArray passTwoInstanceIndex; + public final ResizableStorageArray lastFrameVisibility; public final ObjectStorage objectStorage; public final ResizableStorageArray drawInstanceIndex; public final ResizableStorageArray model; @@ -76,34 +74,29 @@ public class IndirectBuffers { IndirectBuffers(long instanceStride) { this.multiBindBlock = MemoryBlock.calloc(BUFFERS_SIZE_BYTES, 1); - passTwoDispatch = new ResizableStorageBuffer(); - passTwoInstanceIndex = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR); + lastFrameVisibility = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR); objectStorage = new ObjectStorage(instanceStride); drawInstanceIndex = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR); model = new ResizableStorageArray(MODEL_STRIDE, MODEL_GROWTH_FACTOR); draw = new ResizableStorageArray(DRAW_COMMAND_STRIDE, DRAW_GROWTH_FACTOR); - - passTwoDispatch.ensureCapacity(INT_SIZE * 4); } void updateCounts(int instanceCount, int modelCount, int drawCount) { drawInstanceIndex.ensureCapacity(instanceCount); - passTwoInstanceIndex.ensureCapacity(instanceCount); + lastFrameVisibility.ensureCapacity(MoreMath.ceilingDiv(instanceCount, 32)); model.ensureCapacity(modelCount); draw.ensureCapacity(drawCount); final long ptr = multiBindBlock.ptr(); - MemoryUtil.memPutInt(ptr + PASS_TWO_DISPATCH_HANDLE_OFFSET, passTwoDispatch.handle()); - MemoryUtil.memPutInt(ptr + PASS_TWO_INSTANCE_INDEX_HANDLE_OFFSET, passTwoInstanceIndex.handle()); + MemoryUtil.memPutInt(ptr + LAST_FRAME_VISIBILITY_HANDLE_OFFSET, lastFrameVisibility.handle()); MemoryUtil.memPutInt(ptr + PAGE_FRAME_DESCRIPTOR_HANDLE_OFFSET, objectStorage.frameDescriptorBuffer.handle()); MemoryUtil.memPutInt(ptr + INSTANCE_HANDLE_OFFSET, objectStorage.objectBuffer.handle()); MemoryUtil.memPutInt(ptr + DRAW_INSTANCE_INDEX_HANDLE_OFFSET, drawInstanceIndex.handle()); MemoryUtil.memPutInt(ptr + MODEL_HANDLE_OFFSET, model.handle()); MemoryUtil.memPutInt(ptr + DRAW_HANDLE_OFFSET, draw.handle()); - MemoryUtil.memPutAddress(ptr + PASS_TWO_DISPATCH_SIZE_OFFSET, passTwoDispatch.capacity()); - MemoryUtil.memPutAddress(ptr + PASS_TWO_INSTANCE_INDEX_SIZE_OFFSET, INT_SIZE * instanceCount); + MemoryUtil.memPutAddress(ptr + LAST_FRAME_VISIBILITY_SIZE_OFFSET, INT_SIZE * MoreMath.ceilingDiv(instanceCount, 32)); MemoryUtil.memPutAddress(ptr + PAGE_FRAME_DESCRIPTOR_SIZE_OFFSET, objectStorage.frameDescriptorBuffer.capacity()); MemoryUtil.memPutAddress(ptr + INSTANCE_SIZE_OFFSET, objectStorage.objectBuffer.capacity()); MemoryUtil.memPutAddress(ptr + DRAW_INSTANCE_INDEX_SIZE_OFFSET, INT_SIZE * instanceCount); @@ -112,24 +105,23 @@ void updateCounts(int instanceCount, int modelCount, int drawCount) { } public void bindForCullPassOne() { - multiBind(0, 6); + multiBind(0, 5); } public void bindForCullPassTwo() { - multiBind(0, 6); - GlBufferType.DISPATCH_INDIRECT_BUFFER.bind(passTwoDispatch.handle()); + multiBind(0, 5); } public void bindForApply() { - multiBind(5, 2); + multiBind(4, 2); } public void bindForModelReset() { - multiBind(5, 1); + multiBind(4, 1); } public void bindForDraw() { - multiBind(3, 4); + multiBind(2, 4); GlBufferType.DRAW_INDIRECT_BUFFER.bind(draw.handle()); } @@ -155,7 +147,6 @@ public void delete() { drawInstanceIndex.delete(); model.delete(); draw.delete(); - passTwoDispatch.delete(); - passTwoInstanceIndex.delete(); + lastFrameVisibility.delete(); } } diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectCullingGroup.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectCullingGroup.java index b998ea6f8..4500534c0 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectCullingGroup.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectCullingGroup.java @@ -6,7 +6,6 @@ import static org.lwjgl.opengl.GL42.GL_COMMAND_BARRIER_BIT; import static org.lwjgl.opengl.GL42.glMemoryBarrier; import static org.lwjgl.opengl.GL43.glDispatchCompute; -import static org.lwjgl.opengl.GL43.glDispatchComputeIndirect; import java.util.ArrayList; import java.util.Comparator; @@ -14,8 +13,6 @@ import java.util.List; import java.util.Map; -import org.lwjgl.opengl.GL46; - import dev.engine_room.flywheel.api.instance.Instance; import dev.engine_room.flywheel.api.instance.InstanceType; import dev.engine_room.flywheel.api.material.Material; @@ -29,7 +26,6 @@ import dev.engine_room.flywheel.backend.engine.uniform.Uniforms; import dev.engine_room.flywheel.backend.gl.GlCompat; import dev.engine_room.flywheel.backend.gl.shader.GlProgram; -import dev.engine_room.flywheel.lib.material.LightShaders; import dev.engine_room.flywheel.lib.math.MoreMath; public class IndirectCullingGroup { @@ -54,12 +50,6 @@ public class IndirectCullingGroup { private boolean needsDrawSort; public int instanceCountThisFrame; - private int pagesLastFrame = 0; - private int pagesThisFrame = 0; - - private int visibilityWriteOffsetPages = 0; - private int visibilityReadOffsetPages = 0; - IndirectCullingGroup(InstanceType instanceType, IndirectPrograms programs) { this.instanceType = instanceType; instanceStride = MoreMath.align4(instanceType.layout() @@ -95,17 +85,6 @@ public void flushInstancers() { } } - public int flipVisibilityOffsets(int visibilityWriteOffsetPages) { - this.visibilityReadOffsetPages = this.visibilityWriteOffsetPages; - this.visibilityWriteOffsetPages = visibilityWriteOffsetPages; - - pagesLastFrame = pagesThisFrame; - - pagesThisFrame = buffers.objectStorage.capacity(); - - return pagesThisFrame; - } - public void upload(StagingBuffer stagingBuffer) { if (nothingToDo()) { return; @@ -127,8 +106,6 @@ public void upload(StagingBuffer stagingBuffer) { } uploadDraws(stagingBuffer); - - GL46.nglClearNamedBufferData(buffers.passTwoDispatch.handle(), GL46.GL_R32UI, GL46.GL_RED, GL46.GL_UNSIGNED_INT, 0); } public void dispatchCull() { @@ -139,8 +116,6 @@ public void dispatchCull() { Uniforms.bindAll(); earlyCull.bind(); - earlyCull.setUInt("_flw_visibilityReadOffsetPages", visibilityReadOffsetPages); - buffers.bindForCullPassOne(); glDispatchCompute(buffers.objectStorage.capacity(), 1, 1); } @@ -154,7 +129,7 @@ public void dispatchCullPassTwo() { lateCull.bind(); buffers.bindForCullPassTwo(); - glDispatchComputeIndirect(0); + glDispatchCompute(buffers.objectStorage.capacity(), 1, 1); } public void dispatchApply() { @@ -257,8 +232,6 @@ public void submit(VisualType visualType) { // Don't need to do this unless the program changes. drawProgram.bind(); baseDrawUniformLoc = drawProgram.getUniformLocation("_flw_baseDraw"); - - drawProgram.setUInt("_flw_visibilityWriteOffsetInstances", visibilityWriteOffsetPages << ObjectStorage.LOG_2_PAGE_SIZE); } glUniform1ui(baseDrawUniformLoc, multiDraw.start); diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java index c898a84dc..d59cbe381 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java @@ -53,11 +53,6 @@ public class IndirectDrawManager extends DrawManager> { private final MatrixBuffer matrixBuffer; private final DepthPyramid depthPyramid; - private final VisibilityBuffer visibilityBuffer; - - private int totalPagesLastFrame = 0; - - private boolean needsBarrier = false; public IndirectDrawManager(IndirectPrograms programs) { this.programs = programs; @@ -73,7 +68,6 @@ public IndirectDrawManager(IndirectPrograms programs) { matrixBuffer = new MatrixBuffer(); depthPyramid = new DepthPyramid(programs); - visibilityBuffer = new VisibilityBuffer(programs); } @Override @@ -112,8 +106,6 @@ public void render(VisualType visualType) { glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_BUFFER_UPDATE_BARRIER_BIT); - visibilityBuffer.bind(); - for (var group1 : cullingGroups.values()) { group1.dispatchCull(); } @@ -124,8 +116,6 @@ public void render(VisualType visualType) { glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); - visibilityBuffer.attach(); - submitDraws(); depthPyramid.generate(); @@ -156,8 +146,6 @@ public void render(VisualType visualType) { MaterialRenderState.reset(); TextureBinder.resetLightAndOverlay(); - - visibilityBuffer.detach(); } private void dispatchApply() { @@ -185,20 +173,12 @@ public void flush(LightStorage lightStorage, EnvironmentStorage environmentStora group.flushInstancers(); } - visibilityBuffer.read(totalPagesLastFrame); - visibilityBuffer.clear(); - cullingGroups.values() .removeIf(IndirectCullingGroup::checkEmptyAndDelete); instancers.values() .removeIf(instancer -> instancer.instanceCount() == 0); - int totalPagesThisFrame = 0; - for (var group : cullingGroups.values()) { - totalPagesThisFrame += group.flipVisibilityOffsets(totalPagesThisFrame); - } - meshPool.flush(); stagingBuffer.reclaim(); @@ -215,10 +195,6 @@ public void flush(LightStorage lightStorage, EnvironmentStorage environmentStora // We could probably save some driver calls here when there are // actually zero instances, but that feels like a very rare case - - needsBarrier = true; - - totalPagesLastFrame = totalPagesThisFrame; } @Override @@ -238,8 +214,6 @@ public void delete() { programs.release(); depthPyramid.delete(); - - visibilityBuffer.delete(); } public void renderCrumbling(List crumblingBlocks) { diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java deleted file mode 100644 index 7a82f7735..000000000 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java +++ /dev/null @@ -1,132 +0,0 @@ -package dev.engine_room.flywheel.backend.engine.indirect; - -import org.lwjgl.opengl.GL30; -import org.lwjgl.opengl.GL32; -import org.lwjgl.opengl.GL46; -import org.lwjgl.opengl.GL46C; - -import com.mojang.blaze3d.platform.GlStateManager; - -import dev.engine_room.flywheel.backend.FlwBackend; -import dev.engine_room.flywheel.backend.compile.IndirectPrograms; -import dev.engine_room.flywheel.backend.gl.GlTextureUnit; -import dev.engine_room.flywheel.lib.math.MoreMath; -import it.unimi.dsi.fastutil.ints.IntArraySet; -import it.unimi.dsi.fastutil.ints.IntSet; -import net.minecraft.client.Minecraft; - -public class VisibilityBuffer { - private static final int READ_GROUP_SIZE = 32; - private static final int ATTACHMENT = GL30.GL_COLOR_ATTACHMENT1; - - private final IndirectPrograms programs; - private final ResizableStorageArray lastFrameVisibility; - private int textureId = -1; - - private int lastWidth = -1; - private int lastHeight = -1; - - private final IntSet attached = new IntArraySet(); - - public VisibilityBuffer(IndirectPrograms programs) { - this.programs = programs; - lastFrameVisibility = new ResizableStorageArray(Integer.BYTES, 1.25f); - } - - public void read(int pageCount) { - if (pageCount == 0) { - return; - } - - lastFrameVisibility.ensureCapacity(pageCount); - - GL46.nglClearNamedBufferData(lastFrameVisibility.handle(), GL46.GL_R32UI, GL46.GL_RED_INTEGER, GL46.GL_UNSIGNED_INT, 0); - - if (lastWidth == -1 || lastHeight == -1) { - return; - } - - programs.getReadVisibilityProgram() - .bind(); - bind(); - - GlTextureUnit.T0.makeActive(); - GlStateManager._bindTexture(textureId); - - GL46.glDispatchCompute(MoreMath.ceilingDiv(lastWidth, READ_GROUP_SIZE), MoreMath.ceilingDiv(lastHeight, READ_GROUP_SIZE), 1); - } - - public void bind() { - GL46.glBindBufferBase(GL46.GL_SHADER_STORAGE_BUFFER, BufferBindings.LAST_FRAME_VISIBILITY, lastFrameVisibility.handle()); - } - - public void attach() { - var mainRenderTarget = Minecraft.getInstance() - .getMainRenderTarget(); - - setupTexture(mainRenderTarget.width, mainRenderTarget.height); - - if (attached.add(mainRenderTarget.frameBufferId)) { - GL46.glNamedFramebufferTexture(mainRenderTarget.frameBufferId, ATTACHMENT, textureId, 0); - - try { - mainRenderTarget.checkStatus(); - } catch (Exception e) { - FlwBackend.LOGGER.error("Error attaching visbuffer", e); - } - } - - // Enable writes - GL46.glNamedFramebufferDrawBuffers(mainRenderTarget.frameBufferId, new int[] { GL30.GL_COLOR_ATTACHMENT0, ATTACHMENT }); - } - - public void detach() { - var mainRenderTarget = Minecraft.getInstance() - .getMainRenderTarget(); - - // Disable writes - GL46.glNamedFramebufferDrawBuffers(mainRenderTarget.frameBufferId, new int[] { GL30.GL_COLOR_ATTACHMENT0 }); - } - - public void delete() { - deleteTexture(); - lastFrameVisibility.delete(); - } - - private void deleteTexture() { - if (textureId != -1) { - GL32.glDeleteTextures(textureId); - textureId = -1; - } - } - - public void clear() { - if (lastWidth == -1 || lastHeight == -1) { - return; - } - - GL46C.nglClearTexImage(textureId, 0, GL32.GL_RED_INTEGER, GL32.GL_UNSIGNED_INT, 0); - } - - private void setupTexture(int width, int height) { - if (lastWidth == width && lastHeight == height) { - return; - } - - // Need to rebind to all fbos because an attachment becomes incomplete when it's resized - attached.clear(); - - lastWidth = width; - lastHeight = height; - - deleteTexture(); - - textureId = GL46.glCreateTextures(GL46.GL_TEXTURE_2D); - GL46.glTextureStorage2D(textureId, 1, GL32.GL_R32UI, width, height); - - GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_MIN_FILTER, GL32.GL_NEAREST); - GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_MAG_FILTER, GL32.GL_NEAREST); - GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_WRAP_S, GL32.GL_CLAMP_TO_EDGE); - GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_WRAP_T, GL32.GL_CLAMP_TO_EDGE); - } -} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/buffer_bindings.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/buffer_bindings.glsl index 87eb99051..a035454a5 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/buffer_bindings.glsl +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/buffer_bindings.glsl @@ -1,17 +1,17 @@ +// FIXME: minimum required SSBO bindings in OpenGL is 8, but we use 9. +// A few of these could be combined. + // Per culling group -#define _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING 0 // cull1 -#define _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING 1 // cull1, cull2 -#define _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING 2 // cull1, cull2 -#define _FLW_INSTANCE_BUFFER_BINDING 3 // cull1, cull2, draw -#define _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING 4 // cull1, cull2, draw -#define _FLW_MODEL_BUFFER_BINDING 5 // cull1, cull2, apply -#define _FLW_DRAW_BUFFER_BINDING 6 // apply, draw +#define _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING 0// cull1, cull2 +#define _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING 1// cull1, cull2 +#define _FLW_INSTANCE_BUFFER_BINDING 2// cull1, cull2, draw +#define _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING 3// cull1, cull2, draw +#define _FLW_MODEL_BUFFER_BINDING 4// cull1, cull2, apply +#define _FLW_DRAW_BUFFER_BINDING 5// apply, draw // Global to the engine -#define _FLW_LIGHT_LUT_BUFFER_BINDING 7 -#define _FLW_LIGHT_SECTIONS_BUFFER_BINDING 8 - -#define _FLW_MATRIX_BUFFER_BINDING 9 +#define _FLW_LIGHT_LUT_BUFFER_BINDING 6 +#define _FLW_LIGHT_SECTIONS_BUFFER_BINDING 7 -#define _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING 10 +#define _FLW_MATRIX_BUFFER_BINDING 8 diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/cull.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/cull.glsl deleted file mode 100644 index 395979d5c..000000000 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/cull.glsl +++ /dev/null @@ -1,152 +0,0 @@ -#include "flywheel:internal/indirect/buffer_bindings.glsl" -#include "flywheel:internal/indirect/model_descriptor.glsl" -#include "flywheel:internal/uniforms/uniforms.glsl" -#include "flywheel:util/matrix.glsl" -#include "flywheel:internal/indirect/matrices.glsl" - -layout(local_size_x = 32) in; - -layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer TargetBuffer { - uint _flw_instanceIndices[]; -}; - -// High 6 bits for the number of instances in the page. -const uint _FLW_PAGE_COUNT_OFFSET = 26u; -// Bottom 26 bits for the model index. -const uint _FLW_MODEL_INDEX_MASK = 0x3FFFFFF; - -layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict readonly buffer PageFrameDescriptorBuffer { - uint _flw_pageFrameDescriptors[]; -}; - -layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer { - ModelDescriptor _flw_models[]; -}; - -layout(std430, binding = _FLW_MATRIX_BUFFER_BINDING) restrict readonly buffer MatrixBuffer { - Matrices _flw_matrices[]; -}; - -layout(binding = 0) uniform sampler2D _flw_depthPyramid; - -// Disgustingly vectorized sphere frustum intersection taking advantage of ahead of time packing. -// Only uses 6 fmas and some boolean ops. -// See also: -// flywheel:uniform/flywheel.glsl -// dev.engine_room.flywheel.lib.math.MatrixMath.writePackedFrustumPlanes -// org.joml.FrustumIntersection.testSphere -bool _flw_testSphere(vec3 center, float radius) { - bvec4 xyInside = greaterThanEqual(fma(flw_frustumPlanes.xyX, center.xxxx, fma(flw_frustumPlanes.xyY, center.yyyy, fma(flw_frustumPlanes.xyZ, center.zzzz, flw_frustumPlanes.xyW))), -radius.xxxx); - bvec2 zInside = greaterThanEqual(fma(flw_frustumPlanes.zX, center.xx, fma(flw_frustumPlanes.zY, center.yy, fma(flw_frustumPlanes.zZ, center.zz, flw_frustumPlanes.zW))), -radius.xx); - - return all(xyInside) && all(zInside); -} - -bool projectSphere(vec3 c, float r, float znear, float P00, float P11, out vec4 aabb) { - // Closest point on the sphere is between the camera and the near plane, don't even attempt to cull. - if (c.z + r > -znear) { - return false; - } - - vec3 cr = c * r; - float czr2 = c.z * c.z - r * r; - - float vx = sqrt(c.x * c.x + czr2); - float minx = (vx * c.x - cr.z) / (vx * c.z + cr.x); - float maxx = (vx * c.x + cr.z) / (vx * c.z - cr.x); - - float vy = sqrt(c.y * c.y + czr2); - float miny = (vy * c.y - cr.z) / (vy * c.z + cr.y); - float maxy = (vy * c.y + cr.z) / (vy * c.z - cr.y); - - aabb = vec4(minx * P00, miny * P11, maxx * P00, maxy * P11); - aabb = aabb.xwzy * vec4(-0.5f, -0.5f, -0.5f, -0.5f) + vec4(0.5f); // clip space -> uv space - - return true; -} - -bool _flw_isVisible(uint instanceIndex, uint modelIndex) { - uint matrixIndex = _flw_models[modelIndex].matrixIndex; - BoundingSphere sphere = _flw_models[modelIndex].boundingSphere; - - vec3 center; - float radius; - _flw_unpackBoundingSphere(sphere, center, radius); - - FlwInstance instance = _flw_unpackInstance(instanceIndex); - - flw_transformBoundingSphere(instance, center, radius); - - if (matrixIndex > 0) { - transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius); - } - - bool isVisible = _flw_testSphere(center, radius); - - if (isVisible) { - transformBoundingSphere(flw_view, center, radius); - - vec4 aabb; - if (projectSphere(center, radius, _flw_cullData.znear, _flw_cullData.P00, _flw_cullData.P11, aabb)) - { - float width = (aabb.z - aabb.x) * _flw_cullData.pyramidWidth; - float height = (aabb.w - aabb.y) * _flw_cullData.pyramidHeight; - - int level = clamp(int(ceil(log2(max(width, height)))), 0, _flw_cullData.pyramidLevels); - - ivec2 levelSize = textureSize(_flw_depthPyramid, level); - - ivec4 levelSizePair = ivec4(levelSize, levelSize); - - ivec4 bounds = ivec4(aabb * vec4(levelSizePair)); - - // Clamp to the texture bounds. - // Since we're not going through a sampler out of bounds texel fetches will return 0. - bounds = clamp(bounds, ivec4(0), levelSizePair); - - float depth01 = texelFetch(_flw_depthPyramid, bounds.xw, level).r; - float depth11 = texelFetch(_flw_depthPyramid, bounds.zw, level).r; - float depth10 = texelFetch(_flw_depthPyramid, bounds.zy, level).r; - float depth00 = texelFetch(_flw_depthPyramid, bounds.xy, level).r; - - float depth; - if (_flw_cullData.useMin == 0) { - depth = max(max(depth00, depth01), max(depth10, depth11)); - } else { - depth = min(min(depth00, depth01), min(depth10, depth11)); - } - - float depthSphere = 1. + _flw_cullData.znear / (center.z + radius); - - isVisible = isVisible && depthSphere <= depth; - } - } - - return isVisible; -} - -void main() { - uint pageIndex = gl_WorkGroupID.x; - - if (pageIndex >= _flw_pageFrameDescriptors.length()) { - return; - } - - uint packedModelIndexAndCount = _flw_pageFrameDescriptors[pageIndex]; - - uint pageInstanceCount = packedModelIndexAndCount >> _FLW_PAGE_COUNT_OFFSET; - - if (gl_LocalInvocationID.x >= pageInstanceCount) { - return; - } - - uint instanceIndex = gl_GlobalInvocationID.x; - - uint modelIndex = packedModelIndexAndCount & _FLW_MODEL_INDEX_MASK; - - if (_flw_isVisible(instanceIndex, modelIndex)) { - uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1); - uint targetIndex = _flw_models[modelIndex].baseInstance + localIndex; - _flw_instanceIndices[targetIndex] = instanceIndex; - } -} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/early_cull.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/early_cull.glsl index 1c82a935d..d06e3ef51 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/early_cull.glsl +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/early_cull.glsl @@ -7,16 +7,6 @@ layout(local_size_x = 32) in; -uniform uint _flw_visibilityReadOffsetPages; - -layout(std430, binding = _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING) restrict buffer PassTwoDispatchBuffer { - _FlwLateCullDispatch _flw_lateCullDispatch; -}; - -layout(std430, binding = _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer PassTwoIndexBuffer { - uint _flw_passTwoIndices[]; -}; - layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer DrawIndexBuffer { uint _flw_drawIndices[]; }; @@ -31,7 +21,7 @@ layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict rea }; layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict readonly buffer LastFrameVisibilityBuffer { - uint _flw_lastFrameVisibility[]; + uint _flw_visibility[]; }; layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer { @@ -74,6 +64,10 @@ bool _flw_isVisible(uint instanceIndex, uint modelIndex) { return _flw_testSphere(center, radius); } +// TODO: There's an opportunity here to write out the transformed bounding spheres to a buffer and use them in pass 2, +// instead of pulling the entire instance again. It would save a lot of memory bandwidth and matrix multiplications in +// pass 2, but it would also be a good bit of writes in pass 1. It's worth investigating, but it would be nice to have +// nsight trace working to be more sure. void main() { uint pageIndex = gl_WorkGroupID.x; @@ -97,26 +91,12 @@ void main() { return; } - uint pageVisibility = _flw_lastFrameVisibility[_flw_visibilityReadOffsetPages + pageIndex]; + uint pageVisibility = _flw_visibility[pageIndex]; if ((pageVisibility & (1u << gl_LocalInvocationID.x)) != 0u) { // This instance was visibile last frame, it should be rendered early. uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1); uint targetIndex = _flw_models[modelIndex].baseInstance + localIndex; _flw_drawIndices[targetIndex] = instanceIndex; - } else { - // Try again later to see if it's been disoccluded. - uint targetIndex = atomicAdd(_flw_lateCullDispatch.threadCount, 1); - _flw_passTwoIndices[targetIndex] = instanceIndex; - - if (targetIndex % 32u == 0u) { - // This thread wrote an index that will be at the start of a new workgroup later - atomicAdd(_flw_lateCullDispatch.x, 1); - - if (targetIndex == 0) { - _flw_lateCullDispatch.y = 1; - _flw_lateCullDispatch.z = 1; - } - } } } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/late_cull.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/late_cull.glsl index 4a32340b9..6459b84eb 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/late_cull.glsl +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/late_cull.glsl @@ -7,15 +7,6 @@ layout(local_size_x = 32) in; - -layout(std430, binding = _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING) restrict buffer PassTwoDispatchBuffer { - _FlwLateCullDispatch _flw_lateCullDispatch; -}; - -layout(std430, binding = _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING) restrict readonly buffer PassTwoIndexBuffer { - uint _flw_passTwoIndices[]; -}; - layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer DrawIndexBuffer { uint _flw_drawIndices[]; }; @@ -31,6 +22,10 @@ layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict rea uint _flw_pageFrameDescriptors[]; }; +layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict buffer LastFrameVisibilityBuffer { + uint _flw_visibility[]; +}; + layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer { ModelDescriptor _flw_models[]; }; @@ -64,22 +59,20 @@ bool projectSphere(vec3 c, float r, float znear, float P00, float P11, out vec4 return true; } -bool _flw_isVisible(uint instanceIndex, uint modelIndex) { - uint matrixIndex = _flw_models[modelIndex].matrixIndex; - BoundingSphere sphere = _flw_models[modelIndex].boundingSphere; - - vec3 center; - float radius; - _flw_unpackBoundingSphere(sphere, center, radius); - - FlwInstance instance = _flw_unpackInstance(instanceIndex); - - flw_transformBoundingSphere(instance, center, radius); - - if (matrixIndex > 0) { - transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius); - } +// Disgustingly vectorized sphere frustum intersection taking advantage of ahead of time packing. +// Only uses 6 fmas and some boolean ops. +// See also: +// flywheel:uniform/flywheel.glsl +// dev.engine_room.flywheel.lib.math.MatrixMath.writePackedFrustumPlanes +// org.joml.FrustumIntersection.testSphere +bool _flw_testSphere(vec3 center, float radius) { + bvec4 xyInside = greaterThanEqual(fma(flw_frustumPlanes.xyX, center.xxxx, fma(flw_frustumPlanes.xyY, center.yyyy, fma(flw_frustumPlanes.xyZ, center.zzzz, flw_frustumPlanes.xyW))), -radius.xxxx); + bvec2 zInside = greaterThanEqual(fma(flw_frustumPlanes.zX, center.xx, fma(flw_frustumPlanes.zY, center.yy, fma(flw_frustumPlanes.zZ, center.zz, flw_frustumPlanes.zW))), -radius.xx); + + return all(xyInside) && all(zInside); +} +bool _flw_hizTest(vec3 center, float radius) { transformBoundingSphere(flw_view, center, radius); vec4 aabb; @@ -116,22 +109,63 @@ bool _flw_isVisible(uint instanceIndex, uint modelIndex) { return true; } +bool _flw_isVisible(uint instanceIndex, uint modelIndex) { + uint matrixIndex = _flw_models[modelIndex].matrixIndex; + BoundingSphere sphere = _flw_models[modelIndex].boundingSphere; + + vec3 center; + float radius; + _flw_unpackBoundingSphere(sphere, center, radius); + + FlwInstance instance = _flw_unpackInstance(instanceIndex); + + flw_transformBoundingSphere(instance, center, radius); + + if (matrixIndex > 0) { + transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius); + } + + bool visible = _flw_testSphere(center, radius); + + if (visible) { + visible = visible && _flw_hizTest(center, radius); + } + + return visible; +} + void main() { - if (gl_GlobalInvocationID.x >= _flw_lateCullDispatch.threadCount) { + uint pageIndex = gl_WorkGroupID.x; + + if (pageIndex >= _flw_pageFrameDescriptors.length()) { return; } - uint instanceIndex = _flw_passTwoIndices[gl_GlobalInvocationID.x]; + uint packedModelIndexAndCount = _flw_pageFrameDescriptors[pageIndex]; - uint pageIndex = instanceIndex >> 5; + uint pageInstanceCount = packedModelIndexAndCount >> _FLW_PAGE_COUNT_OFFSET; - uint packedModelIndexAndCount = _flw_pageFrameDescriptors[pageIndex]; + if (gl_LocalInvocationID.x >= pageInstanceCount) { + return; + } + + uint instanceIndex = gl_GlobalInvocationID.x; uint modelIndex = packedModelIndexAndCount & _FLW_MODEL_INDEX_MASK; - if (_flw_isVisible(instanceIndex, modelIndex)) { + bool visible = _flw_isVisible(instanceIndex, modelIndex); + bool visibleLastFrame = (_flw_visibility[pageIndex] & (1u << gl_LocalInvocationID.x)) != 0u; + + if (visible && !visibleLastFrame) { uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1); uint targetIndex = _flw_models[modelIndex].baseInstance + localIndex; _flw_drawIndices[targetIndex] = instanceIndex; } + + // FIXME: need a non-subgroup path + uvec4 visibility = subgroupBallot(visible); + + if (subgroupElect()) { + _flw_visibility[pageIndex] = visibility.x; + } } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.frag b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.frag index 62e3b4d4a..ae0173ea8 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.frag +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.frag @@ -6,13 +6,9 @@ flat in uvec2 _flw_packedMaterial; flat in uint _flw_instanceID; -layout(location = 1) out uint _flw_out_instanceID; - void main() { _flw_unpackUint2x16(_flw_packedMaterial.x, _flw_uberFogIndex, _flw_uberCutoutIndex); _flw_unpackMaterialProperties(_flw_packedMaterial.y, flw_material); _flw_main(_flw_instanceID); - - _flw_out_instanceID = _flw_instanceID; } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.vert b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.vert index 500a509a2..72a29e9b5 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.vert +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.vert @@ -21,10 +21,6 @@ layout(std430, binding = _FLW_MATRIX_BUFFER_BINDING) restrict buffer MatrixBuffe uniform uint _flw_baseDraw; -// We read the visibility buffer for all culling groups into a single shared buffer. -// This offset is used to know where each culling group starts. -uniform uint _flw_visibilityWriteOffsetInstances = 0; - flat out uvec2 _flw_packedMaterial; flat out uint _flw_instanceID; @@ -60,5 +56,5 @@ void main() { _flw_main(instance); // Add 1 because a 0 instance id means null. - _flw_instanceID = _flw_visibilityWriteOffsetInstances + instanceIndex + 1; + _flw_instanceID = instanceIndex + 1; } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl deleted file mode 100644 index b4d506f16..000000000 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl +++ /dev/null @@ -1,64 +0,0 @@ -#include "flywheel:internal/indirect/buffer_bindings.glsl" - -layout(local_size_x = 256) in; - -layout(binding = 0) uniform usampler2D visBuffer; - -layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict buffer LastFrameVisibilityBuffer { - uint _flw_lastFrameVisibility[]; -}; - -uint extractBits(uint e, uint offset, uint count) { - return (e >> offset) & ((1u << count) - 1u); -} - -uint insertBits(uint e, uint newbits, uint offset, uint count) { - uint countMask = ((1u << count) - 1u); - // zero out the bits we're going to replace first - return (e & ~(countMask << offset)) | ((newbits & countMask) << offset); -} - -uvec2 remap_for_wave_reduction(uint a) { - return uvec2( - insertBits(extractBits(a, 2u, 3u), a, 0u, 1u), - insertBits(extractBits(a, 3u, 3u), extractBits(a, 1u, 2u), 0u, 2u) - ); -} - -void emit(uint instanceID) { - // Null instance id. - if (instanceID == 0) { - return; - } - - // Adjust for null to find the actual index. - instanceID = instanceID - 1; - - uint index = instanceID >> 5; - - uint mask = 1u << (instanceID & 31u); - - atomicOr(_flw_lastFrameVisibility[index], mask); -} - -void main() { - uvec2 sub_xy = remap_for_wave_reduction(gl_LocalInvocationIndex % 64u); - uint x = sub_xy.x + 8u * ((gl_LocalInvocationIndex >> 6u) % 2u); - uint y = sub_xy.y + 8u * (gl_LocalInvocationIndex >> 7u); - - ivec2 tex = ivec2(gl_WorkGroupID.xy) * 32 + ivec2(x, y) * 2; - - uint instanceID01 = texelFetchOffset(visBuffer, tex, 0, ivec2(0, 1)).r; - uint instanceID11 = texelFetchOffset(visBuffer, tex, 0, ivec2(1, 1)).r; - uint instanceID10 = texelFetchOffset(visBuffer, tex, 0, ivec2(1, 0)).r; - uint instanceID00 = texelFetch(visBuffer, tex, 0).r; - - if (instanceID00 == instanceID01 && instanceID01 == instanceID10 && instanceID10 == instanceID11) { - emit(instanceID00); - } else { - emit(instanceID00); - emit(instanceID01); - emit(instanceID10); - emit(instanceID11); - } -}