Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement hierarchical depth culling for meshlets #87

Merged
merged 12 commits into from
Aug 3, 2024
Merged
4 changes: 4 additions & 0 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,8 @@
"-DCMAKE_BUILD_TYPE=RelWithDebInfo",
// Ninja color output
"-DCMAKE_CXX_FLAGS=-fdiagnostics-color=always",
// mold is significantly faster than the default linkers
"-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=mold",
"-G",
"Ninja",
"-B",
Expand All @@ -305,6 +307,8 @@
"-DCMAKE_BUILD_TYPE=Debug",
// Ninja color output
"-DCMAKE_CXX_FLAGS=-fdiagnostics-color=always",
// mold is significantly faster than the default linkers
"-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=mold",
"-G",
"Ninja",
"-B",
Expand Down
2 changes: 2 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Vulkan renderer spun off from following https://vulkan-tutorial.com/. Work of [S
- Mesh shaders
- Drawlist generation and meshlet culling in compute
- Not all HW supports task shaders so let's have a unified implementation
- Hierarchical depth culling without explicit blocker geometry
- Based on Aaltonen's work in [GPU-Driven Rendering Pipelines](https://www.advances.realtimerendering.com/s2015/aaltonenhaar_siggraph2015_combined_final_footer_220dpi.pdf)
- ReSTIR DI in deferred path
- Initial candidate sampling and biased spatial reuse implemented so far
- Path tracing reference
Expand Down
182 changes: 164 additions & 18 deletions res/shader/draw_list_culler.comp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#extension GL_EXT_shader_8bit_storage : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_EXT_shader_image_load_formatted : require

#include "scene/camera.glsl"
#include "scene/geometry.glsl"
Expand All @@ -16,27 +17,48 @@ struct DrawMeshletInstance
uint drawInstanceID;
uint meshletID;
};

// Read-only storage buffer laid out as a count followed by a runtime-sized
// array of DrawMeshletInstance entries.
layout(std430, set = STORAGE_SET, binding = 0) readonly buffer InDrawList
{
// Presumably the number of valid entries in instance[]; the read site is not
// visible in this view.
uint count;
DrawMeshletInstance instance[];
}
inDrawList;
layout(std430, set = STORAGE_SET, binding = 1) writeonly buffer OutDrawList

// Output draw list for meshlets that pass culling. main() bumps count with
// atomicAdd and writes the surviving instances into instance[].
layout(std430, set = STORAGE_SET, binding = 1) buffer OutDrawList
{
// Kept in sync with outDispatchArguments.groupsX by main()
uint count;
DrawMeshletInstance instance[];
}
outDrawList;

layout(std430, set = STORAGE_SET, binding = 2) buffer DispatchArguments
// Group counts written by main().
layout(std430, set = STORAGE_SET, binding = 2) buffer OutDispatchArguments
{
// Doubles as the write pointer into outDrawList.instance: main() atomically
// adds each subgroup's visible meshlet count and uses the returned previous
// value as the subgroup's start offset.
uint groupsX;
// Set to 1 by the invocation with gl_GlobalInvocationID.x == 0
uint groupsY;
// Presumably also set to 1 in main(); that write is not visible in this view.
uint groupsZ;
}
outDispatchArguments;

// List of meshlets that isSphereOccluded() flagged as occluded. main() only
// appends to it when PC.outputSecondPhaseInput == 1.
layout(std430, set = STORAGE_SET, binding = 3) buffer OutSecondPhaseDrawList
{
// Used as the atomic write pointer into instance[]
uint count;
DrawMeshletInstance instance[];
}
outSecondPhaseDrawList;

// Hierarchical depth bound as an array of textures. isSphereOccluded() picks
// an element by mip index and gathers from it through depthSampler.
layout(set = STORAGE_SET, binding = 4) uniform texture2D
inHierarchicalDepth[MAX_HIZ_MIPS];
// This should clamp to 1 on/beyond edges, so that out-of-bounds samples never
// cause a meshlet to be culled by the depth comparison in isSphereOccluded()
layout(set = STORAGE_SET, binding = 5) uniform sampler depthSampler;

// Push constants controlling the occlusion test and the second phase output.
layout(push_constant) uniform DrawListCullerPC
{
// 0 means no hiz bound. isSphereOccluded() then reports nothing as occluded.
// Otherwise this is the exclusive upper bound for the mip index it samples.
uint hizMipCount;
// When 1, main() appends occluded meshlets to outSecondPhaseDrawList
uint outputSecondPhaseInput;
}
PC;

// Evaluates the plane equation stored in `plane` (xyz, w) at point p by
// extending p with a homogeneous 1.
float signedDistance(vec4 plane, vec3 p)
{
    vec4 homogeneousPoint = vec4(p, 1);
    return dot(plane, homogeneousPoint);
}

void transformBounds(inout MeshletBounds bounds, Transforms trfn, float scale)
Expand All @@ -57,6 +79,99 @@ bool isSphereOutsideFrustum(MeshletBounds bounds)
signedDistance(camera.topPlane, bounds.center) < -bounds.radius;
}

// Projects a sphere into a 2D bounding box in uv space.
// From https://zeux.io/2023/01/12/approximate-projected-bounds/
// which is based on
// 2D Polyhedral Bounds of a Clipped, Perspective-Projected 3D Sphere. Michael
// Mara, Morgan McGuire. 2013
// Assumes c,r are in view space and that the projection is symmetrical.
// P00 and P11 are the x and y scale terms of the projection.
// Returns false without writing aabb when c.z < r + znear. Otherwise writes
// the bounds into aabb and returns true.
bool projectSphereView(
    vec3 c, float r, float znear, float P00, float P11, out vec4 aabb)
{
    bool tooCloseToNear = c.z < r + znear;
    if (tooCloseToNear)
        return false;

    vec3 centerTimesRadius = c * r;
    float zSqMinusRadiusSq = c.z * c.z - r * r;

    // Horizontal extents
    float lenX = sqrt(c.x * c.x + zSqMinusRadiusSq);
    float loX =
        (lenX * c.x - centerTimesRadius.z) / (lenX * c.z + centerTimesRadius.x);
    float hiX =
        (lenX * c.x + centerTimesRadius.z) / (lenX * c.z - centerTimesRadius.x);

    // Vertical extents
    float lenY = sqrt(c.y * c.y + zSqMinusRadiusSq);
    float loY =
        (lenY * c.y - centerTimesRadius.z) / (lenY * c.z + centerTimesRadius.y);
    float hiY =
        (lenY * c.y + centerTimesRadius.z) / (lenY * c.z - centerTimesRadius.y);

    vec4 clipExtents = vec4(loX * P00, loY * P11, hiX * P00, hiY * P11);
    // clip space -> uv space. The swizzle swaps the two y extents as the y
    // axis is negated by the scale.
    aabb = clipExtents.xwzy * vec4(0.5f, -0.5f, 0.5f, -0.5f) + vec4(0.5f);

    return true;
}

// Tests the bounding sphere against the hierarchical depth. Returns true only
// when the point on the sphere closest to the camera has a smaller depth
// value than all four gathered hiz texels. Returns false when no hiz is bound,
// when the sphere can't be projected, or when the required mip doesn't exist.
bool isSphereOccluded(MeshletBounds bounds)
{
    // Nothing to test against
    if (PC.hizMipCount == 0)
        return false;

    vec4 viewCenter = camera.worldToCamera * vec4(bounds.center, 1);

    // Project the bounds to get their size on screen. This also early outs if
    // the cam is inside the bounds, or if the bounds are behind the camera.
    float paddedRadius = bounds.radius * camera.maxViewScale;
    vec3 viewCenterFlippedZ = vec3(viewCenter.xy, -viewCenter.z);
    vec4 uvAabb;
    bool projected = projectSphereView(
        viewCenterFlippedZ, paddedRadius, camera.near,
        camera.cameraToClip[0][0], camera.cameraToClip[1][1], uvAabb);
    if (!projected)
        return false;

    // Length of the projected bounds' diagonal in pixels
    vec2 diagonalPx = (uvAabb.zw - uvAabb.xy) * camera.resolution;
    float pxRadius = length(diagonalPx);

    // Pick the first mip where the whole sphere will fit a 2x2 texel area.
    // There is no + 1 after the floor because hiz mip 0 is depth mip 1. The
    // max() keeps the result at 0 for radii below 1, whose log2 is negative.
    uint hizMip = uint(max(floor(log2(pxRadius)), 0));
    if (hizMip >= PC.hizMipCount)
        return false;

    // Project the center to find the uv to gather around
    vec4 clipCenter = camera.cameraToClip * viewCenter;
    vec2 ndcCenter = clipCenter.xy / clipCenter.w;
    vec2 uv = ndcCenter * .5 + .5;
    // Pick the closest 2x2 set of texels around the sample for gather.
    // Sampler should clamp to a border of 1 so that out of bounds samples
    // don't get incorrectly culled.
    uv = floor(uv * camera.resolution - .5) / camera.resolution;

    // Depth of the point on the bounds that is closest to the camera, for
    // conservative culling
    // TODO:
    // We're only interested in z and w, so xy are extra math here
    vec3 towardsEye = normalize(camera.eye.xyz - bounds.center);
    vec3 nearestWorldPos = bounds.center + towardsEye * bounds.radius;
    vec4 nearestClipPos =
        camera.cameraToClip * camera.worldToCamera * vec4(nearestWorldPos, 1);
    float nearestDepth = nearestClipPos.z / nearestClipPos.w;

    // Gather the neighborhood around the sample point.
    // Let's not worry about the cases when the whole bounds are guaranteed to
    // fit in one px. Sub-pixel meshlets are a bad time for perf regardless.
    vec4 gathered = textureGather(
        sampler2D(inHierarchicalDepth[nonuniformEXT(hizMip)], depthSampler), uv,
        0);
    float furthestHizDepth =
        min(min(gathered.x, gathered.y), min(gathered.z, gathered.w));

    // Reverse-Z, so the furthest away point has the smallest depth value. The
    // cluster is occluded if its depth value is smaller than what is in the
    // depth buffer.
    return nearestDepth < furthestHizDepth;
}

bool isConeCapHidden(MeshletBounds bounds)
{
// From meshoptimizer.h
Expand All @@ -69,6 +184,8 @@ layout(local_size_x = GROUP_X) in;
void main()
{
// The dispatch arguments were zeroed before the pass so that X can be used as
// the write pointer without barriers after this init. The init itself now only
// sets the other group counts to 1.
if (gl_GlobalInvocationID.x == 0)
{
outDispatchArguments.groupsY = 1;
Expand All @@ -91,6 +208,7 @@ void main()

float scale = modelInstanceScales.instance[instance.modelInstanceID];
bool meshletVisible = true;
bool meshletOccluded = false;
if (scale != 0.)
{
MeshletBounds bounds =
Expand All @@ -101,24 +219,52 @@ void main()
meshletVisible = !isSphereOutsideFrustum(bounds);
if (meshletVisible)
meshletVisible = !isConeCapHidden(bounds);
if (meshletVisible)
{
meshletOccluded = isSphereOccluded(bounds);
meshletVisible = !meshletOccluded;
}
}

// Figure out the subgroup offset for writes
uvec4 visibleMeshletsMask = subgroupBallot(meshletVisible);
uint subgroupMeshletCount = subgroupBallotBitCount(visibleMeshletsMask);
uint subgroupStartOffset;
if (subgroupElect())
{
// Keep count in the buffer in sync for consistency
atomicAdd(outDrawList.count, subgroupMeshletCount);
subgroupStartOffset =
atomicAdd(outDispatchArguments.groupsX, subgroupMeshletCount);
// Figure out the subgroup offset for writes
uvec4 visibleMeshletsMask = subgroupBallot(meshletVisible);
uint subgroupMeshletCount = subgroupBallotBitCount(visibleMeshletsMask);
uint subgroupStartOffset;
if (subgroupElect())
{
// Keep count in the buffer in sync for consistency
atomicAdd(outDrawList.count, subgroupMeshletCount);
subgroupStartOffset =
atomicAdd(outDispatchArguments.groupsX, subgroupMeshletCount);
}
subgroupStartOffset = subgroupBroadcastFirst(subgroupStartOffset);

// Write out within the subgroup block
uint threadOffset =
subgroupBallotExclusiveBitCount(visibleMeshletsMask);
if (meshletVisible)
outDrawList.instance[subgroupStartOffset + threadOffset] =
meshletInstance;
}
subgroupStartOffset = subgroupBroadcastFirst(subgroupStartOffset);

// Write out within the subgroup block
uint threadOffset = subgroupBallotExclusiveBitCount(visibleMeshletsMask);
if (meshletVisible)
outDrawList.instance[subgroupStartOffset + threadOffset] =
meshletInstance;
if (PC.outputSecondPhaseInput == 1)
{
// Figure out the subgroup offset for writes
uvec4 occludedMeshletsMask = subgroupBallot(meshletOccluded);
uint subgroupMeshletCount =
subgroupBallotBitCount(occludedMeshletsMask);
uint subgroupStartOffset;
if (subgroupElect())
subgroupStartOffset =
atomicAdd(outSecondPhaseDrawList.count, subgroupMeshletCount);
subgroupStartOffset = subgroupBroadcastFirst(subgroupStartOffset);

// Write out within the subgroup block
uint threadOffset =
subgroupBallotExclusiveBitCount(occludedMeshletsMask);
if (meshletOccluded)
outSecondPhaseDrawList
.instance[subgroupStartOffset + threadOffset] = meshletInstance;
}
}
103 changes: 103 additions & 0 deletions res/shader/hiz_downsampler.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#pragma shader_stage(compute)

#extension GL_EXT_shader_image_load_formatted : require

// Based on A Life of a Bokeh by Guillaume Abadie
// https://advances.realtimerendering.com/s2018/index.htm
// implemented using https://github.com/GPUOpen-Effects/FidelityFX-SPD

// Source depth, fetched per texel in SpdLoadSourceImage()
layout(set = 0, binding = 0) uniform texture2D depthSrc;
// Combined with depthSrc for texelFetch in SpdLoadSourceImage()
layout(set = 0, binding = 1) uniform sampler depthSampler;
// Destination mips. SpdStore() writes depthDst[mip] and SpdLoad() reads
// depthDst[5] back.
// NOTE(review): 12 presumably matches the maximum mip count SPD supports,
// confirm against ext/ffx_spd.h
layout(set = 0, binding = 2) uniform coherent image2D depthDst[12];
// Global counter for the SPD callbacks: SpdIncreaseAtomicCounter() increments
// it atomically and SpdResetAtomicCounter() sets it back to 0.
layout(std430, set = 0, binding = 3) coherent buffer SpdGlobalAtomicBuffer
{
uint counter;
}
spdGlobalAtomic;

// Push constants for the downsampling pass.
layout(push_constant) uniform ReducePC
{
// Used to clamp source fetches in SpdLoadSourceImage() and, shifted right by
// the mip index, to clamp loads in SpdLoad() and bound writes in SpdStore()
ivec2 topMipResolution;
// Forwarded to SpdDownsample() in main()
uint numWorkGroupsPerSlice;
// Mip count forwarded to SpdDownsample() in main()
uint mips;
}
PC;

#define A_GPU 1
#define A_GLSL 1
#include "ext/ffx_a.h"

// Workgroup shared storage accessed through SpdLoadIntermediate() and
// SpdStoreIntermediate()
shared AF4 spdIntermediate[16][16];
// Holds the value atomicAdd returned in SpdIncreaseAtomicCounter(). Read back
// through SpdGetAtomicCounter().
shared AU1 spdCounter;

// SPD callback that reads one texel of the source depth. The depth is
// returned in x and the other components are 0. slice is unused.
AF4 SpdLoadSourceImage(ASU2 p, AU1 slice)
{
    // Clamp to edge
    ASU2 lastTexel = PC.topMipResolution - 1;
    ASU2 clampedCoord = min(p, lastTexel);

    // TODO:
    // Single fetch per pixel feels excessive instead of 4 texel gather. Does
    // SPD support the latter without hacking?
    float depth =
        texelFetch(sampler2D(depthSrc, depthSampler), clampedCoord, 0).x;

    return AF4(depth, 0, 0, 0);
}

// SPD callback that reads one texel back from depthDst[5]. The depth is
// returned in x and the other components are 0. slice is unused.
AF4 SpdLoad(ASU2 p, AU1 slice)
{
    // Clamp to edge. The resolution is never allowed to drop below 1 texel.
    ASU2 lastTexel = max(PC.topMipResolution >> 5, ASU2(1)) - 1;
    ASU2 clampedCoord = min(p, lastTexel);

    // TODO:
    // Single fetch per pixel feels excessive instead of 4 texel gather. Does
    // SPD support the latter without hacking? Is it even possible from a
    // image2D?
    float depth = imageLoad(depthDst[5], clampedCoord).x;

    return AF4(depth, 0, 0, 0);
}

// SPD callback that writes value.x into depthDst[mip] at p. Writes outside
// the mip's resolution are dropped. slice is unused.
void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice)
{
    // The resolution is never allowed to drop below 1 texel
    ASU2 mipRes = max(PC.topMipResolution >> mip, ASU2(1));

    // Only write texels that fall inside the mip
    bool insideMip = all(lessThan(p, mipRes));
    if (insideMip)
        imageStore(depthDst[mip], p, AF4(value.x, 0, 0, 0));
}

// SPD callback that increments the global counter by one and stashes the
// value it had before the increment in spdCounter. slice is unused.
void SpdIncreaseAtomicCounter(AU1 slice)
{
    uint previousCount = atomicAdd(spdGlobalAtomic.counter, 1);
    spdCounter = previousCount;
}

// SPD callback that reads the shared intermediate value at [x][y].
AF4 SpdLoadIntermediate(AU1 x, AU1 y)
{
    AF4 stored = spdIntermediate[x][y];
    return stored;
}

// SPD callback that writes the shared intermediate value at [x][y]. Only
// value.x is kept and the other components are stored as 0.
void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value)
{
    AF4 depthOnly = AF4(value.x, 0, 0, 0);
    spdIntermediate[x][y] = depthOnly;
}

// SPD callback that returns the value stored by SpdIncreaseAtomicCounter().
AU1 SpdGetAtomicCounter()
{
    return spdCounter;
}

// SPD callback that sets the global counter back to 0. slice is unused.
void SpdResetAtomicCounter(AU1 slice)
{
    spdGlobalAtomic.counter = 0;
}

// SPD callback that reduces four samples into one by taking the minimum of
// their x components. The other components of the result are 0.
AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3)
{
    // Keep the furthest away sample.
    // Reverse-Z so the furthest away sample is the smallest.
    float furthest01 = min(v0.x, v1.x);
    float furthest23 = min(v2.x, v3.x);
    return AF4(min(furthest01, furthest23), 0, 0, 0);
}

#include "ext/ffx_spd.h"

layout(local_size_x = GROUP_X) in;
// Entry point: hands the invocation's IDs and the push constants over to
// SpdDownsample().
void main()
{
    AU2 workGroupId = AU2(gl_WorkGroupID.xy);
    AU1 localIndex = AU1(gl_LocalInvocationIndex);
    AU1 mipCount = AU1(PC.mips);
    AU1 workGroupCount = AU1(PC.numWorkGroupsPerSlice);
    AU1 slice = AU1(gl_WorkGroupID.z);

    SpdDownsample(workGroupId, localIndex, mipCount, workGroupCount, slice);
}
1 change: 1 addition & 0 deletions res/shader/scene/camera.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ layout(std430, set = CAMERA_SET, binding = 0) buffer CameraDSB
vec2 previousJitter;
float near;
float far;
float maxViewScale;
}
camera;

Expand Down
Loading
Loading