-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathsubgroup_ops.rs
256 lines (218 loc) · 8.55 KB
/
subgroup_ops.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
use {
bytemuck::cast_slice,
inline_spirv::inline_spirv,
screen_13::prelude::*,
std::{mem::size_of, sync::Arc, time::Instant},
};
/// Advanced example demonstrating subgroup operations (arithmetic and ballot).
///
/// This code demonstrates "exclusive sum", which is a GPU-implementation of "parallel prefix scan".
///
/// Given an input buffer of unsigned integers, the result is an output buffer where each index is
/// the sum of all items prior to that index. Example:
///
/// Input: [1, 1, 1, 1]
/// Output: [0, 1, 2, 3]
///
/// Input: [1, 4, 0, 2]
/// Output: [0, 1, 5, 5]
///
/// This algorithm is useful in *many* places, one of which is GPU-driven rendering. In that case
/// you may have a list of meshes where each index contains the number of those meshes in the scene.
/// Exclusive sum might be used to generate an offset where you may safely record instance drawing
/// commands for a given mesh.
///
/// https://www.khronos.org/blog/vulkan-subgroup-tutorial
/// https://www.khronos.org/assets/uploads/developers/library/2018-vulkan-devday/06-subgroups.pdf
fn main() -> Result<(), DriverError> {
    // Input lengths (element counts) to run through the GPU exclusive sum, in ascending order.
    const DATA_LENS: [u32; 11] = [
        32, 64, 128, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536,
    ];

    pretty_env_logger::init();

    let device = Arc::new(Device::create_headless(DeviceInfo::default())?);

    // Copy out the Vulkan 1.1 subgroup properties this example depends on.
    let properties = &device.physical_device.properties_v1_1;
    let subgroup_size = properties.subgroup_size;
    let subgroup_supported_operations = properties.subgroup_supported_operations;

    // Refuse to run on devices lacking the required subgroup feature sets.
    assert!(subgroup_supported_operations.contains(vk::SubgroupFeatureFlags::ARITHMETIC));
    assert!(subgroup_supported_operations.contains(vk::SubgroupFeatureFlags::BALLOT));

    let reduce_pipeline = create_reduce_pipeline(&device)?;
    let excl_sum_pipeline = create_exclusive_sum_pipeline(&device)?;

    // Provided data must always fill at least one whole workgroup (one subgroup's worth of
    // elements), so shorter lengths are skipped.
    let runnable_lens = DATA_LENS
        .iter()
        .copied()
        .filter(|&len| len >= subgroup_size);

    for data_len in runnable_lens {
        let input_data = generate_input_data(data_len);
        let output_data =
            exclusive_sum(&device, &reduce_pipeline, &excl_sum_pipeline, &input_data)?;

        // Compare the GPU result against a CPU reference.
        assert_output_data(&input_data, &output_data);
    }

    Ok(())
}
fn exclusive_sum(
device: &Arc<Device>,
reduce_pipeline: &Arc<ComputePipeline>,
scan_pipeline: &Arc<ComputePipeline>,
input_data: &[u32],
) -> Result<Vec<u32>, DriverError> {
let mut render_graph = RenderGraph::new();
let input_buf = render_graph.bind_node(Buffer::create_from_slice(
device,
vk::BufferUsageFlags::STORAGE_BUFFER,
cast_slice(input_data),
)?);
let output_buf = render_graph.bind_node(Arc::new(Buffer::create(
device,
BufferInfo::host_mem(
input_data.len() as vk::DeviceSize * size_of::<u32>() as vk::DeviceSize,
vk::BufferUsageFlags::STORAGE_BUFFER,
),
)?));
let workgroup_count =
input_data.len() as u32 / device.physical_device.properties_v1_1.subgroup_size;
let reduce_count = workgroup_count - 1;
let workgroup_buf = render_graph.bind_node(Buffer::create(
device,
BufferInfo::device_mem(
reduce_count.max(1) as vk::DeviceSize * size_of::<u32>() as vk::DeviceSize,
vk::BufferUsageFlags::STORAGE_BUFFER,
),
)?);
if reduce_count > 0 {
render_graph
.begin_pass("exclusive sum reduce")
.bind_pipeline(reduce_pipeline)
.read_descriptor(0, input_buf)
.write_descriptor(1, workgroup_buf)
.record_compute(move |compute, _| {
compute.dispatch(reduce_count, 1, 1);
});
}
render_graph
.begin_pass("exclusive sum scan")
.bind_pipeline(scan_pipeline)
.read_descriptor(0, workgroup_buf)
.read_descriptor(1, input_buf)
.write_descriptor(2, output_buf)
.record_compute(move |compute, _| {
compute.dispatch(workgroup_count, 1, 1);
});
let output_buf = render_graph.unbind_node(output_buf);
let cmd_buf = render_graph
.resolve()
.submit(&mut HashPool::new(device), 0, 0)?;
let started = Instant::now();
cmd_buf.wait_until_executed()?;
println!(
"Waited {}μs (len={})",
(Instant::now() - started).as_micros(),
input_data.len()
);
Ok(cast_slice(Buffer::mapped_slice(&output_buf)).to_vec())
}
/// Builds the test input: the ascending sequence `0, 1, ..., data_len - 1`.
///
/// Returns an empty vector when `data_len` is zero.
fn generate_input_data(data_len: u32) -> Vec<u32> {
    (0..data_len).collect()
}
/// Checks that `output_data` holds the exclusive prefix sum of `input_data`.
///
/// Element `i` of the output must equal the sum of input elements `0..i`. The running total
/// wraps on overflow so that the CPU reference agrees with the modular `uint32_t` addition
/// performed by the shaders, and so that the final (unused) accumulation can never panic.
/// Output elements beyond `input_data.len()` are not inspected.
///
/// # Panics
///
/// Panics if `output_data` has fewer elements than `input_data`, or if any checked element
/// differs from the expected running total (the message names the first mismatching index).
fn assert_output_data(input_data: &[u32], output_data: &[u32]) {
    // `zip` stops at the shorter slice, so a truncated output must be rejected explicitly.
    assert!(
        output_data.len() >= input_data.len(),
        "output has {} elements but input has {}",
        output_data.len(),
        input_data.len()
    );

    let mut sum = 0u32;
    for (idx, (&input, &output)) in input_data.iter().zip(output_data).enumerate() {
        assert_eq!(sum, output, "exclusive sum at {idx} not equal");
        sum = sum.wrapping_add(input);
    }
}
/// Creates the compute pipeline for the "reduce" step of the exclusive sum.
///
/// The shader sums the input values across the subgroup with `subgroupAdd`; the elected
/// invocation then stores that total in `workgroup_buf` at the index of its workgroup.
/// `local_size_x` is specialization constant 0, which is set to the device subgroup size.
///
/// # Errors
///
/// Returns a `DriverError` if pipeline creation fails.
fn create_reduce_pipeline(device: &Arc<Device>) -> Result<Arc<ComputePipeline>, DriverError> {
    let subgroup_size = device.physical_device.properties_v1_1.subgroup_size;

    // Specialization constant 0 is a single native-endian `u32` stored at offset 0.
    let specialization = SpecializationInfo {
        data: subgroup_size.to_ne_bytes().to_vec(),
        map_entries: vec![vk::SpecializationMapEntry {
            constant_id: 0,
            offset: 0,
            size: size_of::<u32>(),
        }],
    };

    // GLSL compiled to SPIR-V at build time.
    let spirv = inline_spirv!(
        r#"
#version 460 core
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_KHR_shader_subgroup_arithmetic : require
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(binding = 0) restrict readonly buffer InputBuffer {
uint32_t input_buf[];
};
layout(binding = 1) restrict writeonly buffer WorkgroupBuffer {
uint32_t workgroup_buf[];
};
void main() {
uint32_t sum = subgroupAdd(input_buf[gl_GlobalInvocationID.x]);
if (subgroupElect()) {
workgroup_buf[gl_WorkGroupID.x] = sum;
}
}
"#,
        comp,
        vulkan1_2,
    );

    let shader = Shader::new_compute(spirv.as_slice()).specialization_info(specialization);
    let pipeline = ComputePipeline::create(device, ComputePipelineInfo::default(), shader)?;

    Ok(Arc::new(pipeline))
}
/// Creates the compute pipeline for the "scan" step of the exclusive sum.
///
/// In the shader each invocation:
///
/// 1. takes the `subgroupExclusiveAdd` of its input element within the subgroup;
/// 2. adds up a contiguous slice of `workgroup_buf` entries belonging to workgroups before its
///    own, and combines those partial totals across the subgroup with `subgroupAdd`;
/// 3. writes the sum of both values to `output_buf`.
///
/// `local_size_x` is specialization constant 0, which is set to the device subgroup size.
///
/// # Errors
///
/// Returns a `DriverError` if pipeline creation fails.
fn create_exclusive_sum_pipeline(
    device: &Arc<Device>,
) -> Result<Arc<ComputePipeline>, DriverError> {
    let subgroup_size = device.physical_device.properties_v1_1.subgroup_size;

    // Specialization constant 0 is a single native-endian `u32` stored at offset 0.
    let specialization = SpecializationInfo {
        data: subgroup_size.to_ne_bytes().to_vec(),
        map_entries: vec![vk::SpecializationMapEntry {
            constant_id: 0,
            offset: 0,
            size: size_of::<u32>(),
        }],
    };

    // GLSL compiled to SPIR-V at build time.
    let spirv = inline_spirv!(
        r#"
#version 460 core
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_KHR_shader_subgroup_arithmetic : require
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(binding = 0) restrict readonly buffer WorkgroupBuffer {
uint32_t workgroup_buf[];
};
layout(binding = 1) restrict readonly buffer InputBuffer {
uint32_t input_buf[];
};
layout(binding = 2) restrict writeonly buffer OutputBuffer {
uint32_t output_buf[];
};
void main() {
uint32_t subgroup_sum = subgroupExclusiveAdd(input_buf[gl_GlobalInvocationID.x]);
uint32_t workgroup_sum = 0;
uint workgroups_per_subgroup_invocation = (gl_NumWorkGroups.x + gl_SubgroupSize - 1) / gl_SubgroupSize;
uint start = gl_SubgroupInvocationID * workgroups_per_subgroup_invocation;
uint end = min(start + workgroups_per_subgroup_invocation, gl_WorkGroupID.x);
for (uint workgroup_id = start; workgroup_id < end; workgroup_id++) {
workgroup_sum += workgroup_buf[workgroup_id];
}
workgroup_sum = subgroupAdd(workgroup_sum);
output_buf[gl_GlobalInvocationID.x] = subgroup_sum + workgroup_sum;
}
"#,
        comp,
        vulkan1_2
    );

    let shader = Shader::new_compute(spirv.as_slice()).specialization_info(specialization);
    let pipeline = ComputePipeline::create(device, ComputePipelineInfo::default(), shader)?;

    Ok(Arc::new(pipeline))
}