-
Notifications
You must be signed in to change notification settings - Fork 0
/
shfl_semantic.txt
47 lines (39 loc) · 1.81 KB
/
shfl_semantic.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// Annotated semantics of the PTX shfl.sync instruction
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync
// > All non-exited threads named in mask must execute the same intrinsic
// > with the same mask, or the result is undefined.
// wait for all threads in membermask to arrive
wait_for_specified_threads(membermask);
lane[4:0] = [Thread].laneid; // position of thread in warp
bval[4:0] = b[4:0]; // source lane or lane offset (0..31)
cval[4:0] = c[4:0]; // clamp value
segmask[4:0] = c[12:8]; // => (32 - width) of __shfl_sync
// get value of source register a if thread is active and
// guard predicate true, else unpredictable
if (isActive(Thread) && isGuardPredicateTrue(Thread)) {
SourceA[lane] = a;
} else {
// Value of SourceA[lane] is unpredictable for
// inactive/predicated-off threads in warp
}
maxLane = (lane[4:0] & segmask[4:0]) | (cval[4:0] & ~segmask[4:0]);
minLane = (lane[4:0] & segmask[4:0]);
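// width must be 2^n, so segmask = 32 - 2^n (mod 32) selects only the lane bits above the segment (e.g. width 8 -> 0b11000)
// minLane = lane with its low log2(width) bits cleared -> first lane of this thread's segment
// maxLane = minLane | (cval & (width - 1)) -> first lane of the segment for .up (cval = 0); otherwise last lane of the segment (cval = 31)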
// cval = 0b00000 when .up
// cval = 0b11111 when .down
// cval = 0b11111 when .bfly
// cval = 0b11111 when .idx
switch (.mode) {
case .up: j = lane - bval; pval = (j >= maxLane); break;
case .down: j = lane + bval; pval = (j <= maxLane); break;
case .bfly: j = lane ^ bval; pval = (j <= maxLane); break;
case .idx: j = minLane | (bval[4:0] & ~segmask[4:0]);
pval = (j <= maxLane); break;
}
// If the source lane j falls outside the segment (pval false), the thread keeps its own value:
// > the lower/upper delta lanes will be unchanged
if (!pval) j = lane; // copy from own lane
d = SourceA[j]; // copy input a from lane j
if (dest predicate selected)
p = pval;