-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtile_copy.ptx
102 lines (92 loc) · 2.61 KB
/
tile_copy.ptx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-34714021
// Cuda compilation tools, release 12.6, V12.6.68
// Based on NVVM 7.0.1
//
// PTX ISA version this module is written against.
.version 8.5
// Target architecture the module is compiled for (sm_52).
.target sm_52
// Addresses are 64 bits wide throughout this module.
.address_size 64
// .globl copyTileToCanvas
// -----------------------------------------------------------------------------
// copyTileToCanvas
//
// One thread per output pixel (x, y), where x and y are the global thread
// indices (ctaid * ntid + tid). Each thread picks one 3-byte pixel from the
// source image, using a truncated scaled coordinate clamped to the last
// row/column, and stores it into the destination image at
// (x + param_6, y + param_7).
//
// Parameters (roles are read off how each value is used below; the
// descriptive names are the reviewer's -- confirm against the CUDA source).
// All 32-bit parameters are compared and multiplied as signed values.
//   param_0  u64  destination base pointer (stored to)
//   param_1  u32  destination row length in pixels; exclusive bound on dest x
//   param_2  u32  exclusive bound on dest y
//   param_3  u64  source base pointer (loaded from)
//   param_4  u32  source row length in pixels; source x is clamped to param_4-1
//   param_5  u32  source row count; source y is clamped to param_5-1
//   param_6  u32  offset added to x to form the destination x
//   param_7  u32  offset added to y to form the destination y
//   param_8  u32  number of output columns (threads with x >= param_8 exit)
//   param_9  u32  number of output rows    (threads with y >= param_9 exit)
//
// Differences from the compiler-emitted version:
//   1. Destination coordinates below zero are rejected; previously only the
//      upper bound was tested, so a negative offset stored out of bounds.
//   2. Source extents <= 0 are rejected; previously the clamp limit became
//      negative and the load address fell before the source buffer.
//   3. Pixel indices and byte offsets are computed in 64 bits so they cannot
//      wrap for images larger than 2^31 / 3 pixels.
// NOTE(review): the file header says this PTX is compiler-generated; mirror
// these guards in the CUDA source or they are lost on the next rebuild.
// -----------------------------------------------------------------------------
.visible .entry copyTileToCanvas(
    .param .u64 copyTileToCanvas_param_0,
    .param .u32 copyTileToCanvas_param_1,
    .param .u32 copyTileToCanvas_param_2,
    .param .u64 copyTileToCanvas_param_3,
    .param .u32 copyTileToCanvas_param_4,
    .param .u32 copyTileToCanvas_param_5,
    .param .u32 copyTileToCanvas_param_6,
    .param .u32 copyTileToCanvas_param_7,
    .param .u32 copyTileToCanvas_param_8,
    .param .u32 copyTileToCanvas_param_9
)
{
    .reg .pred  %p<14>;
    .reg .b16   %rs<4>;
    .reg .f32   %f<11>;
    .reg .b32   %r<25>;
    .reg .b64   %rd<13>;

    // ---- Load kernel arguments ---------------------------------------------
    ld.param.u64    %rd1, [copyTileToCanvas_param_0];   // dst base
    ld.param.u32    %r5,  [copyTileToCanvas_param_1];   // dst row length
    ld.param.u32    %r6,  [copyTileToCanvas_param_2];   // dst row count
    ld.param.u64    %rd2, [copyTileToCanvas_param_3];   // src base
    ld.param.u32    %r7,  [copyTileToCanvas_param_4];   // src row length
    ld.param.u32    %r8,  [copyTileToCanvas_param_5];   // src row count
    ld.param.u32    %r9,  [copyTileToCanvas_param_6];   // x offset
    ld.param.u32    %r10, [copyTileToCanvas_param_7];   // y offset
    ld.param.u32    %r11, [copyTileToCanvas_param_8];   // output columns
    ld.param.u32    %r12, [copyTileToCanvas_param_9];   // output rows

    // ---- Global thread coordinates -----------------------------------------
    mov.u32         %r13, %ntid.x;
    mov.u32         %r14, %ctaid.x;
    mov.u32         %r15, %tid.x;
    mad.lo.s32      %r1, %r14, %r13, %r15;              // x = ctaid.x*ntid.x + tid.x
    mov.u32         %r16, %ntid.y;
    mov.u32         %r17, %ctaid.y;
    mov.u32         %r18, %tid.y;
    mad.lo.s32      %r2, %r17, %r16, %r18;              // y = ctaid.y*ntid.y + tid.y

    // ---- Guard 1: thread lies outside the output rectangle ------------------
    // This also keeps the divisions below away from a zero divisor: a
    // non-negative x can only pass when param_8 > 0 (same for y / param_9).
    setp.ge.s32     %p1, %r1, %r11;
    setp.ge.s32     %p2, %r2, %r12;
    or.pred         %p3, %p1, %p2;
    @%p3 bra        $L__BB0_3;

    // ---- Guard 2: destination pixel must lie inside [0,W) x [0,H) -----------
    add.s32         %r3, %r1, %r9;                      // cx = x + x offset
    add.s32         %r4, %r2, %r10;                     // cy = y + y offset
    setp.ge.s32     %p4, %r3, %r5;                      // cx >= W
    setp.ge.s32     %p5, %r4, %r6;                      // cy >= H
    or.pred         %p6, %p4, %p5;
    setp.lt.s32     %p7, %r3, 0;                        // cx < 0 (negative offset)
    setp.lt.s32     %p8, %r4, 0;                        // cy < 0
    or.pred         %p9, %p7, %p8;
    or.pred         %p10, %p6, %p9;
    @%p10 bra       $L__BB0_3;

    // ---- Guard 3: an empty source has nothing to sample ---------------------
    setp.lt.s32     %p11, %r7, 1;
    setp.lt.s32     %p12, %r8, 1;
    or.pred         %p13, %p11, %p12;
    @%p13 bra       $L__BB0_3;

    // ---- Scale factors: source extent / output extent -----------------------
    cvt.rn.f32.s32  %f1, %r7;
    cvt.rn.f32.s32  %f2, %r11;
    div.rn.f32      %f3, %f1, %f2;                      // scale_x
    cvt.rn.f32.s32  %f4, %r12;
    cvt.rn.f32.s32  %f5, %r8;
    div.rn.f32      %f6, %f5, %f4;                      // scale_y

    // ---- Source coordinates: truncate, then clamp to the last column/row ----
    cvt.rn.f32.s32  %f7, %r1;
    mul.f32         %f8, %f3, %f7;
    cvt.rzi.s32.f32 %r19, %f8;                          // (int)(x * scale_x)
    add.s32         %r20, %r7, -1;
    min.s32         %r21, %r19, %r20;                   // sx
    cvt.rn.f32.s32  %f9, %r2;
    mul.f32         %f10, %f6, %f9;
    cvt.rzi.s32.f32 %r22, %f10;                         // (int)(y * scale_y)
    add.s32         %r23, %r8, -1;
    min.s32         %r24, %r22, %r23;                   // sy

    // ---- Source address = src + 3 * (sy * src_row_len + sx), in 64 bits -----
    cvt.s64.s32     %rd3, %r21;
    mad.wide.s32    %rd4, %r24, %r7, %rd3;              // 64-bit pixel index
    mul.lo.s64      %rd5, %rd4, 3;                      // 3 bytes per pixel
    cvta.to.global.u64 %rd6, %rd2;
    add.s64         %rd7, %rd6, %rd5;

    // ---- Destination address = dst + 3 * (cy * dst_row_len + cx) ------------
    cvt.s64.s32     %rd8, %r3;
    mad.wide.s32    %rd9, %r4, %r5, %rd8;               // 64-bit pixel index
    mul.lo.s64      %rd10, %rd9, 3;
    cvta.to.global.u64 %rd11, %rd1;
    add.s64         %rd12, %rd11, %rd10;

    // ---- Copy the three bytes of the pixel ----------------------------------
    ld.global.u8    %rs1, [%rd7];
    st.global.u8    [%rd12], %rs1;
    ld.global.u8    %rs2, [%rd7+1];
    st.global.u8    [%rd12+1], %rs2;
    ld.global.u8    %rs3, [%rd7+2];
    st.global.u8    [%rd12+2], %rs3;

$L__BB0_3:
    ret;
}