# asm-pinsrw-nodep.s
# in amd64, encoding (%rbp, %rax, 4) takes an extra byte vs. using other base regs: it has to be encoded with an explicit zero disp8 rather than with no displacement at all
# r13 suffers the same problem as rbp. http://www.x86-64.org/documentation/assembly.html
# pinsrw $2, 0x0000(%rbp, %rax, 4), %xmm0 # 7B
# pinsrw $2, 0x0000(%r10, %rax, 4), %xmm0 # 7B
# pinsrw $2, 0x0000(%rcx, %rax, 4), %xmm0 # 6B
# pinsrw $1, (%rsi,%rax,4), %xmm2 # 6B
# vpinsrw $0x4,(%rcx,%rbp,4),%xmm0,%xmm1 # 6B
# vpinsrw $0x7,(%rcx,%rbx,4),%xmm12,%xmm14 # 6B
# vpinsrw $0x3,(%rcx,%r11,4),%xmm10,%xmm12 # 7B
# freeing up more low regs would seem to require more prologue to shuffle rsi and rdi
# so don't bother unless optimizing for something without a uop cache
# or could use rax or rbx, but then we'd lose the nice readability, and get mov %ch, %rbp or something
.text
.align 16
##### hacked up to use different xmm regs, to see if pinsrw dep chains are a bottleneck
##### somehow it's slower than the dependent version, except with -mavx (which generates VEX versions of everything)
##### even with vzeroupper between every timing
.globl rs_process_pinsrw_nodep
rs_process_pinsrw_nodep:
# rs_process_pinsrw128(void* dst (%rdi), const void* src (%rsi), size_t size (%rdx), const u16* LH (%rcx));
# ~1 byte per cycle on SandyBridge
# # 32b padded LH
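# Rough scalar sketch of what this routine appears to compute, pieced together from
# the 0x0000/0x0400 table offsets, the *4 index scale, and the "32b padded" note
# above (an assumption, not taken from the author; the name rs_process_ref is made
# up): LH seems to hold 256 low-byte products followed by 256 high-byte products,
# each u16 padded out to 4 bytes, and each 16-bit src word is looked up by its low
# and high byte, the two results XORed together and then XORed into dst (presumably
# the per-word GF(2^16) multiply step of a Reed-Solomon kernel).
#
#   #include <stddef.h>
#   typedef unsigned short u16;
#
#   static void rs_process_ref(void *dst, const void *src, size_t size, const u16 *LH)
#   {
#       u16 *d = dst;
#       const u16 *s = src;
#       for (size_t i = 0; i < size / 2; i++) {   /* size is a multiple of 16 bytes */
#           u16 w = s[i];
#           /* each padded table entry spans 2 u16 slots: low-byte table at byte  */
#           /* offset 0x0000, high-byte table at byte offset 0x0400 (= LH[512])   */
#           d[i] ^= LH[2 * (w & 0xff)] ^ LH[2 * (0x100 + (w >> 8))];
#       }
#   }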
prefetchT0 (%rdi)
push %rbp
# push %rsi
# push %rdi
push %rbx
#r8-11 can be modified
# push %r12
# push %r13
# push %r14
# push %r15
mov %rdi, %rbp # dest buffer
mov %rcx, %rdi # combined multiplication table.
mov %rdx, %r11 # number of bytes to process (multiple of 16)
movq (%rsi), %rdx # load first 8 source bytes
# movq 8(%rsi), %rcx
sub $16, %r11 # last8 is a loop iter without loading more src
jle last8 # can only skip fixing up src/dest ptr if count is now exactly 0, not just under 16 on entry
# prefetchT0 64(%rdi)
# prefetchT0 64(%rsi)
# prefetch0 128(%rsi) # is it worth prefetching a lot, to trigger HW prefetch? nvm, on Core, HW and SW prefetch aren't linked
add %r11, %rsi # point to last set of 8-bytes of input
add %r11, %rbp # point to last set of 8-bytes of output
neg %r11 # convert byte size to count-up
# %rbp # destination (function arg comes in in rdi)
# %rsi # source (function arg)
# %rdi: lookup table (one less byte to encode than %rbp)
# eax: scratch (holds %dl, later %cl)
# ebx: scratch (holds %dh, later %ch)
# r11: -count, counts upward to 0.
# rdx, rcx: src words [0..3] and [4..7] of the current 16 bytes
# xmm5: previous value of dest (only in the commented-out movq/pxor variant)
.align 32
loop:
# do 16 bytes of data per iter, with two 8B loads of src data per 16B load/store of dest data
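# NB: in this no-dep experiment the pinsrw results scattered across xmm4-xmm15 are never
# merged back in; only the movd words in xmm0-xmm3 reach the final store, so the stored
# output is only meaningful for timing (see the header comment above)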
movzx %dl, %eax
movzx %dh, %ebx
shr $16, %rdx
movq 8(%rsi, %r11), %rcx # next 8 source bytes
movd 0x0000(%rdi, %rax, 4), %xmm0 # There is no movw load to a vector reg; the upper 16 bits are garbage. (and the load can cacheline-split)
movd 0x0400(%rdi, %rbx, 4), %xmm1 # use movd over pinsrw anyway, to break the dependency chain. (and one less uop)
# it seems to be a slowdown to rearrange things to alternate rdx and rcx blocks. Maybe having a movd block after a pinsrw block is good?
movzx %dl, %eax
movzx %dh, %ebx
shr $16, %rdx
pinsrw $1, 0x0000(%rdi, %rax, 4), %xmm8
movzx %cl, %eax
pinsrw $1, 0x0400(%rdi, %rbx, 4), %xmm9
movzx %ch, %ebx
movd 0x0000(%rdi, %rax, 4), %xmm2 # separate dep chain for the other 8 src bytes costs 2 uops (pxor + punpck)
shr $16, %rcx
movd 0x0400(%rdi, %rbx, 4), %xmm3 # but movd is cheaper than pinsrw. No net uop diff. (It is slightly faster)
movzx %dl, %eax
movzx %dh, %ebx
pinsrw $2, 0x0000(%rdi, %rax, 4), %xmm10
movzx %cl, %eax
shr $16, %rdx
pinsrw $2, 0x0400(%rdi, %rbx, 4), %xmm11
movzx %ch, %ebx
pinsrw $1, 0x0000(%rdi, %rax, 4), %xmm12
shr $16, %rcx
movzx %dl, %eax
pinsrw $1, 0x0400(%rdi, %rbx, 4), %xmm13
movzx %dh, %ebx
pinsrw $3, 0x0000(%rdi, %rax, 4), %xmm14
movzx %cl, %eax
pinsrw $3, 0x0400(%rdi, %rbx, 4), %xmm15
movzx %ch, %ebx
shr $16, %rcx
movq 16(%rsi, %r11), %rdx # read-ahead for next iter
pinsrw $2, 0x0000(%rdi, %rax, 4), %xmm4
pinsrw $2, 0x0400(%rdi, %rbx, 4), %xmm5
movzx %cl, %eax
movzx %ch, %ebx
pxor %xmm1, %xmm0
pinsrw $3, 0x0000(%rdi, %rax, 4), %xmm6
pinsrw $3, 0x0400(%rdi, %rbx, 4), %xmm7
pxor %xmm3, %xmm2
# movlhps %xmm2, %xmm0 # runs on p5 only
punpcklqdq %xmm2, %xmm0 # runs on p1 / p5, same as pinsrw (SnB)
# movq (%rbp, %r11), %xmm5
# pxor %xmm5, %xmm0
pxor (%rbp, %r11), %xmm0
movdqu %xmm0, (%rbp, %r11)
# movq %xmm0, (%rbp, %r11)
add $16, %r11
jnz loop
#
# handle final iteration separately (so that a read beyond the end of the input/output buffer is avoided)
#
last8:
# do 16 bytes of data per iter, with two 8B loads of src data per 16B load/store of dest data
# still using the longer-dep-chain PINSRW all the way, instead of 2 chains and punpck Q->DQ
# TODO: update last iter code to whatever proves fastest in the loop, with loads for next iter commented out
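# unlike the main loop above, this tail keeps the fully dependent pinsrw chains in xmm0/xmm1,
# so the 16 bytes it stores are the complete per-word result, not just the movd words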
movzx %dl, %eax
movzx %dh, %ebx
shr $16, %rdx
movd 0x0000(%rdi, %rax, 4), %xmm0 # There is no movw load to a vector reg; the upper 16 bits are garbage. (and the load can cacheline-split)
movd 0x0400(%rdi, %rbx, 4), %xmm1 # use movd over pinsrw anyway, to break the dependency chain. (and one less uop)
movq 8(%rsi, %r11), %rcx # last 8 source bytes: still in-bounds, and %rcx is used below (only the 16(%rsi,%r11) read-ahead would overrun)
movzx %dl, %eax
movzx %dh, %ebx
shr $16, %rdx
pinsrw $1, 0x0000(%rdi, %rax, 4), %xmm0
pinsrw $1, 0x0400(%rdi, %rbx, 4), %xmm1
movzx %cl, %eax
movzx %ch, %ebx
shr $16, %rcx
pinsrw $4, 0x0000(%rdi, %rax, 4), %xmm0
pinsrw $4, 0x0400(%rdi, %rbx, 4), %xmm1
movzx %dl, %eax
movzx %dh, %ebx
shr $16, %rdx
pinsrw $2, 0x0000(%rdi, %rax, 4), %xmm0
pinsrw $2, 0x0400(%rdi, %rbx, 4), %xmm1
movzx %cl, %eax
movzx %ch, %ebx
shr $16, %rcx
pinsrw $5, 0x0000(%rdi, %rax, 4), %xmm0
pinsrw $5, 0x0400(%rdi, %rbx, 4), %xmm1
movzx %dl, %eax
movzx %dh, %ebx
pinsrw $3, 0x0000(%rdi, %rax, 4), %xmm0
pinsrw $3, 0x0400(%rdi, %rbx, 4), %xmm1
# movq 16(%rsi, %r11), %rdx # read-ahead for next iter
movzx %cl, %eax
movzx %ch, %ebx
shr $16, %rcx
pinsrw $6, 0x0000(%rdi, %rax, 4), %xmm0
pinsrw $6, 0x0400(%rdi, %rbx, 4), %xmm1
movzx %cl, %eax
movzx %ch, %ebx
pinsrw $7, 0x0000(%rdi, %rax, 4), %xmm0
pinsrw $7, 0x0400(%rdi, %rbx, 4), %xmm1
pxor %xmm0, %xmm1
# movq 0(%rbp, %r11, 1), %xmm5
# pxor %xmm5, %xmm1
pxor (%rbp, %r11), %xmm1
movdqu %xmm1, 0(%rbp, %r11, 1)
# movq %xmm1, 0(%rbp, %r11, 1)
# pop %r15
# pop %r14
# pop %r13
# pop %r12
pop %rbx
# pop %rdi
# pop %rsi
pop %rbp
ret