# asm-pinsrw64.s
.text
.globl rs_process_pinsrw64
.align 16
rs_process_pinsrw64:
# rs_process_pinsrw64(void* dst (%rdi), const void* src (%rsi), size_t size (%rdx), const u16* LH (%rcx));
# LH is the combined lookup table: u16 entries padded to 32 bits each (hence the *4 scaled addressing below)
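#
# For reference, a plain-C sketch of what this routine computes, as read
# from the code below (names here are illustrative, not from this repo):
#
#   #include <stdint.h>
#   #include <stddef.h>
#
#   /* dst ^= per-word table product of src. LH is 512 u16 entries, each
#      padded to 4 bytes: entries [0..255] are indexed by the low byte of
#      a source word, entries [256..511] (byte offset 0x400) by the high
#      byte. x86 is little-endian, matching the dl/dh order used below. */
#   void rs_process_ref(void *dst, const void *src, size_t size,
#                       const uint16_t *LH)
#   {
#       uint16_t *d = (uint16_t *)dst;
#       const uint16_t *s = (const uint16_t *)src;
#       const uint32_t *t = (const uint32_t *)LH;  /* 4-byte entry stride */
#       for (size_t i = 0; i < size / 2; i++) {
#           uint16_t w = s[i];
#           d[i] ^= (uint16_t)t[w & 0xff]           /* low-byte table  */
#                 ^ (uint16_t)t[0x100 + (w >> 8)];  /* high-byte table */
#       }
#   }
#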
prefetcht0 (%rdi)
push %rbp
# push %rsi
# push %rdi
push %rbx
# r8-r11 are caller-saved and may be clobbered freely
# push %r12
# push %r13
# push %r14
# push %r15
# mov %rcx, %rbp # combined multiplication table
mov %rdx, %r11 # number of bytes to process (multiple of 8; each step handles 8 bytes)
movq (%rsi), %rdx # load 1st 8 source bytes
sub $8, %r11 # last8 is a loop iter without loading more src
jle last8
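# (a single 8-byte block means %r11 == 0 and %rdx already holds all the input)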
# prefetchT0 64(%rdi)
# prefetchT0 64(%rsi)
# prefetcht0 128(%rsi) # is it worth prefetching a lot, to trigger HW prefetch? nvm, on Core, HW and SW prefetch aren't linked
add %r11, %rsi # point to last set of 8-bytes of input
add %r11, %rdi # point to last set of 8-bytes of output
neg %r11 # convert byte size to count-up
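# %rsi/%rdi now point 8 bytes before the end of their buffers; indexing
# with the negative %r11 walks forward, and a single add/jnz closes the loop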
# Register map:
# %rdi: destination (function arg)
# %rsi: source (function arg)
# %rcx: lookup table
# %eax: scratch (holds %dl)
# %ebx: scratch (holds %dh)
# %r11: -count, counts upward to 0
# %rdx: current 8 source bytes
# %xmm5: previous value of dest
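# Each iteration peels four (low, high) byte pairs off %rdx, gathering four
# u16 products into %xmm0 (low-byte table) and %xmm1 (high-byte table),
# then stores dest ^ %xmm0 ^ %xmm1 back.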
.align 32
loop:
movzx %dl, %eax
movzx %dh, %ebx
shr $16, %rdx
movd 0x0000(%rcx, %rax, 4), %xmm0 # There is no movw to vector reg. upper 16 has garbage. (and can cacheline-split)
movd 0x0400(%rcx, %rbx, 4), %xmm1 # use movd over pinsrw anyway, to break the dependency chain. (and one less uop)
movzx %dl, %eax
movzx %dh, %ebx
shr $16, %rdx
pinsrw $1, 0x0000(%rcx, %rax, 4), %xmm0
pinsrw $1, 0x0400(%rcx, %rbx, 4), %xmm1
movzx %dl, %eax
movzx %dh, %ebx
shr $16, %rdx
movq 0(%rdi, %r11, 1), %xmm5 # can't pxor from mem, because we only want 8B
pinsrw $2, 0x0000(%rcx, %rax, 4), %xmm0
pinsrw $2, 0x0400(%rcx, %rbx, 4), %xmm1
# movzx 0x0000(%rcx, %rax, 4), %r8
# movzx 0x0400(%rcx, %rbx, 4), %r9
movzx %dl, %eax
movzx %dh, %ebx
movq 8(%rsi, %r11, 1), %rdx # read-ahead next 8 source bytes
pinsrw $3, 0x0000(%rcx, %rax, 4), %xmm0
pxor %xmm5, %xmm0
pinsrw $3, 0x0400(%rcx, %rbx, 4), %xmm1
pxor %xmm1, %xmm0
movq %xmm0, 0(%rdi, %r11, 1)
add $8, %r11
jnz loop
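# falls through with %r11 == 0; the read-ahead above left the final 8 source bytes in %rdx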
#
# handle final iteration separately (so that a read beyond the end of the input/output buffer is avoided)
#
last8:
movzx %dl, %eax
movzx %dh, %ebx
movd 0x0000(%rcx, %rax, 4), %xmm0 # There is no movw to vector reg. upper 16 has garbage. (and can cacheline-split)
movd 0x0400(%rcx, %rbx, 4), %xmm1 # use movd over pinsrw anyway, to break the dependency chain. (and one less uop)
shr $16, %rdx
movzx %dl, %eax
movzx %dh, %ebx
pinsrw $1, 0x0000(%rcx, %rax, 4), %xmm0
pinsrw $1, 0x0400(%rcx, %rbx, 4), %xmm1
shr $16, %rdx
movzx %dl, %eax
movzx %dh, %ebx
pinsrw $2, 0x0000(%rcx, %rax, 4), %xmm0
pinsrw $2, 0x0400(%rcx, %rbx, 4), %xmm1
# movzx 0x0000(%rcx, %rax, 4), %r8
# movzx 0x0400(%rcx, %rbx, 4), %r9
shr $16, %rdx
movzx %dl, %eax
movzx %dh, %ebx
# movq 8(%rsi, %r11, 1), %rdx # read-ahead next 8 source bytes
pinsrw $3, 0x0000(%rcx, %rax, 4), %xmm0
pinsrw $3, 0x0400(%rcx, %rbx, 4), %xmm1
pxor %xmm0, %xmm1
movq 0(%rdi, %r11, 1), %xmm5
pxor %xmm5, %xmm1
movq %xmm1, 0(%rdi, %r11, 1)
# pop %r15
# pop %r14
# pop %r13
# pop %r12
pop %rbx
# pop %rdi
# pop %rsi
pop %rbp
ret
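#
# Hypothetical usage sketch from C (gf16_mul and coeff are placeholders for
# whatever GF(2^16) multiply and coefficient the caller uses; they are not
# part of this file):
#
#   extern void rs_process_pinsrw64(void *dst, const void *src,
#                                   size_t size, const uint16_t *LH);
#
#   uint32_t LH[512];                    /* u16 entries padded to 32 bits */
#   for (int b = 0; b < 256; b++) {
#       LH[b]         = gf16_mul(coeff, (uint16_t)b);        /* low byte  */
#       LH[0x100 + b] = gf16_mul(coeff, (uint16_t)(b << 8)); /* high byte */
#   }
#   rs_process_pinsrw64(dst, src, size, (const uint16_t *)LH);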