memrchr.s
# 2020, Georg Sauthoff <[email protected]>, LGPLv3+
.text
# .balign 4
# void *memrchr(const void *src, int c, size_t n);
#
# a0 = src, a1 = c, a2 = n
#
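# e.g. memrchr("abcabc", 'b', 6) returns a pointer to src + 4, i.e. the last 'b'
#
# Strategy: walk the buffer backwards in vl-sized chunks and test each chunk
# for c. Since the vector extension only provides vfirst.m (find the lowest
# set mask bit) and no find-last operation, a chunk that contains a match is
# repeatedly halved (keeping the highest half that still contains a match)
# until it is at most 256 bytes long; then vid.v plus an unsigned max
# reduction yield the offset of the last match.
#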
.global memrchr
memrchr:
        add a0, a0, a2         # set a0 to one past the last char
.Loop:
        vsetvli a3, a2, e8, m8 # switch to 8 bit element size and LMUL=8,
                               # i.e. 4 groups of 8 registers (v0, v8, v16, v24)
        sub a0, a0, a3         # decrement end pointer by the chunk size
        vlb.v v8, (a0)         # load a3 bytes
        vmseq.vx v0, v8, a1    # set mask bit where a byte equals the scalar c
        # since there is no vlast.m, we have to use vfirst.m and post-process
        vfirst.m a4, v0        # find lowest index of a set mask bit
        bgez a4, .Lmatch       # branch if greater-than-or-equal-to-zero,
                               # i.e. the chunk contains a match
        sub a2, a2, a3         # decrement n
        bnez a2, .Loop         # branch if not-equal-to-zero, i.e. continue loop
        li a0, 0               # load-immediate NULL as return value
        ret
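# at this point v8 holds a chunk of a3 bytes (starting at a0) that contains
# at least one match and v0 holds its match mask; narrow it down to the
# last match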
.Lmatch:
        li a7, 256             # maximum vl that doesn't overflow 8 bit indices
.Loop2:
        ble a3, a7, .Ldone     # branch if less-than-or-equal-to 256,
                               # i.e. the branch is the happy case
        # otherwise, divide & conquer the vector
        srli a5, a3, 1         # divide vl by 2
        sub a6, a3, a5         # in case vl was odd, i.e.
                               # high_part_size = a6, low_part_size = a5
        # move the high part into v16
        vslidedown.vx v16, v8, a5
        vsetvli t0, a6, e8, m8 # ignore trailing bytes in the high part
        vmseq.vx v24, v16, a1  # check for matches in the high part
        vfirst.m a4, v24       # store index of first match
        vsetvli a3, a5, e8, m8 # update config for the low part, in case we branch
        bltz a4, .Loop2        # branch if less-than-zero, i.e. no match in the
                               # high part, thus continue with the low part
        vsetvli a3, a6, e8, m8 # restore config for the high part
        vmv.v.v v8, v16        # move the high part into v8, making it the new chunk
        vmcpy.m v0, v24        # copy the high part's match mask into v0
        add a0, a0, a5         # increment src by the size of the low part
        j .Loop2               # unconditionally branch to the loop head
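# the remaining chunk is at most 256 bytes long, hence its element indices
# fit into unsigned 8 bit values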
.Ldone:
        vid.v v16, v0.t        # write the element index into each masked element
        # v24[0] = max_unsigned(v16[*], v16[0])
        vredmaxu.vs v24, v16, v16, v0.t
        vmv.x.s a5, v24        # move the first vector element into a scalar register
        andi a5, a5, 0xff      # remove sign bits in case the element
                               # got sign-extended by vmv.x.s
        add a0, a0, a5         # add the offset of the last match to the chunk start
        ret