-
Notifications
You must be signed in to change notification settings - Fork 7
/
byteshift_strstr.c
160 lines (136 loc) · 5.45 KB
/
byteshift_strstr.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include "byteshift_strstr.h"
#include "make_big_endian.h"
/**
* Finds the first occurrence of the sub-string needle in the string haystack.
* Returns NULL if needle was not found.
*/
char *byteshift_strstr(const char *haystack, const char *needle)
{
if (!*needle) // Empty needle.
return (char *) haystack;
const char needle_first = *needle;
// Runs strchr() on the first section of the haystack as it has a lower
// algorithmic complexity for discarding the first non-matching characters.
haystack = strchr(haystack, needle_first);
if (!haystack) // First character of needle is not in the haystack.
return NULL;
// First characters of haystack and needle are the same now. Both are
// guaranteed to be at least one character long.
// Now computes the sum of the first needle_len characters of haystack
// minus the sum of characters values of needle.
const unsigned char *i_haystack = (const unsigned char *)haystack + 1;
const unsigned char *i_needle = (const unsigned char *)needle + 1;
bool identical = true;
while (*i_haystack && *i_needle) {
identical &= *i_haystack++ == *i_needle++;
}
// i_haystack now references the (needle_len + 1)-th character.
if (*i_needle) // haystack is smaller than needle.
return NULL;
else if (identical)
return (char *) haystack;
size_t needle_len = i_needle - (const unsigned char*)needle;
// Note: needle_len > 1, because we checked that it isn't zero, and if it
// is 1 then identical must be true because the first strchr() ensured
// that the first characters are identical
const char *sub_start = haystack;
size_t compare_len;
unsigned long last_needle_chars;
unsigned long last_haystack_chars;
unsigned long mask;
size_t needle_cmp_len = (needle_len < LONG_INT_N_BYTES) ? needle_len : LONG_INT_N_BYTES;
#ifdef MAKE_ULONG_BIGENDIAN
last_needle_chars = MAKE_ULONG_BIGENDIAN(*((unsigned long *)(i_needle - needle_cmp_len))) >> (8 * (LONG_INT_N_BYTES - needle_cmp_len));
last_haystack_chars = MAKE_ULONG_BIGENDIAN(*((unsigned long *)(i_haystack - needle_cmp_len))) >> (8 * (LONG_INT_N_BYTES - needle_cmp_len));
#else
const unsigned char *needle_cmp_end = i_needle;
i_needle -= needle_cmp_len;
i_haystack -= needle_cmp_len;
last_needle_chars = 0;
last_haystack_chars = 0;
while (i_needle != needle_cmp_end) {
last_needle_chars <<= 8;
last_needle_chars ^= *i_needle++;
last_haystack_chars <<= 8;
last_haystack_chars ^= *i_haystack++;
}
#endif
// At this point:
// * needle is at least two characters long
// * haystack is at least needle_len characters long (also at least two)
// * the first characters of needle and haystack are identical
if (needle_len > LONG_INT_N_BYTES + 1)
{
/* we will call memcmp() only once we know that the LONG_INT_N_BYTES
last chars are equal, so it will be enough to compare all but the
last LONG_INT_N_BYTES characters */
compare_len = needle_len - LONG_INT_N_BYTES;
/* iterate through the remainder of haystack while checking for identity
of the last LONG_INT_N_BYTES, and checking the rest with memcmp() */
while (*i_haystack)
{
last_haystack_chars <<= 8;
last_haystack_chars ^= *i_haystack++;
sub_start++;
if ( last_haystack_chars == last_needle_chars
&& memcmp(sub_start, needle, compare_len) == 0)
{
return (char *) sub_start;
}
}
}
else if (needle_len == LONG_INT_N_BYTES + 1)
{
/* iterate through the remainder of haystack while checking for identity
of the last LONG_INT_N_BYTES as well as the single additional
character, which is the first one */
while (*i_haystack)
{
last_haystack_chars <<= 8;
last_haystack_chars ^= *i_haystack++;
sub_start++;
if ( last_haystack_chars == last_needle_chars
&& *sub_start == needle_first)
{
return (char *) sub_start;
}
}
}
else if (needle_len == LONG_INT_N_BYTES)
{
/* iterate through the remainder of haystack while checking for identity
of the last LONG_INT_N_BYTES characters, which should exactly match
the entire needle */
while (*i_haystack)
{
last_haystack_chars <<= 8;
last_haystack_chars ^= *i_haystack++;
if (last_haystack_chars == last_needle_chars)
{
return (char *) (i_haystack - needle_len);
}
}
}
else /* needle_len < LONG_INT_N_BYTES */
{
mask = (((unsigned long) 1) << (needle_len * 8)) - 1;
last_needle_chars &= mask;
/* iterate through the remainder of haystack, updating the sums' difference
and checking for identity whenever the difference is zero */
while (*i_haystack)
{
last_haystack_chars <<= 8;
last_haystack_chars ^= *i_haystack++;
last_haystack_chars &= mask;
if (last_haystack_chars == last_needle_chars)
{
return (char *) (i_haystack - needle_len);
}
}
}
return NULL;
}