-
Notifications
You must be signed in to change notification settings - Fork 10
/
fixed-strings.sh
executable file
·399 lines (310 loc) · 8.72 KB
/
fixed-strings.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
#!/bin/bash
#
# Usage:
# ./fixed-strings.sh <function name>
#
# Demo:
# ./fixed-strings.sh fetch-data
# ./fixed-strings.sh make-ten # ten copies of the data
# ./fixed-strings.sh all-benchmarks
#
# Also useful:
# viz, viz-all
source common.sh
source words.sh # for constructing patterns
set -o nounset
set -o pipefail
set -o errexit
readonly ONE=_tmp/all-1.txt
readonly TWO=_tmp/all-2.txt
readonly TEN=_tmp/all-10.txt
# ESSENTIAL BUG FIX FOR GREP
export LC_ALL=C
banner() {
echo
echo ----- "$@"
echo
}
# Disabled - use the one in $PATH
# re2c() {
# ~/git/oilshell/oil/_deps/re2c-1.0.3/re2c "$@"
# }
publish-data() {
local name=$1
scp $ONE $name@$name.org:oilshell.org/share
}
fetch-data() {
mkdir -p _tmp
wget --directory _tmp https://www.oilshell.org/share/all-1.txt
}
# Compare re2c with fgrep.
# Is fgrep using Aho-Corsick?
# I suspect that fgrep does GNU grep-like tricks to be fast. Maybe we have to
# mmap() the whole file and then use re2c? assume it has a NUL terminator.
fgrep-demo() {
time fgrep $'main\nint' demo/* | wc -l
}
readonly MANIFEST=~/git/oilshell/oil/_tmp/wild/MANIFEST.txt
readonly ABS_PATHS=_tmp/wild-abs-paths.txt
make-manifest() {
wc -l $MANIFEST
mkdir -p _tmp
awk '{ print "/home/andy/git/oilshell/oil/" $2 }' $MANIFEST > $ABS_PATHS
}
make-big() {
# 34 MB of shell scripts!
time xargs cat < $ABS_PATHS > $ONE
for i in $(seq 2); do
echo _tmp/all-1.txt
done | xargs cat > $TWO
make-ten
wc -l $ONE $TWO $TEN
ls -l -h $ONE $TWO $TEN
}
make-ten() {
# 338 MB now!
for i in $(seq 10); do
echo _tmp/all-1.txt
done | xargs cat > $TEN
ls -l -h $TEN
}
describe-problem() {
echo "Matching this file:"
ls -l -h $TEN
echo
echo "Against ${#KEYWORDS[@]} keywords:"
echo " ${KEYWORDS[@]}"
echo
}
# 9.3 seconds for "for line in f", but it has a loop.
# 5.7 seconds for findall on the whole thing.
python-re-benchmark() {
banner 'Python re.findall()'
time python -c '
from __future__ import print_function
import re, sys, time
path = sys.argv[1]
pat = re.compile("|".join(sys.argv[2:]))
with open(path) as f:
contents = f.read()
start_time = time.time()
pat.findall(contents)
elapsed = time.time() - start_time
print("findall() took %.f seconds" % elapsed, file=sys.stderr)
#for line in f:
# m = pat.findall(line)
' $TEN "${KEYWORDS[@]}"
}
io-benchmark() {
banner 'CAT'
time cat $TEN > /dev/null
banner 'WC'
time wc -l $TEN
}
# Run with COUNT_RESULTS=1 ./fixed-strings.sh
COUNT_RESULTS=${COUNT_RESULTS:-}
fgrep-pat() { ./make_pat.py fgrep; }
grep-pat() { ./make_pat.py grep; }
ripgrep-pat() { ./make_pat.py ripgrep; }
re2c-pat() { ./make_pat.py re2c; }
re2-pat() { ./make_pat.py re2; }
test-pat() {
local k=_tmp/keywords.txt
argv "$(fgrep-pat < $k)"
argv "$(grep-pat < $k)"
argv "$(ripgrep-pat < $k)"
argv "$(re2c-pat < $k)"
}
grep-fixed-benchmark() {
local words=_tmp/keywords.txt
# fgrep is slowest!
# what if I increase the number of strings?
if test -n "$COUNT_RESULTS"; then
echo
echo 'FGREP number of results'
fgrep "$(fgrep-pat < $words)" $TEN | wc -l
fi
banner 'FGREP'
time fgrep "$(fgrep-pat < $words)" $TEN >/dev/null
# Wow fgrep is significantly slower than grep!
if test -n "$COUNT_RESULTS"; then
echo
echo 'GREP number of results'
grep "$(grep-pat < $words)" $TEN | wc -l
fi
banner 'GREP'
time grep "$(grep-pat < $words)" $TEN >/dev/null
if test -f $RG; then
if test -n "$COUNT_RESULTS"; then
echo
echo 'RIPGREP number of results'
$RG "$(ripgrep-pat < $words)" $TEN | wc -l
fi
banner 'RIPGREP'
time $RG "$(ripgrep-pat < $words)" $TEN >/dev/null
fi
}
# NOTE: egrep is faster with 14,300 strings than on 13 strings? Probably
# because it is able to search LESS of the line. A match is more likely to
# appear earlier in the line, even though there are fewer matches overall?
many-words-grep-benchmark() {
local file=$1
local pat="$(./make_pat.py ripgrep < $file)"
rm -f _tmp/many-*
# Always enable
COUNT_RESULTS=1
if test -f $RG; then
if test -n "$COUNT_RESULTS"; then
echo
echo 'RIPGREP number of results'
$RG "$pat" $TEN > _tmp/many-ripgrep.txt
fi
banner 'RIPGREP'
time $RG "$pat" $TEN >/dev/null
fi
wc -l _tmp/many-*
md5sum _tmp/many-*
return
banner 'NOTE: egrep blows up on large input! May want to Ctrl-C.'
if test -n "$COUNT_RESULTS"; then
echo
echo 'EGREP number of results'
egrep "$pat" $TEN > _tmp/many-egrep.txt
fi
banner 'EGREP'
time egrep "$pat" $TEN >/dev/null
wc -l _tmp/many-*
md5sum _tmp/many-*
}
# Does this make a difference? I'm not seeing it. Could be related to
# locale.
egrep-syntax-comparison() {
if true; then
banner 'EGREP with | syntax'
time egrep "$(egrep-pat)" $TEN >/dev/null
banner 'EGREP with -e'
time egrep $(egrep-dash-e-argv) $TEN >/dev/null
fi
}
# I'm getting opposite results? mmap() is a win?
# https://lemire.me/blog/2012/06/26/which-is-fastest-read-fread-ifstream-or-mmap/
# TODO: Always update the words here?
re2c-fixed-benchmark() {
local gen=_gen/fixed-strings.cc
local bin=_tmp/fixed-strings
mkdir -p _gen
banner 'Compiling with re2c'
time re2c -o $gen fixed-strings.re2c.cc
banner 'Done'
return
#g++ $DEBUG_FLAGS -o _tmp/fread _gen/fread.cc
banner 'Compiling with g++ (GCC)'
time g++ $OPT_FLAGS -o $bin $gen
# 800 ms to read line-by-line. disabled because it's slow.
if false; then
banner 'fgets'
time $bin fgets $TEN >/dev/null
fi
# Without re2c: count the lines at memory bandwidth. 187 ms.
banner 'read:count-lines'
time $bin read:count-lines $TEN >/dev/null
# 1214 ms.
banner 'read:re2c-match'
time $bin read:re2c-match $TEN >/dev/null
if false; then
# Hm the mmap() is the thing that is slow here? Not re2c?
banner 'mmap'
time $bin mmap $TEN #>/dev/null
fi
}
re2-fixed-benchmark() {
local words=${1:-_tmp/keywords.txt}
banner "RE2 on $words"
./build.sh re2-grep
time _tmp/re2_grep "$(re2-pat < $words)" $TEN >/dev/null
}
all-benchmarks() {
io-benchmark
grep-fixed-benchmark
re2c-fixed-benchmark
re2-fixed-benchmark
python-re-benchmark
}
viz() {
local name=${1:-fixed-strings}
local dot=_gen/$name.dot
re2c --emit-dot -o $dot $name.re2c.cc
re2c -o _gen/$name.cc $name.re2c.cc
dot -T png -o _gen/$name.png $dot
}
viz-all() {
viz trie
viz do-done
viz else-elif
viz synthetic-rsc
ls -l _gen
}
edited() {
for name in do-done-edited else-elif-edited; do
local dot=$name.dot
dot -T png -o _gen/$name.png $dot
done
}
# GrepFast() is 682 bytes only.
code-size() {
~/git/other/bloaty/bloaty -d symbols _tmp/fixed-strings | tee _gen/code-size.txt
}
readonly RG=~/install/ripgrep-0.10.0-x86_64-unknown-linux-musl/rg
download-ripgrep() {
wget --directory ~/install \
https://github.com/BurntSushi/ripgrep/releases/download/0.10.0/ripgrep-0.10.0-x86_64-unknown-linux-musl.tar.gz
}
# grep is faster than both fgrep and the "optimal" DFA in native code
# (generated by re2c). I think grep is benefitting from SKIPPING bytes.
# All times are 'user' time, which is most of the 'real' time.
# re2c compile | re2c code size | re2c match time | ripgrep time | RE2
# n= 100 7 ms 11 KiB 1,566 ms 687 ms 1,398 ms
# n=1000 66 ms 57 KiB 2,311 ms 1,803 ms 1,874 ms
# n=2000 120 ms 93 KiB 2,499 ms 3,591 ms 2,681 ms
# n=3000 204 ms 125 KiB 2,574 ms 5,801 ms 3,471 ms
# n=4000 266 ms 159 KiB 2,563 ms 8,083 ms 4,323 ms
# n=5000 363 ms 186 KiB 2,638 ms 10,431 ms 5,294 ms
# n=6000 366 ms 213 KiB 2,659 ms 13,182 ms 6,397 ms
# n=47,000 2,814 ms
#
# NOTES:
# - egrep blows up around 400 strings!
# - RE2 says "DFA out of memory" at 2000 strings, because it exhausts its 8 MB
# budget. We simply bump it up.
# - at 48,000 words, re2c segfaults!
# - At 10,000 words, GCC takes 36 seconds to compile re2c's output! It's 74K
# lines in 1.2 MB of source.
compare-many-words() {
local n=${1:-1000}
local words=_tmp/sampled-$n.txt
# ripgrep
many-words-grep-benchmark $words
# NOTE: blows up
re2-fixed-benchmark $words
update-re2c-keywords $words
re2c-fixed-benchmark $words
code-size
}
re2-many() {
for n in 1000 2000 3000 4000 5000 6000; do
local words=_tmp/sampled-$n.txt
re2-fixed-benchmark $words
done
}
re2c-huge() {
local n=${1:-9000}
write-sample $n
local words=_tmp/sampled-$n.txt
# Can't do this because the argument list is too long for sed! Doh!
update-re2c-keywords $words
re2c-fixed-benchmark $words
}
max-re2c-state() {
egrep -o 'yy[0-9]+' _gen/fixed-strings.cc | sort
}
"$@"