From db702ff73d08ee8120863e59a7c698e001de816f Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 16 Feb 2023 13:52:31 -0500 Subject: [PATCH 01/51] Add src/adt/idmap.c, a state -> ID set map. --- Makefile | 1 + include/adt/idmap.h | 58 ++++++ src/adt/Makefile | 1 + src/adt/idmap.c | 392 ++++++++++++++++++++++++++++++++++++++ tests/idmap/Makefile | 19 ++ tests/idmap/idmap_basic.c | 136 +++++++++++++ 6 files changed, 607 insertions(+) create mode 100644 include/adt/idmap.h create mode 100644 src/adt/idmap.c create mode 100644 tests/idmap/Makefile create mode 100644 tests/idmap/idmap_basic.c diff --git a/Makefile b/Makefile index a94baac61..73a675013 100644 --- a/Makefile +++ b/Makefile @@ -92,6 +92,7 @@ SUBDIR += src SUBDIR += tests/capture SUBDIR += tests/complement SUBDIR += tests/gen +SUBDIR += tests/idmap SUBDIR += tests/intersect #SUBDIR += tests/ir # XXX: fragile due to state numbering SUBDIR += tests/eclosure diff --git a/include/adt/idmap.h b/include/adt/idmap.h new file mode 100644 index 000000000..064fd15d1 --- /dev/null +++ b/include/adt/idmap.h @@ -0,0 +1,58 @@ +#ifndef IDMAP_H +#define IDMAP_H + +/* Mapping between one fsm_state_t and a set of + * unsigned IDs. The implementation assumes that both + * IDs are sequentially assigned and don't need a sparse + * mapping -- it will handle 10 -> [1, 3, 47] well, but + * not 1000000 -> [14, 524288, 1073741823]. */ + +#include + +#include "fsm/fsm.h" +#include "fsm/alloc.h" + +struct idmap; /* Opaque handle. */ + +struct idmap * +idmap_new(const struct fsm_alloc *alloc); + +void +idmap_free(struct idmap *m); + +/* Associate a value with a state (if not already present.) + * Returns 1 on success, or 0 on allocation failure. */ +int +idmap_set(struct idmap *m, fsm_state_t state_id, unsigned value); + +/* How many values are associated with an ID? */ +size_t +idmap_get_value_count(const struct idmap *m, fsm_state_t state_id); + +/* Get the values associated with an ID. + * + * Returns 1 on success and writes them into the buffer, in ascending + * order, with the count in *written (if non-NULL). + * + * Returns 0 on error (insufficient buffer space). */ +int +idmap_get(const struct idmap *m, fsm_state_t state_id, + size_t buf_size, unsigned *buf, size_t *written); + +/* Iterator callback. */ +typedef void +idmap_iter_fun(fsm_state_t state_id, unsigned value, void *opaque); + +/* Iterate over the ID map. State IDs may be yielded out of order, + * values will be in ascending order. */ +void +idmap_iter(const struct idmap *m, + idmap_iter_fun *cb, void *opaque); + +/* Iterate over the values associated with a single state + * (in ascending order). */ +void +idmap_iter_for_state(const struct idmap *m, fsm_state_t state_id, + idmap_iter_fun *cb, void *opaque); + +#endif diff --git a/src/adt/Makefile b/src/adt/Makefile index 05199f2dc..64ad7429f 100644 --- a/src/adt/Makefile +++ b/src/adt/Makefile @@ -2,6 +2,7 @@ SRC += src/adt/alloc.c SRC += src/adt/bitmap.c +SRC += src/adt/idmap.c SRC += src/adt/internedstateset.c SRC += src/adt/priq.c SRC += src/adt/path.c diff --git a/src/adt/idmap.c b/src/adt/idmap.c new file mode 100644 index 000000000..ca169b71e --- /dev/null +++ b/src/adt/idmap.c @@ -0,0 +1,392 @@ +/* + * Copyright 2021 Scott Vokes + * + * See LICENCE for the full copyright terms. 
+ */ + +#include "adt/idmap.h" + +#include "adt/alloc.h" +#include "adt/hash.h" +#include "adt/u64bitset.h" + +#include +#include +#include + +#define NO_STATE ((fsm_state_t)-1) + +#define DEF_BUCKET_COUNT 4 + +struct idmap { + const struct fsm_alloc *alloc; + unsigned bucket_count; + unsigned buckets_used; + + /* All buckets' values are assumed to be large + * enough to store this value, and they will all + * grow as necessary. */ + unsigned max_value; + + /* Basic linear-probing, add-only hash table. */ + struct idmap_bucket { + fsm_state_t state; /* Key. NO_STATE when empty. */ + + /* values[] is always either NULL or has at least + * max_value + 1 bits; all grow on demand. */ + uint64_t *values; + } *buckets; +}; + +static unsigned +value_words(unsigned max_value) { + if (max_value == 0) { + /* Still allocate one word, for storing 0. */ + return 1; + } else { + return u64bitset_words(max_value); + } +} + +struct idmap * +idmap_new(const struct fsm_alloc *alloc) +{ + struct idmap *res = NULL; + struct idmap_bucket *buckets = NULL; + + res = f_malloc(alloc, sizeof(*res)); + if (res == NULL) { + goto cleanup; + } + + buckets = f_calloc(alloc, + DEF_BUCKET_COUNT, sizeof(buckets[0])); + if (buckets == NULL) { + goto cleanup; + } + + for (size_t i = 0; i < DEF_BUCKET_COUNT; i++) { + buckets[i].state = NO_STATE; + } + + res->alloc = alloc; + res->buckets_used = 0; + res->bucket_count = DEF_BUCKET_COUNT; + res->max_value = 0; + res->buckets = buckets; + + return res; + +cleanup: + f_free(alloc, res); + f_free(alloc, buckets); + return NULL; +} + +void +idmap_free(struct idmap *m) +{ + if (m == NULL) { + return; + } + + for (size_t i = 0; i < m->bucket_count; i++) { + if (m->buckets[i].state == NO_STATE) { + continue; + } + f_free(m->alloc, m->buckets[i].values); + } + + f_free(m->alloc, m->buckets); + f_free(m->alloc, m); +} + +static int +grow_bucket_values(struct idmap *m, unsigned old_words, unsigned new_words) +{ + assert(new_words > old_words); + + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + struct idmap_bucket *b = &m->buckets[b_i]; + if (b->state == NO_STATE) { + assert(b->values == NULL); + continue; + } + + uint64_t *nv = f_calloc(m->alloc, + new_words, sizeof(nv[0])); + if (nv == NULL) { + return 0; + } + + for (size_t w_i = 0; w_i < old_words; w_i++) { + nv[w_i] = b->values[w_i]; + } + f_free(m->alloc, b->values); + b->values = nv; + } + return 1; +} + +static int +grow_buckets(struct idmap *m) +{ + const size_t ocount = m->bucket_count; + const size_t ncount = 2*ocount; + assert(ncount > m->bucket_count); + + struct idmap_bucket *nbuckets = f_calloc(m->alloc, + ncount, sizeof(nbuckets[0])); + if (nbuckets == NULL) { + return 0; + } + for (size_t nb_i = 0; nb_i < ncount; nb_i++) { + nbuckets[nb_i].state = NO_STATE; + } + + const size_t nmask = ncount - 1; + + for (size_t ob_i = 0; ob_i < ocount; ob_i++) { + const struct idmap_bucket *ob = &m->buckets[ob_i]; + if (ob->state == NO_STATE) { + continue; + } + + const uint64_t h = hash_id(ob->state); + for (size_t nb_i = 0; nb_i < ncount; nb_i++) { + struct idmap_bucket *nb = &nbuckets[(h + nb_i) & nmask]; + if (nb->state == NO_STATE) { + nb->state = ob->state; + nb->values = ob->values; + break; + } else { + assert(nb->state != ob->state); + /* collision */ + continue; + } + } + } + + f_free(m->alloc, m->buckets); + + m->buckets = nbuckets; + m->bucket_count = ncount; + + return 1; +} + +int +idmap_set(struct idmap *m, fsm_state_t state_id, + unsigned value) +{ + assert(state_id != NO_STATE); + + const uint64_t h = 
hash_id(state_id); + if (value > m->max_value) { + const unsigned ovw = value_words(m->max_value); + const unsigned nvw = value_words(value); + /* If this value won't fit in the existing value + * arrays, then grow them all. We do not track the + * number of bits in each individual array. */ + if (nvw > ovw && !grow_bucket_values(m, ovw, nvw)) { + return 0; + } + m->max_value = value; + } + + assert(m->max_value >= value); + + if (m->buckets_used >= m->bucket_count/2) { + if (!grow_buckets(m)) { + return 0; + } + } + + const uint64_t mask = m->bucket_count - 1; + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; + if (b->state == state_id) { + assert(b->values != NULL); + u64bitset_set(b->values, value); + return 1; + } else if (b->state == NO_STATE) { + b->state = state_id; + assert(b->values == NULL); + + const unsigned vw = value_words(m->max_value); + b->values = f_calloc(m->alloc, + vw, sizeof(b->values[0])); + if (b->values == NULL) { + return 0; + } + m->buckets_used++; + + u64bitset_set(b->values, value); + return 1; + } else { + continue; /* collision */ + } + + } + + assert(!"unreachable"); + return 0; +} + +static const struct idmap_bucket * +get_bucket(const struct idmap *m, fsm_state_t state_id) +{ + const uint64_t h = hash_id(state_id); + const uint64_t mask = m->bucket_count - 1; + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + const struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; + if (b->state == NO_STATE) { + return NULL; + } else if (b->state == state_id) { + return b; + } + } + + return NULL; +} + +size_t +idmap_get_value_count(const struct idmap *m, fsm_state_t state_id) +{ + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + return 0; + } + assert(b->values != NULL); + + size_t res = 0; + const size_t words = value_words(m->max_value); + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + /* This could use popcount64(w). 
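+		   For example, "res += popcount64(w);" could replace the per-bit loop that follows, assuming a 64-bit population count helper is available.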
*/ + if (w == 0) { + continue; + } + for (uint64_t bit = 1; bit; bit <<= 1) { + if (w & bit) { + res++; + } + } + } + + return res; +} + +int +idmap_get(const struct idmap *m, fsm_state_t state_id, + size_t buf_size, unsigned *buf, size_t *written) +{ + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + if (written != NULL) { + *written = 0; + } + return 1; + } + + size_t buf_offset = 0; + const size_t words = value_words(m->max_value); + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + + for (uint64_t b_i = 0; b_i < 64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + if (buf_offset * sizeof(buf[0]) >= buf_size) { + return 0; + } + buf[buf_offset] = 64*w_i + b_i; + buf_offset++; + } + } + } + + if (written != NULL) { + *written = buf_offset; + } + return 1; +} + +void +idmap_iter(const struct idmap *m, + idmap_iter_fun *cb, void *opaque) +{ + const size_t words = value_words(m->max_value); + + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + const struct idmap_bucket *b = &m->buckets[b_i]; + if (b->state == NO_STATE) { + continue; + } + + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + for (uint64_t b_i = 0; b_i < 64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + const unsigned v = 64*w_i + b_i; + cb(b->state, v, opaque); + } + } + } + } +} + +void +idmap_iter_for_state(const struct idmap *m, fsm_state_t state_id, + idmap_iter_fun *cb, void *opaque) +{ + const size_t words = value_words(m->max_value); + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + return; + } + + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + /* if N contiguous bits are all zero, skip them all at once */ +#define BLOCK_BITS 16 + uint64_t block = ((uint64_t)1 << BLOCK_BITS) - 1; + size_t block_count = 0; + + uint64_t b_i = 0; + while (b_i < 64) { + if ((w & block) == 0) { + block <<= BLOCK_BITS; + b_i += BLOCK_BITS; + continue; + } + + if (w & ((uint64_t)1 << b_i)) { + const unsigned v = 64*w_i + b_i; + cb(b->state, v, opaque); + block_count++; + } + b_i++; + block <<= 1; + } + +#define CHECK 0 +#if CHECK + size_t check_count = 0; + for (uint64_t b_i = 0; b_i < 64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + check_count++; + } + } + assert(block_count == check_count); +#endif + } +} diff --git a/tests/idmap/Makefile b/tests/idmap/Makefile new file mode 100644 index 000000000..aee01f565 --- /dev/null +++ b/tests/idmap/Makefile @@ -0,0 +1,19 @@ +.include "../../share/mk/top.mk" + +TEST.tests/idmap != ls -1 tests/idmap/idmap*.c +TEST_SRCDIR.tests/idmap = tests/idmap +TEST_OUTDIR.tests/idmap = ${BUILD}/tests/idmap + +.for n in ${TEST.tests/idmap:T:R:C/^idmap//} +INCDIR.${TEST_SRCDIR.tests/idmap}/idmap${n}.c += src/adt +.endfor + +.for n in ${TEST.tests/idmap:T:R:C/^idmap//} +test:: ${TEST_OUTDIR.tests/idmap}/res${n} +SRC += ${TEST_SRCDIR.tests/idmap}/idmap${n}.c +CFLAGS.${TEST_SRCDIR.tests/idmap}/idmap${n}.c += -UNDEBUG -D_DEFAULT_SOURCE -std=c99 +${TEST_OUTDIR.tests/idmap}/run${n}: ${TEST_OUTDIR.tests/idmap}/idmap${n}.o ${BUILD}/lib/adt.o + ${CC} ${CFLAGS} ${CFLAGS.${TEST_SRCDIR.tests/idmap}/idmap${n}.c} -o ${TEST_OUTDIR.tests/idmap}/run${n} ${TEST_OUTDIR.tests/idmap}/idmap${n}.o ${BUILD}/lib/adt.o +${TEST_OUTDIR.tests/idmap}/res${n}: ${TEST_OUTDIR.tests/idmap}/run${n} + ( ${TEST_OUTDIR.tests/idmap}/run${n} 1>&2 && echo PASS || echo FAIL ) > 
${TEST_OUTDIR.tests/idmap}/res${n}
+.endfor
diff --git a/tests/idmap/idmap_basic.c b/tests/idmap/idmap_basic.c
new file mode 100644
index 000000000..19f44d56e
--- /dev/null
+++ b/tests/idmap/idmap_basic.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2021 Scott Vokes
+ *
+ * See LICENCE for the full copyright terms.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <adt/idmap.h>
+
+#define DEF_LIMIT 10
+#define DEF_SEED 0
+
+/* These numbers were chosen to get a reasonable variety,
+ * but also some duplicated values as the input grows. */
+#define MAX_GEN_VALUES 23
+#define ID_MASK ((1 << 9) - 1)
+#define VALUE_MASK ((1 << 10) - 1)
+
+static void
+dump_cb(fsm_state_t state_id, unsigned value, void *opaque)
+{
+	/* fprintf(stderr, " -- state %d, value %u\n", state_id, value); */
+	assert(state_id <= ID_MASK);
+	assert(value <= VALUE_MASK);
+	(void)opaque;
+}
+
+static int
+cmp_u(const void *pa, const void *pb)
+{
+	const unsigned a = *(unsigned *)pa;
+	const unsigned b = *(unsigned *)pb;
+	return a < b ? -1 : a > b ? 1 : 0;
+}
+
+int main(int argc, char **argv) {
+	const size_t limit = (argc > 1 ? atoi(argv[1]) : DEF_LIMIT);
+	const unsigned seed = (argc > 2 ? atoi(argv[2]) : DEF_SEED);
+
+	(void)argc;
+	(void)argv;
+	struct idmap *m = idmap_new(NULL);
+
+	srandom(seed);
+
+	/* Fill the table with random data */
+	for (size_t id_i = 0; id_i < limit; id_i++) {
+		const fsm_state_t id = (fsm_state_t)(random() & ID_MASK);
+		const size_t value_count = random() % MAX_GEN_VALUES;
+
+		for (size_t v_i = 0; v_i < value_count; v_i++) {
+			const unsigned v = random() & VALUE_MASK;
+			if (!idmap_set(m, id, v)) {
+				assert(!"failed to set");
+			}
+		}
+	}
+
+	idmap_iter(m, dump_cb, NULL);
+
+	srandom(seed);
+
+	size_t got_buf_ceil = MAX_GEN_VALUES;
+	unsigned *got_buf = malloc(got_buf_ceil * sizeof(got_buf[0]));
+	assert(got_buf != NULL);
+
+	/* Reset the PRNG and read back the same data. */
+	for (size_t id_i = 0; id_i < limit; id_i++) {
+		const fsm_state_t id = (fsm_state_t)(random() & ID_MASK);
+		const size_t generated_value_count = random() % MAX_GEN_VALUES;
+
+		/* Note: This can occasionally differ from
+		 * generated_value_count, because the same id or values
+		 * may have been generated more than once. As long as
+		 * all the values match, it's fine. */
+		const size_t value_count = idmap_get_value_count(m, id);
+
+		if (value_count > got_buf_ceil) {
+			size_t nceil = got_buf_ceil;
+			while (nceil <= value_count) {
+				nceil *= 2;
+			}
+			free(got_buf);
+			got_buf = malloc(nceil * sizeof(got_buf[0]));
+			assert(got_buf != NULL);
+			got_buf_ceil = nceil;
+		}
+
+		size_t written;
+		if (!idmap_get(m, id,
+			got_buf_ceil * sizeof(got_buf[0]), got_buf,
+			&written)) {
+			assert(!"failed to get");
+		}
+		assert(written == value_count);
+
+		unsigned gen_buf[MAX_GEN_VALUES];
+
+		for (size_t v_i = 0; v_i < generated_value_count; v_i++) {
+			const unsigned v = random() & VALUE_MASK;
+			gen_buf[v_i] = v;
+		}
+		qsort(gen_buf, generated_value_count, sizeof(gen_buf[0]), cmp_u);
+
+		/* Every generated value should appear in the buffer.
+		 * There may be more in the buffer; ignore them. */
+		size_t v_i = 0;
+		for (size_t gen_i = 0; gen_i < generated_value_count; gen_i++) {
+			int found = 0;
+			const unsigned gv = gen_buf[gen_i];
+			assert(value_count <= got_buf_ceil);
+			/* got_buf should be sorted, so we can pick up where we left off */
+			while (v_i < value_count) {
+				if (gv == got_buf[v_i]) {
+					/* Intentionally don't increment v_i on match,
+					 * because gen_buf can repeat values. 
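+					 * (A repeated value in gen_buf then matches the same got_buf entry again; idmap stores a set, so duplicate values collapse into one stored entry.)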
*/ + found = 1; + break; + } + v_i++; + } + if (!found) { + fprintf(stderr, "NOT FOUND: state %d -- value: %u\n", + id, gv); + return EXIT_FAILURE; + } + } + } + + free(got_buf); + idmap_free(m); + return EXIT_SUCCESS; +} From 41a92d797b8caca61453555c4b7ccbff52ef2058 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 1 May 2023 15:37:20 -0400 Subject: [PATCH 02/51] Switch adt code to `uint64_t` for hashes, not `unsigned long`. Remove include/adt/mappingset.h, it's no longer used. --- include/adt/hashrec.h | 2 +- include/adt/mappingset.h | 42 ---------------------------------------- include/adt/stateset.h | 4 +++- src/adt/hashrec.c | 2 +- src/adt/stateset.c | 2 +- 5 files changed, 6 insertions(+), 46 deletions(-) delete mode 100644 include/adt/mappingset.h diff --git a/include/adt/hashrec.h b/include/adt/hashrec.h index 54816286e..545a20960 100644 --- a/include/adt/hashrec.h +++ b/include/adt/hashrec.h @@ -7,7 +7,7 @@ #ifndef ADT_HASHREC_H #define ADT_HASHREC_H -unsigned long +uint64_t hashrec(const void *p, size_t n); #endif diff --git a/include/adt/mappingset.h b/include/adt/mappingset.h deleted file mode 100644 index d4f91105e..000000000 --- a/include/adt/mappingset.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2019 Shannon F. Stewman - * - * See LICENCE for the full copyright terms. - */ - -#ifndef ADT_MAPPINGSET_H -#define ADT_MAPPINGSET_H - -struct fsm_alloc; -struct mapping_set; -struct mapping; - -struct mapping_iter { - struct hashset_iter iter; -}; - -struct mapping_set * -mapping_set_create(const struct fsm_alloc *a, - unsigned long (*hash)(const struct mapping *a), - int (*cmp)(const void *a, const void *b)); - -void -mapping_set_free(struct mapping_set *set); - -struct mapping * -mapping_set_add(struct mapping_set *set, struct mapping *item); - -struct mapping * -mapping_set_contains(const struct mapping_set *set, const struct mapping *item); - -void -mapping_set_clear(struct mapping_set *set); - -struct mapping * -mapping_set_first(const struct mapping_set *set, struct mapping_iter *it); - -struct mapping * -mapping_set_next(struct mapping_iter *it); - -#endif - diff --git a/include/adt/stateset.h b/include/adt/stateset.h index 83e835467..becd263ad 100644 --- a/include/adt/stateset.h +++ b/include/adt/stateset.h @@ -7,6 +7,8 @@ #ifndef ADT_STATESET_H #define ADT_STATESET_H +#include + struct set; struct fsm_alloc; struct state_set; @@ -72,7 +74,7 @@ state_set_rebase(struct state_set **set, fsm_state_t base); void state_set_replace(struct state_set **set, fsm_state_t old, fsm_state_t new); -unsigned long +uint64_t state_set_hash(const struct state_set *set); #endif diff --git a/src/adt/hashrec.c b/src/adt/hashrec.c index 6a341710b..7348cbc38 100644 --- a/src/adt/hashrec.c +++ b/src/adt/hashrec.c @@ -23,7 +23,7 @@ static const unsigned char hashk[] = { 0x14, 0xa8, 0xff, 0x36, 0x15, 0x16, 0x2c, 0xf7, 0xf4, 0xce, 0xb8, 0x66, 0x74, 0xf4, 0x3d, 0x64, }; -unsigned long +uint64_t hashrec(const void *p, size_t n) { uint64_t h = 0; diff --git a/src/adt/stateset.c b/src/adt/stateset.c index c1cff9933..eefd704f6 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -659,7 +659,7 @@ state_set_replace(struct state_set **setp, fsm_state_t old, fsm_state_t new) } } -unsigned long +uint64_t state_set_hash(const struct state_set *set) { if (set == NULL) { From ec30958032f930f4d7c3b4bba5554308b8fc9d1b Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 2 May 2023 13:45:59 -0400 Subject: [PATCH 03/51] stateset: Add `EXPENSIVE_CHECKS` guard around expensive asserts. 
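The guarded asserts call state_set_contains() after every insert and
removal, which changes the order of growth when building large sets,
so they are now compiled out by default:

    #if EXPENSIVE_CHECKS
    assert(state_set_contains(set, state));
    #endif
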
This and src/libfsm/internal.h's EXPENSIVE_CHECKS should move to a common place later. --- src/adt/stateset.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/adt/stateset.c b/src/adt/stateset.c index eefd704f6..ab3542161 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -15,6 +15,12 @@ #include #include +/* FIXME: This should use the same define as currently exists in + * src/fsm/internal.h. This is used here because the calls to + * state_set_contains change the order of growth. */ +#define EXPENSIVE_CHECKS 0 + + /* * TODO: now fsm_state_t is a numeric index, this could be a dynamically * allocated bitmap, instead of a set.inc's array of items. @@ -276,7 +282,9 @@ state_set_add(struct state_set **setp, const struct fsm_alloc *alloc, set->i = 1; } +#if EXPENSIVE_CHECKS assert(state_set_contains(set, state)); +#endif return 1; } @@ -478,7 +486,9 @@ state_set_remove(struct state_set **setp, fsm_state_t state) set->i--; } +#if EXPENSIVE_CHECKS assert(!state_set_contains(set, state)); +#endif } int From d1ce686861ab8575a3257a3b1875a42b4e95d80c Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 2 May 2023 13:46:58 -0400 Subject: [PATCH 04/51] Namespace epsilon_closure and closure_free with "fsm_". These symbols are exported in the library. --- src/fsm/main.c | 4 ++-- src/libfsm/closure.c | 4 ++-- src/libfsm/epsilons.c | 4 ++-- src/libfsm/internal.h | 4 ++-- src/libfsm/libfsm.syms | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/fsm/main.c b/src/fsm/main.c index cd5eea90f..0f97a9a81 100644 --- a/src/fsm/main.c +++ b/src/fsm/main.c @@ -598,7 +598,7 @@ main(int argc, char *argv[]) size_t n; struct state_iter it; - closures = epsilon_closure(fsm); + closures = fsm_epsilon_closure(fsm); if (closures == NULL) { return -1; } @@ -619,7 +619,7 @@ main(int argc, char *argv[]) printf("\n"); } - closure_free(closures, fsm->statecount); + fsm_closure_free(closures, fsm->statecount); return 0; } else { diff --git a/src/libfsm/closure.c b/src/libfsm/closure.c index fa2d0c783..9ebf48eb9 100644 --- a/src/libfsm/closure.c +++ b/src/libfsm/closure.c @@ -128,7 +128,7 @@ epsilon_closure_single(const struct fsm *fsm, struct state_set **closures, fsm_s } struct state_set ** -epsilon_closure(struct fsm *fsm) +fsm_epsilon_closure(struct fsm *fsm) { struct state_set **closures; fsm_state_t s; @@ -190,7 +190,7 @@ epsilon_closure(struct fsm *fsm) } void -closure_free(struct state_set **closures, size_t n) +fsm_closure_free(struct state_set **closures, size_t n) { fsm_state_t s; diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index e87d9d974..06da7739e 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -66,7 +66,7 @@ fsm_remove_epsilons(struct fsm *nfa) assert(nfa != NULL); TIME(&pre); - eclosures = epsilon_closure(nfa); + eclosures = fsm_epsilon_closure(nfa); TIME(&post); DIFF_MSEC("epsilon_closure", pre, post, NULL); @@ -167,7 +167,7 @@ fsm_remove_epsilons(struct fsm *nfa) res = 1; cleanup: if (eclosures != NULL) { - closure_free(eclosures, state_count); + fsm_closure_free(eclosures, state_count); } return res; diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index 4e25b3950..1dde1fc96 100644 --- a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -162,10 +162,10 @@ state_hasnondeterminism(const struct fsm *fsm, fsm_state_t state, struct bm *bm) * for states, with wrapper to populate malloced array of user-facing structs. 
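 * The caller owns the returned array and must release it with
 * fsm_closure_free().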
*/ struct state_set ** -epsilon_closure(struct fsm *fsm); +fsm_epsilon_closure(struct fsm *fsm); void -closure_free(struct state_set **closures, size_t n); +fsm_closure_free(struct state_set **closures, size_t n); /* * Internal free function that invokes free(3) by default, or a user-provided diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index 3474fa289..f9f6bf003 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -115,8 +115,8 @@ fsm_print_cfrag # XXX: workaround for lx make_ir # XXX: workaround for lx free_ir # XXX: workaround for lx -epsilon_closure # XXX: workaround for fsm -closure_free # XXX: workaround for fsm +fsm_epsilon_closure # XXX: workaround for fsm +fsm_closure_free # XXX: workaround for fsm fsm_mergeab From deb14a8cf410c55ac7496deb316f063bf6e95018 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 3 May 2023 08:52:01 -0400 Subject: [PATCH 05/51] tests/capture/: Delete outdated capture tests. These rely on either direct FSM construction or setting captures via a capture path, but captures are now implemented by a completely different mechanism. New tests will be added with the capture code in a future commit. --- tests/capture/capture0.c | 27 ---- tests/capture/capture1.c | 28 ---- tests/capture/capture2.c | 31 ---- tests/capture/capture3.c | 221 -------------------------- tests/capture/capture4.c | 242 ----------------------------- tests/capture/capture5.c | 150 ------------------ tests/capture/capture_concat1.c | 133 ---------------- tests/capture/capture_concat2.c | 133 ---------------- tests/capture/capture_long_trail.c | 28 ---- tests/capture/capture_union1.c | 140 ----------------- tests/capture/capture_union2.c | 117 -------------- 11 files changed, 1250 deletions(-) delete mode 100644 tests/capture/capture0.c delete mode 100644 tests/capture/capture1.c delete mode 100644 tests/capture/capture2.c delete mode 100644 tests/capture/capture3.c delete mode 100644 tests/capture/capture4.c delete mode 100644 tests/capture/capture5.c delete mode 100644 tests/capture/capture_concat1.c delete mode 100644 tests/capture/capture_concat2.c delete mode 100644 tests/capture/capture_long_trail.c delete mode 100644 tests/capture/capture_union1.c delete mode 100644 tests/capture/capture_union2.c diff --git a/tests/capture/capture0.c b/tests/capture/capture0.c deleted file mode 100644 index 4e7d0e3fa..000000000 --- a/tests/capture/capture0.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include - -#include "captest.h" - -/* /a(bcd)e/ */ - -int main(void) { - struct captest_single_fsm_test_info test_info = { - "abcde", - { - { 1, 4 }, - } - }; - return captest_run_single(&test_info); -} diff --git a/tests/capture/capture1.c b/tests/capture/capture1.c deleted file mode 100644 index e9fe53ab9..000000000 --- a/tests/capture/capture1.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include -#include -#include -#include - -#include -#include - -#include "captest.h" -/* (a(b(c))) */ - -int main(void) { - struct captest_single_fsm_test_info test_info = { - "abc", - { - { 0, 3 }, - { 1, 3 }, - { 2, 3 }, - } - }; - return captest_run_single(&test_info); -} diff --git a/tests/capture/capture2.c b/tests/capture/capture2.c deleted file mode 100644 index 20a1c1bac..000000000 --- a/tests/capture/capture2.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include - -#include "captest.h" - -/* (a(b((c))(d))) */ - -int main(void) { - struct captest_single_fsm_test_info test_info = { - "abcd", - { - { 0, 4 }, - { 1, 4 }, - { 2, 3 }, - { 2, 3 }, - { 3, 4 }, - } - }; - return captest_run_single(&test_info); -} diff --git a/tests/capture/capture3.c b/tests/capture/capture3.c deleted file mode 100644 index 9d4d284ab..000000000 --- a/tests/capture/capture3.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "captest.h" - -/* Combine 3 fully disjoint FSMs: - * - * - 0: "(a(b))" - * - 1: "(cd(e))" - * - 2: "(fgh(i))" - * - * Shift the captures for 1 and 2 forward and use/combine - * opaques on them to track which one(s) matched. - * - * This tracking of which DFA matched should be more directly - * supported by the API later. */ - -static void -check(const struct fsm *fsm, const char *string, - unsigned end_id, unsigned capture_base); - -static void -det_and_min(const char *tag, struct fsm *fsm); - -int main(void) { - struct fsm *f_ab = captest_fsm_of_string("ab", 0); - struct fsm *f_cde = captest_fsm_of_string("cde", 1); - struct fsm *f_fghi = captest_fsm_of_string("fghi", 2); - struct fsm *f_all = NULL; - unsigned captures; - - struct fsm_combined_base_pair bases[3]; - struct fsm *fsms[3]; - - assert(f_ab); - assert(f_cde); - assert(f_fghi); - - /* set captures */ -#define SET_CAPTURE(FSM, STATE, CAPTURE, TYPE) \ - if (!fsm_set_capture_action(FSM, STATE, CAPTURE, TYPE)) { \ - fprintf(stderr, "failed to set capture on line %d\n", \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } - - /* (a(b)) */ - if (!fsm_capture_set_path(f_ab, 0, 0, 2)) { - exit(EXIT_FAILURE); - } - if (!fsm_capture_set_path(f_ab, 1, 1, 2)) { - exit(EXIT_FAILURE); - } - - /* (cd(e)) */ - if (!fsm_capture_set_path(f_cde, 0, 0, 3)) { - exit(EXIT_FAILURE); - } - if (!fsm_capture_set_path(f_cde, 1, 2, 3)) { - exit(EXIT_FAILURE); - } - - /* (fgh(i)) */ - if (!fsm_capture_set_path(f_fghi, 0, 0, 4)) { - exit(EXIT_FAILURE); - } - if (!fsm_capture_set_path(f_fghi, 1, 3, 4)) { - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "\n=== f_ab...\n"); - fsm_print_fsm(stderr, f_ab); - fsm_capture_dump(stderr, "#### f_ab", f_ab); - - fprintf(stderr, "\n=== f_cde...\n"); - fsm_print_fsm(stderr, f_cde); - fsm_capture_dump(stderr, "#### f_cde", f_cde); - - fprintf(stderr, "\n=== f_fghi...\n"); - fsm_print_fsm(stderr, f_fghi); - fsm_capture_dump(stderr, "#### f_fghi", f_fghi); -#endif - - /* determinise and minimise each before unioning */ - det_and_min("ab", f_ab); - det_and_min("cde", f_cde); - det_and_min("fghi", f_fghi); - - /* union them */ - fsms[0] = f_ab; - fsms[1] = f_cde; - fsms[2] = f_fghi; - - f_all = fsm_union_array(3, fsms, bases); - assert(f_all != NULL); - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "=== 
unioned f_ab with f_cde... (CB ab: %u, cde: %u)\n", - bases[0].capture, bases[1].capture); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "#### f_all", f_all); -#endif - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "=== unioned f_all with f_fghi... (CB fghi: %u), %u captures\n", - bases[2].capture, fsm_countcaptures(f_all)); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "#### f_all #2", f_all); -#endif - - if (!fsm_determinise(f_all)) { - fprintf(stderr, "NOPE %d\n", __LINE__); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after determinise\n"); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "#### f_all", f_all); -#endif - - captures = fsm_countcaptures(f_all); - if (captures != 6) { - fprintf(stderr, "expected 6 captures, got %u\n", captures); - exit(EXIT_FAILURE); - } - - check(f_all, "ab", 0, bases[0].capture); - check(f_all, "cde", 1, bases[1].capture); - check(f_all, "fghi", 2, bases[2].capture); - - - fsm_free(f_all); - - return 0; -} - -static void -det_and_min(const char *tag, struct fsm *fsm) -{ - if (!fsm_determinise(fsm)) { - fprintf(stderr, "Failed to determise '%s'\n", tag); - exit(EXIT_FAILURE); - } - - if (!fsm_minimise(fsm)) { - fprintf(stderr, "Failed to minimise '%s'\n", tag); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after det_and_min: '%s'\n", tag); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, tag, fsm); -#endif - -} - -static void -check(const struct fsm *fsm, const char *string, - unsigned end_id, unsigned capture_base) -{ - int exec_res; - size_t i; - struct captest_input input; - fsm_state_t end; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - const size_t length = strlen(string); - const unsigned cb = capture_base; /* alias */ - - input.string = string; - input.pos = 0; - - for (i = 0; i < MAX_TEST_CAPTURES; i++) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; - } - - exec_res = fsm_exec(fsm, captest_getc, &input, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "fsm_exec: %d for '%s', expected 1\n", - exec_res, string); - exit(EXIT_FAILURE); - } - - /* check end ID */ - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - /* check captures */ - if (0) { - fprintf(stderr, "captures for '%s' (cb %u): [%ld, %ld], [%ld, %ld]\n", - string, capture_base, - captures[0 + cb].pos[0], captures[0 + cb].pos[1], - captures[1 + cb].pos[0], captures[1 + cb].pos[1]); - } - - assert(captures[0 + cb].pos[0] == 0); - assert(captures[0 + cb].pos[1] == length); - assert(captures[1 + cb].pos[0] == length - 1); - assert(captures[1 + cb].pos[1] == length); -} diff --git a/tests/capture/capture4.c b/tests/capture/capture4.c deleted file mode 100644 index 170cbe8b0..000000000 --- a/tests/capture/capture4.c +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "captest.h" - -/* Combine 2 mostly overlapping FSMs: - * - 0: "(abc)" - * - 1: "(ab*c)" - * and check for false positives in the match. 
- */ - -static struct fsm * -build_and_combine(unsigned *cb_a, unsigned *cb_b); - -static void -det_and_min(const char *tag, struct fsm *fsm); - -static struct fsm * -build_ab_c(void); - -static void -check(const struct fsm *fsm, const char *string, - unsigned expected_ends, - unsigned cb_a, size_t pa_0, size_t pa_1, - unsigned cb_b, size_t pb_0, size_t pb_1); - -int main(void) { - unsigned cb_abc, cb_ab_c; - struct fsm *f_all = build_and_combine(&cb_abc, &cb_ab_c); - unsigned captures; - const unsigned exp_0 = 1U << 0; - const unsigned exp_1 = 1U << 1; - - captures = fsm_countcaptures(f_all); - if (captures != 2) { - fprintf(stderr, "expected 2 captures, got %u\n", captures); - exit(EXIT_FAILURE); - } - - #define NO_POS FSM_CAPTURE_NO_POS - check(f_all, "abc", /* captures 0 and 1 */ - exp_0 | exp_1, - cb_abc, 0, 3, - cb_ab_c, 0, 3); - check(f_all, "ac", /* only capture 1 */ - exp_1, - cb_abc, NO_POS, NO_POS, - cb_ab_c, 0, 2); - check(f_all, "abbc", /* only capture 1 */ - exp_1, - cb_abc, NO_POS, NO_POS, - cb_ab_c, 0, 4); - - fsm_free(f_all); - - return 0; -} - -static struct fsm * -build_and_combine(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *f_abc = captest_fsm_of_string("abc", 0); - struct fsm *f_ab_c = build_ab_c(); - struct fsm *f_all; - struct fsm_combine_info ci; - - assert(f_abc); - assert(f_ab_c); - - if (!fsm_capture_set_path(f_abc, 0, 0, 3)) { - exit(EXIT_FAILURE); - } - if (!fsm_capture_set_path(f_ab_c, 0, 0, 3)) { - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==================== abc \n"); - fsm_print_fsm(stderr, f_abc); - fsm_capture_dump(stderr, "abc", f_abc); - - fprintf(stderr, "==================== ab*c \n"); - fsm_print_fsm(stderr, f_ab_c); - fsm_capture_dump(stderr, "ab*c", f_ab_c); -#endif - - det_and_min("abc", f_abc); - det_and_min("ab*c", f_ab_c); - - /* union them */ - f_all = fsm_union(f_abc, f_ab_c, &ci); - assert(f_all != NULL); - - *cb_a = ci.capture_base_a; - *cb_b = ci.capture_base_b; - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==================== post-union \n"); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "capture_actions", f_all); - fprintf(stderr, "====================\n"); -#endif - - if (!fsm_determinise(f_all)) { - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==================== post-det \n"); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "capture_actions", f_all); - fprintf(stderr, "====================\n"); -#endif - - return f_all; -} - -static void -det_and_min(const char *tag, struct fsm *fsm) -{ - if (!fsm_determinise(fsm)) { - fprintf(stderr, "Failed to determise '%s'\n", tag); - exit(EXIT_FAILURE); - } - - if (!fsm_minimise(fsm)) { - fprintf(stderr, "Failed to minimise '%s'\n", tag); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after det_and_min: '%s'\n", tag); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, tag, fsm); -#endif - -} - -static struct fsm * -build_ab_c(void) -{ - struct fsm *fsm = captest_fsm_with_options(); - assert(fsm != NULL); - - if (!fsm_addstate_bulk(fsm, 4)) { goto fail; } - - fsm_setstart(fsm, 0); - if (!fsm_addedge_literal(fsm, 0, 1, 'a')) { goto fail; } - - if (!fsm_addedge_literal(fsm, 1, 2, 'b')) { goto fail; } - if (!fsm_addedge_literal(fsm, 1, 3, 'c')) { goto fail; } - - if (!fsm_addedge_literal(fsm, 2, 2, 'b')) { goto fail; } - if (!fsm_addedge_literal(fsm, 2, 3, 'c')) { goto fail; } - - fsm_setend(fsm, 3, 1); - if (!fsm_setendid(fsm, 1)) { - goto fail; - } - - return fsm; - 
-fail: - exit(EXIT_FAILURE); -} - -static void -check(const struct fsm *fsm, const char *string, - unsigned expected_ends, - unsigned cb_a, size_t pa_0, size_t pa_1, - unsigned cb_b, size_t pb_0, size_t pb_1) -{ - int exec_res; - size_t i; - struct captest_input input; - fsm_state_t end; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - - fprintf(stderr, "#### check '%s', exp: ends 0x%u, c%u: (%ld, %ld), c%u: %ld, %ld)\n", - string, expected_ends, - cb_a, pa_0, pa_1, - cb_b, pb_0, pb_1); - - input.string = string; - input.pos = 0; - - for (i = 0; i < MAX_TEST_CAPTURES; i++) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; - } - - exec_res = fsm_exec(fsm, captest_getc, &input, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "fsm_exec: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - /* check captures */ - fprintf(stderr, "captures for '%s': [%ld, %ld], [%ld, %ld]\n", - string, - captures[0].pos[0], captures[0].pos[1], - captures[1].pos[0], captures[1].pos[1]); - assert(captures[cb_a].pos[0] == pa_0); - assert(captures[cb_a].pos[1] == pa_1); - assert(captures[cb_b].pos[0] == pb_0); - assert(captures[cb_b].pos[1] == pb_1); - - { - enum fsm_getendids_res gres; - fsm_end_id_t id_buf[2]; - size_t written; - gres = fsm_getendids(fsm, end, 2, id_buf, &written); - if (gres != FSM_GETENDIDS_FOUND) { - assert(!"fsm_getendids failed"); - } - - if (expected_ends == 0x2) { - assert(written == 1); - assert(id_buf[0] == 1); - } else if (expected_ends == 0x3) { - assert(written == 2); - assert(id_buf[0] == 0); - assert(id_buf[1] == 1); - } else { - assert(!"test not handled"); - } - } -} diff --git a/tests/capture/capture5.c b/tests/capture/capture5.c deleted file mode 100644 index b3a4be3ee..000000000 --- a/tests/capture/capture5.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define LOG_INTERMEDIATE_FSMS 0 -#include "captest.h" - -/* Check that self edges are handled properly in the - * capture action analysis. - * - * The DFA corresponds to /a(b*)(c)/. 
*/ - -static struct fsm * -build(void); - -static void -check(struct fsm *f, const char *input, - unsigned pa_0, unsigned pa_1, - unsigned pb_0, unsigned pb_1); - -int main(void) { - struct fsm *f = build(); - unsigned captures; - assert(f != NULL); - - captures = fsm_countcaptures(f); - if (captures != 2) { - fprintf(stderr, "expected 2 captures, got %u\n", captures); - exit(EXIT_FAILURE); - } - - check(f, "ac", - 1, 1, - 1, 2); - check(f, "abc", - 1, 2, - 2, 3); - check(f, "abbc", - 1, 3, - 3, 4); - - fsm_free(f); - - return 0; -} - -static struct fsm * -build(void) -{ - struct fsm *fsm = captest_fsm_with_options(); - - if (!fsm_addstate_bulk(fsm, 4)) { goto fail; } - - fsm_setstart(fsm, 0); - if (!fsm_addedge_literal(fsm, 0, 1, 'a')) { goto fail; } - - if (!fsm_addedge_literal(fsm, 1, 1, 'b')) { goto fail; } - if (!fsm_addedge_literal(fsm, 1, 2, 'c')) { goto fail; } - - fsm_setend(fsm, 2, 1); - - if (!fsm_capture_set_path(fsm, 0, 1, 1)) { goto fail; } - if (!fsm_capture_set_path(fsm, 1, 1, 2)) { goto fail; } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== built\n"); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, "built", fsm); -#endif - - if (!fsm_determinise(fsm)) { - fprintf(stderr, "Failed to determise\n"); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after det\n"); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, "after det", fsm); -#endif - - if (!fsm_minimise(fsm)) { - fprintf(stderr, "Failed to minimise\n"); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after min\n"); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, "after min", fsm); -#endif - return fsm; - -fail: - exit(EXIT_FAILURE); -} - -static void -check(struct fsm *fsm, const char *string, - unsigned pa_0, unsigned pa_1, - unsigned pb_0, unsigned pb_1) -{ - int exec_res; - size_t i; - struct captest_input input; - fsm_state_t end; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - - fprintf(stderr, "#### check '%s', exp: c%u: (%u, %u), c%u: %u, %u)\n", - string, - 0, pa_0, pa_1, - 1, pb_0, pb_1); - - input.string = string; - input.pos = 0; - - for (i = 0; i < MAX_TEST_CAPTURES; i++) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; - } - - exec_res = fsm_exec(fsm, captest_getc, &input, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "fsm_exec: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - /* check captures */ - fprintf(stderr, "captures for '%s': [%ld, %ld], [%ld, %ld]\n", - string, - captures[0].pos[0], captures[0].pos[1], - captures[1].pos[0], captures[1].pos[1]); - assert(captures[0].pos[0] == pa_0); - assert(captures[0].pos[1] == pa_1); - assert(captures[1].pos[0] == pb_0); - assert(captures[1].pos[1] == pb_1); -} diff --git a/tests/capture/capture_concat1.c b/tests/capture/capture_concat1.c deleted file mode 100644 index ee9c8aaab..000000000 --- a/tests/capture/capture_concat1.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "captest.h" - -/* concat /(ab)/ and /(cde)/ */ - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b); - -static void -check(const struct fsm *fsm, const char *input, unsigned end_id, - unsigned cb_ab, size_t exp_start_ab, size_t exp_end_ab, - unsigned cb_cde, size_t exp_start_cde, size_t exp_end_cde); - -int main(void) { - unsigned cb_ab, cb_cde; /* capture base */ - struct fsm *abcde = build(&cb_ab, &cb_cde); - - check(abcde, "abcde", 1, - cb_ab, 0, 2, - cb_cde, 2, 5); - - fsm_free(abcde); - - return EXIT_SUCCESS; -} - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *ab = captest_fsm_of_string("ab", 0); - struct fsm *cde = captest_fsm_of_string("cde", 1); - struct fsm *abcde; - struct fsm_combine_info ci; - size_t cc_ab, cc_cde, cc_abcde; - - assert(ab); - assert(cde); - - if (!fsm_capture_set_path(ab, 0, 0, 2)) { - assert(!"path 0"); - } - if (!fsm_capture_set_path(cde, 0, 0, 3)) { - assert(!"path 1"); - } - - cc_ab = fsm_countcaptures(ab); - assert(cc_ab == 1); - - cc_cde = fsm_countcaptures(cde); - assert(cc_cde == 1); - - abcde = fsm_concat(ab, cde, &ci); - assert(abcde); - *cb_a = ci.capture_base_a; - *cb_b = ci.capture_base_b; - - cc_abcde = fsm_countcaptures(abcde); - assert(cc_abcde == cc_ab + cc_cde); - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after concat: cb_ab %u, cb_cde %u\n", - *cb_a, *cb_b); - fsm_print_fsm(stderr, abcde); - - fsm_capture_dump(stderr, "#### after concat", abcde); - - fprintf(stderr, "==== determinise\n"); -#endif - - if (!fsm_determinise(abcde)) { - assert(!"determinise"); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after determinise\n"); - fsm_print_fsm(stderr, abcde); - - assert(fsm_countcaptures(abcde) == cc_abcde); - - fsm_capture_dump(stderr, "#### after det", abcde); -#endif - - assert(fsm_countcaptures(abcde) == cc_abcde); - return abcde; -} - -static void -check(const struct fsm *fsm, const char *input, unsigned end_id, - unsigned cb_ab, size_t exp_start_ab, size_t exp_end_ab, - unsigned cb_cde, size_t exp_start_cde, size_t exp_end_cde) -{ - struct captest_input ci; - fsm_state_t end; - int exec_res; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - - ci.string = input; - ci.pos = 0; - - exec_res = fsm_exec(fsm, captest_getc, &ci, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "exec_res: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - assert(captures[cb_ab].pos[0] == exp_start_ab); - assert(captures[cb_ab].pos[1] == exp_end_ab); - - assert(captures[cb_cde].pos[0] == exp_start_cde); - assert(captures[cb_cde].pos[1] == exp_end_cde); -} diff --git a/tests/capture/capture_concat2.c b/tests/capture/capture_concat2.c deleted file mode 100644 index a8f070c7e..000000000 --- a/tests/capture/capture_concat2.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "captest.h" - -/* concat /(abc)/ and /(de)/ */ - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b); - -static void -check(const struct fsm *fsm, const char *input, unsigned end_id, - unsigned cb_ab, size_t exp_start_ab, size_t exp_end_ab, - unsigned cb_cde, size_t exp_start_cde, size_t exp_end_cde); - -int main(void) { - unsigned cb_abc, cb_de; /* capture base */ - struct fsm *abcde = build(&cb_abc, &cb_de); - - check(abcde, "abcde", 1, - cb_abc, 0, 3, - cb_de, 3, 5); - - fsm_free(abcde); - - return EXIT_SUCCESS; -} - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *abc = captest_fsm_of_string("abc", 0); - struct fsm *de = captest_fsm_of_string("de", 1); - struct fsm *abcde; - struct fsm_combine_info ci; - size_t cc_abc, cc_de, cc_abcde; - - assert(abc); - assert(de); - - if (!fsm_capture_set_path(abc, 0, 0, 3)) { - assert(!"path 0"); - } - if (!fsm_capture_set_path(de, 0, 0, 2)) { - assert(!"path 1"); - } - - cc_abc = fsm_countcaptures(abc); - assert(cc_abc == 1); - - cc_de = fsm_countcaptures(de); - assert(cc_de == 1); - - abcde = fsm_concat(abc, de, &ci); - assert(abcde); - *cb_a = ci.capture_base_a; - *cb_b = ci.capture_base_b; - - cc_abcde = fsm_countcaptures(abcde); - assert(cc_abcde == cc_abc + cc_de); - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after concat: cb_abc %u, cb_de %u\n", - *cb_a, *cb_b); - fsm_print_fsm(stderr, abcde); - - fsm_capture_dump(stderr, "#### after concat", abcde); - - fprintf(stderr, "==== determinise\n"); -#endif - - if (!fsm_determinise(abcde)) { - assert(!"determinise"); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after determinise\n"); - fsm_print_fsm(stderr, abcde); - - assert(fsm_countcaptures(abcde) == cc_abcde); - - fsm_capture_dump(stderr, "#### after det", abcde); -#endif - - assert(fsm_countcaptures(abcde) == cc_abcde); - return abcde; -} - -static void -check(const struct fsm *fsm, const char *input, unsigned end_id, - unsigned cb_abc, size_t exp_start_abc, size_t exp_end_abc, - unsigned cb_de, size_t exp_start_de, size_t exp_end_de) -{ - struct captest_input ci; - fsm_state_t end; - int exec_res; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - - ci.string = input; - ci.pos = 0; - - exec_res = fsm_exec(fsm, captest_getc, &ci, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "exec_res: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - assert(captures[cb_abc].pos[0] == exp_start_abc); - assert(captures[cb_abc].pos[1] == exp_end_abc); - - assert(captures[cb_de].pos[0] == exp_start_de); - assert(captures[cb_de].pos[1] == exp_end_de); -} diff --git a/tests/capture/capture_long_trail.c b/tests/capture/capture_long_trail.c deleted file mode 100644 index 349717b0f..000000000 --- a/tests/capture/capture_long_trail.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include - -#include "captest.h" -/* a(bcdefghijklmnopqrstuvwxy)z - * This is long enough to exercise growing the trail for - * capture action analysis. 
*/ - -int main(void) { - struct captest_single_fsm_test_info test_info = { - "abcdefghijklmnopqrstuvwxyz", - { - { 1, 25 }, - } - }; - return captest_run_single(&test_info); -} diff --git a/tests/capture/capture_union1.c b/tests/capture/capture_union1.c deleted file mode 100644 index 5d9bd2920..000000000 --- a/tests/capture/capture_union1.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "captest.h" - -/* union /(ab)/ and /(cde)/ */ - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b); - -static void -check(const struct fsm *fsm, const char *input, - unsigned end_id, unsigned exp_capture_id, - size_t exp_start, size_t exp_end); - -int main(void) { - unsigned cb_ab, cb_cde; /* capture base */ - struct fsm *abcde = build(&cb_ab, &cb_cde); - - check(abcde, "ab", 0, cb_ab, 0, 2); - check(abcde, "cde", 1, cb_cde, 0, 3); - - fsm_free(abcde); - - return EXIT_SUCCESS; -} - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *ab = captest_fsm_of_string("ab", 0); - struct fsm *cde = captest_fsm_of_string("cde", 1); - struct fsm *abcde; - struct fsm_combine_info ci; - size_t cc_ab, cc_cde, cc_abcde; - - assert(ab); - assert(cde); - - if (!fsm_capture_set_path(ab, 0, 0, 2)) { - assert(!"path 0"); - } - if (!fsm_capture_set_path(cde, 0, 0, 3)) { - assert(!"path 1"); - } - - cc_ab = fsm_countcaptures(ab); - assert(cc_ab == 1); - - cc_cde = fsm_countcaptures(cde); - assert(cc_cde == 1); - - abcde = fsm_union(ab, cde, &ci); - assert(abcde); - *cb_a = ci.capture_base_a; - *cb_b = ci.capture_base_b; - - cc_abcde = fsm_countcaptures(abcde); - assert(cc_abcde == cc_ab + cc_cde); - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after union: cb_ab %u, cb_cde %u\n", - *cb_a, *cb_b); - fsm_print_fsm(stderr, abcde); - - fsm_capture_dump(stderr, "#### after union", abcde); - - fprintf(stderr, "==== determinise\n"); -#endif - - if (!fsm_determinise(abcde)) { - assert(!"determinise"); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after determinise\n"); - fsm_print_fsm(stderr, abcde); - - assert(fsm_countcaptures(abcde) == cc_abcde); - - fsm_capture_dump(stderr, "#### after det", abcde); -#endif - - assert(fsm_countcaptures(abcde) == cc_abcde); - return abcde; -} - -static void -check(const struct fsm *fsm, const char *input, - unsigned end_id, unsigned exp_capture_id, - size_t exp_start, size_t exp_end) -{ - struct captest_input ci; - fsm_state_t end; - int exec_res; - struct fsm_capture got_captures[MAX_TEST_CAPTURES]; - - ci.string = input; - ci.pos = 0; - - exec_res = fsm_exec(fsm, captest_getc, &ci, &end, got_captures); - if (exec_res != 1) { - fprintf(stderr, "exec_res: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - if (got_captures[exp_capture_id].pos[0] != exp_start) { - fprintf(stderr, "capture[%u].pos[0]: exp %lu, got %lu\n", - exp_capture_id, exp_start, - got_captures[exp_capture_id].pos[0]); - exit(EXIT_FAILURE); - } - if (got_captures[exp_capture_id].pos[1] != exp_end) { - fprintf(stderr, "capture[%u].pos[1]: exp %lu, got %lu\n", - exp_capture_id, exp_end, - got_captures[exp_capture_id].pos[1]); - exit(EXIT_FAILURE); - } -} diff --git a/tests/capture/capture_union2.c b/tests/capture/capture_union2.c deleted file mode 100644 index 
7fab2f18d..000000000 --- a/tests/capture/capture_union2.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "captest.h" - -/* union /(abcd)/ and /(abed)/ */ - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b); - -static void -check(const struct fsm *fsm, const char *input, - unsigned end_id, unsigned exp_capture_id, - size_t exp_start, size_t exp_end); - -int main(void) { - unsigned cb_abcd, cb_abed; - struct fsm *fsm = build(&cb_abcd, &cb_abed); - - check(fsm, "abcd", 0, cb_abcd, 0, 4); - check(fsm, "abed", 1, cb_abed, 0, 4); - - fsm_free(fsm); - - return EXIT_SUCCESS; -} - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *abcd = captest_fsm_of_string("abcd", 0); - struct fsm *abed = captest_fsm_of_string("abed", 1); - struct fsm *res; - - assert(abcd); - assert(abed); - - if (!fsm_capture_set_path(abcd, 0, 0, 4)) { - assert(!"path 0"); - } - if (!fsm_capture_set_path(abed, 0, 0, 4)) { - assert(!"path 1"); - } - - { - struct fsm *fsms[2]; - struct fsm_combined_base_pair bases[2]; - fsms[0] = abcd; - fsms[1] = abed; - res = fsm_union_array(2, fsms, bases); - assert(res); - *cb_a = bases[0].capture; - *cb_b = bases[1].capture; - } - - if (!fsm_determinise(res)) { - assert(!"determinise"); - } - - assert(fsm_countcaptures(res) == 2); - - return res; -} - -static void -check(const struct fsm *fsm, const char *input, - unsigned end_id, unsigned exp_capture_id, - size_t exp_start, size_t exp_end) -{ - struct captest_input ci; - fsm_state_t end; - int exec_res; - struct fsm_capture got_captures[MAX_TEST_CAPTURES]; - - ci.string = input; - ci.pos = 0; - - exec_res = fsm_exec(fsm, captest_getc, &ci, &end, got_captures); - if (exec_res != 1) { - fprintf(stderr, "exec_res: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - if (got_captures[exp_capture_id].pos[0] != exp_start) { - fprintf(stderr, "capture[%u].pos[0]: exp %lu, got %lu\n", - exp_capture_id, exp_start, - got_captures[exp_capture_id].pos[0]); - exit(EXIT_FAILURE); - } - if (got_captures[exp_capture_id].pos[1] != exp_end) { - fprintf(stderr, "capture[%u].pos[1]: exp %lu, got %lu\n", - exp_capture_id, exp_end, - got_captures[exp_capture_id].pos[1]); - exit(EXIT_FAILURE); - } -} From ffea22e3164142e3944c1a458108af1a83c14590 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 31 May 2023 17:14:35 -0400 Subject: [PATCH 06/51] fuzz/run_fuzzer: Run single seed file when given as argument. --- fuzz/run_fuzzer | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fuzz/run_fuzzer b/fuzz/run_fuzzer index be8ba1d95..429ffa961 100755 --- a/fuzz/run_fuzzer +++ b/fuzz/run_fuzzer @@ -4,6 +4,8 @@ BUILD=../build FUZZER=${BUILD}/fuzz/fuzzer SEEDS=${BUILD}/fuzz/fuzzer_seeds +ARG=$1 + SECONDS=${SECONDS:-60} WORKERS=${WORKERS:-4} SEEDS=${SEEDS:-seeds} @@ -25,5 +27,9 @@ if [ ! 
-d "${SEEDS}" ]; then
 	mkdir -p "${SEEDS}"
 fi
 
-echo "\n==== ${FUZZER}"
-${FUZZER} -jobs=${WORKERS} -workers=${WORKERS} -max_total_time=${SECONDS} ${SEEDS}
+if [ -z "${ARG}" ]; then
+	echo "\n==== ${FUZZER}"
+	exec ${FUZZER} -jobs=${WORKERS} -workers=${WORKERS} -max_total_time=${SECONDS} ${SEEDS}
+else
+	exec ${FUZZER} ${ARG}
+fi
\ No newline at end of file

From 2beeff42ac594353906862a6636f11d69e43d987 Mon Sep 17 00:00:00 2001
From: Scott Vokes
Date: Wed, 31 May 2023 17:13:38 -0400
Subject: [PATCH 07/51] Completely rework capture resolution.

This is a big commit, unfortunately difficult to break apart further
due to interface changes, metadata being passed through whole-FSM
transformations, and so on. Sorry about that.

- Delete code related to capture action metadata on edges. That
  approach made FSM transformations (determinisation, minimisation,
  etc.) considerably more expensive, and there were some corner cases
  that I wasn't able to get working correctly.

- Switch to a somewhat simpler method, adapted from Russ Cox's
  "Regular Expression Matching: the Virtual Machine Approach". Since
  the capture resolution metadata (an opcode program for a virtual
  machine) is associated with individual end states, this combines
  cleanly when multiple regexes are unioned into a single large DFA
  that matches them all at once.

- Add lots of capture regression tests, mostly from using libfsm's
  `fsm_generate_matches` and a fuzzer to compare behavior against
  PCRE. This brought many, many obscure cases to light. The updated
  fuzzer harness will appear in a later commit.
---
 include/adt/idmap.h                      |    5 +-
 include/fsm/capture.h                    |   43 +-
 include/fsm/fsm.h                        |   32 +-
 include/re/re.h                          |   24 +-
 src/adt/Makefile                         |    8 +-
 src/fsm/main.c                           |    4 +-
 src/libfsm/Makefile                      |    2 +
 src/libfsm/capture.c                     | 1126 +++++------
 src/libfsm/capture.h                     |  139 +-
 src/libfsm/capture_internal.h            |  115 --
 src/libfsm/capture_log.h                 |   27 +
 src/libfsm/capture_vm.c                  |  193 ++
 src/libfsm/capture_vm.h                  |   69 +
 src/libfsm/capture_vm_exec.c             | 2063 ++++++++++++++++++++++
 src/libfsm/capture_vm_log.h              |   21 +
 src/libfsm/capture_vm_program.h          |   74 +
 src/libfsm/clone.c                       |   86 +-
 src/libfsm/consolidate.c                 |  173 +-
 src/libfsm/determinise.c                 |  249 +--
 src/libfsm/determinise_internal.h        |   30 +-
 src/libfsm/endids.c                      |  136 +-
 src/libfsm/endids.h                      |    7 +-
 src/libfsm/endids_internal.h             |   43 -
 src/libfsm/epsilons.c                    |  412 ++---
 src/libfsm/exec.c                        |   96 +-
 src/libfsm/internal.h                    |    4 -
 src/libfsm/libfsm.syms                   |   15 +-
 src/libfsm/merge.c                       |  224 ++-
 src/libfsm/minimise.c                    |  579 +++++-
 src/libfsm/print/Makefile                |    3 +
 src/libfsm/state.c                       |   24 +-
 src/libfsm/trim.c                        |   81 +-
 src/libfsm/union.c                       |   12 +
 src/libre/Makefile                       |    6 +-
 src/libre/ast.c                          |    3 +
 src/libre/ast.h                          |   19 +-
 src/libre/ast_analysis.c                 |  742 ++++++--
 src/libre/ast_analysis.h                 |    4 +-
 src/libre/ast_compile.c                  | 1260 +++++++++----
 src/libre/ast_rewrite.c                  |    2 +-
 src/libre/print/tree.c                   |   23 +-
 src/libre/re.c                           |   18 +-
 src/libre/re_capvm_compile.c             | 1689 ++++++++++++++++++
 src/libre/re_capvm_compile.h             |   32 +
 src/libre/strerror.c                     |    2 +
 src/lx/parser.act                        |    2 +-
 src/re/main.c                            |   71 +-
 tests/capture/Makefile                   |    8 +-
 tests/capture/captest.c                  |  560 ++++--
 tests/capture/captest.h                  |  121 +-
 tests/capture/capture_test_case_list.c   | 1867 ++++++++++++++++++++
 tests/idmap/idmap_basic.c                |    3 +-
 tests/minimise/minimise_test_case_list.c |    3 +-
 theft/fuzz_capture_string_set.c          |   12 +-
 54 files changed, 10356 insertions(+), 2210 deletions(-)
 delete mode 100644 src/libfsm/capture_internal.h
 create mode 100644 src/libfsm/capture_log.h
 create mode 100644 src/libfsm/capture_vm.c
 create mode 100644 
src/libfsm/capture_vm.h create mode 100644 src/libfsm/capture_vm_exec.c create mode 100644 src/libfsm/capture_vm_log.h create mode 100644 src/libfsm/capture_vm_program.h delete mode 100644 src/libfsm/endids_internal.h create mode 100644 src/libre/re_capvm_compile.c create mode 100644 src/libre/re_capvm_compile.h create mode 100644 tests/capture/capture_test_case_list.c diff --git a/include/adt/idmap.h b/include/adt/idmap.h index 064fd15d1..a13c9d115 100644 --- a/include/adt/idmap.h +++ b/include/adt/idmap.h @@ -39,8 +39,9 @@ int idmap_get(const struct idmap *m, fsm_state_t state_id, size_t buf_size, unsigned *buf, size_t *written); -/* Iterator callback. */ -typedef void +/* Iterator callback. + * The return value indicates whether iteration should continue. */ +typedef int idmap_iter_fun(fsm_state_t state_id, unsigned value, void *opaque); /* Iterate over the ID map. State IDs may be yielded out of order, diff --git a/include/fsm/capture.h b/include/fsm/capture.h index d3c1aaa54..0fd418925 100644 --- a/include/fsm/capture.h +++ b/include/fsm/capture.h @@ -23,41 +23,32 @@ struct fsm_capture { size_t pos[2]; }; -/* How many captures does the FSM use? */ +/* What is the max capture ID an FSM uses? */ unsigned -fsm_countcaptures(const struct fsm *fsm); +fsm_capture_ceiling(const struct fsm *fsm); /* Does a specific state have any capture actions? */ int fsm_capture_has_capture_actions(const struct fsm *fsm, fsm_state_t state); -/* Set a capture path on an FSM. This means that during matching, the - * portion of a match between the path's START and END states will be - * captured. As the FSM is transformed (determinisation, minimisation, - * unioning, etc.), the path will be converted to refer to the pair(s) - * of new states instead. If the path's END state is no longer reachable - * from its START state, then the capture path will be ignored. - * Multiple instances of the same capture_id and path are ignored. */ -int -fsm_capture_set_path(struct fsm *fsm, unsigned capture_id, - fsm_state_t start, fsm_state_t end); - -/* Increase the base capture ID for all captures in an fsm. - * This could be used before combining multiple FSMs -- for - * example, before unioning a and b, where a has 3 captures - * and b has 2, b may be rebase'd to 3 -- so a has captures - * 0-2 and b has 3-4. */ -void -fsm_capture_rebase_capture_id(struct fsm *fsm, unsigned base); +/* Allocate a capture buffer with enough space for + * the current FSM's captures. + * + * This is provided for convenience -- the necessary array + * count can be checked with fsm_capture_ceiling, and then + * the buffer can be allocated directly. */ +struct fsm_capture * +fsm_capture_alloc_capture_buffer(const struct fsm *fsm); -/* Same, but for capture action states. */ +/* Free a capture buffer. */ void -fsm_capture_rebase_capture_action_states(struct fsm *fsm, fsm_state_t base); +fsm_capture_free_capture_buffer(const struct fsm *fsm, struct fsm_capture *capture_buffer); -/* Allocate a capture buffer with enough space for - * the current FSM's captures. */ -struct fsm_capture * -fsm_capture_alloc(const struct fsm *fsm); +/* Note that a capture is active for a particular end state. + * Using this for a non-end state is an unchecked error. 
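 * Returns 1 on success, or 0 on allocation failure (this reports
 * the result of the underlying idmap_set call).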
*/ +int +fsm_capture_set_active_for_end(struct fsm *fsm, + unsigned capture_id, fsm_state_t end_state); #ifndef NDEBUG #include diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index d8e99d9a3..30e2f3f91 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -190,6 +190,10 @@ fsm_findmode(const struct fsm *fsm, fsm_state_t state, unsigned int *freq); void fsm_setend(struct fsm *fsm, fsm_state_t state, int end); +/* Associate a numeric ID with a single end state in an FSM. */ +int +fsm_setendid_state(struct fsm *fsm, fsm_state_t s, fsm_end_id_t id); + /* Associate a numeric ID with the end states in an fsm. * This can be used to track which of the original fsms matched * input when multiple fsms are combined. @@ -225,6 +229,17 @@ fsm_getendids(const struct fsm *fsm, fsm_state_t end_state, size_t fsm_getendidcount(const struct fsm *fsm, fsm_state_t end_state); +/* Callback for iterating over end IDs. + * Returns whether iteration should continue. */ +typedef int +fsm_iterendids_cb(const struct fsm *fsm, fsm_state_t end_state, + size_t nth, fsm_end_id_t id, void *opaque); + +/* Iterate over the end IDs associated with a state, if any. */ +void +fsm_iterendids(const struct fsm *fsm, fsm_state_t state, + fsm_iterendids_cb *cb, void *opaque); + /* * Find the state (if there is just one), or add epsilon edges from all states, * for which the given predicate is true. @@ -415,8 +430,21 @@ fsm_shortest(const struct fsm *fsm, * The given FSM is expected to be a DFA. */ int -fsm_exec(const struct fsm *fsm, int (*fsm_getc)(void *opaque), void *opaque, - fsm_state_t *end, struct fsm_capture *captures); +fsm_exec(const struct fsm *fsm, + int (*fsm_getc)(void *opaque), void *opaque, fsm_state_t *end); + +/* Same as fsm_exec, but also populate information about captures if + * captures is non-NULL and capture metadata is available for the DFA. + * The captures buffer is expected to be large enough to fit the FSM's + * captures. To check, use `fsm_capture_ceiling`. + * + * The current implementation requires all input to be buffered ahead of + * time, so this takes a pointer to an input array rather than a + * character iterator. */ +int +fsm_exec_with_captures(const struct fsm *fsm, const unsigned char *input, + size_t input_length, fsm_state_t *end, + struct fsm_capture *captures, size_t capture_buf_length); /* * Callbacks which may be passed to fsm_exec().
These are conveniences for diff --git a/include/re/re.h b/include/re/re.h index deab6caed..ab5f09b39 100644 --- a/include/re/re.h +++ b/include/re/re.h @@ -20,16 +20,18 @@ enum re_dialect { }; enum re_flags { - RE_ICASE = 1 << 0, - RE_TEXT = 1 << 1, - RE_MULTI = 1 << 2, - RE_REVERSE = 1 << 3, - RE_SINGLE = 1 << 4, /* aka PCRE_DOTALL */ - RE_ZONE = 1 << 5, - RE_ANCHORED = 1 << 6, - RE_EXTENDED = 1 << 7, /* PCRE extended mode */ - RE_END_NL = 1 << 8, /* end anchor matches '\n' */ - RE_FLAGS_NONE = 0 + RE_ICASE = 1 << 0, + RE_TEXT = 1 << 1, + RE_MULTI = 1 << 2, + RE_REVERSE = 1 << 3, + RE_SINGLE = 1 << 4, /* aka PCRE_DOTALL */ + RE_ZONE = 1 << 5, + RE_ANCHORED = 1 << 6, + RE_EXTENDED = 1 << 7, /* PCRE extended mode */ + RE_NOCAPTURE = 1 << 8, /* disable captures */ + RE_END_NL = 1 << 9, /* end anchor matches '\n' */ + RE_END_NL_DISABLE = 1 << 10, /* disable end anchor matching '\n' */ + RE_FLAGS_NONE = 0 }; #define RE_ANCHOR (RE_TEXT | RE_MULTI | RE_ZONE) @@ -46,6 +48,8 @@ enum re_errno { RE_EERRNO = 1 | RE_MISC, RE_EBADDIALECT = 2 | RE_MISC, RE_EBADGROUP = 3 | RE_MISC, + RE_EUNSUPCAPTUR = 4 | RE_MISC, + RE_EUNSUPPPCRE = 5 | RE_MISC, RE_ENEGRANGE = 0 | RE_MARK | RE_GROUP, RE_ENEGCOUNT = 1 | RE_MARK | RE_GROUP, diff --git a/src/adt/Makefile b/src/adt/Makefile index 64ad7429f..6fae4e7ca 100644 --- a/src/adt/Makefile +++ b/src/adt/Makefile @@ -20,12 +20,10 @@ CFLAGS.${src} += -I src # XXX: for internal.h DFLAGS.${src} += -I src # XXX: for internal.h .endfor -# not all concrete set interfaces use all static functions from set.inc -.if ${CC:T:Mgcc*} || ${CC:T:Mclang*} -.for src in ${SRC:Msrc/adt/stateset.c} ${SRC:Msrc/adt/tupleset.c} ${SRC:Msrc/adt/edgeset.c} -CFLAGS.${src} += -Wno-unused-function +.for src in ${SRC:Msrc/adt/siphash.c} ${SRC:Msrc/adt/edgeset.c} ${SRC:Msrc/adt/idmap.c} ${SRC:Msrc/adt/ipriq.c} ${SRC:Msrc/adt/internedstateset.c} +CFLAGS.${src} += -std=c99 # XXX: for internal.h +DFLAGS.${src} += -std=c99 # XXX: for internal.h .endfor -.endif # I want to assert on things which are currently true for this platform, # but not true in general. 
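To make the shape of the new capture API concrete, here is a minimal usage sketch. It is not part of the patch: it assumes a determinised FSM that already carries capture metadata, the usual libfsm convention of fsm_exec-style functions returning 1 on match, and that FSM_CAPTURE_NO_POS marks capture slots that were not written.

    #include <stddef.h>
    #include <stdio.h>

    #include <fsm/fsm.h>
    #include <fsm/capture.h>

    /* Match `input` against `dfa` and print any resolved captures.
     * `dfa` is assumed to be a DFA with capture metadata attached. */
    static int
    match_with_captures(const struct fsm *dfa,
        const unsigned char *input, size_t length)
    {
        fsm_state_t end;
        const size_t count = fsm_capture_ceiling(dfa);
        struct fsm_capture *buf = fsm_capture_alloc_capture_buffer(dfa);
        if (buf == NULL) { return -1; }

        const int e = fsm_exec_with_captures(dfa, input, length,
            &end, buf, count);
        if (e == 1) {            /* match */
            for (size_t i = 0; i < count; i++) {
                if (buf[i].pos[0] == FSM_CAPTURE_NO_POS) {
                    continue;    /* capture i was not part of the match */
                }
                printf("capture %zu: [%zu, %zu]\n",
                    i, buf[i].pos[0], buf[i].pos[1]);
            }
        }

        fsm_capture_free_capture_buffer(dfa, buf);
        return e;
    }

The buffer helpers are only a convenience over fsm_capture_ceiling plus the FSM's allocator, so (as the header comment above notes) callers that manage their own memory can size and allocate the array directly.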
diff --git a/src/fsm/main.c b/src/fsm/main.c index 0f97a9a81..0873b241f 100644 --- a/src/fsm/main.c +++ b/src/fsm/main.c @@ -646,7 +646,7 @@ main(int argc, char *argv[]) f = xopen(argv[0]); - e = fsm_exec(fsm, fsm_fgetc, f, &state, NULL); + e = fsm_exec(fsm, fsm_fgetc, f, &state); fclose(f); } else { @@ -654,7 +654,7 @@ main(int argc, char *argv[]) s = argv[i]; - e = fsm_exec(fsm, fsm_sgetc, &s, &state, NULL); + e = fsm_exec(fsm, fsm_sgetc, &s, &state); } if (e != 1) { diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile index 9af51a5a4..bfa8e67db 100644 --- a/src/libfsm/Makefile +++ b/src/libfsm/Makefile @@ -1,6 +1,8 @@ .include "../../share/mk/top.mk" SRC += src/libfsm/capture.c +SRC += src/libfsm/capture_vm.c +SRC += src/libfsm/capture_vm_exec.c SRC += src/libfsm/collate.c SRC += src/libfsm/complete.c SRC += src/libfsm/consolidate.c diff --git a/src/libfsm/capture.c b/src/libfsm/capture.c index 806bb3b12..21f32d06e 100644 --- a/src/libfsm/capture.c +++ b/src/libfsm/capture.c @@ -6,31 +6,82 @@ #include -#include "capture_internal.h" +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "capture.h" +#include "capture_vm_program.h" +#include "capture_log.h" +#include "capture_vm.h" +#include "endids.h" + +#define DEF_PROGRAMS_CEIL 4 + +struct fsm_capture_info { + unsigned max_capture_id; + + /* For particular end states, which captures are active? */ + struct idmap *end_capture_map; + + /* Set of capture resolution programs associated with specific + * end states. */ + struct capvm_program_set { + uint32_t ceil; + uint32_t used; + struct capvm_program **set; + } programs; + + /* For particular end states, which capture programs are + * associated with them?
*/ + struct idmap *end_capvm_program_map; +}; int fsm_capture_init(struct fsm *fsm) { struct fsm_capture_info *ci = NULL; - size_t i; + struct idmap *end_capture_map = NULL; + struct idmap *end_capvm_program_map = NULL; ci = f_calloc(fsm->opt->alloc, 1, sizeof(*ci)); if (ci == NULL) { goto cleanup; } - fsm->capture_info = ci; + end_capture_map = idmap_new(fsm->opt->alloc); + if (end_capture_map == NULL) { + goto cleanup; + } + ci->end_capture_map = end_capture_map; - for (i = 0; i < fsm->statealloc; i++) { - fsm->states[i].has_capture_actions = 0; + end_capvm_program_map = idmap_new(fsm->opt->alloc); + if (end_capvm_program_map == NULL) { + goto cleanup; } + ci->end_capvm_program_map = end_capvm_program_map; + + fsm->capture_info = ci; return 1; cleanup: - if (ci != NULL) { - f_free(fsm->opt->alloc, ci); - } + f_free(fsm->opt->alloc, ci); + idmap_free(end_capture_map); + idmap_free(end_capvm_program_map); return 0; } @@ -41,799 +92,570 @@ fsm_capture_free(struct fsm *fsm) if (ci == NULL) { return; } - f_free(fsm->opt->alloc, ci->buckets); + + idmap_free(ci->end_capture_map); + idmap_free(ci->end_capvm_program_map); + + for (size_t p_i = 0; p_i < ci->programs.used; p_i++) { + fsm_capvm_program_free(fsm->opt->alloc, ci->programs.set[p_i]); + } + f_free(fsm->opt->alloc, ci->programs.set); + f_free(fsm->opt->alloc, ci); fsm->capture_info = NULL; } unsigned -fsm_countcaptures(const struct fsm *fsm) +fsm_capture_ceiling(const struct fsm *fsm) { - (void)fsm; if (fsm->capture_info == NULL) { return 0; } - if (fsm->capture_info->buckets_used == 0) { - return 0; - } - /* check actual */ #if EXPENSIVE_CHECKS - { - struct fsm_capture_info *ci = fsm->capture_info; - size_t i; - for (i = 0; i < ci->bucket_count; i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[i]; - if (b->state == CAPTURE_NO_STATE) { /* empty */ - continue; - } - assert(ci->max_capture_id >= b->action.id); + /* check actual */ + unsigned res = 0; + for (size_t i = 0; i < fsm->capture_info->programs.used; i++) { + const unsigned id = fsm_capvm_program_get_max_capture_id(fsm->capture_info->programs.set[i]); + if (id > res) { + res = id; } } + assert(res == fsm->capture_info->max_capture_id); #endif return fsm->capture_info->max_capture_id + 1; } +struct fsm_capture * +fsm_capture_alloc_capture_buffer(const struct fsm *fsm) +{ + assert(fsm != NULL); + const size_t len = fsm_capture_ceiling(fsm); + struct fsm_capture *res = f_malloc(fsm->opt->alloc, + len * sizeof(res[0])); + return res; +} + +void +fsm_capture_free_capture_buffer(const struct fsm *fsm, + struct fsm_capture *capture_buffer) +{ + assert(fsm != NULL); + f_free(fsm->opt->alloc, capture_buffer); +} + + int fsm_capture_has_captures(const struct fsm *fsm) { return fsm->capture_info - ? fsm->capture_info->buckets_used > 0 + ? 
fsm->capture_info->programs.used > 0 : 0; } -int -fsm_capture_has_capture_actions(const struct fsm *fsm, fsm_state_t state) +void +fsm_capture_dump_programs(FILE *f, const struct fsm *fsm) { - assert(state < fsm->statecount); - return fsm->states[state].has_capture_actions; + fprintf(f, "\n==== %s:\n", __func__); + struct fsm_capture_info *ci = fsm->capture_info; + for (uint32_t i = 0; i < ci->programs.used; i++) { + const struct capvm_program *p = ci->programs.set[i]; + fprintf(f, "# program %u, capture_count %u, base %u\n", + i, p->capture_count, p->capture_base); + fsm_capvm_program_dump(f, p); + fprintf(f, "\n"); + } } int -fsm_capture_set_path(struct fsm *fsm, unsigned capture_id, - fsm_state_t start, fsm_state_t end) +fsm_capture_set_active_for_end(struct fsm *fsm, + unsigned capture_id, fsm_state_t end_state) { - struct fsm_capture_info *ci; - struct capture_set_path_env env; - size_t seen_words; - int res = 0; - - assert(fsm != NULL); - assert(start < fsm->statecount); - assert(end < fsm->statecount); - - ci = fsm->capture_info; + struct fsm_capture_info *ci = fsm->capture_info; assert(ci != NULL); + struct idmap *m = ci->end_capture_map; + assert(m != NULL); - /* captures should no longer be stored as paths -- instead, set - * the info on the states _here_, and convert it as necessary. */ - -#if LOG_CAPTURE > 0 - fprintf(stderr, "fsm_capture_set_path: capture %u: <%u, %u>\n", - capture_id, start, end); -#endif - - if (capture_id > FSM_CAPTURE_MAX) { - return 0; /* ID out of range */ - } - - if (!init_capture_action_htab(fsm, ci)) { - return 0; - } - - /* This will create a trail and do a depth-first search from the - * start state, marking every unique path to the end state. */ - env.fsm = fsm; - env.capture_id = capture_id; - env.start = start; - env.end = end; - - env.trail_ceil = 0; - env.trail = NULL; - env.seen = NULL; + #if EXPENSIVE_CHECKS + assert(fsm_isend(fsm, end_state)); + #endif - env.trail = f_malloc(fsm->opt->alloc, - DEF_TRAIL_CEIL * sizeof(env.trail[0])); - if (env.trail == NULL) { - goto cleanup; - } - env.trail_ceil = DEF_TRAIL_CEIL; - - seen_words = fsm->statecount/64 + 1; - env.seen = f_malloc(fsm->opt->alloc, - seen_words * sizeof(env.seen[0])); + return idmap_set(m, end_state, capture_id); +} - if (!mark_capture_path(&env)) { - goto cleanup; - } +void +fsm_capture_iter_active_for_end_state(const struct fsm *fsm, fsm_state_t state, + fsm_capture_iter_active_for_end_cb *cb, void *opaque) +{ + /* These types should be the same. */ + idmap_iter_fun *idmap_cb = cb; + idmap_iter_for_state(fsm->capture_info->end_capture_map, state, + idmap_cb, opaque); +} - if (capture_id >= ci->max_capture_id) { - ci->max_capture_id = capture_id; - } +void +fsm_capture_iter_active_for_all_end_states(const struct fsm *fsm, + fsm_capture_iter_active_for_end_cb *cb, void *opaque) +{ + /* These types should be the same. */ + idmap_iter_fun *idmap_cb = cb; + idmap_iter(fsm->capture_info->end_capture_map, + idmap_cb, opaque); +} - res = 1; - /* fall through */ +void +fsm_capture_iter_program_ids_for_end_state(const struct fsm *fsm, fsm_state_t state, + fsm_capture_iter_program_ids_for_end_state_cb *cb, void *opaque) +{ + /* These types should be the same. 
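	 * Both take (fsm_state_t, unsigned, void *) and return int, so the
	 * function pointer conversion is well-defined.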
*/ + idmap_iter_fun *idmap_cb = cb; + idmap_iter_for_state(fsm->capture_info->end_capvm_program_map, state, + idmap_cb, opaque); +} -cleanup: - f_free(fsm->opt->alloc, env.trail); - f_free(fsm->opt->alloc, env.seen); - return res; +void +fsm_capture_iter_program_ids_for_all_end_states(const struct fsm *fsm, + fsm_capture_iter_program_ids_for_end_state_cb *cb, void *opaque) +{ + /* These types should be the same. */ + idmap_iter_fun *idmap_cb = cb; + idmap_iter(fsm->capture_info->end_capvm_program_map, + idmap_cb, opaque); } static int -init_capture_action_htab(struct fsm *fsm, struct fsm_capture_info *ci) +dump_active_for_ends_cb(fsm_state_t state_id, unsigned value, void *opaque) { - size_t count, i; - assert(fsm != NULL); - assert(ci != NULL); + FILE *f = opaque; + fprintf(f, " -- state %d: value %u\n", state_id, value); + return 1; +} - if (ci->bucket_count > 0) { - assert(ci->buckets != NULL); - return 1; /* done */ - } +void +fsm_capture_dump_active_for_ends(FILE *f, const struct fsm *fsm) +{ + fprintf(f, "%s:\n", __func__); + idmap_iter(fsm->capture_info->end_capture_map, dump_active_for_ends_cb, f); +} - assert(ci->buckets == NULL); - assert(ci->buckets_used == 0); +void +fsm_capture_dump_program_end_mapping(FILE *f, const struct fsm *fsm) +{ + fprintf(f, "%s:\n", __func__); + idmap_iter(fsm->capture_info->end_capvm_program_map, dump_active_for_ends_cb, f); +} - count = DEF_CAPTURE_ACTION_BUCKET_COUNT; - ci->buckets = f_malloc(fsm->opt->alloc, - count * sizeof(ci->buckets[0])); - if (ci->buckets == NULL) { - return 0; - } +/* Dump capture metadata about an FSM. */ +void +fsm_capture_dump(FILE *f, const char *tag, const struct fsm *fsm) +{ + struct fsm_capture_info *ci; - /* Init buckets to CAPTURE_NO_STATE -> empty. */ - for (i = 0; i < count; i++) { - ci->buckets[i].state = CAPTURE_NO_STATE; + assert(fsm != NULL); + ci = fsm->capture_info; + if (ci == NULL) { + fprintf(f, "==== %s -- no captures\n", tag); + return; } - ci->bucket_count = count; - return 1; + fsm_endid_dump(f, fsm); + fsm_capture_dump_active_for_ends(f, fsm); + fsm_capture_dump_programs(f, fsm); + fsm_capture_dump_program_end_mapping(f, fsm); } +struct carry_active_captures_env { + fsm_state_t dst; + struct idmap *dst_m; + int ok; +}; + static int -mark_capture_path(struct capture_set_path_env *env) +copy_active_captures_cb(fsm_state_t state_id, unsigned value, void *opaque) { - const size_t seen_words = env->fsm->statecount/64 + 1; - -#if LOG_CAPTURE > 0 - fprintf(stderr, "mark_capture_path: path [id %u, %u - %u]\n", - env->capture_id, env->start, env->end); -#endif + (void)state_id; - if (env->start == env->end) { - struct fsm_capture_action action; - action.type = CAPTURE_ACTION_COMMIT_ZERO_STEP; - action.id = env->capture_id; - action.to = CAPTURE_NO_STATE; - if (!add_capture_action(env->fsm, env->fsm->capture_info, - env->start, &action)) { - return 0; - } - return 1; - } - - memset(env->seen, 0x00, - seen_words * sizeof(env->seen[0])); - - /* initialize to starting node */ - env->trail_i = 1; - env->trail[0].state = env->start; - env->trail[0].step = TRAIL_STEP_START; - env->trail[0].has_self_edge = 0; - - while (env->trail_i > 0) { - const enum trail_step step = env->trail[env->trail_i - 1].step; -#if LOG_CAPTURE > 0 - fprintf(stderr, "mark_capture_path: trail %u/%u, cur %u, step %d\n", - env->trail_i, env->trail_ceil, - env->trail[env->trail_i - 1].state, - step); -#endif - - switch (step) { - case TRAIL_STEP_START: - if (!step_trail_start(env)) { - return 0; - } - break; - case TRAIL_STEP_ITER_EDGES: - if 
(!step_trail_iter_edges(env)) { - return 0; - } - break; - case TRAIL_STEP_ITER_EPSILONS: - if (!step_trail_iter_epsilons(env)) { - return 0; - } - break; - case TRAIL_STEP_DONE: - if (!step_trail_done(env)) { - return 0; - } - break; - default: - assert(!"match fail"); - } + struct carry_active_captures_env *env = opaque; + if (!idmap_set(env->dst_m, env->dst, value)) { + env->ok = false; + return 0; } - return 1; } static int -cmp_action(const struct fsm_capture_action *a, - const struct fsm_capture_action *b) { - /* could use memcmp here, provided padding is always zeroed. */ - return a->id < b->id ? -1 - : a->id > b->id ? 1 - : a->type < b->type ? -1 - : a->type > b->type ? 1 - : a->to < b->to ? -1 - : a->to > b->to ? 1 - : 0; -} - -int -fsm_capture_add_action(struct fsm *fsm, - fsm_state_t state, enum capture_action_type type, - unsigned id, fsm_state_t to) +copy_program_associations_cb(fsm_state_t state_id, unsigned value, void *opaque) { - struct fsm_capture_action action; - assert(fsm->capture_info != NULL); + (void)state_id; - action.type = type; - action.id = id; - action.to = to; - return add_capture_action(fsm, fsm->capture_info, - state, &action); + struct carry_active_captures_env *env = opaque; + if (!idmap_set(env->dst_m, env->dst, value)) { + env->ok = false; + return 0; + } + return 1; } -static int -add_capture_action(struct fsm *fsm, struct fsm_capture_info *ci, - fsm_state_t state, const struct fsm_capture_action *action) +int +fsm_capture_copy_active_for_ends(const struct fsm *src_fsm, + const struct state_set *states, + struct fsm *dst_fsm, fsm_state_t dst_state) { - uint64_t h; - size_t b_i, mask; - - assert(state < fsm->statecount); - assert(action->to == CAPTURE_NO_STATE || action->to < fsm->statecount); - -#if LOG_CAPTURE > 0 - fprintf(stderr, "add_capture_action: state %u, type %s, ID %u, TO %d\n", - state, fsm_capture_action_type_name[action->type], - action->id, action->to); -#endif - - if (ci->bucket_count == 0) { - if (!init_capture_action_htab(fsm, ci)) { - return 0; - } - } else if (ci->buckets_used >= ci->bucket_count/2) { /* grow */ - if (!grow_capture_action_buckets(fsm->opt->alloc, ci)) { - return 0; + struct state_iter it; + fsm_state_t s; + + assert(src_fsm != NULL); + assert(src_fsm->capture_info != NULL); + assert(src_fsm->capture_info->end_capture_map != NULL); + assert(dst_fsm != NULL); + assert(dst_fsm->capture_info != NULL); + assert(dst_fsm->capture_info->end_capture_map != NULL); + struct idmap *src_m = src_fsm->capture_info->end_capture_map; + struct idmap *dst_m = dst_fsm->capture_info->end_capture_map; + + struct carry_active_captures_env env = { + .dst_m = dst_m, + .dst = dst_state, + .ok = true, + }; + + state_set_reset(states, &it); + while (state_set_next(&it, &s)) { + if (!fsm_isend(src_fsm, s)) { + continue; } - } - h = hash_id(state); - mask = ci->bucket_count - 1; - - for (b_i = 0; b_i < ci->bucket_count; b_i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[(h + b_i) & mask]; - if (b->state == CAPTURE_NO_STATE) { /* empty */ - b->state = state; - memcpy(&b->action, action, sizeof(*action)); - ci->buckets_used++; - fsm->states[state].has_capture_actions = 1; - if (action->id > ci->max_capture_id) { - ci->max_capture_id = action->id; - } - return 1; - } else if (b->state == state && - 0 == cmp_action(action, &b->action)) { - /* already present, ignore duplicate */ - assert(fsm->states[state].has_capture_actions); - assert(ci->max_capture_id >= action->id); - return 1; - } else { - continue; /* skip past collision */ + 
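		/* s is an end state in src_fsm: copy each of its active
		 * capture IDs over to dst_state in the destination FSM. */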
idmap_iter_for_state(src_m, s, copy_active_captures_cb, &env); + if (!env.ok) { + goto cleanup; } } - assert(!"unreachable"); - return 0; +cleanup: + return env.ok; } -static int -grow_capture_action_buckets(const struct fsm_alloc *alloc, - struct fsm_capture_info *ci) +int +fsm_capture_copy_program_end_state_associations(const struct fsm *src_fsm, + const struct state_set *states, + struct fsm *dst_fsm, fsm_state_t dst_state) { - const size_t ncount = 2 * ci->bucket_count; - struct fsm_capture_action_bucket *nbuckets; - size_t nused = 0; - size_t i; + struct state_iter it; + fsm_state_t s; + + assert(src_fsm != NULL); + assert(src_fsm->capture_info != NULL); + assert(src_fsm->capture_info->end_capvm_program_map != NULL); + assert(dst_fsm != NULL); + assert(dst_fsm->capture_info != NULL); + assert(dst_fsm->capture_info->end_capvm_program_map != NULL); + struct idmap *src_m = src_fsm->capture_info->end_capvm_program_map; + struct idmap *dst_m = dst_fsm->capture_info->end_capvm_program_map; + + struct carry_active_captures_env env = { + .dst_m = dst_m, + .dst = dst_state, + .ok = true, + }; + + state_set_reset(states, &it); + while (state_set_next(&it, &s)) { + if (!fsm_isend(src_fsm, s)) { + continue; + } - assert(ncount != 0); - nbuckets = f_malloc(alloc, ncount * sizeof(nbuckets[0])); - if (nbuckets == NULL) { - return 0; - } + LOG(5 - LOG_CAPTURE_COMBINING_ANALYSIS, + "%s: dst_state %d, state_set_next => %d\n", + __func__, dst_state, s); - for (i = 0; i < ncount; i++) { - nbuckets[i].state = CAPTURE_NO_STATE; + idmap_iter_for_state(src_m, s, copy_program_associations_cb, &env); + if (!env.ok) { + goto cleanup; + } } - for (i = 0; i < ci->bucket_count; i++) { - const struct fsm_capture_action_bucket *src_b = &ci->buckets[i]; - uint64_t h; - const size_t mask = ncount - 1; - size_t b_i; +cleanup: + return env.ok; +} - if (src_b->state == CAPTURE_NO_STATE) { - continue; +int +fsm_capture_copy_programs(const struct fsm *src_fsm, + struct fsm *dst_fsm) +{ + const struct fsm_alloc *alloc = src_fsm->opt->alloc; + assert(alloc == dst_fsm->opt->alloc); + const struct fsm_capture_info *src_ci = src_fsm->capture_info; + + for (uint32_t p_i = 0; p_i < src_ci->programs.used; p_i++) { + const struct capvm_program *p = src_ci->programs.set[p_i]; + struct capvm_program *cp = capvm_program_copy(alloc, p); + if (cp == NULL) { + return 0; } - h = hash_id(src_b->state); - for (b_i = 0; b_i < ncount; b_i++) { - struct fsm_capture_action_bucket *dst_b; - dst_b = &nbuckets[(h + b_i) & mask]; - if (dst_b->state == CAPTURE_NO_STATE) { - memcpy(dst_b, src_b, sizeof(*src_b)); - nused++; - break; - } else { - continue; - } + /* unused: because this is an in-order copy, it's assumed + * the programs will retain their order. 
*/ + uint32_t prog_id; + if (!fsm_capture_add_program(dst_fsm, cp, &prog_id)) { + return 0; } } - - assert(nused == ci->buckets_used); - f_free(alloc, ci->buckets); - ci->buckets = nbuckets; - ci->bucket_count = ncount; return 1; } -static int -grow_trail(struct capture_set_path_env *env) +size_t +fsm_capture_program_count(const struct fsm *fsm) { - struct trail_cell *ntrail; - unsigned nceil; - assert(env != NULL); - - nceil = 2 * env->trail_ceil; - assert(nceil > env->trail_ceil); + return fsm->capture_info->programs.used; +} - ntrail = f_realloc(env->fsm->opt->alloc, env->trail, - nceil * sizeof(env->trail[0])); - if (ntrail == NULL) { - return 0; - } +struct check_program_mappings_env { + const struct fsm *fsm; +}; - env->trail = ntrail; - env->trail_ceil = nceil; +static int +check_program_mappings_cb(fsm_state_t state_id, unsigned value, void *opaque) +{ + const uint32_t prog_id = (uint32_t)value; + struct check_program_mappings_env *env = opaque; + assert(state_id < env->fsm->statecount); + assert(prog_id < env->fsm->capture_info->programs.used); return 1; } -static int -step_trail_start(struct capture_set_path_env *env) -{ - struct trail_cell *tc = &env->trail[env->trail_i - 1]; - const fsm_state_t cur = tc->state; - size_t i; - struct edge_set *edge_set = NULL; - - /* check if node is endpoint, if so mark trail, - * then pop trail and continue */ - if (cur == env->end) { - struct fsm_capture_action action; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- GOT END at %u\n", cur); -#endif - action.id = env->capture_id; +void +fsm_capture_integrity_check(const struct fsm *fsm) +{ + if (!EXPENSIVE_CHECKS) { return; } - for (i = 0; i < env->trail_i; i++) { - fsm_state_t state = env->trail[i].state; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- %lu: %d\n", - i, state); -#endif + /* check that all program mappings are in range */ + struct check_program_mappings_env env = { + .fsm = fsm, + }; + idmap_iter(fsm->capture_info->end_capvm_program_map, check_program_mappings_cb, &env); +} - /* Special case: if this is marked as having - * a self-edge on the path, then also add an - * extend for that. */ - if (env->trail[i].has_self_edge) { - struct fsm_capture_action self_action; - self_action.type = CAPTURE_ACTION_EXTEND; - self_action.id = env->capture_id; - self_action.to = state; - - if (!add_capture_action(env->fsm, - env->fsm->capture_info, - state, &self_action)) { - return 0; - } - } - - - if (i == 0) { - action.type = CAPTURE_ACTION_START; - } else { - action.type = (i < env->trail_i - 1 - ? 
CAPTURE_ACTION_EXTEND - : CAPTURE_ACTION_COMMIT); - } - - if (i < env->trail_i - 1) { - action.to = env->trail[i + 1].state; - } else { - action.to = CAPTURE_NO_STATE; - } - - if (!add_capture_action(env->fsm, - env->fsm->capture_info, - state, &action)) { - return 0; - } - } +struct capture_idmap_compact_env { + int ok; + struct idmap *dst; + const fsm_state_t *mapping; + size_t orig_statecount; +}; - tc->step = TRAIL_STEP_DONE; - return 1; - } +static int +copy_with_mapping_cb(fsm_state_t state_id, unsigned value, void *opaque) +{ + fsm_state_t dst_id; + struct capture_idmap_compact_env *env = opaque; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- resetting edge iterator\n"); -#endif - edge_set = env->fsm->states[cur].edges; + assert(state_id < env->orig_statecount); + dst_id = env->mapping[state_id]; - MARK_SEEN(env, cur); -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- marking %u as seen\n", cur); -#endif + if (dst_id == FSM_STATE_REMAP_NO_STATE) { + return 1; /* discard */ + } + + if (!idmap_set(env->dst, dst_id, value)) { + env->ok = 0; + return 0; + } - edge_set_reset(edge_set, &tc->iter); - tc->step = TRAIL_STEP_ITER_EDGES; return 1; } -static int -step_trail_iter_edges(struct capture_set_path_env *env) +int +fsm_capture_id_compact(struct fsm *fsm, const fsm_state_t *mapping, + size_t orig_statecount) { - struct trail_cell *tc = &env->trail[env->trail_i - 1]; - struct trail_cell *next_tc = NULL; - - struct fsm_edge e; + struct capture_idmap_compact_env env; + struct idmap *old_idmap = fsm->capture_info->end_capture_map; + struct idmap *new_idmap = idmap_new(fsm->opt->alloc); - if (!edge_set_next(&tc->iter, &e)) { -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- ITER_EDGE_NEXT: DONE %u\n", tc->state); -#endif - tc->step = TRAIL_STEP_ITER_EPSILONS; - return 1; + if (new_idmap == NULL) { + return 0; } -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- ITER_EDGE_NEXT: %u -- NEXT %u\n", - tc->state, e.state); -#endif + env.ok = 1; + env.dst = new_idmap; + env.mapping = mapping; + env.orig_statecount = orig_statecount; - if (tc->state == e.state) { -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- special case, self-edge\n"); -#endif - /* Mark this state as having a self-edge, then continue - * the iterator. An EXTEND action will be added for the - * self-edge later, if necessary. 
*/ - tc->has_self_edge = 1; - return 1; - } else if (CHECK_SEEN(env, e.state)) { -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- seen, skipping\n"); -#endif - return 1; /* continue */ - } - - if (env->trail_i == env->trail_ceil) { - if (!grow_trail(env)) { - return 0; - } + idmap_iter(old_idmap, copy_with_mapping_cb, &env); + if (!env.ok) { + idmap_free(new_idmap); + return 0; } -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- marking %u as seen\n", e.state); -#endif - MARK_SEEN(env, e.state); + idmap_free(old_idmap); + fsm->capture_info->end_capture_map = new_idmap; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- not seen (%u), exploring\n", e.state); -#endif - env->trail_i++; - next_tc = &env->trail[env->trail_i - 1]; - next_tc->state = e.state; - next_tc->step = TRAIL_STEP_START; - next_tc->has_self_edge = 0; return 1; } -static int -step_trail_iter_epsilons(struct capture_set_path_env *env) +int +fsm_capture_program_association_compact(struct fsm *fsm, const fsm_state_t *mapping, + size_t orig_statecount) { - struct trail_cell *tc = &env->trail[env->trail_i - 1]; + struct capture_idmap_compact_env env; + struct idmap *old_idmap = fsm->capture_info->end_capvm_program_map; + struct idmap *new_idmap = idmap_new(fsm->opt->alloc); - /* skipping this for now */ - -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- ITER_EPSILONS: %u\n", tc->state); -#endif - - tc->step = TRAIL_STEP_DONE; - return 1; -} + if (new_idmap == NULL) { + return 0; + } -static int -step_trail_done(struct capture_set_path_env *env) -{ - struct trail_cell *tc; + env.ok = 1; + env.dst = new_idmap; + env.mapping = mapping; + env.orig_statecount = orig_statecount; - /* 0-step paths already handled outside loop */ - assert(env->trail_i > 0); + idmap_iter(old_idmap, copy_with_mapping_cb, &env); + if (!env.ok) { + idmap_free(new_idmap); + return 0; + } - tc = &env->trail[env->trail_i - 1]; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- DONE: %u\n", tc->state); -#endif - CLEAR_SEEN(env, tc->state); + idmap_free(old_idmap); + fsm->capture_info->end_capvm_program_map = new_idmap; - env->trail_i--; return 1; } void -fsm_capture_rebase_capture_id(struct fsm *fsm, unsigned base) +fsm_capture_update_max_capture_id(struct fsm_capture_info *ci, + unsigned capture_id) { - size_t i; - struct fsm_capture_info *ci = fsm->capture_info; assert(ci != NULL); - - for (i = 0; i < ci->bucket_count; i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[i]; - if (b->state == CAPTURE_NO_STATE) { - continue; - } - - b->action.id += base; - if (b->action.id > ci->max_capture_id) { - ci->max_capture_id = b->action.id; - } + if (capture_id >= ci->max_capture_id) { + ci->max_capture_id = capture_id; } } -void -fsm_capture_rebase_capture_action_states(struct fsm *fsm, fsm_state_t base) +int +fsm_capture_add_program(struct fsm *fsm, + struct capvm_program *program, uint32_t *prog_id) { - size_t i; + assert(program != NULL); + assert(prog_id != NULL); + struct fsm_capture_info *ci = fsm->capture_info; - assert(ci != NULL); - for (i = 0; i < ci->bucket_count; i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[i]; - if (b->state == CAPTURE_NO_STATE) { - continue; + if (ci->programs.used == ci->programs.ceil) { + const size_t nceil = (ci->programs.ceil == 0 + ? 
DEF_PROGRAMS_CEIL + : 2*ci->programs.ceil); + assert(nceil > ci->programs.ceil); + struct capvm_program **nset = f_realloc(fsm->opt->alloc, + ci->programs.set, nceil * sizeof(nset[0])); + if (nset == NULL) { + return 0; } - b->state += base; - if (b->action.to != CAPTURE_NO_STATE) { - b->action.to += base; - } + ci->programs.ceil = nceil; + ci->programs.set = nset; } -} + assert(ci->programs.used < ci->programs.ceil); -struct fsm_capture * -fsm_capture_alloc(const struct fsm *fsm) -{ - (void)fsm; - assert(!"todo"); - return NULL; -} - -void -fsm_capture_update_captures(const struct fsm *fsm, - fsm_state_t cur_state, fsm_state_t next_state, size_t offset, - struct fsm_capture *captures) -{ - const struct fsm_capture_info *ci; - uint64_t h; - size_t b_i, mask; - - assert(cur_state < fsm->statecount); - assert(fsm->states[cur_state].has_capture_actions); - - ci = fsm->capture_info; - assert(ci != NULL); - - h = hash_id(cur_state); - mask = ci->bucket_count - 1; - -#if LOG_CAPTURE > 0 - fprintf(stderr, "-- updating captures at state %u, to %d, offset %lu\n", - cur_state, next_state, offset); -#endif - - for (b_i = 0; b_i < ci->bucket_count; b_i++) { - const size_t b_id = (h + b_i) & mask; - struct fsm_capture_action_bucket *b = &ci->buckets[b_id]; - unsigned capture_id; - -#if LOG_CAPTURE > 3 - fprintf(stderr, " -- update_captures: bucket %lu, state %d\n", b_id, b->state); -#endif - - - if (b->state == CAPTURE_NO_STATE) { -#if LOG_CAPTURE > 3 - fprintf(stderr, " -- no more actions for this state\n"); -#endif - break; /* no more for this state */ - } else if (b->state != cur_state) { - continue; /* skip collision */ - } - - assert(b->state == cur_state); - capture_id = b->action.id; - - switch (b->action.type) { - case CAPTURE_ACTION_START: -#if LOG_CAPTURE > 0 - fprintf(stderr, "START [%u, %u]\n", - b->action.id, b->action.to); -#endif - if (next_state == b->action.to && captures[capture_id].pos[0] == FSM_CAPTURE_NO_POS) { - captures[capture_id].pos[0] = offset; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- set capture[%u].[0] to %lu\n", b->action.id, offset); -#endif - } else { - /* filtered, ignore */ - } - break; - case CAPTURE_ACTION_EXTEND: -#if LOG_CAPTURE > 0 - fprintf(stderr, "EXTEND [%u, %u]\n", - b->action.id, b->action.to); -#endif - if (captures[capture_id].pos[0] != FSM_CAPTURE_NO_POS - && (0 == (captures[capture_id].pos[1] & COMMITTED_CAPTURE_FLAG))) { - if (next_state == b->action.to) { - captures[capture_id].pos[1] = offset; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- set capture[%u].[1] to %lu\n", b->action.id, offset); -#endif - } else { - /* filtered, ignore */ - } - } - break; - case CAPTURE_ACTION_COMMIT_ZERO_STEP: -#if LOG_CAPTURE > 0 - fprintf(stderr, "COMMIT_ZERO_STEP [%u]\n", - b->action.id); -#endif - - if (captures[capture_id].pos[0] == FSM_CAPTURE_NO_POS) { - captures[capture_id].pos[0] = offset; - captures[capture_id].pos[1] = offset | COMMITTED_CAPTURE_FLAG; - } else { /* extend */ - captures[capture_id].pos[1] = offset | COMMITTED_CAPTURE_FLAG; - } - -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- set capture[%u].[0] and [1] to %lu (with COMMIT flag)\n", b->action.id, offset); -#endif - break; - case CAPTURE_ACTION_COMMIT: -#if LOG_CAPTURE > 0 - fprintf(stderr, "COMMIT [%u]\n", - b->action.id); -#endif - captures[capture_id].pos[1] = offset | COMMITTED_CAPTURE_FLAG; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- set capture[%u].[1] to %lu (with COMMIT flag)\n", b->action.id, offset); -#endif - break; - default: - assert(!"matchfail"); - } + const unsigned 
max_prog_capture_id = fsm_capvm_program_get_max_capture_id(program); + if (max_prog_capture_id > ci->max_capture_id) { + fsm_capture_update_max_capture_id(ci, max_prog_capture_id); } -} -void -fsm_capture_finalize_captures(const struct fsm *fsm, - size_t capture_count, struct fsm_capture *captures) -{ - size_t i; - - /* If either pos[] is FSM_CAPTURE_NO_POS or the - * COMMITTED_CAPTURE_FLAG isn't set on pos[1], then the capture - * wasn't finalized; clear it. Otherwise, clear that bit so the - * pos[1] offset is meaningful. */ - - /* FIXME: this should also take the end state(s) associated - * with a capture into account, when that information is available; - * otherwise there will be false positives for zero-width captures - * where the paths have a common prefix. */ - (void)fsm; - - for (i = 0; i < capture_count; i++) { -#if LOG_CAPTURE > 1 - fprintf(stderr, "finalize[%lu]: pos[0]: %ld, pos[1]: %ld\n", - i, captures[i].pos[0], captures[i].pos[1]); -#endif + *prog_id = ci->programs.used; + ci->programs.set[ci->programs.used] = program; + ci->programs.used++; + return 1; +} - if (captures[i].pos[0] == FSM_CAPTURE_NO_POS - || captures[i].pos[1] == FSM_CAPTURE_NO_POS - || (0 == (captures[i].pos[1] & COMMITTED_CAPTURE_FLAG))) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; -#if LOG_CAPTURE > 1 - fprintf(stderr, "finalize: discard %lu\n", i); -#endif - } else if (captures[i].pos[1] & COMMITTED_CAPTURE_FLAG) { - captures[i].pos[1] &=~ COMMITTED_CAPTURE_FLAG; - } +const struct capvm_program * +fsm_capture_get_program_by_id(const struct fsm *fsm, uint32_t prog_id) +{ + struct fsm_capture_info *ci = fsm->capture_info; + if (prog_id >= ci->programs.used) { + return NULL; } + return ci->programs.set[prog_id]; } -void -fsm_capture_action_iter(const struct fsm *fsm, - fsm_capture_action_iter_cb *cb, void *opaque) +int +fsm_capture_associate_program_with_end_state(struct fsm *fsm, + uint32_t prog_id, fsm_state_t end_state) { - size_t i; struct fsm_capture_info *ci = fsm->capture_info; - assert(ci != NULL); - - for (i = 0; i < ci->bucket_count; i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[i]; - if (b->state == CAPTURE_NO_STATE) { - continue; - } + assert(end_state < fsm->statecount); + assert(prog_id < ci->programs.used); - if (!cb(b->state, b->action.type, - b->action.id, b->action.to, opaque)) { - break; - } + if (!idmap_set(ci->end_capvm_program_map, end_state, prog_id)) { + return 0; } + return 1; } -const char *fsm_capture_action_type_name[] = { - "START", "EXTEND", - "COMMIT_ZERO_STEP", "COMMIT" +struct capture_resolve_env { + const struct fsm_capture_info *ci; + const unsigned char *input; + const size_t length; + + int res; + struct fsm_capture *captures; + size_t captures_len; }; static int -dump_iter_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) +exec_capvm_program_cb(fsm_state_t state_id, unsigned prog_id, void *opaque) { - FILE *f = opaque; - fprintf(f, " - state %u, %s [capture_id: %u, to: %d]\n", - state, fsm_capture_action_type_name[type], capture_id, to); + struct capture_resolve_env *env = opaque; + (void)state_id; + + /* TODO: idmap_iter could take a halt return value */ + if (env->res != 1) { return 0; } + + assert(prog_id < env->ci->programs.used); + struct capvm_program *p = env->ci->programs.set[prog_id]; + + LOG(5 - LOG_EVAL, "%s: evaluating prog_id %u for state %d\n", + __func__, prog_id, state_id); + +#define EXEC_COUNT 1 /* can be increased for benchmarking */ + 
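	/* Each iteration re-runs the same program over the same input and
	 * overwrites the same capture buffer, so values above 1 only affect
	 * timing measurements, not the result. */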
+ for (size_t i = 0; i < EXEC_COUNT; i++) { + const enum fsm_capvm_program_exec_res exec_res = + fsm_capvm_program_exec(p, + (const uint8_t *)env->input, env->length, + env->captures, env->captures_len); + if (exec_res != FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN) { + env->res = 0; + return 0; + } + } return 1; } -/* Dump capture metadata about an FSM. */ -void -fsm_capture_dump(FILE *f, const char *tag, const struct fsm *fsm) +int +fsm_capture_resolve_during_exec(const struct fsm *fsm, + fsm_state_t end_state, const unsigned char *input, size_t input_offset, + struct fsm_capture *captures, size_t captures_len) { - struct fsm_capture_info *ci; - assert(fsm != NULL); - ci = fsm->capture_info; - if (ci == NULL || ci->bucket_count == 0) { - fprintf(f, "==== %s -- no captures\n", tag); - return; - } - - fprintf(f, "==== %s -- capture action hash table (%u buckets)\n", - tag, ci->bucket_count); - fsm_capture_action_iter(fsm, dump_iter_cb, f); + assert(input != NULL); + assert(captures != NULL); + + const struct fsm_capture_info *ci = fsm->capture_info; + + struct capture_resolve_env capture_env = { + .res = 1, + .ci = ci, + .input = input, + .length = input_offset, + .captures = captures, + .captures_len = captures_len, + }; + + LOG(5 - LOG_EVAL, "%s: ended on state %d\n", + __func__, end_state); + idmap_iter_for_state(ci->end_capvm_program_map, + end_state, exec_capvm_program_cb, &capture_env); + + return capture_env.res; } diff --git a/src/libfsm/capture.h b/src/libfsm/capture.h index 4c0ba4722..16588060c 100644 --- a/src/libfsm/capture.h +++ b/src/libfsm/capture.h @@ -2,28 +2,26 @@ #define LIBFSM_CAPTURE_H #include +#include #include #include +#include -#define NEXT_STATE_END ((fsm_state_t)-1) - +/* Internal state IDs that are out of range for valid state IDs. + * + * CAPTURE_NO_STATE is used to represent the absence of a state, such as + * when remapping a state to a dead state (removing it) or empty + * hash table buckets. + * + * NEXT_STATE_END is used as a destination for capture actions that + * trigger when ending on a state. */ #define CAPTURE_NO_STATE ((fsm_state_t)-1) /* Capture interface -- functions internal to libfsm. * The public interface should not depend on any of these details. */ -enum capture_action_type { - /* Start an active capture if transitioning to TO. */ - CAPTURE_ACTION_START, - /* Continue an active capture if transitioning to TO, - * otherwise deactivate it. */ - CAPTURE_ACTION_EXTEND, - /* Write a zero-step capture (i.e., the start and - * end state are the same). */ - CAPTURE_ACTION_COMMIT_ZERO_STEP, - /* Write an active capture's endpoints. */ - CAPTURE_ACTION_COMMIT -}; +struct fsm_capture_info; +struct capvm_program; int fsm_capture_init(struct fsm *fsm); @@ -31,41 +29,110 @@ fsm_capture_init(struct fsm *fsm); void fsm_capture_free(struct fsm *fsm); +void +fsm_capture_dump_active_for_ends(FILE *f, const struct fsm *fsm); + +void +fsm_capture_dump_program_end_mapping(FILE *f, const struct fsm *fsm); + /* Does the FSM have captures? */ int fsm_capture_has_captures(const struct fsm *fsm); -/* Update captures, called when exiting or ending on a state. - * If ending on a state, use NEXT_STATE_END for next_state. 
*/ void -fsm_capture_update_captures(const struct fsm *fsm, - fsm_state_t cur_state, fsm_state_t next_state, size_t offset, - struct fsm_capture *captures); +fsm_capture_dump(FILE *f, const char *tag, const struct fsm *fsm); + +void +fsm_capture_dump_programs(FILE *f, const struct fsm *fsm); +/* If EXPENSIVE_CHECKS is non-zero, assert that all capture metadata on + * an FSM is internally consistent. */ void -fsm_capture_finalize_captures(const struct fsm *fsm, - size_t capture_count, struct fsm_capture *captures); +fsm_capture_integrity_check(const struct fsm *fsm); + +int +fsm_capture_id_compact(struct fsm *fsm, const fsm_state_t *mapping, + size_t orig_statecount); -/* Add a capture action. This is used to update capture actions - * in the destination FSM when combining/transforming other FSMs. */ int -fsm_capture_add_action(struct fsm *fsm, - fsm_state_t state, enum capture_action_type type, - unsigned id, fsm_state_t to); - -/* Callback for iterating over capture actions. - * Return 1 to continue, return 0 to halt. - * If TO is not meaningful for a particular type, it will be - * set to NEXT_STATE_END. */ +fsm_capture_program_association_compact(struct fsm *fsm, const fsm_state_t *mapping, + size_t orig_statecount); + +/* Iterator callback for capture IDs that are active for a particular + * end state. Returns whether iteration should continue. */ typedef int -fsm_capture_action_iter_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, +fsm_capture_iter_active_for_end_cb(fsm_state_t state, unsigned capture_id, void *opaque); void -fsm_capture_action_iter(const struct fsm *fsm, - fsm_capture_action_iter_cb *cb, void *opaque); +fsm_capture_iter_active_for_end_state(const struct fsm *fsm, fsm_state_t state, + fsm_capture_iter_active_for_end_cb *cb, void *opaque); + +void +fsm_capture_iter_active_for_all_end_states(const struct fsm *fsm, + fsm_capture_iter_active_for_end_cb *cb, void *opaque); -extern const char *fsm_capture_action_type_name[]; +/* Iterator callback for program IDs that are active for a particular + * end state. Returns whether iteration should continue. */ +typedef int +fsm_capture_iter_program_ids_for_end_state_cb(fsm_state_t state, unsigned prog_id, + void *opaque); + +void +fsm_capture_iter_program_ids_for_end_state(const struct fsm *fsm, fsm_state_t state, + fsm_capture_iter_program_ids_for_end_state_cb *cb, void *opaque); +void +fsm_capture_iter_program_ids_for_all_end_states(const struct fsm *fsm, + fsm_capture_iter_program_ids_for_end_state_cb *cb, void *opaque); + +/* TODO: combine/rename */ +int +fsm_capture_copy_active_for_ends(const struct fsm *src_fsm, + const struct state_set *states, + struct fsm *dst_fsm, fsm_state_t dst_state); +int +fsm_capture_copy_program_end_state_associations(const struct fsm *src_fsm, + const struct state_set *states, + struct fsm *dst_fsm, fsm_state_t dst_state); + +int +fsm_capture_copy_programs(const struct fsm *src_fsm, + struct fsm *dst_fsm); + +size_t +fsm_capture_program_count(const struct fsm *fsm); + +void +fsm_capture_update_max_capture_id(struct fsm_capture_info *ci, + unsigned capture_id); + +int +fsm_capture_add_program(struct fsm *fsm, + struct capvm_program *program, uint32_t *prog_id); + +const struct capvm_program * +fsm_capture_get_program_by_id(const struct fsm *fsm, uint32_t prog_id); + +int +fsm_capture_associate_program_with_end_state(struct fsm *fsm, + uint32_t prog_id, fsm_state_t end_state); + +/* Resolve captures. 
+ * + * FIXME: With the current implementation, if enough memory + * was passed in then it couldn't fail, but it may be worth + * changing the interface so that it doesn't assume there was + * already a successful match in order to support one-pass + * matching & capture resolution attempts from a stream. + * + * TODO: This should pass in a size for captures[]. + * TODO: An alternate interface that allows passing in + * preallocated buffers for working memory. + * + * TODO: describe return value. */ +int +fsm_capture_resolve_during_exec(const struct fsm *fsm, + fsm_state_t end_state, const unsigned char *input, size_t input_offset, + struct fsm_capture *captures, size_t captures_len); #endif diff --git a/src/libfsm/capture_internal.h b/src/libfsm/capture_internal.h deleted file mode 100644 index 70418b988..000000000 --- a/src/libfsm/capture_internal.h +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef CAPTURE_INTERNAL_H -#define CAPTURE_INTERNAL_H - -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "capture.h" - -/* Bucket count for capture action hash table. - * Must be a power of 2. */ - -#define DEF_CAPTURE_ACTION_BUCKET_COUNT 32 -#define DEF_TRAIL_CEIL 8 - -#define LOG_CAPTURE 0 - -/* Most significant bit of a size_t. */ -#define COMMITTED_CAPTURE_FLAG ((SIZE_MAX) ^ (SIZE_MAX >> 1)) - -struct fsm_capture_info { - unsigned max_capture_id; - - /* Add-only hash table. */ - unsigned bucket_count; - unsigned buckets_used; /* grow if >= 1/2 used */ - - /* Hash buckets. If state is CAPTURE_NO_STATE, - * the bucket is empty. */ - struct fsm_capture_action_bucket { - fsm_state_t state; /* key */ - struct fsm_capture_action { - enum capture_action_type type; - unsigned id; - /* only used by START and EXTEND */ - fsm_state_t to; - } action; - } *buckets; -}; - -enum trail_step { - TRAIL_STEP_START, - TRAIL_STEP_ITER_EDGES, - TRAIL_STEP_ITER_EPSILONS, - TRAIL_STEP_DONE -}; - -/* env->seen is used as a bit set for tracking which states have already - * been processed. These macros set/check/clear the bits. */ -#define SEEN_BITOP(ENV, STATE, OP) ENV->seen[STATE/64] OP ((uint64_t)1 << (STATE&63)) -#define MARK_SEEN(ENV, STATE) SEEN_BITOP(ENV, STATE, |=) -#define CHECK_SEEN(ENV, STATE) SEEN_BITOP(ENV, STATE, &) -#define CLEAR_SEEN(ENV, STATE) SEEN_BITOP(ENV, STATE, &=~) - -struct capture_set_path_env { - struct fsm *fsm; - unsigned capture_id; - fsm_state_t start; - fsm_state_t end; - - unsigned trail_i; - unsigned trail_ceil; - struct trail_cell { - fsm_state_t state; - enum trail_step step; - char has_self_edge; - struct edge_iter iter; - } *trail; - - /* bitset for which states have already been seen. 
*/ - uint64_t *seen; -}; - -static int -init_capture_action_htab(struct fsm *fsm, struct fsm_capture_info *ci); - -static int -mark_capture_path(struct capture_set_path_env *env); - -static int -add_capture_action(struct fsm *fsm, struct fsm_capture_info *ci, - fsm_state_t state, const struct fsm_capture_action *action); - -static int -grow_capture_action_buckets(const struct fsm_alloc *alloc, - struct fsm_capture_info *ci); - -static int -grow_trail(struct capture_set_path_env *env); - -static int -step_trail_start(struct capture_set_path_env *env); -static int -step_trail_iter_edges(struct capture_set_path_env *env); -static int -step_trail_iter_epsilons(struct capture_set_path_env *env); -static int -step_trail_done(struct capture_set_path_env *env); - -static int -cmp_action(const struct fsm_capture_action *a, - const struct fsm_capture_action *b); - -#endif diff --git a/src/libfsm/capture_log.h b/src/libfsm/capture_log.h new file mode 100644 index 000000000..c850460bb --- /dev/null +++ b/src/libfsm/capture_log.h @@ -0,0 +1,27 @@ +/* + * Copyright 2020 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#ifndef CAPTURE_LOG_H +#define CAPTURE_LOG_H + +/* Log levels */ +#define LOG_CAPTURE 0 +#define LOG_CAPTURE_COMBINING_ANALYSIS 0 +#define LOG_EVAL 0 +#define LOG_APPEND_ACTION 0 +#define LOG_PRINT_FSM 0 +#define LOG_MARK_PATH 0 + +#include + +#define LOG(LEVEL, ...) \ + do { \ + if ((LEVEL) <= LOG_CAPTURE) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while(0) + +#endif diff --git a/src/libfsm/capture_vm.c b/src/libfsm/capture_vm.c new file mode 100644 index 000000000..aacae7f9c --- /dev/null +++ b/src/libfsm/capture_vm.c @@ -0,0 +1,193 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +/* Virtual machine for resolving captures while executing regular + * expressions from a subset of PCRE. This is based on the approach + * described in Russ Cox's "Regular Expression Matching: the Virtual + * Machine Approach" (https://swtch.com/~rsc/regexp/regexp2.html), but + * has a couple major modifications, mainly to keep memory usage low and + * predictable, and to be more consistent (arguably, bug-compatible...) + * with PCRE's behavior for libfsm's supported subset of PCRE. + * + * Instead of giving each green thread its own copy of the capture + * buffers, which uses a prohibitive amount of memory when matching DFAs + * that combine several regexes with several captures each, operate in + * two passes. + * + * In the first pass, each thread keeps track of its execution path, + * appending a bit for each branch: 1 for the greedy option, 0 for the + * non-greedy. Since there can be at most one live thread per program + * instruction, and all of them are either on the current or next input + * character, there's a bounded window for diverging paths during execution. + * After a certain distance back all paths either have a common prefix + * or consist entirely of 0 bits (for continually looping at an unanchored + * start). The path bits are stored in chunks in a backwards linked list, + * so nodes for common path prefixes can be shared by multiple threads, + * and the prefix of all 0 bits is instead stored as a counter. This + * keeps memory usage substantially lower. This search runs threads in + * parallel, breadth-first, halting any threads that duplicate work of + * a greedier search path (since PCRE's results match the greediest). 
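 * (For example: with an unanchored pattern, a thread that loops past
 * 100 leading characters before matching stores that run as a single
 * zero-prefix counter of 100 plus a short suffix of explicit path
 * bits, rather than 100 individually stored bits.)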
+ * + * In the second pass, replay the execution path for just the single + * greediest thread, which represents the "best" match, and write + * capture offsets into buffers passed in by the caller. + * + * Most of the other differences have to do with matching PCRE's quirky + * behaviors, particularly interactions between newlines and start/end + * anchors. + * */ + +#include "capture_vm.h" +#include "capture_vm_program.h" + +#include + +#include +#include +#include + +void +fsm_capvm_program_free(const struct fsm_alloc *alloc, + struct capvm_program *program) +{ + if (program == NULL) { return; } + f_free(alloc, program->ops); + f_free(alloc, program->char_classes.sets); + f_free(alloc, program); +} + +struct capvm_program * +capvm_program_copy(const struct fsm_alloc *alloc, + const struct capvm_program *src) +{ + assert(src != NULL); + struct capvm_program *p = NULL; + struct capvm_opcode *ops = NULL; + struct capvm_char_class *sets = NULL; + + p = f_calloc(alloc, 1, sizeof(*p)); + if (p == NULL) { goto cleanup; } + + /* This allocates exactly as many instructions and char_classes + * as necessary, rather than a power-of-2 buffer, because + * they are only added during compilation in libre. */ + + ops = f_calloc(alloc, src->used, sizeof(ops[0])); + if (ops == NULL) { goto cleanup; } + + sets = f_calloc(alloc, src->char_classes.count, + sizeof(src->char_classes.sets[0])); + if (sets == NULL) { goto cleanup; } + + memcpy(ops, src->ops, src->used * sizeof(src->ops[0])); + + assert(src->char_classes.sets != NULL || src->char_classes.count == 0); + if (src->char_classes.count > 0) { + memcpy(sets, src->char_classes.sets, + src->char_classes.count * sizeof(src->char_classes.sets[0])); + } + + struct capvm_program np = { + .capture_count = src->capture_count, + .capture_base = src->capture_base, + + .used = src->used, + .ceil = src->used, + .ops = ops, + + .char_classes = { + .count = src->char_classes.count, + .ceil = src->char_classes.count, + .sets = sets, + }, + }; + memcpy(p, &np, sizeof(np)); + return p; + +cleanup: + f_free(alloc, p); + f_free(alloc, ops); + f_free(alloc, sets); + return NULL; +} + +void +capvm_program_rebase(struct capvm_program *program, unsigned capture_offset) +{ + assert(program->capture_base + capture_offset > program->capture_base); + program->capture_base += capture_offset; +} + +void +fsm_capvm_program_dump(FILE *f, + const struct capvm_program *p) +{ + for (size_t i = 0; i < p->used; i++) { + const struct capvm_opcode *op = &p->ops[i]; + switch (op->t) { + case CAPVM_OP_CHAR: + fprintf(f, "%zu: char 0x%02x (%c)\n", + i, op->u.chr, isprint(op->u.chr) ? op->u.chr : '.'); + break; + case CAPVM_OP_CHARCLASS: + { + const uint32_t id = op->u.charclass_id; + assert(id < p->char_classes.count); + const struct capvm_char_class *cc = &p->char_classes.sets[id]; + fprintf(f, "%zu: charclass %u -> [", i, id); + for (size_t i = 0; i < 4; i++) { + fprintf(f, "%016lx", cc->octets[i]); + } + fprintf(f, "]\n"); + break; + } + case CAPVM_OP_MATCH: + fprintf(f, "%zu: match\n", i); + break; + case CAPVM_OP_JMP: + fprintf(f, "%zu: jmp %u\n", i, op->u.jmp); + break; + case CAPVM_OP_JMP_ONCE: + fprintf(f, "%zu: jmp_once %u\n", i, op->u.jmp_once); + break; + case CAPVM_OP_SPLIT: + fprintf(f, "%zu: split cont %u new %u\n", i, op->u.split.cont, op->u.split.new); + break; + case CAPVM_OP_SAVE: + fprintf(f, "%zu: save %u (cap %u, %s)\n", + i, op->u.save, + op->u.save / 2, (op->u.save & (uint32_t)0x01) ? 
"end" : "start"); + break; + case CAPVM_OP_ANCHOR: + fprintf(f, "%zu: anchor %s\n", i, + op->u.anchor == CAPVM_ANCHOR_START ? "start" : "end"); + break; + default: + assert(!"matchfail"); + } + } + for (size_t i = 0; i < p->char_classes.count; i++) { + const uint64_t *octets = p->char_classes.sets[i].octets; + fprintf(f, "char_class %zu: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", + i, octets[0], octets[1], octets[2], octets[3]); + } +} + +unsigned +fsm_capvm_program_get_capture_count(const struct capvm_program *program) +{ + assert(program != NULL); + return program->capture_count; +} + +unsigned +fsm_capvm_program_get_max_capture_id(const struct capvm_program *program) +{ + assert(program != NULL); + return (program->capture_count == 0 + ? 0 + : program->capture_base + program->capture_count - 1); +} diff --git a/src/libfsm/capture_vm.h b/src/libfsm/capture_vm.h new file mode 100644 index 000000000..74e958eb8 --- /dev/null +++ b/src/libfsm/capture_vm.h @@ -0,0 +1,69 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#ifndef CAPTURE_VM_H +#define CAPTURE_VM_H + +#include +#include + +#include +#include + +/* Interface the virtual machine used to resolve captures. + * These interfaces are exposed to libre but should not be + * used directly. */ + +/* Opaque struct, details in capture_vm_program.h. */ +struct capvm_program; + +void +fsm_capvm_program_free(const struct fsm_alloc *alloc, + struct capvm_program *program); + +struct capvm_program * +capvm_program_copy(const struct fsm_alloc *alloc, + const struct capvm_program *program); + +/* Add an offset to the capture ID base for a program. + * Used when FSMs are merged, one of the source FSMs' capture IDs + * will be shifted to appear after the others. */ +void +capvm_program_rebase(struct capvm_program *program, unsigned capture_offset); + +void +fsm_capvm_program_dump(FILE *f, + const struct capvm_program *program); + +enum fsm_capvm_program_exec_res { + FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN, + FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND, + + FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC = -1, + /* FIXME: what other ways can this fail? */ +}; + +/* Execute a capture program with the given input and populate + * the capture buffer. + * + * This asserts that the capture buffer is at least as large as + * necessary. This is an internal interface, and the buffer size + * should have already been checked by the caller. */ +enum fsm_capvm_program_exec_res +fsm_capvm_program_exec(const struct capvm_program *program, + const uint8_t *input, size_t length, + struct fsm_capture *capture_buf, size_t capture_buf_length); + +/* Get the capture count from the program. */ +unsigned +fsm_capvm_program_get_capture_count(const struct capvm_program *program); + +/* Get the max capture ID from the program. + * If there are no captures (which is pointless) it will return 0. */ +unsigned +fsm_capvm_program_get_max_capture_id(const struct capvm_program *program); + +#endif diff --git a/src/libfsm/capture_vm_exec.c b/src/libfsm/capture_vm_exec.c new file mode 100644 index 000000000..5e33f159c --- /dev/null +++ b/src/libfsm/capture_vm_exec.c @@ -0,0 +1,2063 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. 
+ */ + +#include "capture_vm.h" +#include "capture_vm_program.h" +#include "capture_vm_log.h" + +#include +#include +#include +#include + +#include + +/* for EXPENSIVE_CHECKS and TRACK_TIMES */ +#include "internal.h" + +#if EXPENSIVE_CHECKS +#include +#endif + +/* Special handling for a path node that has a long prefix of all 0 + * bits, as is common when the regex is unanchored at the start. */ +#define USE_COLLAPSED_ZERO_PREFIX 1 + +/* Special out-of-range NONE values. */ +#define NO_POS ((uint32_t)-1) +#define NO_ID ((uint32_t)-1) +#define COLLAPSED_ZERO_PREFIX_ID ((uint32_t)-2) +#define NO_POS_SIZE_T ((size_t)-1) + +/* Max number of bits each path link can store. + * This value cannot be changed without reworking the data structures. */ +#define PATH_LINK_BITS 32 + +/* This enables extra debugging/testing output */ +#ifndef TESTING_OPTIONS +#define TESTING_OPTIONS 0 +#endif + +/* Write the solution to stdout (used for testing). */ +#define LOG_SOLUTION_TO_STDOUT (0 || TESTING_OPTIONS) + +/* Enable extra fields for debugging/performance tuning, most notably + * a 'uniq_id' field that helps to see the various execution paths. */ +#define CAPVM_STATS (0 || TESTING_OPTIONS) +#define CAPVM_PATH_STATS (0 && CAPVM_STATS) + +/* This may no longer be necessary after further work on path handling. */ +#define ALLOW_TABLE_RESIZING 1 +#define ALLOW_PATH_TABLE_RESIZING (1 || ALLOW_TABLE_RESIZING) +#define ALLOW_THREAD_TABLE_RESIZING (0 || ALLOW_TABLE_RESIZING) + +/* Set to non-zero to trap runaway path table growth */ +#define PATH_TABLE_CEIL_LIMIT 0 + +/* Specialized logging that can be scraped to reconstruct non-interleaved + * execution paths per thread. */ +#define LOG_EXECUTION 0 +#define LOG_EXECUTION_FILE stderr +#if LOG_EXECUTION + +#if CAPVM_STATS == 0 +#error CAPVM_STATS must be 1 for uniq_id +#endif + +/* Various execution log messages, in an easily scraped format */ +#define LOG_EXEC_OP(UNIQ_ID, INPUT_POS, OP_ID, OP_NAME) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC OP %u %u %u %s\n", \ + UNIQ_ID, INPUT_POS, OP_ID, OP_NAME) + +#define LOG_EXEC_CHAR(UNIQ_ID, CHAR) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC CHAR %u %c 0x%02x\n", UNIQ_ID, isprint(CHAR) ? 
CHAR : '.', CHAR) + +#define LOG_EXEC_HALT(UNIQ_ID) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC HALT %u\n", UNIQ_ID) + +#define LOG_EXEC_PATH_FIND_SOLUTION(UNIQ_ID, BIT) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC PATH_FIND_SOLUTION %u %u\n", UNIQ_ID, BIT) + +#define LOG_EXEC_PATH_SAVE_CAPTURES(UNIQ_ID, BIT) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC PATH_SAVE_CAPTURES %u %u\n", UNIQ_ID, BIT) + +#define LOG_EXEC_SPLIT(PARENT_UNIQ_ID, CHILD_UNIQ_ID) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC SPLIT %u %u\n", PARENT_UNIQ_ID, CHILD_UNIQ_ID) +#else +#define LOG_EXEC_OP(UNIQ_ID, INPUT_POS, OP_ID, OP_NAME) /* no-op */ +#define LOG_EXEC_CHAR(UNIQ_ID, CHAR) /* no-op */ +#define LOG_EXEC_HALT(UNIQ_ID) /* no-op */ +#define LOG_EXEC_PATH_FIND_SOLUTION(UNIQ_ID, BIT) /* no-op */ +#define LOG_EXEC_PATH_SAVE_CAPTURES(UNIQ_ID, BIT) /* no-op */ +#define LOG_EXEC_SPLIT(PARENT_UNIQ_ID, CHILD_UNIQ_ID) /* no-op */ +#endif + +/* Bitset backed by an array of 32-bit words */ +#define GET_BIT32(BITARRAY, BIT) (BITARRAY[BIT/32] & ((uint32_t)1 << (BIT & 31))) +#define SET_BIT32(BITARRAY, BIT) (BITARRAY[BIT/32] |= ((uint32_t)1 << (BIT & 31))) + +static const char * +op_name[] = { + [CAPVM_OP_CHAR] = "CHAR", + [CAPVM_OP_CHARCLASS] = "CHARCLASS", + [CAPVM_OP_MATCH] = "MATCH", + [CAPVM_OP_JMP] = "JMP", + [CAPVM_OP_JMP_ONCE] = "JMP_ONCE", + [CAPVM_OP_SPLIT] = "SPLIT", + [CAPVM_OP_SAVE] = "SAVE", + [CAPVM_OP_ANCHOR] = "ANCHOR", +}; + +enum pair_id { PAIR_ID_CURRENT = 0, PAIR_ID_NEXT = 1 }; + +struct capvm { + const struct capvm_program *p; + const uint8_t *input; + const uint32_t input_len; + struct fsm_capture *capture_buf; + const size_t capture_buf_length; + size_t step_limit; + +#if CAPVM_STATS + uint32_t uniq_id_counter; +#endif + + /* Two stacks, used to track which execution instruction should + * be advanced next. The current stack is + * run_stacks[PAIR_ID_CURRENT], run_stacks[PAIR_ID_NEXT] is the + * stack for the next input position, and when the current stack + * is completed the next stack is copied over (and reversed). + * Same with run_stacks_h, the height for each stack, and the + * other fields with [2] below. */ + uint32_t *run_stacks[2]; + uint32_t run_stacks_h[2]; + + /* Similarly, two columns of bits and two arrays of path_info + * node IDs and uniq_ids for the execution at a particular + * opcode. */ + uint32_t *evaluated[2]; + uint32_t *path_info_heads[2]; +#if CAPVM_STATS + uint32_t *uniq_ids[2]; +#endif + + struct capvm_thread_stats { + uint32_t live; + uint32_t max_live; + } threads; + + /* Pool of nodes for linked lists of path segments. */ + struct capvm_path_info_pool { + uint32_t ceil; + uint32_t live; + uint32_t max_live; + uint32_t freelist_head; + struct capvm_path_info { + union { + struct capvm_path_freelist_link { + uint16_t refcount; /* == 0 */ + uint32_t freelist; + } freelist_node; + struct capvm_path_info_link { + uint16_t refcount; /* > 0, sticky at UINT16_MAX? */ + uint8_t used; /* .bits used, <= PATH_LINK_BITS */ + uint32_t bits; + uint32_t offset; + /* Linked list to earlier path nodes, with common + * nodes shared until paths diverge. + * + * This can be either a valid path node ID, NO_ID + * for end of list, or COLLAPSED_ZERO_PREFIX_ID + * to indicate that the node is preceded by + * (offset) zero bits. 
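+				 *
+				 * For example (illustrative values): a
+				 * 70-bit path could be a head node with
+				 * offset 64 and used 6, linking back to
+				 * a full 32-bit node at offset 32, whose
+				 * backlink is either a node holding bits
+				 * 0..31 or COLLAPSED_ZERO_PREFIX_ID if
+				 * those first 32 bits are all zero.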
*/ + uint32_t backlink; +#if CAPVM_PATH_STATS + uint32_t bits_added_per_input_character; +#endif + } path; + } u; + } *pool; + } paths; + + struct capvm_solution_info { + uint32_t best_path_id; +#if CAPVM_STATS + uint32_t best_path_uniq_id; +#endif + uint32_t zeros_evaluated_up_to; + } solution; + + struct { + size_t steps; +#if CAPVM_STATS + uint32_t matches; + uint32_t path_prefixes_shared; + uint32_t collapsed_zero_prefixes; +#endif +#if CAPVM_PATH_STATS + uint32_t max_bits_added_per_input_character; + uint32_t max_path_length_memory; +#endif + } stats; + + enum fsm_capvm_program_exec_res res; +}; + +/* Type identifier macros */ +#define IS_THREAD_FREELIST(T) (T->u.thread.path_info_head == NO_ID) +#define IS_PATH_FREELIST(P) (P->u.path.refcount == 0) +#define IS_PATH_NODE(P) (P->u.path.refcount > 0 && P->u.path.used <= PATH_LINK_BITS) + +static void +release_path_info_link(struct capvm *vm, uint32_t *pi_id); + +static void +dump_path_table(FILE *f, const struct capvm *vm); + +static void +set_max_threads_live(struct capvm *vm, uint32_t new_max_live) +{ + vm->threads.max_live = new_max_live; + if (LOG_CAPVM >= 6) { + LOG(0, "==== new vm->threads.max_live: %u\n", vm->threads.max_live); + dump_path_table(stderr, vm); + } +} + + +/*********************** + * path_info functions * + ***********************/ + +static void +set_max_paths_live(struct capvm *vm) +{ + vm->paths.max_live = vm->paths.live; + if (LOG_CAPVM >= 6) { + LOG(0, "==== new vm->paths.max_live: %u\n", vm->paths.max_live); + dump_path_table(stderr, vm); + } +} + +static uint32_t +get_path_node_refcount(const struct capvm *vm, uint32_t p_id) +{ + assert(p_id < vm->paths.ceil); + const struct capvm_path_info *pi = &vm->paths.pool[p_id]; + if (IS_PATH_FREELIST(pi)) { + return pi->u.freelist_node.refcount; + } else { + assert(IS_PATH_NODE(pi)); + return pi->u.path.refcount; + } +} + +static void +inc_path_node_refcount(struct capvm *vm, uint32_t p_id) +{ + /* TODO: sticky refcount handling is not currently implemented */ + if (p_id == COLLAPSED_ZERO_PREFIX_ID) { return; } + assert(p_id < vm->paths.ceil); + struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(IS_PATH_NODE(pi)); + LOG(5, "%s: p_id %u: refcnt %u -> %u\n", + __func__, p_id, pi->u.path.refcount, pi->u.path.refcount + 1); + pi->u.path.refcount++; +} + +static uint32_t +get_path_node_offset(const struct capvm *vm, uint32_t p_id) +{ + assert(p_id < vm->paths.ceil); + const struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(IS_PATH_NODE(pi)); + return pi->u.path.offset; +} + +static uint32_t +get_path_node_backlink(const struct capvm *vm, uint32_t p_id) +{ + assert(p_id < vm->paths.ceil); + const struct capvm_path_info *pi = &vm->paths.pool[p_id]; + if (IS_PATH_FREELIST(pi)) { + return pi->u.freelist_node.freelist; + } else { + assert(IS_PATH_NODE(pi)); + return pi->u.path.backlink; + } +} + +static void +set_path_node_backlink(struct capvm *vm, uint32_t p_id, uint32_t backlink) +{ + assert(p_id < vm->paths.ceil); + assert(backlink < vm->paths.ceil || (backlink == NO_ID || backlink == COLLAPSED_ZERO_PREFIX_ID)); + struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(IS_PATH_NODE(pi)); + pi->u.path.backlink = backlink; +} + +static void +dump_path_table(FILE *f, const struct capvm *vm) +{ + fprintf(f, "=== path table, %u/%u live\n", + vm->paths.live, vm->paths.ceil); + for (uint32_t i = 0; i < vm->paths.ceil; i++) { + struct capvm_path_info *pi = &vm->paths.pool[i]; + if (IS_PATH_FREELIST(pi)) { + if (LOG_CAPVM >= 5) { + fprintf(f, "paths[%u]: 
freelist -> %d\n",
+			    i, (int)pi->u.freelist_node.freelist);
+			}
+		} else {
+			assert(IS_PATH_NODE(pi));
+			fprintf(f, "paths[%u]: refcount %u, used %u, bits 0x%08x, offset %u, backlink %d%s\n",
+			    i, pi->u.path.refcount, pi->u.path.used, pi->u.path.bits,
+			    pi->u.path.offset, (int)pi->u.path.backlink,
+			    pi->u.path.backlink == COLLAPSED_ZERO_PREFIX_ID
+			    ? " (collapsed zero prefix)"
+			    : pi->u.path.backlink == NO_ID
+			    ? " (none)"
+			    : " (link)");
+		}
+	}
+}
+
+static void
+check_path_table(const struct capvm *vm)
+{
+#if EXPENSIVE_CHECKS
+	uint32_t *refcounts = calloc(vm->paths.ceil, sizeof(refcounts[0]));
+	assert(refcounts);
+
+	if (LOG_CAPVM >= 4) {
+		dump_path_table(stderr, vm);
+	}
+
+	LOG(4, "%s: stack heights %u, %u\n", __func__,
+	    vm->run_stacks_h[PAIR_ID_CURRENT], vm->run_stacks_h[PAIR_ID_NEXT]);
+
+	for (uint32_t pair_id = 0; pair_id < 2; pair_id++) {
+		for (uint32_t h = 0; h < vm->run_stacks_h[pair_id]; h++) {
+			const uint32_t op_id = vm->run_stacks[pair_id][h];
+			if (op_id == NO_ID) { continue; }
+#if CAPVM_STATS
+			const uint32_t uniq_id = vm->uniq_ids[pair_id][op_id];
+#else
+			const uint32_t uniq_id = 0;
+#endif
+
+			LOG(4, "%s: run_stacks[%u][%u/%u]: op_id %u (uniq_id %u) -> path_info_head %u\n",
+			    __func__, pair_id, h, vm->run_stacks_h[pair_id], op_id,
+			    uniq_id, vm->path_info_heads[pair_id][op_id]);
+			const uint32_t p_id = vm->path_info_heads[pair_id][op_id];
+			if (p_id != NO_ID) {
+				refcounts[p_id]++;
+			}
+		}
+	}
+
+	for (uint32_t p_id = 0; p_id < vm->paths.ceil; p_id++) {
+		const struct capvm_path_info *pi = &vm->paths.pool[p_id];
+		if (IS_PATH_FREELIST(pi)) {
+			continue;
+		}
+		const uint32_t backlink = get_path_node_backlink(vm, p_id);
+		if (backlink != NO_ID && backlink != COLLAPSED_ZERO_PREFIX_ID) {
+			assert(backlink < vm->paths.ceil);
+			refcounts[backlink]++;
+		}
+	}
+
+	if (vm->solution.best_path_id != NO_ID) {
+		assert(vm->solution.best_path_id < vm->paths.ceil);
+		refcounts[vm->solution.best_path_id]++;
+	}
+
+	for (uint32_t p_id = 0; p_id < vm->paths.ceil; p_id++) {
+		const struct capvm_path_info *pi = &vm->paths.pool[p_id];
+		if (IS_PATH_FREELIST(pi)) { continue; }
+		const uint32_t refcount = get_path_node_refcount(vm, p_id);
+		const bool ok = refcounts[p_id] == refcount;
+
+		if (!ok) {
+			dump_path_table(stderr, vm);
+
+			fprintf(stderr, "BAD REFCOUNT: pi[%u], expected %u, got %u\n",
+			    p_id, refcounts[p_id], refcount);
+			assert(ok);
+		}
+	}
+
+	free(refcounts);
+	LOG(6, "%s: passed\n", __func__);
+#else
+	(void)vm;
+#endif
+}
+
+static bool
+reserve_path_info_link(struct capvm *vm, uint32_t *pi_id)
+{
+	if (vm->paths.live == vm->paths.ceil) {
+#if ALLOW_PATH_TABLE_RESIZING
+		if (LOG_CAPVM >= 4) {
+			fprintf(stderr, "\n");
+			dump_path_table(stderr, vm);
+			check_path_table(vm);
+			fprintf(stderr, "\n");
+		}
+
+		const uint32_t nceil = 2*vm->paths.ceil;
+		LOG(1, "%s: growing path table %u -> %u\n",
+		    __func__, vm->paths.ceil, nceil);
+
+		/* This can legitimately be reached with very long inputs, but
+		 * if PATH_TABLE_CEIL_LIMIT is non-zero and this is hit then
+		 * it's most likely a sign of an infinite loop.
*/ + if (PATH_TABLE_CEIL_LIMIT != 0 && nceil > PATH_TABLE_CEIL_LIMIT) { + assert(!"reached PATH_TABLE_CEIL_LIMIT"); + } + + assert(nceil > vm->paths.ceil); + struct capvm_path_info *npool = realloc(vm->paths.pool, + nceil * sizeof(npool[0])); + if (npool == NULL) { + return false; + } + + for (size_t i = vm->paths.ceil; i < nceil; i++) { + npool[i].u.freelist_node.refcount = 0; + npool[i].u.freelist_node.freelist = i + 1; + } + npool[nceil - 1].u.freelist_node.refcount = 0; + npool[nceil - 1].u.freelist_node.freelist = NO_POS; + vm->paths.freelist_head = vm->paths.ceil; + vm->paths.ceil = nceil; + vm->paths.pool = npool; +#else + assert(!"shouldn't need to grow path pool"); +#endif + } + + assert(vm->paths.live < vm->paths.ceil); + assert(vm->paths.freelist_head != NO_POS); + + *pi_id = vm->paths.freelist_head; + LOG(3, "%s: returning %u\n", __func__, *pi_id); + return true; +} + +/* Release a reference to a path_info_link. Consume the argument. + * If the reference count reaches 0, repool the node and release + * its backlink. */ +static void +release_path_info_link(struct capvm *vm, uint32_t *pi_id) +{ +#define LOG_RELEASE_PI 0 + size_t count = 0; + assert(pi_id != NULL); + uint32_t cur_id = *pi_id; + LOG(4 - LOG_RELEASE_PI, "%s: pi_id %u\n", __func__, cur_id); + *pi_id = NO_ID; + + while (cur_id != NO_ID) { + struct capvm_path_info *pi = &vm->paths.pool[cur_id]; + uint32_t refcount = get_path_node_refcount(vm, cur_id); + LOG(4 - LOG_RELEASE_PI, "-- checking path_info[%u]: refcount %u\n", + cur_id, refcount); + assert(refcount > 0); + LOG(4 - LOG_RELEASE_PI, "release: pi[%u] refcount %u -> %u\n", + cur_id, refcount, refcount - 1); + + const uint32_t backlink = get_path_node_backlink(vm, cur_id); + assert(IS_PATH_NODE(pi)); + pi->u.path.refcount--; + refcount = pi->u.path.refcount; + + if (refcount > 0) { + break; + } + + count++; + LOG(3 - LOG_RELEASE_PI, "-- repooling path_info %u, now %u live\n", + cur_id, vm->paths.live - 1); + LOG(3 - LOG_RELEASE_PI, "-- backlink: %d\n", backlink); + + pi->u.freelist_node.freelist = vm->paths.freelist_head; + vm->paths.freelist_head = cur_id; + assert(vm->paths.live > 0); + vm->paths.live--; + + cur_id = backlink; + if (cur_id == COLLAPSED_ZERO_PREFIX_ID) { + break; + } + } +} + +static void +print_path(FILE *f, const struct capvm *vm, uint32_t p_id) +{ + if (p_id == NO_ID) { + fprintf(f, "/0"); + return; + } + + /* reverse links to the root node */ + uint32_t zero_prefix = 0; + uint32_t next = NO_ID; + uint32_t first = NO_ID; + uint32_t prev; + + while (p_id != NO_ID) { + assert(p_id < vm->paths.ceil); + struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(!IS_PATH_FREELIST(pi)); + + uint32_t bl; + assert(IS_PATH_NODE(pi)); + bl = pi->u.path.backlink; + pi->u.path.backlink = next; + + if (bl == NO_ID) { + prev = bl; + first = p_id; + break; + } else if (bl == COLLAPSED_ZERO_PREFIX_ID) { + prev = bl; + first = p_id; + zero_prefix = pi->u.path.offset; + break; + } + + next = p_id; + p_id = bl; + } + + if (zero_prefix > 0) { + fprintf(f, "0/%u", zero_prefix); + } + + /* iterate forward, printing and restoring link order */ + p_id = first; + while (p_id != NO_ID) { + assert(p_id < vm->paths.ceil); + struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(!IS_PATH_FREELIST(pi)); + + uint32_t fl; /* now a forward link */ + assert(IS_PATH_NODE(pi)); + fl = pi->u.path.backlink; + pi->u.path.backlink = prev; + prev = p_id; + fprintf(f, "%s%08x/%u", prev == NO_ID ? 
"" : " ", + pi->u.path.bits, pi->u.path.used); + + p_id = fl; + } +} + +#if EXPENSIVE_CHECKS +SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() +#endif +static int +cmp_paths(struct capvm *vm, uint32_t p_a, uint32_t p_b) +{ +#if EXPENSIVE_CHECKS + /* When EXPENSIVE_CHECKS is set, walk the chains + * before and after and compare incremental hashing of node IDs, + * to ensure the chains are restored properly. */ + uint64_t hash_a_before = 0; + uint64_t hash_b_before = 0; +#endif + +#define LOG_CMP_PATHS 0 + LOG(3 - LOG_CMP_PATHS, "%s: p_a %d, p_b %d\n", __func__, p_a, p_b); + + if (p_a == NO_ID) { + return p_b == NO_ID ? 0 : -1; + } else if (p_b == NO_ID) { + return 1; + } + + assert(p_a != p_b); + + if (LOG_CAPVM >= 5) { + fprintf(stderr, "A: "); + print_path(stderr, vm, p_a); + fprintf(stderr, "\n"); + + fprintf(stderr, "B: "); + print_path(stderr, vm, p_b); + fprintf(stderr, "\n"); + } + + /* walk both paths backward until they reach a beginning + * or the common prefix node, reversing links along the + * way, then compare forward and restore link order. */ + uint32_t link_a = p_a; + uint32_t link_b = p_b; + + uint32_t fwd_a = NO_ID; + uint32_t fwd_b = NO_ID; + + /* Walk both paths backward, individually until reaching a + * common offset, then back until reaching a common prefix + * (including the start). While iterating backward, replace + * the .backlink field with a forward link, which will be + * reverted when iterating forward and comparing from the + * common prefix. */ + bool common_prefix_found = false; + uint32_t first_a = NO_ID; + uint32_t first_b = NO_ID; + uint32_t common_prefix_link; /* can be NO_ID */ + +#if EXPENSIVE_CHECKS + uint32_t hash_step = 0; /* added so ordering matters */ + while (link_a != NO_ID) { + assert(link_a < vm->paths.ceil); + const uint32_t prev = get_path_node_backlink(vm, link_a); + hash_a_before += hash_id(link_a + hash_step); + link_a = prev; + hash_step++; + } + hash_step = 0; + while (link_b != NO_ID) { + assert(link_b < vm->paths.ceil); + const uint32_t prev = get_path_node_backlink(vm, link_b); + hash_b_before += hash_id(link_b + hash_step); + link_b = prev; + hash_step++; + } + + link_a = p_a; + link_b = p_b; +#endif + + while (!common_prefix_found) { + assert(link_a != NO_ID); + assert(link_b != NO_ID); + assert(link_a < vm->paths.ceil); + assert(link_b < vm->paths.ceil); + + const uint32_t prev_a = get_path_node_backlink(vm, link_a); + const uint32_t prev_b = get_path_node_backlink(vm, link_b); + const uint32_t offset_a = get_path_node_offset(vm, link_a); + const uint32_t offset_b = get_path_node_offset(vm, link_b); + const uint32_t backlink_a = get_path_node_backlink(vm, link_a); + const uint32_t backlink_b = get_path_node_backlink(vm, link_b); + + LOG(3 - LOG_CMP_PATHS, + "%s: backward loop: link_a %d (offset %u, prev %d), link_b %d (offset %u, prev %d)\n", + __func__, link_a, offset_a, prev_a, link_b, offset_b, prev_b); + + assert((offset_a & 31) == 0); /* multiple of 32 */ + assert((offset_b & 31) == 0); /* multiple of 32 */ + if (offset_a > offset_b) { + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: a longer than b\n", __func__); + set_path_node_backlink(vm, link_a, fwd_a); + fwd_a = link_a; + link_a = prev_a; + } else if (offset_b > offset_a) { + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: b longer than a\n", __func__); + set_path_node_backlink(vm, link_b, fwd_b); + fwd_b = link_b; + link_b = prev_b; + } else { + assert(offset_b == offset_a); + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: comparing backlinks: a: %d, b: %d\n", + __func__, 
backlink_a, backlink_b); + assert(fwd_a != link_a); + set_path_node_backlink(vm, link_a, fwd_a); + assert(fwd_b != link_b); + set_path_node_backlink(vm, link_b, fwd_b); + + if (prev_a == prev_b) { + /* if == NO_ID, empty prefix */ + common_prefix_found = true; + common_prefix_link = prev_a; + first_a = link_a; + first_b = link_b; + + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: common_prefix_found: %d\n", + __func__, common_prefix_link); + } else { + fwd_a = link_a; + fwd_b = link_b; + + link_a = prev_a; + link_b = prev_b; + } + } + } + + assert(first_a != NO_ID); + assert(first_b != NO_ID); + link_a = first_a; + link_b = first_b; + + bool cmp_done = false; + int res; + bool done_restoring_link_order = false; + uint32_t prev_a = common_prefix_link; + uint32_t prev_b = common_prefix_link; + while (!done_restoring_link_order) { + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, link_a %d, link_b %d, cmp_done %d\n", + __func__, link_a, link_b, cmp_done); + if (!cmp_done) { + if (link_a == NO_ID) { /* b is longer */ + cmp_done = true; + if (link_b == NO_ID) { + res = 0; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, equal length, res %d\n", __func__, res); + } else { + res = -1; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, b is longer, res %d\n", __func__, res); + } + } else if (link_b == NO_ID) { /* a is longer */ + cmp_done = true; + res = 1; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, a is longer, res %d\n", __func__, res); + } else { + assert(link_a < vm->paths.ceil); + assert(link_b < vm->paths.ceil); + struct capvm_path_info *pi_a = &vm->paths.pool[link_a]; + struct capvm_path_info *pi_b = &vm->paths.pool[link_b]; + + const uint32_t offset_a = get_path_node_offset(vm, link_a); + const uint32_t offset_b = get_path_node_offset(vm, link_b); + + assert(offset_a == offset_b); + + if (pi_a->u.path.bits != pi_b->u.path.bits) { + res = pi_a->u.path.bits < pi_b->u.path.bits ? -1 : 1; + cmp_done = true; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, different bits (0x%08x, 0x%08x) => res %d\n", + __func__, pi_a->u.path.bits, pi_b->u.path.bits, res); + } + } + } + + /* Check if both have reached the original head node. */ + if (link_a == NO_ID && link_b == NO_ID) { + done_restoring_link_order = true; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop: reached end of both paths, prev_a %d (p_a %d), prev_b %d (p_b %d)\n", + __func__, prev_a, p_a, prev_b, p_b); + assert(prev_a == p_a); + assert(prev_b == p_b); + } + + /* Whether or not comparison has finished, iterate forward, + * restoring forward links. 
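+		 * The backward walk above borrowed each node's backlink
+		 * field as a forward link (the same link-reversal trick
+		 * print_path uses), so the comparison needs no extra
+		 * memory; the EXPENSIVE_CHECKS hashing before and after
+		 * verifies that the chains are restored.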
*/ + if (link_a != NO_ID) { + assert(link_a < vm->paths.ceil); + const uint32_t fwd_a = get_path_node_backlink(vm, link_a); + LOG(3 - LOG_CMP_PATHS, "%s: fwd loop: link_a %d, fwd_a %d\n", + __func__, link_a, fwd_a); + assert(fwd_a != link_a); + + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, restoring a's backlink: pi[%u].backlink <- %d\n", + __func__, link_a, prev_a); + set_path_node_backlink(vm, link_a, prev_a); + prev_a = link_a; + link_a = fwd_a; + } + + if (link_b != NO_ID) { + assert(link_b < vm->paths.ceil); + const uint32_t fwd_b = get_path_node_backlink(vm, link_b); + LOG(3 - LOG_CMP_PATHS, "%s: fwd loop: link_b %d, fwd_b %d\n", + __func__, link_b, fwd_b); + assert(fwd_b != link_b); + + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, restoring b's backlink: pi[%u].backlink <- %d\n", + __func__, link_b, prev_b); + set_path_node_backlink(vm, link_b, prev_b); + prev_b = link_b; + link_b = fwd_b; + } + } + + LOG(3 - LOG_CMP_PATHS, "%s: res %d\n", __func__, res); + +#if EXPENSIVE_CHECKS + uint64_t hash_a_after = 0; + uint64_t hash_b_after = 0; + hash_step = 0; + link_a = p_a; + while (link_a != NO_ID) { + assert(link_a < vm->paths.ceil); + const uint32_t prev = get_path_node_backlink(vm, link_a); + hash_a_after += hash_id(link_a + hash_step); + link_a = prev; + hash_step++; + } + link_b = p_b; + hash_step = 0; + while (link_b != NO_ID) { + assert(link_b < vm->paths.ceil); + const uint32_t prev = get_path_node_backlink(vm, link_b); + hash_b_after += hash_id(link_b + hash_step); + link_b = prev; + hash_step++; + } + + assert(hash_a_after == hash_a_before); + assert(hash_b_after == hash_b_before); +#endif + + return res; +#undef LOG_CMP_PATHS +} + +static void +handle_possible_matching_path(struct capvm *vm, uint32_t path_info_head, uint32_t uniq_id); + +static bool +copy_path_info(struct capvm *vm, uint32_t path_info_head, + uint32_t *new_path_info_head); + +static bool +extend_path_info(struct capvm *vm, uint32_t path_info_head, bool greedy, uint32_t uniq_id, + uint32_t *new_path_info_head); + +/* Push the next execution step onto the stack, if it hasn't already + * been run by a greedier path. Calling this hands off ownership of the + * path_info_head, so it is released if execution will not be resumed + * later. */ +static void +schedule_possible_next_step(struct capvm *vm, enum pair_id pair_id, + uint32_t input_pos, uint32_t op_id, + uint32_t path_info_head, uint32_t uniq_id) +{ + assert(path_info_head != NO_ID); + uint32_t *stack = vm->run_stacks[pair_id]; + uint32_t *stack_h = &vm->run_stacks_h[pair_id]; + assert((*stack_h) < vm->p->used); + + /* If that instruction has already been evaluated, skip the + * redundant execution by a less greedy path. */ + const uint32_t *evaluated = vm->evaluated[pair_id]; + const bool already_evaluated = GET_BIT32(evaluated, op_id) != 0; + LOG(3, "%s: pair_id %u, input_pos %u, op_id %u, path_info_head %u, uniq_id %u, already_evaluated %d, stack_h %u\n", + __func__, pair_id, input_pos, op_id, path_info_head, uniq_id, already_evaluated, *stack_h); + + if (already_evaluated) { + LOG_EXEC_HALT(uniq_id); + release_path_info_link(vm, &path_info_head); + assert(vm->threads.live > 0); + vm->threads.live--; + } else { + + /* If the work being scheduled by the current greediest + * thread pre-empts work scheduled by a less greedy + * thread, release that thread's path link and clear its + * op ID on the run stack. 
+		 *
+		 * TODO: Ideally, avoid the linear scan here, but the
+		 * total stack height is bounded by the generated program size
+		 * and should be fairly small in practice. Wait to change this
+		 * until there are benchmarks in place showing it's necessary.
+		 *
+		 * An extra two bits per opcode (one for each stack) could
+		 * be used to track whether the stack already contains
+		 * op_id, so the linear scan could be avoided except when
+		 * actually necessary. */
+		uint32_t cur_pih = vm->path_info_heads[pair_id][op_id];
+		if (cur_pih != NO_ID) {
+			release_path_info_link(vm, &cur_pih);
+			vm->path_info_heads[pair_id][op_id] = NO_ID;
+			const size_t h = *stack_h;
+			for (size_t i = 0; i < h; i++) {
+				if (stack[i] == op_id) {
+					stack[i] = NO_ID;
+					vm->threads.live--;
+				}
+			}
+		}
+		stack[(*stack_h)++] = op_id;
+		vm->path_info_heads[pair_id][op_id] = path_info_head;
+#if CAPVM_STATS
+		vm->uniq_ids[pair_id][op_id] = uniq_id;
+#endif
+
+		if (*stack_h > vm->threads.max_live) {
+			set_max_threads_live(vm, *stack_h);
+		}
+	}
+}
+
+static void
+eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos,
+	uint32_t path_info_head, uint32_t uniq_id, uint32_t op_id)
+{
+	LOG(5, "%s: input_pos %u, input_len %u, op_id %u, threads_live %u\n",
+	    __func__, input_pos, vm->input_len, op_id, vm->threads.live);
+
+	if (vm->stats.steps == vm->step_limit) {
+		LOG(1, "%s: halting, steps == step_limit %zu\n",
+		    __func__, vm->step_limit);
+		return;
+	}
+
+	assert(op_id < vm->p->used);
+
+	const struct capvm_opcode *op = &vm->p->ops[op_id];
+
+	LOG(2, "%s: op_id[%u]: input_pos %u, path_info_head %u, uniq_id %u, op %s\n",
+	    __func__, op_id, input_pos, path_info_head, uniq_id, op_name[op->t]);
+	LOG_EXEC_OP(uniq_id, input_pos, op_id, op_name[op->t]);
+
+	vm->stats.steps++;
+	if (vm->stats.steps == vm->step_limit) {
+		/* TODO: Set some sort of STEP_LIMIT_REACHED error.
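+		 * If the limit is reached before any match is found, the
+		 * caller currently sees FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND,
+		 * since vm->res is only changed on a match or an alloc failure.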
*/ + return; + } + + switch (op->t) { + case CAPVM_OP_CHAR: + if (input_pos == vm->input_len) { + goto halt_thread; /* past end of input */ + } + + LOG(3, "OP_CHAR: input_pos %u, exp char '%c', got '%c'\n", + input_pos, op->u.chr, vm->input[input_pos]); + + if (vm->input[input_pos] != op->u.chr) { + goto halt_thread; /* character mismatch */ + } + LOG_EXEC_CHAR(uniq_id, vm->input[input_pos]); + + schedule_possible_next_step(vm, PAIR_ID_NEXT, input_pos + 1, op_id + 1, + path_info_head, uniq_id); + break; + + case CAPVM_OP_CHARCLASS: + { + if (input_pos == vm->input_len) { + goto halt_thread; /* past end of input */ + } + + const uint8_t c = vm->input[input_pos]; + const uint32_t cc_id = op->u.charclass_id; + assert(cc_id < vm->p->char_classes.count); + const struct capvm_char_class *class = &vm->p->char_classes.sets[cc_id]; + + if (!(class->octets[c/64] & ((uint64_t)1 << (c&63)))) { + goto halt_thread; /* character not in class */ + } + LOG_EXEC_CHAR(uniq_id, vm->input[input_pos]); + + schedule_possible_next_step(vm, PAIR_ID_NEXT, input_pos + 1, op_id + 1, + path_info_head, uniq_id); + break; + } + + case CAPVM_OP_MATCH: + if (input_pos == vm->input_len) { + handle_possible_matching_path(vm, path_info_head, uniq_id); + } else if (vm->input_len > 0 && input_pos == vm->input_len - 1 + && vm->input[input_pos] == '\n') { + LOG(3, "OP_MATCH: special case for trailing newline\n"); + handle_possible_matching_path(vm, path_info_head, uniq_id); + } + goto halt_thread; + + case CAPVM_OP_JMP: + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op->u.jmp, + path_info_head, uniq_id); + break; + + case CAPVM_OP_JMP_ONCE: + { + /* If the destination for this jump has already been visited + * without advancing input, then skip the jump. This is necessary + * for edge cases like the first branch in `^(^|.$)*`, which would + * otherwise have a backward jump to before the first case, due to + * the repetition, and would effectively be treated as an infinite + * loop and ignored, leading to incorrect match bounds for "x". + * + * Replaying the capture path does not track what has been evaluated, + * so this needs to record the branch in the path. This will make + * repetition more expensive in some cases, but compilation could + * emit a JMP when it's safe to do so. 
*/ + const bool greedy = GET_BIT32(vm->evaluated[PAIR_ID_CURRENT], op->u.jmp_once); + if (greedy) { + /* non-greedy branch -- fall through */ + uint32_t new_path_info_head = NO_ID; + if (!extend_path_info(vm, path_info_head, 0, uniq_id, &new_path_info_head)) { + release_path_info_link(vm, &path_info_head); + goto alloc_error; + } + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + new_path_info_head, uniq_id); + } else { + /* greedy branch -- loop back and potentially match more */ + uint32_t new_path_info_head = NO_ID; + if (!extend_path_info(vm, path_info_head, 1, uniq_id, &new_path_info_head)) { + release_path_info_link(vm, &path_info_head); + goto alloc_error; + } + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op->u.jmp_once, + new_path_info_head, uniq_id); + } + break; + } + + case CAPVM_OP_SPLIT: + { + const uint32_t dst_cont = op->u.split.cont; + const uint32_t dst_new = op->u.split.new; + + /* destinations must be in range and not self-referential */ + assert(dst_cont < vm->p->used); + assert(dst_new < vm->p->used); + assert(dst_cont != op_id); + assert(dst_new != op_id); + + uint32_t new_path_info_head; + if (!copy_path_info(vm, path_info_head, &new_path_info_head)) { + goto alloc_error; + } + + /* cont is the greedy branch */ + if (!extend_path_info(vm, path_info_head, 1, uniq_id, &path_info_head)) { + release_path_info_link(vm, &path_info_head); + goto alloc_error; + } + + /* new is the non-greedy branch */ + if (!extend_path_info(vm, new_path_info_head, 0, uniq_id, &new_path_info_head)) { + release_path_info_link(vm, &path_info_head); + goto alloc_error; + } + +#if CAPVM_STATS + const uint32_t new_uniq_id = ++vm->uniq_id_counter; +#else + const uint32_t new_uniq_id = 0; +#endif + + vm->threads.live++; + if (vm->threads.live > vm->threads.max_live) { + set_max_threads_live(vm, vm->threads.live); + } + + /* Push the split.new destination, and then the + * split.cont destination on top of it, so that the + * greedier .cont branch will be fully evaluated + * first. */ + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, dst_new, + new_path_info_head, new_uniq_id); + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, dst_cont, + path_info_head, uniq_id); + LOG_EXEC_SPLIT(uniq_id, new_uniq_id); + + break; + } + + case CAPVM_OP_SAVE: + /* no-op, during this stage */ + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + path_info_head, uniq_id); + break; + + case CAPVM_OP_ANCHOR: + if (op->u.anchor == CAPVM_ANCHOR_START) { + LOG(3, "%s: ^ anchor\n", __func__); + /* ignore a trailing newline, because PCRE does, + * even after a $ anchor. 
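+			 * For example, `^$` matches the one-byte input
+			 * "\n": the checks below let ^ match at position
+			 * 0 and $ match just before the trailing newline.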
*/ + if (input_pos == 0 + && vm->input_len == 1 + && vm->input[0] == '\n') { + /* allowed */ + LOG(3, "%s: special case: ^ ignoring trailing newline\n", __func__); + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + path_info_head, uniq_id); + break; + } else if (input_pos == 1 + && vm->input_len == 1 + && vm->input[0] == '\n') { + /* allowed */ + } else if (input_pos != 0) { goto halt_thread; } + } else { + assert(op->u.anchor == CAPVM_ANCHOR_END); + LOG(3, "%s: $ anchor: input_len %u, input_pos %u\n", + __func__, vm->input_len, input_pos); + + /* ignore a trailing newline, because PCRE does */ + if (vm->input_len > 0 && input_pos == vm->input_len - 1) { + if (vm->input[input_pos] != '\n') { + goto halt_thread; + } + LOG(3, "%s: special case: $ allowing trailing newline\n", __func__); + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + path_info_head, uniq_id); + break; + } else if (input_pos != vm->input_len) { + goto halt_thread; + } + } + + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + path_info_head, uniq_id); + break; + + default: + assert(!"unreachable"); + return; + } + + if (EXPENSIVE_CHECKS) { /* postcondition */ + check_path_table(vm); + } + + /* FIXME: Check the cleanup logic here. */ + return; + +halt_thread: + /* do not push further execution on the run stack */ + LOG_EXEC_HALT(uniq_id); + + release_path_info_link(vm, &path_info_head); + assert(vm->threads.live > 0); + vm->threads.live--; + return; + +alloc_error: + release_path_info_link(vm, &path_info_head); + vm->res = FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC; +} + +static void +handle_possible_matching_path(struct capvm *vm, uint32_t pi_id, uint32_t uniq_id) +{ + LOG(3, "\n%s: HIT, pi_id %u, uniq_id %u\n", __func__, pi_id, uniq_id); + + if (LOG_CAPVM >= 3) { + LOG(3, "--- current_live: %u, max_live: %u\n", + vm->threads.live, vm->threads.max_live); + dump_path_table(stderr, vm); + LOG(3, "=====\n"); + } + +#if CAPVM_STATS + vm->stats.matches++; +#endif + + assert(pi_id < vm->paths.ceil); + + if (vm->solution.best_path_id == NO_ID) { + struct capvm_path_info *pi = &vm->paths.pool[pi_id]; + assert(!IS_PATH_FREELIST(pi)); + if (LOG_CAPVM >= 5) { + const uint32_t refcount = get_path_node_refcount(vm, pi_id); + LOG(5, "MATCH: pi_id %u refcount %u -> %u\n", + pi_id, refcount, refcount + 1); + } + inc_path_node_refcount(vm, pi_id); + vm->solution.best_path_id = pi_id; +#if CAPVM_STATS + vm->solution.best_path_uniq_id = uniq_id; +#endif + LOG(3, "MATCH: saved current best solution path (pi_id %u)\n", pi_id); + } else { + /* Compare path info and only keep the path associated + * with the greediest match so far. 
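+		 * Paths compare lexicographically, earliest bit first,
+		 * with 1 (greedy) ordered above 0: a path starting 1,0
+		 * beats one starting 0,1 because the first divergence
+		 * favors the greedy branch, and if one path is a prefix
+		 * of the other, the longer one wins. cmp_paths returns
+		 * > 0 when the new path is greedier.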
+		 */
+		const int res = cmp_paths(vm, pi_id, vm->solution.best_path_id);
+		if (res > 0) {
+			/* replace current best solution */
+			struct capvm_path_info *pi = &vm->paths.pool[pi_id];
+			assert(!IS_PATH_FREELIST(pi));
+			if (LOG_CAPVM >= 5) {
+				const uint32_t refcount = get_path_node_refcount(vm, pi_id);
+				LOG(5, "MATCH: pi_id %u refcount %u -> %u\n",
+				    pi_id, refcount, refcount + 1);
+			}
+			inc_path_node_refcount(vm, pi_id);
+
+			LOG(3, "MATCH: replacing current best solution path %u with %u\n",
+			    vm->solution.best_path_id, pi_id);
+
+			release_path_info_link(vm, &vm->solution.best_path_id);
+			vm->solution.best_path_id = pi_id;
+#if CAPVM_STATS
+			vm->solution.best_path_uniq_id = uniq_id;
+#endif
+		} else {
+			/* keep the current best solution */
+			LOG(3, "MATCH: ignoring new solution path %u, keeping %u\n",
+			    pi_id, vm->solution.best_path_id);
+		}
+	}
+}
+
+static bool
+eval_vm(struct capvm *vm)
+{
+	uint32_t i_i;
+
+	/* init the path_info_heads tables to NO_ID, except for cell 0
+	 * in next, which contains the starting point. */
+	for (size_t op_i = 0; op_i < vm->p->used; op_i++) {
+		vm->path_info_heads[PAIR_ID_CURRENT][op_i] = NO_ID;
+#if CAPVM_STATS
+		vm->uniq_ids[PAIR_ID_CURRENT][op_i] = NO_ID;
+#endif
+	}
+	for (size_t op_i = 1; op_i < vm->p->used; op_i++) {
+		vm->path_info_heads[PAIR_ID_NEXT][op_i] = NO_ID;
+#if CAPVM_STATS
+		vm->uniq_ids[PAIR_ID_NEXT][op_i] = NO_ID;
+#endif
+	}
+
+	for (i_i = 0; i_i <= vm->input_len; i_i++) {
+		if (vm->threads.live == 0
+		    || vm->stats.steps == vm->step_limit) {
+			LOG(3, "%s: breaking, live %u, steps %zu/%zd\n",
+			    __func__, vm->threads.live, vm->stats.steps, vm->step_limit);
+			break;
+		}
+		LOG(3, "\n###### i_i %u\n", i_i);
+
+		LOG(4, "-- clearing evaluated\n");
+		const size_t evaluated_bit_words = vm->p->used/32 + 1;
+		for (size_t i = 0; i < evaluated_bit_words; i++) {
+			vm->evaluated[PAIR_ID_CURRENT][i] = 0;
+			vm->evaluated[PAIR_ID_NEXT][i] = 0;
+		}
+
+		uint32_t *stack_h = &vm->run_stacks_h[PAIR_ID_CURRENT];
+		uint32_t *run_stack = vm->run_stacks[PAIR_ID_CURRENT];
+
+		/* Copy everything from the next run stack to the
+		 * current. Copy in reverse, so items that were pushed
+		 * earlier by greedier paths end up on the top of the
+		 * stack and are evaluated first, preserving greedy
+		 * ordering.
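+		 * For example, if the next stack holds [a, b, c] with a
+		 * pushed first (by the greediest thread), the current
+		 * stack becomes [c, b, a], so a is popped and run first.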
*/ + { + const uint32_t next_stack_h = vm->run_stacks_h[PAIR_ID_NEXT]; + const uint32_t *next_stack = vm->run_stacks[PAIR_ID_NEXT]; + uint32_t *next_path_info_heads = vm->path_info_heads[PAIR_ID_NEXT]; + uint32_t *cur_path_info_heads = vm->path_info_heads[PAIR_ID_CURRENT]; + + uint32_t discarded = 0; + for (size_t i = 0; i < next_stack_h; i++) { + const uint32_t op_id = next_stack[i]; + if (op_id == NO_ID) { + assert(!"unreachable"); + discarded++; + continue; + } + + cur_path_info_heads[op_id] = next_path_info_heads[op_id]; + LOG(3, "%s: run_stack[%zd] <- %u, path_info_head %u\n", + __func__, i, op_id, cur_path_info_heads[op_id]); + assert(next_path_info_heads[op_id] < vm->paths.ceil); + next_path_info_heads[op_id] = NO_ID; /* move reference */ +#if CAPVM_STATS + vm->uniq_ids[PAIR_ID_CURRENT][op_id] = + vm->uniq_ids[PAIR_ID_NEXT][op_id]; +#endif + run_stack[next_stack_h - i - 1 - discarded] = op_id; + } + *stack_h = next_stack_h - discarded; + vm->run_stacks_h[PAIR_ID_NEXT] = 0; + +#if CAPVM_PATH_STATS + /* reset counters */ + for (size_t i = 0; i < vm->paths.ceil; i++) { + struct capvm_path_info *pi = &vm->paths.pool[i]; + if (IS_PATH_NODE(pi)) { + pi->u.path.bits_added_per_input_character = 0; + } + } +#endif + } + + uint32_t *path_info_heads = vm->path_info_heads[PAIR_ID_CURRENT]; + while (vm->run_stacks_h[PAIR_ID_CURRENT] > 0) { + /* Do this here, before popping, so that the reference + * on the stack can be counted properly. */ + if (EXPENSIVE_CHECKS) { + check_path_table(vm); + } + + const uint32_t h = --(*stack_h); + assert(h < vm->p->used); + const uint32_t op_id = run_stack[h]; + LOG(4, "%s: popped op_id %d off stack\n", __func__, op_id); + if (op_id == NO_ID) { + LOG(4, "%s: ignoring halted pending execution\n", __func__); + continue; + } + assert(op_id < vm->p->used); + + if (GET_BIT32(vm->evaluated[PAIR_ID_CURRENT], op_id)) { + LOG(2, "%s: evaluated[current] already set for op_id %u (popped off stack), skipping\n", + __func__, op_id); + assert(!"unreachable"); + continue; + } + + LOG(4, "%s: setting evaluated[current] for op_id %u (popped off stack)\n", __func__, op_id); + SET_BIT32(vm->evaluated[PAIR_ID_CURRENT], op_id); + + const uint32_t path_info_head = path_info_heads[op_id]; + LOG(4, "%s: op_id %d's path_info_head: %d\n", __func__, op_id, path_info_head); + path_info_heads[op_id] = NO_ID; + + +#if CAPVM_STATS + const uint32_t uniq_id = vm->uniq_ids[PAIR_ID_CURRENT][op_id]; + assert(uniq_id != NO_ID); +#else + const uint32_t uniq_id = 0; +#endif + eval_vm_advance_greediest(vm, i_i, path_info_head, uniq_id, op_id); + } + + +#if CAPVM_PATH_STATS + uint32_t max_path_bits_added = 0; + for (size_t i = 0; i < vm->paths.ceil; i++) { + const struct capvm_path_info *pi = &vm->paths.pool[i]; + if (IS_PATH_NODE(pi)) { + if (pi->u.path.bits_added_per_input_character > max_path_bits_added) { + max_path_bits_added = pi->u.path.bits_added_per_input_character; + } + } + } + LOG(2, "%s: input_i %u: max_path_bits_added: %u\n", + __func__, i_i, max_path_bits_added); + if (max_path_bits_added > vm->stats.max_bits_added_per_input_character) { + vm->stats.max_bits_added_per_input_character = max_path_bits_added; + } + + if (CAPVM_PATH_STATS > 1) { + dump_path_table(stderr, vm); + } +#endif + } + + return vm->solution.best_path_id != NO_ID; +} + +static bool +copy_path_info(struct capvm *vm, uint32_t path_info_head, + uint32_t *new_path_info_head) +{ + if (!reserve_path_info_link(vm, new_path_info_head)) { + return false; + } + + assert(path_info_head != NO_ID); + assert(path_info_head < 
vm->paths.ceil); + assert(*new_path_info_head < vm->paths.ceil); + assert(*new_path_info_head != path_info_head); + + /* Since this is the path head, it can never be a collapsed + * zero prefix node. */ + const struct capvm_path_info *pi = &vm->paths.pool[path_info_head]; + assert(IS_PATH_NODE(pi)); + + struct capvm_path_info *npi = &vm->paths.pool[*new_path_info_head]; + assert(IS_PATH_FREELIST(npi)); + + /* unlink from freelist */ + vm->paths.freelist_head = npi->u.freelist_node.freelist; + vm->paths.live++; + if (vm->paths.live > vm->paths.max_live) { + set_max_paths_live(vm); + } + + *npi = (struct capvm_path_info){ + .u.path = { + .refcount = 1, + .used = pi->u.path.used, + .bits = pi->u.path.bits, + .offset = pi->u.path.offset, + .backlink = pi->u.path.backlink, + } + }; + if (pi->u.path.backlink != NO_ID) { + inc_path_node_refcount(vm, pi->u.path.backlink); + } + return true; +} + +#if CAPVM_PATH_STATS +static void +update_max_path_length_memory(struct capvm *vm, const struct capvm_path_info *pi) +{ + const uint32_t len = pi->u.path.used + + (pi->u.path.backlink == COLLAPSED_ZERO_PREFIX_ID + ? 0 /* not actually stored, so don't count it */ + : pi->u.path.offset); + + if (len > vm->stats.max_path_length_memory) { + vm->stats.max_path_length_memory = len; + } +} +#endif + +static bool +extend_path_info(struct capvm *vm, uint32_t pi_id, bool greedy, uint32_t uniq_id, + uint32_t *new_path_info_head) +{ + assert(pi_id < vm->paths.ceil); + struct capvm_path_info *pi = &vm->paths.pool[pi_id]; + assert(IS_PATH_NODE(pi)); + + (void)uniq_id; + LOG_EXEC_PATH_FIND_SOLUTION(uniq_id, greedy); + +#define LOG_EPI 0 + LOG(5 - LOG_EPI, "%s: pi_id %u, greedy %d\n", + __func__, pi_id, greedy); + + + if (pi->u.path.used == PATH_LINK_BITS) { /* full */ + uint32_t npi_id; + if (!reserve_path_info_link(vm, &npi_id)) { + assert(!"alloc fail"); + } + pi = &vm->paths.pool[pi_id]; /* refresh stale pointer */ + LOG(5 - LOG_EPI, "%s: new head at %u (%u is full)\n", __func__, npi_id, pi_id); + assert(npi_id < vm->paths.ceil); + struct capvm_path_info *npi = &vm->paths.pool[npi_id]; + vm->paths.freelist_head = npi->u.freelist_node.freelist; + vm->paths.live++; + if (vm->paths.live > vm->paths.max_live) { + set_max_paths_live(vm); + } + + LOG(5 - LOG_EPI, "%s: npi_id %u refcount 1 (new link)\n", + __func__, npi_id); + npi->u.path.refcount = 1; + npi->u.path.offset = pi->u.path.offset + pi->u.path.used; + npi->u.path.bits = (greedy ? ((uint32_t)1 << 31) : 0); + LOG(5 - LOG_EPI, "%s: bits after: 0x%08x\n", __func__, npi->u.path.bits); + npi->u.path.used = 1; + +#if CAPVM_PATH_STATS + npi->u.path.bits_added_per_input_character = pi->u.path.bits_added_per_input_character + 1; +#endif + + /* If the path node is full of zero bits and it's either at the start, + * or its backlink is a COLLAPSED_ZERO_PREFIX_ID, then extend the + * backlink to a collapsed run of zeroes. The node's offset field + * indicates the prefix length. Long prefixes of zero bits tend to + * occur with an unanchored start loop. */ + if (pi->u.path.bits == (uint32_t)0 && USE_COLLAPSED_ZERO_PREFIX + && (pi->u.path.offset == 0 || pi->u.path.backlink == COLLAPSED_ZERO_PREFIX_ID)) { + release_path_info_link(vm, &pi_id); + pi_id = COLLAPSED_ZERO_PREFIX_ID; + +#if CAPVM_STATS + vm->stats.collapsed_zero_prefixes++; +#endif + } else { + /* Check if there's an existing full path node with + * exactly the same bits. If so, link backward to that + * and free the old full one, rather than saving it as + * a duplicate. 
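+			 * Two path heads can only collide here when their
+			 * entire branch histories are identical (same
+			 * bits, offset, and backlink chain), so sharing
+			 * one node is safe; CAPVM_STATS counts this as
+			 * path_prefixes_shared.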
*/ + const uint32_t old_path_bits = pi->u.path.bits; + const uint32_t old_path_offset = pi->u.path.offset; + const uint32_t old_path_backlink = pi->u.path.backlink; + + for (uint32_t epi_id = 0; epi_id < vm->paths.ceil; epi_id++) { + if (epi_id == pi_id) { continue; } + struct capvm_path_info *epi = &vm->paths.pool[epi_id]; + if (IS_PATH_FREELIST(epi)) { + continue; + } + + assert(IS_PATH_NODE(epi)); + if (epi->u.path.used == PATH_LINK_BITS + && epi->u.path.bits == old_path_bits + && epi->u.path.offset == old_path_offset + && epi->u.path.backlink == old_path_backlink) { + + if (LOG_CAPVM >= 4 || 1) { + const uint32_t refcount = get_path_node_refcount(vm, epi_id); + LOG(4 - LOG_EPI, "%s: pi[%u] refcount %u -> %u (reusing identical path backlink %u instead of %u)\n", + __func__, epi_id, refcount, refcount + 1, + epi_id, pi_id); + } + inc_path_node_refcount(vm, epi_id); + release_path_info_link(vm, &pi_id); + pi_id = epi_id; +#if CAPVM_STATS + vm->stats.path_prefixes_shared++; +#endif + break; + } + } + } + + assert(IS_PATH_NODE(npi)); + npi->u.path.backlink = pi_id; + /* transfer pi_id's reference to npi_id */ + *new_path_info_head = npi_id; + +#if CAPVM_PATH_STATS + update_max_path_length_memory(vm, npi); +#endif + + return true; + } else { + assert(IS_PATH_NODE(pi)); + assert(pi->u.path.used < PATH_LINK_BITS); + + LOG(5 - LOG_EPI, "%s: appending to head node %u, %u -> %u used\n", + __func__, pi_id, pi->u.path.used, pi->u.path.used + 1); + assert(pi->u.path.used < PATH_LINK_BITS); + if (greedy) { + LOG(5 - LOG_EPI, "%s: bits before: 0x%08x (greedy: %d)\n", + __func__, pi->u.path.bits, greedy); + pi->u.path.bits |= (uint32_t)1 << (31 - pi->u.path.used); + LOG(5 - LOG_EPI, "%s: bits after: 0x%08x\n", + __func__, pi->u.path.bits); + } + pi->u.path.used++; +#if CAPVM_PATH_STATS + pi->u.path.bits_added_per_input_character++; +#endif + +#if CAPVM_PATH_STATS + update_max_path_length_memory(vm, pi); +#endif + + *new_path_info_head = pi_id; + return true; + } +#undef LOG_EPI +} + +static void +populate_solution(struct capvm *vm) +{ + if (LOG_CAPVM >= 3) { + fsm_capvm_program_dump(stderr, vm->p); + LOG(0, "%s: best_path_id %d, tables:\n", __func__, vm->solution.best_path_id); + dump_path_table(stderr, vm); + check_path_table(vm); + fprintf(stderr, "SOLUTION_PATH: "); + print_path(stderr, vm, vm->solution.best_path_id); + fprintf(stderr, "\n"); + } + +#if CAPVM_PATH_STATS + LOG(1, "%s: prog_size %u, max_path_length_memory %u (bits)\n", + __func__, vm->p->used, vm->stats.max_path_length_memory); + const uint32_t uniq_id = vm->solution.best_path_uniq_id; +#else + const uint32_t uniq_id = NO_ID; +#endif + (void)uniq_id; + + /* Interpret the program again, but rather than using the input to + * drive execution, use the saved path for the primary solution. */ + + /* Walk the solution path, reversing the edges temporarily so it + * can be executed start to finish, and truncate any bits appended + * after branches on the path. 
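+	 * As in cmp_paths, the reversal reuses each node's backlink
+	 * field as a forward link; the original link order is restored
+	 * by the final loop in this function.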
*/ + assert(vm->solution.best_path_id != NO_ID); + assert(vm->solution.best_path_id < vm->paths.ceil); + + uint32_t path_link = vm->solution.best_path_id; + uint32_t next_link = NO_ID; + uint32_t next_offset = NO_POS; + uint32_t first_link = NO_ID; + + size_t split_count = 0; + uint32_t zero_prefix_length = 0; + + if (LOG_CAPVM >= 3) { + const struct capvm_path_info *pi = &vm->paths.pool[path_link]; + assert(!IS_PATH_FREELIST(pi)); + LOG(3, "%s: best_path %d, path_length %u\n", + __func__, vm->solution.best_path_id, pi->u.path.offset + pi->u.path.used); + if (LOG_CAPVM > 4) { + dump_path_table(stderr, vm); + } + } + + uint32_t prev; + do { + struct capvm_path_info *pi = &vm->paths.pool[path_link]; + assert(!IS_PATH_FREELIST(pi)); + const uint32_t prev_link = get_path_node_backlink(vm, path_link); + + if (LOG_CAPVM >= 3) { + if (IS_PATH_NODE(pi)) { + LOG(3, "%s (moving back), node %u: refcount %u, used %u, offset %u, backlink %d, bits '", + __func__, path_link, pi->u.path.refcount, pi->u.path.used, + pi->u.path.offset, pi->u.path.backlink); + for (uint8_t i = 0; i < pi->u.path.used; i++) { + const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); + LOG(3, "%c", bit ? '1' : '0'); + } + LOG(3, "'\n"); + } + } + + split_count += pi->u.path.used; + + if (next_link != NO_ID) { + LOG(3, "-- setting backlink to %d\n", next_link); + set_path_node_backlink(vm, path_link, next_link); /* point fwd */ + } else { + LOG(3, "-- setting backlink to %d\n", NO_ID); + set_path_node_backlink(vm, path_link, NO_ID); /* now EOL */ + } + + if (prev_link == NO_ID) { + first_link = path_link; + prev = prev_link; + } else if (prev_link == COLLAPSED_ZERO_PREFIX_ID) { + first_link = path_link; + split_count += pi->u.path.offset; + zero_prefix_length = pi->u.path.offset; + prev = prev_link; + } + + next_offset = get_path_node_offset(vm, path_link); + next_link = path_link; + assert(path_link != prev_link); + path_link = prev_link; + } while (path_link != NO_ID && path_link != COLLAPSED_ZERO_PREFIX_ID); + + /* iter forward */ + uint32_t cur = first_link; + if (LOG_CAPVM >= 3) do { + struct capvm_path_info *pi = &vm->paths.pool[cur]; + + assert(IS_PATH_NODE(pi)); + LOG(3, "%s (moving fwd): node %u: refcount %u, used %u, offset %u, fwdlink %d, bits '", + __func__, cur, get_path_node_refcount(vm, cur), + pi->u.path.used, + get_path_node_offset(vm, cur), + get_path_node_backlink(vm, cur)); + for (uint8_t i = 0; i < pi->u.path.used; i++) { + const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); + LOG(3, "%c", bit ? '1' : '0'); + } + LOG(3, "'\n"); + + const uint32_t next_cur = get_path_node_backlink(vm, cur); + assert(cur != next_cur); + cur = next_cur; /* fwd link */ + } while (cur != NO_ID); + + /* evaluate program with forward path */ + LOG(3, "%s: split_count %zu\n", __func__, split_count); + size_t split_i = 0; + uint32_t prog_i = 0; + uint32_t input_i = 0; + size_t capture_lookup_steps = 0; + bool done = false; + + /* This flag tracks whether an explicit newline was matched at + * the end of input. Normally a trailing newline is implicitly + * ignored in the bounds for captures, but when the regex + * matches a newline at the end, it must still be included. An + * example case where this matters is `^[^x]$` for "\n", because + * the character class matches the newline this should capture + * as (0,1). 
*/ + bool explicitly_matched_nl_at_end = false; + + cur = first_link; + while (split_i < split_count || !done) { + assert(prog_i < vm->p->used); + const uint32_t cur_prog_i = prog_i; + const struct capvm_opcode *op = &vm->p->ops[cur_prog_i]; + LOG(3, "%s: i_i %u, p_i %u, s_i %zu/%zu, op %s\n", + __func__, input_i, cur_prog_i, split_i, split_count, op_name[op->t]); + + prog_i++; + capture_lookup_steps++; + switch (op->t) { + case CAPVM_OP_CHAR: + assert(input_i < vm->input_len); + LOG(3, "OP_CHAR: input_i %u, exp char '%c', got '%c'\n", + input_i, op->u.chr, vm->input[input_i]); + assert(vm->input[input_i] == op->u.chr); + if (vm->input_len > 0 + && input_i == vm->input_len - 1 + && vm->input[input_i] == '\n') { + explicitly_matched_nl_at_end = true; + } + input_i++; + break; + case CAPVM_OP_CHARCLASS: + assert(input_i < vm->input_len); + if (vm->input_len > 0 + && input_i == vm->input_len - 1 + && vm->input[input_i] == '\n') { + explicitly_matched_nl_at_end = true; + } + input_i++; + break; + case CAPVM_OP_MATCH: + LOG(2, "split_i %zu, split_count %zu\n", split_i, split_count); + assert(split_i == split_count); + done = true; + break; + case CAPVM_OP_JMP: + prog_i = op->u.jmp; + break; + case CAPVM_OP_JMP_ONCE: + { + /* look at next bit of path and jmp or fall through */ + const uint32_t offset = get_path_node_offset(vm, cur); + const struct capvm_path_info *pi = &vm->paths.pool[cur]; + + assert(IS_PATH_NODE(pi)); + bool next_bit; + LOG(3, "%s: OP_JMP_ONCE: split_i %zu, zpl %u, offset %u, pi->u.path.used %u\n", + __func__, split_i, zero_prefix_length, offset, pi->u.path.used); + if (split_i < zero_prefix_length) { + next_bit = 0; + } else { + assert(split_i >= offset && + split_i <= offset + pi->u.path.used); + const uint32_t shift = 31 - (split_i & 31); + assert(shift < PATH_LINK_BITS); + next_bit = (pi->u.path.bits & ((uint32_t)1 << shift)) != 0; + } + LOG(3, "jmp_once: next_bit %d\n", next_bit); + LOG_EXEC_PATH_SAVE_CAPTURES(uniq_id, next_bit); + if (next_bit) { /* greedy edge */ + prog_i = op->u.jmp_once; + } else { /* non-greedy edge */ + /* fall through */ + } + split_i++; + if (split_i >= offset && + split_i - offset == pi->u.path.used && split_i < split_count) { + const uint32_t backlink = get_path_node_backlink(vm, cur); + assert(backlink != NO_ID); + cur = backlink; + } + LOG(3, "%s: prog_i now %u, split_i %zu/%zu\n", + __func__, prog_i, split_i, split_count); + assert(split_i <= split_count); + break; + } + case CAPVM_OP_SPLIT: + { + /* look at next bit of path and act accordingly */ + const uint32_t offset = get_path_node_offset(vm, cur); + const struct capvm_path_info *pi = &vm->paths.pool[cur]; + + const uint32_t dst_cont = op->u.split.cont; + const uint32_t dst_new = op->u.split.new; + + assert(IS_PATH_NODE(pi)); + bool next_bit; + LOG(3, "%s: OP_SPLIT_CONT: split_i %zu, zpl %u, offset %u, pi->u.path.used %u\n", + __func__, split_i, zero_prefix_length, offset, pi->u.path.used); + if (split_i < zero_prefix_length) { + next_bit = 0; + } else { + assert(split_i >= offset && + split_i <= offset + pi->u.path.used); + const uint32_t shift = 31 - (split_i & 31); + assert(shift < PATH_LINK_BITS); + next_bit = (pi->u.path.bits & ((uint32_t)1 << shift)) != 0; + } + LOG(3, "split: next_bit %d\n", next_bit); + LOG_EXEC_PATH_SAVE_CAPTURES(uniq_id, next_bit); + if (next_bit) { /* greedy edge */ + prog_i = dst_cont; + } else { /* non-greedy edge */ + prog_i = dst_new; + } + split_i++; + if (split_i >= offset && + split_i - offset == pi->u.path.used && split_i < split_count) { + const 
uint32_t backlink = get_path_node_backlink(vm, cur); + assert(backlink != NO_ID); + cur = backlink; + } + LOG(3, "%s: prog_i now %u, split_i %zu/%zu\n", + __func__, prog_i, split_i, split_count); + assert(split_i <= split_count); + + break; + } + case CAPVM_OP_SAVE: + { + const unsigned capture_id = op->u.save/2; + const bool is_end = (op->u.save & 1) == 1; + + LOG(5, "%s: input_i %u, save %d -> capture %d pos %d, cur value %zd, prev char 0x%02x\n", + __func__, + input_i, op->u.save, + capture_id, is_end, + vm->capture_buf[op->u.save/2].pos[op->u.save & 1], + input_i > 0 ? vm->input[input_i - 1] : 0xff); + + /* Special case to ignore a trailing + * newline when capturing, unless the + * newline was explicitly matched as the + * last character of input. */ + if (input_i > 0 + && !explicitly_matched_nl_at_end + && input_i == vm->input_len + && vm->input[input_i - 1] == '\n') { + LOG(3, "%s: updating capture[%u].pos[1] to ignore trailing '\\n' at %u\n", + __func__, capture_id, input_i); + vm->capture_buf[capture_id].pos[is_end] = input_i - 1; + } else { + /* Save current position to appropriate capture buffer endpoint */ + vm->capture_buf[op->u.save/2].pos[op->u.save & 1] = input_i; + LOG(3, "%s: saved capture[%d].pos[%d] <- %u\n", + __func__, op->u.save/2, op->u.save&1, input_i); + } + break; + } + case CAPVM_OP_ANCHOR: + if (op->u.anchor == CAPVM_ANCHOR_START) { + assert(input_i == 0 + || (input_i == 1 + && vm->input_len == 1 + && vm->input[0] == '\n')); + } else { + assert(op->u.anchor == CAPVM_ANCHOR_END); + LOG(3, "%s: $ anchor: input_len %u, input_i %u\n", + __func__, vm->input_len, input_i); + + if (vm->input_len > 0 && input_i == vm->input_len - 1) { + /* special hack to not include trailing newline + * in match group zero */ + if (vm->p->capture_count > 0) { + vm->capture_buf[0].pos[1] = input_i; + } + + assert(vm->input[input_i] == '\n'); + input_i++; + } else { + assert(input_i == vm->input_len); + } + } + break; + + default: + assert(!"match fail"); + } + } + + /* write solution into caller's buffers and print */ + if (LOG_SOLUTION_TO_STDOUT) { + /* fprintf(stderr, "capture_count %u\n", vm->p->capture_count); */ + printf("HIT:"); + for (unsigned i = 0; i < vm->p->capture_count; i++) { + printf(" %zd %zd", + vm->capture_buf[i].pos[0], vm->capture_buf[i].pos[1]); + } + printf("\n"); + } + + /* restore original link order */ + cur = first_link; + do { + struct capvm_path_info *pi = &vm->paths.pool[cur]; + assert(!IS_PATH_FREELIST(pi)); + const uint32_t backlink = get_path_node_backlink(vm, cur); + + LOG(3, "%s (moving fwd again): node %u: refcount %u, used %u, offset %u, fwdlink %d, bits '", + __func__, cur, get_path_node_refcount(vm, cur), + pi->u.path.used, + get_path_node_offset(vm, cur), + backlink); + for (uint8_t i = 0; i < pi->u.path.used; i++) { + const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); + LOG(3, "%c", (pi->u.path.bits & bit) ? '1' : '0'); + } + LOG(3, "'\n"); + + LOG(3, "-- setting node %u's backlink to %d\n", cur, prev); + const uint32_t next = backlink; + set_path_node_backlink(vm, cur, prev); + + prev = cur; + cur = next; /* fwd link */ + } while (cur != NO_ID); +} + +/* TODO: It should be possible to avoid dynamic allocation here + * by calculating the max space needed upfront and passing in a + * uint32_t or uint64_t-aligned array for working space. 
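
For reference, the SAVE operand packs a capture ID and an endpoint selector into one integer, matching the (op->u.save / 2) and (op->u.save & 1) uses above. A sketch with hypothetical helper names:

static unsigned
save_capture_id(unsigned save)
{
    return save >> 1;
}

static unsigned
save_is_end(unsigned save)
{
    return save & 1u; /* 0: start position, 1: end position */
}

For example, save == 5 addresses the end position of capture 2.
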
*/ + +enum fsm_capvm_program_exec_res +fsm_capvm_program_exec(const struct capvm_program *program, + const uint8_t *input, size_t length, + struct fsm_capture *capture_buf, size_t capture_buf_length) +{ + assert(program != NULL); + assert(input != NULL || length == 0); + assert(capture_buf != NULL); + + const size_t thread_max = program->used; + + /* FIXME: The path node table can grow beyond this, but in + * practice will usually stay fairly small. The worst case + * should be decidable based on the compiled program and input + * length. */ +#if ALLOW_PATH_TABLE_RESIZING + const size_t path_info_max = thread_max; +#else + const size_t path_info_max = 3 * thread_max; +#endif + + struct capvm_path_info *path_info_pool = malloc(path_info_max + * sizeof(path_info_pool[0])); + if (path_info_pool == NULL) { + return FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC; + } + assert(path_info_pool != NULL); + + /* link path_info freelist */ + for (size_t i = 1; i < path_info_max - 1; i++) { + struct capvm_path_info *pi = &path_info_pool[i]; + pi->u.freelist_node.refcount = 0; + pi->u.freelist_node.freelist = i + 1; + } + struct capvm_path_info *piZ = &path_info_pool[path_info_max - 1]; + piZ->u.freelist_node.refcount = 0; + piZ->u.freelist_node.freelist = NO_ID; + + /* init an empty path descriptor for initial execution */ + struct capvm_path_info *pi0 = &path_info_pool[0]; + pi0->u.path.refcount = 1; + pi0->u.path.used = 0; + pi0->u.path.bits = 0; + pi0->u.path.offset = 0; + pi0->u.path.backlink = NO_ID; + + uint32_t stack_a[thread_max]; + uint32_t stack_b[thread_max]; + + const size_t evaluated_bit_words = program->used/32 + 1; + uint32_t evaluated_a[evaluated_bit_words]; + uint32_t evaluated_b[evaluated_bit_words]; + uint32_t path_info_head_a[thread_max]; + uint32_t path_info_head_b[thread_max]; +#if CAPVM_STATS + uint32_t uniq_ids_a[thread_max]; + uint32_t uniq_ids_b[thread_max]; +#endif + + assert(capture_buf_length >= program->capture_base + program->capture_count); + + struct fsm_capture *offset_capture_buf = &capture_buf[program->capture_base]; + + struct capvm vm = { + .res = FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND, + .p = program, + .input = input, + .input_len = length, + .capture_buf = offset_capture_buf, + .capture_buf_length = capture_buf_length, + .step_limit = SIZE_MAX, +#if CAPVM_STATS + .uniq_id_counter = 0, +#endif + + .run_stacks = { stack_a, stack_b }, + .evaluated = { evaluated_a, evaluated_b }, + .path_info_heads = { path_info_head_a, path_info_head_b }, +#if CAPVM_STATS + .uniq_ids = { uniq_ids_a, uniq_ids_b }, +#endif + + .paths = { + .ceil = path_info_max, + .live = 1, + .max_live = 1, + .freelist_head = 1, + .pool = path_info_pool, + }, + .solution = { + .best_path_id = NO_ID, + }, + }; + + /* enqueue execution at first opcode */ + vm.run_stacks[PAIR_ID_NEXT][0] = 0; + vm.run_stacks_h[PAIR_ID_NEXT] = 1; + vm.threads.live = 1; + vm.threads.max_live = 1; + vm.path_info_heads[PAIR_ID_NEXT][0] = 0; + +#if CAPVM_STATS + vm.uniq_ids[PAIR_ID_NEXT][0] = 0; +#endif + + INIT_TIMERS(); + TIME(&pre); + if (eval_vm(&vm)) { + assert(vm.threads.live == 0); + assert(vm.paths.live > 0); + + populate_solution(&vm); + release_path_info_link(&vm, &vm.solution.best_path_id); + vm.res = FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN; + + /* TODO: This assert will not work if refcounts are + * sticky at the max value, but if the number of paths + * and threads is bounded then it shouldn't be possible + * to overflow the refcount anyway. 
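
The evaluated_a/evaluated_b buffers above act as bitsets over opcode indices, hence the sizing program->used/32 + 1 words of uint32_t. A sketch of the two operations such a bitset needs, so that CAPVM_OP_JMP_ONCE only revisits an opcode after the input position advances; the helper names are assumptions, not the library's API:

#include <stdint.h>
#include <stdbool.h>

static bool
evaluated_get(const uint32_t *words, uint32_t prog_i)
{
    return (words[prog_i / 32] >> (prog_i % 32)) & 1u;
}

static void
evaluated_set(uint32_t *words, uint32_t prog_i)
{
    words[prog_i / 32] |= (uint32_t)1 << (prog_i % 32);
}
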
If sticky refcounts
+ * are used then reaching one should probably set a
+ * flag, which would skip this assertion. */
+ assert(vm.paths.live == 0);
+ } else {
+ assert(vm.res == FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND);
+ }
+
+ TIME(&post);
+ DIFF_MSEC(__func__, pre, post, NULL);
+
+#if CAPVM_STATS
+ LOG(2, "%s: %zu steps, max_threads %u, max_paths %u, matches %u, path_prefixes_shared %u, collapsed_zero_prefixes %u\n",
+ __func__, vm.stats.steps, vm.threads.max_live, vm.paths.max_live, vm.stats.matches,
+ vm.stats.path_prefixes_shared, vm.stats.collapsed_zero_prefixes);
+#if CAPVM_PATH_STATS
+ LOG(2, "%s: prog_size %u, max_path_length_memory %u (bits), input length %zu, max_paths * %zu bytes/path => %zu bytes\n",
+ __func__, vm.p->used, vm.stats.max_path_length_memory, length,
+ sizeof(vm.paths.pool[0]),
+ vm.paths.max_live * sizeof(vm.paths.pool[0]));
+#endif
+#endif
+
+ free(vm.paths.pool);
+ return vm.res;
+}
diff --git a/src/libfsm/capture_vm_log.h b/src/libfsm/capture_vm_log.h
new file mode 100644
index 000000000..8ff51d8b4
--- /dev/null
+++ b/src/libfsm/capture_vm_log.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2022 Scott Vokes
+ *
+ * See LICENCE for the full copyright terms.
+ */
+
+#ifndef CAPTURE_VM_LOG_H
+#define CAPTURE_VM_LOG_H
+
+#include <stdio.h>
+
+#define LOG_CAPVM (1+0)
+#define LOG(LEVEL, ...) \
+ do { \
+ if ((LEVEL) <= LOG_CAPVM) { \
+ fprintf(stderr, __VA_ARGS__); \
+ } \
+ } while(0)
+
+
+#endif
diff --git a/src/libfsm/capture_vm_program.h b/src/libfsm/capture_vm_program.h
new file mode 100644
index 000000000..15e585761
--- /dev/null
+++ b/src/libfsm/capture_vm_program.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2022 Scott Vokes
+ *
+ * See LICENCE for the full copyright terms.
+ */
+
+#ifndef CAPTURE_VM_PROGRAM_H
+#define CAPTURE_VM_PROGRAM_H
+
+#include <stdint.h>
+
+struct capvm_program {
+ uint32_t capture_count;
+ uint32_t capture_base;
+
+ uint32_t used;
+ uint32_t ceil;
+ struct capvm_opcode {
+ enum capvm_opcode_type {
+ /* Next character of input == .u.chr */
+ CAPVM_OP_CHAR,
+ /* Next character of input is in char class */
+ CAPVM_OP_CHARCLASS,
+ /* Input has matched */
+ CAPVM_OP_MATCH,
+ /* Unconditional jump */
+ CAPVM_OP_JMP,
+ /* If destination has already been evaluated
+ * since advancing the input position, fall
+ * through to next instruction, otherwise jmp. */
+ CAPVM_OP_JMP_ONCE,
+ /* Split execution to two paths, where .cont
+ * offset is greedier than .new's offset. */
+ CAPVM_OP_SPLIT,
+ /* Save current input position as capture bound */
+ CAPVM_OP_SAVE,
+ /* Check if current input position is at start/end
+ * of input, after accounting for PCRE's special
+ * cases for a trailing newline. */
+ CAPVM_OP_ANCHOR,
+ } t;
+ union {
+ uint8_t chr;
+ uint32_t charclass_id;
+ uint32_t jmp; /* absolute */
+ uint32_t jmp_once; /* absolute */
+ struct {
+ uint32_t cont; /* greedy branch */
+ uint32_t new; /* non-greedy branch */
+ } split;
+ /* (save >> 1): capture ID,
+ * (save & 0x01): save pos to start (0b0) or end (0b1). */
+ uint32_t save;
+ enum capvm_anchor_type {
+ CAPVM_ANCHOR_START,
+ CAPVM_ANCHOR_END,
+ } anchor;
+ } u;
+ } *ops;
+
+ /* Most compiled programs only use a few distinct character
+ * classes (if any), and the data is much larger than the
+ * other instructions, so they are stored in a separate
+ * table and referred to by op->u.charclass_id.
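
To make the opcode layout concrete, here is a hypothetical, hand-assembled program for the pattern /a(b*)c/, using the declarations above. This is illustrative only, not compiler output, and it omits the SAVEs for capture 0 that a complete program would include:

static const struct capvm_opcode example_ops[] = {
    /* 0 */ { .t = CAPVM_OP_CHAR,  .u = { .chr = 'a' } },
    /* 1 */ { .t = CAPVM_OP_SAVE,  .u = { .save = 2 } },  /* capture 1, start */
    /* 2 */ { .t = CAPVM_OP_SPLIT, .u = { .split = { .cont = 3, .new = 5 } } },
    /* 3 */ { .t = CAPVM_OP_CHAR,  .u = { .chr = 'b' } }, /* greedy loop body */
    /* 4 */ { .t = CAPVM_OP_JMP,   .u = { .jmp = 2 } },
    /* 5 */ { .t = CAPVM_OP_SAVE,  .u = { .save = 3 } },  /* capture 1, end */
    /* 6 */ { .t = CAPVM_OP_CHAR,  .u = { .chr = 'c' } },
    /* 7 */ { .t = CAPVM_OP_MATCH },
};

At op 2 the greedy path takes .cont (another 'b') before .new (exiting the loop), per the SPLIT convention documented in the enum.
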
*/ + struct capvm_char_classes { + uint32_t count; + uint32_t ceil; + struct capvm_char_class { + uint64_t octets[4]; /* 256-bitset */ + } *sets; + } char_classes; +}; + +#endif diff --git a/src/libfsm/clone.c b/src/libfsm/clone.c index bec16bb0f..8b7b606c8 100644 --- a/src/libfsm/clone.c +++ b/src/libfsm/clone.c @@ -21,10 +21,7 @@ #define LOG_CLONE_ENDIDS 0 static int -copy_capture_actions(struct fsm *dst, const struct fsm *src); - -static int -copy_end_ids(struct fsm *dst, const struct fsm *src); +copy_end_metadata(struct fsm *dst, const struct fsm *src); struct fsm * fsm_clone(const struct fsm *fsm) @@ -70,12 +67,7 @@ fsm_clone(const struct fsm *fsm) } { - if (!copy_capture_actions(new, fsm)) { - fsm_free(new); - return NULL; - } - - if (!copy_end_ids(new, fsm)) { + if (!copy_end_metadata(new, fsm)) { fsm_free(new); return NULL; } @@ -84,75 +76,81 @@ fsm_clone(const struct fsm *fsm) return new; } -struct copy_capture_actions_env { +struct copy_end_ids_env { + char tag; struct fsm *dst; int ok; }; static int -copy_capture_actions_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) +copy_end_ids_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, const fsm_end_id_t id, void *opaque) { - struct copy_capture_actions_env *env = opaque; - assert(env->dst); + struct copy_end_ids_env *env = opaque; + enum fsm_endid_set_res sres; + assert(env->tag == 'c'); + (void)fsm; + (void)nth; - if (!fsm_capture_add_action(env->dst, - state, type, capture_id, to)) { +#if LOG_CLONE_ENDIDS + fprintf(stderr, "clone[%d] <- %d\n", state, id); +#endif + + sres = fsm_endid_set(env->dst, state, id); + if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { env->ok = 0; + return 0; } - return env->ok; + return 1; } static int -copy_capture_actions(struct fsm *dst, const struct fsm *src) +copy_active_capture_ids_cb(fsm_state_t state, unsigned capture_id, void *opaque) { - struct copy_capture_actions_env env = { NULL, 1 }; - env.dst = dst; + struct copy_end_ids_env *env = opaque; - fsm_capture_action_iter(src, - copy_capture_actions_cb, &env); - return env.ok; + if (!fsm_capture_set_active_for_end(env->dst, + capture_id, + state)) { + env->ok = 0; + return 0; + } + return 1; } -struct copy_end_ids_env { - char tag; - struct fsm *dst; - const struct fsm *src; - int ok; -}; - static int -copy_end_ids_cb(fsm_state_t state, const fsm_end_id_t id, void *opaque) +associate_capture_programs_cb(fsm_state_t state, unsigned prog_id, void *opaque) { struct copy_end_ids_env *env = opaque; - enum fsm_endid_set_res sres; - assert(env->tag == 'c'); -#if LOG_CLONE_ENDIDS - fprintf(stderr, "clone[%d] <- %d\n", state, id); -#endif - - sres = fsm_endid_set(env->dst, state, id); - if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { + if (!fsm_capture_associate_program_with_end_state(env->dst, + prog_id, state)) { env->ok = 0; return 0; } - return 1; } static int -copy_end_ids(struct fsm *dst, const struct fsm *src) +copy_end_metadata(struct fsm *dst, const struct fsm *src) { struct copy_end_ids_env env; env.tag = 'c'; /* for clone */ env.dst = dst; - env.src = src; env.ok = 1; fsm_endid_iter(src, copy_end_ids_cb, &env); + fsm_capture_iter_active_for_all_end_states(src, + copy_active_capture_ids_cb, &env); + + if (!fsm_capture_copy_programs(src, dst)) { + return 0; + } + + fsm_capture_iter_program_ids_for_all_end_states(src, + associate_capture_programs_cb, &env); + return env.ok; } diff --git a/src/libfsm/consolidate.c b/src/libfsm/consolidate.c index 390b2a0c5..b34e69910 100644 --- 
a/src/libfsm/consolidate.c +++ b/src/libfsm/consolidate.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" #include "capture.h" @@ -26,27 +27,15 @@ #define LOG_MAPPING 0 #define LOG_CONSOLIDATE_CAPTURES 0 -#define LOG_CONSOLIDATE_ENDIDS 0 +#define LOG_CONSOLIDATE_END_METADATA 0 struct mapping_closure { size_t count; const fsm_state_t *mapping; }; -struct consolidate_copy_capture_actions_env { - char tag; - struct fsm *dst; - size_t mapping_count; - const fsm_state_t *mapping; - int ok; -}; - -static int -consolidate_copy_capture_actions(struct fsm *dst, const struct fsm *src, - const fsm_state_t *mapping, size_t mapping_count); - static int -consolidate_end_ids(struct fsm *dst, const struct fsm *src, +consolidate_end_metadata(struct fsm *dst, const struct fsm *src, const fsm_state_t *mapping, size_t mapping_count); static fsm_state_t @@ -68,6 +57,9 @@ fsm_consolidate(const struct fsm *src, size_t max_used = 0; assert(src != NULL); + if (mapping_count == 0) { + return fsm_clone(src); + } assert(src->opt != NULL); dst = fsm_new(src->opt); @@ -76,12 +68,14 @@ fsm_consolidate(const struct fsm *src, } for (src_i = 0; src_i < mapping_count; src_i++) { + const fsm_state_t dst_i = mapping[src_i]; #if LOG_MAPPING fprintf(stderr, "consolidate_mapping[%u]: %u\n", src_i, mapping[src_i]); #endif - if (mapping[src_i] >= max_used) { - max_used = mapping[src_i]; + if (dst_i > max_used) { + assert(dst_i != FSM_STATE_REMAP_NO_STATE); + max_used = dst_i; } } @@ -96,8 +90,8 @@ fsm_consolidate(const struct fsm *src, goto cleanup; } -#define DST_SEEN(I) (seen[I/64] & ((uint64_t)1 << (I&63))) -#define SET_DST_SEEN(I) (seen[I/64] |= ((uint64_t)1 << (I&63))) +#define DST_SEEN(I) u64bitset_get(seen, I) +#define SET_DST_SEEN(I) u64bitset_set(seen, I) /* map N states to M states, where N >= M. * if it's the first time state[M] is seen, @@ -110,6 +104,9 @@ fsm_consolidate(const struct fsm *src, for (src_i = 0; src_i < mapping_count; src_i++) { const fsm_state_t dst_i = mapping[src_i]; + /* fsm_consolidate does not currently support discarding states. 
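
For a concrete picture of the mapping argument (values invented, and the call signature assumed from the surrounding diff):

/* Merge src states {0,1} into dst state 0 and renumber src state 2
 * to dst state 1; the consolidated FSM then has two states. */
static struct fsm *
merge_first_two(const struct fsm *src)
{
    const fsm_state_t mapping[] = { 0, 0, 1 };

    return fsm_consolidate(src, mapping, 3);
}
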
*/ + assert(dst_i != FSM_STATE_REMAP_NO_STATE); + if (!DST_SEEN(dst_i)) { SET_DST_SEEN(dst_i); @@ -134,11 +131,11 @@ fsm_consolidate(const struct fsm *src, } } - if (!consolidate_copy_capture_actions(dst, src, mapping, mapping_count)) { + if (!fsm_capture_copy_programs(src, dst)) { goto cleanup; } - if (!consolidate_end_ids(dst, src, mapping, mapping_count)) { + if (!consolidate_end_metadata(dst, src, mapping, mapping_count)) { goto cleanup; } @@ -161,31 +158,42 @@ fsm_consolidate(const struct fsm *src, return NULL; } +struct consolidate_end_ids_env { + char tag; + struct fsm *dst; + const struct fsm *src; + const fsm_state_t *mapping; + size_t mapping_count; + int ok; +}; + static int -consolidate_copy_capture_actions_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) +consolidate_end_ids_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, const fsm_end_id_t id, void *opaque) { - struct consolidate_copy_capture_actions_env *env = opaque; - fsm_state_t s, t; - + struct consolidate_end_ids_env *env = opaque; + enum fsm_endid_set_res sres; + fsm_state_t s; assert(env->tag == 'C'); -#if LOG_CONSOLIDATE_CAPTURES - fprintf(stderr, "consolidate_copy_capture_actions_cb: state %u, type %s, ID %u, TO %d\n", - state, - fsm_capture_action_type_name[type], - capture_id, to); + (void)fsm; + (void)nth; + +#if LOG_CONSOLIDATE_END_METADATA > 1 + fprintf(stderr, "consolidate_end_ids_cb: state %u, ID[%zu]: %d\n", + state, nth, id); + fprintf(stderr, " -- mapping_count %zu\n", + env->mapping_count); #endif - assert(state < env->mapping_count); - assert(to == CAPTURE_NO_STATE || to < env->mapping_count); s = env->mapping[state]; - t = to == CAPTURE_NO_STATE - ? CAPTURE_NO_STATE : env->mapping[to]; - if (!fsm_capture_add_action(env->dst, - s, type, capture_id, t)) { +#if LOG_CONSOLIDATE_END_METADATA > 1 + fprintf(stderr, "consolidate[%d] <- %d\n", s, id); +#endif + + sres = fsm_endid_set(env->dst, s, id); + if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { env->ok = 0; return 0; } @@ -194,76 +202,54 @@ consolidate_copy_capture_actions_cb(fsm_state_t state, } static int -consolidate_copy_capture_actions(struct fsm *dst, const struct fsm *src, - const fsm_state_t *mapping, size_t mapping_count) +consolidate_active_captures_cb(fsm_state_t state, unsigned capture_id, + void *opaque) { - size_t i; + struct consolidate_end_ids_env *env = opaque; + fsm_state_t dst_s; + assert(env->tag == 'C'); - struct consolidate_copy_capture_actions_env env; - env.tag = 'C'; - env.dst = dst; - env.mapping_count = mapping_count; - env.mapping = mapping; - env.ok = 1; + assert(state < env->mapping_count); + dst_s = env->mapping[state]; -#if LOG_MAPPING - for (i = 0; i < mapping_count; i++) { - fprintf(stderr, "mapping[%lu]: %u\n", i, mapping[i]); - } -#else - (void)i; +#if LOG_CONSOLIDATE_END_METADATA + fprintf(stderr, "consolidate_active_captures_cb: state %d -> dst_s %d, capture_id %u\n", + state, dst_s, capture_id); #endif - fsm_capture_action_iter(src, - consolidate_copy_capture_actions_cb, &env); - return env.ok; + if (!fsm_capture_set_active_for_end(env->dst, capture_id, dst_s)) { + env->ok = 0; + return 0; + } + return 1; } -struct consolidate_end_ids_env { - char tag; - struct fsm *dst; - const struct fsm *src; - const fsm_state_t *mapping; - size_t mapping_count; - int ok; -}; - static int -consolidate_end_ids_cb(fsm_state_t state, const fsm_end_id_t id, +consolidate_capture_programs_cb(fsm_state_t state, unsigned program_id, void *opaque) { struct 
consolidate_end_ids_env *env = opaque; - enum fsm_endid_set_res sres; - fsm_state_t s; + fsm_state_t dst_s; assert(env->tag == 'C'); -#if LOG_CONSOLIDATE_ENDIDS > 1 - fprintf(stderr, "consolidate_end_ids_cb: state %u, count %lu, ID %d:", - state, id); - for (i = 0; i < count; i++) { - fprintf(stderr, " %u", ids[i]); - } - fprintf(stderr, "\n"); -#endif - assert(state < env->mapping_count); - s = env->mapping[state]; + dst_s = env->mapping[state]; -#if LOG_CONSOLIDATE_ENDIDS > 1 - fprintf(stderr, "consolidate[%d] <- %d\n", s, id); +#if LOG_CONSOLIDATE_END_METADATA + fprintf(stderr, "%s: state %d -> dst_s %d, capture_id %u\n", + __func__, state, dst_s, program_id); #endif - sres = fsm_endid_set(env->dst, s, id); - if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { + if (!fsm_capture_associate_program_with_end_state(env->dst, + (uint32_t)program_id, dst_s)) { env->ok = 0; return 0; } - return 1; } static int -consolidate_end_ids(struct fsm *dst, const struct fsm *src, +consolidate_end_metadata(struct fsm *dst, const struct fsm *src, const fsm_state_t *mapping, size_t mapping_count) { struct consolidate_end_ids_env env; @@ -276,9 +262,28 @@ consolidate_end_ids(struct fsm *dst, const struct fsm *src, fsm_endid_iter(src, consolidate_end_ids_cb, &env); -#if LOG_CONSOLIDATE_ENDIDS > 1 + if (env.ok) { + fsm_state_t s; + const size_t src_state_count = fsm_countstates(src); + for (s = 0; s < src_state_count; s++) { + fsm_capture_iter_active_for_end_state(src, s, + consolidate_active_captures_cb, &env); + if (!env.ok) { + break; + } + + fsm_capture_iter_program_ids_for_end_state(src, s, + consolidate_capture_programs_cb, &env); + if (!env.ok) { + break; + } + } + } + +#if LOG_CONSOLIDATE_END_METADATA > 1 fprintf(stderr, "==== fsm_consolidate -- endid_info after:\n"); fsm_endid_dump(stderr, dst); + fsm_capture_dump_active_for_ends(stderr, dst); #endif return env.ok; diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 56e135afd..817fd335b 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -6,6 +6,8 @@ #include "determinise_internal.h" +#define LOG_DETERMINISATION_COUNTERS 0 + static void dump_labels(FILE *f, const uint64_t labels[4]) { @@ -29,6 +31,8 @@ fsm_determinise(struct fsm *nfa) size_t dfacount = 0; struct analyze_closures_env ac_env = { 0 }; + INIT_TIMERS(); + INIT_TIMERS_NAMED(overall); assert(nfa != NULL); map.alloc = nfa->opt->alloc; @@ -39,9 +43,12 @@ fsm_determinise(struct fsm *nfa) * faster where we can start with an epsilon-free NFA in the first place. 
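
INIT_TIMERS, TIME, and DIFF_MSEC are defined elsewhere in the tree; the following is only an assumed POSIX clock_gettime-based equivalent, sketched so the measurement calls below read clearly. These are not the real macro definitions:

#include <stddef.h>
#include <stdio.h>
#include <time.h>

#define INIT_TIMERS() struct timespec pre, post
#define INIT_TIMERS_NAMED(N) struct timespec N##_pre, N##_post
#define TIME(T) clock_gettime(CLOCK_MONOTONIC, (T))

/* Log elapsed time for TAG; also accumulate usec into *ACC if non-NULL.
 * Assumes PRE was taken before POST. */
#define DIFF_MSEC(TAG, PRE, POST, ACC)                                 \
    do {                                                               \
        const size_t diff_usec = (size_t)                              \
            (((POST).tv_sec - (PRE).tv_sec) * 1000000LL                \
            + ((POST).tv_nsec - (PRE).tv_nsec) / 1000);                \
        fprintf(stderr, "%s: %.3f msec\n", (TAG), diff_usec / 1000.0); \
        if ((ACC) != NULL) {                                           \
            *(size_t *)(ACC) += diff_usec;                             \
        }                                                              \
    } while (0)
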
*/ if (fsm_has(nfa, fsm_hasepsilons)) { + TIME(&pre); if (!fsm_remove_epsilons(nfa)) { return 0; } + TIME(&post); + DIFF_MSEC("det_remove_eps", pre, post, NULL); } #if LOG_DETERMINISE_CAPTURES || LOG_INPUT @@ -49,6 +56,7 @@ fsm_determinise(struct fsm *nfa) fsm_print_fsm(stderr, nfa); fsm_capture_dump(stderr, "#### post_remove_epsilons", nfa); #endif + TIME(&overall_pre); issp = interned_state_set_pool_alloc(nfa->opt->alloc); if (issp == NULL) { @@ -104,6 +112,17 @@ fsm_determinise(struct fsm *nfa) ac_env.fsm = nfa; ac_env.issp = issp; +#if LOG_DETERMINISATION_STATS + fprintf(stderr, "%s: determinising FSM with %d states\n", __func__, fsm_countstates(nfa)); +#endif + + INIT_TIMERS_NAMED(iss); + size_t iss_accum = 0; + size_t iss_calls = 0; + size_t stack_pushes = 0; + size_t inner_steps = 0; + + TIME(&pre); do { size_t o_i; @@ -114,18 +133,25 @@ fsm_determinise(struct fsm *nfa) assert(curr != NULL); + TIME(&iss_pre); if (!analyze_closures__pairwise_grouping(&ac_env, curr->iss)) { goto cleanup; } + TIME(&iss_post); + DIFF_MSEC("det_iss", iss_pre, iss_post, &iss_accum); + (void)iss_accum; + iss_calls++; if (!edge_set_advise_growth(&curr->edges, nfa->opt->alloc, ac_env.output_count)) { goto cleanup; } + /* each output is an outgoing (label set) -> interned_state_set pair */ for (o_i = 0; o_i < ac_env.output_count; o_i++) { struct mapping *m; struct ac_output *output = &ac_env.outputs[o_i]; interned_state_set_id iss = output->iss; + inner_steps++; #if LOG_DETERMINISE_CLOSURES fprintf(stderr, "fsm_determinise: output %zu/%zu: cur (dfa %zu) label [", @@ -157,6 +183,7 @@ fsm_determinise(struct fsm *nfa) if (!stack_push(stack, m)) { goto cleanup; } + stack_pushes++; } #if LOG_SYMBOL_CLOSURE @@ -174,6 +201,13 @@ fsm_determinise(struct fsm *nfa) /* All elements in sclosures[] are interned, so they will be freed later. */ } while ((curr = stack_pop(stack))); + TIME(&post); + DIFF_MSEC("det_stack_loop", pre, post, NULL); + + if (LOG_DETERMINISATION_COUNTERS) { + fprintf(stderr, "%s: iss_accum total %zu (%zu calls, %g usec avg.), %zu stack pushes, %zu iterations, %zu inner_steps\n", + __func__, iss_accum, iss_calls, iss_accum / (1.0 * iss_calls), stack_pushes, iss_calls, inner_steps); + } { struct map_iter it; @@ -185,6 +219,13 @@ fsm_determinise(struct fsm *nfa) goto cleanup; } + TIME(&pre); + if (!fsm_capture_copy_programs(nfa, dfa)) { + goto cleanup; + } + TIME(&post); + DIFF_MSEC("det_copy_captures", pre, post, NULL); + #if DUMP_MAPPING { fprintf(stderr, "#### fsm_determinise: mapping\n"); @@ -192,10 +233,10 @@ fsm_determinise(struct fsm *nfa) /* build reverse mappings table: for every NFA state X, if X is part * of the new DFA state Y, then add Y to a list for X */ for (m = map_first(&map, &it); m != NULL; m = map_next(&it)) { - struct state_iter si; interned_state_set_id iss_id = m->iss; + struct state_iter si; fsm_state_t state; - struct state_set *ss = interned_state_set_get_state_set(ac_env.issp, iss_id); + struct state_set *ss = interned_state_set_get_state_set(issp, iss_id); fprintf(stderr, "%zu:", m->dfastate); for (state_set_reset(ss, &si); state_set_next(&si, &state); ) { @@ -238,24 +279,41 @@ fsm_determinise(struct fsm *nfa) fsm_setend(dfa, m->dfastate, 1); /* - * Carry through end IDs, if present. This isn't anything to do - * with the DFA conversion; it's meaningful only to the caller. + * Copy over metadata associated with end + * states, if present. This isn't anything to do + * with the DFA conversion; it's meaningful only + * to the caller. 
* * The closure may contain non-end states, but at least one state is * known to have been an end state. */ - if (!fsm_endid_carry(nfa, ss, dfa, m->dfastate)) { + if (!remap_end_metadata(nfa, ss, dfa, m->dfastate)) { goto cleanup; } } + TIME(&post); + DIFF_MSEC("det_map_loop", pre, post, NULL); - if (!remap_capture_actions(&map, issp, dfa, nfa)) { - goto cleanup; - } + fsm_capture_integrity_check(dfa); fsm_move(nfa, dfa); } +#if LOG_DETERMINISE_CAPTURES + fprintf(stderr, "# post_determinise\n"); + fsm_print_fsm(stderr, nfa); + fsm_capture_dump(stderr, "#### post_determinise", nfa); +#endif + + TIME(&overall_post); + DIFF_MSEC("det_overall", overall_pre, overall_post, NULL); + +#if LOG_DETERMINISATION_STATS + fprintf(stderr, "%s: created DFA with %d states\n", __func__, fsm_countstates(nfa)); + fprintf(stderr, "%s: analyze_closures_env.analyze_usec: %zu\n", + __func__, ac_env.analyze_usec); +#endif + #if EXPENSIVE_CHECKS assert(fsm_all(nfa, fsm_isdfa)); #endif @@ -311,85 +369,6 @@ fsm_determinise(struct fsm *nfa) return res; } -/* Add DFA_state to the list for NFA_state. */ -static int -add_reverse_mapping(const struct fsm_alloc *alloc, - struct reverse_mapping *reverse_mappings, - fsm_state_t dfastate, fsm_state_t nfa_state) -{ - struct reverse_mapping *rm = &reverse_mappings[nfa_state]; - if (rm->count == rm->ceil) { - const unsigned nceil = (rm->ceil ? 2*rm->ceil : 2); - fsm_state_t *nlist = f_realloc(alloc, - rm->list, nceil * sizeof(rm->list)); - if (nlist == NULL) { - return 0; - } - rm->list = nlist; - rm->ceil = nceil; - } - - rm->list[rm->count] = dfastate; - rm->count++; - return 1; -} - -static int -det_copy_capture_actions_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) -{ - struct reverse_mapping *rm_s; - size_t s_i, t_i; - struct det_copy_capture_actions_env *env = opaque; - assert(env->tag == 'D'); - -#if LOG_DETERMINISE_CAPTURES - fprintf(stderr, "det_copy_capture_actions_cb: state %u, type %s, ID %u, TO %d\n", - state, fsm_capture_action_type_name[type], - capture_id, to); -#endif - - rm_s = &env->reverse_mappings[state]; - - for (s_i = 0; s_i < rm_s->count; s_i++) { - const fsm_state_t s = rm_s->list[s_i]; - - if (to == CAPTURE_NO_STATE) { - if (!fsm_capture_add_action(env->dst, - s, type, capture_id, CAPTURE_NO_STATE)) { - env->ok = 0; - return 0; - } - } else { - struct reverse_mapping *rm_t = &env->reverse_mappings[to]; - for (t_i = 0; t_i < rm_t->count; t_i++) { - const fsm_state_t t = rm_t->list[t_i]; - - if (!fsm_capture_add_action(env->dst, - s, type, capture_id, t)) { - env->ok = 0; - return 0; - } - } - } - } - - return 1; -} - -static int -det_copy_capture_actions(struct reverse_mapping *reverse_mappings, - struct fsm *dst, struct fsm *src) -{ - struct det_copy_capture_actions_env env = { 'D', NULL, NULL, 1 }; - env.dst = dst; - env.reverse_mappings = reverse_mappings; - - fsm_capture_action_iter(src, det_copy_capture_actions_cb, &env); - return env.ok; -} - SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static uint64_t hash_iss(interned_state_set_id iss) @@ -636,83 +615,6 @@ stack_pop(struct mappingstack *stack) return item; } -static int -remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, - struct fsm *dst_dfa, struct fsm *src_nfa) -{ - struct map_iter it; - struct state_iter si; - struct mapping *m; - struct reverse_mapping *reverse_mappings; - fsm_state_t state; - const size_t capture_count = fsm_countcaptures(src_nfa); - size_t i, j; - int res = 0; - - if (capture_count == 
0) { - return 1; - } - - /* This is not 1 to 1 -- if state X is now represented by multiple - * states Y in the DFA, and state X has action(s) when transitioning - * to state Z, this needs to be added on every Y, for every state - * representing Z in the DFA. - * - * We could probably filter this somehow, at the very least by - * checking reachability from every X, but the actual path - * handling later will also check reachability. */ - reverse_mappings = f_calloc(dst_dfa->opt->alloc, src_nfa->statecount, sizeof(reverse_mappings[0])); - if (reverse_mappings == NULL) { - return 0; - } - - /* build reverse mappings table: for every NFA state X, if X is part - * of the new DFA state Y, then add Y to a list for X */ - for (m = map_first(map, &it); m != NULL; m = map_next(&it)) { - struct state_set *ss; - interned_state_set_id iss_id = m->iss; - assert(m->dfastate < dst_dfa->statecount); - ss = interned_state_set_get_state_set(issp, iss_id); - - for (state_set_reset(ss, &si); state_set_next(&si, &state); ) { - if (!add_reverse_mapping(dst_dfa->opt->alloc, - reverse_mappings, - m->dfastate, state)) { - goto cleanup; - } - } - } - -#if LOG_DETERMINISE_CAPTURES - fprintf(stderr, "#### reverse mapping for %zu states\n", src_nfa->statecount); - for (i = 0; i < src_nfa->statecount; i++) { - struct reverse_mapping *rm = &reverse_mappings[i]; - fprintf(stderr, "%lu:", i); - for (j = 0; j < rm->count; j++) { - fprintf(stderr, " %u", rm->list[j]); - } - fprintf(stderr, "\n"); - } -#else - (void)j; -#endif - - if (!det_copy_capture_actions(reverse_mappings, dst_dfa, src_nfa)) { - goto cleanup; - } - - res = 1; -cleanup: - for (i = 0; i < src_nfa->statecount; i++) { - if (reverse_mappings[i].list != NULL) { - f_free(dst_dfa->opt->alloc, reverse_mappings[i].list); - } - } - f_free(dst_dfa->opt->alloc, reverse_mappings); - - return res; -} - static int group_labels_overlap(const struct ac_group *a, const struct ac_group *b) { @@ -730,6 +632,25 @@ group_labels_overlap(const struct ac_group *a, const struct ac_group *b) return 0; } +static int +remap_end_metadata(const struct fsm *src_fsm, const struct state_set *src_set, + struct fsm *dst_fsm, fsm_state_t dst_state) +{ + if (!fsm_endid_carry(src_fsm, src_set, dst_fsm, dst_state)) { + return 0; + } + + if (!fsm_capture_copy_active_for_ends(src_fsm, src_set, dst_fsm, dst_state)) { + return 0; + } + + if (!fsm_capture_copy_program_end_state_associations(src_fsm, src_set, dst_fsm, dst_state)) { + return 0; + } + + return 1; +} + static void intersect_with(uint64_t *a, const uint64_t *b) { diff --git a/src/libfsm/determinise_internal.h b/src/libfsm/determinise_internal.h index 8fe35fcd8..856f8baf8 100644 --- a/src/libfsm/determinise_internal.h +++ b/src/libfsm/determinise_internal.h @@ -75,19 +75,6 @@ struct map_iter { size_t i; }; -struct reverse_mapping { - unsigned count; - unsigned ceil; - fsm_state_t *list; -}; - -struct det_copy_capture_actions_env { - char tag; - struct fsm *dst; - struct reverse_mapping *reverse_mappings; - int ok; -}; - #define MAPPINGSTACK_DEF_CEIL 16 struct mappingstack { const struct fsm_alloc *alloc; @@ -289,6 +276,10 @@ analyze_closures__grow_dst(struct analyze_closures_env *env); static int analyze_closures__grow_outputs(struct analyze_closures_env *env); +static int +remap_end_metadata(const struct fsm *src_fsm, const struct state_set *src_set, + struct fsm *dst_fsm, fsm_state_t dst_state); + static int map_add(struct map *map, fsm_state_t dfastate, interned_state_set_id iss, struct mapping **new_mapping); @@ -306,22 +297,9 @@ 
map_first(struct map *map, struct map_iter *iter); static struct mapping * map_next(struct map_iter *iter); -static int -add_reverse_mapping(const struct fsm_alloc *alloc, - struct reverse_mapping *reverse_mappings, - fsm_state_t dfastate, fsm_state_t nfa_state); - -static int -det_copy_capture_actions(struct reverse_mapping *reverse_mappings, - struct fsm *dst, struct fsm *src); - static int grow_map(struct map *map); -static int -remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, - struct fsm *dst_dfa, struct fsm *src_nfa); - static struct mappingstack * stack_init(const struct fsm_alloc *alloc); diff --git a/src/libfsm/endids.c b/src/libfsm/endids.c index 94cf8f259..1684facc8 100644 --- a/src/libfsm/endids.c +++ b/src/libfsm/endids.c @@ -4,7 +4,44 @@ * See LICENCE for the full copyright terms. */ -#include "endids_internal.h" +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "endids.h" + +#define BUCKET_NO_STATE ((fsm_state_t)-1) +#define DEF_BUCKET_COUNT 4 +#define DEF_BUCKET_ID_COUNT 16 + +struct endid_info { + /* Add-only hash table, with a state ID and an associated + * non-empty ordered array of unique end IDs. The state is the + * key. Grows when the buckets are more than half full. */ + unsigned bucket_count; + unsigned buckets_used; + + struct endid_info_bucket { + fsm_state_t state; + struct end_info_ids { + unsigned count; + unsigned ceil; + fsm_end_id_t ids[1]; + } *ids; + } *buckets; +}; #define LOG_ENDIDS 0 @@ -48,6 +85,14 @@ fsm_setendid(struct fsm *fsm, fsm_end_id_t id) return 1; } +int +fsm_setendid_state(struct fsm *fsm, fsm_state_t s, fsm_end_id_t id) +{ + enum fsm_endid_set_res sres; + sres = fsm_endid_set(fsm, s, id); + return sres != FSM_ENDID_SET_ERROR_ALLOC_FAIL; +} + enum fsm_getendids_res fsm_getendids(const struct fsm *fsm, fsm_state_t end_state, size_t id_buf_count, fsm_end_id_t *id_buf, @@ -420,6 +465,13 @@ fsm_endid_get(const struct fsm *fsm, fsm_state_t end_state, return FSM_GETENDIDS_NOT_FOUND; } +void +fsm_iterendids(const struct fsm *fsm, fsm_state_t state, + fsm_iterendids_cb *cb, void *opaque) +{ + fsm_endid_iter_state(fsm, state, cb, opaque); +} + struct carry_env { char tag; struct fsm *dst; @@ -428,13 +480,16 @@ struct carry_env { }; static int -carry_iter_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) +carry_iter_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, fsm_end_id_t id, void *opaque) { enum fsm_endid_set_res sres; struct carry_env *env = opaque; assert(env->tag == 'C'); + (void)fsm; (void)state; + (void)nth; sres = fsm_endid_set(env->dst, env->dst_state, id); if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { @@ -489,6 +544,72 @@ fsm_endid_carry(const struct fsm *src_fsm, const struct state_set *src_set, return 1; } +/* Make a new hash table, copying over converted entries and/or discarding. 
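
A hypothetical use of the compaction entry point that follows, with an invented three-state mapping:

static int
drop_state_one(struct fsm *fsm)
{
    /* State 1 is discarded and state 2 renumbered to 1: endids on
     * state 1 are dropped, endids on state 2 are rehashed under the
     * new state ID. */
    const fsm_state_t mapping[] = { 0, FSM_STATE_REMAP_NO_STATE, 1 };

    return fsm_endid_compact(fsm, mapping, 3); /* 0 on alloc failure */
}
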
*/ +int +fsm_endid_compact(struct fsm *fsm, + const fsm_state_t *mapping, size_t mapping_count) +{ + struct endid_info *info = fsm->endid_info; + const size_t ocount = info->bucket_count; + const size_t ncount = ocount; /* does not need to grow */ + struct endid_info_bucket *obuckets = info->buckets; + struct endid_info_bucket *nbuckets = f_malloc(fsm->opt->alloc, + ncount * sizeof(nbuckets[0])); + const size_t nmask = ncount - 1; + size_t ob_i, nb_i; + size_t moved = 0; + +#if LOG_ENDIDS > 3 + fprintf(stderr, "fsm_endid_compact: rehashing mapped entries\n"); +#endif + + if (nbuckets == NULL) { + return 0; + } + + for (nb_i = 0; nb_i < ncount; nb_i++) { /* clear table */ + nbuckets[nb_i].state = BUCKET_NO_STATE; + } + + for (ob_i = 0; ob_i < ocount; ob_i++) { + const struct endid_info_bucket *ob = &obuckets[ob_i]; + uint64_t hash; + fsm_state_t nstate; + + if (ob->state == BUCKET_NO_STATE) { + continue; + } + + assert(ob->state < mapping_count); + nstate = mapping[ob->state]; + + if (nstate == FSM_STATE_REMAP_NO_STATE) { + info->buckets_used--; /* discarded */ + continue; + } + + hash = hash_id(nstate); + for (nb_i = 0; nb_i < ncount; nb_i++) { + struct endid_info_bucket *nb = &nbuckets[(hash + nb_i) & nmask]; + if (nb->state == BUCKET_NO_STATE) { + nb->state = nstate; + nb->ids = ob->ids; + moved++; + break; + } else { + continue; /* collision */ + } + } + } + + assert(moved == info->buckets_used); + + f_free(fsm->opt->alloc, info->buckets); + info->bucket_count = ncount; + info->buckets = nbuckets; + return 1; +} + void fsm_endid_iter(const struct fsm *fsm, fsm_endid_iter_cb *cb, void *opaque) @@ -517,7 +638,7 @@ fsm_endid_iter(const struct fsm *fsm, count = b->ids->count; for (id_i = 0; id_i < count; id_i++) { - if (!cb(b->state, b->ids->ids[id_i], opaque)) { + if (!cb(fsm, b->state, id_i, b->ids->ids[id_i], opaque)) { break; } @@ -580,7 +701,7 @@ fsm_endid_iter_state(const struct fsm *fsm, fsm_state_t state, fprintf(stderr, "fsm_endid_iter_state[%d], ids[%ld] -> %d\n", b->state, id_i, b->ids->ids[id_i]); #endif - if (!cb(b->state, b->ids->ids[id_i], opaque)) { + if (!cb(fsm, b->state, id_i, b->ids->ids[id_i], opaque)) { return; } id_i++; @@ -602,10 +723,13 @@ struct dump_env { }; static int -dump_cb(fsm_state_t state, const fsm_end_id_t id, void *opaque) +dump_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, const fsm_end_id_t id, void *opaque) { struct dump_env *env = opaque; - fprintf(env->f, "state[%u]: %u\n", state, id); + fprintf(env->f, "endids: state[%u]: %u\n", state, id); + (void)fsm; + (void)nth; return 1; } diff --git a/src/libfsm/endids.h b/src/libfsm/endids.h index 5389bd8b0..0982017ae 100644 --- a/src/libfsm/endids.h +++ b/src/libfsm/endids.h @@ -32,10 +32,15 @@ int fsm_endid_carry(const struct fsm *src_fsm, const struct state_set *src_set, struct fsm *dst_fsm, fsm_state_t dst_state); +int +fsm_endid_compact(struct fsm *fsm, + const fsm_state_t *mapping, size_t mapping_count); + /* Callback when iterating over the endids. * Return 0 to halt, or non-zero to continue. 
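
A minimal example callback matching the updated iteration typedef just below; illustrative, not part of the patch:

static int
print_endid_cb(const struct fsm *fsm, fsm_state_t state,
    size_t nth, const fsm_end_id_t id, void *opaque)
{
    (void)fsm;
    (void)opaque;

    fprintf(stderr, "state %u: endid[%zu] = %u\n", state, nth, id);
    return 1; /* non-zero: keep iterating */
}
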
*/ typedef int -fsm_endid_iter_cb(fsm_state_t state, const fsm_end_id_t id, void *opaque); +fsm_endid_iter_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, const fsm_end_id_t id, void *opaque); void fsm_endid_iter(const struct fsm *fsm, diff --git a/src/libfsm/endids_internal.h b/src/libfsm/endids_internal.h deleted file mode 100644 index 27450af3b..000000000 --- a/src/libfsm/endids_internal.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef ENDIDS_INTERNAL_H -#define ENDIDS_INTERNAL_H - -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "endids.h" - -#define BUCKET_NO_STATE ((fsm_state_t)-1) -#define DEF_BUCKET_COUNT 4 -#define DEF_BUCKET_ID_COUNT 16 - -struct endid_info { - /* Add-only hash table, with a state ID and an associated - * non-empty ordered array of unique end IDs. The state is the - * key. Grows when the buckets are more than half full. */ - unsigned bucket_count; - unsigned buckets_used; - - struct endid_info_bucket { - fsm_state_t state; - struct end_info_ids { - unsigned count; - unsigned ceil; - fsm_end_id_t ids[1]; - } *ids; - } *buckets; -}; - -#endif diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index 06da7739e..e63ae50ce 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -21,50 +22,77 @@ #include "endids.h" #define DUMP_EPSILON_CLOSURES 0 -#define DEF_PENDING_CAPTURE_ACTIONS_CEIL 2 #define LOG_RM_EPSILONS_CAPTURES 0 -#define DEF_CARRY_ENDIDS_COUNT 2 +#define LOG_COPYING 0 +#define LOG_RESULT 0 -struct remap_env { - char tag; - const struct fsm_alloc *alloc; - struct state_set **rmap; - int ok; - - size_t count; - size_t ceil; - struct remap_action { - fsm_state_t state; - enum capture_action_type type; - unsigned capture_id; - fsm_state_t to; - } *actions; -}; +/* #define DEF_CARRY_ENDIDS_COUNT 2 */ +/* #define DEF_CARRY_CAPTUREIDS_COUNT 2 */ -static int -remap_capture_actions(struct fsm *nfa, struct state_set **eclosures); +#if LOG_RESULT +#include +#endif -static int -remap_capture_action_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque); +#define DEF_END_METADATA_ENDIDS_CEIL 4 +#define DEF_END_METADATA_CAPTUREIDS_CEIL 4 +#define DEF_END_METADATA_PROGRAMIDS_CEIL 4 +struct carry_end_metadata_env { + struct fsm *fsm; + const struct fsm_alloc *alloc; + + struct { + size_t ceil; + fsm_end_id_t *ids; + } end; + struct { + int ok; + size_t count; + size_t ceil; + unsigned *ids; + } capture; + struct { + int ok; + size_t count; + size_t ceil; + uint32_t *ids; + } program; +}; static int -carry_endids(struct fsm *fsm, struct state_set *states, - fsm_state_t s); +carry_end_metadata(struct carry_end_metadata_env *env, + fsm_state_t end_state, fsm_state_t dst_state); int fsm_remove_epsilons(struct fsm *nfa) { +#if LOG_RESULT + fprintf(stderr, "==== before\n"); + fsm_print_fsm(stderr, nfa); + fsm_capture_dump(stderr, "#### before_remove_epsilons", nfa); + fprintf(stderr, "====\n"); +#endif + const size_t state_count = fsm_countstates(nfa); int res = 0; struct state_set **eclosures = NULL; - fsm_state_t s; + fsm_state_t s, start_id; + const struct fsm_alloc *alloc = nfa->opt->alloc; INIT_TIMERS(); + struct carry_end_metadata_env em_env = { 0 }; + em_env.fsm = nfa; + em_env.alloc = alloc; + assert(nfa != NULL); + if (!fsm_getstart(nfa, &start_id)) { + goto cleanup; + } + + /* TODO: This could successfully exit early if none 
of the + * states have epsilon edges. */ + TIME(&pre); eclosures = fsm_epsilon_closure(nfa); TIME(&post); @@ -107,20 +135,14 @@ fsm_remove_epsilons(struct fsm *nfa) * end states. * * Similarly, any end state metadata on states - * in its epsilon-closure is copied to it. - * - * Capture actions are copied in a later pass. */ + * in its epsilon-closure is copied to it. */ if (fsm_isend(nfa, es_id)) { #if LOG_COPYING fprintf(stderr, "remove_epsilons: setting end on %d (due to %d)\n", s, es_id); #endif fsm_setend(nfa, s, 1); - /* - * Carry through end IDs, if present. This isn't anything to do - * with the NFA conversion; it's meaningful only to the caller. - */ - if (!carry_endids(nfa, eclosures[s], s)) { + if (!carry_end_metadata(&em_env, es_id, s)) { goto cleanup; } } @@ -150,14 +172,7 @@ fsm_remove_epsilons(struct fsm *nfa) state->epsilons = NULL; } -#if LOG_RESULT - fprintf(stderr, "=== %s: about to update capture actions\n", __func__); - fsm_print_fsm(stderr, nfa); -#endif - - if (!remap_capture_actions(nfa, eclosures)) { - goto cleanup; - } + fsm_capture_integrity_check(nfa); #if LOG_RESULT fsm_print_fsm(stderr, nfa); @@ -169,253 +184,136 @@ fsm_remove_epsilons(struct fsm *nfa) if (eclosures != NULL) { fsm_closure_free(eclosures, state_count); } - - return res; -} - -static int -remap_capture_actions(struct fsm *nfa, struct state_set **eclosures) -{ - int res = 0; - fsm_state_t s, i; - struct state_set **rmap; - struct state_iter si; - fsm_state_t si_s; - struct remap_env env = { 'R', NULL, NULL, 1, 0, 0, NULL }; - env.alloc = nfa->opt->alloc; - - /* build a reverse mapping */ - rmap = f_calloc(nfa->opt->alloc, nfa->statecount, sizeof(rmap[0])); - if (rmap == NULL) { - goto cleanup; - } - - for (s = 0; s < nfa->statecount; s++) { - if (eclosures[s] == NULL) { continue; } - for (state_set_reset(eclosures[s], &si); state_set_next(&si, &si_s); ) { - if (si_s == s) { - continue; /* ignore identical states */ - } -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, "remap_capture_actions: %u <- %u\n", - s, si_s); -#endif - if (!state_set_add(&rmap[si_s], nfa->opt->alloc, s)) { - goto cleanup; - } - } + if (em_env.end.ids != NULL) { + f_free(alloc, em_env.end.ids); } - env.rmap = rmap; - - /* Iterate over the current set of actions with the reverse - * mapping (containing only states which will be skipped, - * collecting info about every new capture action that will need - * to be added. - * - * It can't be added during the iteration, because that would - * modify the hash table as it's being iterated over. */ - fsm_capture_action_iter(nfa, remap_capture_action_cb, &env); - - /* Now that we're done iterating, add those actions. 
*/ - for (i = 0; i < env.count; i++) { - const struct remap_action *a = &env.actions[i]; - if (!fsm_capture_add_action(nfa, a->state, a->type, - a->capture_id, a->to)) { - goto cleanup; - } + if (em_env.program.ids != NULL) { + f_free(alloc, em_env.program.ids); } - - res = 1; - -cleanup: - if (env.actions != NULL) { - f_free(nfa->opt->alloc, env.actions); + if (em_env.capture.ids != NULL) { + f_free(alloc, em_env.capture.ids); } - if (rmap != NULL) { - for (i = 0; i < nfa->statecount; i++) { - state_set_free(rmap[i]); - } - f_free(nfa->opt->alloc, rmap); - } return res; - } static int -add_pending_capture_action(struct remap_env *env, - fsm_state_t state, enum capture_action_type type, - unsigned capture_id, fsm_state_t to) +collect_captureid_cb(fsm_state_t state, unsigned id, void *opaque) { - struct remap_action *a; - if (env->count == env->ceil) { - struct remap_action *nactions; - const size_t nceil = (env->ceil == 0 - ? DEF_PENDING_CAPTURE_ACTIONS_CEIL : 2*env->ceil); - assert(nceil > 0); - nactions = f_realloc(env->alloc, - env->actions, - nceil * sizeof(nactions[0])); - if (nactions == NULL) { - return 0; - } - - env->ceil = nceil; - env->actions = nactions; - } + struct carry_end_metadata_env *env = opaque; + (void)state; - a = &env->actions[env->count]; -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, "add_pending_capture_action: state %d, type %s, capture_id %u, to %d\n", - state, fsm_capture_action_type_name[type], capture_id, to); -#endif + if (env->capture.count == env->capture.ceil) { + const size_t nceil = (env->capture.ceil == 0) + ? DEF_END_METADATA_CAPTUREIDS_CEIL + : 2 * env->capture.ceil; + unsigned *nids; + assert(nceil > env->capture.ceil); + nids = f_realloc(env->alloc, env->capture.ids, + nceil * sizeof(env->capture.ids[0])); + if (nids == NULL) { + env->capture.ok = 0; + return 0; + } + env->capture.ceil = nceil; + env->capture.ids = nids; + } - a->state = state; - a->type = type; - a->capture_id = capture_id; - a->to = to; - env->count++; + env->capture.ids[env->capture.count] = id; + env->capture.count++; return 1; } static int -remap_capture_action_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) +collect_progid_cb(fsm_state_t state, unsigned id, void *opaque) { - struct state_iter si; - fsm_state_t si_s; - struct remap_env *env = opaque; - assert(env->tag == 'R'); - -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, "remap_capture_action_cb: state %d, type %s, capture_id %u, to %d\n", - state, fsm_capture_action_type_name[type], capture_id, to); -#endif - - for (state_set_reset(env->rmap[state], &si); state_set_next(&si, &si_s); ) { - struct state_iter si_to; - fsm_state_t si_tos; - -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, " -- rcac: state %d -> %d\n", state, si_s); -#endif - - if (!add_pending_capture_action(env, si_s, type, capture_id, to)) { - goto fail; - } - - if (to == CAPTURE_NO_STATE) { - continue; - } - - for (state_set_reset(env->rmap[to], &si_to); state_set_next(&si, &si_tos); ) { -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, " -- rcac: to %d -> %d\n", to, si_tos); -#endif - - if (!add_pending_capture_action(env, si_tos, type, capture_id, to)) { - goto fail; - } - - } - } + struct carry_end_metadata_env *env = opaque; + uint32_t prog_id = (uint32_t)id; + (void)state; + + if (env->program.count == env->program.ceil) { + const size_t nceil = (env->program.ceil == 0) + ? 
DEF_END_METADATA_PROGRAMIDS_CEIL + : 2 * env->program.ceil; + unsigned *nids; + assert(nceil > env->program.ceil); + nids = f_realloc(env->alloc, env->program.ids, + nceil * sizeof(env->program.ids[0])); + if (nids == NULL) { + env->program.ok = 0; + return 0; + } + env->program.ceil = nceil; + env->program.ids = nids; + } + env->program.ids[env->program.count] = prog_id; + env->program.count++; return 1; - -fail: - env->ok = 0; - return 0; } -struct collect_env { - char tag; - const struct fsm_alloc *alloc; - size_t count; - size_t ceil; - fsm_end_id_t *ids; - int ok; -}; - +/* Because we're modifying the FSM in place, we can't iterate and add + * new entries -- it could lead to the underlying hash table resizing. + * Instead, collect, then add in a second pass. */ static int -collect_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) +carry_end_metadata(struct carry_end_metadata_env *env, + fsm_state_t end_state, fsm_state_t dst_state) { - struct collect_env *env = opaque; - assert(env->tag == 'E'); - - (void)state; - - if (env->count == env->ceil) { - const size_t nceil = 2 * env->ceil; - fsm_end_id_t *nids; - assert(nceil > env->ceil); - nids = f_realloc(env->alloc, env->ids, - nceil * sizeof(*env->ids)); - if (nids == NULL) { - env->ok = 0; - return 0; + size_t i; + const size_t id_count = fsm_getendidcount(env->fsm, end_state); + if (id_count > 0) { /* copy end IDs */ + enum fsm_getendids_res id_res; + size_t written; + if (id_count > env->end.ceil) { /* grow buffer */ + const size_t nceil = (env->end.ceil == 0) + ? DEF_END_METADATA_ENDIDS_CEIL + : 2*env->end.ceil; + assert(nceil > 0); + env->end.ids = f_realloc(env->alloc, + env->end.ids, nceil * sizeof(env->end.ids[0])); + if (env->end.ids == NULL) { + return 0; + } } - env->ceil = nceil; - env->ids = nids; - } - env->ids[env->count] = id; - env->count++; + id_res = fsm_getendids(env->fsm, end_state, + id_count, env->end.ids, &written); + assert(id_res != FSM_GETENDIDS_ERROR_INSUFFICIENT_SPACE); - return 1; -} - -/* fsm_remove_epsilons can't use fsm_endid_carry directly, because the src - * and dst FSMs are the same -- that would lead to adding entries to a - * hash table, possibly causing it to resize, while iterating over it. - * - * Instead, collect entries that need to be added (if not already - * present), and then add them in a second pass. 
*/ -static int -carry_endids(struct fsm *fsm, struct state_set *states, - fsm_state_t dst_state) -{ - struct state_iter it; - fsm_state_t s; - size_t i; + for (i = 0; i < id_count; i++) { +#if LOG_COPYING + fprintf(stderr, "carry_end_metadata: setting end ID %u on %d (due to %d)\n", + env->end.ids[i], dst_state, end_state); +#endif + if (!fsm_setendid_state(env->fsm, dst_state, env->end.ids[i])) { + return 0; + } + } + } - struct collect_env env; - env.tag = 'E'; /* for fsm_remove_epsilons */ - env.alloc = fsm->opt->alloc; - env.count = 0; - env.ceil = DEF_CARRY_ENDIDS_COUNT; - env.ids = f_malloc(fsm->opt->alloc, - env.ceil * sizeof(*env.ids)); - if (env.ids == NULL) { + env->capture.ok = 1; + env->capture.count = 0; + fsm_capture_iter_active_for_end_state(env->fsm, end_state, + collect_captureid_cb, env); + if (!env->capture.ok) { return 0; } - env.ok = 1; - - /* collect from states */ - for (state_set_reset(states, &it); state_set_next(&it, &s); ) { - if (!fsm_isend(fsm, s)) { - continue; - } - - fsm_endid_iter_state(fsm, s, collect_cb, &env); - if (!env.ok) { - goto cleanup; + for (i = 0; i < env->capture.count; i++) { + if (!fsm_capture_set_active_for_end(env->fsm, + env->capture.ids[i], dst_state)) { + return 0; } } - /* add them */ - for (i = 0; i < env.count; i++) { - enum fsm_endid_set_res sres; - sres = fsm_endid_set(fsm, dst_state, env.ids[i]); - if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { - env.ok = 0; - goto cleanup; + env->program.count = 0; + fsm_capture_iter_program_ids_for_end_state(env->fsm, end_state, + collect_progid_cb, env); + for (i = 0; i < env->program.count; i++) { + if (!fsm_capture_associate_program_with_end_state(env->fsm, + env->program.ids[i], dst_state)) { + return 0; } } -cleanup: - f_free(fsm->opt->alloc, env.ids); - - return env.ok; + return 1; } - diff --git a/src/libfsm/exec.c b/src/libfsm/exec.c index 9f7b21802..47d27a50e 100644 --- a/src/libfsm/exec.c +++ b/src/libfsm/exec.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,6 @@ static int transition(const struct fsm *fsm, fsm_state_t state, int c, - size_t offset, struct fsm_capture *captures, fsm_state_t *next) { assert(state < fsm->statecount); @@ -35,31 +35,72 @@ transition(const struct fsm *fsm, fsm_state_t state, int c, return 0; } - if (captures != NULL && fsm_capture_has_capture_actions(fsm, state)) { - fsm_capture_update_captures(fsm, state, *next, - offset, captures); - } - return 1; } int fsm_exec(const struct fsm *fsm, - int (*fsm_getc)(void *opaque), void *opaque, - fsm_state_t *end, struct fsm_capture *captures) + int (*fsm_getc)(void *opaque), void *opaque, fsm_state_t *end) { fsm_state_t state; int c; size_t offset = 0; - unsigned i; - size_t capture_count; assert(fsm != NULL); assert(fsm_getc != NULL); assert(end != NULL); - capture_count = fsm_countcaptures(fsm); + /* TODO: check prerequisites; that it has literal edges, DFA, etc */ + + /* TODO: pass struct of callbacks to call during each event; transitions etc */ + + if (!fsm_all(fsm, fsm_isdfa)) { + errno = EINVAL; + return -1; + } + + if (!fsm_getstart(fsm, &state)) { + errno = EINVAL; + return -1; + } + +#if LOG_EXEC + fprintf(stderr, "fsm_exec: starting at %d\n", state); +#endif + + while (c = fsm_getc(opaque), c != EOF) { + if (!transition(fsm, state, c, &state)) { +#if LOG_EXEC + fprintf(stderr, "fsm_exec: edge not found\n"); +#endif + return 0; + } +#if LOG_EXEC + fprintf(stderr, "fsm_exec: @ %zu, input '%c', new state %u\n", + offset, c, state); +#endif + offset++; + } + + if 
(!fsm_isend(fsm, state)) { + return 0; + } + + *end = state; + return 1; +} + +int +fsm_exec_with_captures(const struct fsm *fsm, const unsigned char *input, + size_t input_length, fsm_state_t *end, + struct fsm_capture *captures, size_t capture_buf_length) +{ + fsm_state_t state; + size_t offset = 0; + + assert(fsm != NULL); + assert(end != NULL); /* TODO: check prerequisites; that it has literal edges, DFA, etc */ /* TODO: pass struct of callbacks to call during each event; transitions etc */ @@ -74,17 +115,26 @@ fsm_exec(const struct fsm *fsm, return -1; } - for (i = 0; i < capture_count; i++) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; + if (captures != NULL) { + const size_t capture_ceil = fsm_capture_ceiling(fsm); + if (capture_buf_length < capture_ceil) { + errno = EINVAL; + return -1; + } + + for (size_t i = 0; i < capture_ceil; i++) { + captures[i].pos[0] = FSM_CAPTURE_NO_POS; + captures[i].pos[1] = FSM_CAPTURE_NO_POS; + } } #if LOG_EXEC fprintf(stderr, "fsm_exec: starting at %d\n", state); #endif - while (c = fsm_getc(opaque), c != EOF) { - if (!transition(fsm, state, c, offset, captures, &state)) { + while (offset < input_length) { + const unsigned char c = input[offset]; + if (!transition(fsm, state, c, &state)) { #if LOG_EXEC fprintf(stderr, "fsm_exec: edge not found\n"); #endif @@ -102,15 +152,15 @@ fsm_exec(const struct fsm *fsm, return 0; } - /* Check for capture actions on end state */ - if (captures != NULL && fsm_capture_has_capture_actions(fsm, state)) { - fsm_capture_update_captures(fsm, state, NEXT_STATE_END, - offset, captures); + /* Resolve captures associated with the end state. */ + if (captures != NULL) { + if (!fsm_capture_resolve_during_exec(fsm, state, + input, offset, captures, capture_buf_length)) { + assert(errno != 0); + return -1; + } } - fsm_capture_finalize_captures(fsm, capture_count, captures); - *end = state; return 1; } - diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index 1dde1fc96..272f2cc88 100644 --- a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -124,10 +124,6 @@ struct fsm_edge { struct fsm_state { unsigned int end:1; - /* If 0, then this state has no need for checking - * the fsm->capture_info struct. 
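
A hypothetical caller of the new fsm_exec_with_captures entry point above; the input and buffer size are invented, and the buffer must hold at least fsm_capture_ceiling(fsm) entries:

static int
match_with_captures(const struct fsm *fsm)
{
    struct fsm_capture caps[8]; /* assumed >= fsm_capture_ceiling(fsm) */
    fsm_state_t end;

    const int r = fsm_exec_with_captures(fsm,
        (const unsigned char *)"abc", 3, &end, caps, 8);

    if (r == 1) {
        /* match: caps[i].pos[0] and caps[i].pos[1] hold each capture's
         * bounds, or FSM_CAPTURE_NO_POS where a capture never matched */
    } else if (r == 0) {
        /* no match */
    } else {
        /* r == -1: error, errno is set (e.g. EINVAL for a non-DFA) */
    }

    return r;
}
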
*/ - unsigned int has_capture_actions:1; - /* meaningful within one particular transformation only */ unsigned int visited:1; diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index f9f6bf003..bac177ff2 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -106,6 +106,7 @@ fsm_shortest fsm_example fsm_exec +fsm_exec_with_captures # exec fsm_fgetc @@ -127,11 +128,19 @@ fsm_vm_match_buffer fsm_vm_match_file # -fsm_countcaptures -fsm_capture_has_capture_actions +fsm_capture_ceiling fsm_capture_set_path fsm_capture_rebase_capture_id -fsm_capture_alloc +fsm_capture_alloc_capture_buffer +fsm_capture_free_capture_buffer fsm_capture_dump +fsm_capture_set_active_for_end +fsm_capture_add_program +fsm_capture_associate_program_with_end_state + +# capture_vm* +fsm_capvm_program_free +fsm_capvm_program_dump +fsm_capvm_program_exec fsm_minimise_test_oracle diff --git a/src/libfsm/merge.c b/src/libfsm/merge.c index b0df7ea71..ef8b13271 100644 --- a/src/libfsm/merge.c +++ b/src/libfsm/merge.c @@ -17,25 +17,27 @@ #include #include #include +#include #include "capture.h" +#include "capture_vm.h" #include "internal.h" #include "endids.h" #define LOG_MERGE_ENDIDS 0 - -struct copy_capture_env { - char tag; - struct fsm *dst; - int ok; -}; +#define LOG_COPY_CAPTURE_PROGRAMS 0 static int -copy_capture_actions(struct fsm *dst, struct fsm *src); +copy_end_metadata(struct fsm *dst, struct fsm *src, + fsm_state_t base_src, unsigned capture_base_src); static int copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src); +static int +copy_active_capture_ids(struct fsm *dst, struct fsm *src, + fsm_state_t base_src, unsigned capture_base_src); + static struct fsm * merge(struct fsm *dst, struct fsm *src, fsm_state_t *base_dst, fsm_state_t *base_src, @@ -72,17 +74,12 @@ merge(struct fsm *dst, struct fsm *src, *base_dst = 0; *base_src = dst->statecount; *capture_base_dst = 0; - *capture_base_src = fsm_countcaptures(dst); + *capture_base_src = fsm_capture_ceiling(dst); for (i = 0; i < src->statecount; i++) { state_set_rebase(&src->states[i].epsilons, *base_src); edge_set_rebase(&src->states[i].edges, *base_src); } - - /* FIXME: instead of rebasing these here, they could - * also be updated in copy_capture_actions below. */ - fsm_capture_rebase_capture_id(src, *capture_base_src); - fsm_capture_rebase_capture_action_states(src, *base_src); } memcpy(dst->states + dst->statecount, src->states, @@ -90,22 +87,10 @@ merge(struct fsm *dst, struct fsm *src, dst->statecount += src->statecount; dst->endcount += src->endcount; - /* We need to explicitly copy over the capture actions and end - * ID info here because they're stored on the FSMs as a whole, - * rather than individual states; `memcpy`ing the states alone - * won't transfer them. - * - * They're stored separately because they are likely to only - * be on a small portion of the states, and adding two extra - * NULL pointers to `struct fsm_state` increases memory usage - * significantly. */ - - if (!copy_capture_actions(dst, src)) { - /* non-recoverable -- destructive operation */ - return NULL; - } - - if (!copy_end_ids(dst, src, *base_src)) { + /* We need to explicitly copy over end metadata here. They're + * stored separately because they are likely to only be on a + * small portion of the states. 
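
The merge-time rebasing is uniform addition of the bases recorded above; as a sketch, with hypothetical helper names:

static fsm_state_t
merged_state_id(fsm_state_t src_state, fsm_state_t base_src)
{
    return src_state + base_src;
}

static unsigned
merged_capture_id(unsigned src_capture_id, unsigned capture_base_src)
{
    return src_capture_id + capture_base_src;
}

For example, if dst had 3 states and 2 captures before the merge, src state 0 becomes dst state 3 and src capture 0 becomes capture 2, matching the base_src and capture_base_src assignments earlier.
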
*/ + if (!copy_end_metadata(dst, src, *base_src, *capture_base_src)) { /* non-recoverable -- destructive operation */ return NULL; } @@ -123,16 +108,91 @@ merge(struct fsm *dst, struct fsm *src, return dst; } +struct copy_capture_programs_env { + const struct fsm_alloc *alloc; + const struct fsm *src; + struct fsm *dst; + int ok; + fsm_state_t state_base_src; + unsigned capture_base_src; + +#define DEF_MAPPING_CEIL 1 + size_t mapping_used; + size_t mapping_ceil; + /* TODO: could cache last_map to check first if this becomes expensive */ + struct prog_mapping { + unsigned src_prog_id; + unsigned dst_prog_id; + } *mappings; +}; + static int -copy_capture_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, +copy_capture_programs_cb(fsm_state_t src_state, unsigned src_prog_id, void *opaque) { - struct copy_capture_env *env = opaque; - assert(env->tag == 'C'); + struct copy_capture_programs_env *env = opaque; + + const fsm_state_t dst_state = src_state + env->state_base_src; + assert(dst_state < fsm_countstates(env->dst)); + +#if LOG_COPY_CAPTURE_PROGRAMS + fprintf(stderr, "%s: src %p, dst %p, src_prog_id %u, src_state %d, dst_state %d, capture_base_src %u\n", + __func__, (void *)env->src, (void *)env->dst, + src_prog_id, src_state, dst_state, env->capture_base_src); +#endif + int found = 0; + uint32_t dst_prog_id; + + for (size_t i = 0; i < env->mapping_used; i++) { + const struct prog_mapping *m = &env->mappings[i]; + if (m->src_prog_id == src_prog_id) { + dst_prog_id = m->dst_prog_id; + found = 1; + } + } + + if (!found) { + if (env->mapping_used == env->mapping_ceil) { /* grow */ + const size_t nceil = 2*env->mapping_ceil; + struct prog_mapping *nmappings = f_realloc(env->alloc, + env->mappings, nceil * sizeof(nmappings[0])); + if (nmappings == NULL) { + env->ok = 0; + return 0; + } + + env->mapping_ceil = nceil; + env->mappings = nmappings; + } + + const struct capvm_program *p = fsm_capture_get_program_by_id(env->src, + src_prog_id); + assert(p != NULL); + + struct capvm_program *cp = capvm_program_copy(env->alloc, p); + if (cp == NULL) { + env->ok = 0; + return 0; + } + capvm_program_rebase(cp, env->capture_base_src); + + /* add program, if not present */ + if (!fsm_capture_add_program(env->dst, + cp, &dst_prog_id)) { + f_free(env->alloc, cp); + env->ok = 0; + return 0; + } - if (!fsm_capture_add_action(env->dst, state, type, - capture_id, to)) { + struct prog_mapping *m = &env->mappings[env->mapping_used]; + m->src_prog_id = src_prog_id; + m->dst_prog_id = dst_prog_id; + env->mapping_used++; + } + + /* associate with end states */ + if (!fsm_capture_associate_program_with_end_state(env->dst, + dst_prog_id, dst_state)) { env->ok = 0; return 0; } @@ -141,18 +201,55 @@ copy_capture_cb(fsm_state_t state, } static int -copy_capture_actions(struct fsm *dst, struct fsm *src) +copy_capture_programs(struct fsm *dst, const struct fsm *src, + fsm_state_t state_base_src, unsigned capture_base_src) { - struct copy_capture_env env; - env.tag = 'C'; - env.dst = dst; - env.ok = 1; + const struct fsm_alloc *alloc = src->opt->alloc; + struct prog_mapping *mappings = f_malloc(alloc, + DEF_MAPPING_CEIL * sizeof(mappings[0])); + if (mappings == NULL) { + return 0; + } - fsm_capture_action_iter(src, copy_capture_cb, &env); + struct copy_capture_programs_env env = { + .alloc = alloc, + .src = src, + .dst = dst, + .ok = 1, + .state_base_src = state_base_src, + .capture_base_src = capture_base_src, + .mapping_ceil = DEF_MAPPING_CEIL, + .mappings = mappings, + }; + 
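	/* A note on the memo table above: the iterator below yields one
	 * (end state, program id) pair per association, and several end
	 * states typically share a single capture program, so mappings[]
	 * ensures each src program is copied, rebased, and added to dst
	 * exactly once; later visits only re-associate the existing dst
	 * program id.  The rebase shifts the program's capture ids past
	 * the ones dst already uses: with capture_base_src == 3, a src
	 * program that saved capture 1 must save capture 4 in the
	 * combined FSM (capvm internals are not shown in this patch). */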
fsm_capture_iter_program_ids_for_all_end_states(src, + copy_capture_programs_cb, &env); + + f_free(alloc, env.mappings); return env.ok; } +static int +copy_end_metadata(struct fsm *dst, struct fsm *src, + fsm_state_t base_src, unsigned capture_base_src) +{ + /* TODO: inline */ + + if (!copy_end_ids(dst, src, base_src)) { + return 0; + } + + if (!copy_active_capture_ids(dst, src, base_src, capture_base_src)) { + return 0; + } + + if (!copy_capture_programs(dst, src, base_src, capture_base_src)) { + return 0; + } + + return 1; +} + struct copy_end_ids_env { char tag; struct fsm *dst; @@ -162,11 +259,14 @@ struct copy_end_ids_env { }; static int -copy_end_ids_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) +copy_end_ids_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, const fsm_end_id_t id, void *opaque) { enum fsm_endid_set_res sres; struct copy_end_ids_env *env = opaque; assert(env->tag == 'M'); + (void)fsm; + (void)nth; #if LOG_MERGE_ENDIDS > 1 fprintf(stderr, "merge[%d] <- %d\n", @@ -188,7 +288,6 @@ copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) struct copy_end_ids_env env; env.tag = 'M'; /* for Merge */ env.dst = dst; - env.src = src; env.base_src = base_src; env.ok = 1; @@ -196,6 +295,45 @@ copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) return env.ok; } +struct copy_active_capture_ids_env { + char tag; + struct fsm *dst; + fsm_state_t base_src; + unsigned capture_base_src; + int ok; +}; + +static int +copy_active_capture_ids_cb(fsm_state_t state, unsigned capture_id, void *opaque) +{ + struct copy_active_capture_ids_env *env = opaque; + assert(env->tag == 'A'); + + if (!fsm_capture_set_active_for_end(env->dst, + capture_id + env->capture_base_src, + state + env->base_src)) { + env->ok = 0; + return 0; + } + return 1; +} + +static int +copy_active_capture_ids(struct fsm *dst, struct fsm *src, + fsm_state_t base_src, unsigned capture_base_src) +{ + struct copy_active_capture_ids_env env; + env.tag = 'A'; + env.dst = dst; + env.base_src = base_src; + env.capture_base_src = capture_base_src; + env.ok = 1; + + fsm_capture_iter_active_for_all_end_states(src, + copy_active_capture_ids_cb, &env); + return env.ok; +} + struct fsm * fsm_mergeab(struct fsm *a, struct fsm *b, fsm_state_t *base_b) diff --git a/src/libfsm/minimise.c b/src/libfsm/minimise.c index 7d95b78ad..6863395e2 100644 --- a/src/libfsm/minimise.c +++ b/src/libfsm/minimise.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -34,6 +35,54 @@ #include "minimise_internal.h" #include "minimise_test_oracle.h" +static int +label_sets_match(const uint64_t a[256/64], const uint64_t b[256/64]); + +static int +split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm); + +#if EXPENSIVE_CHECKS +#include + +static void +check_done_ec_offset(const struct min_env *env); + +static int +all_end_states_are_currently_together(const struct min_env *env); +#endif + +#define DEF_CAPTURE_ID_CEIL 4 +struct end_metadata { + struct end_metadata_end { + unsigned count; + fsm_end_id_t *ids; + } end; + + struct end_metadata_capture { + unsigned count; + unsigned ceil; + unsigned *ids; + } capture; + + struct end_metadata_program { + unsigned count; + unsigned ceil; + unsigned *ids; + } program; +}; + +static int +collect_end_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_end *e); + +static int +collect_capture_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_capture *c); + +static int +collect_capture_program_ids(const struct fsm 
*fsm, fsm_state_t s, + struct end_metadata_program *p); + int fsm_minimise(struct fsm *fsm) { @@ -51,19 +100,36 @@ fsm_minimise(struct fsm *fsm) assert(fsm != NULL); assert(fsm_all(fsm, fsm_isdfa)); +#if LOG_INIT > 1 + fprintf(stderr, "=== BEFORE TRIM, %d states\n", fsm_countstates(fsm)); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "#### pre_minimise", fsm); + fprintf(stderr, "=== BEFORE TRIM\n"); +#endif + /* The algorithm used below won't remove states without a path * to an end state, because it cannot prove they're * unnecessary, so they must be trimmed away first. */ + TIME(&pre); if (fsm_trim(fsm, FSM_TRIM_START_AND_END_REACHABLE, &shortest_end_distance) < 0) { return 0; } + TIME(&post); + DIFF_MSEC("trim", pre, post, NULL); if (fsm->statecount == 0) { r = 1; goto cleanup; } +#if LOG_INIT > 1 + fprintf(stderr, "=== AFTER TRIM, %d states\n", fsm_countstates(fsm)); + fprintf(stderr, "# pre_minimise\n"); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "#### pre_minimise", fsm); +#endif + TIME(&pre); collect_labels(fsm, labels, &label_count); TIME(&post); @@ -109,6 +175,8 @@ fsm_minimise(struct fsm *fsm) goto cleanup; } + fsm_capture_integrity_check(dst); + #if EXPENSIVE_CHECKS if (!fsm_capture_has_captures(fsm)) { struct fsm *oracle = fsm_minimise_test_oracle(fsm); @@ -249,6 +317,12 @@ build_minimised_mapping(const struct fsm *fsm, goto cleanup; } + /* This only needs to be run once, but must run before the main + * fixpoint loop below, because it potentially refines ECs. */ + if (!split_ecs_by_end_metadata(&env, fsm)) { + goto cleanup; + } + #if LOG_INIT for (i = 0; i < env.ec_count; i++) { fprintf(stderr, "# --ec[%lu]: %d\n", i, env.ecs[i]); @@ -325,7 +399,7 @@ build_minimised_mapping(const struct fsm *fsm, } } -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS check_done_ec_offset(&env); #endif } @@ -361,6 +435,12 @@ build_minimised_mapping(const struct fsm *fsm, } #endif +#if EXPENSIVE_CHECKS + for (i = 0; i < fsm->statecount; i++) { + assert(mapping[i] < fsm->statecount); + } +#endif + #if LOG_STEPS fprintf(stderr, "# done in %lu iteration(s), %lu step(s), %ld -> %ld states, label_count %lu\n", env.iter, env.steps, fsm->statecount, @@ -399,7 +479,7 @@ dump_ecs(FILE *f, const struct min_env *env) #endif } -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS static void check_descending_EC_counts(const struct min_env *env) { @@ -607,7 +687,7 @@ populate_initial_ecs(struct min_env *env, const struct fsm *fsm, /* The dead state is not a member of any EC. 
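 *
 * (Why the end-metadata split has to happen before the fixpoint loop:
 * consider the union of /a/ with end ID 1 and /b/ with end ID 2:
 *
 *     0 --a--> 1   (end, end IDs {1})
 *     0 --b--> 2   (end, end IDs {2})
 *
 * states 1 and 2 have identical outgoing behavior -- none at all --
 * so refinement by labels alone would never separate them, and
 * minimisation would merge them, losing which pattern matched.
 * Partitioning the initial ECs by end IDs, active capture IDs, and
 * capture program IDs keeps such states apart.)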
*/ env->state_ecs[env->dead_state] = NO_ID; -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS check_descending_EC_counts(env); #endif @@ -642,7 +722,456 @@ populate_initial_ecs(struct min_env *env, const struct fsm *fsm, #endif } -#if EXPENSIVE_INTEGRITY_CHECKS +static int +accum_endids(const struct fsm *fsm, fsm_state_t end_state, + size_t nth, fsm_end_id_t id, void *opaque) +{ + fsm_end_id_t *buf = opaque; + (void)fsm; + (void)end_state; + buf[nth] = id; + +#if LOG_ECS + fprintf(stderr, " %d", id); +#endif + + return 1; +} + +SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() +static void +incremental_hash_of_ids(uint64_t *accum, fsm_end_id_t id) +{ + (*accum) += hash_id(id); +} + +static int +same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) +{ + if (a->end.count != b->end.count) { + return 0; + } + + if (a->capture.count != b->capture.count) { + return 0; + } + + /* compare -- these must be sorted */ + + for (size_t i = 0; i < a->end.count; i++) { + if (a->end.ids[i] != b->end.ids[i]) { + return 0; + } + } + + for (size_t i = 0; i < a->capture.count; i++) { + if (a->capture.ids[i] != b->capture.ids[i]) { + return 0; + } + } + + return 1; +} + +static int +split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) +{ + int res = 0; + + struct end_metadata *end_md; + fsm_state_t *htab = NULL; + + const size_t state_count = fsm_countstates(fsm); + +#if EXPENSIVE_CHECKS + /* Invariant: For each EC, either all or none of the states + * are end states. We only partition the set(s) of end states + * here. */ + assert(all_end_states_are_currently_together(env)); +#endif + + /* Use the hash table to assign to new groups. */ + + end_md = f_calloc(fsm->opt->alloc, + state_count, sizeof(end_md[0])); + if (end_md == NULL) { + goto cleanup; + } + + size_t bucket_count = 1; + while (bucket_count < state_count) { + bucket_count *= 2; /* power of 2 ceiling */ + } + const size_t mask = bucket_count - 1; + + htab = f_malloc(fsm->opt->alloc, + bucket_count * sizeof(htab[0])); + if (htab == NULL) { + goto cleanup; + } + + /* First pass: collect end state metadata */ + for (size_t ec_i = 0; ec_i < env->ec_count; ec_i++) { + fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); +#if LOG_ECS + fprintf(stderr, "## EC %zu\n", ec_i); +#endif + while (s != NO_ID) { + struct end_metadata *e = &end_md[s]; + if (!fsm_isend(fsm, s)) { + break; /* this EC has non-end states, skip */ + } + + if (!collect_end_ids(fsm, s, &e->end)) { + goto cleanup; + } + + if (!collect_capture_ids(fsm, s, &e->capture)) { + goto cleanup; + } + + if (!collect_capture_program_ids(fsm, s, &e->program)) { + goto cleanup; + } + + s = env->jump[s]; + } + } + +#if LOG_ECS + fprintf(stderr, "==== BEFORE PARTITIONING BY END METADATA\n"); + dump_ecs(stderr, env); + fprintf(stderr, "====\n"); +#endif + + /* Second pass: partition ECs into groups with identical end IDs. + * for each group with different end IDs, unlink them. 
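 *
 * For example, if an EC links end states s1, s2, s3 with end IDs
 * {1}, {2}, {1}: s1 probes an empty bucket and stays as this EC's
 * head; s2's metadata differs from s1's, so it is unlinked to head
 * a fresh EC; s3 lands on s1's bucket with matching metadata, so it
 * stays linked here.  Result: [s1, s3] and [s2].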
*/ + const size_t max_ec = env->ec_count; + for (size_t ec_i = 0; ec_i < max_ec; ec_i++) { + fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); + fsm_state_t prev = NO_ID; + + for (size_t i = 0; i < bucket_count; i++) { + htab[i] = NO_ID; /* reset hash table */ + } + + while (s != NO_ID) { + const struct end_metadata *s_md = &end_md[s]; + + uint64_t hash = 0; + const fsm_state_t next = env->jump[s]; + + for (size_t eid_i = 0; eid_i < s_md->end.count; eid_i++) { + incremental_hash_of_ids(&hash, s_md->end.ids[eid_i]); + } + for (size_t pid_i = 0; pid_i < s_md->program.count; pid_i++) { + incremental_hash_of_ids(&hash, s_md->program.ids[pid_i]); + } + + for (size_t b_i = 0; b_i < bucket_count; b_i++) { + fsm_state_t *b = &htab[(b_i + hash) & mask]; + const fsm_state_t other = *b; + const struct end_metadata *other_md = &end_md[other]; + + if (other == NO_ID) { /* empty hash bucket */ + *b = s; + if (prev == NO_ID) { + /* keep the first state, along with other states + * with matching end IDs, in this EC. no-op. */ +#if LOG_ECS + fprintf(stderr, " -- keeping state s %d in EC %u\n", + s, env->state_ecs[s]); +#endif + prev = s; + } else { /* not first (prev is set), so it landed somewhere else */ + /* unlink and assign new EC */ +#if LOG_ECS + fprintf(stderr, " -- moving state s %d from EC %u to EC %u\n", + s, env->state_ecs[s], env->ec_count); +#endif + env->jump[prev] = env->jump[s]; /* unlink */ + env->ecs[env->ec_count] = s; /* head of new EC */ + env->state_ecs[s] = env->ec_count; + env->jump[s] = NO_ID; + env->ec_count++; + } + break; + } else if (same_end_metadata(s_md, other_md)) { + if (env->state_ecs[other] == ec_i) { + /* keep in the current EC -- no-op */ +#if LOG_ECS + fprintf(stderr, " -- keeping state s %d in EC %u\n", + s, env->state_ecs[s]); +#endif + prev = s; + } else { + /* unlink and link to other state's EC */ +#if LOG_ECS + fprintf(stderr, " -- appending s %d to EC %u, after state %d, before %d\n", + s, env->state_ecs[other], other, env->jump[other]); +#endif + assert(prev != NO_ID); + env->jump[prev] = env->jump[s]; /* unlink */ + env->state_ecs[s] = env->state_ecs[other]; + env->jump[s] = env->jump[other]; + env->jump[other] = s; /* link after other */ + } + break; + } else { + continue; /* collision */ + } + } + + s = next; + } + + /* If this EC only has one entry and it's before the + * done_ec_offset, then set that here so that invariants + * will be restored while sweeping forward after this loop. */ + + if (env->jump[MASK_EC_HEAD(env->ecs[ec_i])] == NO_ID && ec_i < env->done_ec_offset) { + env->done_ec_offset = ec_i; /* will be readjusted later */ + } + +#if LOG_ECS + fprintf(stderr, "==== AFTER PARTITIONING BY END METADATA -- EC %zu\n", ec_i); + dump_ecs(stderr, env); + fprintf(stderr, "==== (done_ec_offset: %d)\n", env->done_ec_offset); +#endif + } + +#if LOG_ECS + fprintf(stderr, "==== AFTER PARTITIONING BY END IDs\n"); + dump_ecs(stderr, env); + fprintf(stderr, "==== (done_ec_offset: %d)\n", env->done_ec_offset); +#endif + + /* Sweep forward and swap ECs as necessary so all single-entry + * ECs are at the end -- they're done. 
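 *
 * The invariant being restored:
 *
 *     ecs[0 .. done_ec_offset-1]      two or more states, refinable
 *     ecs[done_ec_offset .. count-1]  single state, finished
 *
 * so later passes only need to revisit the refinable prefix.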
*/ + size_t ec_i = env->done_ec_offset; + + while (ec_i < env->ec_count) { + const fsm_state_t head = MASK_EC_HEAD(env->ecs[ec_i]); + if (env->jump[head] == NO_ID) { + /* offset stays where it is */ +#if LOG_ECS + fprintf(stderr, "ec_i: %zu / %u -- branch a\n", ec_i, env->ec_count); +#endif + env->ecs[ec_i] = SET_SMALL_EC_FLAG(head); + } else { + /* this EC has more than one state, but is after + * the done_ec_offset, so swap it with an EC at + * the boundary. */ + const fsm_state_t n_ec_i = env->done_ec_offset; +#if LOG_ECS + fprintf(stderr, "ec_i: %zu / %u -- branch b -- swap %ld and %d\n", + ec_i, env->ec_count, ec_i, n_ec_i); +#endif + + /* swap ec[n_ec_i] and ec[ec_i] */ + const fsm_state_t tmp = env->ecs[ec_i]; + env->ecs[ec_i] = env->ecs[n_ec_i]; + env->ecs[n_ec_i] = tmp; + /* note: this may set the SMALL_EC_FLAG. */ + update_ec_links(env, ec_i); + update_ec_links(env, n_ec_i); + env->done_ec_offset++; + } + ec_i++; + } + +#if LOG_ECS + fprintf(stderr, "==== (done_ec_offset is now: %d, ec_count %u)\n", env->done_ec_offset, env->ec_count); + dump_ecs(stderr, env); +#endif + + /* check that all ECs are before/after done_ec_offset */ + for (size_t ec_i = 0; ec_i < env->ec_count; ec_i++) { + const fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); +#if LOG_ECS + fprintf(stderr, " -- ec_i %zu: s %d\n", ec_i, s); +#endif + if (ec_i < env->done_ec_offset) { + assert(env->jump[s] != NO_ID); + } else { + assert(env->jump[s] == NO_ID); + } + } + + res = 1; + +cleanup: + if (htab != NULL) { + f_free(fsm->opt->alloc, htab); + } + if (end_md != NULL) { + size_t i; + for (i = 0; i < state_count; i++) { + struct end_metadata *e = &end_md[i]; + if (e->end.ids != NULL) { + f_free(fsm->opt->alloc, e->end.ids); + } + if (e->capture.ids != NULL) { + f_free(fsm->opt->alloc, e->capture.ids); + } + if (e->program.ids != NULL) { + f_free(fsm->opt->alloc, e->program.ids); + } + } + f_free(fsm->opt->alloc, end_md); + } + + return res; +} + +static int +cmp_end_ids(const void *pa, const void *pb) +{ + const fsm_end_id_t a = *(fsm_end_id_t *)pa; + const fsm_end_id_t b = *(fsm_end_id_t *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +static int +cmp_unsigned(const void *pa, const void *pb) +{ + const unsigned a = *(unsigned *)pa; + const unsigned b = *(unsigned *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +static int +collect_end_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_end *e) +{ + e->count = fsm_getendidcount(fsm, s); + + if (e->count > 0) { + e->ids = f_malloc(fsm->opt->alloc, + e->count * sizeof(e->ids[0])); + if (e->ids == NULL) { + return 0; + } + +#if LOG_ECS + fprintf(stderr, "%d:", s); +#endif + fsm_iterendids(fsm, s, + accum_endids, (void *)e->ids); +#if LOG_ECS + fprintf(stderr, "\n"); +#endif + /* sort, to make comparison easier later */ + qsort(e->ids, e->count, + sizeof(e->ids[0]), cmp_end_ids); + } + return 1; +} + +struct collect_capture_env { + int ok; + const struct fsm_alloc *alloc; + struct end_metadata_capture *c; + struct end_metadata_program *p; +}; + +static int +collect_capture_cb(fsm_state_t state, unsigned capture_id, + void *opaque) +{ + struct collect_capture_env *env = opaque; + struct end_metadata_capture *c = env->c; + (void)state; + if (c->count == c->ceil) { + const size_t nceil = (c->count == 0) + ? 
DEF_CAPTURE_ID_CEIL + : 2*c->ceil; + unsigned *nids = f_realloc(env->alloc, c->ids, nceil * sizeof(nids[0])); + if (nids == NULL) { + env->ok = 0; + return 0; + } + c->ids = nids; + c->ceil = nceil; + } + + c->ids[c->count] = capture_id; + c->count++; + return 1; +} + +static int +collect_capture_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_capture *c) +{ + struct collect_capture_env env = { + .ok = 1, + .alloc = fsm->opt->alloc, + .c = c, + }; + fsm_capture_iter_active_for_end_state(fsm, s, + collect_capture_cb, &env); + + if (env.ok) { + if (c->ids == NULL) { + assert(c->count == 0); + } else { + qsort(c->ids, c->count, sizeof(c->ids[0]), cmp_unsigned); + } + } + + return env.ok; +} + +static int +collect_capture_program_ids_cb(fsm_state_t state, unsigned prog_id, + void *opaque) +{ + struct collect_capture_env *env = opaque; + struct end_metadata_program *p = env->p; + (void)state; + if (p->count == p->ceil) { + const size_t nceil = (p->count == 0) + ? DEF_CAPTURE_ID_CEIL + : 2*p->ceil; + unsigned *nids = f_realloc(env->alloc, p->ids, nceil * sizeof(nids[0])); + if (nids == NULL) { + env->ok = 0; + return 0; + } + p->ids = nids; + p->ceil = nceil; + } + + p->ids[p->count] = prog_id; + p->count++; + return 1; +} + +static int +collect_capture_program_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_program *p) +{ + struct collect_capture_env env = { + .ok = 1, + .alloc = fsm->opt->alloc, + .p = p, + }; + fsm_capture_iter_program_ids_for_end_state(fsm, s, + collect_capture_program_ids_cb, &env); + + if (env.ok) { + if (p->ids == NULL) { + assert(p->count == 0); + } else { + qsort(p->ids, p->count, sizeof(p->ids[0]), cmp_unsigned); + } + } + + return env.ok; +} + +#if EXPENSIVE_CHECKS static void check_done_ec_offset(const struct min_env *env) { @@ -657,13 +1186,34 @@ check_done_ec_offset(const struct min_env *env) * worth the added complexity to avoid checking ECs 0 and 1. */ for (i = 0; i < env->ec_count; i++) { const fsm_state_t head = MASK_EC_HEAD(env->ecs[i]); - if (i >= done_ec_offset) { + if (i >= env->done_ec_offset) { assert(head == NO_ID || env->jump[head] == NO_ID); } else if (i >= 2) { assert(env->jump[head] != NO_ID); } } } + +static int +all_end_states_are_currently_together(const struct min_env *env) +{ + /* For each EC, either all or none of the states in it + * are end states. */ + for (size_t i = 0; i < env->ec_count; i++) { + const fsm_state_t head = MASK_EC_HEAD(env->ecs[i]); + const int ec_first_is_end = fsm_isend(env->fsm, head); + + fsm_state_t s = env->jump[head]; + while (s != NO_ID) { + if (fsm_isend(env->fsm, s) != ec_first_is_end) { + return 0; + } + s = env->jump[s]; + } + } + + return 1; +} #endif static int @@ -813,7 +1363,7 @@ try_partition(struct min_env *env, unsigned char label, const unsigned dead_state_ec = env->state_ecs[env->dead_state]; const struct fsm_state *states = env->fsm->states; -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS /* Count states here, to compare against the partitioned * EC' counts later. 
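(An aside on the cmp_end_ids/cmp_unsigned helpers above: the explicit three-way comparison matters because the operands are unsigned. A minimal sketch, assuming <limits.h>:)

	const unsigned a = 1, b = UINT_MAX;			/* a < b */
	const int three_way = a < b ? -1 : a > b ? 1 : 0;	/* -1, correct */
	const int subtract  = (int)(a - b);			/* wraps to 2: claims a > b */

(qsort comparators written as "return a - b" have exactly this bug.)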
*/ size_t state_count = 0, psrc_count, pdst_count; @@ -853,7 +1403,7 @@ try_partition(struct min_env *env, unsigned char label, first_ec = dead_state_ec; } #if LOG_PARTITIONS > 1 - fprintf(stderr, "# --- try_partition: label '%c' -> EC %d\n", label, first_ec); + fprintf(stderr, "# --- try_partition: label '%c' -> first_ec %d\n", label, first_ec); #endif partition_counts[0] = 1; @@ -893,7 +1443,7 @@ try_partition(struct min_env *env, unsigned char label, partition_counts[0]++; prev = cur; cur = env->jump[cur]; - } else { /* unlink, split */ + } else if (to_ec != first_ec) { /* definitely different destination EC: unlink, split */ fsm_state_t next; #if LOG_PARTITIONS > 1 fprintf(stderr, "# try_partition: unlinking -- label '%c', src %u, dst %u, first_ec %d, cur %u -> to_ec %d\n", label, ec_src, ec_dst, first_ec, cur, to_ec); @@ -908,10 +1458,21 @@ try_partition(struct min_env *env, unsigned char label, env->ecs[ec_dst] = cur; cur = next; partition_counts[1]++; + } else { + /* Restrict the ones that will be marked as checked + * to the common subset before continuing, so that any + * other labels will still be checked in a later pass. */ + for (size_t i = 0; i < 4; i++) { + checked_labels[i] &= cur_label_set[i]; + } + + partition_counts[0]++; + prev = cur; + cur = env->jump[cur]; } } -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS /* Count how many states were split into each EC * and check that the sum matches the original count. */ psrc_count = 0; diff --git a/src/libfsm/print/Makefile b/src/libfsm/print/Makefile index c2911318b..9c6f42bbf 100644 --- a/src/libfsm/print/Makefile +++ b/src/libfsm/print/Makefile @@ -20,6 +20,9 @@ SRC += src/libfsm/print/vmasm.c .for src in ${SRC:Msrc/libfsm/print/*.c} CFLAGS.${src} += -I src # XXX: for internal.h DFLAGS.${src} += -I src # XXX: for internal.h + +CFLAGS.${src} += -std=c99 +DFLAGS.${src} += -std=c99 .endfor .for src in ${SRC:Msrc/libfsm/print/*.c} diff --git a/src/libfsm/state.c b/src/libfsm/state.c index acf2bff25..c4425077a 100644 --- a/src/libfsm/state.c +++ b/src/libfsm/state.c @@ -17,6 +17,8 @@ #include #include "internal.h" +#include "capture.h" +#include "endids.h" int fsm_addstate(struct fsm *fsm, fsm_state_t *state) @@ -33,17 +35,12 @@ fsm_addstate(struct fsm *fsm, fsm_state_t *state) const size_t factor = 2; /* a guess */ const size_t n = fsm->statealloc * factor; struct fsm_state *tmp; - size_t i; tmp = f_realloc(fsm->opt->alloc, fsm->states, n * sizeof *fsm->states); if (tmp == NULL) { return 0; } - for (i = fsm->statealloc; i < n; i++) { - tmp[i].has_capture_actions = 0; - } - fsm->statealloc = n; fsm->states = tmp; } @@ -253,6 +250,18 @@ fsm_compact_states(struct fsm *fsm, } } + if (!fsm_endid_compact(fsm, mapping, orig_statecount)) { + goto error; + } + + if (!fsm_capture_id_compact(fsm, mapping, orig_statecount)) { + goto error; + } + + if (!fsm_capture_program_association_compact(fsm, mapping, orig_statecount)) { + goto error; + } + assert(dst == kept); assert(kept == fsm->statecount); @@ -278,4 +287,9 @@ fsm_compact_states(struct fsm *fsm, *removed = removed_count; } return 1; + +error: + f_free(fsm->opt->alloc, mapping); + + return 0; } diff --git a/src/libfsm/trim.c b/src/libfsm/trim.c index 6a9a25f09..36bf9145d 100644 --- a/src/libfsm/trim.c +++ b/src/libfsm/trim.c @@ -10,13 +10,16 @@ #include #include +#include #include #include +#include #include #include #include "internal.h" +#include "capture.h" #define DEF_EDGES_CEIL 8 #define DEF_ENDS_CEIL 8 @@ -42,17 +45,18 @@ save_edge(const struct fsm_alloc *alloc, size_t 
*count, size_t *ceil, struct edge **edges, fsm_state_t from, fsm_state_t to); -static int -cmp_edges_by_to(const void *pa, const void *pb) +static fsm_state_t +get_max_to(const struct edge *edges, size_t edge_count) { - const struct edge *a = (const struct edge *)pa; - const struct edge *b = (const struct edge *)pb; - - return a->to < b->to ? -1 - : a->to > b->to ? 1 - : a->from < b->from ? -1 - : a->from > b->from ? 1 - : 0; + size_t i; + fsm_state_t res = edges[0].to; + for (i = 1; i < edge_count; i++) { + const fsm_state_t to = edges[i].to; + if (to > res) { + res = to; + } + } + return res; } static int @@ -61,7 +65,7 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, { /* Use a queue to walk breath-first over all states reachable * from the start state. Note all end states. Collect all the - * edges, then sort them by the note they lead to, to convert it + * edges, then sort them by the node they lead to, to convert it * to a reverse edge index. Then, enqueue all the end states, * and again use the queue to walk the graph breadth-first, but * this time iterating bottom-up from the end states, and mark @@ -81,10 +85,13 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, fsm_state_t max_end; const size_t state_count = fsm->statecount; + fsm_state_t max_to; + unsigned *pv = NULL; size_t *offsets = NULL; + INIT_TIMERS(); - if (!fsm_getstart(fsm, &start)) { + if (!fsm_getstart(fsm, &start) || state_count == 0) { return 1; /* nothing is reachable */ } @@ -225,7 +232,33 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, } /* Sort edges by state they lead to, inverting the index. */ - qsort(edges, edge_count, sizeof(edges[0]), cmp_edges_by_to); + max_to = edge_count == 0 ? 0 : get_max_to(edges, edge_count); +#if LOG_TRIM + fprintf(stderr, " -- edge count %zu, got max_to %u\n", edge_count, max_to); +#endif + TIME(&pre); + pv = permutation_vector_with_size_and_offset(fsm->opt->alloc, + edge_count, max_to, edges, sizeof(edges[0]), offsetof(struct edge, to)); + TIME(&post); + DIFF_MSEC("trim_pv_so", pre, post, NULL); + + if (EXPENSIVE_CHECKS) { + size_t i; + int ok = 1; +#if LOG_TRIM + fprintf(stderr, "\n#i\tedge\tpv\tsorted, max_to %u\n", max_to); +#endif + for (i = 0; i < edge_count; i++) { +#if LOG_TRIM + fprintf(stderr, "%zu\t%u\t%u\t%u\n", + i, edges[i].to, pv[i], edges[pv[i]].to); +#endif + if (i > 0 && edges[pv[i]].to < edges[pv[i - 1]].to) { + ok = 0; + } + } + assert(ok); + } max_end = 0; @@ -282,7 +315,6 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, * offsets[i - 1], to represent zero entries. */ { size_t i; - const fsm_state_t max_to = edges[edge_count - 1].to; const size_t offset_count = fsm_countstates(fsm); offsets = f_calloc(fsm->opt->alloc, @@ -292,7 +324,7 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, } for (i = 0; i < edge_count; i++) { - const fsm_state_t to = edges[i].to; + const fsm_state_t to = edges[pv[i]].to; offsets[to] = i + 1; } @@ -312,8 +344,8 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, if (LOG_TRIM > 1) { size_t i; for (i = 0; i < edge_count; i++) { - fprintf(stderr, "mark_states: edges[%zu]: %d -> %d\n", - i, edges[i].from, edges[i].to); + fprintf(stderr, "mark_states: edges[pv[%zu]]: %d -> %d\n", + i, edges[pv[i]].from, edges[pv[i]].to); } } @@ -331,13 +363,13 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, } for (e_i = base; e_i < limit; e_i++) { - const fsm_state_t from = edges[e_i].from; + const fsm_state_t from = edges[pv[e_i]].from; const unsigned end_distance = (sed == NULL ? 
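	/* (Note on permutation_vector_with_size_and_offset above: its body is
	 * not part of this patch, but the contract relied on here is that
	 * edges[pv[i]].to is nondecreasing in i.  A counting-sort sketch with
	 * the same contract, O(edge_count + max_to) rather than a comparison
	 * sort:
	 *
	 *     unsigned *pv = malloc(edge_count * sizeof *pv);
	 *     size_t *cnt = calloc((size_t)max_to + 2, sizeof *cnt);
	 *     for (size_t i = 0; i < edge_count; i++)
	 *             cnt[edges[i].to + 1]++;
	 *     for (size_t k = 1; k <= (size_t)max_to + 1; k++)
	 *             cnt[k] += cnt[k - 1];
	 *     for (size_t i = 0; i < edge_count; i++)
	 *             pv[cnt[edges[i].to]++] = (unsigned)i;
	 *
	 * Stability keeps runs of equal .to values in submission order, which
	 * is what lets the offsets[] table built below index those runs.) */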
0 : sed[s_id]); assert(from < state_count); if (LOG_TRIM > 0) { - fprintf(stderr, "mark_states: edges[%zu]: from: %d, visited? %d\n", + fprintf(stderr, "mark_states: edges[pv[%zu]]: from: %d, visited? %d\n", e_i, from, fsm->states[from].visited); } @@ -370,6 +402,7 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, if (ends != NULL) { f_free(fsm->opt->alloc, ends); } if (offsets != NULL) { f_free(fsm->opt->alloc, offsets); } if (q != NULL) { queue_free(q); } + if (pv != NULL) { f_free(fsm->opt->alloc, pv); } return res; } @@ -457,7 +490,7 @@ integrity_check(const char *descr, const struct fsm *fsm) struct edge_iter edge_iter; struct fsm_edge e; -#ifdef NDEBUG +#if defined(NDEBUG) || !EXPENSIVE_CHECKS return; #endif @@ -484,8 +517,14 @@ integrity_check(const char *descr, const struct fsm *fsm) } } + fsm_capture_integrity_check(fsm); + if (LOG_TRIM > 1) { fprintf(stderr, "integrity check: %s...PASS\n", descr); + if (LOG_TRIM > 2) { + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "post_trim", fsm); + } } } @@ -504,6 +543,8 @@ fsm_trim(struct fsm *fsm, enum fsm_trim_mode mode, return 1; } + integrity_check("pre", fsm); + if (shortest_end_distance != NULL && mode == FSM_TRIM_START_AND_END_REACHABLE) { size_t s_i; diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 455dc8359..a99b49be2 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -32,6 +32,7 @@ fsm_union(struct fsm *a, struct fsm *b, if (combine_info == NULL) { combine_info = &combine_info_internal; } + memset(combine_info, 0x00, sizeof(*combine_info)); assert(a != NULL); assert(b != NULL); @@ -101,8 +102,16 @@ fsm_union_array(size_t fsm_count, fsms[0] = NULL; memset(bases, 0x00, fsm_count * sizeof(bases[0])); +#if EXPENSIVE_CHECKS + size_t capture_count = fsm_capture_count(res); +#endif + for (i = 1; i < fsm_count; i++) { struct fsm_combine_info ci; +#if EXPENSIVE_CHECKS + capture_count += fsm_capture_count(fsms[i]); +#endif + struct fsm *combined = fsm_union(res, fsms[i], &ci); fsms[i] = NULL; if (combined == NULL) { @@ -138,5 +147,8 @@ fsm_union_array(size_t fsm_count, } #endif +#if EXPENSIVE_CHECKS + assert(fsm_capture_count(res) == capture_count); +#endif return res; } diff --git a/src/libre/Makefile b/src/libre/Makefile index a88a92418..508b77a76 100644 --- a/src/libre/Makefile +++ b/src/libre/Makefile @@ -10,13 +10,17 @@ SRC += src/libre/ast_new_from_fsm.c SRC += src/libre/ast_rewrite.c SRC += src/libre/ac.c SRC += src/libre/re_strings.c +SRC += src/libre/re_capvm_compile.c # generated SRC += src/libre/class_name.c -.for src in ${SRC:Msrc/libre/ast_compile.c} +.for src in ${SRC:Msrc/libre/ast*.c} ${SRC:Msrc/libre/re*.c} CFLAGS.${src} += -I src # XXX: for internal.h DFLAGS.${src} += -I src # XXX: for internal.h + +CFLAGS.${src} += -std=c99 +DFLAGS.${src} += -std=c99 .endfor LIB += libre diff --git a/src/libre/ast.c b/src/libre/ast.c index 2e1d21817..6bd6063f5 100644 --- a/src/libre/ast.c +++ b/src/libre/ast.c @@ -494,6 +494,7 @@ ast_expr_cmp(const struct ast_expr *a, const struct ast_expr *b) case AST_EXPR_GROUP: if (a->u.group.id < b->u.group.id) { return -1; } if (a->u.group.id > b->u.group.id) { return +1; } + /* .repeated flag is ignored here */ return ast_expr_cmp(a->u.group.e, b->u.group.e); @@ -753,6 +754,7 @@ ast_make_expr_group(struct ast_expr_pool **poolp, enum re_flags re_flags, struct res->re_flags = re_flags; res->u.group.e = e; res->u.group.id = id; + res->u.group.repeated = 0; /* may be set during analysis */ return res; } @@ -770,6 +772,7 @@ ast_make_expr_anchor(struct ast_expr_pool 
**poolp, enum re_flags re_flags, enum res->type = AST_EXPR_ANCHOR; res->re_flags = re_flags; res->u.anchor.type = type; + res->u.anchor.is_end_nl = 0; /* may be set later */ return res; } diff --git a/src/libre/ast.h b/src/libre/ast.h index 233744847..20cc3cdb7 100644 --- a/src/libre/ast.h +++ b/src/libre/ast.h @@ -7,6 +7,11 @@ #ifndef RE_AST_H #define RE_AST_H +#include +#include +#include +#include + /* * This is a duplicate of struct lx_pos, but since we're linking to * code with several distinct lexers, there isn't a clear lexer.h @@ -62,7 +67,9 @@ enum ast_anchor_type { * followed by nullable nodes. * * - AST_FLAG_UNSATISFIABLE - * The node caused the regex to become unsatisfiable. + * The node is unsatisfiable (can never match anything). + * This can cause AST subtrees to be pruned, or for the + * entire regex to become unsatisfiable. * * - AST_FLAG_NULLABLE * The node is not always evaluated, such as nodes that @@ -159,6 +166,8 @@ struct ast_expr { size_t count; /* used */ size_t alloc; /* allocated */ struct ast_expr **n; + int contains_empty_groups; + int nullable_alt_inside_plus_repeat; } alt; struct { @@ -172,12 +181,15 @@ struct ast_expr { struct ast_expr_repeat { struct ast_expr *e; unsigned min; - unsigned max; + unsigned max; /* can be AST_COUNT_UNBOUNDED */ + int contains_empty_groups; + int contains_nullable_alt; } repeat; struct { struct ast_expr *e; unsigned id; + int repeated; /* set during analysis */ } group; struct { @@ -235,9 +247,12 @@ ast_pool_free(struct ast_expr_pool *pool); struct ast_expr_pool * ast_expr_pool_save(void); +#define AST_NO_MAX_CAPTURE_ID ((long)-1) + struct ast { struct ast_expr_pool *pool; struct ast_expr *expr; + long max_capture_id; int has_unanchored_start; int has_unanchored_end; }; diff --git a/src/libre/ast_analysis.c b/src/libre/ast_analysis.c index 945eeb814..70e65cad4 100644 --- a/src/libre/ast_analysis.c +++ b/src/libre/ast_analysis.c @@ -19,7 +19,7 @@ #define LOG_ANALYSIS 0 #define LOG_FIRST_ANALYSIS (0 + LOG_ANALYSIS) -#define LOG_REPEATED_GROUPS (0 + LOG_ANALYSIS) +#define LOG_REPETITION_CASES (0 + LOG_ANALYSIS) #define LOG_FORKING (0 + LOG_ANALYSIS) #define LOG_ANCHORING (0 + LOG_ANALYSIS) #define LOG_CONCAT_FLAGS (0 + LOG_ANALYSIS) @@ -35,6 +35,11 @@ /* Mask for end-anchor flags */ #define END_ANCHOR_FLAG_MASK (AST_FLAG_ANCHORED_END | AST_FLAG_END_NL) +struct capture_env { + long max_capture_id; + int use_captures; +}; + static int is_nullable(const struct ast_expr *n) { @@ -224,7 +229,7 @@ analysis_iter(struct ast_expr *n) } case AST_EXPR_ANCHOR: - /* anchor flags will be handled on the second pass */ + /* anchor flags will be handled on the next pass */ break; case AST_EXPR_SUBTRACT: @@ -252,23 +257,281 @@ analysis_iter(struct ast_expr *n) return AST_ANALYSIS_OK; } -/* Analysis for unanchored starts/ends uses three values, because some - * details decide the whole subtree is/isn't, others should defer to - * analysis elsewhere it the tree, but an overall result of undecided - * still decides yes. 
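(Note on the new max_capture_id field in struct ast, above: it is a signed long so that "no capture groups at all" gets the distinct sentinel AST_NO_MAX_CAPTURE_ID rather than overloading 0, which is a valid group id. A buffer-sizing sketch, assuming the usual PCRE-style numbering where group 0 spans the whole match:)

	if (ast->max_capture_id == AST_NO_MAX_CAPTURE_ID) {
		/* pattern has no groups; no capture buffer needed */
	} else {
		/* groups 0 .. max_capture_id, inclusive */
		const size_t capture_count = (size_t)ast->max_capture_id + 1;
	}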
*/ -enum unanchored_analysis_res { - UA_NO = 'N', - UA_YES = 'Y', - UA_UNDECIDED = 'U', +static int +is_only_anchors(struct ast_expr *expr) +{ + if (can_consume_input(expr)) { return 0; } + + switch (expr->type) { + case AST_EXPR_ANCHOR: + return 1; + + case AST_EXPR_CONCAT: + if (expr->u.concat.count == 0) { return 0; } + for (size_t i = 0; i < expr->u.concat.count; i++) { + if (!is_only_anchors(expr->u.concat.n[i]) + && can_consume_input(expr->u.concat.n[i])) { + return 0; + } + } + return 1; + + case AST_EXPR_ALT: + assert(expr->u.alt.count > 0); + for (size_t i = 0; i < expr->u.alt.count; i++) { + /* earlier matches will shadow later ones */ + if (is_only_anchors(expr->u.alt.n[i])) { + return 1; + } + } + return 0; + + case AST_EXPR_REPEAT: + if (expr->u.repeat.min == 0 && expr->u.repeat.max == 0) { + return 0; + } + return is_only_anchors(expr->u.repeat.e); + + case AST_EXPR_GROUP: + return is_only_anchors(expr->u.group.e); + + default: + break; + } + + return 0; +} + +static enum ast_analysis_res +analysis_iter_repetition(struct ast_expr *n, struct ast_expr *outermost_repeat_parent, + int shadowed_by_previous_alt_case, struct ast_expr *repeat_plus_ancestor) +{ + enum ast_analysis_res res = AST_ANALYSIS_OK; + + LOG(3 - LOG_REPETITION_CASES, "%s: node %p, type %s, shadowed_by_previous_alt_case %d\n", + __func__, (void *)n, ast_node_type_name(n->type), shadowed_by_previous_alt_case); + + if (shadowed_by_previous_alt_case) { + assert(outermost_repeat_parent == NULL + || outermost_repeat_parent->type == AST_EXPR_ALT); + } + + switch (n->type) { + case AST_EXPR_EMPTY: + case AST_EXPR_TOMBSTONE: + case AST_EXPR_ANCHOR: + case AST_EXPR_LITERAL: + case AST_EXPR_CODEPOINT: + case AST_EXPR_RANGE: + break; + + case AST_EXPR_CONCAT: { + /* If this CONCAT array of nodes always consumes input, then + * it cannot be repeated empty, so it cannot produce the + * special case that needs outermost_repeat_parent for + * AST_EXPR_REPEAT's case below. + * + * An example input that needs this is 'x(()x)*' for "xx", + * because the 'x' prevents the outermost group + * from repeating and matching empty again after consuming + * a run of "x"s. */ + if (always_consumes_input(n)) { + outermost_repeat_parent = NULL; + } + + for (size_t i = 0; i < n->u.concat.count; i++) { + res = analysis_iter_repetition(n->u.concat.n[i], outermost_repeat_parent, + shadowed_by_previous_alt_case, repeat_plus_ancestor); + if (res != AST_ANALYSIS_OK) { return res; } + } + break; + } + + case AST_EXPR_ALT: { + /* See AST_EXPR_GROUP below for why this matters. */ + int new_shadowed_by_previous_alt_case = shadowed_by_previous_alt_case; + + /* FIXME: check nesting of this construct */ + + LOG(3 - LOG_REPETITION_CASES, + "%s: ALT node %p, repeat_plus_ancestor %p\n", + __func__, (void *)n, (void *)repeat_plus_ancestor); + + for (size_t i = 0; i < n->u.alt.count; i++) { + /* If this is an ALT inside of a repeated subtree that contains + * a capture, this will need special handling. 
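 * (For instance, in '^(?:|(|x))*$' the leading empty branch matches
 * first on every iteration and shadows the later '(|x)' branch; the
 * shadowed_by_previous_alt_case flag threaded through here exists so
 * that a group shadowed this way is not marked repeated.)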
*/ + if (outermost_repeat_parent != NULL) { + LOG(3 - LOG_REPETITION_CASES, + "%s: setting outermost_repeat_parent to %p for alt branch %zu, repeat_plus_ancestor %p\n", + __func__, (void *)n, i, (void *)repeat_plus_ancestor); + outermost_repeat_parent = n; + } + + if (is_nullable(n->u.alt.n[i]) || is_only_anchors(n->u.alt.n[i])) { + LOG(3 - LOG_REPETITION_CASES, + "%s: setting new_shadowed_by_previous_alt_case for alt branch %zu, repeat_plus_ancestor %p\n", + __func__, i, (void *)repeat_plus_ancestor); + new_shadowed_by_previous_alt_case = 1; + if (repeat_plus_ancestor != NULL) { + n->u.alt.nullable_alt_inside_plus_repeat = 1; + assert(repeat_plus_ancestor->type == AST_EXPR_REPEAT); + assert(repeat_plus_ancestor->u.repeat.min == 1); + assert(repeat_plus_ancestor->u.repeat.max == AST_COUNT_UNBOUNDED); + repeat_plus_ancestor->u.repeat.contains_nullable_alt = 1; + + /* Repetition of an alt subtree which has a capture group child that + * only contains only* anchors is not handled properly yet. This + * isn't actually _useful_, it's just something that comes up + * in fuzzing, so reject it as an unsupported PCRE construct. + * + * An example input that triggers this is '^(($)|)+$' . */ + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } + } + + res = analysis_iter_repetition(n->u.alt.n[i], + outermost_repeat_parent, + new_shadowed_by_previous_alt_case, + repeat_plus_ancestor); + if (res != AST_ANALYSIS_OK) { return res; } + } + break; + } + + case AST_EXPR_REPEAT: + { + struct ast_expr *child = n->u.repeat.e; + + LOG(3 - LOG_REPETITION_CASES, "%s: REPEAT node %p, min %u max %u nullable? %d, !cci %d\n", + __func__, (void *)n, n->u.repeat.min, n->u.repeat.max, + is_nullable(child), can_consume_input(child)); + + if (n->u.repeat.min == 1 && + n->u.repeat.max == AST_COUNT_UNBOUNDED) { + LOG(3 - LOG_REPETITION_CASES, "%s: setting repeat_plus_ancestor to %p\n", + __func__, (void *)n); + repeat_plus_ancestor = n; + } else { + repeat_plus_ancestor = NULL; + } + + /* Special cases for a repeated group that contains possibly empty captures, + * in order to correctly reflect their repeating one more time and capture + * at the end (but without an infinite loop). + * + * For example, '^((x?))*$' will always end up with capture groups 1 and 2 + * at the end of the input for any number of "x"s, since the outermost ()* + * can always repeat once more time, consuming nothing, and clobber the + * existing captures. We mark repeated groups so that the compiled capture + * program can move saving the captures after the repetition, instead + * behaving like `^((?:x?)*(())$`. + * + * However, if the repeated subtree always consumes input, such as with + * '^(()a)+b$', then clear any passed in outermost_repeat_parent, because + * having to consume input will prevent that extra repetition of the + * empty captures. 
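 *
 * Concretely, for '^((x?))*$' against "xx": the star consumes 'x'
 * twice, then iterates one final time on the empty string, so both
 * groups are expected to end as an empty match at offset 2 -- the
 * end of the input -- rather than spanning the last 'x'.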
*/ + if (always_consumes_input(n)) { + res = analysis_iter_repetition(child, NULL, shadowed_by_previous_alt_case, + repeat_plus_ancestor); + } else if (outermost_repeat_parent == NULL && n->u.repeat.max > 1) { + LOG(3 - LOG_REPETITION_CASES, "%s: recursing with outermost_repeat_parent set to %p\n", + __func__, (void *)n); + res = analysis_iter_repetition(child, n, 0, + repeat_plus_ancestor); + } else { + LOG(3 - LOG_REPETITION_CASES, "%s: recursing with outermost_repeat_parent %p\n", + __func__, (void *)outermost_repeat_parent); + res = analysis_iter_repetition(child, outermost_repeat_parent, shadowed_by_previous_alt_case, + repeat_plus_ancestor); + } + if (res != AST_ANALYSIS_OK) { return res; } + break; + } + + case AST_EXPR_GROUP: + LOG(3 - LOG_REPETITION_CASES, + "%s: GROUP %p, repeat_plus_ancestor %p\n", + __func__, (void *)n, (void *)repeat_plus_ancestor); + + + if (outermost_repeat_parent != NULL && (is_nullable(n) || !can_consume_input(n))) { + int should_mark_repeated = 1; + /* If the outermost_repeat_parent is an ALT node and a previous ALT subtree + * matching the empty string is shadowing this group, then do not mark it + * as repeated, because that can lead to incorrect handling in somewhat + * contrived regexes like '^(?:|(|x))*$'. */ + if (outermost_repeat_parent->type == AST_EXPR_ALT && shadowed_by_previous_alt_case) { + LOG(3 - LOG_REPETITION_CASES, + "%s: hit group shadowed_by_previous_alt_case, skipping\n", __func__); + should_mark_repeated = 0; + } + + if (n->flags & (AST_FLAG_ANCHORED_START | AST_FLAG_ANCHORED_END)) { + LOG(3 - LOG_REPETITION_CASES, + "%s: hit repeating anchor, skipping\n", __func__); + should_mark_repeated = 0; + } + + if (should_mark_repeated) { + LOG(3 - LOG_REPETITION_CASES, "%s: setting group %u to repeated\n", + __func__, n->u.group.id); + n->u.group.repeated = 1; + assert(outermost_repeat_parent->type == AST_EXPR_REPEAT || + outermost_repeat_parent->type == AST_EXPR_ALT); + LOG(3 - LOG_REPETITION_CASES, "%s: setting contains_empty_groups on outermost_repeat_parent %p\n", + __func__, (void *)outermost_repeat_parent); + if (outermost_repeat_parent->type == AST_EXPR_REPEAT) { + outermost_repeat_parent->u.repeat.contains_empty_groups = 1; + } else if (outermost_repeat_parent->type == AST_EXPR_ALT) { + outermost_repeat_parent->u.alt.contains_empty_groups = 1; + } else { + assert(!"type mismatch"); + } + } + } + + if (repeat_plus_ancestor != NULL && (is_nullable(n) || !can_consume_input(n))) { + assert(repeat_plus_ancestor->type == AST_EXPR_REPEAT + && repeat_plus_ancestor->u.repeat.min == 1 + && repeat_plus_ancestor->u.repeat.max == AST_COUNT_UNBOUNDED); + LOG(3 - LOG_REPETITION_CASES, + "%s: not yet implemented, skipping\n", __func__); + /* return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; */ + } + + res = analysis_iter_repetition(n->u.group.e, outermost_repeat_parent, + shadowed_by_previous_alt_case, repeat_plus_ancestor); + if (res != AST_ANALYSIS_OK) { return res; } + break; + + case AST_EXPR_SUBTRACT: + res = analysis_iter_repetition(n->u.subtract.a, outermost_repeat_parent, shadowed_by_previous_alt_case, + repeat_plus_ancestor); + if (res != AST_ANALYSIS_OK) { return res; } + res = analysis_iter_repetition(n->u.subtract.b, outermost_repeat_parent, shadowed_by_previous_alt_case, + repeat_plus_ancestor); + break; + + default: + assert(!"unreached"); + } + return res; +} + +enum anchoring_analysis_res { + ANCHOR_ANALYSIS_ANCHORED = 'a', + ANCHOR_ANALYSIS_UNANCHORED = 'u', + ANCHOR_ANALYSIS_UNDECIDED = '_', }; -static enum unanchored_analysis_res +static 
enum anchoring_analysis_res analysis_iter_unanchored_start(const struct ast_expr *n) { if (n->flags & AST_FLAG_ANCHORED_START) { LOG(4 - LOG_UNANCHORED_FLAGS, "%s: n (%p)->flags & AST_FLAG_ANCHORED_START -> N\n", __func__, (void *)n); - return UA_NO; + return ANCHOR_ANALYSIS_ANCHORED; } LOG(4 - LOG_UNANCHORED_FLAGS, "%s: node %p, type %s\n", @@ -278,7 +541,7 @@ analysis_iter_unanchored_start(const struct ast_expr *n) case AST_EXPR_EMPTY: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> U\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; case AST_EXPR_TOMBSTONE: break; @@ -287,12 +550,12 @@ analysis_iter_unanchored_start(const struct ast_expr *n) switch (n->u.anchor.type) { case AST_ANCHOR_START: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ^ anchor -> N\n", __func__); - return UA_NO; + return ANCHOR_ANALYSIS_ANCHORED; case AST_ANCHOR_END: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: $ anchor -> U\n", __func__); /* will be handled by other cases */ - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; default: assert(!"unreached"); @@ -308,7 +571,7 @@ analysis_iter_unanchored_start(const struct ast_expr *n) case AST_EXPR_RANGE: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> Y\n", __func__, ast_node_type_name(n->type)); - return UA_YES; + return ANCHOR_ANALYSIS_UNANCHORED; case AST_EXPR_CONCAT: { size_t i; @@ -316,8 +579,8 @@ analysis_iter_unanchored_start(const struct ast_expr *n) /* has unanchored start if first non-nullable child does */ for (i = 0; i < n->u.concat.count; i++) { const struct ast_expr *child = n->u.concat.n[i]; - const enum unanchored_analysis_res child_res = analysis_iter_unanchored_start(child); - if (child_res != UA_UNDECIDED) { + const enum anchoring_analysis_res child_res = analysis_iter_unanchored_start(child); + if (child_res != ANCHOR_ANALYSIS_UNDECIDED) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> %c (child res)\n", __func__, ast_node_type_name(n->type), child_res); return child_res; @@ -335,19 +598,19 @@ analysis_iter_unanchored_start(const struct ast_expr *n) size_t i; /* if all children agree, return that result, otherwise undecided */ - const enum unanchored_analysis_res first_child_res = analysis_iter_unanchored_start(n->u.alt.n[0]); + const enum anchoring_analysis_res first_child_res = analysis_iter_unanchored_start(n->u.alt.n[0]); LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ALT child 0 -- %s -> %c (child res)\n", __func__, ast_node_type_name(n->type), first_child_res); for (i = 1; i < n->u.alt.count; i++) { const struct ast_expr *child = n->u.alt.n[i]; - const enum unanchored_analysis_res child_res = analysis_iter_unanchored_start(child); + const enum anchoring_analysis_res child_res = analysis_iter_unanchored_start(child); LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ALT child %zd -- %s -> %c (child res)\n", __func__, i, ast_node_type_name(n->type), child_res); if (child_res != first_child_res) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> %c (child result) contracts first, returning U\n", __func__, ast_node_type_name(n->type), child_res); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } } @@ -360,7 +623,7 @@ analysis_iter_unanchored_start(const struct ast_expr *n) if (n->u.repeat.min == 0) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> U (repeat.min == 0)\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } return analysis_iter_unanchored_start(n->u.repeat.e); @@ -374,16 +637,16 @@ analysis_iter_unanchored_start(const struct ast_expr *n) assert(!"unreached"); } - return UA_UNDECIDED; + return 
ANCHOR_ANALYSIS_UNDECIDED; } -static enum unanchored_analysis_res +static enum anchoring_analysis_res analysis_iter_unanchored_end(const struct ast_expr *n) { if (n->flags & AST_FLAG_ANCHORED_END) { LOG(4 - LOG_UNANCHORED_FLAGS, "%s: node (%p)->flags & AST_FLAG_ANCHORED_END -> N\n", __func__, (void *)n); - return UA_NO; + return ANCHOR_ANALYSIS_ANCHORED; } LOG(4 - LOG_UNANCHORED_FLAGS, "%s: node %p, type %s\n", @@ -393,7 +656,7 @@ analysis_iter_unanchored_end(const struct ast_expr *n) case AST_EXPR_EMPTY: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> U\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; case AST_EXPR_TOMBSTONE: break; @@ -403,12 +666,12 @@ analysis_iter_unanchored_end(const struct ast_expr *n) case AST_ANCHOR_START: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ^ %s -> U\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; case AST_ANCHOR_END: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: $ %s -> N\n", __func__, ast_node_type_name(n->type)); - return UA_NO; + return ANCHOR_ANALYSIS_ANCHORED; default: assert(!"unreached"); @@ -424,7 +687,7 @@ analysis_iter_unanchored_end(const struct ast_expr *n) case AST_EXPR_RANGE: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> Y\n", __func__, ast_node_type_name(n->type)); - return UA_YES; + return ANCHOR_ANALYSIS_UNANCHORED; case AST_EXPR_CONCAT: { size_t i; @@ -432,8 +695,8 @@ analysis_iter_unanchored_end(const struct ast_expr *n) /* has unanchored end if last non-nullable child does */ for (i = n->u.concat.count; i > 0; i--) { const struct ast_expr *child = n->u.concat.n[i - 1]; - const enum unanchored_analysis_res child_res = analysis_iter_unanchored_end(child); - if (child_res != UA_UNDECIDED) { + const enum anchoring_analysis_res child_res = analysis_iter_unanchored_end(child); + if (child_res != ANCHOR_ANALYSIS_UNDECIDED) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> %c (child res)\n", __func__, ast_node_type_name(n->type), child_res); return child_res; @@ -451,19 +714,19 @@ analysis_iter_unanchored_end(const struct ast_expr *n) size_t i; /* if all children agree, return that result, otherwise undecided */ - const enum unanchored_analysis_res first_child_res = analysis_iter_unanchored_end(n->u.alt.n[0]); + const enum anchoring_analysis_res first_child_res = analysis_iter_unanchored_end(n->u.alt.n[0]); LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ALT child 0 -- %s -> %c (child res)\n", __func__, ast_node_type_name(n->type), first_child_res); for (i = 1; i < n->u.alt.count; i++) { const struct ast_expr *child = n->u.alt.n[i]; - const enum unanchored_analysis_res child_res = analysis_iter_unanchored_end(child); + const enum anchoring_analysis_res child_res = analysis_iter_unanchored_end(child); LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ALT child %zd -- %s -> %c (child res)\n", __func__, i, ast_node_type_name(n->type), child_res); if (child_res != first_child_res) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> %c (child result) contracts first, returning U\n", __func__, ast_node_type_name(n->type), child_res); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } } @@ -476,7 +739,7 @@ analysis_iter_unanchored_end(const struct ast_expr *n) if (n->u.repeat.min == 0) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> U (repeat.min == 0)\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } return analysis_iter_unanchored_end(n->u.repeat.e); @@ -490,7 +753,7 @@ analysis_iter_unanchored_end(const struct ast_expr *n) assert(!"unreached"); } - return 
UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } static void @@ -548,9 +811,68 @@ struct anchoring_env { /* Corresponding flag for end anchors while sweeping backward. */ int followed_by_consuming; + /* Special case for detecting '$[^a]', which matches "\n" with + * a capture group 0 of (0,1) in PCRE. */ + int followed_by_consuming_newline; + + /* Flag for tracking whether we're in a part of the subtree that + * is always before a start anchor. This influences satisfiability + * and edge cases like '()*^'. */ int before_start_anchor; + + /* Flag used to detect and reject the awkward case in '$[^a]', + * where (according to PCRE) the character class after the '$' + * should match the literal newline, but nothing else, and only + * once. Because $ is actually a zero-width assertion that + * execution is either at the end of input or a trailing + * newline, it has the rather surprising result that '$[^a]' + * will not match "x" but *will* match "x\n" (because it has a $ + * before a trailing newline, and because the newline matches + * the non-skippable [^a]). We just return an unsupported + * error for this case. */ + enum newline_after_end_anchor_state { + NAEAS_NONE, + NAEAS_WOULD_MATCH_ONCE, + } newline_after_end_anchor_state; + + int after_end_anchor; }; +/* Does the subtree match a literal '\n'? */ +static int +matches_newline(const struct ast_expr *n) +{ + switch (n->type) { + case AST_EXPR_LITERAL: + return n->u.literal.c == '\n'; + + case AST_EXPR_SUBTRACT: + return matches_newline(n->u.subtract.a) + && !matches_newline(n->u.subtract.b); + + case AST_EXPR_RANGE: + { + const struct ast_endpoint *f = &n->u.range.from; + const struct ast_endpoint *t = &n->u.range.to; + if (f->type != AST_ENDPOINT_LITERAL + || t->type != AST_ENDPOINT_LITERAL) { + /* not implemented */ + LOG(1 - LOG_ANCHORING, "%s: not implemented\n", __func__); + return 0; + } + + const int res = f->u.literal.c <= '\n' && t->u.literal.c >= '\n'; + LOG(1 - LOG_ANCHORING, "%s: RANGE res %d\n", __func__, res); + return res; + } + + default: + break; + } + + return 0; +} + /* Tree walker that analyzes the AST, marks which nodes and subtrees are * anchored at the start and/or end, and determines which subtrees are * unsatisfiable due to start anchoring. @@ -585,7 +907,7 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) case AST_ANCHOR_START: /* * If it's not possible to get here without consuming - * any input and there's a start anchor, the regex is + * any input and there's a start anchor, the subtree is * inherently unsatisfiable. */ set_flags(n, AST_FLAG_ANCHORED_START); @@ -615,6 +937,9 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) set_flags(n, AST_FLAG_ANCHORED_END); if (n->u.anchor.is_end_nl && !(env->re_flags & RE_ANCHORED)) { set_flags(n, AST_FLAG_END_NL); + if (env->newline_after_end_anchor_state == NAEAS_NONE) { + env->newline_after_end_anchor_state = NAEAS_WOULD_MATCH_ONCE; + } } break; @@ -627,6 +952,15 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) * These are the types that actually consume input. 
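 *
 * (Context for the newline checks added below, here for LITERAL and
 * later for SUBTRACT: per PCRE, '$[^a]' does not match "x" but does
 * match "x\n" -- '$' asserts before the trailing newline, and '[^a]'
 * then consumes that same newline -- and rather than compile that
 * behavior incorrectly, such patterns are rejected as unsupported.)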
*/ case AST_EXPR_LITERAL: + if (n->u.literal.c == '\n' && + (env->newline_after_end_anchor_state == NAEAS_WOULD_MATCH_ONCE)) { + LOG(3 - LOG_ANCHORING, + "%s: LITERAL: rejecting non-optional newline match after $ as unsupported\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } + break; case AST_EXPR_CODEPOINT: case AST_EXPR_RANGE: break; /* handled outside switch/case */ @@ -658,6 +992,13 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) if (res != AST_ANALYSIS_OK && res != AST_ANALYSIS_UNSATISFIABLE) { /* unsat is handled below */ + if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + + /* FIXME: check this */ + assert(child->flags & AST_FLAG_UNSATISFIABLE); + + set_flags(n, AST_FLAG_UNSATISFIABLE); + } return res; } @@ -688,6 +1029,7 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) env->past_always_consuming = 1; } + env->newline_after_end_anchor_state = child_env.newline_after_end_anchor_state; } /* flow ANCHORED_START and ANCHORED_END flags upward */ @@ -701,14 +1043,6 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) "%s: setting ANCHORED_START due to child %zu (%p)'s analysis\n", __func__, i, (void *)child); set_flags(n, AST_FLAG_ANCHORED_START); - - if (after_always_consumes) { - LOG(3 - LOG_ANCHORING, - "%s: setting %p and child %p UNSATISFIABLE due to ^-anchored child that always consumes input\n", - __func__, (void *)n, (void *)child); - set_flags(n, AST_FLAG_UNSATISFIABLE); - set_flags(child, AST_FLAG_UNSATISFIABLE); - } } if (always_consumes_input(child)) { @@ -771,16 +1105,21 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) for (i = 0; i < n->u.concat.count; i++) { struct ast_expr *child = n->u.concat.n[i]; if (after_end_anchor) { - if (always_consumes_input(child)) { + if (child->type == AST_EXPR_REPEAT + && (child->flags & AST_FLAG_UNSATISFIABLE) + && child->u.repeat.min == 0) { LOG(3 - LOG_ANCHORING, - "%s: after_end_anchor & ALWAYS_CONSUMES on child %p -> UNSATISFIABLE\n", - __func__, (void *)child); - set_flags(child, AST_FLAG_UNSATISFIABLE); + "%s: setting unsatisfiable {0,*} repeat after $ anchor to {0,0}\n", + __func__); + child->u.repeat.max = 0; } if (child->type == AST_EXPR_REPEAT - && (child->flags & AST_FLAG_UNSATISFIABLE) + && (child->flags & AST_FLAG_END_NL) && child->u.repeat.min == 0) { + LOG(3 - LOG_ANCHORING, + "%s: setting {0,*} repeat subtree matching \n after $ anchor to {0,0}\n", + __func__); child->u.repeat.max = 0; } } else if (!after_end_anchor @@ -831,6 +1170,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) } else if (res == AST_ANALYSIS_OK) { all_set_past_always_consuming &= child_env.past_always_consuming; any_sat = 1; + } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE + || res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + assert(child->flags & AST_FLAG_UNSATISFIABLE); + continue; } else { return res; } @@ -868,7 +1211,7 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) if (all_end_anchored) { LOG(3 - LOG_ANCHORING, "%s: ALT: all_end_anchored -> ANCHORED_END\n", __func__); - /* FIXME: AST_FLAG_END_NL: need to determine how this interacts + /* TODO: AST_FLAG_END_NL: need to determine how this interacts * with alt nodes. 
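 * (Aside on the repeat clamping earlier in this hunk: turning an
 * unsatisfiable {0,max} repeat after '$' into {0,0} keeps the rest
 * of the concat alive, so a pattern like 'x$a*' can presumably still
 * match "x" by taking 'a*' zero times.)
 * One alt case to work through: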
`^(?:(a)\z|(a)$)` */ set_flags(n, AST_FLAG_ANCHORED_END); } @@ -910,15 +1253,26 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) return res; } - if (n->u.repeat.e->flags & AST_FLAG_ANCHORED_END && n->u.repeat.min > 0) { - /* FIXME: if repeating something that is always - * anchored at the end, repeat.max could be - * capped at 1, but I have not yet found any - * inputs where that change is necessary to - * produce a correct result. */ - LOG(3 - LOG_ANCHORING, - "%s: REPEAT: repeating ANCHORED_END subtree >0 times -> ANCHORED_END\n", __func__); - set_flags(n, n->u.repeat.e->flags & END_ANCHOR_FLAG_MASK); + /* If the child subtree is anchored at the start and/or end, then this + * node can be repeated at most once. */ + const int child_is_anchored = n->u.repeat.e->flags & (AST_FLAG_ANCHORED_START | AST_FLAG_ANCHORED_END); + if (child_is_anchored) { + if (n->u.repeat.min >= 1) { + LOG(3 - LOG_ANCHORING, + "%s: REPEAT: copying anchor flags from child subtree since we cannot repeat it 0 times\n", __func__); + set_flags(n, n->u.repeat.e->flags & END_ANCHOR_FLAG_MASK); + } + + if (n->u.repeat.min > 1) { + LOG(3 - LOG_ANCHORING, + "%s: REPEAT: anchored, so clamping min repeat to at most once\n", __func__); + n->u.repeat.min = 1; + } + if (n->u.repeat.max > 1) { + LOG(3 - LOG_ANCHORING, + "%s: REPEAT: anchored, so clamping max repeat to at most once\n", __func__); + n->u.repeat.max = 1; + } } break; @@ -928,27 +1282,22 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) /* This flows anchoring flags upward even when the node * is unsatisfiable, because that info can impact how * the node's unsatisfiability is handled. */ -#define PROPAGATE_CHILD_FLAGS(TAG, N, CHILD) \ - do { \ - struct ast_expr *child = CHILD; \ - if (child->flags & AST_FLAG_ANCHORED_START) { \ - set_flags(N, AST_FLAG_ANCHORED_START); \ - } \ - if (child->flags & AST_FLAG_ANCHORED_END) { \ - set_flags(N, AST_FLAG_ANCHORED_END); \ - } \ - if (res == AST_ANALYSIS_UNSATISFIABLE) { \ - LOG(3 - LOG_ANCHORING, \ - "%s: %s: setting UNSATISFIABLE due to unsatisfiable child\n", \ - __func__, TAG); \ - set_flags(N, AST_FLAG_UNSATISFIABLE); \ - } \ - if (res != AST_ANALYSIS_OK) { \ - return res; \ - } \ - } while(0) - - PROPAGATE_CHILD_FLAGS("GROUP", n, n->u.group.e); + struct ast_expr *child = n->u.group.e; + if (child->flags & AST_FLAG_ANCHORED_START) { + set_flags(n, AST_FLAG_ANCHORED_START); + } + if (child->flags & AST_FLAG_ANCHORED_END) { + set_flags(n, AST_FLAG_ANCHORED_END); + } + if (res == AST_ANALYSIS_UNSATISFIABLE) { + LOG(3 - LOG_ANCHORING, + "%s: GROUP: setting UNSATISFIABLE due to unsatisfiable child\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + } + if (res != AST_ANALYSIS_OK) { + return res; + } break; case AST_EXPR_SUBTRACT: @@ -959,6 +1308,15 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) if (n->u.subtract.a->flags & AST_FLAG_ANCHORED_END) { set_flags(n, n->u.subtract.a->flags & END_ANCHOR_FLAG_MASK); } + + if (env->newline_after_end_anchor_state == NAEAS_WOULD_MATCH_ONCE) { + LOG(3 - LOG_ANCHORING, + "%s: SUBTRACT: rejecting non-optional newline match after $ as unsupported\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } + if (res != AST_ANALYSIS_OK) { if (res == AST_ANALYSIS_UNSATISFIABLE) { set_flags(n, AST_FLAG_UNSATISFIABLE); @@ -1002,9 +1360,9 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) enum ast_analysis_res res;
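	/* Illustrative example of the reverse sweep below: in 'a$b', the
	 * backward walk visits 'b' (which always consumes input) before the
	 * '$', so followed_by_consuming is already set when the END anchor
	 * is reached and its subtree is marked unsatisfiable. */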
/* - * Second pass, in reverse, specifically checking for end-anchored - * subtrees that are unsatisfiable because they are followed by - * nodes that always consume input. + * Second anchoring pass, in reverse, specifically checking for + * end-anchored subtrees that are unsatisfiable because they are + * followed by nodes that always consume input. * * Also check for subtrees that always consume input appearing * before a start anchor and mark them as unsatisfiable. @@ -1016,6 +1374,14 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) switch (n->type) { case AST_EXPR_EMPTY: + if (env->before_start_anchor) { + /* Needed for cases like '()*^' matching "". */ + LOG(3 - LOG_ANCHORING, "%s: skipping EMPTY before ^\n", __func__); + + set_flags(n, AST_FLAG_ANCHORED_START); + } + break; + case AST_EXPR_TOMBSTONE: break; @@ -1029,10 +1395,19 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) break; case AST_ANCHOR_END: - /* should already be set during forward pass */ - assert(n->flags & AST_FLAG_ANCHORED_END); + /* Normally this will have been set during the forward pass, + * but if it's inside an unsatisfiable CONCAT node whose earlier + * children caused it to be rejected, forward analysis will not + * have reached it. */ + set_flags(n, AST_FLAG_ANCHORED_END); - if (env->followed_by_consuming) { + if (env->followed_by_consuming_newline) { + LOG(3 - LOG_ANCHORING, + "%s: END: rejecting possible newline match after $ as unsupported\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } else if (env->followed_by_consuming) { LOG(3 - LOG_ANCHORING, "%s: END anchor & followed_by_consuming, setting UNSATISFIABLE\n", __func__); @@ -1098,6 +1473,7 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) set_flags(n, AST_FLAG_UNSATISFIABLE); } } else if (res != AST_ANALYSIS_OK) { + set_flags(n, AST_FLAG_UNSATISFIABLE); return res; } @@ -1120,6 +1496,13 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) __func__, (void *)child); env->before_start_anchor = 1; } + + if (!env->followed_by_consuming_newline && child_env.followed_by_consuming_newline) { + LOG(3 - LOG_ANCHORING, + "%s: setting followed_by_consuming_newline due to child %p's analysis\n", + __func__, (void *)child); + env->followed_by_consuming_newline = 1; + } } break; @@ -1128,6 +1511,8 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) case AST_EXPR_ALT: { int any_sat = 0; int all_set_followed_by_consuming = 1; + int all_set_followed_by_consuming_newline = 1; + int any_set_followed_by_consuming_newline = 0; int all_set_before_start_anchor = 1; assert(n->u.alt.count > 0); @@ -1153,7 +1538,15 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) } else if (res == AST_ANALYSIS_OK) { all_set_followed_by_consuming &= child_env.followed_by_consuming; all_set_before_start_anchor &= child_env.before_start_anchor; + + any_set_followed_by_consuming_newline |= child_env.followed_by_consuming_newline; + all_set_followed_by_consuming_newline &= child_env.followed_by_consuming_newline; + any_sat = 1; + } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE + || res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + assert(child->flags & AST_FLAG_UNSATISFIABLE); + continue; } else { return res; } @@ -1166,6 +1559,13 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) env->followed_by_consuming = 1; } + if
(!env->followed_by_consuming_newline && any_set_followed_by_consuming_newline) { + LOG(3 - LOG_ANCHORING, + "%s: ALT: any_set_followed_by_consuming_newline -> setting env->followed_by_consuming_newline for unsupported PCRE feature rejection\n", + __func__); + env->followed_by_consuming_newline = 1; + } + if (!env->before_start_anchor && all_set_before_start_anchor) { LOG(3 - LOG_ANCHORING, "%s: ALT: all_set_before_start_anchor -> setting env->before_start_anchor\n", @@ -1190,6 +1590,8 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) if (n->u.repeat.min == 0) { LOG(3 - LOG_ANCHORING, "%s: REPEAT: UNSATISFIABLE but can be repeated 0 times, ignoring\n", __func__); + /* Set the REPEAT node to repeat 0 times (to + * prune it) rather than marking it as unsatisfiable. */ n->u.repeat.max = 0; /* skip */ break; } else if (n->u.repeat.min > 0) { @@ -1254,6 +1656,10 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) return AST_ANALYSIS_UNSATISFIABLE; } + if (n->flags & AST_FLAG_CAN_CONSUME && matches_newline(n)) { + env->followed_by_consuming_newline = 1; + } + return AST_ANALYSIS_OK; } @@ -1318,7 +1724,7 @@ assign_firsts(struct ast_expr *n) * subexpression is compiled, the links to the global self-loop * are created, which the REPEAT node then copies. * - * It probably makes sense to not go further + * FIXME: needs tests. It probably makes sense to not go further * here because the top layer of the repeated section * should only link to the global start once. */ @@ -1401,7 +1807,7 @@ assign_lasts(struct ast_expr *n) * subexpression is compiled, the links to the global self-loop * are created, which the REPEAT node then copies. * - * It probably makes sense to not go further + * FIXME: needs tests. It probably makes sense to not go further * here because the top layer of the repeated section * should only link to the global start once. */ @@ -1422,6 +1828,106 @@ assign_lasts(struct ast_expr *n) } } +static enum ast_analysis_res +analysis_iter_captures(struct capture_env *env, struct ast_expr *n) +{ + enum ast_analysis_res res; + + switch (n->type) { + case AST_EXPR_EMPTY: + case AST_EXPR_TOMBSTONE: + case AST_EXPR_ANCHOR: + break; + + case AST_EXPR_LITERAL: + case AST_EXPR_CODEPOINT: + case AST_EXPR_RANGE: + break; + + case AST_EXPR_CONCAT: { + size_t i; + + for (i = 0; i < n->u.concat.count; i++) { + res = analysis_iter_captures(env, n->u.concat.n[i]); + if (res != AST_ANALYSIS_OK) { + return res; + } + } + + break; + } + + case AST_EXPR_ALT: { + size_t i; + + for (i = 0; i < n->u.alt.count; i++) { + res = analysis_iter_captures(env, n->u.alt.n[i]); + if (res != AST_ANALYSIS_OK) { + return res; + } + } + + break; + } + + case AST_EXPR_REPEAT: { + res = analysis_iter_captures(env, n->u.repeat.e); + if (res != AST_ANALYSIS_OK) { + return res; + } + +/* Set this to 1 when running the fuzzer, so that it ignores + * uninteresting failures from regexes like '.{1000000}' that use + * repetition to hit memory limits. + * + * This should be set by the build system when building for fuzzing.
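+ * (For example, with a hypothetical fuzzer build — not part of this
+ * patch's build scripts: make CFLAGS='-DFUZZING_LIMITS=1'.)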
*/ +#ifndef FUZZING_LIMITS +#define FUZZING_LIMITS 0 +#endif + +#if FUZZING_LIMITS + if ((n->u.repeat.max != AST_COUNT_UNBOUNDED && n->u.repeat.max >= 10) + || (n->u.repeat.min >= 10)) { + fprintf(stderr, "%s: rejecting regex with {count} >= 10 (%u)\n", + __func__, n->u.repeat.max); + return AST_ANALYSIS_ERROR_MEMORY; + } +#endif + + break; + } + + case AST_EXPR_GROUP: + if (env->max_capture_id == AST_NO_MAX_CAPTURE_ID + || n->u.group.id > env->max_capture_id) { + env->max_capture_id = n->u.group.id; + } + + res = analysis_iter_captures(env, n->u.group.e); + if (res != AST_ANALYSIS_OK) { + return res; + } + break; + + case AST_EXPR_SUBTRACT: + res = analysis_iter_captures(env, n->u.subtract.a); + if (res != AST_ANALYSIS_OK) { + return res; + } + + res = analysis_iter_captures(env, n->u.subtract.b); + if (res != AST_ANALYSIS_OK) { + return res; + } + break; + + default: + assert(!"unreached"); + } + + return AST_ANALYSIS_OK; +} + enum ast_analysis_res ast_analysis(struct ast *ast, enum re_flags flags) { @@ -1434,14 +1940,26 @@ ast_analysis(struct ast *ast, enum re_flags flags) assert(ast->expr != NULL); /* - * First pass -- track nullability, clean up some artifacts from - * parsing. + * First pass -- track nullability, which subtrees can/always consume + * input, and clean up some artifacts from parsing. */ res = analysis_iter(ast->expr); if (res != AST_ANALYSIS_OK) { return res; } + /* Next pass, check for capture IDs. */ + { + struct capture_env env = { .max_capture_id = AST_NO_MAX_CAPTURE_ID }; + env.use_captures = !(flags & RE_NOCAPTURE); + + res = analysis_iter_captures(&env, ast->expr); + if (res != AST_ANALYSIS_OK) { + return res; + } + ast->max_capture_id = env.max_capture_id; + } + /* * Next pass: set anchoring, now that nullability info from * the first pass is in place and some other things have been @@ -1449,8 +1967,10 @@ ast_analysis(struct ast *ast, enum re_flags flags) * start anchors. */ { - /* first anchoring analysis pass, sweeping forward */ - struct anchoring_env env = { .re_flags = flags }; + struct anchoring_env env = { + .re_flags = flags, + .newline_after_end_anchor_state = NAEAS_NONE, + }; res = analysis_iter_anchoring(&env, ast->expr); if (res != AST_ANALYSIS_OK) { return res; } @@ -1458,20 +1978,30 @@ ast_analysis(struct ast *ast, enum re_flags flags) res = analysis_iter_reverse_anchoring(&env, ast->expr); if (res != AST_ANALYSIS_OK) { return res; } - /* - * Next passes, mark all nodes in a first and/or last - * position. This is informed by the anchoring flags, so - * that needs to happen first. - */ - assign_firsts(ast->expr); - assign_lasts(ast->expr); - - ast->has_unanchored_start = (analysis_iter_unanchored_start(ast->expr) != UA_NO); - ast->has_unanchored_end = (analysis_iter_unanchored_end(ast->expr) != UA_NO); - LOG(2 - LOG_UNANCHORED_FLAGS, - "%s: has_unanchored_start %d, has_unanchored_end %d\n", - __func__, ast->has_unanchored_start, ast->has_unanchored_end); } + /* Next pass, mark some cases that need special handling + * due to repetition. For example, with cases like + * ^((x?))*$ the inner capture will always need to repeat + * one more time to match () after any 'x's. + * + * This needs to happen after the anchoring passes. */ + res = analysis_iter_repetition(ast->expr, NULL, 0, NULL); + if (res != AST_ANALYSIS_OK) { return res; } + + /* + * Next passes, mark all nodes in a first and/or last + * position. This is informed by the anchoring flags, so + * that needs to happen first. 
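+	 * (E.g. in 'ab|cd', 'a' and 'c' are in first position and 'b' and
+	 * 'd' in last position; unanchored nodes in those positions are
+	 * the ones linked to the global self-loops during compilation.)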
+ */ + assign_firsts(ast->expr); + assign_lasts(ast->expr); + + ast->has_unanchored_start = (analysis_iter_unanchored_start(ast->expr) != ANCHOR_ANALYSIS_ANCHORED); + ast->has_unanchored_end = (analysis_iter_unanchored_end(ast->expr) != ANCHOR_ANALYSIS_ANCHORED); + LOG(2 - LOG_UNANCHORED_FLAGS, + "%s: has_unanchored_start %d, has_unanchored_end %d\n", + __func__, ast->has_unanchored_start, ast->has_unanchored_end); + return res; } diff --git a/src/libre/ast_analysis.h b/src/libre/ast_analysis.h index f9ec8ebbb..5390cce57 100644 --- a/src/libre/ast_analysis.h +++ b/src/libre/ast_analysis.h @@ -30,7 +30,9 @@ enum ast_analysis_res { AST_ANALYSIS_UNSATISFIABLE, AST_ANALYSIS_ERROR_NULL = -1, - AST_ANALYSIS_ERROR_MEMORY = -2 + AST_ANALYSIS_ERROR_MEMORY = -2, + AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE = -3, + AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE = -4 }; enum ast_analysis_res diff --git a/src/libre/ast_compile.c b/src/libre/ast_compile.c index e0f321037..e1188c4dc 100644 --- a/src/libre/ast_compile.c +++ b/src/libre/ast_compile.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,18 +16,25 @@ #include #include +#include #include #include #include +#include +#include + #include "class.h" #include "ast.h" #include "ast_compile.h" +#include "re_capvm_compile.h" +#include "libfsm/capture.h" #include "libfsm/internal.h" /* XXX */ #define LOG_LINKAGE 0 +#define LOG_TRAMPOLINE 0 #if LOG_LINKAGE #include "print.h" @@ -44,12 +52,12 @@ enum link_side { * Use the passed in start/end states (x and y) * * - LINK_GLOBAL - * Link to the global start/end state (env->start or env->end), + * Link to the global start/end state (env->start_inner or env->end_inner), * because this node has a ^ or $ anchor * * - LINK_GLOBAL_SELF_LOOP * Link to the unanchored self loop adjacent to the start/end - * state (env->start_any_loop or env->end_any_loop), because + * states (env->start_any_inner or env->end_any_inner), because * this node is in a FIRST or LAST position, but unanchored. */ enum link_types { @@ -58,7 +66,12 @@ enum link_types { LINK_GLOBAL_SELF_LOOP, }; +/* Call stack for AST -> NFA conversion. */ +#define DEF_COMP_STACK_CEIL 4 +#define NO_MAX_CAPTURE_IDS ((unsigned)-1) + struct comp_env { + const struct fsm_alloc *alloc; struct fsm *fsm; enum re_flags re_flags; struct re_err *err; @@ -72,21 +85,90 @@ struct comp_env { * Also, some states in a first/last context need to link * directly to the overall start/end states, either in * place of or along with the adjacent states. + * + * The inner start and end states are considered inside of + * match group 0, outer are not. */ - fsm_state_t start; - fsm_state_t end; + fsm_state_t start_outer; + fsm_state_t start_inner; fsm_state_t start_any_loop; + fsm_state_t start_any_inner; + int have_start_any_loop; + + /* States leading to the end, with and without an unanchored + * `.*` loop that consumes any trailing characters. 
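+	 *
+	 * Rough sketch of the intended layout (see ast_compile and the
+	 * intern_* helpers below):
+	 *
+	 *   start_outer -> start_inner -> (body) -> end_inner -> end_outer
+	 *
+	 * with the unanchored `.*` loops hanging off start_outer and
+	 * leading into end_outer, and end_nl_inner providing PCRE's
+	 * optional trailing `\n`.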
*/ + fsm_state_t end_inner; + fsm_state_t end_outer; + fsm_state_t end_nl_inner; fsm_state_t end_any_loop; - fsm_state_t end_nl; - int has_start_any_loop; + fsm_state_t end_any_inner; int has_end_any_loop; - int has_end_nl; + int has_end_nl_inner; + + /* bitset for active capture IDs */ + uint64_t *active_capture_ids; + long max_capture_id; /* upper bound */ + + /* Evaluation stack */ + struct comp_stack { + size_t ceil; + size_t depth; /* 0 -> empty */ + + struct comp_stack_frame { + struct ast_expr *n; + fsm_state_t x; + fsm_state_t y; + unsigned step; + + union { + struct { + fsm_state_t link; + } concat; + struct { + unsigned count; + } alt; + struct { + struct fsm_subgraph subgraph; + fsm_state_t na; + fsm_state_t nz; + } repeat; + } u; + } *frames; + } stack; }; static int -comp_iter(struct comp_env *env, - fsm_state_t x, fsm_state_t y, - struct ast_expr *n, const struct ast_expr *parent); +comp_iter(struct comp_env *env, fsm_state_t x, const struct ast *ast); + +static int +eval_stack_frame(struct comp_env *env); + +static int +eval_EMPTY(struct comp_env *env); +static int +eval_CONCAT(struct comp_env *env); +static int +eval_ALT(struct comp_env *env); +static int +eval_LITERAL(struct comp_env *env); +static int +eval_CODEPOINT(struct comp_env *env); +static int +eval_REPEAT(struct comp_env *env); +static int +eval_GROUP(struct comp_env *env); +static int +eval_ANCHOR(struct comp_env *env); +static int +eval_SUBTRACT(struct comp_env *env); +static int +eval_RANGE(struct comp_env *env); +static int +eval_TOMBSTONE(struct comp_env *env); + +static int +compile_capvm_program_for_stack_end_states(struct comp_env *env, + const struct ast *ast, uint32_t *prog_id); static int utf8(uint32_t cp, char c[]) @@ -192,6 +274,9 @@ expr_compile(struct ast_expr *e, enum re_flags flags, struct ast ast; ast.expr = e; + ast.max_capture_id = 0; + ast.has_unanchored_start = 0; + ast.has_unanchored_end = 0; return ast_compile(&ast, flags, opt, err); } @@ -207,11 +292,11 @@ addedge_literal(struct comp_env *env, enum re_flags re_flags, assert(to < env->fsm->statecount); if (re_flags & RE_ICASE) { - if (!fsm_addedge_literal(fsm, from, to, tolower((unsigned char) c))) { + if (!fsm_addedge_literal(fsm, from, to, (char)tolower((unsigned char) c))) { return 0; } - - if (!fsm_addedge_literal(fsm, from, to, toupper((unsigned char) c))) { + + if (!fsm_addedge_literal(fsm, from, to, (char)toupper((unsigned char) c))) { return 0; } } else { @@ -219,38 +304,50 @@ addedge_literal(struct comp_env *env, enum re_flags re_flags, return 0; } } - + return 1; } static int intern_start_any_loop(struct comp_env *env) { - fsm_state_t loop; + fsm_state_t loop, inner; assert(env != NULL); - if (env->has_start_any_loop) { + if (env->have_start_any_loop) { return 1; } assert(~env->re_flags & RE_ANCHORED); - assert(env->start < env->fsm->statecount); + assert(env->start_outer < env->fsm->statecount); if (!fsm_addstate(env->fsm, &loop)) { return 0; } + if (!fsm_addstate(env->fsm, &inner)) { + return 0; + } + +#if LOG_LINKAGE + fprintf(stderr, "%s: start_any: loop %d, inner: %d\n", __func__, loop, inner); +#endif + if (!fsm_addedge_any(env->fsm, loop, loop)) { return 0; } - if (!fsm_addedge_epsilon(env->fsm, env->start, loop)) { + if (!fsm_addedge_epsilon(env->fsm, env->start_outer, loop)) { + return 0; + } + if (!fsm_addedge_epsilon(env->fsm, loop, inner)) { return 0; } env->start_any_loop = loop; - env->has_start_any_loop = 1; + env->start_any_inner = inner; + env->have_start_any_loop = 1; return 1; } @@ -258,7 +355,7 @@ 
intern_start_any_loop(struct comp_env *env) static int intern_end_any_loop(struct comp_env *env) { - fsm_state_t loop; + fsm_state_t loop, inner; assert(env != NULL); @@ -267,21 +364,32 @@ intern_end_any_loop(struct comp_env *env) } assert(~env->re_flags & RE_ANCHORED); - assert(env->end < env->fsm->statecount); + assert(env->end_outer < env->fsm->statecount); if (!fsm_addstate(env->fsm, &loop)) { return 0; } + if (!fsm_addstate(env->fsm, &inner)) { + return 0; + } + +#if LOG_LINKAGE + fprintf(stderr, "%s: end_any: %d, inner: %d\n", __func__, loop, inner); +#endif if (!fsm_addedge_any(env->fsm, loop, loop)) { return 0; } - if (!fsm_addedge_epsilon(env->fsm, loop, env->end)) { + if (!fsm_addedge_epsilon(env->fsm, inner, loop)) { + return 0; + } + if (!fsm_addedge_epsilon(env->fsm, loop, env->end_outer)) { return 0; } env->end_any_loop = loop; + env->end_any_inner = inner; env->has_end_any_loop = 1; return 1; @@ -290,37 +398,39 @@ intern_end_any_loop(struct comp_env *env) static int intern_end_nl(struct comp_env *env) { - /* PCRE's end anchor $ matches a single optional newline. + /* PCRE's end anchor $ matches a single optional newline, + * which should exist outside of match group 0. * - * Intern states for a `\n?` that links to the global end. */ + * Intern states for a `\n?` that links to */ assert(env != NULL); - if (env->has_end_nl) { + if (env->has_end_nl_inner) { return 1; } assert(~env->re_flags & RE_ANCHORED); assert(env->re_flags & RE_END_NL); - assert(env->end < env->fsm->statecount); + assert(~env->re_flags & RE_END_NL_DISABLE); + assert(env->end_outer < env->fsm->statecount); - fsm_state_t end_nl; - if (!fsm_addstate(env->fsm, &end_nl)) { + fsm_state_t inner; + if (!fsm_addstate(env->fsm, &inner)) { return 0; } #if LOG_LINKAGE - fprintf(stderr, "%s: end_nl: %d\n", __func__, end_nl); + fprintf(stderr, "%s: end_nl_inner: %d\n", __func__, inner); #endif - if (!fsm_addedge_epsilon(env->fsm, end_nl, env->end)) { + if (!fsm_addedge_epsilon(env->fsm, inner, env->end_outer)) { return 0; } - if (!fsm_addedge_literal(env->fsm, end_nl, env->end, (char)'\n')) { + if (!fsm_addedge_literal(env->fsm, inner, env->end_outer, (char)'\n')) { return 0; } - env->end_nl = end_nl; - env->has_end_nl = 1; + env->end_nl_inner = inner; + env->has_end_nl_inner = 1; return 1; } @@ -337,8 +447,8 @@ can_have_backward_epsilon_edge(const struct ast_expr *e) return 0; case AST_EXPR_SUBTRACT: - /* XXX: not sure */ - return 1; + /* Single character class */ + return 0; case AST_EXPR_REPEAT: /* 0 and 1 don't have backward epsilon edges */ @@ -396,21 +506,32 @@ can_skip_concat_state_and_epsilon(const struct ast_expr *l, return 0; } +static const struct ast_expr * +get_parent_node_from_stack(const struct comp_stack *stack) +{ + if (stack->depth < 2) { return NULL; } + return stack->frames[stack->depth - 2].n; +} + static enum link_types -decide_linking(struct comp_env *env, - struct ast_expr *n, const struct ast_expr *parent, enum link_side side) +decide_linking(struct comp_env *env, fsm_state_t x, fsm_state_t y, + struct ast_expr *n, enum link_side side) { assert(n != NULL); assert(env != NULL); + (void)x; + (void)y; + + struct comp_stack *stack = &env->stack; + /* If the regex is implicitly anchored and the dialect does * not support anchoring, linking is always top-down. */ if ((env->re_flags & RE_ANCHORED)) { return LINK_TOP_DOWN; } - /* parent can be NULL, if we're at the root node, but it must - * never be the same node. 
*/ + const struct ast_expr *parent = get_parent_node_from_stack(stack); assert(parent != n); /* Note: any asymmetry here should be due to special cases @@ -478,183 +599,71 @@ print_linkage(enum link_types t) #define EPSILON(FROM, TO) \ assert((FROM) != (TO)); \ if (!fsm_addedge_epsilon(env->fsm, (FROM), (TO))) { return 0; } - + #define ANY(FROM, TO) \ if (!fsm_addedge_any(env->fsm, (FROM), (TO))) { return 0; } #define LITERAL(FROM, TO, C) \ if (!addedge_literal(env, n->re_flags, (FROM), (TO), ((char)C))) { return 0; } -#define RECURSE(FROM, TO, NODE, PARENT) \ - if (!comp_iter(env, (FROM), (TO), (NODE), (PARENT))) { return 0; } - -static int -comp_iter_repeated(struct comp_env *env, - fsm_state_t x, fsm_state_t y, - struct ast_expr *n) -{ - fsm_state_t a, b; - fsm_state_t na, nz; - unsigned i; - - assert(n->type == AST_EXPR_REPEAT); - const unsigned min = n->u.repeat.min; - const unsigned max = n->u.repeat.max; - struct ast_expr *e = n->u.repeat.e; - - assert(min <= max); - - if (min == 0 && max == 0) { /* {0,0} */ - EPSILON(x, y); - } else if (min == 0 && max == 1) { /* '?' */ - RECURSE(x, y, e, n); - EPSILON(x, y); - } else if (min == 1 && max == 1) { /* {1,1} */ - RECURSE(x, y, e, n); - } else if (min == 0 && max == AST_COUNT_UNBOUNDED) { /* '*' */ - NEWSTATE(na); - NEWSTATE(nz); - EPSILON(x,na); - EPSILON(nz,y); - - EPSILON(na, nz); - RECURSE(na, nz, e, n); - EPSILON(nz, na); - } else if (min == 1 && max == AST_COUNT_UNBOUNDED) { /* '+' */ - NEWSTATE(na); - NEWSTATE(nz); - EPSILON(x,na); - EPSILON(nz,y); - - RECURSE(na, nz, e, n); - EPSILON(nz, na); - } else { - /* - * Make new beginning/end states for the repeated section, - * build its NFA, and link to its head. - */ - - struct fsm_subgraph subgraph; - fsm_state_t tail; - - fsm_subgraph_start(env->fsm, &subgraph); - - NEWSTATE(na); - NEWSTATE(nz); - RECURSE(na, nz, e, n); - EPSILON(x, na); /* link head to repeated NFA head */ - - b = nz; /* set the initial tail */ - - /* can be skipped */ - if (min == 0) { - EPSILON(na, nz); - } - fsm_subgraph_stop(env->fsm, &subgraph); - tail = nz; - - if (max != AST_COUNT_UNBOUNDED) { - for (i = 1; i < max; i++) { - /* copies the original subgraph; need to set b to the - * original tail - */ - b = tail; +#define RETURN(ENV) comp_stack_pop(ENV) - if (!fsm_subgraph_duplicate(env->fsm, &subgraph, &b, &a)) { - return 0; - } +#define RECURSE(ENV, FROM, TO, NODE) \ + if (!comp_stack_push(ENV, (FROM), (TO), (NODE))) { return 0; } - EPSILON(nz, a); - - /* To the optional part of the repeated count */ - if (i >= min) { - EPSILON(nz, b); - } - - na = a; /* advance head for next duplication */ - nz = b; /* advance tail for concenation */ - } - } else { - for (i = 1; i < min; i++) { - /* copies the original subgraph; need to set b to the - * original tail - */ - b = tail; - - if (!fsm_subgraph_duplicate(env->fsm, &subgraph, &b, &a)) { - return 0; - } - - EPSILON(nz, a); - - na = a; /* advance head for next duplication */ - nz = b; /* advance tail for concenation */ - } - - /* back link to allow for infinite repetition */ - EPSILON(nz,na); - } - - /* tail to last repeated NFA tail */ - EPSILON(nz, y); - } - - return 1; -} +#define TAILCALL(ENV, FROM, TO, NODE) \ + comp_stack_tailcall(ENV, (FROM), (TO), (NODE)); static int -comp_iter(struct comp_env *env, - fsm_state_t x, fsm_state_t y, - struct ast_expr *n, const struct ast_expr *parent) +set_linking(struct comp_env *env, struct ast_expr *n, + enum link_types link_start, enum link_types link_end, + fsm_state_t *px, fsm_state_t *py) { - enum link_types 
link_start, link_end; - - if (n == NULL) { - return 1; - } - - link_start = decide_linking(env, n, parent, LINK_START); - link_end = decide_linking(env, n, parent, LINK_END); + fsm_state_t x = *px; + fsm_state_t y = *py; #if LOG_LINKAGE - fprintf(stderr, "%s: decide_linking %p: start ", __func__, (void *) n); + fprintf(stderr, "%s: decide_linking %p [%s]: start ", + __func__, (void *) n, ast_node_type_name(n->type)); print_linkage(link_start); fprintf(stderr, ", end "); print_linkage(link_end); fprintf(stderr, ", x %d, y %d\n", x, y); #else (void) print_linkage; + (void)n; #endif switch (link_start) { case LINK_TOP_DOWN: break; case LINK_GLOBAL: - x = env->start; + x = env->start_inner; break; case LINK_GLOBAL_SELF_LOOP: if (!intern_start_any_loop(env)) { return 0; } - assert(env->has_start_any_loop); + assert(env->have_start_any_loop); - x = env->start_any_loop; + x = env->start_any_inner; break; default: - assert(!"unreachable"); + assert(!"match fail"); /* these should be mutually exclusive now */ } switch (link_end) { case LINK_TOP_DOWN: break; case LINK_GLOBAL: - if (env->re_flags & RE_END_NL && (n->flags & AST_FLAG_END_NL)) { + if (env->re_flags & RE_END_NL && !(env->re_flags & RE_END_NL_DISABLE) + && (n->flags & AST_FLAG_END_NL)) { if (!intern_end_nl(env)) { return 0; } - y = env->end_nl; + y = env->end_nl_inner; } else { - y = env->end; + y = env->end_inner; } break; case LINK_GLOBAL_SELF_LOOP: @@ -663,243 +672,784 @@ comp_iter(struct comp_env *env, } assert(env->has_end_any_loop); - y = env->end_any_loop; + y = env->end_any_inner; break; default: - assert(!"unreachable"); + assert(!"match fail"); /* these should be mutually exclusive now */ } #if LOG_LINKAGE fprintf(stderr, " ---> x: %d, y: %d\n", x, y); #endif + *px = x; + *py = y; + return 1; +} - switch (n->type) { - case AST_EXPR_EMPTY: - /* skip these, when possible */ - EPSILON(x, y); - break; +static void +comp_stack_pop(struct comp_env *env) +{ + assert(env->stack.depth > 0); + env->stack.depth--; +} - case AST_EXPR_CONCAT: - { - fsm_state_t curr_x; - size_t i; +static int +comp_stack_push(struct comp_env *env, fsm_state_t x, fsm_state_t y, struct ast_expr *n) +{ + struct comp_stack *stack = &env->stack; + assert(n != NULL); - const size_t count = n->u.concat.count; + if (stack->depth == stack->ceil) { + const size_t nceil = 2*stack->ceil; + struct comp_stack_frame *nframes = f_realloc(env->alloc, + stack->frames, nceil * sizeof(stack->frames[0])); +#if LOG_LINKAGE || LOG_TRAMPOLINE + fprintf(stderr, "comp_stack_push: reallocating comp_stack, %zu -> %zu frames\n", + stack->ceil, nceil); +#endif + if (nframes == NULL) { + return 0; + } + stack->ceil = nceil; + stack->frames = nframes; + } - curr_x = x; + assert(stack->depth < stack->ceil); - assert(count >= 1); + struct comp_stack_frame *sf = &stack->frames[stack->depth]; + memset(sf, 0x00, sizeof(*sf)); + sf->n = n; + sf->x = x; + sf->y = y; - if (!fsm_addstate_bulk(env->fsm, count - 1)) { - return 0; - } + stack->depth++; + return 1; +} - for (i = 0; i < count; i++) { - struct ast_expr *curr = n->u.concat.n[i]; +static void +comp_stack_tailcall(struct comp_env *env, + fsm_state_t x, fsm_state_t y, struct ast_expr *n) +{ + struct comp_stack *stack = &env->stack; - /* If a subtree is unsatisfiable but also nullable, ignore it. 
*/ - const enum ast_flags nullable_and_unsat = AST_FLAG_NULLABLE - | AST_FLAG_UNSATISFIABLE; - if ((curr->flags & nullable_and_unsat) == nullable_and_unsat) { - /* if necessary, link the end */ - if (i == count - 1) { - EPSILON(curr_x, y); - } - continue; - } + assert(stack->depth > 0); - struct ast_expr *next = i == count - 1 - ? NULL - : n->u.concat.n[i + 1]; + /* Replace current stack frame. */ + struct comp_stack_frame *sf = &stack->frames[stack->depth - 1]; + memset(sf, 0x00, sizeof(*sf)); + sf->n = n; + sf->x = x; + sf->y = y; +} - fsm_state_t z; - if (i + 1 < count) { - if (!fsm_addstate(env->fsm, &z)) { - return 0; - } -#if LOG_LINKAGE - fprintf(stderr, "%s: added state z %d\n", __func__, z); -#endif - } else { - z = y; /* connect to right parent to close off subtree */ - } +#define JUST_ONE_PROG 1 - /* - * If nullable, add an extra state & epsilion as a one-way gate - */ - if (!can_skip_concat_state_and_epsilon(curr, next)) { - fsm_state_t diode; +static int +comp_iter(struct comp_env *env, + fsm_state_t x, const struct ast *ast) +{ + int res = 1; + assert(ast != NULL); + assert(ast->expr != NULL); - NEWSTATE(diode); - EPSILON(curr_x, diode); - curr_x = diode; - } + struct comp_stack_frame *frames = NULL; + uint64_t *active_capture_ids = NULL; + const bool use_captures = (env->re_flags & RE_NOCAPTURE) == 0; - RECURSE(curr_x, z, curr, n); + frames = f_calloc(env->alloc, + DEF_COMP_STACK_CEIL, sizeof(env->stack.frames[0])); + if (frames == NULL) { + goto alloc_fail; + } - curr_x = z; + { + const size_t capture_id_words = (env->max_capture_id == AST_NO_MAX_CAPTURE_ID) + ? 0 + : ((env->max_capture_id)/64 + 1); + active_capture_ids = f_calloc(env->alloc, capture_id_words, + sizeof(active_capture_ids[0])); + if (active_capture_ids == NULL) { + goto alloc_fail; } + } - break; + /* Add inner and outer end states. Like start_outer and start_inner, + * these represent the boundary between match group 0 (inner) and + * states outside it (the unanchored end loop). */ + if (!fsm_addstate(env->fsm, &env->end_inner)) { + goto alloc_fail; + } + if (!fsm_addstate(env->fsm, &env->end_outer)) { + goto alloc_fail; + } + if (!fsm_addedge_epsilon(env->fsm, env->end_inner, env->end_outer)) { + goto alloc_fail; } - case AST_EXPR_ALT: - { - size_t i; + fsm_setend(env->fsm, env->end_outer, 1); - const size_t count = n->u.alt.count; +#if LOG_LINKAGE + fprintf(stderr, "end: outer %d, inner %d\n", + env->end_outer, env->end_inner); +#endif - assert(count >= 1); +#if LOG_TRAMPOLINE + fprintf(stderr, "%s: x %d, y %d\n", __func__, x, env->end_inner); +#endif - for (i = 0; i < count; i++) { - /* skip unsatisfiable ALT subtrees */ - if (n->u.alt.n[i]->flags & AST_FLAG_UNSATISFIABLE) { + env->stack.ceil = DEF_COMP_STACK_CEIL; + env->stack.depth = 1; + env->stack.frames = frames; + env->active_capture_ids = active_capture_ids; + + { /* set up the first stack frame */ + struct comp_stack_frame *sf = &env->stack.frames[0]; + sf->n = ast->expr; + sf->x = x; + sf->y = env->end_inner; + sf->step = 0; + } + +#if JUST_ONE_PROG + uint32_t prog_id; + if (use_captures) { + if (!compile_capvm_program_for_stack_end_states(env, ast, &prog_id)) { + goto alloc_fail; + } + } +#endif + + /* evaluate call stack until termination */ + while (res && env->stack.depth > 0) { + if (!eval_stack_frame(env)) { +#if LOG_TRAMPOLINE + fprintf(stderr, "%s: res -> 0\n", __func__); +#endif + res = 0; + break; + } + } + + if (use_captures && res && env->max_capture_id != AST_NO_MAX_CAPTURE_ID) { + /* Set the active captures on the end state. 
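+		 * (E.g. for '(a)(b)', the GROUP nodes' capture IDs were
+		 * recorded by eval_GROUP via set_active_capture_ids, so
+		 * each of them is flagged on end_outer here.)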
*/ + for (unsigned i = 0; i <= (unsigned)env->max_capture_id; i++) { + if (!u64bitset_get(env->active_capture_ids, i)) { continue; } + if (!fsm_capture_set_active_for_end(env->fsm, i, env->end_outer)) { + goto alloc_fail; + } + } - /* - * CONCAT handles adding extra states and - * epsilons when necessary, so there isn't much - * more to do here. - */ - RECURSE(x, y, n->u.alt.n[i], n); +#if !JUST_ONE_PROG + uint32_t prog_id; + if (!compile_capvm_program_for_stack_end_states(env, stack, ast, &prog_id)) { + goto alloc_fail; + } +#endif + +#if LOG_TRAMPOLINE + fprintf(stderr, "%s: associated prog_id %u with state %d\n", + __func__, prog_id, stack->end_outer); +#endif + if (!fsm_capture_associate_program_with_end_state(env->fsm, + prog_id, env->end_outer)) { + goto alloc_fail; } - break; } - case AST_EXPR_LITERAL: - LITERAL(x, y, n->u.literal.c); - break; + f_free(env->alloc, env->stack.frames); + f_free(env->alloc, env->active_capture_ids); - case AST_EXPR_CODEPOINT: { - fsm_state_t a, b; - char c[4]; - int r, i; + return res; - r = utf8(n->u.codepoint.u, c); - if (!r) { - if (env->err != NULL) { - env->err->e = RE_EBADCP; - env->err->cp = n->u.codepoint.u; - } +alloc_fail: + /* TODO: set env->err to indicate alloc failure */ + if (frames != NULL) { + f_free(env->alloc, frames); + } + if (active_capture_ids != NULL) { + f_free(env->alloc, active_capture_ids); + } + return 0; +} + +static struct comp_stack_frame * +get_comp_stack_top(struct comp_env *env) +{ + struct comp_stack *stack = &env->stack; + assert(stack->depth > 0); + struct comp_stack_frame *sf = &stack->frames[stack->depth - 1]; + assert(sf->n != NULL); + return sf; +} + +static int +eval_stack_frame(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + +#if LOG_TRAMPOLINE + fprintf(stderr, "%s: depth %zu/%zu, type %s, step %u\n", __func__, + stack->depth, stack->ceil, ast_node_type_name(sf->n->type), sf->step); +#endif + + /* If this is the first time the trampoline has called this + * state, decide the linking. Some of the states below (such as + * AST_EXPR_CONCAT) can have multiple child nodes, so they will + * increment step and use it to resume where they left off as + * the trampoline returns execution to them. 
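	 *
	 * The overall pattern is roughly:
	 *
	 *   while (stack nonempty) { evaluate the top frame by node type }
	 *
	 * where each eval_* either RETURN()s (pops its frame), RECURSE()s
	 * (pushes a child frame and yields back to the loop), or
	 * TAILCALL()s (replaces its own frame), using sf->step to resume
	 * partway through on later visits.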
*/ + enum link_types link_end; + if (sf->step == 0) { /* entering state */ + enum link_types link_start; + link_start = decide_linking(env, sf->x, sf->y, sf->n, LINK_START); + link_end = decide_linking(env, sf->x, sf->y, sf->n, LINK_END); + if (!set_linking(env, sf->n, link_start, link_end, &sf->x, &sf->y)) { return 0; } + } + +#if LOG_TRAMPOLINE > 1 + fprintf(stderr, "%s: x %d, y %d\n", __func__, sf->x, sf->y); +#endif + + switch (sf->n->type) { + case AST_EXPR_EMPTY: + return eval_EMPTY(env); + case AST_EXPR_CONCAT: + return eval_CONCAT(env); + case AST_EXPR_ALT: + return eval_ALT(env); + case AST_EXPR_LITERAL: + return eval_LITERAL(env); + case AST_EXPR_CODEPOINT: + return eval_CODEPOINT(env); + case AST_EXPR_REPEAT: + return eval_REPEAT(env); + case AST_EXPR_GROUP: + return eval_GROUP(env); + case AST_EXPR_ANCHOR: + return eval_ANCHOR(env); + case AST_EXPR_SUBTRACT: + return eval_SUBTRACT(env); + case AST_EXPR_RANGE: + return eval_RANGE(env); + case AST_EXPR_TOMBSTONE: + return eval_TOMBSTONE(env); + default: + assert(!"unreached"); + return 0; + } +} - a = x; +static int +eval_EMPTY(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); +#if LOG_LINKAGE + fprintf(stderr, "eval_EMPTY: step %u, x %d -> y %d\n", + sf->step, sf->x, sf->y); +#endif - for (i = 0; i < r; i++) { - if (i + 1 < r) { - NEWSTATE(b); - } else { - b = y; + EPSILON(sf->x, sf->y); + RETURN(env); + return 1; +} + +static int +eval_CONCAT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + struct ast_expr *n = sf->n; + const size_t count = n->u.concat.count; + assert(count >= 1); + +#if LOG_LINKAGE + fprintf(stderr, "comp_iter: eval_CONCAT: x %d, y %d, step %d\n", + sf->x, sf->y, sf->step); +#endif + + if (sf->step == 0) { + sf->u.concat.link = sf->x; + } + + while (sf->step < count) { + fsm_state_t curr_x = sf->u.concat.link; + struct ast_expr *curr = n->u.concat.n[sf->step]; + + /* If a subtree is unsatisfiable but also nullable, ignore it. */ + const enum ast_flags nullable_and_unsat = AST_FLAG_NULLABLE + | AST_FLAG_UNSATISFIABLE; + if ((curr->flags & nullable_and_unsat) == nullable_and_unsat) { + sf->step++; + + /* if necessary, link the end */ + if (sf->step == count) { + EPSILON(curr_x, sf->y); } + return 1; + } - LITERAL(a, b, c[i]); + struct ast_expr *next = sf->step == count - 1 + ? NULL + : n->u.concat.n[sf->step + 1]; - a = b; + fsm_state_t z; + if (sf->step + 1 < count) { + if (!fsm_addstate(env->fsm, &z)) { + return 0; + } +#if LOG_LINKAGE + fprintf(stderr, "%s: added state z %d\n", __func__, z); +#endif + } else { + z = sf->y; /* connect to right parent to close off subtree */ } - break; - } +#if LOG_LINKAGE + fprintf(stderr, "%s: curr_x %d, z %d\n", + __func__, curr_x, z); +#endif - case AST_EXPR_REPEAT: /* - * REPEAT breaks out into its own function, because - * there are several special cases + * If nullable, add an extra state & epsilon as a one-way gate */ - if (!comp_iter_repeated(env, x, y, n)) { - return 0; + if (!can_skip_concat_state_and_epsilon(curr, next)) { + fsm_state_t diode; + + NEWSTATE(diode); + EPSILON(curr_x, diode); + curr_x = diode; +#if LOG_LINKAGE + fprintf(stderr, "comp_iter: added diode %d\n", diode); +#endif } - break; - case AST_EXPR_GROUP: - RECURSE(x, y, n->u.group.e, n); - break; +#if LOG_LINKAGE + fprintf(stderr, "comp_iter: recurse CONCAT[%u/%zu]: link %d, z %d\n", + sf->step, count, sf->u.concat.link, z); +#endif + /* Set the right side link, which will become the + * left side link for the next step (if any). 
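+		 * (E.g. compiling 'abc': step 0 links x to z0 on 'a',
+		 * step 1 links z0 to z1 on 'b', and step 2 links z1 to y
+		 * on 'c'; u.concat.link carries z0 and z1 across the
+		 * intervening trampoline returns.)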
*/ + sf->u.concat.link = z; + sf->step++; + RECURSE(env, curr_x, z, curr); + return 1; + } - case AST_EXPR_TOMBSTONE: - /* do not link -- intentionally pruned */ - break; + RETURN(env); + return 1; +} - case AST_EXPR_ANCHOR: - EPSILON(x, y); - break; +static int +eval_ALT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + const size_t count = sf->n->u.alt.count; + assert(count >= 1); - case AST_EXPR_SUBTRACT: { - struct fsm *a, *b; - struct fsm *q; - enum re_flags re_flags; +#if LOG_LINKAGE + fprintf(stderr, "eval_ALT: step %u\n", sf->step); +#endif - re_flags = n->re_flags; + if (sf->step < count) { + struct ast_expr *n; - /* wouldn't want to reverse twice! */ - re_flags &= ~(unsigned)RE_REVERSE; + /* + * CONCAT handles adding extra states and + * epsilons when necessary, so there isn't much + * more to do here. + */ +#if LOG_LINKAGE + fprintf(stderr, "eval_ALT: recurse ALT[%u/%zu]: x %d, y %d\n", + sf->step, count, sf->x, sf->y); +#endif - a = expr_compile(n->u.subtract.a, re_flags, - fsm_getoptions(env->fsm), env->err); - if (a == NULL) { - return 0; + n = sf->n->u.alt.n[sf->step]; + assert(n != NULL); + sf->step++; /* RECURSE can realloc the stack and make sf stale. */ + + if (!(n->flags & AST_FLAG_UNSATISFIABLE)) { + RECURSE(env, sf->x, sf->y, n); } + return 1; + } - b = expr_compile(n->u.subtract.b, re_flags, - fsm_getoptions(env->fsm), env->err); - if (b == NULL) { - fsm_free(a); - return 0; + RETURN(env); + return 1; +} + +static int +eval_LITERAL(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + struct ast_expr *n = sf->n; +#if LOG_LINKAGE + fprintf(stderr, "%s: linking %d -> %d with literal '%c' (0x%02x)\n", + __func__, sf->x, sf->y, isprint(n->u.literal.c) ? n->u.literal.c : '.', + n->u.literal.c); +#endif + + LITERAL(sf->x, sf->y, n->u.literal.c); + + RETURN(env); + return 1; +} + +static int +eval_CODEPOINT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + struct ast_expr *n = sf->n; + fsm_state_t a, b; + char c[4]; + int r, i; + + r = utf8(n->u.codepoint.u, c); + if (!r) { + if (env->err != NULL) { + env->err->e = RE_EBADCP; + env->err->cp = n->u.codepoint.u; } - q = fsm_subtract(a, b); - if (q == NULL) { - return 0; + return 0; + } + + a = sf->x; + + for (i = 0; i < r; i++) { + if (i + 1 < r) { + NEWSTATE(b); + } else { + b = sf->y; } + LITERAL(a, b, c[i]); + + a = b; + } + + RETURN(env); + return 1; +} + +static int +eval_REPEAT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + fsm_state_t a, b; + unsigned i, min, max; + + assert(sf->n->type == AST_EXPR_REPEAT); + struct ast_expr_repeat *n = &sf->n->u.repeat; + + min = n->min; + max = n->max; + + assert(min <= max); + + if (min == 0 && max == 0) { /* {0,0} */ + EPSILON(sf->x, sf->y); + RETURN(env); + return 1; + } else if (min == 0 && max == 1) { /* '?' 
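+	                 * (min 0, max 1: an epsilon edge to skip the body,
+	                 *  plus a single pass through it)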
*/ + EPSILON(sf->x, sf->y); + TAILCALL(env, sf->x, sf->y, n->e); + return 1; + } else if (min == 1 && max == 1) { /* {1,1} */ + TAILCALL(env, sf->x, sf->y, n->e); + return 1; + } else if (min == 0 && max == AST_COUNT_UNBOUNDED) { /* '*' */ + fsm_state_t na, nz; + NEWSTATE(na); + NEWSTATE(nz); + EPSILON(sf->x,na); + EPSILON(nz,sf->y); + + EPSILON(na, nz); + EPSILON(nz, na); + TAILCALL(env, na, nz, n->e); + return 1; + } else if (min == 1 && max == AST_COUNT_UNBOUNDED) { /* '+' */ + fsm_state_t na, nz; + NEWSTATE(na); + NEWSTATE(nz); + EPSILON(sf->x, na); + EPSILON(nz, sf->y); + + EPSILON(nz, na); + TAILCALL(env, na, nz, n->e); + return 1; + } else if (sf->step == 0) { /* * Make new beginning/end states for the repeated section, * build its NFA, and link to its head. */ fsm_subgraph_start(env->fsm, &sf->u.repeat.subgraph); + sf->step++; /* resume after RECURSE */ NEWSTATE(sf->u.repeat.na); NEWSTATE(sf->u.repeat.nz); RECURSE(env, sf->u.repeat.na, sf->u.repeat.nz, n->e); + return 1; + } else { + fsm_state_t tail; + assert(sf->step == 1); + EPSILON(sf->x, sf->u.repeat.na); /* link head to repeated NFA head */ + + b = sf->u.repeat.nz; /* set the initial tail */ + + /* can be skipped */ + if (min == 0) { + EPSILON(sf->u.repeat.na, sf->u.repeat.nz); + } + fsm_subgraph_stop(env->fsm, &sf->u.repeat.subgraph); + tail = sf->u.repeat.nz; + if (max != AST_COUNT_UNBOUNDED) { + for (i = 1; i < max; i++) { + /* copies the original subgraph; need to set b to the + * original tail + */ + b = tail; + + if (!fsm_subgraph_duplicate(env->fsm, &sf->u.repeat.subgraph, &b, &a)) { + return 0; + } + + EPSILON(sf->u.repeat.nz, a); + + /* To the optional part of the repeated count */ + if (i >= min) { + EPSILON(sf->u.repeat.nz, b); + } + + sf->u.repeat.na = a; /* advance head for next duplication */ + sf->u.repeat.nz = b; /* advance tail for concatenation */ + } + } else { + for (i = 1; i < min; i++) { + /* copies the original subgraph; need to set b to the + * original tail + */ + b = tail; + + if (!fsm_subgraph_duplicate(env->fsm, &sf->u.repeat.subgraph, &b, &a)) { + return 0; + } + + EPSILON(sf->u.repeat.nz, a); + + sf->u.repeat.na = a; /* advance head for next duplication */ + sf->u.repeat.nz = b; /* advance tail for concatenation */ + } + + /* back link to allow for infinite repetition */ + EPSILON(sf->u.repeat.nz, sf->u.repeat.na); + } + + /* tail to last repeated NFA tail */ + EPSILON(sf->u.repeat.nz, sf->y); + RETURN(env); + return 1; + } +} + +static void +set_active_capture_ids(struct comp_env *env, unsigned id) +{ +#if LOG_LINKAGE + fprintf(stderr, "set_active_capture_ids: id %u\n", id); +#endif + assert(env->active_capture_ids != NULL); + u64bitset_set(env->active_capture_ids, id); +} - case AST_EXPR_RANGE: { - unsigned int i; +static int +eval_GROUP(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); - if (n->u.range.from.type != AST_ENDPOINT_LITERAL || n->u.range.to.type != AST_ENDPOINT_LITERAL) { - /* not yet supported */ - return 0; + if (env->re_flags & RE_NOCAPTURE) { + /* passthrough, disable captures */ + if (sf->step == 0) { + sf->step++; + RECURSE(env, sf->x, sf->y, sf->n->u.group.e); + } else { + RETURN(env); } + return 1; + }
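+	/* Sketch of the capturing path below: step 0 records the group's
+	 * capture ID and recurses into the body; step 1 just pops, since
+	 * the actual SAVE bookkeeping is done by the capvm program rather
+	 * than by extra NFA states here. */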
- assert(n->u.range.from.u.literal.c <= n->u.range.to.u.literal.c); + if (sf->step == 0) { + struct ast_expr *n = sf->n; + set_active_capture_ids(env, n->u.group.id); - if (n->u.range.from.u.literal.c == 0x00 && - n->u.range.to.u.literal.c == 0xff) - { - ANY(x, y); +#if LOG_LINKAGE + fprintf(stderr, "comp_iter: recurse GROUP: x %d, y %d\n", + sf->x, sf->y); +#endif + sf->step++; + RECURSE(env, sf->x, sf->y, n->u.group.e); + return 1; + } else { + assert(sf->step == 1); + + RETURN(env); + return 1; + } +} + +static int +eval_ANCHOR(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); +#if 1 + +#if LOG_LINKAGE + fprintf(stderr, "%s: ignoring anchor node %p, epsilon %d -> %d\n", + __func__, (void *)sf->n, sf->x, sf->y); +#endif + EPSILON(sf->x, sf->y); +#else + switch (sf->n->u.anchor.type) { + case AST_ANCHOR_START: + if (!(sf->n->flags & AST_FLAG_FIRST)) { +#if LOG_LINKAGE + fprintf(stderr, "%s: ignoring START anchor in non-FIRST location\n", + __func__); +#endif + EPSILON(sf->x, sf->y); break; } - for (i = n->u.range.from.u.literal.c; i <= n->u.range.to.u.literal.c; i++) { - LITERAL(x, y, i); +#if LOG_LINKAGE + fprintf(stderr, "%s: START anchor %p epsilon-linking %d -> %d\n", + __func__, (void *)sf->n, env->start_inner, sf->y); +#endif + EPSILON(env->start_inner, sf->y); + break; + + case AST_ANCHOR_END: + if (!(sf->n->flags & AST_FLAG_LAST)) { +#if LOG_LINKAGE + fprintf(stderr, "%s: ignoring END anchor in non-LAST location\n", + __func__); +#endif + EPSILON(sf->x, sf->y); + break; } +#if LOG_LINKAGE + fprintf(stderr, "%s: END anchor %p epsilon-linking %d -> %d\n", + __func__, (void *)sf->n, sf->x, stack->end_inner); +#endif + EPSILON(sf->x, stack->end_inner); break; - } default: assert(!"unreached"); + return 0; } +#endif + RETURN(env); + return 1; +} + +static int +eval_SUBTRACT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + + struct fsm *a, *b; + struct fsm *q; + enum re_flags re_flags = sf->n->re_flags; + + /* wouldn't want to reverse twice! */ + re_flags &= ~(unsigned)RE_REVERSE; + + /* Don't compile capture resolution programs again for the + * subtrees, just ignore capture behavior. */ + re_flags |= RE_NOCAPTURE; + + a = expr_compile(sf->n->u.subtract.a, re_flags, + fsm_getoptions(env->fsm), env->err); + if (a == NULL) { + return 0; + } + + b = expr_compile(sf->n->u.subtract.b, re_flags, + fsm_getoptions(env->fsm), env->err); + if (b == NULL) { + fsm_free(a); + return 0; + } + + q = fsm_subtract(a, b); + if (q == NULL) { + return 0; + } + + /* + * Subtraction produces quite a mess. We could trim or minimise here + * while q is self-contained, which might work out better than doing it + * in the larger FSM after merge. I'm not sure if it works out better + * overall or not. 
+ */ + + if (fsm_empty(q)) { + EPSILON(sf->x, sf->y); + RETURN(env); + return 1; + } + + if (!fsm_unionxy(env->fsm, q, sf->x, sf->y)) { + return 0; + } + + RETURN(env); + return 1; +} + +static int +eval_RANGE(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + struct ast_expr *n = sf->n; + unsigned int i; + + if (n->u.range.from.type != AST_ENDPOINT_LITERAL || n->u.range.to.type != AST_ENDPOINT_LITERAL) { + /* not yet supported */ + return 0; + } + + assert(n->u.range.from.u.literal.c <= n->u.range.to.u.literal.c); + + if (n->u.range.from.u.literal.c == 0x00 && + n->u.range.to.u.literal.c == 0xff) + { + ANY(sf->x, sf->y); + RETURN(env); + return 1; + } + + for (i = n->u.range.from.u.literal.c; i <= n->u.range.to.u.literal.c; i++) { + LITERAL(sf->x, sf->y, i); + } + + RETURN(env); + return 1; +} + +static int +eval_TOMBSTONE(struct comp_env *env) +{ + /* do not link -- intentionally pruned */ + (void)env; + RETURN(env); return 1; } @@ -908,6 +1458,8 @@ comp_iter(struct comp_env *env, #undef NEWSTATE #undef LITERAL #undef RECURSE +#undef RETURN +#undef TAILCALL struct fsm * ast_compile(const struct ast *ast, @@ -915,40 +1467,64 @@ ast_compile(const struct ast *ast, const struct fsm_options *opt, struct re_err *err) { - fsm_state_t x, y; + /* Start states inside and outside of match group 0, + * which represents the entire matched input, but does not + * include the implied /.*?/ loop at the start or end when + * a regex is unanchored. */ + fsm_state_t start_outer, start_inner; struct fsm *fsm; assert(ast != NULL); +#if LOG_LINKAGE + ast_print_tree(stderr, opt, re_flags, ast); +#endif + fsm = fsm_new(opt); if (fsm == NULL) { return NULL; } - if (!fsm_addstate(fsm, &x)) { + /* TODO: move these to the call stack, for symmetry? + * Or possibly combine comp_env and stack. 
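+	 * (As wired up just below: start_outer is the FSM's start state,
+	 * with an epsilon edge into start_inner, where match group 0
+	 * begins.)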
*/ + if (!fsm_addstate(fsm, &start_outer)) { goto error; } - if (!fsm_addstate(fsm, &y)) { + if (!fsm_addstate(fsm, &start_inner)) { goto error; } - fsm_setstart(fsm, x); - fsm_setend(fsm, y, 1); + if (!fsm_addedge_epsilon(fsm, start_outer, start_inner)) { + goto error; + } + + fsm_setstart(fsm, start_outer); + +#if LOG_LINKAGE + fprintf(stderr, "start: outer %d, inner %d\n", + start_outer, start_inner); +#endif { struct comp_env env; memset(&env, 0x00, sizeof(env)); + env.alloc = fsm->opt->alloc; env.fsm = fsm; env.re_flags = re_flags; env.err = err; - env.start = x; - env.end = y; + env.start_inner = start_inner; + env.start_outer = start_outer; - if (!comp_iter(&env, x, y, ast->expr, NULL)) { + env.max_capture_id = ast->max_capture_id; + + if (!comp_iter(&env, start_inner, ast)) { + if (err != NULL && err->e == 0) { + err->e = RE_EBADGROUP; + } goto error; } } @@ -980,3 +1556,25 @@ ast_compile(const struct ast *ast, return NULL; } +static int +compile_capvm_program_for_stack_end_states(struct comp_env *env, + const struct ast *ast, uint32_t *prog_id) +{ + /* compile and save program in ^, associate its id w/ end state */ + enum re_capvm_compile_ast_res res; + struct capvm_program *prog; + res = re_capvm_compile_ast(env->alloc, + ast, env->re_flags, &prog); + if (res != RE_CAPVM_COMPILE_AST_OK) { + if (env->err != NULL && env->err->e == 0 && errno != 0) { + env->err->e = RE_EERRNO; + } + return 0; + } + + if (!fsm_capture_add_program(env->fsm, prog, prog_id)) { + return 0; + } + + return 1; +} diff --git a/src/libre/ast_rewrite.c b/src/libre/ast_rewrite.c index d05fc0a82..adb0690d2 100644 --- a/src/libre/ast_rewrite.c +++ b/src/libre/ast_rewrite.c @@ -146,7 +146,7 @@ compile_subexpr(struct ast_expr *e, enum re_flags flags) return 0; } - fsm = ast_compile(&ast, flags | RE_ANCHORED, NULL, NULL); + fsm = ast_compile(&ast, flags | RE_ANCHORED | RE_NOCAPTURE, NULL, NULL); if (fsm == NULL) { return 0; } diff --git a/src/libre/print/tree.c b/src/libre/print/tree.c index 5d2f78691..58e1d6050 100644 --- a/src/libre/print/tree.c +++ b/src/libre/print/tree.c @@ -147,7 +147,9 @@ pp_iter(FILE *f, const struct fsm_options *opt, size_t indent, enum re_flags re_ case AST_EXPR_ALT: { size_t i, count = n->u.alt.count; - fprintf(f, "ALT (%u):\n", (unsigned)count); + fprintf(f, "ALT (%u):%s\n", + (unsigned)count, + n->u.alt.contains_empty_groups ? " (contains_empty_groups)" : ""); for (i = 0; i < count; i++) { pp_iter(f, opt, indent + 1 * IND, re_flags, n->u.alt.n[i]); } @@ -155,7 +157,9 @@ pp_iter(FILE *f, const struct fsm_options *opt, size_t indent, enum re_flags re_ } case AST_EXPR_LITERAL: - fprintf(f, "LITERAL '%c'\n", n->u.literal.c); + fprintf(f, "LITERAL '"); + print_char_or_esc(f, n->u.literal.c); + fprintf(f, "'\n"); break; case AST_EXPR_CODEPOINT: @@ -167,18 +171,27 @@ pp_iter(FILE *f, const struct fsm_options *opt, size_t indent, enum re_flags re_ fprintf_count(f, n->u.repeat.min); fprintf(f, ","); fprintf_count(f, n->u.repeat.max); - fprintf(f, "}\n"); + fprintf(f, "}%s\n", n->u.repeat.contains_empty_groups ? " (contains_empty_groups)" : ""); pp_iter(f, opt, indent + 1 * IND, re_flags, n->u.repeat.e); break; case AST_EXPR_GROUP: - fprintf(f, "GROUP %p: %u\n", (void *) n, n->u.group.id); + fprintf(f, "GROUP: %u%s\n", n->u.group.id, + n->u.group.repeated ? 
" (repeated)" : ""); pp_iter(f, opt, indent + 1 * IND, re_flags, n->u.group.e); break; case AST_EXPR_ANCHOR: assert(n->u.anchor.type == AST_ANCHOR_START || n->u.anchor.type == AST_ANCHOR_END); - fprintf(f, "ANCHOR %s\n", n->u.anchor.type == AST_ANCHOR_START ? "^" : "$"); + if (n->u.anchor.type == AST_ANCHOR_START) { + fprintf(f, "ANCHOR ^\n"); + } else if (n->u.anchor.type == AST_ANCHOR_END) { + assert(n->u.anchor.type == AST_ANCHOR_START || n->u.anchor.type == AST_ANCHOR_END); + fprintf(f, "ANCHOR $%s\n", + n->u.anchor.is_end_nl ? " (with \\n)" : ""); + } else { + assert(!"unreachable"); + } break; case AST_EXPR_SUBTRACT: diff --git a/src/libre/re.c b/src/libre/re.c index ade526ea7..9701672a2 100644 --- a/src/libre/re.c +++ b/src/libre/re.c @@ -37,12 +37,12 @@ re_dialect(enum re_dialect dialect) size_t i; static const struct dialect a[] = { - { RE_LIKE, parse_re_like, 0, RE_SINGLE | RE_ANCHORED }, - { RE_LITERAL, parse_re_literal, 0, RE_SINGLE | RE_ANCHORED }, - { RE_GLOB, parse_re_glob, 0, RE_SINGLE | RE_ANCHORED }, + { RE_LIKE, parse_re_like, 0, RE_SINGLE | RE_ANCHORED | RE_NOCAPTURE }, + { RE_LITERAL, parse_re_literal, 0, RE_SINGLE | RE_ANCHORED | RE_NOCAPTURE }, + { RE_GLOB, parse_re_glob, 0, RE_SINGLE | RE_ANCHORED | RE_NOCAPTURE }, { RE_NATIVE, parse_re_native, 0, 0 }, { RE_PCRE, parse_re_pcre, 0, RE_END_NL }, - { RE_SQL, parse_re_sql, 1, RE_SINGLE | RE_ANCHORED } + { RE_SQL, parse_re_sql, 1, RE_SINGLE | RE_ANCHORED | RE_NOCAPTURE } }; for (i = 0; i < sizeof a / sizeof *a; i++) { @@ -125,7 +125,15 @@ re_parse(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque, if (res < 0) { ast_free(ast); - if (err != NULL) { err->e = RE_EERRNO; } + if (err != NULL) { + if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + err->e = RE_EUNSUPPPCRE; + } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE) { + err->e = RE_EUNSUPCAPTUR; + } else if (err->e == RE_ESUCCESS) { + err->e = RE_EERRNO; + } + } return NULL; } diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c new file mode 100644 index 000000000..c67db2669 --- /dev/null +++ b/src/libre/re_capvm_compile.c @@ -0,0 +1,1689 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#include "re_capvm_compile.h" +#include "../libfsm/capture_vm.h" +#include "../libfsm/capture_vm_program.h" +#include "../libfsm/capture_vm_log.h" + +/* for EXPENSIVE_CHECKS */ +#include "../libfsm/internal.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +#include "ast.h" + +#define DEF_OPCODE_CEIL 8 +#define DEF_CHARCLASS_BUCKETS 8 +#define DEF_CHARCLASS_CEIL 4 +#define DEF_REPEATED_ALT_BACKPATCH_CEIL 1 +#define NO_BUCKET_ID ((uint32_t)-1) +#define NO_CAPTURE_ID ((uint32_t)-1) + +#define LOG_REPETITION_CASES 0 + +/* Placeholder markers for pending offset values (which would + * otherwise temporarily be uninitialized memory), chosen so + * they stand out visually in a debugger. */ +enum pending_offset { + PENDING_OFFSET_REPEAT_OPTIONAL_NEW = 11111111, + PENDING_OFFSET_REPEAT_OPTIONAL_CONT = 22222222, + PENDING_OFFSET_ALT_BACKPATCH_JMP = 33333333, + PENDING_OFFSET_ALT_BACKPATCH_NEW = 44444444, + PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS = 55555555, +}; + +struct capvm_compile_env { + const struct fsm_alloc *alloc; + enum re_flags re_flags; + struct capvm_program *program; + + uint32_t max_capture_seen; + + /* Hash table for interning character classes. + * Doubles and rehashes when half full. 
*/ + struct charclass_htab { + uint32_t bucket_count; + uint32_t buckets_used; + uint32_t ids_used; + struct charclass_htab_bucket { + uint32_t id; /* or NO_BUCKET_ID for unused */ + struct capvm_char_class bitset; + } *buckets; + } charclass_htab; + +#define DEF_REPEATED_GROUPS_CEIL 8 + /* Linked list of nodes used at compile time to compile regexes + * such as '^(a((b*)*)*)$' as if they were '^(a(?:b*)(()))$'. + * Since the inner body of the repeated subexpression with the + * capture groups can be empty, it will always repeat after + * its body matches any input. We move the group captures to + * the end of the repeated subexpression to explicitly represent + * them always capturing afterward, because otherwise the + * infinite loop protection skips them. */ + struct repeated_group_info { + /* Ancestor node that should emit the SAVE opcodes; can + * be either a REPEAT or ALT. */ + const struct ast_expr *outermost_ancestor; + size_t ceil; + size_t count; + const struct ast_expr **groups; + /* linked list */ + struct repeated_group_info *prev; + } *repeated_groups; + + /* Linked list of nodes used for regexes like '^(($)|x)+$', + * which need to special-case the JMP instruction after the + * nullable '($)' case to replace it with a SPLIT to before + * and after the + repetition. */ + struct repeated_alt_backpatch_info { + const struct ast_expr *repeat; /* must be a + repeat */ + size_t ceil; + size_t used; + unsigned *opcode_offsets; + /* linked list */ + struct repeated_alt_backpatch_info *prev; + } *repeated_alt_backpatches; +}; + +static bool +ensure_program_capacity(const struct fsm_alloc *alloc, + struct capvm_program *p, uint32_t count) +{ +#define STRESS_GROWING (EXPENSIVE_CHECKS && 1) + + const uint32_t capacity = p->used + count; + + if (capacity > p->ceil) { +#if STRESS_GROWING + const uint32_t nceil = (p->ceil + 1 < capacity + ? capacity : p->ceil + 1); +#else + const uint32_t nceil = (p->ceil == 0 + ? DEF_OPCODE_CEIL + : 2*p->ceil); + /* This should always be enough for any capacity + * requested during compilation. 
*/ + assert(nceil >= p->used + count); +#endif + LOG(3, "%s: growing %u -> %u (count %u)\n", + __func__, p->ceil, nceil, count); + struct capvm_opcode *nops = f_realloc(alloc, + p->ops, nceil * sizeof(p->ops[0])); + if (nops == NULL) { + return false; + } + +#if EXPENSIVE_CHECKS + for (size_t i = p->ceil; i < nceil; i++) { + /* out of range, will trigger asserts */ + nops[i].t = 'X'; + } +#endif + + p->ceil = nceil; + p->ops = nops; + } + return true; +} + +static void +check_program_for_invalid_labels(const struct capvm_program *p) +{ + for (uint32_t op_i = 0; op_i < p->used; op_i++) { + const struct capvm_opcode *op = &p->ops[op_i]; + switch (op->t) { + case CAPVM_OP_JMP: + assert(op->u.jmp != op_i); + break; + case CAPVM_OP_JMP_ONCE: + assert(op->u.jmp_once != op_i); + break; + case CAPVM_OP_SPLIT: + assert(op->u.split.cont < p->used); + assert(op->u.split.cont != op_i); + assert(op->u.split.new < p->used); + assert(op->u.split.new != op_i); + break; + + case CAPVM_OP_CHAR: + case CAPVM_OP_CHARCLASS: + case CAPVM_OP_MATCH: + case CAPVM_OP_SAVE: + case CAPVM_OP_ANCHOR: + break; + default: + assert(!"out of range"); + break; + } + } +} + +static uint32_t +get_program_offset(const struct capvm_program *p) +{ + assert(p->used < p->ceil); + +#if EXPENSIVE_CHECKS + struct capvm_opcode *op = &p->ops[p->used]; + op->t = 'X'; /* out of range */ +#endif + + return p->used; +} + +static uint32_t +reserve_program_opcode(struct capvm_program *p) +{ + assert(p->used < p->ceil); + const uint32_t res = p->used; + p->used++; + +#if EXPENSIVE_CHECKS + struct capvm_opcode *op = &p->ops[res]; + op->t = 'X'; /* out of range */ +#endif + + return res; +} + +static bool +grow_program_char_classes(const struct fsm_alloc *alloc, + struct capvm_program *p) +{ + const uint32_t nceil = (p->char_classes.ceil == 0 + ? DEF_CHARCLASS_CEIL + : 2*p->char_classes.ceil); + struct capvm_char_class *nsets = f_realloc(alloc, + p->char_classes.sets, nceil * sizeof(nsets[0])); + if (nsets == NULL) { + return false; + } + + p->char_classes.sets = nsets; + p->char_classes.ceil = nceil; + return true; +} + +static bool +intern_char_class(struct capvm_compile_env *env, + struct capvm_program *p, uint64_t chars[4], + uint32_t *id) +{ + LOG(5, "%s: used %u/%u\n", __func__, + env->charclass_htab.buckets_used, env->charclass_htab.bucket_count); + if (env->charclass_htab.buckets_used >= env->charclass_htab.bucket_count/2) { + const uint32_t ocount = env->charclass_htab.bucket_count; + const uint32_t ncount = (ocount == 0 + ? 
DEF_CHARCLASS_BUCKETS + : 2*env->charclass_htab.bucket_count); + LOG(3, "%s: growing from %u -> %u\n", __func__, ocount, ncount); + struct charclass_htab_bucket *nbuckets = + f_malloc(env->alloc, ncount * sizeof(nbuckets[0])); + if (nbuckets == NULL) { + return false; + } + for (uint32_t n_i = 0; n_i < ncount; n_i++) { + nbuckets[n_i].id = NO_BUCKET_ID; + } + + const uint32_t nmask = ncount - 1; + assert((ncount & nmask) == 0); + + struct charclass_htab_bucket *obuckets = env->charclass_htab.buckets; + for (uint32_t o_i = 0; o_i < ocount; o_i++) { + if (obuckets[o_i].id == NO_BUCKET_ID) { + continue; + } + const uint64_t h = hash_fnv1a_64((const uint8_t *)obuckets[o_i].bitset.octets, + sizeof(obuckets[o_i].bitset)); + + for (uint32_t n_i = 0; n_i < ncount; n_i++) { + const uint64_t b = (h + n_i) & nmask; + if (nbuckets[b].id == NO_BUCKET_ID) { + memcpy(&nbuckets[b].bitset, + &obuckets[o_i].bitset, + sizeof(obuckets[o_i].bitset)); + nbuckets[b].id = obuckets[o_i].id; + break; + } + } + } + f_free(env->alloc, obuckets); + env->charclass_htab.bucket_count = ncount; + env->charclass_htab.buckets = nbuckets; + } + + assert(env->charclass_htab.buckets_used < env->charclass_htab.bucket_count/2); + + const uint32_t count = env->charclass_htab.bucket_count; + const uint32_t mask = count - 1; + struct charclass_htab_bucket *buckets = env->charclass_htab.buckets; + + const uint64_t h = hash_fnv1a_64((const uint8_t *)chars, + sizeof(buckets[0].bitset)); + for (uint32_t i = 0; i < count; i++) { + const uint64_t b = (h + i) & mask; + LOG(5, "%s: buckets[%lu].id == %d\n", + __func__, b, buckets[b].id); + if (buckets[b].id == NO_BUCKET_ID) { + memcpy(&buckets[b].bitset, chars, sizeof(buckets[b].bitset)); + if (p->char_classes.count == p->char_classes.ceil) { + if (!grow_program_char_classes(env->alloc, p)) { + return false; + } + } + + memcpy(&p->char_classes.sets[p->char_classes.count], + chars, sizeof(buckets[b].bitset)); + p->char_classes.count++; + buckets[b].id = env->charclass_htab.ids_used; + env->charclass_htab.ids_used++; + env->charclass_htab.buckets_used++; + *id = buckets[b].id; + + return true; + } else if (0 == memcmp(chars, &buckets[b].bitset, sizeof(buckets[b].bitset))) { + *id = buckets[b].id; + return true; /* already present, reuse */ + } else { + /* collision */ + } + } + + assert(!"unreachable"); + return false; +} + +static void +dump_endpoint(const struct ast_endpoint *e) +{ + switch (e->type) { + case AST_ENDPOINT_LITERAL: + fprintf(stderr, "endpoint[LITERAL]: 0x%02x '%c'\n", + e->u.literal.c, + isprint(e->u.literal.c) ? 
e->u.literal.c : '.');
+		break;
+	case AST_ENDPOINT_CODEPOINT:
+		fprintf(stderr, "endpoint[CODEPOINT]: 0x%x\n",
+		    e->u.codepoint.u);
+		break;
+	case AST_ENDPOINT_NAMED:
+		assert(!"todo?");
+		break;
+	}
+}
+
+static void
+dump_pos(const struct ast_pos *p)
+{
+	fprintf(stderr, "pos: byte %u, line %u, col %u\n",
+	    p->byte, p->line, p->col);
+}
+
+static bool
+active_node(const struct ast_expr *n)
+{
+	assert(n != NULL);
+
+	switch (n->type) {
+	case AST_EXPR_TOMBSTONE:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static bool
+subtree_represents_character_class(const struct ast_expr *expr, uint64_t cc[4])
+{
+	for (size_t i = 0; i < 4; i++) {
+		cc[i] = 0;
+	}
+
+	switch (expr->type) {
+	case AST_EXPR_EMPTY:
+		/* empty set */
+		return false;
+
+	case AST_EXPR_LITERAL:
+		u64bitset_set(cc, (uint8_t)expr->u.literal.c);
+		return true;
+
+	case AST_EXPR_RANGE:
+	{
+		const struct ast_endpoint *f = &expr->u.range.from;
+		const struct ast_endpoint *t = &expr->u.range.to;
+		if (f->type != AST_ENDPOINT_LITERAL
+		    || t->type != AST_ENDPOINT_LITERAL) {
+			return false;
+		}
+		for (uint64_t c = (uint64_t)f->u.literal.c; c <= (uint64_t)t->u.literal.c; c++) {
+			u64bitset_set(cc, (uint8_t)c);
+		}
+		return true;
+	}
+
+	case AST_EXPR_ALT:
+	{
+		/* union character classes from children */
+		assert(expr->u.alt.count > 0);
+		for (size_t c_i = 0; c_i < expr->u.alt.count; c_i++) {
+			uint64_t child_cc[4];
+			const struct ast_expr *child = expr->u.alt.n[c_i];
+			if (subtree_represents_character_class(child, child_cc)) {
+				for (size_t cc_i = 0; cc_i < 4; cc_i++) {
+					cc[cc_i] |= child_cc[cc_i];
+				}
+			} else {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	case AST_EXPR_SUBTRACT:
+	{
+		/* Only support AST_EXPR_SUBTRACT nodes where .a is a
+		 * RANGE:0x00-0xff and .b is either a LITERAL, RANGE, EMPTY,
+		 * or an ALT that itself represents a character class. */
+
+		const struct ast_expr *sub_a = expr->u.subtract.a;
+		if (sub_a->type != AST_EXPR_RANGE) {
+			return false;
+		}
+
+		const struct ast_endpoint *f = &sub_a->u.range.from;
+		const struct ast_endpoint *t = &sub_a->u.range.to;
+		if (f->type != AST_ENDPOINT_LITERAL || t->type != AST_ENDPOINT_LITERAL) {
+			return false;
+		}
+
+		if ((unsigned char)f->u.literal.c != 0x00
+		    || (unsigned char)t->u.literal.c != 0xff) {
+			return false;
+		}
+
+		for (size_t i = 0; i < 4; i++) {
+			cc[i] = ~(uint64_t)0;
+		}
+
+		uint64_t neg_cc[4];
+		if (subtree_represents_character_class(expr->u.subtract.b, neg_cc)) {
+			for (size_t cc_i = 0; cc_i < 4; cc_i++) {
+				cc[cc_i] &=~ neg_cc[cc_i];
+			}
+		} else {
+			return false;
+		}
+		return true;
+	}
+
+	default:
+		return false;
+	}
+}
+
+static void
+make_charclass_case_insensitive(uint64_t *cc)
+{
+	for (size_t i = 0; i < 256; i++) {
+		if (isalpha(i) && u64bitset_get(cc, i)) {
+			const char c = (char)i;
+			const uint64_t cl = (uint64_t)tolower(c);
+			const uint64_t cu = (uint64_t)toupper(c);
+			u64bitset_set(cc, cl);
+			u64bitset_set(cc, cu);
+		}
+	}
+}
+
+static bool
+can_safely_skip_JMP_ONCE(const struct ast_expr *expr)
+{
+	/* There are potentially cases where it's safe to skip the
+	 * JMP_ONCE special case, which would save memory by not
+	 * expanding the path an extra bit per iteration, but the
+	 * criteria are subtle enough that it can probably wait.
*/ + (void)expr; + return false; +} + +static bool +push_repeated_group_info(struct capvm_compile_env *env, const struct ast_expr *expr) +{ + LOG(3 - LOG_REPETITION_CASES, + "%s: setting env->repeated_groups.outermost_ancestor <- %p\n", + __func__, (void *)expr); + + assert(expr != NULL); + assert(expr->type == AST_EXPR_REPEAT || expr->type == AST_EXPR_ALT); + + struct repeated_group_info *rgi = f_calloc(env->alloc, 1, sizeof(*rgi)); + if (rgi == NULL) { + return false; + } + rgi->outermost_ancestor = expr; + rgi->prev = env->repeated_groups; + env->repeated_groups = rgi; + LOG(3 - LOG_REPETITION_CASES, + "%s: push rgi, allocated %p, prev %p\n", + __func__, (void *)rgi, (void *)rgi->prev); + return true; +} + +static void +pop_repeated_group_info(struct capvm_compile_env *env, const struct ast_expr *expr) +{ + assert(expr != NULL); + assert(expr->type == AST_EXPR_REPEAT || expr->type == AST_EXPR_ALT); + struct repeated_group_info *rgi = env->repeated_groups; + LOG(3 - LOG_REPETITION_CASES, + "%s: pop rgi, expecting %p, got %p\n", + __func__, (void *)expr, (void *)rgi->outermost_ancestor); + assert(rgi->outermost_ancestor == expr); + struct repeated_group_info *prev = rgi->prev; + LOG(3 - LOG_REPETITION_CASES, + "%s: pop rgi, freeing %p, prev %p\n", + __func__, (void *)rgi, (void *)prev); + + env->repeated_groups = prev; + if (rgi->groups != NULL) { + f_free(env->alloc, rgi->groups); + } + f_free(env->alloc, rgi); +} + +static bool +push_repeated_alt_backpatch_info(struct capvm_compile_env *env, const struct ast_expr *expr) +{ + assert(expr + && expr->type == AST_EXPR_REPEAT + && expr->u.repeat.min == 1 + && expr->u.repeat.max == AST_COUNT_UNBOUNDED); + struct repeated_alt_backpatch_info *rabi = f_calloc(env->alloc, + 1, sizeof(*rabi)); + if (rabi == NULL) { + return false; + } + rabi->repeat = expr; + rabi->prev = env->repeated_alt_backpatches; + LOG(3 - LOG_REPETITION_CASES, + "%s: pushing node %p onto %p, prev link %p\n", + __func__, (void *)expr, (void *)rabi, (void *)rabi->prev); + env->repeated_alt_backpatches = rabi; + return true; +} + +static bool +append_repeated_alt_backpatch_offset(struct capvm_compile_env *env, unsigned offset) +{ + struct repeated_alt_backpatch_info *rabi = env->repeated_alt_backpatches; + assert(rabi != NULL); + if (rabi->used == rabi->ceil) { + const size_t nceil = (rabi->ceil == 0 + ? 
DEF_REPEATED_ALT_BACKPATCH_CEIL + : 2*rabi->ceil); + LOG(3 - LOG_REPETITION_CASES, + "%s: growing %zu -> %zu\n", __func__, rabi->ceil, nceil); + + unsigned *noffsets = f_realloc(env->alloc, + rabi->opcode_offsets, nceil * sizeof(noffsets[0])); + if (noffsets == NULL) { + return false; + } + rabi->ceil = nceil; + rabi->opcode_offsets = noffsets; + } + + LOG(3 - LOG_REPETITION_CASES, + "%s: pushing offset %u\n", __func__, offset); + rabi->opcode_offsets[rabi->used] = offset; + rabi->used++; + return true; +} + +static void +pop_repeated_alt_backpatch_info(struct capvm_compile_env *env, const struct ast_expr *expr) +{ + struct repeated_alt_backpatch_info *rabi = env->repeated_alt_backpatches; + assert(rabi != NULL); + assert(rabi->repeat == expr); + struct repeated_alt_backpatch_info *prev = rabi->prev; + LOG(3 - LOG_REPETITION_CASES, + "%s: popping %p, prev link %p\n", + __func__, (void *)rabi, (void *)prev); + f_free(env->alloc, rabi->opcode_offsets); + f_free(env->alloc, rabi); + env->repeated_alt_backpatches = prev; +} + +static void +backpatch_repeated_nullable_alt_split(struct capvm_compile_env *env, + const struct ast_expr *expr, struct capvm_program *p, unsigned split_new_dst); + +static bool +emit_repeated_groups(struct capvm_compile_env *env, struct capvm_program *p); + +static bool +capvm_compile_iter_save_groups_in_skipped_subtree(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr); + +static bool +compile_kleene_star(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr); + +static bool +capvm_compile_iter(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr) +{ + LOG(4, "%s: expr %p, type %s, %u/%u used, re_flags 0x%02x\n", + __func__, (void *)expr, ast_node_type_name(expr->type), + p->used, p->ceil, expr->re_flags); + + switch (expr->type) { + case AST_EXPR_EMPTY: + case AST_EXPR_TOMBSTONE: + break; + case AST_EXPR_CONCAT: + for (size_t i = 0; i < expr->u.concat.count; i++) { + /* append instructions from each consecutive node */ + const struct ast_expr *n = expr->u.concat.n[i]; + if (!capvm_compile_iter(env, p, n)) { return false; } + } + break; + case AST_EXPR_ALT: + { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + assert(expr->u.alt.count > 0); + + if (expr->u.alt.contains_empty_groups) { + if (!push_repeated_group_info(env, expr)) { + return false; + } + } + + /* If this ALT node represents a character class (such as a + * rewritten . character's ALT[0x00 - 0x09, 0x0b - 0xff] or + * a rewritten [abc-ef]'s ... , then produce the corresponding + * character class literal. The direct representation of the + * subtree would take several instructions and introduce + * unnecessary splits, increasing memory usage at runtime. 
*/
+		uint64_t cc[4];
+		if (subtree_represents_character_class(expr, cc)) {
+			const uint32_t pos = reserve_program_opcode(p);
+			struct capvm_opcode *op_cc = &p->ops[pos];
+			op_cc->t = CAPVM_OP_CHARCLASS;
+
+			if (expr->re_flags & RE_ICASE) {
+				make_charclass_case_insensitive(cc);
+			}
+			if (!intern_char_class(env, p, cc, &op_cc->u.charclass_id)) {
+				return false;
+			}
+
+			if (expr->u.alt.contains_empty_groups) {
+				pop_repeated_group_info(env, expr);
+			}
+			break;
+		}
+
+		uint32_t active_count = 0;
+		uint32_t last_active = 0;
+		struct alt_flow_info {
+			bool is_active;
+			uint32_t backpatch;
+		};
+		struct alt_flow_info *flow_info = f_calloc(env->alloc,
+		    expr->u.alt.count, sizeof(flow_info[0]));
+		if (flow_info == NULL) {
+			if (expr->u.alt.contains_empty_groups) {
+				pop_repeated_group_info(env, expr);
+			}
+			return false;
+		}
+
+		for (uint64_t i = 0; i < expr->u.alt.count; i++) {
+			const struct ast_expr *n = expr->u.alt.n[i];
+			if (active_node(n)) {
+				last_active = i;
+				active_count++;
+				flow_info[i].is_active = true;
+			}
+		}
+
+		/* If there are no children active this should terminate
+		 * with an empty program. */
+		LOG(3, "%s: active_count == %d\n", __func__, active_count);
+		if (active_count == 0) {
+			LOG(3, "%s: active_count == 0, skipping\n", __func__);
+
+			for (uint64_t i = 0; i < expr->u.alt.count; i++) {
+				const struct ast_expr *n = expr->u.alt.n[i];
+				if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, n)) {
+					f_free(env->alloc, flow_info);
+					if (expr->u.alt.contains_empty_groups) {
+						pop_repeated_group_info(env, expr);
+					}
+					return false;
+				}
+				if (n->flags & AST_FLAG_NULLABLE) {
+					break;
+				}
+			}
+
+			f_free(env->alloc, flow_info);
+			if (expr->u.alt.contains_empty_groups) {
+				pop_repeated_group_info(env, expr);
+			}
+
+			/* There is currently no test reaching this; try
+			 * using the fuzzer to trigger it. */
+			return true;
+		} else if (active_count == 1) {
+			/* even if one of the later subtrees is active, an earlier
+			 * subtree can still shadow it. */
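+			/* Illustrative note: before pruning, an ALT such as
+			 * /(?:(a|)|b)/ has an earlier nullable case '(a|)'
+			 * that shadows 'b' at the same input position, so
+			 * even when only one case survives, the empty groups
+			 * of the shadowing case must still be recorded. */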
*/ + bool shadowed = false; + + for (uint64_t i = 0; i < expr->u.alt.count; i++) { + if (i != last_active) { /* evaluate for empty groups */ + const struct ast_expr *n = expr->u.alt.n[i]; + capvm_compile_iter_save_groups_in_skipped_subtree(env, p, n); + if (n->flags & AST_FLAG_NULLABLE) { + shadowed = true; + break; + } + } + } + + if (!shadowed) { + LOG(5, "narrowing to last_active %u\n", last_active); + assert(last_active < expr->u.alt.count); + const struct ast_expr *n = expr->u.alt.n[last_active]; + if (!capvm_compile_iter(env, p, n)) { + return false; + } + f_free(env->alloc, flow_info); + if (expr->u.alt.contains_empty_groups) { + pop_repeated_group_info(env, expr); + } + break; + } else { + f_free(env->alloc, flow_info); + if (expr->u.alt.contains_empty_groups) { + pop_repeated_group_info(env, expr); + } + return true; /* may need distinct error case to not leak */ + } + } + + LOG(3, "%s: compiling AST_EXPR_ALT with %u active nodes, last_active %u\n", + __func__, active_count, last_active); + + /* note: binarized split: for a|b|c, treat this like "a else (b else c)", + * leading to generated code like: + * + * // note: trying each case in order, earlier cases are more greedy + * - split_cont j1 + * - split_new j2 + * j1: + * - + * - jmp pos_after_all // or split pos_after_all, PLUS_BACKPATCH, see below + * j2: + * - split_cont j3 + * - split_new j4 + * j3: + * - + * - jmp pos_after_all + * j4: + * //// DO NOT EMIT split instructions here, treat like a final else + * - + * // fall through to pos_after_all + * pos_after_all: + * + * + * When an ALT case: + * - is nullable (can match the empty string) + * - is the first nullable case (shadowing cases after) + * - is in a subtree of a REPEAT{1,inf} (+) node whose entire subtree is nullable + * then that case's `jmp pos_after_all` should be replaced with + * `split pos_after_all pos_after_repeat_backjmp`, which will need a special + * label for batch-patching by the REPEAT later. This is necessary for cases + * like '^(?:($|x))+$', where the `jmp pos_after_all` would lead to code after + * the ALT that has already been executed at the current input position. + * */ + for (uint32_t c_i = 0; c_i < expr->u.alt.count; c_i++) { + const bool is_final_else_case = c_i == last_active; + LOG(3, "%s: %p c_i %u/%zu, is_final_else_case %d\n", + __func__, (void *)expr, c_i, expr->u.alt.count, is_final_else_case); + if (!flow_info[c_i].is_active) { continue; } + + if (is_final_else_case) { + /* Just add the case for the child node and + * then fall through to pos_after_all. */ + const struct ast_expr *n = expr->u.alt.n[c_i]; + LOG(3, "%s: %p recursing...\n", __func__, (void *)expr); + if (!capvm_compile_iter(env, p, n)) { + return false; + } + LOG(3, "%s: %p recursing...done (final-else-case)\n", __func__, (void *)expr); + + struct repeated_group_info *rgi = env->repeated_groups; + LOG(3 - LOG_REPETITION_CASES, + "%s: ALT %p: contains_empty_groups: %d, outermost_ancestor: %p == %p ? %d\n", + __func__, (void *)expr, + expr->u.alt.contains_empty_groups, + (void *)(rgi ? rgi->outermost_ancestor : NULL), + (void *)expr, + (rgi ? 
rgi->outermost_ancestor == expr : 0)); + if (expr->u.alt.contains_empty_groups) { + assert(rgi != NULL); + LOG(3 - LOG_REPETITION_CASES, + "%s: outermost_ancestor match, count %zu\n", __func__, rgi->count); + if (!emit_repeated_groups(env, p)) { + return false; + } + } + } else { + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + const uint32_t pos_split_before_case = reserve_program_opcode(p); + struct capvm_opcode *op_split_before = &p->ops[pos_split_before_case]; + op_split_before->t = CAPVM_OP_SPLIT; + + /* greedier branch: trying the next case, in order */ + op_split_before->u.split.cont = get_program_offset(p); + + /* less greedy branch: moving on to the next case. + * will backpatch .new to after this case's JMP later */ + op_split_before->u.split.new = PENDING_OFFSET_ALT_BACKPATCH_NEW; + + const struct ast_expr *n = expr->u.alt.n[c_i]; + LOG(3, "%s: %p recursing...\n", __func__, (void *)expr); + if (!capvm_compile_iter(env, p, n)) { + return false; + } + LOG(3, "%s: %p recursing...done (non-final)\n", __func__, (void *)expr); + + struct repeated_group_info *rgi = env->repeated_groups; + LOG(3 - LOG_REPETITION_CASES, + "%s: ALT %p: contains_empty_groups: %d, outermost_ancestor: %p == %p ? %d\n", + __func__, (void *)expr, expr->u.alt.contains_empty_groups, + (void *)(rgi ? rgi->outermost_ancestor : NULL), + (void *)expr, + (rgi ? rgi->outermost_ancestor == expr : 0)); + if (expr->u.alt.contains_empty_groups) { + assert(rgi != NULL); + LOG(3 - LOG_REPETITION_CASES, + "%s: outermost_ancestor match, count %zu\n", __func__, rgi->count); + if (!emit_repeated_groups(env, p)) { + return false; + } + } + + /* JMP or SPLIT, plus space after */ + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + + /* Based on analysis, either emit a JMP or SPLIT. */ + if (n->u.alt.nullable_alt_inside_plus_repeat) { + const uint32_t pos_split_after = reserve_program_opcode(p); + flow_info[c_i].backpatch = pos_split_after; + struct capvm_opcode *op_split_after = &p->ops[pos_split_after]; + op_split_after->t = CAPVM_OP_SPLIT; + op_split_after->u.split.cont = PENDING_OFFSET_ALT_BACKPATCH_JMP; + op_split_after->u.split.new = PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS; + if (!append_repeated_alt_backpatch_offset(env, pos_split_after)) { + return false; + } + } else { + const uint32_t pos_jmp_after = reserve_program_opcode(p); + flow_info[c_i].backpatch = pos_jmp_after; + struct capvm_opcode *op_jmp = &p->ops[pos_jmp_after]; + op_jmp->t = CAPVM_OP_JMP; + op_jmp->u.jmp = PENDING_OFFSET_ALT_BACKPATCH_JMP; + } + + /* refresh pointer after possible realloc */ + op_split_before = &p->ops[pos_split_before_case]; + + /* and the original split jumps to after + * this case's JMP */ + op_split_before->u.split.new = get_program_offset(p); + } + } + + /* Ensure there's space for the next instruction, and then + * set every case's JMP suffix to it. 
*/
+		if (!ensure_program_capacity(env->alloc, p, 1)) {
+			return false;
+		}
+		const uint32_t pos_after_all = get_program_offset(p);
+
+		for (size_t i = 0; i < expr->u.alt.count - 1; i++) {
+			const bool is_final_else_case = i == last_active;
+			assert(flow_info[i].backpatch < p->used);
+			if (is_final_else_case || !flow_info[i].is_active) {
+				continue;
+			}
+
+			struct capvm_opcode *op_patch = &p->ops[flow_info[i].backpatch];
+			if (op_patch->t == CAPVM_OP_JMP) {
+				assert(op_patch->u.jmp == PENDING_OFFSET_ALT_BACKPATCH_JMP);
+				op_patch->u.jmp = pos_after_all;
+			} else if (op_patch->t == CAPVM_OP_SPLIT) {
+				assert(op_patch->u.split.cont == PENDING_OFFSET_ALT_BACKPATCH_JMP);
+				op_patch->u.split.cont = pos_after_all;
+				/* .new will be patched by an ancestor repeat node after returning. */
+				assert(op_patch->u.split.new == PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS);
+			} else {
+				assert(!"type mismatch");
+			}
+		}
+
+		f_free(env->alloc, flow_info);
+		if (expr->u.alt.contains_empty_groups) {
+			pop_repeated_group_info(env, expr);
+		}
+		break;
+	}
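+	/* Illustrative sketch of the binarized layout above: /a|b|c/
+	 * compiles to roughly
+	 *
+	 *   0: split cont=1, new=3
+	 *   1: char 'a'
+	 *   2: jmp 7             ; backpatched to pos_after_all
+	 *   3: split cont=4, new=6
+	 *   4: char 'b'
+	 *   5: jmp 7             ; backpatched to pos_after_all
+	 *   6: char 'c'          ; final else case, falls through
+	 *   7: ...               ; pos_after_all
+	 */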
+	case AST_EXPR_LITERAL:
+	{
+		if (!ensure_program_capacity(env->alloc, p, 1)) {
+			return false;
+		}
+		const uint32_t pos = reserve_program_opcode(p);
+		struct capvm_opcode *op = &p->ops[pos];
+
+		if (expr->re_flags & RE_ICASE) {
+			uint64_t cc[4] = { 0 };
+			u64bitset_set(cc, (uint64_t)expr->u.literal.c);
+
+			op->t = CAPVM_OP_CHARCLASS;
+			make_charclass_case_insensitive(cc);
+			if (!intern_char_class(env, p, cc, &op->u.charclass_id)) {
+				return false;
+			}
+		} else {
+			op->t = CAPVM_OP_CHAR;
+			op->u.chr = (uint8_t)expr->u.literal.c;
+		}
+		break;
+	}
+	case AST_EXPR_CODEPOINT:
+		assert(!"not implemented, unreachable");
+		break;
+	case AST_EXPR_REPEAT:
+	{
+		const unsigned min = expr->u.repeat.min;
+		const unsigned max = expr->u.repeat.max;
+		const struct ast_expr *e = expr->u.repeat.e;
+
+		/* collect groups to emit */
+		if (expr->u.repeat.contains_empty_groups) {
+			if (!push_repeated_group_info(env, expr)) {
+				return false;
+			}
+		}
+
+		if (min == 1 && max == 1) { /* {1,1} */
+			/* if repeating exactly once, just defer to subtree,
+			 * but still do the repeated_group_info cleanup below */
+			if (!capvm_compile_iter(env, p, e)) {
+				return false;
+			}
+		} else if (min == 0 && max == 1) { /* ? */
+			/* split l1, l2
+			 * l1:
+			 * l2: */
+			if (!ensure_program_capacity(env->alloc, p, 2)) {
+				return false;
+			}
+
+			const uint32_t pos_split = reserve_program_opcode(p);
+			const uint32_t pos_l1 = get_program_offset(p);
+
+			struct capvm_opcode *op_split = &p->ops[pos_split];
+			op_split->t = CAPVM_OP_SPLIT;
+			op_split->u.split.cont = pos_l1;
+			op_split->u.split.new = PENDING_OFFSET_REPEAT_OPTIONAL_NEW;
+
+			if (!capvm_compile_iter(env, p, e)) { return false; }
+
+			if (!ensure_program_capacity(env->alloc, p, 1)) {
+				return false;
+			}
+			op_split = &p->ops[pos_split];	/* refresh pointer */
+
+			const uint32_t after_expr = get_program_offset(p);
+			op_split->u.split.new = after_expr;
+		} else if (min == 0 && max == AST_COUNT_UNBOUNDED) { /* * */
+			if (!compile_kleene_star(env, p, expr)) {
+				return false;
+			}
+		} else if (min == 1 && max == AST_COUNT_UNBOUNDED) { /* + */
+			if (expr->u.repeat.contains_nullable_alt) {
+				if (!push_repeated_alt_backpatch_info(env, expr)) {
+					return false;
+				}
+			}
+
+			/* l1:
+			 * split l1, l2
+			 * l2: */
+			if (!ensure_program_capacity(env->alloc, p, 1)) {
+				return false;
+			}
+			const uint32_t pos_l1 = get_program_offset(p);
+
+			if (!capvm_compile_iter(env, p, e)) { return false; }
+
+			if (!ensure_program_capacity(env->alloc, p, 1)) {
+				return false;
+			}
+
+			/* Only emit the backwards jump for repetition branching
+			 * if the subtree added any instructions. */
+			if (get_program_offset(p) != pos_l1) {
+				if (!ensure_program_capacity(env->alloc, p, 3)) {
+					return false;
+				}
+				const uint32_t pos_split = reserve_program_opcode(p);
+				const uint32_t pos_l2 = get_program_offset(p);
+
+				struct capvm_opcode *op_split = &p->ops[pos_split];
+				op_split->t = CAPVM_OP_SPLIT;
+				op_split->u.split.cont = pos_l1;
+				op_split->u.split.new = pos_l2;
+
+				/* Update any ALT nodes in the subtree whose SPLIT instructions
+				 * are awaiting backpatching with pos_l2. */
+				if (expr->u.repeat.contains_nullable_alt) {
+					backpatch_repeated_nullable_alt_split(env, expr, p, pos_l2);
+				}
+			}
+
+			if (expr->u.repeat.contains_nullable_alt) {
+				pop_repeated_alt_backpatch_info(env, expr);
+			}
+		} else if (min == 0 && max == 0) { /* {0,0} */
+			/* ignored, except any groups contained within that could match
+			 * empty input still get emitted (unless unsatisfiable). */
+			if (e->flags & AST_FLAG_UNSATISFIABLE) {
+				LOG(3, "%s: repeat{0,0} && UNSATISFIABLE -> skipping\n", __func__);
+				break;
+			}
+
+			/* Unreachable group captures still need to be counted, otherwise
+			 * subsequent ones would get shifted down. */
+			if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, e)) { return false; }
+			break;
+		} else {	/* other bounded count */
+			/* repeat the minimum number of times */
+			for (size_t i = 0; i < min; i++) {
+				if (!capvm_compile_iter(env, p, e)) { return false; }
+			}
+
+			if (max == AST_COUNT_UNBOUNDED) {
+				/* A repeat of {x,inf} should be treated like
+				 * (?:subtree){x} (?:subtree)* . */
+				if (!compile_kleene_star(env, p, expr)) {
+					return false;
+				}
+			} else {
+				/* then repeat up to the max as ?
+ * + * split_cont l1 + * split_new l2 + * l1: + * l2: */ + for (size_t i = min; i < max; i++) { + if (!ensure_program_capacity(env->alloc, p, 3)) { + return false; + } + + const uint32_t pos_split = reserve_program_opcode(p); + const uint32_t pos_l1 = get_program_offset(p); + + struct capvm_opcode *op_split = &p->ops[pos_split]; + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.cont = pos_l1; + op_split->u.split.new = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + + if (!capvm_compile_iter(env, p, e)) { return false; } + + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + op_split = &p->ops[pos_split]; /* refresh pointer */ + + const uint32_t after_expr = get_program_offset(p); + op_split->u.split.new = after_expr; + } + } + } + + struct repeated_group_info *rgi = env->repeated_groups; + LOG(3 - LOG_REPETITION_CASES, + "%s: REPEAT %p: contains_empty_groups: %d, outermost_ancestor: %p == %p ? %d\n", + __func__, (void *)expr, expr->u.repeat.contains_empty_groups, + (void *)(rgi ? rgi->outermost_ancestor : NULL), + (void *)expr, + (rgi ? rgi->outermost_ancestor == expr : 0)); + if (expr->u.repeat.contains_empty_groups + && rgi != NULL + && rgi->outermost_ancestor == expr) { + LOG(3 - LOG_REPETITION_CASES, + "%s: outermost_ancestor match, count %zu\n", __func__, rgi->count); + if (!emit_repeated_groups(env, p)) { + return false; + } + pop_repeated_group_info(env, expr); + } + + break; + } + case AST_EXPR_GROUP: + { + const uint32_t id = expr->u.group.id; + const int is_repeated = expr->u.group.repeated; + + /* If the group is nullable and repeated, then move its save + * instructions to the end, since the final iteration matching + * nothing will always clobber any earlier saves. This is a + * workaround for cases that would otherwise incorrectly be + * halted by infinite loop prevention at runtime. */ + if (is_repeated && ((expr->flags & AST_FLAG_NULLABLE) + || !(expr->flags & AST_FLAG_CAN_CONSUME))) { + + struct repeated_group_info *rgi = env->repeated_groups; + + LOG(3 - LOG_REPETITION_CASES, + "%s: checking repeated group %u (capvm_compile_iter recurse), parent %p\n", + __func__, id, (void *)(rgi ? rgi->outermost_ancestor : NULL)); + if (!capvm_compile_iter(env, p, expr->u.group.e)) { return false; } + LOG(3 - LOG_REPETITION_CASES, + "%s: checking repeated group %u (capvm_compile_iter done), parent %p\n", + __func__, id, (void *)(rgi ? rgi->outermost_ancestor : NULL)); + + /* don't emit these here, parent repeat node will add them after. */ + if (rgi && rgi->outermost_ancestor != NULL) { + if (rgi->count == rgi->ceil) { + const size_t nceil = (rgi->ceil == 0 + ? 
DEF_REPEATED_GROUPS_CEIL + : 2*rgi->ceil); + const struct ast_expr **ngroups = f_realloc(env->alloc, + rgi->groups, + nceil * sizeof(ngroups[0])); + if (ngroups == NULL) { + return false; + } + rgi->groups = ngroups; + rgi->ceil = nceil; + } + + LOG(3 - LOG_REPETITION_CASES, + "%s: adding group %u (%p) to outermost_ancestor %p\n", + __func__, id, (void *)expr, + (void *)rgi->outermost_ancestor); + rgi->groups[rgi->count] = expr; + rgi->count++; + } + } else { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos_start = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos_start]; + op->t = CAPVM_OP_SAVE; /* save capture start */ + op->u.save = 2*id; + + if (!capvm_compile_iter(env, p, expr->u.group.e)) { return false; } + + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos_end = reserve_program_opcode(p); + op = &p->ops[pos_end]; + op->t = CAPVM_OP_SAVE; /* save capture end */ + op->u.save = 2*id + 1; + } + + if (id > env->max_capture_seen || env->max_capture_seen == NO_CAPTURE_ID) { + env->max_capture_seen = id; + } + + break; + } + + case AST_EXPR_ANCHOR: + { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos]; + op->t = CAPVM_OP_ANCHOR; + op->u.anchor = (expr->u.anchor.type == AST_ANCHOR_START + ? CAPVM_ANCHOR_START : CAPVM_ANCHOR_END); + break; + } + case AST_EXPR_SUBTRACT: + { + uint64_t cc[4]; + for (size_t i = 0; i < 4; i++) { + cc[i] = ~(uint64_t)0; + } + if (subtree_represents_character_class(expr, cc)) { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos = reserve_program_opcode(p); + struct capvm_opcode *op_cc = &p->ops[pos]; + op_cc->t = CAPVM_OP_CHARCLASS; + + if (expr->re_flags & RE_ICASE) { + make_charclass_case_insensitive(cc); + } + + if (!intern_char_class(env, p, cc, &op_cc->u.charclass_id)) { + return false; + } + } else { + /* FIXME: should return UNSUPPORTED */ + assert(!"unreachable"); + return false; + } + break; + } + case AST_EXPR_RANGE: + { + uint64_t cc[4] = { 0 }; + if (!subtree_represents_character_class(expr, cc)) { + dump_endpoint(&expr->u.range.from); + dump_pos(&expr->u.range.start); + dump_endpoint(&expr->u.range.to); + dump_pos(&expr->u.range.end); + assert(!"unreachable"); + return false; + } + + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos]; + + op->t = CAPVM_OP_CHARCLASS; + if (expr->re_flags & RE_ICASE) { + make_charclass_case_insensitive(cc); + } + + if (!intern_char_class(env, p, cc, &op->u.charclass_id)) { + return false; + } + break; + } + default: + assert(!"matchfail"); + } + + return true; +} + +static bool +compile_kleene_star(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr) +{ + /* Note: min count may be > 0 because this is also + * used for unbounded repetition with a lower count, + * as in `a{3,}`, but in that case the {min} + * repetitions have already been handled by the caller. 
*/ + assert(expr && expr->type == AST_EXPR_REPEAT && + expr->u.repeat.max == AST_COUNT_UNBOUNDED); + + /* l1: split l2, l3 + * l2: + * jmp_once l1 OR jmp l1 + * l3: */ + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + + const uint32_t pos_l1 = reserve_program_opcode(p); + const uint32_t pos_l2 = get_program_offset(p); + + struct capvm_opcode *op_split = &p->ops[pos_l1]; + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.cont = PENDING_OFFSET_REPEAT_OPTIONAL_CONT; + op_split->u.split.new = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + + if (!capvm_compile_iter(env, p, expr->u.repeat.e)) { return false; } + + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + + /* It's more expensive to always emit JMP_ONCE because it + * extends the path each iteration, so we could detect when + * it would be safe to use a JMP instead. */ + if (can_safely_skip_JMP_ONCE(expr)) { + const uint32_t pos_jmp = reserve_program_opcode(p); + struct capvm_opcode *op_jmp = &p->ops[pos_jmp]; + op_jmp->t = CAPVM_OP_JMP; + op_jmp->u.jmp = pos_l1; + } else { + const uint32_t pos_jmp_once = reserve_program_opcode(p); + struct capvm_opcode *op_jmp_once = &p->ops[pos_jmp_once]; + op_jmp_once->t = CAPVM_OP_JMP_ONCE; + op_jmp_once->u.jmp_once = pos_l1; + } + + const uint32_t pos_l3 = get_program_offset(p); + op_split = &p->ops[pos_l1]; /* refresh pointer */ + op_split->u.split.cont = pos_l2; + op_split->u.split.new = pos_l3; + return true; +} + +static bool +emit_repeated_groups(struct capvm_compile_env *env, struct capvm_program *p) +{ + struct repeated_group_info *rgi = env->repeated_groups; + for (size_t i = 0; i < rgi->count; i++) { + const struct ast_expr *group = rgi->groups[i]; + assert(group->u.group.repeated); + const unsigned id = group->u.group.id; + LOG(3 - LOG_REPETITION_CASES, + "%s: checking %zu/%zu: group_id %u\n", + __func__, i, rgi->count, id); + + if (group->flags & (AST_FLAG_ANCHORED_START | AST_FLAG_ANCHORED_END)) { + /* if the otherwise empty group contains any anchors, + * then emit a subtree like (^)? so that its capture + * is only set when the anchors would match. 
*/
+			if (!ensure_program_capacity(env->alloc, p, 6)) {
+				return false;
+			}
+
+			/* split l1, l2
+			 * l1:
+			 *
+			 * l2: save (start)
+			 *     save (end)
+			 */
+			const uint32_t pos_split = reserve_program_opcode(p);
+			const uint32_t pos_l1 = get_program_offset(p);
+
+			struct capvm_opcode *op_split = &p->ops[pos_split];
+			op_split->t = CAPVM_OP_SPLIT;
+			op_split->u.split.cont = pos_l1;
+			op_split->u.split.new = PENDING_OFFSET_REPEAT_OPTIONAL_NEW;
+
+			if (group->flags & AST_FLAG_ANCHORED_START) {
+				const uint32_t pos_start = reserve_program_opcode(p);
+				struct capvm_opcode *op = &p->ops[pos_start];
+				op->t = CAPVM_OP_ANCHOR;
+				op->u.anchor = CAPVM_ANCHOR_START;
+			}
+
+			if (group->flags & AST_FLAG_ANCHORED_END) {
+				const uint32_t pos_end = reserve_program_opcode(p);
+				struct capvm_opcode *op = &p->ops[pos_end];
+				op->t = CAPVM_OP_ANCHOR;
+				op->u.anchor = CAPVM_ANCHOR_END;
+			}
+
+			const uint32_t pos_start = reserve_program_opcode(p);
+			struct capvm_opcode *op = &p->ops[pos_start];
+			op->t = CAPVM_OP_SAVE;	/* save capture start */
+			op->u.save = 2*id;
+
+			const uint32_t pos_end = reserve_program_opcode(p);
+			op = &p->ops[pos_end];
+			op->t = CAPVM_OP_SAVE;	/* save capture end */
+			op->u.save = 2*id + 1;
+
+			const uint32_t after_expr = get_program_offset(p);
+			op_split = &p->ops[pos_split];	/* refresh pointer */
+			op_split->u.split.new = after_expr;
+		} else {
+			/* simple case, emit SAVE pair */
+			if (!ensure_program_capacity(env->alloc, p, 2)) {
+				return false;
+			}
+			const uint32_t pos_start = reserve_program_opcode(p);
+			struct capvm_opcode *op = &p->ops[pos_start];
+			op->t = CAPVM_OP_SAVE;	/* save capture start */
+			op->u.save = 2*id;
+
+			const uint32_t pos_end = reserve_program_opcode(p);
+			op = &p->ops[pos_end];
+			op->t = CAPVM_OP_SAVE;	/* save capture end */
+			op->u.save = 2*id + 1;
+		}
+	}
+
+	/* clear, because an ALT's subtrees can have distinct repeated groups */
+	rgi->count = 0;
+
+	return true;
+}
+
+static void
+backpatch_repeated_nullable_alt_split(struct capvm_compile_env *env,
+    const struct ast_expr *expr, struct capvm_program *p, unsigned split_new_dst)
+{
+	struct repeated_alt_backpatch_info *rabi = env->repeated_alt_backpatches;
+	assert(rabi != NULL && rabi->repeat == expr);
+
+	for (size_t op_i = 0; op_i < rabi->used; op_i++) {
+		const unsigned offset = rabi->opcode_offsets[op_i];
+		assert(offset < p->used);
+		LOG(3 - LOG_REPETITION_CASES,
+		    "%s: backpatching SPLIT instruction %u's .new to %u\n",
+		    __func__, offset, split_new_dst);
+		struct capvm_opcode *op = &p->ops[offset];
+		assert(op->t == CAPVM_OP_SPLIT);
+		assert(op->u.split.new == PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS);
+		op->u.split.new = split_new_dst;
+	}
+
+	rabi->used = 0;
+}
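+
+/* Illustrative sketch of the + / nullable-ALT interaction handled
+ * above: for /^(?:($|x))+$/ the nullable '($)' case ends in
+ *
+ *   split cont=<pos_after_all>, new=<AFTER_REPEAT_PLUS placeholder>
+ *
+ * instead of a plain jmp; the enclosing REPEAT later calls
+ * backpatch_repeated_nullable_alt_split() with the offset just past
+ * its own backwards SPLIT, so execution can leave the loop without
+ * re-running code already executed at the current input position. */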
+static bool
+capvm_compile_iter_save_groups_in_skipped_subtree(struct capvm_compile_env *env,
+    struct capvm_program *p, const struct ast_expr *expr)
+{
+	/* Follow the subtree as far as any expressions that could
+	 * contain GROUPs. Emit any empty groups. This is necessary for
+	 * regexes like /()*^/ and /(x|(x|))^/ whose subtrees are
+	 * otherwise pruned but would still match the empty string
+	 * before ^. */
+	switch (expr->type) {
+	case AST_EXPR_EMPTY:
+	case AST_EXPR_LITERAL:
+	case AST_EXPR_CODEPOINT:
+	case AST_EXPR_ANCHOR:
+	case AST_EXPR_SUBTRACT:
+	case AST_EXPR_RANGE:
+	case AST_EXPR_TOMBSTONE:
+		/* none of these can contain groups */
+		break;
+
+	case AST_EXPR_CONCAT:
+		if (expr->flags & AST_FLAG_UNSATISFIABLE) {
+			return true;	/* skip */
+		}
+		for (size_t i = 0; i < expr->u.concat.count; i++) {
+			if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, expr->u.concat.n[i])) {
+				return false;
+			}
+		}
+		break;
+	case AST_EXPR_ALT:
+		for (size_t i = 0; i < expr->u.alt.count; i++) {
+			if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, expr->u.alt.n[i])) {
+				return false;
+			}
+		}
+		break;
+
+	case AST_EXPR_REPEAT:
+		return capvm_compile_iter_save_groups_in_skipped_subtree(env, p, expr->u.repeat.e);
+
+	case AST_EXPR_GROUP:
+	{
+		const uint32_t id = expr->u.group.id;
+		LOG(5, "%s: recording otherwise skipped group %u\n", __func__, id);
+
+		if (!ensure_program_capacity(env->alloc, p, 2)) {
+			return false;
+		}
+
+		if (id > env->max_capture_seen || env->max_capture_seen == NO_CAPTURE_ID) {
+			env->max_capture_seen = id;
+		}
+
+		const uint32_t pos_start = reserve_program_opcode(p);
+		struct capvm_opcode *op = &p->ops[pos_start];
+		op->t = CAPVM_OP_SAVE;	/* save capture start */
+		op->u.save = 2*id;
+
+		const uint32_t pos_end = reserve_program_opcode(p);
+		op = &p->ops[pos_end];
+		op->t = CAPVM_OP_SAVE;	/* save capture end */
+		op->u.save = 2*id + 1;
+
+		if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, expr->u.group.e)) {
+			return false;
+		}
+
+		break;
+	}
+	default:
+		assert(!"match fail");
+	}
+	return true;
+}
+
+static enum re_capvm_compile_ast_res
+capvm_compile(struct capvm_compile_env *env,
+    const struct ast *ast)
+{
+	struct capvm_program *p = f_calloc(env->alloc, 1, sizeof(*p));
+	if (p == NULL) {
+		return RE_CAPVM_COMPILE_AST_ERROR_ALLOC;
+	}
+
+	LOG(3, "%s: has_unanchored: start? %d, end? %d\n", __func__,
+	    ast->has_unanchored_start,
+	    ast->has_unanchored_end);
+
+	/* If the regex has an unanchored start, it gets a `.*` prefix,
+	 * but with the labels swapped so that the unanchored start
+	 * loop is NOT greedy. */
+	if (ast->has_unanchored_start) {
+		if (!ensure_program_capacity(env->alloc, p, 4)) {
+			goto cleanup;
+		}
+
+		/* l1: split l3, l2
+		 * l2: .
+		 *     jmp l1
+		 * l3: */
+		const uint32_t l1 = get_program_offset(p);
+		const uint32_t split_pos = reserve_program_opcode(p);
+		struct capvm_opcode *op_split = &p->ops[split_pos];
+
+		const uint32_t l2 = get_program_offset(p);
+		const uint32_t op_cc_pos = reserve_program_opcode(p);
+		struct capvm_opcode *op_cc = &p->ops[op_cc_pos];
+
+		const uint32_t op_jmp_pos = reserve_program_opcode(p);
+		struct capvm_opcode *op_jmp = &p->ops[op_jmp_pos];
+
+		const uint32_t l3 = get_program_offset(p);
+
+		op_split->t = CAPVM_OP_SPLIT;
+		op_split->u.split.cont = l3;	/* greedy */
+		op_split->u.split.new = l2;	/* non-greedy */
+
+		op_cc->t = CAPVM_OP_CHARCLASS;
+		uint64_t any[4];
+		for (size_t i = 0; i < 4; i++) {
+			any[i] = ~(uint64_t)0;
+		}
+		if (!intern_char_class(env, p, any, &op_cc->u.charclass_id)) {
+			goto cleanup;
+		}
+
+		op_jmp->t = CAPVM_OP_JMP;
+		op_jmp->u.jmp = l1;
+	}
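+	/* Illustrative: for an unanchored /abc/ the prefix emitted
+	 * above looks like
+	 *
+	 *   0: split cont=3, new=1   ; prefer leaving the loop
+	 *   1: charclass any
+	 *   2: jmp 0
+	 *   3: ...                   ; start of the compiled AST
+	 *
+	 * so the VM consumes input lazily before attempting a match. */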
+	/* Compile the regex AST, assuming match group 0 is
+	 * explicitly represented. */
+	if (!capvm_compile_iter(env, p, ast->expr)) {
+		goto cleanup;
+	}
+
+	/* Add the unanchored end loop, outside of match group 0 */
+	if (ast->has_unanchored_end) {
+		if (!ensure_program_capacity(env->alloc, p, 4)) {
+			goto cleanup;
+		}
+
+		/* l1: split l3, l2
+		 * l2: .
+		 *     jmp l1
+		 * l3: */
+		const uint32_t l1 = reserve_program_opcode(p);
+		const uint32_t l2 = reserve_program_opcode(p);
+		const uint32_t l_jmp = reserve_program_opcode(p);
+		const uint32_t l3 = get_program_offset(p);
+
+		struct capvm_opcode *op_split = &p->ops[l1];
+		struct capvm_opcode *op_any = &p->ops[l2];
+		struct capvm_opcode *op_jmp = &p->ops[l_jmp];
+
+		op_split->t = CAPVM_OP_SPLIT;
+		op_split->u.split.cont = l3;	/* greedy */
+		op_split->u.split.new = l2;	/* non-greedy */
+
+		op_any->t = CAPVM_OP_CHARCLASS;
+		uint64_t any[4];
+		for (size_t i = 0; i < 4; i++) {
+			any[i] = ~(uint64_t)0;
+		}
+		if (!intern_char_class(env, p, any, &op_any->u.charclass_id)) {
+			goto cleanup;
+		}
+
+		op_jmp->t = CAPVM_OP_JMP;
+		op_jmp->u.jmp = l1;
+	}
+
+	/* add MATCH opcode at end */
+	if (!ensure_program_capacity(env->alloc, p, 1)) {
+		goto cleanup;
+	}
+	const uint32_t pos_m = reserve_program_opcode(p);
+	struct capvm_opcode *op_m = &p->ops[pos_m];
+	op_m->t = CAPVM_OP_MATCH;
+
+	/* TODO: populate info about max threads, etc. in p,
+	 * because it should be possible to calculate runtime
+	 * memory limits at compile time. */
+	env->program = p;
+	p->capture_count = (env->max_capture_seen == NO_CAPTURE_ID
+	    ? 0 : env->max_capture_seen + 1);
+
+	if (LOG_CAPVM > 2) {
+		LOG(0, "====\n");
+		fsm_capvm_program_dump(stderr, p);
+		LOG(0, "====\n");
+	}
+
+	/* TODO: it may be worth exposing these static checks as
+	 * something the caller can run at load-time */
+	check_program_for_invalid_labels(p);
+
+	return RE_CAPVM_COMPILE_AST_OK;
+
+cleanup:
+	fsm_capvm_program_free(env->alloc, p);
+	return RE_CAPVM_COMPILE_AST_ERROR_ALLOC;
+}
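+
+/* A minimal usage sketch (hypothetical caller; NULL alloc selects the
+ * default allocator, flags 0, error handling elided):
+ *
+ *   struct capvm_program *prog = NULL;
+ *   if (re_capvm_compile_ast(NULL, ast, (enum re_flags)0, &prog)
+ *       == RE_CAPVM_COMPILE_AST_OK) {
+ *           fsm_capvm_program_dump(stderr, prog);
+ *           fsm_capvm_program_free(NULL, prog);
+ *   }
+ */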
+
+#define DUMP_AST 0
+#define DUMP_RESULT 0	/* should be 0 in production */
+
+#if DUMP_AST || DUMP_RESULT
+#include
+#include "print.h"
+static struct fsm_options opt = { .group_edges = 1 };
+
+static unsigned
+get_max_capture_id(const struct capvm_program *program)
+{
+	assert(program != NULL);
+	return (program->capture_count == 0
+	    ? 0
+	    : program->capture_base + program->capture_count - 1);
+}
+
+#endif
+
+enum re_capvm_compile_ast_res
+re_capvm_compile_ast(const struct fsm_alloc *alloc,
+    const struct ast *ast,
+    enum re_flags re_flags,
+    struct capvm_program **program)
+{
+#if DUMP_AST
+	if (LOG_CAPVM > 2) {
+		ast_print_dot(stderr, &opt, re_flags, ast);
+		ast_print_tree(stderr, &opt, re_flags, ast);
+	}
+#endif
+
+	struct capvm_compile_env env = {
+		.alloc = alloc,
+		.re_flags = re_flags,
+		.max_capture_seen = NO_CAPTURE_ID,
+	};
+
+	enum re_capvm_compile_ast_res res;
+	res = capvm_compile(&env, ast);
+
+	struct repeated_group_info *rgi = env.repeated_groups;
+	while (rgi != NULL) {
+		struct repeated_group_info *prev = rgi->prev;
+		LOG(3 - LOG_REPETITION_CASES,
+		    "%s: rgi cleanup, freeing %p, prev %p\n",
+		    __func__, (void *)rgi, (void *)prev);
+
+		if (rgi->groups != NULL) {
+			f_free(alloc, rgi->groups);
+		}
+		f_free(alloc, rgi);
+		rgi = prev;
+	}
+
+	if (res == RE_CAPVM_COMPILE_AST_OK) {
+#if DUMP_RESULT > 0
+		if (DUMP_RESULT > 1 || getenv("DUMP")) {
+			ast_print_tree(stderr, &opt, re_flags, ast);
+			fsm_capvm_program_dump(stderr, env.program);
+			fprintf(stderr, "%s: max_capture_id %u\n", __func__,
+			    get_max_capture_id(env.program));
+		}
+#endif
+
+		*program = env.program;
+	}
+
+	f_free(alloc, env.charclass_htab.buckets);
+
+	return res;
+}
diff --git a/src/libre/re_capvm_compile.h b/src/libre/re_capvm_compile.h
new file mode 100644
index 000000000..b98ac9196
--- /dev/null
+++ b/src/libre/re_capvm_compile.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2022 Scott Vokes
+ *
+ * See LICENCE for the full copyright terms.
+ */
+
+#ifndef RE_CAPVM_COMPILE_H
+#define RE_CAPVM_COMPILE_H
+
+/* The part of the capture VM interface that belongs in
+ * libre rather than libfsm, mostly related to compiling
+ * a libre AST into a capvm_program.
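+ *
+ * Expected flow (sketch): parse a regex into an ast with one of the
+ * libre dialect parsers, run the analysis passes, call
+ * re_capvm_compile_ast(), then hand the resulting program to the
+ * libfsm side (e.g. fsm_capture_add_program() in ast_compile.c's
+ * compile_capvm_program_for_stack_end_states()).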
*/ + +#include + +#include "ast.h" +#include + +struct capvm_program; + +enum re_capvm_compile_ast_res { + RE_CAPVM_COMPILE_AST_OK, + RE_CAPVM_COMPILE_AST_ERROR_ALLOC = -1, +}; + +enum re_capvm_compile_ast_res +re_capvm_compile_ast(const struct fsm_alloc *alloc, + const struct ast *ast, + enum re_flags re_flags, + struct capvm_program **program); + +#endif diff --git a/src/libre/strerror.c b/src/libre/strerror.c index 009d61df2..d66e750a4 100644 --- a/src/libre/strerror.c +++ b/src/libre/strerror.c @@ -20,6 +20,8 @@ re_strerror(enum re_errno e) case RE_EERRNO: return strerror(errno); case RE_EBADDIALECT: return "Bad dialect"; case RE_EBADGROUP: return "Bad group"; + case RE_EUNSUPCAPTUR: return "Cannot support captures in this case"; + case RE_EUNSUPPPCRE: return "Unsupported PCRE edge case"; case RE_ENEGRANGE: return "Negative group range"; case RE_ENEGCOUNT: return "Negative count range"; diff --git a/src/lx/parser.act b/src/lx/parser.act index 78254e123..66d0591f0 100644 --- a/src/lx/parser.act +++ b/src/lx/parser.act @@ -388,7 +388,7 @@ assert(@a != NULL); assert(@a->p != NULL); - @r = re_comp(RE_NATIVE, act_agetc, @a, act_state->opt, @f, &err); + @r = re_comp(RE_NATIVE, act_agetc, @a, act_state->opt, @f | RE_NOCAPTURE, &err); if (@r == NULL) { assert(err.e != RE_EBADDIALECT); /* TODO: pass filename for .lx source */ diff --git a/src/re/main.c b/src/re/main.c index c83c4fc66..226267196 100644 --- a/src/re/main.c +++ b/src/re/main.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -48,6 +49,10 @@ * TODO: flags; -r for RE_REVERSE, etc */ +static int +exec_with_captures(struct fsm *fsm, + int (*fsm_getc)(void *opaque), void *opaque, fsm_state_t *end); + struct match { fsm_end_id_t i; const char *s; @@ -642,6 +647,7 @@ main(int argc, char *argv[]) int patterns; int ambig; int makevm; + int resolve_captures; size_t generate_bounds = 0; struct fsm_dfavm *vm; @@ -665,6 +671,7 @@ main(int argc, char *argv[]) patterns = 0; ambig = 0; makevm = 0; + resolve_captures = 0; print_fsm = NULL; print_ast = NULL; query = NULL; @@ -675,7 +682,7 @@ main(int argc, char *argv[]) { int c; - while (c = getopt(argc, argv, "h" "acCwXe:E:G:k:" "bi" "sq:r:l:F:" "upMmnftxyz"), c != -1) { + while (c = getopt(argc, argv, "h" "acCwXe:E:G:k:" "bi" "sq:r:l:F:" "upMmnRftxyz"), c != -1) { switch (c) { case 'a': opt.anonymous_states = 0; break; case 'c': opt.consolidate_edges = 0; break; @@ -714,6 +721,7 @@ main(int argc, char *argv[]) case 't': isliteral = 1; break; case 'z': patterns = 1; break; case 'M': makevm = 1; break; + case 'R': resolve_captures = 1; break; case 'G': generate_bounds = strtoul(optarg, NULL, 10); @@ -1235,8 +1243,10 @@ main(int argc, char *argv[]) if (vm != NULL) { e = fsm_vm_match_file(vm, f); + } else if (resolve_captures) { + assert(!"todo"); } else { - e = fsm_exec(fsm, fsm_fgetc, f, &state, NULL); + e = fsm_exec(fsm, fsm_fgetc, f, &state); } fclose(f); @@ -1247,8 +1257,10 @@ main(int argc, char *argv[]) if (vm != NULL) { e = fsm_vm_match_buffer(vm, s, strlen(s)); + } else if (resolve_captures) { + e = exec_with_captures(fsm, fsm_sgetc, &s, &state); } else { - e = fsm_exec(fsm, fsm_sgetc, &s, &state, NULL); + e = fsm_exec(fsm, fsm_sgetc, &s, &state); } } @@ -1281,3 +1293,56 @@ main(int argc, char *argv[]) return r; } } + +static int +exec_with_captures(struct fsm *fsm, + int (*fsm_getc)(void *opaque), void *opaque, fsm_state_t *end) +{ + int c; + size_t ceil = 16; + size_t used = 0; + unsigned char *buf = malloc(ceil); + size_t i; + size_t capture_ceil; + struct 
fsm_capture *captures; + int res; + + while (c = fsm_getc(opaque), c != EOF) { + if (used == ceil - 1) { + const size_t nceil = 2*ceil; + unsigned char *nbuf = realloc(buf, nceil); + if (nbuf == NULL) { + free(buf); + return -1; + } + ceil = nceil; + buf = nbuf; + } + buf[used] = c; + used++; + } + buf[used] = '\0'; + + capture_ceil = fsm_capture_ceiling(fsm); + + captures = malloc(capture_ceil * sizeof(captures[0])); + if (captures == NULL) { + free(buf); + return -1; + } + + res = fsm_exec_with_captures(fsm, buf, used, + end, captures, capture_ceil); + if (res == 1) { + for (i = 0; i < capture_ceil; i++) { + printf("-- %zu: %zd,%zd\n", + i, captures[i].pos[0], captures[i].pos[1]); + } + } else { + printf("-- no match\n"); + } + + free(buf); + free(captures); + return res; +} diff --git a/tests/capture/Makefile b/tests/capture/Makefile index 53d63ff2b..bdee42df4 100644 --- a/tests/capture/Makefile +++ b/tests/capture/Makefile @@ -7,18 +7,20 @@ TEST_OUTDIR.tests/capture = ${BUILD}/tests/capture .for n in ${TEST.tests/capture:T:R:C/^capture//} test:: ${TEST_OUTDIR.tests/capture}/res${n} SRC += ${TEST_SRCDIR.tests/capture}/capture${n}.c -CFLAGS.${TEST_SRCDIR.tests/capture}/capture${n}.c = -UNDEBUG +CFLAGS.${TEST_SRCDIR.tests/capture}/capture${n}.c = -UNDEBUG -std=c99 ${TEST_OUTDIR.tests/capture}/run${n}: ${TEST_OUTDIR.tests/capture}/capture${n}.o ${TEST_OUTDIR.tests/capture}/captest.o - ${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/capture}/run${n} ${TEST_OUTDIR.tests/capture}/capture${n}.o ${TEST_OUTDIR.tests/capture}/captest.o ${BUILD}/lib/libfsm.a + ${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/capture}/run${n} ${TEST_OUTDIR.tests/capture}/capture${n}.o ${TEST_OUTDIR.tests/capture}/captest.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a ${TEST_OUTDIR.tests/capture}/res${n}: ${TEST_OUTDIR.tests/capture}/run${n} ( ${TEST_OUTDIR.tests/capture}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/capture}/res${n} -.for lib in ${LIB:Mlibfsm} +.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre} ${TEST_OUTDIR.tests/capture}/run${n}: ${BUILD}/lib/${lib:R}.a .endfor .endfor +CFLAGS.${TEST_SRCDIR.tests/capture}/captest.c += -std=c99 + ${TEST_OUTDIR.tests/capture}/captest.o: tests/capture/captest.c ${CC} ${CFLAGS} -c -o ${TEST_OUTDIR.tests/capture}/captest.o tests/capture/captest.c diff --git a/tests/capture/captest.c b/tests/capture/captest.c index dc66f81d4..bf453e3ad 100644 --- a/tests/capture/captest.c +++ b/tests/capture/captest.c @@ -1,19 +1,19 @@ #include "captest.h" -#include -#include -#include - -#if CAPTEST_RUN_SINGLE_LOG #include -#endif +#include + +#include + +/* for fsm_capvm_program_exec */ +#include "../../src/libfsm/capture_vm.h" -#define FAIL(MSG) \ - fprintf(stderr, "FAIL: %s:%d -- %s\n", \ - __FILE__, __LINE__, MSG); \ - exit(EXIT_FAILURE) +struct captest_input { + const char *string; + size_t pos; +}; -int +static int captest_getc(void *opaque) { struct captest_input *input = opaque; @@ -22,189 +22,483 @@ captest_getc(void *opaque) return res == 0 ? 
EOF : res; } -int -captest_run_single(const struct captest_single_fsm_test_info *info) -{ - size_t i; - struct captest_input input; - fsm_state_t end; - int exec_res; - struct fsm_capture got_captures[MAX_TEST_CAPTURES]; - struct fsm_capture exp_captures[MAX_TEST_CAPTURES]; - size_t capture_count = 0; - struct fsm *fsm = captest_fsm_of_string(info->string, 0); +static struct fsm_options options = { + .group_edges = 1, +}; - input.string = info->string; - input.pos = 0; +#define MAX_INPUT_WITH_NL_LENGTH 1000 +static char +input_with_nl[MAX_INPUT_WITH_NL_LENGTH]; - if (fsm == NULL) { - FAIL("fsm_of_string"); +enum captest_run_case_res +captest_run_case(const struct captest_case_single *testcase, + int verbosity, bool trailing_newline, FILE *prog_output) +{ + bool dump_captures = false; + enum captest_run_case_res res = CAPTEST_RUN_CASE_ERROR; + struct re_err err; + + if (verbosity == DUMP_PROGRAMS_VERBOSITY) { + assert(prog_output != NULL); + } else { + assert(prog_output == NULL); } - for (i = 0; i < MAX_TEST_CAPTURES; i++) { - exp_captures[i].pos[0] = FSM_CAPTURE_NO_POS; - exp_captures[i].pos[1] = FSM_CAPTURE_NO_POS; + if (verbosity > 0) { + printf("/%s/ <- \"%s%s\": ", + testcase->regex, testcase->input, + trailing_newline ? "\\n" : ""); } - for (i = 0; i < MAX_SINGLE_FSM_TEST_PATHS; i++) { - const struct captest_single_fsm_test_path *path = - &info->paths[i]; - if (path->start == 0 && path->end == 0 && i > 0) { - break; /* end of list */ - } + /* build regex */ + const enum re_flags flags = 0; + struct captest_input comp_input = { + .string = testcase->regex, + }; - /* no zero-width captures */ - assert(path->end > path->start); + struct fsm *fsm = re_comp(RE_PCRE, + captest_getc, &comp_input, + &options, flags, &err); - if (!fsm_capture_set_path(fsm, i, - path->start, path->end)) { - fprintf(stderr, - "failed to set capture path %lu\n", i); - FAIL("fsm_capture_set_path"); + if (testcase->match == SHOULD_REJECT_AS_UNSUPPORTED) { + if (fsm != NULL) { + fsm_free(fsm); + return CAPTEST_RUN_CASE_FAIL; } + return CAPTEST_RUN_CASE_PASS; + } - exp_captures[i].pos[0] = path->start; - exp_captures[i].pos[1] = path->end; + assert(fsm != NULL); - capture_count = i + 1; + if (!fsm_determinise(fsm)) { + return CAPTEST_RUN_CASE_ERROR; } - { - const unsigned count = fsm_countcaptures(fsm); - const unsigned expected = capture_count; - if (count != expected) { - fprintf(stderr, "expected %u, got %u\n", - expected, count); - FAIL("countcaptures"); - } + if (!fsm_minimise(fsm)) { + return CAPTEST_RUN_CASE_ERROR; } -#if CAPTEST_RUN_SINGLE_LOG - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, "fsm", fsm); -#endif + if (verbosity > 3) { + fsm_print_fsm(stdout, fsm); + } - exec_res = fsm_exec(fsm, captest_getc, &input, &end, got_captures); - if (exec_res != 1) { FAIL("exec_res"); } - if (end != strlen(info->string)) { FAIL("exec end pos"); } + if (trailing_newline) { + const size_t length = strlen(testcase->input); + assert(length + 1 < MAX_INPUT_WITH_NL_LENGTH); + memcpy(input_with_nl, testcase->input, + length); + input_with_nl[length] = '\n'; + input_with_nl[length + 1] = '\0'; + } - { - fsm_end_id_t id_buf[1] = { ~0 }; - enum fsm_getendids_res gres; - size_t written; - if (1 != fsm_getendidcount(fsm, end)) { - FAIL("did not have exactly one end ID"); + const char *input = trailing_newline + ? 
input_with_nl + : testcase->input; + assert(input != NULL); + const size_t length = strlen(input); + + fsm_state_t end; /* unused but required by API */ + struct fsm_capture capture_buf[MAX_CAPTEST_SINGLE_CAPTURE_PAIRS]; + const size_t capture_buf_length = MAX_CAPTEST_SINGLE_CAPTURE_PAIRS; + + /* Initialize with values that are distinct from FSM_CAPTURE_NO_POS + * and will stand out visually. Should never see these. */ + for (size_t i = 0; i < MAX_CAPTEST_SINGLE_CAPTURE_PAIRS; i++) { + capture_buf[i].pos[0] = 88888888; + capture_buf[i].pos[1] = 99999999; + } + + /* If verbosity is exactly DUMP_PROGRAMS_VERBOSITY, then print out capture info and pass. */ + if (verbosity == DUMP_PROGRAMS_VERBOSITY) { + assert(prog_output != NULL); + if (!trailing_newline) { + const char *match_str = testcase->match == SHOULD_MATCH ? "SHOULD_MATCH" + : testcase->match == SHOULD_NOT_MATCH ? "SHOULD_NOT_MATCH" + : testcase->match == SHOULD_REJECT_AS_UNSUPPORTED ? "SHOULD_REJECT_AS_UNSUPPORTED" + : "ERROR"; + fprintf(prog_output, "regex \"%s\", input \"%s\", match %s, no_nl %d, count %zu:", + testcase->regex, testcase->input, match_str, testcase->no_nl, + testcase->count); + for (size_t i = 0; i < testcase->count; i++) { + fprintf(prog_output, " %zu:[%zd, %zd]", + i, testcase->expected[i].pos[0], testcase->expected[i].pos[1]); + } + fprintf(prog_output, "\n"); + fsm_capture_dump(prog_output, "capture_info", fsm); } + fsm_free(fsm); + return CAPTEST_RUN_CASE_PASS; + } + + /* first, execute with a capture buffer that is one cell too small and check for an error */ + const size_t capture_ceil = fsm_capture_ceiling(fsm); + assert(capture_ceil > 0); + const size_t insufficient_capture_buf_length = capture_ceil - 1; + errno = 0; + int exec_res = fsm_exec_with_captures(fsm, + (const unsigned char *)input, length, &end, capture_buf, insufficient_capture_buf_length); + assert(exec_res == -1); + assert(errno == EINVAL); + errno = 0; + + /* then, execute and check result & captures */ + exec_res = fsm_exec_with_captures(fsm, + (const unsigned char *)input, length, &end, capture_buf, capture_buf_length); + if (exec_res == -1) { + perror("fsm_exec_with_captures"); + return CAPTEST_RUN_CASE_ERROR; + } - gres = fsm_getendids(fsm, end, 1, id_buf, &written); - if (gres != FSM_GETENDIDS_FOUND) { - FAIL("failed to get end IDs"); + if (testcase->match == SHOULD_NOT_MATCH) { /* expect match failure */ + res = (exec_res == 0 + ? 
CAPTEST_RUN_CASE_PASS + : CAPTEST_RUN_CASE_FAIL); + } else if (exec_res == 0) { + res = CAPTEST_RUN_CASE_FAIL; /* didn't match, should have */ + } else { + res = CAPTEST_RUN_CASE_PASS; + if (verbosity > 1) { + dump_captures = true; } - if (0 != id_buf[0]) { - FAIL("failed to get end ID of 0"); + /* check captures against expected */ + for (size_t i = 0; i < testcase->count; i++) { + if (testcase->expected[i].pos[0] != capture_buf[i].pos[0] || + testcase->expected[i].pos[1] != capture_buf[i].pos[1]) { + res = CAPTEST_RUN_CASE_FAIL; + dump_captures = true; + } } } - for (i = 0; i < capture_count; i++) { -#if CAPTEST_RUN_SINGLE_LOG - fprintf(stderr, "captest: capture %lu: exp (%ld, %ld), got (%ld, %ld)\n", - i, exp_captures[i].pos[0], exp_captures[i].pos[1], - got_captures[i].pos[0], got_captures[i].pos[1]); -#endif - if (got_captures[i].pos[0] != exp_captures[i].pos[0]) { - fprintf(stderr, "capture[%lu].pos[0]: exp %lu, got %lu\n", - i, exp_captures[i].pos[0], - got_captures[i].pos[0]); - FAIL("capture mismatch"); + switch (res) { + case CAPTEST_RUN_CASE_PASS: + if (verbosity > 0) { + printf("pass\n"); } - if (got_captures[i].pos[1] != exp_captures[i].pos[1]) { - fprintf(stderr, "capture[%lu].pos[1]: exp %lu, got %lu\n", - i, exp_captures[i].pos[1], - got_captures[i].pos[1]); - FAIL("capture mismatch"); + break; + case CAPTEST_RUN_CASE_FAIL: + if (verbosity == 0) { + printf("/%s/ <- \"%s%s\": FAIL\n", + testcase->regex, testcase->input, + trailing_newline ? "\\n" : ""); + } + if (verbosity > 0) { + printf("FAIL\n"); + } + break; + case CAPTEST_RUN_CASE_ERROR: + printf("ERROR\n"); + break; + } + + if (dump_captures) { + for (size_t i = 0; i < testcase->count; i++) { + printf("exp %zd, %zd, got %zd, %zd%s\n", + testcase->expected[i].pos[0], testcase->expected[i].pos[1], + capture_buf[i].pos[0], capture_buf[i].pos[1], + (testcase->expected[i].pos[0] != capture_buf[i].pos[0] || + testcase->expected[i].pos[1] != capture_buf[i].pos[1]) + ? 
" *" : ""); } } fsm_free(fsm); - return 0; + return res; } -struct fsm * -captest_fsm_of_string(const char *string, unsigned end_id) +enum captest_run_case_res +captest_run_case_multi(const struct captest_case_multi *testcase, + int verbosity, bool trailing_newline, FILE *prog_output, + struct captest_case_multi_result *result) { - struct fsm *fsm = captest_fsm_with_options(); - const size_t length = strlen(string); - size_t i; + struct re_err err; + const enum re_flags flags = 0; - if (fsm == NULL) { - return NULL; + struct captest_case_multi_result ignored_result; + if (result == NULL) { + result = &ignored_result; } + memset(result, 0x00, sizeof(*result)); - if (!fsm_addstate_bulk(fsm, length + 1)) { - goto cleanup; + if (verbosity == DUMP_PROGRAMS_VERBOSITY) { + assert(prog_output != NULL); + } else { + assert(prog_output == NULL); } - fsm_setstart(fsm, 0); - for (i = 0; i < length; i++) { - if (!fsm_addedge_literal(fsm, i, i + 1, string[i])) { + /* build each regex, combining them and keeping track of capture offsets */ + struct fsm *fsms[testcase->regex_count]; + struct fsm_combined_base_pair bases[testcase->regex_count]; + struct fsm *combined_fsm = NULL; + + for (size_t i = 0; i < testcase->regex_count; i++) { + fsms[i] = NULL; + } + + /* compile each individually */ + for (size_t i = 0; i < testcase->regex_count; i++) { + struct captest_input comp_input = { + .string = testcase->regexes[i], + }; + + if (verbosity > 1) { + fprintf(stderr, "%s: compiling \"%s\"\n", + __func__, comp_input.string); + } + + struct fsm *fsm = re_comp(RE_PCRE, + captest_getc, &comp_input, + &options, flags, &err); + assert(fsm != NULL); + + if (!fsm_determinise(fsm)) { goto cleanup; } + + if (!fsm_minimise(fsm)) { + goto cleanup; + } + + if (verbosity > 3) { + char tag_buf[16] = { 0 }; + snprintf(tag_buf, sizeof(tag_buf), "fsm[%zu]", i); + + fprintf(stderr, "==== fsm[%zu]\n", i); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, tag_buf, fsm); + } + + fsms[i] = fsm; + } + + combined_fsm = fsm_union_array(testcase->regex_count, fsms, bases); + assert(combined_fsm != NULL); + if (verbosity > 0) { + fprintf(stderr, "%s: combined_fsm: %d states after fsm_union_array\n", + __func__, fsm_countstates(combined_fsm)); + } + if (verbosity > 1) { + for (size_t i = 0; i < testcase->regex_count; i++) { + fprintf(stderr, "%s: base[%zu]: state %d, capture %u\n", + __func__, i, bases[i].state, bases[i].capture); + } } - fsm_setend(fsm, length, 1); - if (!fsm_setendid(fsm, end_id)) { + if (!fsm_determinise(combined_fsm)) { goto cleanup; } + if (verbosity > 0) { + fprintf(stderr, "%s: combined_fsm: %d states after determinise\n", + __func__, fsm_countstates(combined_fsm)); + } + + if (!fsm_minimise(combined_fsm)) { + goto cleanup; + } + if (verbosity > 0) { + fprintf(stderr, "%s: combined_fsm: %d states after minimise\n", + __func__, fsm_countstates(combined_fsm)); + } + + /* If verbosity is exactly 9, then print out capture info and pass. 
*/ + if (verbosity == DUMP_PROGRAMS_VERBOSITY) { + fsm_capture_dump(prog_output, "capture_info", combined_fsm); + fsm_free(combined_fsm); + return CAPTEST_RUN_CASE_PASS; + } + + if (verbosity > 3) { + fprintf(stderr, "==== combined\n"); + fsm_print_fsm(stderr, combined_fsm); + fsm_capture_dump(stderr, "combined", combined_fsm); + } + + /* for each input, execute and check result */ + const struct multi_case_input_info *info; + for (info = &testcase->inputs[0]; info->input != NULL; info++) { + if (trailing_newline) { + const size_t length = strlen(info->input); + assert(length + 1 < MAX_INPUT_WITH_NL_LENGTH); + memcpy(input_with_nl, info->input, + length); + input_with_nl[length] = '\n'; + input_with_nl[length + 1] = '\0'; + } + + const char *input = trailing_newline + ? input_with_nl + : info->input; + assert(input != NULL); + const size_t length = strlen(input); + + if (verbosity > 1) { + fprintf(stderr, "%s: input: %s\n", __func__, input); + } + + fsm_state_t end; /* unused but required by API */ + struct fsm_capture capture_buf[MAX_CAPTEST_MULTI_CAPTURE_PAIRS]; + const size_t capture_buf_length = MAX_CAPTEST_MULTI_CAPTURE_PAIRS; + for (size_t i = 0; i < capture_buf_length; i++) { + capture_buf[i].pos[0] = (size_t)-2; + capture_buf[i].pos[1] = (size_t)-3; + } + + /* execute and check result & captures */ + int exec_res = fsm_exec_with_captures(combined_fsm, + (const unsigned char *)input, length, &end, capture_buf, capture_buf_length); + if (exec_res == -1) { + perror("fsm_exec_with_captures"); + return CAPTEST_RUN_CASE_ERROR; + } + + /* The .regex field should be in ascending order so we know + * when we've reached the all-0 suffix of expected[]. */ + uint8_t prev_regex = 0; + for (const struct case_multi_expected *exp = &info->expected[0]; + exp->regex >= prev_regex; exp++) { + prev_regex = exp->regex; + bool match = true; + const unsigned capture_base = bases[exp->regex].capture; + const unsigned capture_id = capture_base + exp->capture; + assert(capture_id < MAX_CAPTEST_MULTI_CAPTURE_PAIRS); + const size_t exp_s = exp->pos[0]; + const size_t exp_e = exp->pos[1]; + const size_t got_s = capture_buf[capture_id].pos[0]; + const size_t got_e = capture_buf[capture_id].pos[1]; + if (exp_s == got_s && exp_e == got_e) { + result->pass++; + } else { + match = false; + result->fail++; + } + + if (!match || verbosity > 2) { + fprintf(stderr, "%s: regex %u, capture %u (%u + base %u), exp (%zd, %zd), got (%zd, %zd)%s\n", + __func__, exp->regex, + capture_id, exp->capture, capture_base, + exp_s, exp_e, got_s, got_e, + match ? "" : " *** mismatch ***"); + } + } + } + + fsm_free(combined_fsm); - return fsm; + /* this could populate a result struct so it can pass/fail multiple inputs */ + + return result->fail == 0 + ? 
CAPTEST_RUN_CASE_PASS + : CAPTEST_RUN_CASE_FAIL; cleanup: - fsm_free(fsm); - return NULL; -} + if (combined_fsm != NULL) { + fsm_free(combined_fsm); + } else { + for (size_t i = 0; i < testcase->regex_count; i++) { + if (fsms[i] != NULL) { + fsm_free(fsms[i]); + } + } + } -static struct fsm_options options; + return CAPTEST_RUN_CASE_ERROR; +} -struct fsm * -captest_fsm_with_options(void) +static struct capvm_program * +get_program_copy(const struct captest_case_program *testcase) { - struct fsm *fsm = NULL; + static struct capvm_program prog; + static struct capvm_opcode ops[MAX_PROGRAM_OPS + 1] = { 0 }; + static struct capvm_char_class cc_sets[MAX_PROGRAM_CHAR_CLASSES] = { 0 }; + + memset(&prog, 0x00, sizeof(prog)); + + memcpy(ops, testcase->ops, + MAX_PROGRAM_OPS * sizeof(testcase->ops[0])); + memcpy(cc_sets, testcase->char_class, + MAX_PROGRAM_CHAR_CLASSES * sizeof(testcase->char_class[0])); + + assert(testcase->expected.count < MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS); + prog.capture_count = testcase->expected.count; + prog.capture_base = testcase->expected.base; + + uint32_t max_cc_used = (uint32_t)-1; + + prog.used = MAX_PROGRAM_OPS; + for (size_t i = 0; i < MAX_PROGRAM_OPS; i++) { + const struct capvm_opcode *op = &testcase->ops[i]; + if (op->t == CAPVM_OP_CHAR && op->u.chr == 0x00) { + prog.used = i; + break; + } else if (op->t == CAPVM_OP_CHARCLASS) { + if (max_cc_used == (uint32_t)-1 || op->u.charclass_id > max_cc_used) { + assert(op->u.charclass_id < MAX_PROGRAM_CHAR_CLASSES); + max_cc_used = op->u.charclass_id; + } + } + } + + prog.ceil = MAX_PROGRAM_OPS; + prog.ops = ops; + + prog.char_classes.sets = cc_sets; + prog.char_classes.count = max_cc_used == (uint32_t)-1 ? 0 : max_cc_used + 1; + prog.char_classes.ceil = MAX_PROGRAM_CHAR_CLASSES; - /* We currently don't need to set anything custom on this. 
*/ - fsm = fsm_new(&options); - return fsm; + return &prog; } -int -captest_check_single_end_id(const struct fsm *fsm, fsm_state_t end_state, - unsigned expected_end_id, const char **msg) +enum captest_run_case_res +captest_run_case_program(const struct captest_case_program *testcase, + int verbosity) { - fsm_end_id_t id_buf[1] = { ~0 }; - enum fsm_getendids_res gres; - size_t written; - const char *unused; + (void)verbosity; - if (msg == NULL) { - msg = &unused; + /* copy program */ + const size_t input_length = strlen(testcase->input); + struct fsm_capture capture_buf[MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS]; + const size_t capture_buf_length = MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS; + + /* Initialize with FSM_CAPTURE_NO_POS, as the caller would */ + for (size_t i = 0; i < capture_buf_length; i++) { + capture_buf[i].pos[0] = FSM_CAPTURE_NO_POS; + capture_buf[i].pos[1] = FSM_CAPTURE_NO_POS; } - if (1 != fsm_getendidcount(fsm, end_state)) { - *msg = "did not have exactly one end ID"; - return 0; + struct capvm_program *program = get_program_copy(testcase); + + if (verbosity > 2) { + fsm_capvm_program_dump(stderr, program); } - gres = fsm_getendids(fsm, end_state, 1, id_buf, &written); - if (gres != FSM_GETENDIDS_FOUND) { - *msg = "failed to get end IDs"; - return 0; + fsm_capvm_program_exec(program, (const uint8_t *)testcase->input, input_length, + capture_buf, capture_buf_length); + + bool dump_captures = false; + enum captest_run_case_res res = CAPTEST_RUN_CASE_PASS; + + /* check captures against expected */ + for (size_t i = 0; i < testcase->expected.count; i++) { + if (testcase->expected.captures[i].pos[0] != capture_buf[i].pos[0] || + testcase->expected.captures[i].pos[1] != capture_buf[i].pos[1]) { + res = CAPTEST_RUN_CASE_FAIL; + dump_captures = true; + } } - if (expected_end_id != id_buf[0]) { - *msg = "failed to get expected end ID"; - return 0; + if (dump_captures) { + for (size_t i = 0; i < testcase->expected.count; i++) { + printf("exp %zd, %zd, got %zd, %zd%s\n", + testcase->expected.captures[i].pos[0], + testcase->expected.captures[i].pos[1], + capture_buf[i].pos[0], capture_buf[i].pos[1], + (testcase->expected.captures[i].pos[0] != capture_buf[i].pos[0] || + testcase->expected.captures[i].pos[1] != capture_buf[i].pos[1]) + ? " *" : ""); + } } - return 1; + return res; } diff --git a/tests/capture/captest.h b/tests/capture/captest.h index a9debff6c..53b30cec0 100644 --- a/tests/capture/captest.h +++ b/tests/capture/captest.h @@ -1,56 +1,123 @@ /* - * Copyright 2020 Scott Vokes + * Copyright 2022 Scott Vokes * * See LICENCE for the full copyright terms. */ + #ifndef CAPTEST_H #define CAPTEST_H #include #include #include +#include +#include #include #include #include #include +#include -#define MAX_SINGLE_FSM_TEST_PATHS 8 -#define MAX_TEST_CAPTURES 8 +#include -#define CAPTEST_RUN_SINGLE_LOG 0 +/* for captest_run_case_program, to evaluate handwritten programs */ +#include "../../src/libfsm/capture_vm_program.h" +#include "../../src/libfsm/capture_vm.h" -#ifndef LOG_INTERMEDIATE_FSMS -#define LOG_INTERMEDIATE_FSMS 0 -#endif +#define MAX_CAPTEST_SINGLE_CAPTURE_PAIRS 16 +#define MAX_CAPTEST_MULTI_CAPTURE_PAIRS 16 +#define MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS 16 + +/* position representing no match */ +#define POS_NONE { (size_t)-1, (size_t)-1 } + +/* If verbosity is set to this (with -vvvvvvvvv) then dump all the + * compiled programs to 'prog_output'. 
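+ * (The test runner in capture_test_case_list.c opens a file named
+ * "prog_output" in the current directory when run at this verbosity.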
*/
+#define DUMP_PROGRAMS_VERBOSITY 9

-struct captest_single_fsm_test_info {
- const char *string;
- struct captest_single_fsm_test_path {
- fsm_state_t start;
- fsm_state_t end;
- } paths[MAX_SINGLE_FSM_TEST_PATHS];
+enum captest_match {
+ SHOULD_MATCH = 0, /* implied, set by designated initializer */
+ SHOULD_NOT_MATCH = 1,
+ SHOULD_REJECT_AS_UNSUPPORTED = 2,
+ SHOULD_SKIP = 3,
 };

-struct captest_input {
- const char *string;
- size_t pos;
+struct captest_case_single {
+ const char *regex;
+ const char *input;
+ enum captest_match match;
+ bool no_nl; /* do not retry with trailing newline */
+
+ size_t count;
+ struct fsm_capture expected[MAX_CAPTEST_SINGLE_CAPTURE_PAIRS];
 };

-int
-captest_run_single(const struct captest_single_fsm_test_info *info);
+/* Same as captest_case_single, but these expect multiple (possibly overlapping)
+ * regexes to be combined before checking the match/capture behavior. */
+#define MAX_REGEXES 4
+#define MAX_INPUTS 8
+#define MAX_CAPTEST_MULTI_EXPECTED 8
+struct captest_case_multi {
+ uint8_t regex_count;
+ const char *regexes[MAX_REGEXES];
+ enum captest_match match;
+ bool no_nl;
+
+ struct multi_case_input_info {
+ const char *input; /* first NULL input = end of list */
+ struct case_multi_expected {
+ uint8_t regex; /* expected: ascending order */
+ uint8_t capture; /* 0 is default */
+ size_t pos[2];
+ } expected[MAX_CAPTEST_MULTI_EXPECTED];
+ } inputs[MAX_INPUTS];
+};

-int
-captest_getc(void *opaque);
+struct captest_case_multi_result {
+ size_t pass;
+ size_t fail;
+};
+
+#define MAX_PROGRAM_CHAR_CLASSES 4
+#define MAX_PROGRAM_OPS 32
+struct captest_case_program {
+ const char *input;
+
+ struct capvm_char_class char_class[MAX_PROGRAM_CHAR_CLASSES];
+
+ struct {
+ uint32_t count;
+ uint32_t base;
+ struct fsm_capture captures[MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS];
+ } expected;
+
+ /* terminated by 0'd record, { .t = CAPVM_OP_CHAR, .u.chr = 0x00 } */
+ struct capvm_opcode ops[MAX_PROGRAM_OPS];
+};
+
+enum captest_run_case_res {
+ CAPTEST_RUN_CASE_PASS,
+ CAPTEST_RUN_CASE_FAIL,
+ CAPTEST_RUN_CASE_ERROR,
+};
+enum captest_run_case_res
+captest_run_case(const struct captest_case_single *testcase,
+ int verbosity, bool trailing_newline, FILE *prog_output);

-struct fsm *
-captest_fsm_with_options(void);
+enum captest_run_case_res
+captest_run_case_multi(const struct captest_case_multi *testcase,
+ int verbosity, bool trailing_newline, FILE *prog_output,
+ struct captest_case_multi_result *result);

-struct fsm *
-captest_fsm_of_string(const char *string, unsigned end_id);
+/* This should probably only be used for evaluating specific
+ * hand-written programs for development, because we only care
+ * about supporting the kinds of programs that could be produced
+ * by compiling from valid regexes. In other words, this is not
+ * a stable public interface.
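+ * Cases are supplied as captest_case_program records, with the ops[]
+ * array terminated by a zeroed opcode as described above.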
*/ +enum captest_run_case_res +captest_run_case_program(const struct captest_case_program *testcase, + int verbosity); -int -captest_check_single_end_id(const struct fsm *fsm, fsm_state_t end_state, - unsigned expected_end_id, const char **msg); #endif diff --git a/tests/capture/capture_test_case_list.c b/tests/capture/capture_test_case_list.c new file mode 100644 index 000000000..6a5a36a19 --- /dev/null +++ b/tests/capture/capture_test_case_list.c @@ -0,0 +1,1867 @@ +#include "captest.h" + +#include + +#define NO_POS FSM_CAPTURE_NO_POS + +const struct captest_case_single single_cases[] = { + { + .regex = "^", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$^", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$^", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "()*", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()*", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^$", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^($|($)|(($))|((($))))", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(((($)))|(($))|($)|$)", + .input = "", + .count = 5, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^((((a$)))|((b$))|(c$)|d$)", + .input = "a", + .count = 5, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^((((a$)))|((b$))|(c$)|d$)", + .input = "b", + .count = 7, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^((((b$)))|((b$))|(c$)|d$)", + .input = "b", + .count = 5, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^((x?))*$", + .input = "x", + .count = 3, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^((x?)*)*$", + .input = "", + .count = 3, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^((x?)*)*$", + .input = "xxxxx", + .count = 3, .expected = { + { .pos = {0, 5}, }, + { .pos = {5, 5}, }, + { .pos = {5, 5}, }, + }, + }, + { + .regex = "xx*x", + .input = "xx", + .count = 1, .expected = { + { .pos = {0, 2}, }, + }, + }, + { + .regex = "^(x?)*$", + .input = "xx", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {2, 2}, }, + }, + }, + { + .regex = "^(x?)*$", + .input = "xxx", + .count = 2, .expected = { + { .pos = {0, 3}, }, + { .pos = {3, 3}, }, + }, + }, + { + .regex = "^(x?)+$", + .input = "xx", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {2, 2}, }, + }, + }, + { + .regex = "^(x?)+$", + .input = "xxx", + .count = 2, .expected = { + { .pos = {0, 3}, }, + { .pos = {3, 3}, }, + }, + }, + { + .regex = "^x(z?)*y$", + .input = "xy", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "()|x", + .input = "", + .count = 2, .expected = { + { 
.pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()|x", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "x|()", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x|()", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$|", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = ".|$^", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = ".|$^", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "$^|.", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$^|.", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "$$$^|...", + .input = "xxx", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "x?$x?^x?|x?$x?^x?", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "[^x]", .input = "", + .no_nl = true, + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "[^x]", + .input = "\n", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = ".$()", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = ".$()", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^.$()", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^.$()", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$(x?)(y?)(z?)", + .input = "a", + .count = 4, .expected = { + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = ".$(x?)(y?)(z?)", + .input = "a", + .count = 4, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "[^y]", + .input = "xx", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = ".", + .input = "xx", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "(x)+", + .input = "xxx", + .count = 2, .expected = { + { .pos = {0, 3}, }, + { .pos = {2, 3}, }, + }, + }, + { + .regex = "^(x)*.", + .input = "xx", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^(x)*.", + .input = "xy", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "a.b(c)*", + .input = "axbc", + .count = 2, .expected = { + { .pos = {0, 4}, }, + { .pos = {3, 4}, }, + }, + }, + { + .regex = "^x?^", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^x?^", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$(^)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "($)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "($$$)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$x?^", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$(^)*", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$(^)*", + .input = "x", + .count = 1, .expected = { + { .pos = {1, 1}, }, + 
}, + }, + { + .regex = "$()*", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$()*", + .input = "x", + .count = 2, .expected = { + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^$^", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$^$", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$y?^x*", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "x|$^", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x|$^", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "x|$^$^", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x|$^$^", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$^|x", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "$^|x", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$^$^|x", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "$^$^|x", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^$|.", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x|^$^$", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^$^$|x", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$|^|a$", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "[^a]x", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "[^a]x", + .input = "xx", + .count = 1, .expected = { + { .pos = {0, 2}, }, + }, + }, + { + .regex = "a(b|c$)d", .input = "ac", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "a(^b|c)d", .input = "bd", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "(a|b|)*", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "xx*y$", + .input = "x_xxy", + .count = 1, .expected = { + { .pos = {2, 5}, }, + }, + }, + { + .regex = "(|.$)*", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(.$)*x", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "(.$)*", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^(|.$)*", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(|.$)*$", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "x|y(^)", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "(?:x*.|^$).", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "(?:x|^$)x", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "()+x", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "($$)^", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$($|$a)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(?i)abc$", + .input = "AbC", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(?i)ab(?-i)c$", .input = "AbC", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^(?i)ab(?-i)c$", + .input = "Abc", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(?i)a[b]c$", + .input = "ABC", + 
.count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(?i)a[^b]c$", .input = "ABC", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^(?i)a[bx]c$", + .input = "ABC", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(?i)a[b-c]c$", + .input = "ABC", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "(a()b)+a", + .input = "a!aba", + .count = 3, .expected = { + { .pos = {2, 5}, }, + { .pos = {2, 4}, }, + { .pos = {3, 3}, }, + }, + }, + { + .regex = "^^[^]]", + .input = "\n", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x(x()y)*", + .input = "xxy", + .count = 3, .expected = { + { .pos = {0, 3}, }, + { .pos = {1, 3}, }, + { .pos = {2, 2}, }, + }, + }, + { + .regex = "x(()x)*", + .input = "xx", + .count = 3, .expected = { + { .pos = {0, 2}, }, + { .pos = {1, 2}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "b(x*x*a()*y)*(a)a*", + .input = "ba", + .count = 4, .expected = { + { .pos = {0, 2}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, }, + { .pos = {1, 2}, }, + }, + }, + { + .regex = "a(().x)*ab", + .input = "a.a.aaxab", + .count = 3, .expected = { + { .pos = {4, 9}, }, + { .pos = {5, 7}, }, + { .pos = {5, 5}, }, + }, + }, + { + .regex = "ab(b()*()*)*()*z", + .input = "a!abz", + .count = 5, .expected = { + { .pos = {2, 5}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, }, + { .pos = {4, 4}, }, + }, + }, + { + .regex = "^x(y?z*)*$", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^(y?z*)*$", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(x|$x?)*$", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(^|$x)*$", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "((x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*)*y$", + .input = "xxxxxxxxxxy", + .count = 15, .expected = { + { .pos = {0, 11}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + }, + }, + { + .regex = "^a$", + .input = "a", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^a(bcd)e$", + .input = "abcde", + .count = 2, .expected = { + { .pos = {0, 5}, }, + { .pos = {1, 4}, }, + }, + }, + { + .regex = "^(a(b((c))(d)))$", + .input = "abcd", + .count = 6, .expected = { + { .pos = {0, 4}, }, + { .pos = {0, 4}, }, + { .pos = {1, 4}, }, + { .pos = {2, 3}, }, + { .pos = {2, 3}, }, + { .pos = {3, 4}, }, + }, + }, + { + .regex = "^(a(b(c)))$", + .input = "abc", + .count = 4, .expected = { + { .pos = {0, 3}, }, + { .pos = {0, 3}, }, + { .pos = {1, 3}, }, + { .pos = {2, 3}, }, + }, + }, + { + .regex = "^a(b*)(c)$", + .input = "ac", + .count = 3, .expected = { + { .pos = {0, 2}, }, + { .pos = {1, 1}, }, + { .pos = {1, 2}, }, + }, + }, + { + .regex = "^a(b*)(c)$", + .input = "abc", + .count = 3, .expected = { + { .pos = {0, 3}, }, + { .pos = {1, 2}, }, + { .pos = {2, 3}, }, + }, + }, + { + .regex = "^a(b*)(c)$", + .input = "abbc", + .count = 3, .expected = { + { .pos = {0, 4}, }, + { .pos = {1, 3}, }, + { .pos = {3, 4}, }, + }, + }, + 
{ + .regex = "^(ab*c)$", + .input = "ac", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 2}, }, + }, + }, + { + .regex = "^(ab*c)$", + .input = "abc", + .count = 2, .expected = { + { .pos = {0, 3}, }, + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(ab*c)$", + .input = "abbc", + .count = 2, .expected = { + { .pos = {0, 4}, }, + { .pos = {0, 4}, }, + }, + }, + { + .regex = "^(a*)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(a*)", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(a*)", + .input = "a", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^(a*)", + .input = "ax", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^a*", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^a*", + .input = "a", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^a*", + .input = "ax", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = ".|", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()*^", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(((())))*^", + .input = "", + .count = 5, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + + { + .regex = "(x|(x|))^", + .input = "", + .count = 3, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = ".*(x|())^", + .input = "", + .count = 3, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(()|(()|x)^|x)^", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + + { + .regex = "x^()()|()", + .input = "", + .count = 4, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "y^()|()^x", + .input = "x", + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()$a|()", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()$z|(x)$", + .input = "x", + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 1}, }, + }, + }, + + { + /* long enough to exercise the USE_COLLAPSED_ZERO_PREFIX optimization */ + .regex = "a*(ba*)c$", + .input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaac", + .count = 2, + .expected = { + { .pos = {101, 303}, }, + { .pos = {201, 302}, }, + }, + }, + + /* regression: losing the first character on the transition from + * the unanchored start loop to the capture */ + { + .regex = "aa+b$", + .input = "aXaXaaab", + .count = 1, + .expected = { + { .pos = {4, 8}, }, + }, + }, + { + .regex = "aa*b$", + .input = "aXaXaaab", + .count = 1, + .expected = { + { .pos = {4, 8}, }, + }, + }, + { + .regex = 
"!!!+$", + .input = "!\"!\"!\"!!!!", + .count = 1, + .expected = { + { .pos = {6, 10}, }, + }, + }, + + /* new fuzzer regressions */ + { + /* PCRE does not set the first capture, which is unsatisfiable */ + .regex = "^(.^)*^(a*)", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + /* similar to the previous case, but with different anchoring */ + .regex = "(a)*(^)*^", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(.a)*^(.a)", + .input = "!a", + .count = 3, + .expected = { + { .pos = {0, 2}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 2}, }, + }, + }, + { + .regex = "(A)*^()*^", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + + { + .regex = "(a(b*)*|)*bc", + .input = "b!bc", + .count = 3, + .expected = { + { .pos = {2, 4}, }, + { .pos = {2, 2}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "^(a(b*)*|)*bc$", + .input = "bc", + .count = 3, + .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "(|a((b*)*b*))*", + .input = "", + .count = 4, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + /* simplified version of the above */ + .regex = "^(|a(b*)*)*$", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + /* zero repetitions should not set the capture */ + .regex = "^(a)*$", + .input = "", + .count = 2, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "^(a)*(^)$", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + /* raw fuzzer output */ + .regex = "()((()(^|$|$^|^|$|$^^|$|$^|^|$|$^^^^|^|(|)($)|)+|^^|^|(|)($)|)+|)($)()+", + .input = "", + .count = 12, + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(^|())+()", + .input = "", + .count = 4, + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(?:(^|^$)+|)+", + .input = "", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 2, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^((|)($)|)+a$", + .input = "a", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 4, + .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "^(($)|)+a$", + .input = "a", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "^(|(|x))*$", + .input = "x", + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + /* same as the previous but without 
outer capture */ + .regex = "^(?:|(|x))*$", + .input = "x", + .count = 2, + .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "(((($)|)+|)a|)+", + .input = "", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 5, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + + { + .regex = "^(|(|(|x)))*$", + .input = "x", + .count = 4, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + + + { + .regex = "^(?:(?:(x?)^)y?)+$", + .input = "", + .count = 2, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(?:^())+$", + .input = "", + .count = 2, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(?:($|x))+$", + .input = "x", + + .match = SHOULD_REJECT_AS_UNSUPPORTED, + + .count = 2, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^(($)|x)+$", + .input = "x", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^(?:()?^()?)+$", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(?:($|x)())+$", + .input = "x", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + + { + .regex = "()~((|)($)|%)+", + .input = "~%", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 5, + .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 0}, }, + { .pos = {2, 2}, }, + { .pos = {2, 2}, }, + { .pos = {2, 2}, }, + }, + }, + + { + /* (slightly) reduced version of the previous */ + .regex = "^(()($)|x)+$", + .input = "x", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 4, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + + { + .regex = "a|_$[^b]", + .input = "a", + .count = 1, + .expected = { + { .pos = {0, 1}, }, + }, + }, +}; + +const struct captest_case_multi multi_cases[] = { + { + .regex_count = 4, + .regexes = { + "^aa$", /* exactly two 'a's */ + "^a*", /* zero or more 'a's followed by anything */ + "^ab?$", /* 'a' and optionally 'b' */ + "a*$", /* anything ending in zero or more 'a's */ + }, + .inputs = { + { + .input = "", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 0 } }, + { .regex = 2, .pos = POS_NONE }, + { .regex = 3, .pos = { 0, 0 } }, + }, + }, + + { + .input = "a", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 1 } }, + { .regex = 2, .pos = { 0, 1 } }, + { .regex = 3, .pos = { 0, 1 } }, + }, + }, + + { + .input = "aa", + .expected = { + { .regex = 0, .pos = { 0, 2 } }, + { .regex = 1, .pos = { 0, 2 } }, + { .regex = 2, .pos = POS_NONE }, + { .regex = 3, .pos = { 0, 2 } }, + }, + }, + + { + .input = "aaa", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 3 } }, + { .regex = 2, .pos = POS_NONE }, + { .regex = 3, .pos = { 0, 3 } }, + }, + }, + + { + .input = "ba", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 0 } }, + { .regex = 2, .pos = POS_NONE }, + { .regex = 3, .pos = { 1, 2 } }, + }, + }, + + { + .input = "ab", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 1 } }, + { .regex = 
2, .pos = { 0, 2 } }, + { .regex = 3, .pos = { 2, 2 } }, + }, + }, + + { + .input = NULL, + }, + }, + }, + + { + .regex_count = 3, + .regexes = { + "a(b?)*c", + "(ab)(c)", + "ab+(c)", + }, + .inputs = { + { + .input = "", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = POS_NONE }, + { .regex = 1, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 2, .pos = POS_NONE }, + { .regex = 2, .capture = 0, .pos = POS_NONE }, + { .regex = 2, .capture = 1, .pos = POS_NONE }, + }, + }, + { + .input = "abc", + .expected = { + { .regex = 0, .capture = 0, .pos = {0, 3} }, + { .regex = 0, .capture = 1, .pos = {2, 2} }, + { .regex = 1, .capture = 0, .pos = {0, 3} }, + { .regex = 1, .capture = 1, .pos = {0, 2} }, + { .regex = 1, .capture = 2, .pos = {2, 3} }, + { .regex = 2, .capture = 0, .pos = {0, 3} }, + { .regex = 2, .capture = 1, .pos = {2, 3} }, + }, + }, + }, + }, + { + /* fuzzer regression: This led to an execution path in fsm_union_array, + * fsm_union, fsm_merge, merge that did not init or otherwise set the + * `struct fsm_combine_info`, leading to an out of range offset for + * the capture base. */ + .regex_count = 3, + .regexes = { + ".", + ".^", + "^^_", + }, + .inputs = { + { + .input = "", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = POS_NONE }, + { .regex = 2, .pos = POS_NONE }, + }, + }, + { + .input = "_", + .expected = { + { .regex = 0, .pos = { 0, 1 } }, + { .regex = 1, .pos = { 0, 1 } }, + { .regex = 2, .pos = { 0, 1 } }, + }, + }, + }, + }, +}; + + +static struct captest_case_program program_cases[] = { + { + .input = "", + .char_class = { + { .octets = { ~0, ~0, ~0, ~0 }}, /* 0x00 <= x <= 0xff */ + }, + .expected = { + .count = 4, + .captures = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + + .ops = { + { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 3, .new = 1 }}, + { .t = CAPVM_OP_CHARCLASS, .u.charclass_id = 0 }, + { .t = CAPVM_OP_JMP, .u.jmp = 0 }, + { .t = CAPVM_OP_SAVE, .u.save = 0 }, + { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 5, .new = 7 }}, + { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_START }, + + { .t = CAPVM_OP_JMP, .u.jmp = 9 }, /* jump after |() */ + { .t = CAPVM_OP_SAVE, .u.save = 4 }, + { .t = CAPVM_OP_SAVE, .u.save = 5 }, + + { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 4, .new = 10 }}, + + { .t = CAPVM_OP_SAVE, .u.save = 2 }, + { .t = CAPVM_OP_SAVE, .u.save = 3 }, + { .t = CAPVM_OP_SAVE, .u.save = 6 }, + { .t = CAPVM_OP_SAVE, .u.save = 7 }, + { .t = CAPVM_OP_SAVE, .u.save = 1 }, + { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 18, .new = 16 }}, + { .t = CAPVM_OP_CHARCLASS, .u.charclass_id = 0 }, + { .t = CAPVM_OP_JMP, .u.jmp = 15 }, + { .t = CAPVM_OP_MATCH }, + }, + }, + + + { + /* correcting compilation of '^(?:($|x))+$' */ + .input = "x", + .expected = { + .count = 2, + .captures = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + + .ops = { + [0] = { .t = CAPVM_OP_SAVE, .u.save = 0 }, + [1] = { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_START }, + [2] = { .t = CAPVM_OP_SAVE, .u.save = 2 }, + [3] = { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 4, .new = 6 }}, + [4] = { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_END }, + + /* [5] = { .t = CAPVM_OP_JMP, .u.jmp = 7 }, */ + [5] = { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 7, .new = 9 }}, + + [6] = { .t = CAPVM_OP_CHAR, .u.chr = 'x' }, + [7] = { .t = CAPVM_OP_SAVE, .u.save = 3 }, + [8] = { .t = 
CAPVM_OP_SPLIT, .u.split = { .cont = 2, .new = 9 }},
+ [9] = { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_END },
+ [10] = { .t = CAPVM_OP_SAVE, .u.save = 1 },
+ [11] = { .t = CAPVM_OP_MATCH },
+ },
+ },
+};
+
+#define NO_FILTER ((size_t)-1)
+struct options {
+ size_t filter;
+ int verbosity;
+ bool track_timing;
+ FILE *prog_output;
+ enum groups {
+ GROUP_SINGLE = 0x01,
+ GROUP_MULTI = 0x02,
+ GROUP_PROGRAMS = 0x04,
+ GROUP_ALL = 0xff,
+ } group;
+};
+
+static void
+print_usage(FILE *f, const char *progname)
+{
+ fprintf(f, "%s: [-h] [-v] [-s | -m | -p] [-f <id>] [-t]\n", progname);
+ fprintf(f, " -h: print this usage info\n");
+ fprintf(f, " -v: increase verbosity (can repeat: -vvv)\n");
+ fprintf(f, " -f <id>: just run a specific test, by numeric ID\n");
+ fprintf(f, " -s: only single cases\n");
+ fprintf(f, " -m: only multi cases\n");
+ fprintf(f, " -p: only program cases\n");
+ fprintf(f, " -t: print timing info\n");
+}
+
+static void
+get_options(struct options *opt, int argc, char **argv)
+{
+ const char *progname = argv[0];
+ int c;
+ while (c = getopt(argc, argv, "hf:mpstv"), c != -1) {
+ switch (c) {
+ case 'h':
+ print_usage(stdout, progname);
+ exit(EXIT_SUCCESS);
+ break;
+ case 'v':
+ opt->verbosity++;
+ break;
+ case 'f':
+ opt->filter = atol(optarg);
+ break;
+ case 't':
+ opt->track_timing = true;
+ break;
+ case 'p':
+ opt->group = GROUP_PROGRAMS;
+ break;
+ case 's':
+ opt->group = GROUP_SINGLE;
+ break;
+ case 'm':
+ opt->group = GROUP_MULTI;
+ break;
+ case '?':
+ default:
+ print_usage(stderr, progname);
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+int main(int argc, char **argv) {
+ size_t pass = 0;
+ size_t fail = 0;
+ size_t skip = 0;
+ size_t nth = 0;
+
+ struct options options = {
+ .filter = NO_FILTER,
+ .verbosity = 0,
+ .group = GROUP_ALL,
+ };
+ get_options(&options, argc, argv);
+
+ if (options.verbosity == DUMP_PROGRAMS_VERBOSITY) {
+ options.prog_output = fopen("prog_output", "w");
+ assert(options.prog_output != NULL);
+ }
+
+ /* avoid an extra layer of indentation here */
+ if (!(options.group & GROUP_SINGLE)) { goto after_single; }
+
+ printf("-- single cases without trailing newline\n");
+ const size_t single_case_count = sizeof(single_cases)/sizeof(single_cases[0]);
+ for (size_t c_i = 0; c_i < single_case_count; c_i++) {
+ const size_t cur = nth++;
+ if (options.filter != NO_FILTER && options.filter != cur) {
+ continue;
+ }
+
+ if (options.verbosity > 0) {
+ printf("%zu: ", cur);
+ if (options.verbosity > 2) {
+ fflush(stdout);
+ }
+ }
+
+ if (options.verbosity == DUMP_PROGRAMS_VERBOSITY) {
+ fprintf(options.prog_output, "\n\n==== test_case %zu\n", c_i);
+ }
+
+ const struct captest_case_single *t = &single_cases[c_i];
+
+ if (t->match == SHOULD_SKIP) {
+ printf("%zd: SKIP (regex \"%s\", input \"%s\")\n",
+ cur, t->regex, t->input);
+ skip++;
+ continue;
+ }
+
+ enum captest_run_case_res res = captest_run_case(t, options.verbosity, false, options.prog_output);
+
+ switch (res) {
+ case CAPTEST_RUN_CASE_PASS:
+ pass++;
+ break;
+ case CAPTEST_RUN_CASE_FAIL:
+ if (options.verbosity == 0) {
+ printf("-- test case %zd (regex \"%s\", input \"%s\")\n", cur, t->regex, t->input);
+ }
+ fail++;
+ break;
+ case CAPTEST_RUN_CASE_ERROR:
+ assert(!"error");
+ return EXIT_FAILURE;
+ }
+ }
+
+ /* second pass, adding a trailing newline to input */
+ printf("-- single cases with trailing newline\n");
+ for (size_t c_i = 0; c_i < single_case_count; c_i++) {
+ const size_t cur = nth++;
+ if (options.filter != NO_FILTER && options.filter != cur) {
+ continue;
+ }
+
+ const struct 
captest_case_single *t = &single_cases[c_i]; + if (t->no_nl) { continue; } + if (t->match == SHOULD_SKIP) { + printf("%zd: SKIP (regex \"%s\", input \"%s\\n\")\n", + cur, t->regex, t->input); + skip++; + continue; + } + + if (options.verbosity > 0) { + printf("%zu: ", cur); + if (options.verbosity > 2) { + fflush(stdout); + } + } + + enum captest_run_case_res res = captest_run_case(t, options.verbosity, true, options.prog_output); + + switch (res) { + case CAPTEST_RUN_CASE_PASS: + pass++; + break; + case CAPTEST_RUN_CASE_FAIL: + if (options.verbosity == 0) { + printf("-- test case %zd (regex \"%s\", input \"%s\\n\")\n", cur, t->regex, t->input); + } + fail++; + break; + case CAPTEST_RUN_CASE_ERROR: + assert(!"error"); + return EXIT_FAILURE; + } + } +after_single: + + /* multi-regex tests */ + if (!(options.group & GROUP_MULTI)) { goto after_multi; } + + printf("-- multi-regex cases\n"); + const size_t multi_case_count = sizeof(multi_cases)/sizeof(multi_cases[0]); + for (size_t c_i = 0; c_i < multi_case_count; c_i++) { + const size_t cur = nth++; + if ((options.filter != NO_FILTER && options.filter != cur)) { + continue; + } + + const struct captest_case_multi *t = &multi_cases[c_i]; + if (t->match == SHOULD_SKIP) { + printf("%zu: SKIP (multi)\n", c_i); + skip++; + continue; + } + + if (options.verbosity > 0) { + printf("%zu: ", cur); + } + + struct captest_case_multi_result result; + enum captest_run_case_res res = captest_run_case_multi(t, + options.verbosity, false, options.prog_output, &result); + + pass += result.pass; + fail += result.fail; + + switch (res) { + case CAPTEST_RUN_CASE_PASS: + if (options.verbosity > 0) { + printf("pass\n"); + } + break; + case CAPTEST_RUN_CASE_FAIL: + if (options.verbosity > 0) { + printf("FAIL\n"); + } else { + printf("-- test case %zd\n", cur); + } + break; + case CAPTEST_RUN_CASE_ERROR: + assert(!"error"); + return EXIT_FAILURE; + } + } +after_multi: + + /* hardcoded programs */ + if (!(options.group & GROUP_PROGRAMS)) { goto after_programs; } + + const size_t prog_case_count = sizeof(program_cases)/sizeof(program_cases[0]); + for (size_t c_i = 0; c_i < prog_case_count; c_i++) { + const size_t cur = nth++; + if ((options.filter != NO_FILTER && options.filter != cur)) { + continue; + } + + const struct captest_case_program *t = &program_cases[c_i]; + + if (options.verbosity > 0) { + printf("%zu: ", cur); + } + + enum captest_run_case_res res = captest_run_case_program(t, + options.verbosity); + + switch (res) { + case CAPTEST_RUN_CASE_PASS: + if (options.verbosity > 0) { + printf("pass\n"); + } + pass++; + break; + case CAPTEST_RUN_CASE_FAIL: + fail++; + if (options.verbosity > 0) { + printf("FAIL\n"); + } else if (options.verbosity == 0) { + printf("-- test case %zd\n", cur); + } + break; + case CAPTEST_RUN_CASE_ERROR: + assert(!"error"); + return EXIT_FAILURE; + } + } +after_programs: + + printf("-- pass %zu, fail %zu, skip %zu\n", pass, fail, skip); + + return fail > 0 + ? 
EXIT_FAILURE + : EXIT_SUCCESS; +} diff --git a/tests/idmap/idmap_basic.c b/tests/idmap/idmap_basic.c index 19f44d56e..c7a18856b 100644 --- a/tests/idmap/idmap_basic.c +++ b/tests/idmap/idmap_basic.c @@ -19,13 +19,14 @@ #define ID_MASK ((1 << 9) - 1) #define VALUE_MASK ((1 << 10) - 1) -static void +static int dump_cb(fsm_state_t state_id, unsigned value, void *opaque) { /* fprintf(stderr, " -- state %d, value %u\n", state_id, value); */ assert(state_id <= ID_MASK); assert(value <= VALUE_MASK); (void)opaque; + return 1; } static int diff --git a/tests/minimise/minimise_test_case_list.c b/tests/minimise/minimise_test_case_list.c index 1386f0dcc..7299d2fa1 100644 --- a/tests/minimise/minimise_test_case_list.c +++ b/tests/minimise/minimise_test_case_list.c @@ -22,7 +22,6 @@ const char *test_cases[] = { "(?:a+|b)a+", "(?:a*ba)+", "(?:a|cd)+e?x", - "-> 1 'a';", "(?:abc|def)+", "(?:abc|def)*", "(?:b|a*)", @@ -81,7 +80,7 @@ check_minimisation(const char *pattern) .offset = 0 }; - fsm = re_comp(RE_PCRE, scanner_next, &s, &opt, RE_MULTI, &err); + fsm = re_comp(RE_PCRE, scanner_next, &s, &opt, RE_MULTI | RE_NOCAPTURE, &err); assert(fsm != NULL); if (!fsm_determinise(fsm)) { return 0; diff --git a/theft/fuzz_capture_string_set.c b/theft/fuzz_capture_string_set.c index 7326356c2..f225bb326 100644 --- a/theft/fuzz_capture_string_set.c +++ b/theft/fuzz_capture_string_set.c @@ -158,7 +158,7 @@ check_capstring_set(struct capture_env *env, return THEFT_TRIAL_ERROR; } - const size_t capture_count = fsm_countcaptures(dfa); + const size_t capture_count = fsm_capture_ceiling(dfa); if (verbosity > 2) { fprintf(stderr, "==== cs '%s'\n", cs->string); @@ -172,7 +172,7 @@ check_capstring_set(struct capture_env *env, assert(cp != NULL); fsm_copies[cs_i] = cp; - const size_t cp_capture_count = fsm_countcaptures(cp); + const size_t cp_capture_count = fsm_capture_ceiling(cp); if (verbosity > 2) { fprintf(stderr, "==== min(det(cp))\n"); fsm_print_fsm(stderr, cp); @@ -196,7 +196,7 @@ check_capstring_set(struct capture_env *env, return THEFT_TRIAL_FAIL; } - combined_capture_count = fsm_countcaptures(combined); + combined_capture_count = fsm_capture_ceiling(combined); for (size_t cs_i = 0; cs_i < css->count; cs_i++) { total_captures += capture_counts[cs_i]; } @@ -295,7 +295,7 @@ check_fsms_for_single_input(struct check_env *env, struct fsm_capture *captures, assert(exec_res >= 0); if (exec_res == 1) { if (LOG_LEVEL > 0) { - const size_t combined_capture_count = fsm_countcaptures(env->combined); + const size_t combined_capture_count = fsm_capture_ceiling(env->combined); for (size_t i = 0; i < combined_capture_count; i++) { fprintf(stderr, "capture[%zu/%zu]: (%ld, %ld)\n", i, combined_capture_count, @@ -415,7 +415,7 @@ compare_captures(const struct check_env *env, const struct fsm_capture *captures_combined, size_t nth_fsm, const struct fsm_capture *captures) { - const size_t combined_capture_count = fsm_countcaptures(env->combined); + const size_t combined_capture_count = fsm_capture_ceiling(env->combined); if (combined_capture_count == 0) { return true; /* no captures */ } @@ -639,7 +639,7 @@ build_capstring_dfa(const struct capstring *cs, uint8_t end_id) goto cleanup; } - if (fsm_countcaptures(fsm) != cs->capture_count) { + if (fsm_capture_ceiling(fsm) != cs->capture_count) { goto cleanup; } From 31ff23429fa7d1d36067e74322af73546335270e Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 2 Jun 2023 11:41:28 -0400 Subject: [PATCH 08/51] Add pcre-anchor test for anchoring edge case. 
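
The pattern ($x)* can only match by taking zero repetitions of the
group: `$` only matches at the end of the input (or before a final
newline), so a literal `x` can never follow it. A zero-length match is
then possible at any position, which is why the expected FSM accepts
every input.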
---
 tests/pcre-anchor/in81.re | 1 +
 tests/pcre-anchor/out81.fsm | 5 +++++
 2 files changed, 6 insertions(+)
 create mode 100644 tests/pcre-anchor/in81.re
 create mode 100644 tests/pcre-anchor/out81.fsm

diff --git a/tests/pcre-anchor/in81.re b/tests/pcre-anchor/in81.re
new file mode 100644
index 000000000..8b5fad7c3
--- /dev/null
+++ b/tests/pcre-anchor/in81.re
@@ -0,0 +1 @@
+($x)*
\ No newline at end of file
diff --git a/tests/pcre-anchor/out81.fsm b/tests/pcre-anchor/out81.fsm
new file mode 100644
index 000000000..2cdc2f023
--- /dev/null
+++ b/tests/pcre-anchor/out81.fsm
@@ -0,0 +1,5 @@
+0 -> 0 ?;
+0 -> 1 "\n";
+
+start: 0;
+end: 0, 1;
\ No newline at end of file

From 77f9260ba15b1589e288b83b82b8e2dda6c59f5c Mon Sep 17 00:00:00 2001
From: Scott Vokes
Date: Fri, 2 Jun 2023 14:12:15 -0400
Subject: [PATCH 09/51] fuzz/target.c: Add multi-regex and single regex cmp against PCRE.

Add a couple more execution modes to the fuzzer test harness.

In particular, add support for comparing match and capture behavior
against PCRE's -- because this depends on linking with libpcre, it's
disabled by default. Set `PCRE_CMP` in the Makefile to enable it, or
pass it as a build argument.
---
 fuzz/Makefile | 11 +-
 fuzz/target.c | 1206 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 1203 insertions(+), 14 deletions(-)

diff --git a/fuzz/Makefile b/fuzz/Makefile
index 1d418cd97..2327a811b 100644
--- a/fuzz/Makefile
+++ b/fuzz/Makefile
@@ -7,6 +7,15 @@ ${BUILD}/fuzz/: ${BUILD}
 DIR += ${BUILD}/fuzz
 
+# Uncomment to enable capture fuzzing using PCRE as a test oracle.
+#PCRE_CMP=1
+
+.if PCRE_CMP
+PKG += libpcre2-8
+LFLAGS.fuzzer += ${LIBS.libpcre2-8}
+CFLAGS.${SRC:Mfuzz/target.c} += -DCMP_PCRE=1
+.endif
+
 .for src in ${SRC:Mfuzz/*.c}
 CFLAGS.${src} += -std=c99
 .endfor
@@ -15,7 +24,7 @@ CFLAGS.${src} += -std=c99
 fuzz:: ${BUILD}/fuzz/fuzzer
 
 ${BUILD}/fuzz/fuzzer: mkdir
- ${CC} -o $@ ${LFLAGS} ${.ALLSRC:M*.o} ${.ALLSRC:M*.a}
+ ${CC} -o $@ ${LFLAGS} ${LFLAGS.fuzzer} ${.ALLSRC:M*.o} ${.ALLSRC:M*.a}
 
 .for lib in ${LIB:Mlibfsm} ${LIB:Mlibre}
 ${BUILD}/fuzz/fuzzer: ${BUILD}/lib/${lib:R}.a
diff --git a/fuzz/target.c b/fuzz/target.c
index cae283aca..eeda0e9d5 100644
--- a/fuzz/target.c
+++ b/fuzz/target.c
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -21,17 +22,47 @@
 #include "../src/libfsm/minimise_test_oracle.h"
 
+/* for fsm_capture_dump */
+/* FIXME: should this be a public interface? */
+#include "../src/libfsm/capture.h"
+
+/* Buffer for sanitized fuzzer input */
+#define MAX_FUZZER_DATA (64 * 1024)
+static uint8_t data_buf[MAX_FUZZER_DATA + 1];
+
+/* Should fuzzer harness code be built that compares behavior
+ * with PCRE? (Obviously, this depends on PCRE.) */
+#ifndef CMP_PCRE
+#define CMP_PCRE 0
+#endif
+
+#if CMP_PCRE
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
+
+static int
+compare_with_pcre(const char *pattern, struct fsm *fsm);
+#endif
+
 /* 10 seconds */
 #define TIMEOUT_USEC (10ULL * 1000 * 1000)
 
+/* for TRACK_TIMES and EXPENSIVE_CHECKS */
+#include "../src/libfsm/internal.h"
+
 enum run_mode {
- MODE_DEFAULT,
+ MODE_REGEX,
+ MODE_REGEX_SINGLE_ONLY,
+ MODE_REGEX_MULTI_ONLY,
+ MODE_IDEMPOTENT_DET_MIN,
 MODE_SHUFFLE_MINIMISE,
 MODE_ALL_PRINT_FUNCTIONS,
 };
+static size_t
+get_env_config(size_t default_value, const char *env_var_name);

-/* This stuff will already exist elsewhere once other branches are merged. */
+/* TODO: These could be moved to a common file for test utils.
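+ * (Specifically time_get(), time_diff_usec(), and scanner_next() in
+ * the #if 1 block below; other test drivers keep their own copies.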
*/ #if 1 static void time_get(struct timeval *tv) @@ -73,7 +104,110 @@ scanner_next(void *opaque) } #endif -static const struct fsm_options opt; +/* This is used to track allocation during each fuzzer + * run. Note that hwm is not reduced when memory is + * free'd or realloc'd, because the size info is not + * passed to those calls. */ +#define MB(X) ((size_t)X * 1000 * 1000) +#define FH_ALLOCATOR_HWM_LIMIT (MB(50)) +struct fh_allocator_stats { + size_t hwm; /* high water mark */ +}; + +static void +fh_memory_hwm_limit_hook(const char *caller_name) +{ + /* It doesn't really help to exit here because libfuzzer will + * still treat it as a failure, but at least we can print a + * message about hitting the allocator limit and exit so we + * don't need to spend time investigating timeouts or ooms + * that are due to obvious resource exhaustion. */ + fprintf(stderr, "%s: hit FH_ALLOCATOR_HWM_LIMIT (%zu), exiting\n", + caller_name, FH_ALLOCATOR_HWM_LIMIT); + exit(EXIT_SUCCESS); +} + +static void +fh_free(void *opaque, void *p) +{ + (void)opaque; + free(p); +} + +static void * +fh_calloc(void *opaque, size_t n, size_t sz) +{ + struct fh_allocator_stats *stats = opaque; + stats->hwm += sz; + if (stats->hwm > FH_ALLOCATOR_HWM_LIMIT) { + fh_memory_hwm_limit_hook(__func__); + return NULL; + } + + (void)opaque; + return calloc(n, sz); +} + +static void * +fh_malloc(void *opaque, size_t sz) +{ + struct fh_allocator_stats *stats = opaque; + stats->hwm += sz; + if (stats->hwm > FH_ALLOCATOR_HWM_LIMIT) { + fh_memory_hwm_limit_hook(__func__); + return NULL; + } + + return malloc(sz); +} + +static void * +fh_realloc(void *opaque, void *p, size_t sz) +{ + struct fh_allocator_stats *stats = opaque; + stats->hwm += sz; + if (stats->hwm > FH_ALLOCATOR_HWM_LIMIT) { + fh_memory_hwm_limit_hook(__func__); + return NULL; + } + + return realloc(p, sz); +} + +static struct fh_allocator_stats allocator_stats; + +/* fuzzer harness allocators */ +static struct fsm_alloc custom_allocators = { + .free = fh_free, + .calloc = fh_calloc, + .malloc = fh_malloc, + .realloc = fh_realloc, + .opaque = &allocator_stats, +}; + +static const struct fsm_options fsm_options = { + .group_edges = 1, /* make output readable */ + .alloc = &custom_allocators, +}; + +static void +dump_pattern(const char *pattern) +{ + const size_t pattern_length = strlen(pattern); + fprintf(stderr, "-- Pattern: %zu bytes\n", pattern_length); + for (size_t i = 0; i < pattern_length; i++) { + fprintf(stderr, " %02x", (uint8_t)pattern[i]); + if ((i & 31) == 31) { fprintf(stderr, "\n"); } + } + if ((pattern_length & 31) != 31) { + fprintf(stderr, "\n"); + } + for (size_t i = 0; i < pattern_length; i++) { + fprintf(stderr, "%c", isprint(pattern[i]) ? 
pattern[i] : '.'); + if ((i & 63) == 63) { fprintf(stderr, "\n"); } + } + fprintf(stderr, "\n"); +} static struct fsm * build(const char *pattern) @@ -93,7 +227,7 @@ build(const char *pattern) }; time_get(&pre); - fsm = re_comp(RE_PCRE, scanner_next, &s, &opt, RE_MULTI, &err); + fsm = re_comp(RE_PCRE, scanner_next, &s, &fsm_options, RE_MULTI, &err); time_get(&post); delta_usec = time_diff_usec(&pre, &post); total_usec += delta_usec; @@ -119,12 +253,24 @@ build(const char *pattern) total_usec += delta_usec; if (total_usec > TIMEOUT_USEC) { + dump_pattern(pattern); assert(!"timeout"); } return fsm; } +static size_t +get_env_config(size_t default_value, const char *env_var_name) +{ + const char *s = getenv(env_var_name); + if (s == NULL) { + return default_value; + } else { + return strtoul(s, NULL, 10); + } +} + static int codegen(const struct fsm *fsm) { @@ -136,14 +282,46 @@ codegen(const struct fsm *fsm) } static int -build_and_codegen(const char *pattern) +build_and_check_single(const char *pattern) { + const int verbosity = get_env_config(0, "VERBOSITY"); + if (verbosity > 1) { + fprintf(stderr, "pattern: \"%s\"\n", pattern); + } + + INIT_TIMERS(); + TIME(&pre); struct fsm *fsm = build(pattern); if (fsm == NULL) { return EXIT_SUCCESS; } + TIME(&post); + DIFF_MSEC("build", pre, post, NULL); + + if (getenv("DUMP")) { + fprintf(stderr,"==================================================\n"); + fsm_print_fsm(stderr, fsm); + fprintf(stderr,"==================================================\n"); + fsm_capture_dump(stderr, "CAPTURE", fsm); + fprintf(stderr,"==================================================\n"); + } - if (!codegen(fsm)) { +#if CMP_PCRE + TIME(&pre); + const int cmp_res = compare_with_pcre(pattern, fsm); + TIME(&post); + DIFF_MSEC("cmp", pre, post, NULL); + if (!cmp_res) { + fsm_free(fsm); + return EXIT_SUCCESS; + } +#endif + + TIME(&pre); + const int codegen_res = codegen(fsm); + TIME(&post); + DIFF_MSEC("codegen", pre, post, NULL); + if (!codegen_res) { return EXIT_SUCCESS; } @@ -151,6 +329,845 @@ build_and_codegen(const char *pattern) return EXIT_SUCCESS; } +#define DEF_MAX_DEPTH 20 +#define DEF_MAX_LENGTH 10 +#define DEF_MAX_STEPS 10000 +#define DEF_MAX_MATCH_COUNT 1000 + +#if CMP_PCRE +/* These two are only used with PCRE2 */ +#define ANCHORED_PCRE 0 +#define FUZZ_RE_MATCH_LIMIT 10000 +#define FUZZ_RE_RECURSION_LIMIT 200 +#define MAX_OVEC_SIZE 512 + +static pcre2_match_context *pcre2_mc = NULL; + +struct cmp_pcre_env { + int verbosity; + const char *pattern; + const struct fsm *fsm; + pcre2_match_data *md; + pcre2_code *p; + + struct fsm_capture *captures; + size_t captures_length; + + size_t max_depth; + size_t max_steps; + size_t max_match_count; +}; + +struct test_pcre_match_info { + int res; + int pcre_error; + size_t ovector[MAX_OVEC_SIZE]; +}; + +static pcre2_code * +build_pcre2(const char *pattern, int verbosity) +{ + const uint32_t options = ANCHORED_PCRE ? 
PCRE2_ANCHORED : 0; + int errorcode; + PCRE2_SIZE erroffset = 0; + pcre2_compile_context *cctx = NULL; + + /* Set match limits */ + if (pcre2_mc == NULL) { + pcre2_mc = pcre2_match_context_create(NULL); + assert(pcre2_mc != NULL); + + pcre2_set_match_limit(pcre2_mc, FUZZ_RE_MATCH_LIMIT); + pcre2_set_recursion_limit(pcre2_mc, FUZZ_RE_RECURSION_LIMIT); + } + + pcre2_code *p = pcre2_compile((const unsigned char *)pattern, + PCRE2_ZERO_TERMINATED, + options, &errorcode, &erroffset, cctx); + if (verbosity > 0 && p == NULL && errorcode != 0) { +#define ERRSIZE 4096 + unsigned char errbuf[ERRSIZE] = {0}; + if (!pcre2_get_error_message(errorcode, + errbuf, ERRSIZE)) { + fprintf(stderr, "pcre2_get_error_message: failed\n"); + } + fprintf(stderr, "pcre2_compile: error: %s\n", errbuf); + } + return p; +} + +enum do_pcre_match_res { + DO_PCRE_MATCH_HIT, + DO_PCRE_MATCH_MISS, + DO_PCRE_MATCH_SKIP, /* an exceptional case we don't care about */ + DO_PCRE_MATCH_ERROR = -1, +}; +enum do_pcre_match_res +do_pcre_match(FILE *f, const pcre2_code *p, pcre2_match_data *md, int verbosity, + const char *input, struct test_pcre_match_info *match_info) +{ +#define MAX_BUF (64*1024) + const size_t input_len = strlen(input); + enum do_pcre_match_res mres; + + /* turn off the JIT because it can give inconsistent results while fuzzing */ + const uint32_t options = (ANCHORED_PCRE ? PCRE2_ANCHORED : 0) + | PCRE2_NO_JIT; + + assert(pcre2_mc != NULL); + + /* The value returned by pcre2_match() is one more than the + * highest numbered pair that has been set. */ + int res = pcre2_match(p, (const unsigned char *)input, input_len, + 0, options, md, pcre2_mc); + + if (res == PCRE2_ERROR_NOMATCH || res == PCRE2_ERROR_PARTIAL) { + if (f != NULL && verbosity > 1) { + fprintf(f, " -- no match (%s)\n", + res == PCRE2_ERROR_NOMATCH ? "NOMATCH" + : res == PCRE2_ERROR_PARTIAL ? "PARTIAL" + : ""); + } + mres = DO_PCRE_MATCH_MISS; + goto cleanup; + } else if (res == PCRE2_ERROR_MATCHLIMIT || res == PCRE2_ERROR_DEPTHLIMIT) { + /* It's possible to exhaust PCRE's internal limits with pathologically + * nested regexes like "(((((((((^.)?)*)?)?)?)*)+)+)*$" and + * "((((((((akbzaabdcOaa)|((((b*))))?|.|.|.*|.|.))+)+)+$)*)?)" , but + * as long as they don't cause it to block for excessively long or + * exhaust resources that's fine. 
*/ + if (f != NULL) { + fprintf(f, " -- PCRE2_ERROR_MATCHLIMIT (returning SKIP)\n"); + } + mres = DO_PCRE_MATCH_SKIP; + } else if (res <= 0) { + if (f != NULL) { +#define ERR_MAX 4096 + unsigned char err_buf[ERR_MAX]; + if (pcre2_get_error_message(res, err_buf, ERR_MAX)) { + fprintf(f, " -- error %d: %s\n", res, err_buf); + } else { + fprintf(f, " -- error %d\n", res); + } +#undef ERR_MAX + } + if (match_info != NULL) { + match_info->pcre_error = res; + } + mres = DO_PCRE_MATCH_ERROR; + goto cleanup; + } else { + const uint32_t ovc = pcre2_get_ovector_count(md); + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md); + assert(res >= 0); + size_t ures = (size_t)res; + assert(ovc > ures); + + assert(ovector[1] >= ovector[0]); + const size_t mlen = ovector[1] - ovector[0]; + if (ANCHORED_PCRE && (ovector[0] != 0 || mlen != input_len)) { + mres = DO_PCRE_MATCH_MISS; + goto cleanup; + } + mres = DO_PCRE_MATCH_HIT; + + if (f != NULL && verbosity > 1) { + for (size_t i = 0; i < ures; i++) { + char buf[MAX_BUF] = { 0 }; + memcpy(buf, &input[ovector[2*i]], + ovector[2*i + 1U] - ovector[2*i]); + fprintf(f, " -- %zu: \"%s\"\n", i, buf); + } + } + + if (match_info != NULL && res < MAX_OVEC_SIZE) { + match_info->res = res; + assert(res >= 0); + const size_t ures = (size_t)res; + + for (size_t i = 0; i < 2*ures; i++) { + match_info->ovector[i] = ovector[i]; + } + } + } + +cleanup: + return mres; +#undef MAX_BUF +} + +static bool +exec_and_compare_captures(struct cmp_pcre_env *env, + const char *input, size_t input_size, + const struct test_pcre_match_info *match_info) +{ + bool matching = true; + fsm_state_t end_state; + const uint8_t *u8_input = (const uint8_t *)input; + int res = fsm_exec_with_captures(env->fsm, u8_input, input_size, + &end_state, env->captures, env->captures_length); + + if (res < 0) { + if (env->verbosity > 1) { + fprintf(stderr, "got res of %d\n", res); + } + + return false; + } + + if (res > 0) { + assert(match_info->res >= 0); + const size_t ures = (size_t)match_info->res; + + if (env->verbosity > 1) { + fprintf(stderr, "ures %zu\n", ures); + } + + for (size_t i = 0; i < ures; i++) { + if (env->verbosity > 1) { + fprintf(stderr, "%zu/%zu: pcre [%ld, %ld] <-> libfsm [%ld, %ld]\n", + i, ures, + match_info->ovector[2*i], match_info->ovector[2*i + 1], + env->captures[i].pos[0], env->captures[i].pos[1]); + } + if ((match_info->ovector[2*i] != env->captures[i].pos[0]) + || (match_info->ovector[2*i + 1] != env->captures[i].pos[1])) { + matching = false; + } + } + + if (!matching) { + for (size_t i = 0; i < ures; i++) { + fprintf(stderr, "%zu/%zu: pcre [%ld, %ld] <-> libfsm [%ld, %ld]\n", + i, ures, + match_info->ovector[2*i], match_info->ovector[2*i + 1], + env->captures[i].pos[0], env->captures[i].pos[1]); + } + } + } + + return matching; +} + +static void +dump_pattern_and_input(const char *pattern, const char *input, size_t input_length) +{ + dump_pattern(pattern); + + fprintf(stderr, "-- Input: %zu bytes\n", input_length); + for (size_t i = 0; i < input_length; i++) { + fprintf(stderr, " %02x", (uint8_t)input[i]); + if ((i & 31) == 31) { fprintf(stderr, "\n"); } + } + if ((input_length & 31) != 31) { + fprintf(stderr, "\n"); + } + for (size_t i = 0; i < input_length; i++) { + fprintf(stderr, "%c", isprint(input[i]) ? 
input[i] : '.'); + if ((i & 63) == 63) { fprintf(stderr, "\n"); } + } + fprintf(stderr, "\n"); +} + +static enum fsm_generate_matches_cb_res +cmp_pcre_gen_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + struct cmp_pcre_env *env = opaque; + assert(env != NULL); + + (void)fsm; + (void)depth; + (void)end_state; + + const size_t len = strlen(input); + + if (env->verbosity > 4) { + fprintf(stderr, "%s: depth %zu/%zu, match_count %zu/%zu, steps %zu/%zu\n", + __func__, + depth, env->max_depth, + match_count, env->max_match_count, + steps, env->max_steps); + } + + if (steps > env->max_steps) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (match_count > env->max_match_count) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (depth > env->max_depth) { + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + /* Completely avoid exploring inputs with embedded 0x00 bytes. */ + if (input_length != len) { + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + if (len > 0 && input[len - 1] == '\n') { + /* These will need to be handled properly, but PCRE has + * special cases for '\n' handling. */ + /* fprintf(stderr, " -- skipping input ending with '\\n'.\n"); */ + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + struct test_pcre_match_info match_info = { .pcre_error = 0 }; + enum do_pcre_match_res mres = do_pcre_match(stderr, + env->p, env->md, env->verbosity, input, &match_info); + switch (mres) { + case DO_PCRE_MATCH_SKIP: + break; + case DO_PCRE_MATCH_MISS: + dump_pattern_and_input(env->pattern, input, input_length); + assert(!"matches libfsm but not with PCRE"); + return FSM_GENERATE_MATCHES_CB_RES_HALT; + case DO_PCRE_MATCH_ERROR: + fprintf(stderr, "FAIL: PCRE returned ERROR %d: pattern \"%s\"\n", + match_info.pcre_error, env->pattern); + return FSM_GENERATE_MATCHES_CB_RES_HALT; + case DO_PCRE_MATCH_HIT: + break; /* okay; continue below */ + } + + if (env->verbosity > 1) { + fprintf(stderr, "-- comparing captures for pattern \"%s\", input \"%s\" (len %zu)\n", + env->pattern, input, len); + } + + if (!exec_and_compare_captures(env, input, input_length, &match_info)) { + if (env->verbosity > 1 || 1) { + dump_pattern_and_input(env->pattern, input, input_length); + fsm_print_fsm(stderr, env->fsm); + fsm_capture_dump(stderr, "fsm", env->fsm); + } + assert(!"captures don't match"); + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; +} + +static int +compare_fixed_input(struct fsm *fsm, const char *pattern, const char *input, pcre2_match_data *md, pcre2_code *p) +{ + fsm_state_t end_state; + const size_t capture_ceil = fsm_capture_ceiling(fsm); + + struct fsm_capture *captures = malloc(capture_ceil * sizeof(captures[0])); + assert(captures != NULL); + for (size_t i = 0; i < capture_ceil; i++) { + /* clobber with meaningless but visually distinct values */ + captures[i].pos[0] = 88888888; + captures[i].pos[1] = 99999999; + }; + + const uint8_t *u8_input = (const uint8_t *)input; + const size_t input_len = strlen(input); + const int libfsm_res = fsm_exec_with_captures(fsm, u8_input, input_len, + &end_state, captures, capture_ceil); + + const bool libfsm_matching = libfsm_res > 0; + + int res = 1; + + struct test_pcre_match_info match_info = { .pcre_error = 0 }; + enum do_pcre_match_res mres = do_pcre_match(stderr, + p, md, 0, input, &match_info); + switch (mres) { + case DO_PCRE_MATCH_SKIP: + return 1; + case DO_PCRE_MATCH_MISS: + if (!libfsm_matching) { + goto cleanup; + } + 
dump_pattern_and_input(pattern, input, 0); + assert(!"matches libfsm but not with PCRE"); + return 0; + case DO_PCRE_MATCH_ERROR: + fprintf(stderr, "FAIL: PCRE returned ERROR %d: pattern \"%s\"\n", + match_info.pcre_error, pattern); + return 0; + case DO_PCRE_MATCH_HIT: + if (!libfsm_matching) { + dump_pattern_and_input(pattern, input, input_len); + assert(!"matches PCRE but not libfsm"); + res = 0; + goto cleanup; + } + + const size_t ures = (size_t)match_info.res; + if (ures > capture_ceil) { + dump_pattern_and_input(pattern, input, 0); + fprintf(stderr, "error: capture_ceil: %zu exceeded by ures: %zd\n", + capture_ceil, ures); + assert(!"both PCRE and libfsm match but with different capture counts"); + } + + bool matching = true; + for (size_t i = 0; i < ures; i++) { + if ((match_info.ovector[2*i] != captures[i].pos[0]) + || (match_info.ovector[2*i + 1] != captures[i].pos[1])) { + matching = false; + } + } + for (size_t i = 0; i < ures; i++) { + if (!matching) { + fprintf(stderr, "%zu/%zu: pcre [%ld, %ld] <-> libfsm [%ld, %ld]\n", + i, ures, + match_info.ovector[2*i], match_info.ovector[2*i + 1], + captures[i].pos[0], captures[i].pos[1]); + } + } + + if (!matching) { + dump_pattern_and_input(pattern, input, 0); + assert(!"both PCRE and libfsm match but with different captures"); + } + + goto cleanup; /* ok, both matched */ + } + + assert(!"unreachable"); + +cleanup: + free(captures); + return res; + +} + +static int +compare_with_pcre(const char *pattern, struct fsm *fsm) +{ + size_t verbosity = get_env_config(0, "VERBOSITY"); + size_t max_length = get_env_config(DEF_MAX_LENGTH, "MAX_LENGTH"); + size_t max_steps = get_env_config(DEF_MAX_STEPS, "MAX_STEPS"); + size_t max_depth = get_env_config(DEF_MAX_DEPTH, "MAX_DEPTH"); + size_t max_match_count = get_env_config(DEF_MAX_MATCH_COUNT, "MAX_MATCH_COUNT"); + int res = 1; + + pcre2_match_data *md; + + pcre2_code *p = build_pcre2(pattern, 0); + if (p == NULL) { + return 1; + } + + md = pcre2_match_data_create(MAX_OVEC_SIZE, NULL); + assert(md != NULL); + + /* Check the empty string and "\n", because PCRE has an awkward + * special case for "\n" that has complicated interactions + * with start and end anchoring. */ + if (!compare_fixed_input(fsm, pattern, "", md, p) + || !compare_fixed_input(fsm, pattern, "\n", md, p)) { + pcre2_match_data_free(md); + pcre2_code_free(p); + return res; + } + + struct fsm_capture captures[MAX_OVEC_SIZE/2] = { 0 }; + + const size_t pattern_length = strlen(pattern); + if (pattern_length >= max_length) { + max_length = pattern_length + 1; + static size_t max_max_length; + if (max_length > max_max_length) { + fprintf(stderr, "Note: increasing max_length to %zu\n", + pattern_length + 1); + max_max_length = max_length; + if (max_depth < max_length) { + max_depth = max_length + 1; + } + } + } + + struct cmp_pcre_env env = { + .verbosity = (int)verbosity, + .pattern = pattern, + .fsm = fsm, + .captures = captures, + .captures_length = MAX_OVEC_SIZE/2, + .md = md, + .p = p, + .max_steps = max_steps, + .max_depth = max_depth, + .max_match_count = max_match_count, + }; + + if (!fsm_generate_matches(fsm, max_length, cmp_pcre_gen_cb, &env)) { + res = 0; + } + + pcre2_match_data_free(md); + pcre2_code_free(p); + return res; +} +#endif + +/* Note: combined_fsm and fsms[] are non-const because fsm_generate_matches + * calls fsm_trim on them. 
*/ +static int +compare_combined_and_separate(int verbosity, size_t max_length, size_t count, + struct fsm *combined_fsm, const struct fsm_combined_base_pair *bases, + struct fsm **fsms); + +static enum fsm_generate_matches_cb_res +cmp_combined_and_separate_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +static int +build_and_check_multi(const char *input) +{ + int res = EXIT_FAILURE; + const int verbosity = get_env_config(0, "VERBOSITY"); +#define MAX_PATTERNS 8 +#define MAX_PATTERN_LEN 256 + char patterns[MAX_PATTERNS][MAX_PATTERN_LEN] = { 0 }; + size_t count = 0; + const size_t len = strlen(input); + size_t max_length = get_env_config(DEF_MAX_LENGTH, "MAX_LENGTH"); + INIT_TIMERS(); + + /* if nonzero, apply a timeout to the combined FSM det/min below */ + const size_t timeout = get_env_config(0, "TIMEOUT"); + + if (timeout > 0) { + if (TRACK_TIMES == 0) { + fprintf(stderr, "\n\n\n\n\nError: src/libfsm/internal.h:TRACK_TIMES needs to be nonzero for this use case, exiting.\n\n\n\n\n"); + exit(EXIT_FAILURE); + } else { + static bool printed_timeout_msg; + if (!printed_timeout_msg) { + fprintf(stderr, "Using timeout of %zu msec for fsm_determinise/fsm_minimise on combined FSM.\n", + timeout); + printed_timeout_msg = true; + } + } + } + + size_t i, j; + for (i = 0, j = 0; i < len; i++) { + const char c = input[i]; + if (c == '\n' || c == '\r') { + if (j > max_length) { + max_length = j; + } + count++; + if (count == MAX_PATTERNS) { + /* ignore: too many patterns */ + return EXIT_SUCCESS; + } + j = 0; + } else { + patterns[count][j] = c; + j++; + if (j == MAX_PATTERN_LEN) { + /* ignore: pattern too long */ + return EXIT_SUCCESS; + } + } + } + if (j > 0) { count++; } + + struct re_err err; + const enum re_flags flags = 0; + + /* build each regex, combining them and keeping track of capture offsets */ + struct fsm *fsms[count]; + struct fsm *fsms_cp[count]; + struct fsm_combined_base_pair bases[count]; + struct fsm *combined_fsm = NULL; + for (size_t i = 0; i < count; i++) { + fsms[i] = NULL; + fsms_cp[i] = NULL; + + bases[i].state = 0; + bases[i].capture = 0; + } + + /* compile each individually */ + for (size_t i = 0; i < count; i++) { + if (verbosity > 1) { + fprintf(stderr, "%s: compiling \"%s\"\n", + __func__, patterns[i]); + } + + struct scanner s = { + .str = (const uint8_t *)patterns[i], + .size = strlen(patterns[i]), + }; + + struct fsm *fsm = re_comp(RE_PCRE, scanner_next, &s, &fsm_options, flags, &err); + if (fsm == NULL) { + res = EXIT_SUCCESS; /* invalid regex, so skip this batch */ + goto cleanup; + } + + char label_buf[100]; + snprintf(label_buf, 100, "single_determisise_%zu", i); + + TIME(&pre); + if (!fsm_determinise(fsm)) { + goto cleanup; + } + TIME(&post); + DIFF_MSEC(label_buf, pre, post, NULL); + + snprintf(label_buf, 100, "single_minimise_%zu", i); + TIME(&pre); + if (!fsm_minimise(fsm)) { + goto cleanup; + } + TIME(&post); + DIFF_MSEC(label_buf, pre, post, NULL); + + if (verbosity > 4) { + char tag_buf[16] = { 0 }; + snprintf(tag_buf, sizeof(tag_buf), "fsm[%zu]", i); + + fprintf(stderr, "==== fsm[%zu]\n", i); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, tag_buf, fsm); + } + + fsms[i] = fsm; + fsms_cp[i] = fsm_clone(fsm); /* save a copy for comparison */ + } + + combined_fsm = fsm_union_array(count, fsms, bases); + assert(combined_fsm != NULL); + if (verbosity > 1) { + fprintf(stderr, "%s: combined_fsm: %d states after fsm_union_array\n", + __func__, 
fsm_countstates(combined_fsm)); + } + if (verbosity > 1) { + for (size_t i = 0; i < count; i++) { + fprintf(stderr, "%s: base[%zu]: state %d, capture %u\n", + __func__, i, bases[i].state, bases[i].capture); + } + } + + TIME(&pre); + if (!fsm_determinise(combined_fsm)) { + goto cleanup; + } + TIME(&post); + size_t timeout_accum = 0; + if (timeout != 0) { + if (verbosity > 1) { + DIFF_MSEC_ALWAYS("combined_determinise", pre, post, &timeout_accum); + } else { + DIFF_MSEC("combined_determinise", pre, post, &timeout_accum); + } + assert(timeout_accum < timeout); + timeout_accum = 0; + } + + const unsigned states_after_determinise = fsm_countstates(combined_fsm); + if (verbosity > 1) { + fprintf(stderr, "%s: combined_fsm: %d states after determinise\n", + __func__, states_after_determinise); + } + + TIME(&pre); + if (!fsm_minimise(combined_fsm)) { + goto cleanup; + } + TIME(&post); + if (timeout != 0) { + if (verbosity > 1) { + DIFF_MSEC_ALWAYS("combined_minimise", pre, post, &timeout_accum); + } else { + DIFF_MSEC("combined_minimise", pre, post, &timeout_accum); + } + assert(timeout_accum < timeout); + timeout_accum = 0; + } + + const unsigned states_after_minimise = fsm_countstates(combined_fsm); + if (verbosity > 1) { + fprintf(stderr, "%s: combined_fsm: %d states after minimise\n", + __func__, states_after_minimise); + } + + if (verbosity > 4) { + fprintf(stderr, "==== combined\n"); + fsm_print_fsm(stderr, combined_fsm); + fsm_capture_dump(stderr, "combined", combined_fsm); + } + + res = compare_combined_and_separate(verbosity, max_length, + count, combined_fsm, bases, (struct fsm **)fsms_cp); + + for (i = 0; i < count; i++) { + fsm_free(fsms_cp[i]); + } + fsm_free(combined_fsm); + + if (res == EXIT_SUCCESS) { + static size_t pass_count; + if (verbosity == 1) { + fprintf(stderr, "%s: pass: %zu, %zu patterns\n", + __func__, ++pass_count, count); + } else if (verbosity > 1) { + fprintf(stderr, "%s: pass: %zu, %zu patterns\n", + __func__, ++pass_count, count); + for (i = 0; i < count; i++) { + fprintf(stderr, " -- %zu: \"%s\"\n", + i, patterns[i]); + } + } + } + + return res; + +cleanup: + for (i = 0; i < count; i++) { + if (fsms[i] != NULL) { + fsm_free(fsms[i]); + } + if (fsms_cp[i] != NULL) { + fsm_free(fsms_cp[i]); + } + } + if (combined_fsm != NULL) { + fsm_free(combined_fsm); + } + return res; +} + +struct cmp_combined_env { + bool ok; + int verbosity; + size_t count; + struct fsm *combined_fsm; + const struct fsm_combined_base_pair *bases; + size_t current_i; + struct fsm **fsms; + size_t max_depth; + size_t max_steps; + size_t max_match_count; +}; + +static int +compare_combined_and_separate(int verbosity, size_t max_length, size_t count, + struct fsm *combined_fsm, const struct fsm_combined_base_pair *bases, + struct fsm **fsms) +{ + const size_t max_steps = get_env_config(DEF_MAX_STEPS, "MAX_STEPS"); + const size_t max_depth = get_env_config(DEF_MAX_DEPTH, "MAX_DEPTH"); + const size_t max_match_count = get_env_config(DEF_MAX_MATCH_COUNT, "MAX_MATCH_COUNT"); + + struct cmp_combined_env env = { + .ok = true, + .verbosity = verbosity, + .count = count, + .combined_fsm = combined_fsm, + .bases = bases, + .fsms = fsms, + .max_steps = max_steps, + .max_depth = max_depth, + .max_match_count = max_match_count, + }; + + /* For each individual fsm, generate matching inputs and check that + * they match with the same captures in the combined fsm. 
*/ + for (env.current_i = 0; env.current_i < count; env.current_i++) { + if (!fsm_generate_matches(env.fsms[env.current_i], max_length, + cmp_combined_and_separate_cb, &env)) { + env.ok = false; + } + if (!env.ok) { + break; + } + } + + /* TODO: also generate matches with combined and check the individual ones */ + + return env.ok ? EXIT_SUCCESS : EXIT_FAILURE; +} + +static enum fsm_generate_matches_cb_res +cmp_combined_and_separate_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + struct cmp_combined_env *env = opaque; + + if (steps > env->max_steps) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (depth > env->max_depth) { + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + if (match_count > env->max_match_count) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + +#define MAX_CAPTURES 256 + struct fsm_capture captures_single[MAX_CAPTURES]; + struct fsm_capture captures_combined[MAX_CAPTURES]; + + const uint8_t *u8_input = (const uint8_t *)input; + const int res_combined = fsm_exec_with_captures(env->combined_fsm, u8_input, input_length, + &end_state, captures_combined, MAX_CAPTURES); + const int res_single = fsm_exec_with_captures(fsm, u8_input, input_length, + &end_state, captures_single, MAX_CAPTURES); + + if (res_combined != res_single) { + env->ok = false; + if (env->verbosity > 0) { + fprintf(stderr, "%s: res_combined %d != res_single %d\n", + __func__, res_combined, res_single); + } + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (res_single > 0) { + if (env->verbosity > 3) { + fprintf(stderr, "%s: res %d (single and combined)\n", __func__, res_single); + } + + bool matching = true; + const unsigned base = env->bases[env->current_i].capture; + assert(base < MAX_CAPTURES); + for (int i = 0; i < res_single; i++) { + if (env->verbosity > 3) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + if ((captures_single[i].pos[0] != captures_combined[i + base].pos[0]) || + (captures_single[i].pos[1] != captures_combined[i + base].pos[1])) { + matching = false; + } + } + + if (!matching) { + for (int i = 0; i < res_single; i++) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; +} + #define DEF_MAX_SHUFFLE 10 #define DEF_MAX_MINIMISE_ORACLE_STATE_COUNT 1000 @@ -169,7 +1186,7 @@ shuffle_minimise(const char *pattern) .offset = 0 }; - fsm = re_comp(RE_PCRE, scanner_next, &s, &opt, RE_MULTI, &err); + fsm = re_comp(RE_PCRE, scanner_next, &s, &fsm_options, RE_MULTI, &err); if (fsm == NULL) { /* ignore invalid regexp syntax, etc. 
*/ @@ -324,8 +1341,81 @@ fuzz_all_print_functions(FILE *f, const char *pattern, bool det, bool min, const return EXIT_SUCCESS; } -#define MAX_FUZZER_DATA (64 * 1024) -static uint8_t data_buf[MAX_FUZZER_DATA + 1]; +static int +build_and_test_idempotent_det_and_min(const char *pattern) +{ + const int verbosity = get_env_config(0, "VERBOSITY"); + assert(pattern != NULL); + + struct re_err err; + struct fsm *fsm; + const size_t length = strlen(pattern); + + struct scanner s = { + .str = (const uint8_t *)pattern, + .size = length, + }; + + fsm = re_comp(RE_PCRE, scanner_next, &s, &fsm_options, RE_MULTI, &err); + if (fsm == NULL) { + return EXIT_SUCCESS; + } + + if (!fsm_determinise(fsm)) { + return EXIT_FAILURE; + } + if (verbosity >= 3) { + fprintf(stderr, "=== post_det_a\n"); + fsm_print_fsm(stderr, fsm); + } + const size_t post_det_a = fsm_countstates(fsm); + + if (!fsm_determinise(fsm)) { + return EXIT_FAILURE; + } + if (verbosity >= 3) { + fprintf(stderr, "=== post_det_b\n"); + fsm_print_fsm(stderr, fsm); + } + const size_t post_det_b = fsm_countstates(fsm); + assert(post_det_b == post_det_a); + + if (!fsm_minimise(fsm)) { + return EXIT_FAILURE; + } + if (verbosity >= 3) { + fprintf(stderr, "=== post_min_a\n"); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "post_a", fsm); + } + const size_t post_min_a = fsm_countstates(fsm); + + if (!fsm_minimise(fsm)) { + return EXIT_FAILURE; + } + if (verbosity >= 3) { + fprintf(stderr, "=== post_min_b\n"); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "post_b", fsm); + } + const size_t post_min_b = fsm_countstates(fsm); + assert(post_min_b == post_min_a); + + if (!fsm_determinise(fsm)) { + return EXIT_FAILURE; + } + const size_t post_det_c = fsm_countstates(fsm); + assert(post_det_c == post_min_b); + + if (!fsm_minimise(fsm)) { + return EXIT_FAILURE; + } + const size_t post_min_c = fsm_countstates(fsm); + assert(post_min_c == post_det_c); + + fsm_free(fsm); + return EXIT_SUCCESS; +} static enum run_mode get_run_mode(void) @@ -333,14 +1423,26 @@ get_run_mode(void) const char *mode = getenv("MODE"); if (mode != NULL) { switch (mode[0]) { - case 'm': return MODE_SHUFFLE_MINIMISE; + case 'r': return MODE_REGEX; + case 's': return MODE_REGEX_SINGLE_ONLY; + case 'm': return MODE_REGEX_MULTI_ONLY; + case 'i': return MODE_IDEMPOTENT_DET_MIN; + case 'M': return MODE_SHUFFLE_MINIMISE; case 'p': return MODE_ALL_PRINT_FUNCTIONS; default: + fprintf(stderr, "Unrecognized mode '%c', expect one of:\n", mode[0]); + fprintf(stderr, " - r.egex (default)\n"); + fprintf(stderr, " - s.ingle regex only\n"); + fprintf(stderr, " - m.ulti regex only\n"); + fprintf(stderr, " - M.inimisation shuffling\n"); + fprintf(stderr, " - i.dempotent determinise/minimise\n"); + fprintf(stderr, " - p.rint functions\n"); + exit(EXIT_FAILURE); break; } } - return MODE_DEFAULT; + return MODE_REGEX; /* default */ } static FILE *dev_null = NULL; @@ -359,12 +1461,87 @@ harness_fuzzer_target(const uint8_t *data, size_t size) size = MAX_FUZZER_DATA; } memcpy(data_buf, data, size); + /* ensure the buffer is 0-terminated */ + data_buf[size] = 0; + + /* truncate to a valid c string */ + size = strlen((const char *)data_buf); + data_buf[size] = 0; + + /* reset for each run */ + allocator_stats.hwm = 0; + + size_t dot_count = 0; + bool has_newline = false; + size_t first_newline; + + for (size_t i = 0; i < size; i++) { + const uint8_t c = data_buf[i]; + if (c == '.') { + dot_count++; + if (dot_count >= 4) { + /* Too many '.'s can lead to a regex that is + * very slow to 
determinise/minimise, but that
+			 * failure mode is not interesting to this
+			 * particular fuzzer. */
+				return EXIT_SUCCESS;
+			}
+		}
+
+		if (c == '(') {
+			/* This triggers an "unreached" assertion in the parser.
+			 * It's already been reported (issue #386), but once the
+			 * fuzzer finds it, it will report it over and over.
+			 * Exit here so that the fuzzer considers it uninteresting. */
+			if (size - i >= 3 && 0 == memcmp("(*:", &data_buf[i], 3)) {
+				return EXIT_SUCCESS;
+			}
+		}
+
+		if (c == '\\') {
+			/* Not supported yet. */
+			return EXIT_SUCCESS;
+		}
+
+		if (c == '\r' || c == '\n') {
+			if (!has_newline) {
+				first_newline = i;
+			}
+			has_newline = true;
+		}
+	}
 
 	const char *pattern = (const char *)data_buf;
 
 	switch (run_mode) {
-	case MODE_DEFAULT:
-		return build_and_codegen(pattern);
+	case MODE_REGEX:
+		if (has_newline) {
+			return build_and_check_multi(pattern);
+		} else {
+			return build_and_check_single(pattern);
+		}
+
+	case MODE_REGEX_SINGLE_ONLY:
+		if (has_newline) {
+			return EXIT_SUCCESS;	/* ignore */
+		} else {
+			return build_and_check_single(pattern);
+		}
+
+	case MODE_REGEX_MULTI_ONLY:
+		if (has_newline) {
+			return build_and_check_multi(pattern);
+		} else {
+			return EXIT_SUCCESS;	/* ignore */
+		}
+
+	case MODE_IDEMPOTENT_DET_MIN:
+		if (has_newline) {
+			assert(data_buf[first_newline] == '\n'
+			    || data_buf[first_newline] == '\r');
+			data_buf[first_newline] = '\0';
+		}
+		return build_and_test_idempotent_det_and_min(pattern);
 
 	case MODE_SHUFFLE_MINIMISE:
 		return shuffle_minimise(pattern);
@@ -385,5 +1562,8 @@
 		int res = fuzz_all_print_functions(dev_null, shifted_pattern, det, min, io_mode);
 		return res;
 	}
+
+	default:
+		assert(!"match fail");
 	}
 }

From 318c9ad41380d6ef340a7cd41703cf7ba067f3a0 Mon Sep 17 00:00:00 2001
From: Scott Vokes
Date: Fri, 2 Jun 2023 14:24:16 -0400
Subject: [PATCH 10/51] re: Add -FC flag to disable captures.

I'm adding this because many of the existing tests under
`tests/pcre-anchor/` and so on contain regexes that would now be
rejected as unsupported in combination with captures, but they are
testing cases unrelated to capturing.
---
 src/re/main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/re/main.c b/src/re/main.c
index 226267196..fd9ea2424 100644
--- a/src/re/main.c
+++ b/src/re/main.c
@@ -618,6 +618,10 @@ parse_flags(const char *arg, enum re_flags *flags)
 			*flags = *flags | RE_EXTENDED;
 			break;
 
+		case 'C':
+			*flags = *flags | RE_NOCAPTURE;
+			break;
+
 		/* others? */
 
 		default:

From 5e4971354677b9478fe7f38f3f5d4a23fdaf55bf Mon Sep 17 00:00:00 2001
From: Scott Vokes
Date: Thu, 16 Feb 2023 18:39:42 -0500
Subject: [PATCH 11/51] tests/*/Makefile: Add `-FC` (no captures) for some
 calls to RE.

There are several tests that have nothing to do with captures;
capture behavior is tested directly by `tests/capture/`.
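
For reference, `-FC` maps to RE_NOCAPTURE at the library level, the
same flag tests/minimise passes to re_comp earlier in this series. A
minimal sketch of that call, assuming the public headers <fsm/fsm.h>
and <re/re.h>; `scanner_next`, `s`, and `opt` stand in for the
byte-reader callback, its state, and the fsm_options used by those
test drivers:

	#include <fsm/fsm.h>
	#include <re/re.h>

	/* Compile a PCRE-style pattern with capture support disabled:
	 * the library-level equivalent of `re -FC ...`. */
	struct re_err err;
	struct fsm *fsm = re_comp(RE_PCRE, scanner_next, &s, &opt,
	    RE_MULTI | RE_NOCAPTURE, &err);
	if (fsm == NULL) {
		/* parse error or unsupported construct; see err */
	}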
--- tests/aho_corasick/Makefile | 8 ++++---- tests/ir/Makefile | 2 +- tests/native/Makefile | 6 +++--- tests/pcre-anchor/Makefile | 4 ++-- tests/pcre-classes/Makefile | 10 +++++----- tests/pcre-flags/Makefile | 6 +++--- tests/pcre-repeat/Makefile | 6 +++--- tests/pcre/Makefile | 10 +++++----- tests/re_literal/Makefile | 2 +- 9 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/aho_corasick/Makefile b/tests/aho_corasick/Makefile index 5748ddd5c..6eef421bf 100644 --- a/tests/aho_corasick/Makefile +++ b/tests/aho_corasick/Makefile @@ -33,7 +33,7 @@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}a.re: ${TEST_SRCDIR.tests/aho_corasick} < ${.ALLSRC:M*.txt} > $@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}a.fsm: ${RE} ${TEST_OUTDIR.tests/aho_corasick}/out${n}a.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/aho_corasick}/got${n}a.fsm: ${AC_TEST} ${TEST_SRCDIR.tests/aho_corasick}/in${n}.txt @@ -48,7 +48,7 @@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}l.re: ${TEST_SRCDIR.tests/aho_corasick} < ${.ALLSRC:M*.txt} > $@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}l.fsm: ${RE} ${TEST_OUTDIR.tests/aho_corasick}/out${n}l.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/aho_corasick}/got${n}l.fsm: ${AC_TEST} ${TEST_SRCDIR.tests/aho_corasick}/in${n}.txt @@ -63,7 +63,7 @@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}r.re: ${TEST_SRCDIR.tests/aho_corasick} < ${.ALLSRC:M*.txt} > $@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}r.fsm: ${RE} ${TEST_OUTDIR.tests/aho_corasick}/out${n}r.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/aho_corasick}/got${n}r.fsm: ${AC_TEST} ${TEST_SRCDIR.tests/aho_corasick}/in${n}.txt @@ -79,7 +79,7 @@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}u.re: ${TEST_SRCDIR.tests/aho_corasick} < ${.ALLSRC:M*.txt} > $@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}u.fsm: ${RE} ${TEST_OUTDIR.tests/aho_corasick}/out${n}u.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/aho_corasick}/got${n}u.fsm: ${AC_TEST} ${TEST_SRCDIR.tests/aho_corasick}/in${n}.txt diff --git a/tests/ir/Makefile b/tests/ir/Makefile index 0009c45ec..566d1add8 100755 --- a/tests/ir/Makefile +++ b/tests/ir/Makefile @@ -9,7 +9,7 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/ir:T:Mout*.json:R:C/^out//} ${TEST_OUTDIR.tests/ir}/got${n}.json: ${TEST_SRCDIR.tests/ir}/in${n}.re - ${RE} -pl irjson -y ${.ALLSRC:M*.re} \ + ${RE} -FC -pl irjson -y ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/ir}/res${n}: \ diff --git a/tests/native/Makefile b/tests/native/Makefile index 8712e1588..fbca0ca69 100755 --- a/tests/native/Makefile +++ b/tests/native/Makefile @@ -9,11 +9,11 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/native:T:Mout*.fsm:R:C/^out//} ${TEST_OUTDIR.tests/native}/got${n}.fsm: ${TEST_SRCDIR.tests/native}/in${n}.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/native}/nfa${n}.fsm: ${TEST_SRCDIR.tests/native}/in${n}.re - ${RE} -r native -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -n -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/native}/res${n}: \ @@ -27,7 +27,7 @@ FSMTEST_RESULT += ${TEST_OUTDIR.tests/native}/res${n} .for n in ${TEST.tests/native:T:Mout*.err:R:C/^out//} ${TEST_OUTDIR.tests/native}/got${n}.err: ${TEST_SRCDIR.tests/native}/in${n}.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py 
${.ALLSRC:M*.re} \ 2> $@; [ $$? -ne 0 ] ${TEST_OUTDIR.tests/native}/res${n}: \ diff --git a/tests/pcre-anchor/Makefile b/tests/pcre-anchor/Makefile index bb9954554..1dc4a77bc 100644 --- a/tests/pcre-anchor/Makefile +++ b/tests/pcre-anchor/Makefile @@ -9,11 +9,11 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/pcre-anchor:T:Mout*.fsm:R:C/^out//} ${TEST_OUTDIR.tests/pcre-anchor}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-anchor}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-anchor}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre-anchor}/in${n}.re - ${RE} -r pcre -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -n -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-anchor}/res${n}: \ diff --git a/tests/pcre-classes/Makefile b/tests/pcre-classes/Makefile index 0d9809d76..0d459e256 100755 --- a/tests/pcre-classes/Makefile +++ b/tests/pcre-classes/Makefile @@ -16,7 +16,7 @@ RE=${BUILD}/bin/re FSM=${BUILD}/bin/fsm ${TEST_OUTDIR.tests/pcre-classes}/dot-all.fsm: - ${RE} -r pcre -p '^[\x00-\xff]$$' | ${FSM} -pm \ + ${RE} -FC -r pcre -p '^[\x00-\xff]$$' | ${FSM} -pm \ > $@ # compl.re tests @@ -32,7 +32,7 @@ ${TEST_OUTDIR.tests/pcre-classes}/dot-all.fsm: .for n in ${TEST.tests/pcre-classes:M*/compl*.re:T:R:C/^compl//} ${TEST_OUTDIR.tests/pcre-classes}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-classes}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*/in*.re} | ${FSM} -pm \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*/in*.re} | ${FSM} -pm \ > $@ ${TEST_OUTDIR.tests/pcre-classes}/got-compl${n}.fsm: ${TEST_OUTDIR.tests/pcre-classes}/got${n}.fsm ${TEST_OUTDIR.tests/pcre-classes}/dot-all.fsm @@ -40,7 +40,7 @@ ${TEST_OUTDIR.tests/pcre-classes}/got-compl${n}.fsm: ${TEST_OUTDIR.tests/pcre-cl > $@ ${TEST_OUTDIR.tests/pcre-classes}/expect-compl${n}.fsm: ${TEST_SRCDIR.tests/pcre-classes}/compl${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*/compl*.re} | ${FSM} -pm \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*/compl*.re} | ${FSM} -pm \ > $@ ${TEST_OUTDIR.tests/pcre-classes}/res${n}: \ @@ -66,11 +66,11 @@ FSMTEST_RESULT += ${TEST_OUTDIR.tests/pcre-classes}/res${n} @echo x: ${n} ${TEST_OUTDIR.tests/pcre-classes}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-classes}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*/in*.re} | ${FSM} -pm \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*/in*.re} | ${FSM} -pm \ > $@ ${TEST_OUTDIR.tests/pcre-classes}/out${n}.fsm: ${TEST_SRCDIR.tests/pcre-classes}/equal${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*/equal*.re} | ${FSM} -pm \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*/equal*.re} | ${FSM} -pm \ > $@ ${TEST_OUTDIR.tests/pcre-classes}/res${n}: \ diff --git a/tests/pcre-flags/Makefile b/tests/pcre-flags/Makefile index 67e70fbbb..308571395 100755 --- a/tests/pcre-flags/Makefile +++ b/tests/pcre-flags/Makefile @@ -13,17 +13,17 @@ RE=${BUILD}/bin/re TEST_OUTDIR.tests/pcre-flags/mode${n} != cat ${TEST_SRCDIR.tests/pcre-flags}/mode${n} ${TEST_OUTDIR.tests/pcre-flags}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-flags}/in${n}.re - ${RE} -F "${TEST_OUTDIR.tests/pcre-flags/mode${n}}" -b -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -F "${TEST_OUTDIR.tests/pcre-flags/mode${n}}" -b -r pcre -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-flags}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre-flags}/in${n}.re - ${RE} -F "${TEST_OUTDIR.tests/pcre-flags/mode${n}}" -b -r pcre -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -F "${TEST_OUTDIR.tests/pcre-flags/mode${n}}" -b -r pcre -n -py ${.ALLSRC:M*.re} \ > $@ .else ${TEST_OUTDIR.tests/pcre-flags}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-flags}/in${n}.re - ${RE} -b -r pcre -py 
${.ALLSRC:M*.re} \ + ${RE} -FC -b -r pcre -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-flags}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre-flags}/in${n}.re diff --git a/tests/pcre-repeat/Makefile b/tests/pcre-repeat/Makefile index c325d2f8e..97535b2c3 100755 --- a/tests/pcre-repeat/Makefile +++ b/tests/pcre-repeat/Makefile @@ -12,11 +12,11 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/pcre-repeat:T:Mout*.fsm:R:C/^out//} ${TEST_OUTDIR.tests/pcre-repeat}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-repeat}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-repeat}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre-repeat}/in${n}.re - ${RE} -r pcre -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -n -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-repeat}/res${n}: \ @@ -30,7 +30,7 @@ FSMTEST_RESULT += ${TEST_OUTDIR.tests/pcre-repeat}/res${n} .for n in ${TEST.tests/pcre-repeat:T:Mout*.err:R:C/^out//} ${TEST_OUTDIR.tests/pcre-repeat}/got${n}.err: ${TEST_SRCDIR.tests/pcre-repeat}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ 2> $@; [ $$? -ne 0 ] ${TEST_OUTDIR.tests/pcre-repeat}/res${n}: \ diff --git a/tests/pcre/Makefile b/tests/pcre/Makefile index 23f879a04..239d2c93b 100755 --- a/tests/pcre/Makefile +++ b/tests/pcre/Makefile @@ -21,7 +21,7 @@ PCREGREP ?= pcregrep # bit of a hack. # 2) removes any trailing \n at the end of the input ${TEST_OUTDIR.tests/pcre-pcregrep}/in${n}.txt: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -mr pcre -y ${.ALLSRC:M*.re} \ + ${RE} -FC -mr pcre -y ${.ALLSRC:M*.re} \ | perl -0pe 's/\\x([0-9a-zA-z]{2})/chr(hex($$1))/ge;' -e 's/\n\Z//' \ > $@ @@ -41,16 +41,16 @@ test:: ${TEST_OUTDIR.tests/pcre-pcregrep}/res${n} .if exists(${TEST_SRCDIR.tests/pcre}/mode${n}) TEST_OUTDIR.tests/pcre/mode${n} != cat ${TEST_SRCDIR.tests/pcre}/mode$n ${TEST_OUTDIR.tests/pcre}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -F "${TEST_OUTDIR.tests/pcre/mode${n}}" -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -F "${TEST_OUTDIR.tests/pcre/mode${n}}" -r pcre -py ${.ALLSRC:M*.re} \ > $@ .else ${TEST_OUTDIR.tests/pcre}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ > $@ .endif ${TEST_OUTDIR.tests/pcre}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -r pcre -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -n -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre}/res${n}: \ @@ -64,7 +64,7 @@ FSMTEST_RESULT += ${TEST_OUTDIR.tests/pcre}/res${n} .for n in ${TEST.tests/pcre:T:Mout*.err:R:C/^out//} ${TEST_OUTDIR.tests/pcre}/got${n}.err: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ 2> $@; [ $$? -ne 0 ] ${TEST_OUTDIR.tests/pcre}/res${n}: \ diff --git a/tests/re_literal/Makefile b/tests/re_literal/Makefile index 27941b094..41680e4df 100755 --- a/tests/re_literal/Makefile +++ b/tests/re_literal/Makefile @@ -9,7 +9,7 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/re_literal:T:R:C/^out//} ${TEST_OUTDIR.tests/re_literal}/got${n}.txt: ${TEST_SRCDIR.tests/re_literal}/in${n}.re - ( ${RE} -r pcre -t -y ${.ALLSRC:M*.re} || echo non-literal ) \ + ( ${RE} -FC -r pcre -t -y ${.ALLSRC:M*.re} || echo non-literal ) \ > $@ ${TEST_OUTDIR.tests/re_literal}/res${n}: \ From 6e410b3b4d1ac1ae7997678c9c56401f7c0278d4 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 2 Jun 2023 14:49:05 -0400 Subject: [PATCH 12/51] ast_analysis: Reject '\z' as unsupported. 
We currently get the wrong capture result for it. --- src/libre/ast_analysis.c | 6 ++++++ tests/capture/capture_test_case_list.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/src/libre/ast_analysis.c b/src/libre/ast_analysis.c index 70e65cad4..5e811bdb0 100644 --- a/src/libre/ast_analysis.c +++ b/src/libre/ast_analysis.c @@ -1836,7 +1836,13 @@ analysis_iter_captures(struct capture_env *env, struct ast_expr *n) switch (n->type) { case AST_EXPR_EMPTY: case AST_EXPR_TOMBSTONE: + break; + case AST_EXPR_ANCHOR: + if (env->use_captures && n->u.anchor.type == AST_ANCHOR_END && !n->u.anchor.is_end_nl) { + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } break; case AST_EXPR_LITERAL: diff --git a/tests/capture/capture_test_case_list.c b/tests/capture/capture_test_case_list.c index 6a5a36a19..ea82e82de 100644 --- a/tests/capture/capture_test_case_list.c +++ b/tests/capture/capture_test_case_list.c @@ -1376,6 +1376,13 @@ const struct captest_case_single single_cases[] = { { .pos = {0, 1}, }, }, }, + + { + .regex = "\\z", + .input = "", + .count = 1, + .match = SHOULD_REJECT_AS_UNSUPPORTED, + }, }; const struct captest_case_multi multi_cases[] = { From 86ec9e4757ba9d22090f3bbbb4c5e7e1229b7e49 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 15 Jun 2023 12:26:18 -0400 Subject: [PATCH 13/51] union.c: Remove EXPENSIVE_CHECKS based on removed interface. Now instead of exposing exactly how many captures the fsm has, we keep track of the ceiling of the count, to track how large the capture buffer needs to be. We could add this back if fsm_capture_count gets re-added. --- src/libfsm/union.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 860cdc2c8..736292a8b 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -104,15 +104,8 @@ fsm_union_array(size_t fsm_count, fsms[0] = NULL; memset(bases, 0x00, fsm_count * sizeof(bases[0])); -#if EXPENSIVE_CHECKS - size_t capture_count = fsm_capture_count(res); -#endif - for (i = 1; i < fsm_count; i++) { struct fsm_combine_info ci; -#if EXPENSIVE_CHECKS - capture_count += fsm_capture_count(fsms[i]); -#endif struct fsm *combined = fsm_union(res, fsms[i], &ci); fsms[i] = NULL; @@ -149,8 +142,5 @@ fsm_union_array(size_t fsm_count, } #endif -#if EXPENSIVE_CHECKS - assert(fsm_capture_count(res) == capture_count); -#endif return res; } From d54ba0d5db6cab12069a9e8e249d6801824e703d Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 15 Jun 2023 12:27:43 -0400 Subject: [PATCH 14/51] bugfix: resize endid buffer for carry_end_metadata properly. Previously we doubled the buffer if it wasn't large enough, but the next endid array may be > twice the size of the old buffer, so we need to keep expanding until it's large enough. We weren't saving the updated array size, so this could potentially lead to repeated doubling and eventually allocation failures. Also, change the assert for fsm_getendids's result -- if we get to that point, it should always be found, not just not an insufficient space error. 
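
In outline, the corrected growth logic doubles until the needed count
fits, and then records both the new buffer and the new size. A
simplified, self-contained sketch (the in-tree fix uses f_realloc on
the env's end-metadata buffer, and 16 here is a stand-in for the real
default capacity):

	#include <stdlib.h>

	/* Grow *buf (capacity *ceil entries of entry_size bytes) by
	 * doubling until it can hold `needed` entries. Returns 0 on
	 * allocation failure, leaving the original buffer intact. */
	static int
	grow_to_fit(void **buf, size_t *ceil, size_t needed, size_t entry_size)
	{
		size_t nceil = (*ceil == 0) ? 16 : 2 * *ceil;
		while (nceil < needed) {
			nceil *= 2;
		}
		void *nbuf = realloc(*buf, nceil * entry_size);
		if (nbuf == NULL) {
			return 0;
		}
		*buf = nbuf;
		*ceil = nceil;	/* this update was previously dropped */
		return 1;
	}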
--- src/libfsm/epsilons.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index e63ae50ce..52b73db6c 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -265,20 +265,26 @@ carry_end_metadata(struct carry_end_metadata_env *env, enum fsm_getendids_res id_res; size_t written; if (id_count > env->end.ceil) { /* grow buffer */ - const size_t nceil = (env->end.ceil == 0) + size_t nceil = (env->end.ceil == 0) ? DEF_END_METADATA_ENDIDS_CEIL : 2*env->end.ceil; + while (nceil < id_count) { + nceil *= 2; + } assert(nceil > 0); - env->end.ids = f_realloc(env->alloc, + fsm_end_id_t *nids = f_realloc(env->alloc, env->end.ids, nceil * sizeof(env->end.ids[0])); - if (env->end.ids == NULL) { + if (nids == NULL) { return 0; } + env->end.ids = nids; + env->end.ceil = nceil; } id_res = fsm_getendids(env->fsm, end_state, id_count, env->end.ids, &written); - assert(id_res != FSM_GETENDIDS_ERROR_INSUFFICIENT_SPACE); + assert(id_res == FSM_GETENDIDS_FOUND); + assert(written == id_count); for (i = 0; i < id_count; i++) { #if LOG_COPYING From e2ac4d7ba4630b52eb39a89abedc1f81b9b5a039 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 15 Jun 2023 14:00:25 -0400 Subject: [PATCH 15/51] Address a couple warnings from scan-build. determinise: It's not possible to find a cached result in the hash table without allocating a to-set buffer first, so assert that it will be non-NULL. fsm_findmode: This should never be used on a state without edges. vm/v1.c and vm/v2.c: Free allocated return value on error. --- src/libfsm/determinise.c | 2 ++ src/libfsm/mode.c | 4 ++++ src/libfsm/vm/v1.c | 4 +++- src/libfsm/vm/v2.c | 5 ++++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 817fd335b..3dc1a4268 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -1260,6 +1260,7 @@ to_set_htab_check(struct analyze_closures_env *env, if (b->count == 0) { return 0; /* empty bucket -> not found */ } else if (b->count == count) { + assert(env->to_sets.buf != NULL); assert(b->offset + count <= env->to_sets.used); const fsm_state_t *ids = &env->to_sets.buf[b->offset]; if (0 == memcmp(ids, dst, count * sizeof(dst[0]))) { @@ -1386,6 +1387,7 @@ save_to_set(struct analyze_closures_env *env, env->to_sets.ceil = nceil; env->to_sets.buf = nbuf; } + assert(env->to_sets.buf != NULL); #if LOG_TO_SET static size_t to_set_id; diff --git a/src/libfsm/mode.c b/src/libfsm/mode.c index 76c60b8ad..87af0bdf9 100644 --- a/src/libfsm/mode.c +++ b/src/libfsm/mode.c @@ -28,6 +28,7 @@ fsm_findmode(const struct fsm *fsm, fsm_state_t state, unsigned int *freq) } mode; mode.freq = 1; + mode.state = (fsm_state_t)-1; edge_set_group_iter_reset(fsm->states[state].edges, EDGE_GROUP_ITER_ALL, &iter); while (edge_set_group_iter_next(&iter, &info)) { @@ -46,6 +47,9 @@ fsm_findmode(const struct fsm *fsm, fsm_state_t state, unsigned int *freq) *freq = mode.freq; } + /* It's not meaningful to call this on a state without edges. 
*/ + assert(mode.state != (fsm_state_t)-1); + assert(mode.freq >= 1); return mode.state; } diff --git a/src/libfsm/vm/v1.c b/src/libfsm/vm/v1.c index a326b88d8..de1f6ea93 100644 --- a/src/libfsm/vm/v1.c +++ b/src/libfsm/vm/v1.c @@ -217,7 +217,9 @@ encode_opasm_v1(const struct dfavm_vm_op *instr, size_t ninstr, size_t total_byt return ret; error: - /* XXX - cleanup */ + if (ret != NULL) { + free(ret); + } return NULL; } diff --git a/src/libfsm/vm/v2.c b/src/libfsm/vm/v2.c index c85edff98..07eb12ef4 100644 --- a/src/libfsm/vm/v2.c +++ b/src/libfsm/vm/v2.c @@ -155,7 +155,10 @@ encode_opasm_v2(const struct dfavm_vm_op *instr, size_t ninstr) return ret; error: - /* XXX - cleanup */ + if (ret != NULL) { + free(ret); + } + return NULL; } From c11f372ae5656e8a7e59e2d421bfb8bc17593a99 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 15 Jun 2023 15:59:26 -0400 Subject: [PATCH 16/51] fuzz/target.c: In MULTI mode, check endid behavior. For each pattern 0..n that will be combined, set an endid on them. Then, generate inputs that match, and check that the endid result on the single and combined FSMs are consistent. --- fuzz/target.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/fuzz/target.c b/fuzz/target.c index 1b5b30e7e..21d6589de 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -877,6 +877,11 @@ build_and_check_multi(const char *input) } if (j > 0) { count++; } + if (count == 1) { + /* multi mode with only one pattern is pointless */ + return EXIT_SUCCESS; + } + struct re_err err; const enum re_flags flags = 0; @@ -894,6 +899,7 @@ build_and_check_multi(const char *input) } /* compile each individually */ + /* FIXME: apply and check endids */ for (size_t i = 0; i < count; i++) { if (verbosity > 1) { fprintf(stderr, "%s: compiling \"%s\"\n", @@ -911,6 +917,11 @@ build_and_check_multi(const char *input) goto cleanup; } + /* set endid to associate each FSM with its pattern */ + if (!fsm_setendid(fsm, (fsm_end_id_t)i)) { + goto cleanup; + } + char label_buf[100]; snprintf(label_buf, 100, "single_determisise_%zu", i); @@ -1102,6 +1113,7 @@ cmp_combined_and_separate_cb(const struct fsm *fsm, fsm_state_t end_state, void *opaque) { struct cmp_combined_env *env = opaque; + (void)end_state; if (steps > env->max_steps) { return FSM_GENERATE_MATCHES_CB_RES_HALT; @@ -1119,11 +1131,15 @@ cmp_combined_and_separate_cb(const struct fsm *fsm, struct fsm_capture captures_single[MAX_CAPTURES]; struct fsm_capture captures_combined[MAX_CAPTURES]; + const fsm_end_id_t expected_end_id = (fsm_end_id_t)env->current_i; + const uint8_t *u8_input = (const uint8_t *)input; + fsm_state_t end_state_combined, end_state_single; + const int res_combined = fsm_exec_with_captures(env->combined_fsm, u8_input, input_length, - &end_state, captures_combined, MAX_CAPTURES); + &end_state_combined, captures_combined, MAX_CAPTURES); const int res_single = fsm_exec_with_captures(fsm, u8_input, input_length, - &end_state, captures_single, MAX_CAPTURES); + &end_state_single, captures_single, MAX_CAPTURES); if (res_combined != res_single) { env->ok = false; @@ -1134,11 +1150,44 @@ cmp_combined_and_separate_cb(const struct fsm *fsm, return FSM_GENERATE_MATCHES_CB_RES_HALT; } + fsm_end_id_t id_buf_combined[MAX_PATTERNS]; + size_t written_combined = 0; + if (res_combined > 0) { + const size_t exp_written = fsm_getendidcount(env->combined_fsm, end_state_combined); + assert(exp_written <= env->count); + const enum fsm_getendids_res gres = fsm_getendids(env->combined_fsm, + 
end_state_combined, MAX_PATTERNS, id_buf_combined, &written_combined); + assert(gres == FSM_GETENDIDS_FOUND); + assert(written_combined == exp_written); + } + if (res_single > 0) { if (env->verbosity > 3) { fprintf(stderr, "%s: res %d (single and combined)\n", __func__, res_single); } + /* Check that the end state's endid for the single DFA is among the + * endids for the combined DFA's end state. */ + assert(fsm_getendidcount(fsm, end_state_single) == 1); + assert(fsm_getendidcount(env->combined_fsm, end_state_combined) <= env->count); + + fsm_end_id_t id_buf_single[1]; + size_t written; + const enum fsm_getendids_res gres = fsm_getendids(fsm, + end_state_single, 1, id_buf_single, &written); + assert(gres == FSM_GETENDIDS_FOUND); + assert(written == 1); + assert(id_buf_single[0] == expected_end_id); + + bool found_single_id_in_combined = false; + for (size_t i = 0; i < written_combined; i++) { + if (id_buf_combined[i] == expected_end_id) { + found_single_id_in_combined = true; + break; + } + } + assert(found_single_id_in_combined); + bool matching = true; const unsigned base = env->bases[env->current_i].capture; assert(base < MAX_CAPTURES); @@ -1165,6 +1214,18 @@ cmp_combined_and_separate_cb(const struct fsm *fsm, env->ok = false; return FSM_GENERATE_MATCHES_CB_RES_HALT; } + } else if (res_combined > 0) { + /* This matched the combined DFA but not the single one, + * so check that the single DFA's end id is *absent* + * from the combined DFA's end state. */ + bool found_single_id_in_combined = false; + for (size_t i = 0; i < written_combined; i++) { + if (id_buf_combined[i] == expected_end_id) { + found_single_id_in_combined = true; + break; + } + } + assert(!found_single_id_in_combined); } return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; From 8b49a1a1071df47107dbb370c0490bb06f861310 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 10 Jul 2023 13:50:27 -0400 Subject: [PATCH 17/51] minimisation: Distinct sets of end IDs should not split ECs. In minimise.c, `split_ecs_by_end_metadata` does a first pass splitting ECs based on their end data (like the name says). This sets which end metadata should prevent states from starting out in the same EC, effectively which states can/cannot be combined once minimisation's analysis is done. Previously, distinct sets of end IDs would keep states from merging, but if epsilon removal and determinisation have led to end states with distinct sets of end IDs, that alone shouldn't prevent them from merging later -- the same end state just becomes associated with all those end IDs. We do prevent states with distinct capture ID sets from merging, but that's because of a few special cases like "^a(b)c$" and "^a(b*)c$", where combining partially overlapping regexes' end states could lead to false positives in the capture result. Note: I added checking the program IDs (which was missing) to `same_end_metadata`. This seems correct to me, but at the moment I can't think of any test inputs that would lead to the same sets of capture IDs but distinct sets of program IDs. I will see if fuzzing can find any. This is tested by tests/endids/endids2_union_many_endids.c and the new multi test in tests/capture/capture_test_case_list.c. 
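
Concretely, the remaining comparison is exact equality of the sorted
capture ID and capture-program ID sets; end IDs are deliberately left
out, since a merged end state simply carries the union of the end IDs.
A minimal sketch of the per-set check (the in-tree version is
same_end_metadata, in the diff below):

	#include <stddef.h>

	/* Two sorted ID sets are compatible only if identical; states
	 * whose sets differ must stay in separate ECs. */
	static int
	same_id_set(size_t a_count, const unsigned *a,
	    size_t b_count, const unsigned *b)
	{
		if (a_count != b_count) {
			return 0;
		}
		for (size_t i = 0; i < a_count; i++) {
			if (a[i] != b[i]) {
				return 0;
			}
		}
		return 1;
	}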
--- src/libfsm/minimise.c | 76 ++------------------------ tests/capture/capture_test_case_list.c | 70 ++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 70 deletions(-) diff --git a/src/libfsm/minimise.c b/src/libfsm/minimise.c index 39929b4ac..6cfd7968e 100644 --- a/src/libfsm/minimise.c +++ b/src/libfsm/minimise.c @@ -53,11 +53,6 @@ all_end_states_are_currently_together(const struct min_env *env); #define DEF_CAPTURE_ID_CEIL 4 struct end_metadata { - struct end_metadata_end { - unsigned count; - fsm_end_id_t *ids; - } end; - struct end_metadata_capture { unsigned count; unsigned ceil; @@ -71,10 +66,6 @@ struct end_metadata { } program; }; -static int -collect_end_ids(const struct fsm *fsm, fsm_state_t s, - struct end_metadata_end *e); - static int collect_capture_ids(const struct fsm *fsm, fsm_state_t s, struct end_metadata_capture *c); @@ -732,24 +723,23 @@ incremental_hash_of_ids(uint64_t *accum, fsm_end_id_t id) static int same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) { - if (a->end.count != b->end.count) { + if (a->capture.count != b->capture.count) { return 0; } - if (a->capture.count != b->capture.count) { + if (a->program.count != b->program.count) { return 0; } /* compare -- these must be sorted */ - for (size_t i = 0; i < a->end.count; i++) { - if (a->end.ids[i] != b->end.ids[i]) { + for (size_t i = 0; i < a->capture.count; i++) { + if (a->capture.ids[i] != b->capture.ids[i]) { return 0; } } - - for (size_t i = 0; i < a->capture.count; i++) { - if (a->capture.ids[i] != b->capture.ids[i]) { + for (size_t i = 0; i < a->program.count; i++) { + if (a->program.ids[i] != b->program.ids[i]) { return 0; } } @@ -806,13 +796,6 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) break; /* this EC has non-end states, skip */ } - /* FIXME: should distinct end IDs partition here? - * Disabled to make tests/endids/endids2_union_many_endids.c pass. */ - if (0 && - !collect_end_ids(fsm, s, &e->end)) { - goto cleanup; - } - if (!collect_capture_ids(fsm, s, &e->capture)) { goto cleanup; } @@ -849,9 +832,6 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) uint64_t hash = 0; const fsm_state_t next = env->jump[s]; - for (size_t eid_i = 0; eid_i < s_md->end.count; eid_i++) { - incremental_hash_of_ids(&hash, s_md->end.ids[eid_i]); - } for (size_t pid_i = 0; pid_i < s_md->program.count; pid_i++) { incremental_hash_of_ids(&hash, s_md->program.ids[pid_i]); } @@ -996,9 +976,6 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) size_t i; for (i = 0; i < state_count; i++) { struct end_metadata *e = &end_md[i]; - if (e->end.ids != NULL) { - f_free(fsm->opt->alloc, e->end.ids); - } if (e->capture.ids != NULL) { f_free(fsm->opt->alloc, e->capture.ids); } @@ -1012,14 +989,6 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) return res; } -static int -cmp_end_ids(const void *pa, const void *pb) -{ - const fsm_end_id_t a = *(fsm_end_id_t *)pa; - const fsm_end_id_t b = *(fsm_end_id_t *)pb; - return a < b ? -1 : a > b ? 1 : 0; -} - static int cmp_unsigned(const void *pa, const void *pb) { @@ -1028,39 +997,6 @@ cmp_unsigned(const void *pa, const void *pb) return a < b ? -1 : a > b ? 
1 : 0; } -static int -collect_end_ids(const struct fsm *fsm, fsm_state_t s, - struct end_metadata_end *e) -{ - e->count = fsm_getendidcount(fsm, s); - - if (e->count > 0) { - e->ids = f_malloc(fsm->opt->alloc, - e->count * sizeof(e->ids[0])); - if (e->ids == NULL) { - return 0; - } - -#if LOG_ECS - fprintf(stderr, "%d:", s); -#endif - - size_t written; - enum fsm_getendids_res gres = fsm_getendids(fsm, - s, e->count, e->ids, &written); - assert(gres == FSM_GETENDIDS_FOUND); - assert(written == e->count); - -#if LOG_ECS - fprintf(stderr, "\n"); -#endif - /* sort, to make comparison easier later */ - qsort(e->ids, e->count, - sizeof(e->ids[0]), cmp_end_ids); - } - return 1; -} - struct collect_capture_env { int ok; const struct fsm_alloc *alloc; diff --git a/tests/capture/capture_test_case_list.c b/tests/capture/capture_test_case_list.c index ea82e82de..d05210d2c 100644 --- a/tests/capture/capture_test_case_list.c +++ b/tests/capture/capture_test_case_list.c @@ -1525,6 +1525,76 @@ const struct captest_case_multi multi_cases[] = { }, }, }, + + { + /* This checks that minimisation doesn't incorrectly + * merge these and lead to capture false positives. */ + .regex_count = 2, + .regexes = { + "^a(b)c$", /* exactly one 'b' */ + "^a(b*)c$", /* any number of 'b's */ + }, + .inputs = { + { + .input = "", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = POS_NONE }, + { .regex = 1, .capture = 1, .pos = POS_NONE }, + }, + }, + { + .input = "a", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = POS_NONE }, + { .regex = 1, .capture = 1, .pos = POS_NONE }, + }, + }, + { + .input = "ab", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = POS_NONE }, + { .regex = 1, .capture = 1, .pos = POS_NONE }, + }, + }, + { + .input = "ac", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = { 0, 2 } }, + { .regex = 1, .capture = 1, .pos = { 1, 1 } }, + }, + }, + { + .input = "abc", + .expected = { + { .regex = 0, .capture = 0, .pos = {0, 3 } }, + { .regex = 0, .capture = 1, .pos = {1, 2 } }, + { .regex = 1, .capture = 0, .pos = { 0, 3 } }, + { .regex = 1, .capture = 1, .pos = { 1, 2 } }, + }, + }, + { + .input = "abbc", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = { 0, 4 } }, + { .regex = 1, .capture = 1, .pos = { 1, 3 } }, + }, + }, + + { + .input = NULL, + }, + }, + } }; From a3a79cac1b31bef478ce885017fd99f7ec7f1850 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 10 Jul 2023 14:02:31 -0400 Subject: [PATCH 18/51] Remove theft test harness for deleted ADT (ipriq). 
--- theft/Makefile | 1 - theft/fuzz_adt_ipriq.c | 197 ----------------------------------------- 2 files changed, 198 deletions(-) delete mode 100644 theft/fuzz_adt_ipriq.c diff --git a/theft/Makefile b/theft/Makefile index 0d38d8cfc..921c482a9 100644 --- a/theft/Makefile +++ b/theft/Makefile @@ -6,7 +6,6 @@ SRC += theft/util.c SRC += theft/wrap.c SRC += theft/fuzz_adt_edge_set.c -SRC += theft/fuzz_adt_ipriq.c SRC += theft/fuzz_adt_priq.c SRC += theft/fuzz_capture_string_set.c SRC += theft/fuzz_literals.c diff --git a/theft/fuzz_adt_ipriq.c b/theft/fuzz_adt_ipriq.c deleted file mode 100644 index 1847ef6ce..000000000 --- a/theft/fuzz_adt_ipriq.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2021 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include "type_info_adt_ipriq.h" - -#include -#include - -struct model { - size_t used; - size_t entries[]; -}; - -static enum ipriq_cmp_res -cmp_size_t(size_t a, size_t b, void *opaque) -{ - (void)opaque; - return a < b ? IPRIQ_CMP_LT : - a > b ? IPRIQ_CMP_GT : IPRIQ_CMP_EQ; -} - -static int exec_add(size_t x, struct model *m, struct ipriq *pq) -{ - if (!ipriq_add(pq, x)) { - return 0; - } - - m->entries[m->used] = x; - m->used++; - return 1; -} - -static int find_min_pos(const struct model *m, size_t *pos) -{ - size_t i; - if (m->used == 0) { - return 0; - } - - size_t res, min; - res = 0; - min = m->entries[0]; - - for (i = 1; i < m->used; i++) { - if (m->entries[i] < min) { - res = i; - min = m->entries[i]; - } - } - *pos = res; - return 1; -} - -static int exec_peek(struct model *m, struct ipriq *pq) -{ - size_t res; - - if (!ipriq_peek(pq, &res)) { - return m->used == 0; - } - - size_t pos; - if (!find_min_pos(m, &pos)) { - assert(!"unreachable (peek)"); - } - - return res == m->entries[pos]; -} - -static int exec_pop(struct model *m, struct ipriq *pq) -{ - size_t res; - - if (!ipriq_pop(pq, &res)) { - return m->used == 0; - } - - size_t pos; - if (!find_min_pos(m, &pos)) { - assert(!"unreachable (pop)"); - } - - if (res != m->entries[pos]) { - return 0; - } - - assert(m->used > 0); - if (pos < m->used - 1) { - m->entries[pos] = m->entries[m->used - 1]; - } - m->used--; - return 1; -} - -static enum theft_trial_res -compare_against_model(const struct ipriq_scenario *scen) -{ - enum theft_trial_res res = THEFT_TRIAL_FAIL; - size_t i; - - struct model *m = malloc(sizeof(*m) - + scen->count * sizeof(m->entries[0])); - if (m == NULL) { - return THEFT_TRIAL_ERROR; - } - m->used = 0; - - struct ipriq *pq = ipriq_new(NULL, cmp_size_t, NULL); - if (pq == NULL) { - return THEFT_TRIAL_ERROR; - } - - for (i = 0; i < scen->count; i++) { - const struct ipriq_op *op = &scen->ops[i]; - - switch (op->t) { - case IPRIQ_OP_ADD: - if (!exec_add(op->u.add.x, m, pq)) { - goto cleanup; - } - break; - - case IPRIQ_OP_PEEK: - if (!exec_peek(m, pq)) { - goto cleanup; - } - break; - - case IPRIQ_OP_POP: - if (!exec_pop(m, pq)) { - goto cleanup; - } - break; - - default: - assert(false); break; - } - } - - res = THEFT_TRIAL_PASS; - -cleanup: - free(m); - - return res; -} - -static enum theft_trial_res -prop_ipriq_model(struct theft *t, void *arg1) -{ - const struct ipriq_scenario *scen = arg1; - (void)t; - return compare_against_model(scen); -} - -static bool -test_ipriq(theft_seed seed, uintptr_t limit) -{ - enum theft_run_res res; - - struct ipriq_hook_env env = { - .tag = 'I', - .limit = limit, - }; - - struct theft_run_config config = { - .name = __func__, - .prop1 = prop_ipriq_model, - .type_info = { &type_info_adt_ipriq }, - .trials = 1000, - 
.hooks = { - .trial_pre = theft_hook_first_fail_halt, - .env = &env, - }, - .fork = { - .enable = true, - }, - - .seed = seed, - }; - - (void)limit; - - res = theft_run(&config); - printf("%s: %s\n", __func__, theft_run_res_str(res)); - - return res == THEFT_RUN_PASS; -} - -void -register_test_adt_ipriq(void) -{ - reg_test1("adt_ipriq", test_ipriq, 10000); -} From bfc9a804509c237894fd371c2e498740bee79736 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 11 Jul 2023 10:47:09 -0400 Subject: [PATCH 19/51] capture_vm_exec: Address scan-build warnings. There were several unused store warnings about values that were only set for logging, either `(void)x` them out or restructure the code so that their scope is more limited. Remove `next_offset` from `populate_solution`, since that isn't being used at all. IIRC it was added before some of the path sharing details made it unnecessary. --- src/libfsm/capture_vm_exec.c | 48 ++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/src/libfsm/capture_vm_exec.c b/src/libfsm/capture_vm_exec.c index 5e33f159c..f2d0b522e 100644 --- a/src/libfsm/capture_vm_exec.c +++ b/src/libfsm/capture_vm_exec.c @@ -659,6 +659,10 @@ cmp_paths(struct capvm *vm, uint32_t p_a, uint32_t p_b) const uint32_t backlink_a = get_path_node_backlink(vm, link_a); const uint32_t backlink_b = get_path_node_backlink(vm, link_b); + /* These are only used for logging, which may compile out. */ + (void)backlink_a; + (void)backlink_b; + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: link_a %d (offset %u, prev %d), link_b %d (offset %u, prev %d)\n", __func__, link_a, offset_a, prev_a, link_b, offset_b, prev_b); @@ -1506,6 +1510,7 @@ extend_path_info(struct capvm *vm, uint32_t pi_id, bool greedy, uint32_t uniq_id if (LOG_CAPVM >= 4 || 1) { const uint32_t refcount = get_path_node_refcount(vm, epi_id); + (void)refcount; LOG(4 - LOG_EPI, "%s: pi[%u] refcount %u -> %u (reusing identical path backlink %u instead of %u)\n", __func__, epi_id, refcount, refcount + 1, epi_id, pi_id); @@ -1593,7 +1598,6 @@ populate_solution(struct capvm *vm) uint32_t path_link = vm->solution.best_path_id; uint32_t next_link = NO_ID; - uint32_t next_offset = NO_POS; uint32_t first_link = NO_ID; size_t split_count = 0; @@ -1648,33 +1652,34 @@ populate_solution(struct capvm *vm) prev = prev_link; } - next_offset = get_path_node_offset(vm, path_link); next_link = path_link; assert(path_link != prev_link); path_link = prev_link; } while (path_link != NO_ID && path_link != COLLAPSED_ZERO_PREFIX_ID); /* iter forward */ - uint32_t cur = first_link; - if (LOG_CAPVM >= 3) do { - struct capvm_path_info *pi = &vm->paths.pool[cur]; + if (LOG_CAPVM >= 3) { + uint32_t cur = first_link; + do { + struct capvm_path_info *pi = &vm->paths.pool[cur]; - assert(IS_PATH_NODE(pi)); - LOG(3, "%s (moving fwd): node %u: refcount %u, used %u, offset %u, fwdlink %d, bits '", - __func__, cur, get_path_node_refcount(vm, cur), - pi->u.path.used, - get_path_node_offset(vm, cur), - get_path_node_backlink(vm, cur)); - for (uint8_t i = 0; i < pi->u.path.used; i++) { - const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); - LOG(3, "%c", bit ? 
'1' : '0'); - } - LOG(3, "'\n"); + assert(IS_PATH_NODE(pi)); + LOG(3, "%s (moving fwd): node %u: refcount %u, used %u, offset %u, fwdlink %d, bits '", + __func__, cur, get_path_node_refcount(vm, cur), + pi->u.path.used, + get_path_node_offset(vm, cur), + get_path_node_backlink(vm, cur)); + for (uint8_t i = 0; i < pi->u.path.used; i++) { + const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); + LOG(3, "%c", bit ? '1' : '0'); + } + LOG(3, "'\n"); - const uint32_t next_cur = get_path_node_backlink(vm, cur); - assert(cur != next_cur); - cur = next_cur; /* fwd link */ - } while (cur != NO_ID); + const uint32_t next_cur = get_path_node_backlink(vm, cur); + assert(cur != next_cur); + cur = next_cur; /* fwd link */ + } while (cur != NO_ID); + } /* evaluate program with forward path */ LOG(3, "%s: split_count %zu\n", __func__, split_count); @@ -1693,7 +1698,7 @@ populate_solution(struct capvm *vm) * as (0,1). */ bool explicitly_matched_nl_at_end = false; - cur = first_link; + uint32_t cur = first_link; while (split_i < split_count || !done) { assert(prog_i < vm->p->used); const uint32_t cur_prog_i = prog_i; @@ -1901,6 +1906,7 @@ populate_solution(struct capvm *vm) for (uint8_t i = 0; i < pi->u.path.used; i++) { const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); LOG(3, "%c", (pi->u.path.bits & bit) ? '1' : '0'); + (void)bit; } LOG(3, "'\n"); From f43585ab012a2d922fb47ff48e6b0d9c3a390667 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 17 Jul 2023 13:39:15 -0400 Subject: [PATCH 20/51] fuzz/target.c: Expand build_and_check_multi. Also compare in the other direction, generating matching inputs from the combined DFA and then check that individual regexes's captures still match as expected. --- fuzz/target.c | 236 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 176 insertions(+), 60 deletions(-) diff --git a/fuzz/target.c b/fuzz/target.c index 21d6589de..0166f7cf2 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -813,12 +813,12 @@ compare_with_pcre(const char *pattern, struct fsm *fsm) /* Note: combined_fsm and fsms[] are non-const because fsm_generate_matches * calls fsm_trim on them. 
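 * (fsm_trim removes states that cannot appear on any path from the
 * start state to an end state, which is why it needs mutable FSMs.)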
*/ static int -compare_combined_and_separate(int verbosity, size_t max_length, size_t count, +compare_separate_and_combined(int verbosity, size_t max_length, size_t count, struct fsm *combined_fsm, const struct fsm_combined_base_pair *bases, struct fsm **fsms); static enum fsm_generate_matches_cb_res -cmp_combined_and_separate_cb(const struct fsm *fsm, +cmp_separate_and_combined_cb(const struct fsm *fsm, size_t depth, size_t match_count, size_t steps, const char *input, size_t input_length, fsm_state_t end_state, void *opaque); @@ -1015,7 +1015,7 @@ build_and_check_multi(const char *input) fsm_capture_dump(stderr, "combined", combined_fsm); } - res = compare_combined_and_separate(verbosity, max_length, + res = compare_separate_and_combined(verbosity, max_length, count, combined_fsm, bases, (struct fsm **)fsms_cp); for (i = 0; i < count; i++) { @@ -1068,8 +1068,14 @@ struct cmp_combined_env { size_t max_match_count; }; +static enum fsm_generate_matches_cb_res +cmp_combined_with_separate_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + static int -compare_combined_and_separate(int verbosity, size_t max_length, size_t count, +compare_separate_and_combined(int verbosity, size_t max_length, size_t count, struct fsm *combined_fsm, const struct fsm_combined_base_pair *bases, struct fsm **fsms) { @@ -1093,21 +1099,29 @@ compare_combined_and_separate(int verbosity, size_t max_length, size_t count, * they match with the same captures in the combined fsm. */ for (env.current_i = 0; env.current_i < count; env.current_i++) { if (!fsm_generate_matches(env.fsms[env.current_i], max_length, - cmp_combined_and_separate_cb, &env)) { + cmp_separate_and_combined_cb, &env)) { env.ok = false; } if (!env.ok) { break; } } + env.current_i = (size_t)-1; - /* TODO: also generate matches with combined and check the individual ones */ + /* Also go in the other direction, generating matches with + * combined and check the individual ones match as expected. */ + if (env.ok) { + if (!fsm_generate_matches(env.combined_fsm, max_length, + cmp_combined_with_separate_cb, &env)) { + env.ok = false; + } + } return env.ok ? EXIT_SUCCESS : EXIT_FAILURE; } static enum fsm_generate_matches_cb_res -cmp_combined_and_separate_cb(const struct fsm *fsm, +cmp_separate_and_combined_cb(const struct fsm *fsm, size_t depth, size_t match_count, size_t steps, const char *input, size_t input_length, fsm_state_t end_state, void *opaque) @@ -1161,76 +1175,178 @@ cmp_combined_and_separate_cb(const struct fsm *fsm, assert(written_combined == exp_written); } - if (res_single > 0) { + /* we got here, so we have a match */ + assert(res_single > 0); + + if (env->verbosity > 3) { + fprintf(stderr, "%s: res %d (single and combined)\n", __func__, res_single); + } + + /* Check that the end state's endid for the single DFA is among the + * endids for the combined DFA's end state. 
*/ + assert(fsm_getendidcount(fsm, end_state_single) == 1); + assert(fsm_getendidcount(env->combined_fsm, end_state_combined) <= env->count); + + fsm_end_id_t id_buf_single[1]; + size_t written; + const enum fsm_getendids_res gres = fsm_getendids(fsm, + end_state_single, 1, id_buf_single, &written); + assert(gres == FSM_GETENDIDS_FOUND); + assert(written == 1); + assert(id_buf_single[0] == expected_end_id); + + bool found_single_id_in_combined = false; + for (size_t i = 0; i < written_combined; i++) { + if (id_buf_combined[i] == expected_end_id) { + found_single_id_in_combined = true; + break; + } + } + assert(found_single_id_in_combined); + + bool matching = true; + const unsigned base = env->bases[env->current_i].capture; + assert(base < MAX_CAPTURES); + for (int i = 0; i < res_single; i++) { if (env->verbosity > 3) { - fprintf(stderr, "%s: res %d (single and combined)\n", __func__, res_single); + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + if ((captures_single[i].pos[0] != captures_combined[i + base].pos[0]) || + (captures_single[i].pos[1] != captures_combined[i + base].pos[1])) { + matching = false; } + } + + if (!matching) { + for (int i = 0; i < res_single; i++) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; +} + +static enum fsm_generate_matches_cb_res +cmp_combined_with_separate_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + /* We have an input that matched the combined DFA, + * use the set of end IDs to check which of the + * single DFAs it should/should not match, and check + * the endid behavior. */ + + struct cmp_combined_env *env = opaque; + + if (steps > env->max_steps) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } - /* Check that the end state's endid for the single DFA is among the - * endids for the combined DFA's end state. 
*/ - assert(fsm_getendidcount(fsm, end_state_single) == 1); - assert(fsm_getendidcount(env->combined_fsm, end_state_combined) <= env->count); + if (depth > env->max_depth) { + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + if (match_count > env->max_match_count) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + +#define MAX_CAPTURES 256 + struct fsm_capture captures_single[MAX_CAPTURES]; + struct fsm_capture captures_combined[MAX_CAPTURES]; - fsm_end_id_t id_buf_single[1]; - size_t written; - const enum fsm_getendids_res gres = fsm_getendids(fsm, - end_state_single, 1, id_buf_single, &written); + const uint8_t *u8_input = (const uint8_t *)input; + + fsm_state_t end_state_combined; + assert(fsm == env->combined_fsm); + const int res_combined = fsm_exec_with_captures(env->combined_fsm, u8_input, input_length, + &end_state_combined, captures_combined, MAX_CAPTURES); + assert(res_combined > 0); /* we got here, so we have a match */ + assert(end_state_combined == end_state); + + fsm_end_id_t id_buf_combined[MAX_PATTERNS]; + size_t written_combined = 0; + { + const size_t exp_written = fsm_getendidcount(env->combined_fsm, end_state_combined); + assert(exp_written <= env->count); + const enum fsm_getendids_res gres = fsm_getendids(env->combined_fsm, + end_state_combined, MAX_PATTERNS, id_buf_combined, &written_combined); assert(gres == FSM_GETENDIDS_FOUND); - assert(written == 1); - assert(id_buf_single[0] == expected_end_id); + assert(written_combined == exp_written); + } - bool found_single_id_in_combined = false; - for (size_t i = 0; i < written_combined; i++) { - if (id_buf_combined[i] == expected_end_id) { - found_single_id_in_combined = true; + /* For each pattern, check if its endid is in the combined DFA's end state + * endids. If so, it should match, otherwise it should not. 
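+	 * For example (hypothetical numbers): if three patterns were
+	 * combined and the end state reached for some generated input
+	 * carries endids {0, 2}, then single DFAs 0 and 2 must match
+	 * that input, and single DFA 1 must not.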
*/ + for (size_t pattern_i = 0; pattern_i < env->count; pattern_i++) { + const struct fsm *single_fsm = env->fsms[pattern_i]; + bool found = false; + for (size_t endid_i = 0; endid_i < written_combined; endid_i++) { + const fsm_end_id_t endid = id_buf_combined[endid_i]; + if (endid == pattern_i) { + found = true; break; } } - assert(found_single_id_in_combined); - - bool matching = true; - const unsigned base = env->bases[env->current_i].capture; - assert(base < MAX_CAPTURES); - for (int i = 0; i < res_single; i++) { - if (env->verbosity > 3) { - fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", - i, res_single, - captures_single[i].pos[0], captures_single[i].pos[1], - captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); - } - if ((captures_single[i].pos[0] != captures_combined[i + base].pos[0]) || - (captures_single[i].pos[1] != captures_combined[i + base].pos[1])) { - matching = false; - } - } - - if (!matching) { + fsm_state_t end_state_single; + + const int res_single = fsm_exec_with_captures(single_fsm, + u8_input, input_length, + &end_state_single, captures_single, MAX_CAPTURES); + + if (found) { + assert(res_single > 0); + fsm_end_id_t id_buf_single[1]; + size_t written; + const enum fsm_getendids_res gres = fsm_getendids(single_fsm, + end_state_single, 1, id_buf_single, &written); + assert(gres == FSM_GETENDIDS_FOUND); + assert(written == 1); + assert(id_buf_single[0] == pattern_i); + + /* check captures */ + bool matching = true; + const unsigned base = env->bases[pattern_i].capture; + assert(base < MAX_CAPTURES); for (int i = 0; i < res_single; i++) { - fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", - i, res_single, - captures_single[i].pos[0], captures_single[i].pos[1], - captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + if (env->verbosity > 3) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + if ((captures_single[i].pos[0] != captures_combined[i + base].pos[0]) || + (captures_single[i].pos[1] != captures_combined[i + base].pos[1])) { + matching = false; + } } - env->ok = false; - return FSM_GENERATE_MATCHES_CB_RES_HALT; - } - } else if (res_combined > 0) { - /* This matched the combined DFA but not the single one, - * so check that the single DFA's end id is *absent* - * from the combined DFA's end state. 
*/ - bool found_single_id_in_combined = false; - for (size_t i = 0; i < written_combined; i++) { - if (id_buf_combined[i] == expected_end_id) { - found_single_id_in_combined = true; - break; + + if (!matching) { + for (int i = 0; i < res_single; i++) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; } + } else { + assert(res_single == 0); /* no match */ } - assert(!found_single_id_in_combined); } return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; } + #define DEF_MAX_SHUFFLE 10 #define DEF_MAX_MINIMISE_ORACLE_STATE_COUNT 1000 @@ -1622,7 +1738,7 @@ harness_fuzzer_target(const uint8_t *data, size_t size) const bool det = b0 & 0x1; const bool min = b0 & 0x2; const enum fsm_io io_mode = (b0 >> 2) % 3; - + const char *shifted_pattern = (const char *)&data_buf[1]; int res = fuzz_all_print_functions(dev_null, shifted_pattern, det, min, io_mode); return res; From 3a93f6d15e710b593ad9fd6515f341649bec4f3b Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 17 Jul 2023 14:03:29 -0400 Subject: [PATCH 21/51] re_capvm_compile.c: Update #include for EXPENSIVE_CHECKS. --- src/libre/re_capvm_compile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c index c67db2669..c1d856618 100644 --- a/src/libre/re_capvm_compile.c +++ b/src/libre/re_capvm_compile.c @@ -10,7 +10,7 @@ #include "../libfsm/capture_vm_log.h" /* for EXPENSIVE_CHECKS */ -#include "../libfsm/internal.h" +#include "adt/common.h" #include #include From da51d9e237fbbfde9cd96caf2800dcc6571fb0d9 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 17 Jul 2023 14:04:31 -0400 Subject: [PATCH 22/51] Remove nullable ALT backpatching, it's now unreachable. This was set up to handle edge cases like `^(($)|x)+$` where there is an alt with some nullable cases with anchors, surrounded by + repetition, which turned up during fuzzing. This is an obscure edge case, it is proving very difficult to handle correctly, and there is probably little value in actually doing so. Now we are flagging it as an unsupported PCRE construct in ast_analysis.c's analysis_iter_repetition, so the code using repeated_alt_backpatches is unreachable. Just remove it. --- src/libre/ast.h | 1 - src/libre/ast_analysis.c | 1 - src/libre/re_capvm_compile.c | 122 ----------------------------------- 3 files changed, 124 deletions(-) diff --git a/src/libre/ast.h b/src/libre/ast.h index 20cc3cdb7..3ef0c1f5f 100644 --- a/src/libre/ast.h +++ b/src/libre/ast.h @@ -183,7 +183,6 @@ struct ast_expr { unsigned min; unsigned max; /* can be AST_COUNT_UNBOUNDED */ int contains_empty_groups; - int contains_nullable_alt; } repeat; struct { diff --git a/src/libre/ast_analysis.c b/src/libre/ast_analysis.c index 3ae19841d..548d9f60c 100644 --- a/src/libre/ast_analysis.c +++ b/src/libre/ast_analysis.c @@ -392,7 +392,6 @@ analysis_iter_repetition(struct ast_expr *n, struct ast_expr *outermost_repeat_p assert(repeat_plus_ancestor->type == AST_EXPR_REPEAT); assert(repeat_plus_ancestor->u.repeat.min == 1); assert(repeat_plus_ancestor->u.repeat.max == AST_COUNT_UNBOUNDED); - repeat_plus_ancestor->u.repeat.contains_nullable_alt = 1; /* Repetition of an alt subtree which has a capture group child that * only contains only* anchors is not handled properly yet. 
This diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c index c1d856618..15f5bbfa5 100644 --- a/src/libre/re_capvm_compile.c +++ b/src/libre/re_capvm_compile.c @@ -88,19 +88,6 @@ struct capvm_compile_env { /* linked list */ struct repeated_group_info *prev; } *repeated_groups; - - /* Linked list of nodes used for regexes like '^(($)|x)+$', - * which need to special-case the JMP instruction after the - * nullable '($)' case to replace it with a SPLIT to before - * and after the + repetition. */ - struct repeated_alt_backpatch_info { - const struct ast_expr *repeat; /* must be a + repeat */ - size_t ceil; - size_t used; - unsigned *opcode_offsets; - /* linked list */ - struct repeated_alt_backpatch_info *prev; - } *repeated_alt_backpatches; }; static bool @@ -512,74 +499,6 @@ pop_repeated_group_info(struct capvm_compile_env *env, const struct ast_expr *ex f_free(env->alloc, rgi); } -static bool -push_repeated_alt_backpatch_info(struct capvm_compile_env *env, const struct ast_expr *expr) -{ - assert(expr - && expr->type == AST_EXPR_REPEAT - && expr->u.repeat.min == 1 - && expr->u.repeat.max == AST_COUNT_UNBOUNDED); - struct repeated_alt_backpatch_info *rabi = f_calloc(env->alloc, - 1, sizeof(*rabi)); - if (rabi == NULL) { - return false; - } - rabi->repeat = expr; - rabi->prev = env->repeated_alt_backpatches; - LOG(3 - LOG_REPETITION_CASES, - "%s: pushing node %p onto %p, prev link %p\n", - __func__, (void *)expr, (void *)rabi, (void *)rabi->prev); - env->repeated_alt_backpatches = rabi; - return true; -} - -static bool -append_repeated_alt_backpatch_offset(struct capvm_compile_env *env, unsigned offset) -{ - struct repeated_alt_backpatch_info *rabi = env->repeated_alt_backpatches; - assert(rabi != NULL); - if (rabi->used == rabi->ceil) { - const size_t nceil = (rabi->ceil == 0 - ? 
DEF_REPEATED_ALT_BACKPATCH_CEIL - : 2*rabi->ceil); - LOG(3 - LOG_REPETITION_CASES, - "%s: growing %zu -> %zu\n", __func__, rabi->ceil, nceil); - - unsigned *noffsets = f_realloc(env->alloc, - rabi->opcode_offsets, nceil * sizeof(noffsets[0])); - if (noffsets == NULL) { - return false; - } - rabi->ceil = nceil; - rabi->opcode_offsets = noffsets; - } - - LOG(3 - LOG_REPETITION_CASES, - "%s: pushing offset %u\n", __func__, offset); - rabi->opcode_offsets[rabi->used] = offset; - rabi->used++; - return true; -} - -static void -pop_repeated_alt_backpatch_info(struct capvm_compile_env *env, const struct ast_expr *expr) -{ - struct repeated_alt_backpatch_info *rabi = env->repeated_alt_backpatches; - assert(rabi != NULL); - assert(rabi->repeat == expr); - struct repeated_alt_backpatch_info *prev = rabi->prev; - LOG(3 - LOG_REPETITION_CASES, - "%s: popping %p, prev link %p\n", - __func__, (void *)rabi, (void *)prev); - f_free(env->alloc, rabi->opcode_offsets); - f_free(env->alloc, rabi); - env->repeated_alt_backpatches = prev; -} - -static void -backpatch_repeated_nullable_alt_split(struct capvm_compile_env *env, - const struct ast_expr *expr, struct capvm_program *p, unsigned split_new_dst); - static bool emit_repeated_groups(struct capvm_compile_env *env, struct capvm_program *p); @@ -845,9 +764,6 @@ capvm_compile_iter(struct capvm_compile_env *env, op_split_after->t = CAPVM_OP_SPLIT; op_split_after->u.split.cont = PENDING_OFFSET_ALT_BACKPATCH_JMP; op_split_after->u.split.new = PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS; - if (!append_repeated_alt_backpatch_offset(env, pos_split_after)) { - return false; - } } else { const uint32_t pos_jmp_after = reserve_program_opcode(p); flow_info[c_i].backpatch = pos_jmp_after; @@ -974,12 +890,6 @@ capvm_compile_iter(struct capvm_compile_env *env, return false; } } else if (min == 1 && max == AST_COUNT_UNBOUNDED) { /* + */ - if (expr->u.repeat.contains_nullable_alt) { - if (!push_repeated_alt_backpatch_info(env, expr)) { - return false; - } - } - /* l1: * split l1, l2 * l2: */ @@ -1007,16 +917,6 @@ capvm_compile_iter(struct capvm_compile_env *env, op_split->t = CAPVM_OP_SPLIT; op_split->u.split.cont = pos_l1; op_split->u.split.new = pos_l2; - - /* Update any ALT nodes in the subtree whose SPLIT instructions - * are awaiting backpatching with pos_l2. 
*/ - if (expr->u.repeat.contains_nullable_alt) { - backpatch_repeated_nullable_alt_split(env, expr, p, pos_l2); - } - } - - if (expr->u.repeat.contains_nullable_alt) { - pop_repeated_alt_backpatch_info(env, expr); } } else if (min == 0 && max == 0) { /* {0,0} */ /* ignored, except any groups contained within that could match @@ -1384,28 +1284,6 @@ emit_repeated_groups(struct capvm_compile_env *env, struct capvm_program *p) return true; } -static void -backpatch_repeated_nullable_alt_split(struct capvm_compile_env *env, - const struct ast_expr *expr, struct capvm_program *p, unsigned split_new_dst) -{ - struct repeated_alt_backpatch_info *rabi = env->repeated_alt_backpatches; - assert(rabi != NULL && rabi->repeat == expr); - - for (size_t op_i = 0; op_i < rabi->used; op_i++) { - const unsigned offset = rabi->opcode_offsets[op_i]; - assert(offset < p->used); - LOG(3 - LOG_REPETITION_CASES, - "%s: backpatching SPLIT instruction %u's .new to %u\n", - __func__, offset, split_new_dst); - struct capvm_opcode *op = &p->ops[offset]; - assert(op->t == CAPVM_OP_SPLIT); - assert(op->u.split.new == PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS); - op->u.split.new = split_new_dst; - } - - rabi->used = 0; -} - static bool capvm_compile_iter_save_groups_in_skipped_subtree(struct capvm_compile_env *env, struct capvm_program *p, const struct ast_expr *expr) From 9f3ae29c2c7433218920560a3da6193b47114f6e Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 17 Jul 2023 16:40:21 -0400 Subject: [PATCH 23/51] capture vm: Cleanup & clarifying comments throughout. --- src/libfsm/capture_vm.c | 17 ++++++------- src/libfsm/capture_vm.h | 3 +-- src/libfsm/capture_vm_exec.c | 48 ++++++++++++++++++++++-------------- src/libre/re_capvm_compile.c | 12 ++++++--- 4 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/libfsm/capture_vm.c b/src/libfsm/capture_vm.c index aacae7f9c..7dd1599cd 100644 --- a/src/libfsm/capture_vm.c +++ b/src/libfsm/capture_vm.c @@ -13,9 +13,8 @@ * with PCRE's behavior for libfsm's supported subset of PCRE. * * Instead of giving each green thread its own copy of the capture - * buffers, which uses a prohibitive amount of memory when matching DFAs - * that combine several regexes with several captures each, operate in - * two passes. + * buffers, which uses a prohibitive amount of memory when combining DFAs + * with several captures each, operate in two passes: * * In the first pass, each thread keeps track of its execution path, * appending a bit for each branch: 1 for the greedy option, 0 for the @@ -32,13 +31,13 @@ * a greedier search path (since PCRE's results match the greediest). * * In the second pass, replay the execution path for just the single - * greediest thread, which represents the "best" match, and write - * capture offsets into buffers passed in by the caller. + * greediest thread, which represents the "correct" match (according to + * PCRE semantics), and write capture offsets into buffers passed in by + * the caller. * - * Most of the other differences have to do with matching PCRE's quirky - * behaviors, particularly interactions between newlines and start/end - * anchors. - * */ + * Most of the other differences have to do with matching PCRE + * edge cases, particularly interactions between newlines and start/end + * anchors. 
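+ *
+ * A simplified, hypothetical illustration of the path encoding (not
+ * actual compiler output): for the alternation /a|ab/ on the input
+ * "ab", the thread taking the first alternative appends a 1 bit (its
+ * SPLIT's greedy branch) and the thread taking the second appends a
+ * 0 bit. Both reach MATCH, the path beginning with 1 is the greedier
+ * one, and the second pass replays only that path, so the reported
+ * match is "a", as PCRE prefers earlier alternatives.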
*/ #include "capture_vm.h" #include "capture_vm_program.h" diff --git a/src/libfsm/capture_vm.h b/src/libfsm/capture_vm.h index 74e958eb8..34d4c73df 100644 --- a/src/libfsm/capture_vm.h +++ b/src/libfsm/capture_vm.h @@ -13,7 +13,7 @@ #include #include -/* Interface the virtual machine used to resolve captures. +/* Interface for the virtual machine used to resolve captures. * These interfaces are exposed to libre but should not be * used directly. */ @@ -43,7 +43,6 @@ enum fsm_capvm_program_exec_res { FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND, FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC = -1, - /* FIXME: what other ways can this fail? */ }; /* Execute a capture program with the given input and populate diff --git a/src/libfsm/capture_vm_exec.c b/src/libfsm/capture_vm_exec.c index f2d0b522e..579a67fa4 100644 --- a/src/libfsm/capture_vm_exec.c +++ b/src/libfsm/capture_vm_exec.c @@ -36,7 +36,7 @@ * This value cannot be changed without reworking the data structures. */ #define PATH_LINK_BITS 32 -/* This enables extra debugging/testing output */ +/* This enables extra debugging/testing output in an easily scraped format */ #ifndef TESTING_OPTIONS #define TESTING_OPTIONS 0 #endif @@ -49,10 +49,12 @@ #define CAPVM_STATS (0 || TESTING_OPTIONS) #define CAPVM_PATH_STATS (0 && CAPVM_STATS) -/* This may no longer be necessary after further work on path handling. */ -#define ALLOW_TABLE_RESIZING 1 -#define ALLOW_PATH_TABLE_RESIZING (1 || ALLOW_TABLE_RESIZING) -#define ALLOW_THREAD_TABLE_RESIZING (0 || ALLOW_TABLE_RESIZING) +/* Allow the path table to grow on demand. + * In theory it should be possible to determine the worst case + * based on compile-time analysis and the input length; if an + * appropriately sized buffer was passed in capture resolution + * would not need dynamic allocation at all. */ +#define ALLOW_PATH_TABLE_RESIZING 1 /* Set to non-zero to trap runaway path table growth */ #define PATH_TABLE_CEIL_LIMIT 0 @@ -135,7 +137,8 @@ struct capvm { * be advanced next. The current stack is * run_stacks[PAIR_ID_CURRENT], run_stacks[PAIR_ID_NEXT] is the * stack for the next input position, and when the current stack - * is completed the next stack is copied over (and reversed). + * is completed the next stack is copied over (and reversed, so + * the greediest threads end up on top and resume first). * Same with run_stacks_h, the height for each stack, and the * other fields with [2] below. */ uint32_t *run_stacks[2]; @@ -143,9 +146,12 @@ struct capvm { /* Similarly, two columns of bits and two arrays of path_info * node IDs and uniq_ids for the execution at a particular - * opcode. */ + * opcode. + * + * evaluated bit array[]: Has the instruction n already been + * evaluated at the current input position? */ uint32_t *evaluated[2]; - uint32_t *path_info_heads[2]; + uint32_t *path_info_heads[2]; /* path for thread on instruction */ #if CAPVM_STATS uint32_t *uniq_ids[2]; #endif @@ -164,14 +170,19 @@ struct capvm { struct capvm_path_info { union { struct capvm_path_freelist_link { - uint16_t refcount; /* == 0 */ + uint16_t refcount; /* == 0: tag for freelist node */ uint32_t freelist; } freelist_node; struct capvm_path_info_link { - uint16_t refcount; /* > 0, sticky at UINT16_MAX? */ + /* refcount: When > 0 this is a path node. + * This could be sticky at UINT16_MAX, but in order + * to get there it would need a regex whose compiled + * program has well over 2**16 instructions that all + * share the same path info node. 
*/ + uint16_t refcount; uint8_t used; /* .bits used, <= PATH_LINK_BITS */ - uint32_t bits; - uint32_t offset; + uint32_t bits; /* buffer for this link's path bits */ + uint32_t offset; /* offset into the path bit array */ /* Linked list to earlier path nodes, with common * nodes shared until paths diverge. * @@ -667,8 +678,8 @@ cmp_paths(struct capvm *vm, uint32_t p_a, uint32_t p_b) "%s: backward loop: link_a %d (offset %u, prev %d), link_b %d (offset %u, prev %d)\n", __func__, link_a, offset_a, prev_a, link_b, offset_b, prev_b); - assert((offset_a & 31) == 0); /* multiple of 32 */ - assert((offset_b & 31) == 0); /* multiple of 32 */ + assert((offset_a & (PATH_LINK_BITS - 1)) == 0); /* multiple of 32 */ + assert((offset_b & (PATH_LINK_BITS - 1)) == 0); /* multiple of 32 */ if (offset_a > offset_b) { LOG(3 - LOG_CMP_PATHS, "%s: backward loop: a longer than b\n", __func__); set_path_node_backlink(vm, link_a, fwd_a); @@ -894,7 +905,7 @@ schedule_possible_next_step(struct capvm *vm, enum pair_id pair_id, const size_t h = *stack_h; for (size_t i = 0; i < h; i++) { if (stack[i] == op_id) { - stack[i] = NO_ID; + stack[i] = NO_ID; /* cancel thread */ vm->threads.live--; } } @@ -1093,8 +1104,8 @@ eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, case CAPVM_OP_ANCHOR: if (op->u.anchor == CAPVM_ANCHOR_START) { LOG(3, "%s: ^ anchor\n", __func__); - /* ignore a trailing newline, because PCRE does, - * even after a $ anchor. */ + /* ignore a single trailing newline, because PCRE does. + * For ^ this affects the capture position. */ if (input_pos == 0 && vm->input_len == 1 && vm->input[0] == '\n') { @@ -1113,7 +1124,7 @@ eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, LOG(3, "%s: $ anchor: input_len %u, input_pos %u\n", __func__, vm->input_len, input_pos); - /* ignore a trailing newline, because PCRE does */ + /* ignore a single trailing newline, because PCRE does */ if (vm->input_len > 0 && input_pos == vm->input_len - 1) { if (vm->input[input_pos] != '\n') { goto halt_thread; @@ -1140,7 +1151,6 @@ eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, check_path_table(vm); } - /* FIXME: Check the cleanup logic here. */ return; halt_thread: diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c index 15f5bbfa5..6b810b246 100644 --- a/src/libre/re_capvm_compile.c +++ b/src/libre/re_capvm_compile.c @@ -605,9 +605,11 @@ capvm_compile_iter(struct capvm_compile_env *env, pop_repeated_group_info(env, expr); } - /* FIXME: may need distinct error case to not leak. - * There is currently no test reaching this, try - * using the fuzzer to trigger it. */ + /* FIXME: May need distinct error case to not + * leak. There is currently no test reaching + * this and the fuzzer has not produced an input + * that reaches it -- unsatisfiability has probably + * already pruned subtrees that would get here. */ return true; } else if (active_count == 1) { /* even if one of the later subtrees is active, an earlier @@ -938,7 +940,9 @@ capvm_compile_iter(struct capvm_compile_env *env, if (max == AST_COUNT_UNBOUNDED) { /* A repeat of {x,inf} should be treated like - * (?:subtree){x} (?:subtree)* . */ + * (?:subtree){x} (?:subtree)* , where any numbered + * capture groups inside have the same group ID in + * both copies of the subtree. 
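+			 * For example (hypothetical, not from the test
+			 * suite): /(ab){2,}/ behaves like /(ab)(ab)(ab)*/
+			 * with all three groups sharing group ID 1, so on
+			 * "ababab" group 1 reports the final repetition
+			 * ("ab" at offset 4), as in PCRE.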
*/ if (!compile_kleene_star(env, p, expr)) { return false; } From 26483cf76567488d55f4f63a66443f53cdd14aa6 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 18 Jul 2023 14:21:35 -0400 Subject: [PATCH 24/51] re_capvm_compile: Change "active_node" categorization. Strictly speaking this shouldn't include nodes that have been flagged with `AST_FLAG_UNSATISFIABLE`. I have re-fuzzed with this changed and it does not seem to have introduced any new issues. `active_node` is only used in one place. --- src/libre/re_capvm_compile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c index 6b810b246..248cd0e29 100644 --- a/src/libre/re_capvm_compile.c +++ b/src/libre/re_capvm_compile.c @@ -334,7 +334,7 @@ active_node(const struct ast_expr *n) case AST_EXPR_TOMBSTONE: return false; default: - return true; + return !(n->flags & AST_FLAG_UNSATISFIABLE); } } From e9353afd02c0370231d7ac9499e84aa5fe925508 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 21 Jul 2023 11:29:59 -0400 Subject: [PATCH 25/51] capture_vm: rename split's .cont and .new to .greedy and .nongreedy. Rather than commenting throughout to note that .cont is the greedy branch and .new is non-greedy, just rename them. --- src/libfsm/capture_vm.c | 2 +- src/libfsm/capture_vm_exec.c | 48 ++++++++++---------- src/libfsm/capture_vm_program.h | 4 +- src/libre/re_capvm_compile.c | 62 +++++++++++++------------- tests/capture/capture_test_case_list.c | 14 +++--- 5 files changed, 64 insertions(+), 66 deletions(-) diff --git a/src/libfsm/capture_vm.c b/src/libfsm/capture_vm.c index 7dd1599cd..bc5073df9 100644 --- a/src/libfsm/capture_vm.c +++ b/src/libfsm/capture_vm.c @@ -153,7 +153,7 @@ fsm_capvm_program_dump(FILE *f, fprintf(f, "%zu: jmp_once %u\n", i, op->u.jmp_once); break; case CAPVM_OP_SPLIT: - fprintf(f, "%zu: split cont %u new %u\n", i, op->u.split.cont, op->u.split.new); + fprintf(f, "%zu: split greedy %u nongreedy %u\n", i, op->u.split.greedy, op->u.split.nongreedy); break; case CAPVM_OP_SAVE: fprintf(f, "%zu: save %u (cap %u, %s)\n", diff --git a/src/libfsm/capture_vm_exec.c b/src/libfsm/capture_vm_exec.c index 579a67fa4..5fc477ca8 100644 --- a/src/libfsm/capture_vm_exec.c +++ b/src/libfsm/capture_vm_exec.c @@ -1045,36 +1045,35 @@ eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, case CAPVM_OP_SPLIT: { - const uint32_t dst_cont = op->u.split.cont; - const uint32_t dst_new = op->u.split.new; + const uint32_t dst_greedy = op->u.split.greedy; + const uint32_t dst_nongreedy = op->u.split.nongreedy; /* destinations must be in range and not self-referential */ - assert(dst_cont < vm->p->used); - assert(dst_new < vm->p->used); - assert(dst_cont != op_id); - assert(dst_new != op_id); + assert(dst_greedy < vm->p->used); + assert(dst_nongreedy < vm->p->used); + assert(dst_greedy != op_id); + assert(dst_nongreedy != op_id); - uint32_t new_path_info_head; - if (!copy_path_info(vm, path_info_head, &new_path_info_head)) { + uint32_t nongreedy_path_info_head; + if (!copy_path_info(vm, path_info_head, &nongreedy_path_info_head)) { goto alloc_error; } - /* cont is the greedy branch */ if (!extend_path_info(vm, path_info_head, 1, uniq_id, &path_info_head)) { release_path_info_link(vm, &path_info_head); goto alloc_error; } - /* new is the non-greedy branch */ - if (!extend_path_info(vm, new_path_info_head, 0, uniq_id, &new_path_info_head)) { + /* nongreedy is the non-greedy branch */ + if (!extend_path_info(vm, nongreedy_path_info_head, 0, uniq_id, 
&nongreedy_path_info_head)) { release_path_info_link(vm, &path_info_head); goto alloc_error; } #if CAPVM_STATS - const uint32_t new_uniq_id = ++vm->uniq_id_counter; + const uint32_t nongreedy_uniq_id = ++vm->uniq_id_counter; #else - const uint32_t new_uniq_id = 0; + const uint32_t nongreedy_uniq_id = 0; #endif vm->threads.live++; @@ -1082,15 +1081,14 @@ eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, set_max_threads_live(vm, vm->threads.live); } - /* Push the split.new destination, and then the - * split.cont destination on top of it, so that the - * greedier .cont branch will be fully evaluated - * first. */ - schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, dst_new, - new_path_info_head, new_uniq_id); - schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, dst_cont, + /* Push the split.nongreedy destination, and then the + * split.greedy destination on top of it, so that the + * greedier branch will be fully evaluated first. */ + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, dst_nongreedy, + nongreedy_path_info_head, nongreedy_uniq_id); + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, dst_greedy, path_info_head, uniq_id); - LOG_EXEC_SPLIT(uniq_id, new_uniq_id); + LOG_EXEC_SPLIT(uniq_id, nongreedy_uniq_id); break; } @@ -1792,8 +1790,8 @@ populate_solution(struct capvm *vm) const uint32_t offset = get_path_node_offset(vm, cur); const struct capvm_path_info *pi = &vm->paths.pool[cur]; - const uint32_t dst_cont = op->u.split.cont; - const uint32_t dst_new = op->u.split.new; + const uint32_t dst_greedy = op->u.split.greedy; + const uint32_t dst_nongreedy = op->u.split.nongreedy; assert(IS_PATH_NODE(pi)); bool next_bit; @@ -1811,9 +1809,9 @@ populate_solution(struct capvm *vm) LOG(3, "split: next_bit %d\n", next_bit); LOG_EXEC_PATH_SAVE_CAPTURES(uniq_id, next_bit); if (next_bit) { /* greedy edge */ - prog_i = dst_cont; + prog_i = dst_greedy; } else { /* non-greedy edge */ - prog_i = dst_new; + prog_i = dst_nongreedy; } split_i++; if (split_i >= offset && diff --git a/src/libfsm/capture_vm_program.h b/src/libfsm/capture_vm_program.h index 15e585761..0b24ffb5b 100644 --- a/src/libfsm/capture_vm_program.h +++ b/src/libfsm/capture_vm_program.h @@ -45,8 +45,8 @@ struct capvm_program { uint32_t jmp; /* absolute */ uint32_t jmp_once; /* absolute */ struct { - uint32_t cont; /* greedy branch */ - uint32_t new; /* non-greedy branch */ + uint32_t greedy; + uint32_t nongreedy; } split; /* (save >> 1): capture ID, * (save & 0x01): save pos to start (0b0) or end (0b1). */ diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c index 248cd0e29..8a4c25562 100644 --- a/src/libre/re_capvm_compile.c +++ b/src/libre/re_capvm_compile.c @@ -144,10 +144,10 @@ check_program_for_invalid_labels(const struct capvm_program *p) assert(op->u.jmp_once != op_i); break; case CAPVM_OP_SPLIT: - assert(op->u.split.cont < p->used); - assert(op->u.split.cont != op_i); - assert(op->u.split.new < p->used); - assert(op->u.split.new != op_i); + assert(op->u.split.greedy < p->used); + assert(op->u.split.greedy != op_i); + assert(op->u.split.nongreedy < p->used); + assert(op->u.split.nongreedy != op_i); break; case CAPVM_OP_CHAR: @@ -724,11 +724,11 @@ capvm_compile_iter(struct capvm_compile_env *env, op_split_before->t = CAPVM_OP_SPLIT; /* greedier branch: trying the next case, in order */ - op_split_before->u.split.cont = get_program_offset(p); + op_split_before->u.split.greedy = get_program_offset(p); /* less greedy branch: moving on to the next case. 
* will backpatch .new to after this case's JMP later */ - op_split_before->u.split.new = PENDING_OFFSET_ALT_BACKPATCH_NEW; + op_split_before->u.split.nongreedy = PENDING_OFFSET_ALT_BACKPATCH_NEW; const struct ast_expr *n = expr->u.alt.n[c_i]; LOG(3, "%s: %p recursing...\n", __func__, (void *)expr); @@ -764,8 +764,8 @@ capvm_compile_iter(struct capvm_compile_env *env, flow_info[c_i].backpatch = pos_split_after; struct capvm_opcode *op_split_after = &p->ops[pos_split_after]; op_split_after->t = CAPVM_OP_SPLIT; - op_split_after->u.split.cont = PENDING_OFFSET_ALT_BACKPATCH_JMP; - op_split_after->u.split.new = PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS; + op_split_after->u.split.greedy = PENDING_OFFSET_ALT_BACKPATCH_JMP; + op_split_after->u.split.nongreedy = PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS; } else { const uint32_t pos_jmp_after = reserve_program_opcode(p); flow_info[c_i].backpatch = pos_jmp_after; @@ -779,7 +779,7 @@ capvm_compile_iter(struct capvm_compile_env *env, /* and the original split jumps to after * this case's JMP */ - op_split_before->u.split.new = get_program_offset(p); + op_split_before->u.split.nongreedy = get_program_offset(p); } } @@ -802,10 +802,10 @@ capvm_compile_iter(struct capvm_compile_env *env, assert(op_patch->u.jmp == PENDING_OFFSET_ALT_BACKPATCH_JMP); op_patch->u.jmp = pos_after_all; } else if (op_patch->t == CAPVM_OP_SPLIT) { - assert(op_patch->u.split.cont == PENDING_OFFSET_ALT_BACKPATCH_JMP); - op_patch->u.split.cont = pos_after_all; + assert(op_patch->u.split.greedy == PENDING_OFFSET_ALT_BACKPATCH_JMP); + op_patch->u.split.greedy = pos_after_all; /* This will be patched by an ancestor repeat node after returning. */ - assert(op_patch->u.split.cont == PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS); + assert(op_patch->u.split.greedy == PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS); } else { assert(!"type mismatch"); } @@ -875,8 +875,8 @@ capvm_compile_iter(struct capvm_compile_env *env, struct capvm_opcode *op_split = &p->ops[pos_split]; op_split->t = CAPVM_OP_SPLIT; - op_split->u.split.cont = pos_l1; - op_split->u.split.new = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + op_split->u.split.greedy = pos_l1; + op_split->u.split.nongreedy = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; if (!capvm_compile_iter(env, p, e)) { return false; } @@ -886,7 +886,7 @@ capvm_compile_iter(struct capvm_compile_env *env, op_split = &p->ops[pos_split]; /* refresh pointer */ const uint32_t after_expr = get_program_offset(p); - op_split->u.split.new = after_expr; + op_split->u.split.nongreedy = after_expr; } else if (min == 0 && max == AST_COUNT_UNBOUNDED) { /* * */ if (!compile_kleene_star(env, p, expr)) { return false; @@ -917,8 +917,8 @@ capvm_compile_iter(struct capvm_compile_env *env, struct capvm_opcode *op_split = &p->ops[pos_split]; op_split->t = CAPVM_OP_SPLIT; - op_split->u.split.cont = pos_l1; - op_split->u.split.new = pos_l2; + op_split->u.split.greedy = pos_l1; + op_split->u.split.nongreedy = pos_l2; } } else if (min == 0 && max == 0) { /* {0,0} */ /* ignored, except any groups contained within that could match @@ -963,8 +963,8 @@ capvm_compile_iter(struct capvm_compile_env *env, struct capvm_opcode *op_split = &p->ops[pos_split]; op_split->t = CAPVM_OP_SPLIT; - op_split->u.split.cont = pos_l1; - op_split->u.split.new = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + op_split->u.split.greedy = pos_l1; + op_split->u.split.nongreedy = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; if (!capvm_compile_iter(env, p, e)) { return false; } @@ -974,7 +974,7 @@ capvm_compile_iter(struct 
capvm_compile_env *env, op_split = &p->ops[pos_split]; /* refresh pointer */ const uint32_t after_expr = get_program_offset(p); - op_split->u.split.new = after_expr; + op_split->u.split.nongreedy = after_expr; } } } @@ -1171,8 +1171,8 @@ compile_kleene_star(struct capvm_compile_env *env, struct capvm_opcode *op_split = &p->ops[pos_l1]; op_split->t = CAPVM_OP_SPLIT; - op_split->u.split.cont = PENDING_OFFSET_REPEAT_OPTIONAL_CONT; - op_split->u.split.new = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + op_split->u.split.greedy = PENDING_OFFSET_REPEAT_OPTIONAL_CONT; + op_split->u.split.nongreedy = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; if (!capvm_compile_iter(env, p, expr->u.repeat.e)) { return false; } @@ -1197,8 +1197,8 @@ compile_kleene_star(struct capvm_compile_env *env, const uint32_t pos_l3 = get_program_offset(p); op_split = &p->ops[pos_l1]; /* refresh pointer */ - op_split->u.split.cont = pos_l2; - op_split->u.split.new = pos_l3; + op_split->u.split.greedy = pos_l2; + op_split->u.split.nongreedy = pos_l3; return true; } @@ -1233,8 +1233,8 @@ emit_repeated_groups(struct capvm_compile_env *env, struct capvm_program *p) struct capvm_opcode *op_split = &p->ops[pos_split]; op_split->t = CAPVM_OP_SPLIT; - op_split->u.split.cont = pos_l1; - op_split->u.split.new = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + op_split->u.split.greedy = pos_l1; + op_split->u.split.nongreedy = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; if (group->flags & AST_FLAG_ANCHORED_START) { const uint32_t pos_start = reserve_program_opcode(p); @@ -1263,7 +1263,7 @@ emit_repeated_groups(struct capvm_compile_env *env, struct capvm_program *p) const uint32_t after_expr = get_program_offset(p); op_split = &p->ops[pos_split]; /* refresh pointer */ - op_split->u.split.new = after_expr; + op_split->u.split.nongreedy = after_expr; } else { /* simple case, emit SAVE pair */ if (!ensure_program_capacity(env->alloc, p, 2)) { @@ -1404,8 +1404,8 @@ capvm_compile(struct capvm_compile_env *env, const uint32_t l3 = get_program_offset(p); op_split->t = CAPVM_OP_SPLIT; - op_split->u.split.cont = l3; /* greedy */ - op_split->u.split.new = l2; /* non-greedy */ + op_split->u.split.greedy = l3; + op_split->u.split.nongreedy = l2; op_cc->t = CAPVM_OP_CHARCLASS; uint64_t any[4]; @@ -1447,8 +1447,8 @@ capvm_compile(struct capvm_compile_env *env, struct capvm_opcode *op_jmp = &p->ops[l_jmp]; op_split->t = CAPVM_OP_SPLIT; - op_split->u.split.cont = l3; /* greedy */ - op_split->u.split.new = l2; /* non-greedy */ + op_split->u.split.greedy = l3; + op_split->u.split.nongreedy = l2; op_any->t = CAPVM_OP_CHARCLASS; uint64_t any[4]; diff --git a/tests/capture/capture_test_case_list.c b/tests/capture/capture_test_case_list.c index d05210d2c..6b72d1018 100644 --- a/tests/capture/capture_test_case_list.c +++ b/tests/capture/capture_test_case_list.c @@ -1615,25 +1615,25 @@ static struct captest_case_program program_cases[] = { }, .ops = { - { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 3, .new = 1 }}, + { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 3, .nongreedy = 1 }}, { .t = CAPVM_OP_CHARCLASS, .u.charclass_id = 0 }, { .t = CAPVM_OP_JMP, .u.jmp = 0 }, { .t = CAPVM_OP_SAVE, .u.save = 0 }, - { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 5, .new = 7 }}, + { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 5, .nongreedy = 7 }}, { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_START }, { .t = CAPVM_OP_JMP, .u.jmp = 9 }, /* jump after |() */ { .t = CAPVM_OP_SAVE, .u.save = 4 }, { .t = CAPVM_OP_SAVE, .u.save = 5 }, - { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 4, .new = 10 }}, + { .t = 
CAPVM_OP_SPLIT, .u.split = { .greedy = 4, .nongreedy = 10 }}, { .t = CAPVM_OP_SAVE, .u.save = 2 }, { .t = CAPVM_OP_SAVE, .u.save = 3 }, { .t = CAPVM_OP_SAVE, .u.save = 6 }, { .t = CAPVM_OP_SAVE, .u.save = 7 }, { .t = CAPVM_OP_SAVE, .u.save = 1 }, - { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 18, .new = 16 }}, + { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 18, .nongreedy = 16 }}, { .t = CAPVM_OP_CHARCLASS, .u.charclass_id = 0 }, { .t = CAPVM_OP_JMP, .u.jmp = 15 }, { .t = CAPVM_OP_MATCH }, @@ -1656,15 +1656,15 @@ static struct captest_case_program program_cases[] = { [0] = { .t = CAPVM_OP_SAVE, .u.save = 0 }, [1] = { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_START }, [2] = { .t = CAPVM_OP_SAVE, .u.save = 2 }, - [3] = { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 4, .new = 6 }}, + [3] = { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 4, .nongreedy = 6 }}, [4] = { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_END }, /* [5] = { .t = CAPVM_OP_JMP, .u.jmp = 7 }, */ - [5] = { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 7, .new = 9 }}, + [5] = { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 7, .nongreedy = 9 }}, [6] = { .t = CAPVM_OP_CHAR, .u.chr = 'x' }, [7] = { .t = CAPVM_OP_SAVE, .u.save = 3 }, - [8] = { .t = CAPVM_OP_SPLIT, .u.split = { .cont = 2, .new = 9 }}, + [8] = { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 2, .nongreedy = 9 }}, [9] = { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_END }, [10] = { .t = CAPVM_OP_SAVE, .u.save = 1 }, [11] = { .t = CAPVM_OP_MATCH }, From 1e552204934797be29c35b8d00bcc2c7b9542e18 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 21 Jul 2023 12:02:00 -0400 Subject: [PATCH 26/51] capture_vm: Add error state for step limit reached. --- src/libfsm/capture_vm.h | 2 +- src/libfsm/capture_vm_exec.c | 31 +++++++++++++++---------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/libfsm/capture_vm.h b/src/libfsm/capture_vm.h index 34d4c73df..02c198dab 100644 --- a/src/libfsm/capture_vm.h +++ b/src/libfsm/capture_vm.h @@ -41,7 +41,7 @@ fsm_capvm_program_dump(FILE *f, enum fsm_capvm_program_exec_res { FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN, FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND, - + FSM_CAPVM_PROGRAM_EXEC_STEP_LIMIT_REACHED, FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC = -1, }; diff --git a/src/libfsm/capture_vm_exec.c b/src/libfsm/capture_vm_exec.c index 5fc477ca8..9d4be066a 100644 --- a/src/libfsm/capture_vm_exec.c +++ b/src/libfsm/capture_vm_exec.c @@ -926,33 +926,29 @@ schedule_possible_next_step(struct capvm *vm, enum pair_id pair_id, } } -static void +/* returns whether the vm should continue. */ +static bool eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, uint32_t path_info_head, uint32_t uniq_id, uint32_t op_id) { LOG(5, "%s: input_pos %u, input_len %u, op_id %u, threads_live %u\n", __func__, input_pos, vm->input_len, op_id, vm->threads.live); + assert(op_id < vm->p->used); + if (vm->stats.steps == vm->step_limit) { LOG(1, "%s: halting, steps == step_limit %zu\n", __func__, vm->step_limit); - return; + vm->res = FSM_CAPVM_PROGRAM_EXEC_STEP_LIMIT_REACHED; + return false; } - - assert(op_id < vm->p->used); + vm->stats.steps++; const struct capvm_opcode *op = &vm->p->ops[op_id]; - LOG(2, "%s: op_id[%u]: input_pos %u, path_info_head %u, uniq_id %u, op %s\n", __func__, op_id, input_pos, path_info_head, uniq_id, op_name[op->t]); LOG_EXEC_OP(uniq_id, input_pos, op_id, op_name[op->t]); - vm->stats.steps++; - if (vm->stats.steps == vm->step_limit) { - /* TODO: Set some sort of STEP_LIMIT_REACHED error. 
*/ - return; - } - switch (op->t) { case CAPVM_OP_CHAR: if (input_pos == vm->input_len) { @@ -1142,14 +1138,14 @@ eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, default: assert(!"unreachable"); - return; + return false; } if (EXPENSIVE_CHECKS) { /* postcondition */ check_path_table(vm); } - return; + return true; halt_thread: /* do not push further execution on the run stack */ @@ -1158,11 +1154,12 @@ eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, release_path_info_link(vm, &path_info_head); assert(vm->threads.live > 0); vm->threads.live--; - return; + return true; alloc_error: release_path_info_link(vm, &path_info_head); vm->res = FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC; + return false; } static void @@ -1351,7 +1348,9 @@ eval_vm(struct capvm *vm) #else const uint32_t uniq_id = 0; #endif - eval_vm_advance_greediest(vm, i_i, path_info_head, uniq_id, op_id); + if (!eval_vm_advance_greediest(vm, i_i, path_info_head, uniq_id, op_id)) { + return false; + } } @@ -2054,7 +2053,7 @@ fsm_capvm_program_exec(const struct capvm_program *program, * flag, which would skip this assertion. */ assert(vm.paths.live == 0); } else { - assert(vm.res == FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND); + assert(vm.res != FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN); } TIME(&post); From 937c9f0bce0378d47fcc6eb655fd928f8d40b7ef Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 24 Jul 2023 17:14:15 -0400 Subject: [PATCH 27/51] ast_analysis: Remove variables that are no longer used. --- src/libre/ast_analysis.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/libre/ast_analysis.c b/src/libre/ast_analysis.c index 548d9f60c..df9e8ce54 100644 --- a/src/libre/ast_analysis.c +++ b/src/libre/ast_analysis.c @@ -1048,8 +1048,6 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) /* flow ANCHORED_START and ANCHORED_END flags upward */ { - int after_always_consumes = 0; - for (i = 0; i < n->u.concat.count; i++) { struct ast_expr *child = n->u.concat.n[i]; if (child->flags & AST_FLAG_ANCHORED_START) { @@ -1062,7 +1060,6 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) if (always_consumes_input(child)) { LOG(3 - LOG_ANCHORING, "%s: child %zd always consumes input\n", __func__, i); - after_always_consumes = 1; } } } @@ -1525,7 +1522,6 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) case AST_EXPR_ALT: { int any_sat = 0; int all_set_followed_by_consuming = 1; - int all_set_followed_by_consuming_newline = 1; int any_set_followed_by_consuming_newline = 0; int all_set_before_start_anchor = 1; @@ -1554,7 +1550,6 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) all_set_before_start_anchor &= child_env.before_start_anchor; any_set_followed_by_consuming_newline |= child_env.followed_by_consuming_newline; - all_set_followed_by_consuming_newline &= child_env.followed_by_consuming_newline; any_sat = 1; } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE From 5e54a8bb0af2a75826edecb25b6930798c94c261 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 24 Jul 2023 17:20:07 -0400 Subject: [PATCH 28/51] src/adt/stateset.c: Reference `adt/common.h`'s `EXPENSIVE_CHECKS`. 
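The point of sharing one knob is that the expensive invariant checks
toggle together across the whole tree, instead of via a stale
file-local #define. As a minimal sketch of the idiom (assuming
`adt/common.h` defines `EXPENSIVE_CHECKS` to 0 or 1), a postcondition
for this file might look like:

    #include "adt/common.h"

    /* O(n) postcondition: the backing array stays sorted and
     * duplicate-free. With a plain `if`, the compiler discards the
     * loop when EXPENSIVE_CHECKS is 0, but unlike `#if` the checked
     * code must still compile in release builds. */
    if (EXPENSIVE_CHECKS) {
        for (size_t i = 1; i < set->i; i++) {
            assert(set->a[i - 1] < set->a[i]);
        }
    }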
--- src/adt/stateset.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/adt/stateset.c b/src/adt/stateset.c index ab3542161..65bbf173a 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -15,10 +15,9 @@ #include #include -/* FIXME: This should use the same define as currently exists in - * src/fsm/internal.h. This is used here because the calls to +/* This is used here because the calls to * state_set_contains change the order of growth. */ -#define EXPENSIVE_CHECKS 0 +#include /* From 94d6e7f97b182202179dbb242ccb0f935319bff8 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 25 Jul 2023 09:55:39 -0400 Subject: [PATCH 29/51] fuzz/target.c: Use default mode for the empty string. --- fuzz/target.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fuzz/target.c b/fuzz/target.c index e7a981b02..736c2d889 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -1615,6 +1615,7 @@ get_run_mode(void) } switch (mode[0]) { + case '\0': return MODE_REGEX; /* default */ case 'r': return MODE_REGEX; case 's': return MODE_REGEX_SINGLE_ONLY; case 'm': return MODE_REGEX_MULTI_ONLY; From dced7c77ef3cc83d58d6743792052c79b7536fce Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 25 Jul 2023 09:55:52 -0400 Subject: [PATCH 30/51] re_capvm_compile.c: Avoid potential sign-extension bug. `expr->u.literal.c` is signed, so when negative casting it to `uint64_t` can lead to a 64-bit value > 255. Clamp it to 8 bits. --- src/libre/re_capvm_compile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c index 8a4c25562..06c48fc28 100644 --- a/src/libre/re_capvm_compile.c +++ b/src/libre/re_capvm_compile.c @@ -362,7 +362,7 @@ subtree_represents_character_class(const struct ast_expr *expr, uint64_t cc[4]) || t->type != AST_ENDPOINT_LITERAL) { return false; } - for (uint64_t c = (uint64_t)f->u.literal.c; c <= (uint64_t)t->u.literal.c; c++) { + for (uint64_t c = (uint8_t)f->u.literal.c; c <= (uint8_t)t->u.literal.c; c++) { u64bitset_set(cc, (uint8_t)c); } return true; @@ -404,7 +404,7 @@ subtree_represents_character_class(const struct ast_expr *expr, uint64_t cc[4]) } for (uint64_t i = 0; i < 256; i++) { - if (i >= (uint64_t)f->u.literal.c && i <= (uint64_t)f->u.literal.c) { + if (i >= (uint8_t)f->u.literal.c && i <= (uint8_t)f->u.literal.c) { u64bitset_set(cc, i); } } @@ -827,7 +827,7 @@ capvm_compile_iter(struct capvm_compile_env *env, if (expr->re_flags & RE_ICASE) { uint64_t cc[4] = { 0 }; - u64bitset_set(cc, (uint64_t)expr->u.literal.c); + u64bitset_set(cc, (uint8_t)expr->u.literal.c); op->t = CAPVM_OP_CHARCLASS; make_charclass_case_insensitive(cc); From 8f0a683758ef5ebeca2f4fc56c9ec697a12e207e Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 25 Jul 2023 10:19:26 -0400 Subject: [PATCH 31/51] Do non-zero allocations to silence EFENCE. EFENCE breaks CI builds that (correctly) do a zero-size allocation when calling `calloc` with a count of zero, reporting that it's a likely bug. Use a size of 1 instead of 0 to silence it. 
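Distilled, the workaround is just a clamped allocation, sketched here
as a hypothetical helper (the patch inlines the ternary at each call
site instead):

    /* EFENCE reports calloc with a count of 0 as a likely bug, so
     * round the count up to 1. The spare element is never read;
     * callers keep using the real count. */
    static void *
    calloc_nonzero(const struct fsm_alloc *alloc, size_t count, size_t size)
    {
        return f_calloc(alloc, count == 0 ? 1 : count, size);
    }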
--- src/libfsm/capture_vm.c | 4 +++- src/libre/ast_compile.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/libfsm/capture_vm.c b/src/libfsm/capture_vm.c index bc5073df9..e6a1f0539 100644 --- a/src/libfsm/capture_vm.c +++ b/src/libfsm/capture_vm.c @@ -77,7 +77,9 @@ capvm_program_copy(const struct fsm_alloc *alloc, ops = f_calloc(alloc, src->used, sizeof(ops[0])); if (ops == NULL) { goto cleanup; } - sets = f_calloc(alloc, src->char_classes.count, + sets = f_calloc(alloc, + /* do non-zero allocation to silence EFENCE */ + src->char_classes.count == 0 ? 1 : src->char_classes.count, sizeof(src->char_classes.sets[0])); if (sets == NULL) { goto cleanup; } diff --git a/src/libre/ast_compile.c b/src/libre/ast_compile.c index d13f742aa..b376aa144 100644 --- a/src/libre/ast_compile.c +++ b/src/libre/ast_compile.c @@ -765,7 +765,7 @@ comp_iter(struct comp_env *env, { const size_t capture_id_words = (env->max_capture_id == AST_NO_MAX_CAPTURE_ID) - ? 0 + ? 1 /* do non-zero allocation to silence EFENCE */ : ((env->max_capture_id)/64 + 1); active_capture_ids = f_calloc(env->alloc, capture_id_words, sizeof(active_capture_ids[0])); From 722260ad34d28d13e692879e71b1f93f1b580a94 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 25 Jul 2023 13:02:58 -0400 Subject: [PATCH 32/51] re_capvm_compile: subtree_represents_character_class: handle empty b. The comment says this handles a .b that is EMPTY, but the recursive call rejects it. Something in PCRE suite in CI is triggering this. --- src/libre/re_capvm_compile.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c index 06c48fc28..068c00d32 100644 --- a/src/libre/re_capvm_compile.c +++ b/src/libre/re_capvm_compile.c @@ -414,7 +414,11 @@ subtree_represents_character_class(const struct ast_expr *expr, uint64_t cc[4]) } uint64_t neg_cc[4]; - if (subtree_represents_character_class(expr->u.subtract.b, neg_cc)) { + if (expr->u.subtract.b->type == AST_EXPR_EMPTY) { + for (size_t cc_i = 0; cc_i < 4; cc_i++) { + neg_cc[cc_i] = (uint64_t)0; + } + } else if (subtree_represents_character_class(expr->u.subtract.b, neg_cc)) { for (size_t cc_i = 0; cc_i < 4; cc_i++) { cc[cc_i] &=~ neg_cc[cc_i]; } From 75bb88d95bc25117d79ff7cc349bce5aa94b9999 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Aug 2023 16:52:35 -0400 Subject: [PATCH 33/51] state_set: Improve `state_set_search` performance, correct result. The description says "Return where an item would be, if it were inserted", but it was returning the last element <= rather than the first element >=, then the call to `state_set_cmpval` later was shifting i by 1 for that specific case. Handle it correctly inside the search function instead. Two other all call sites need to check whether the result refers to the append position (one past the end of the array) before checking `set->a[i] == state`, update them. Add a fast path upfront: It's VERY common to append states in order to the state array, so before we binary search each first compare against the last entry (unless empty). --- src/adt/stateset.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/adt/stateset.c b/src/adt/stateset.c index c1cff9933..4d4de9438 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -138,7 +138,8 @@ state_set_cmp(const struct state_set *a, const struct state_set *b) } /* - * Return where an item would be, if it were inserted + * Return where an item would be, if it were inserted. 
+ * When insertion would append this returns one past the array. */ static size_t state_set_search(const struct state_set *set, fsm_state_t state) @@ -150,6 +151,11 @@ state_set_search(const struct state_set *set, fsm_state_t state) assert(!IS_SINGLETON(set)); assert(set->a != NULL); + /* fast path: append case */ + if (set->i > 0 && state > set->a[set->i - 1]) { + return set->i; + } + start = mid = 0; end = set->i; @@ -161,6 +167,12 @@ state_set_search(const struct state_set *set, fsm_state_t state) end = mid; } else if (r > 0) { start = mid + 1; + /* update mid if we're about to halt, because + * we're looking for the first position >= state, + * not the last position <= */ + if (start == end) { + mid = start; + } } else { return mid; } @@ -242,7 +254,7 @@ state_set_add(struct state_set **setp, const struct fsm_alloc *alloc, */ if (!state_set_empty(set)) { i = state_set_search(set, state); - if (set->a[i] == state) { + if (i < set->i && set->a[i] == state) { return 1; } } @@ -261,9 +273,6 @@ state_set_add(struct state_set **setp, const struct fsm_alloc *alloc, set->n *= 2; } - if (state_set_cmpval(state, set->a[i]) > 0) { - i++; - } if (i <= set->i) { memmove(&set->a[i + 1], &set->a[i], (set->i - i) * (sizeof *set->a)); @@ -470,7 +479,7 @@ state_set_remove(struct state_set **setp, fsm_state_t state) } i = state_set_search(set, state); - if (set->a[i] == state) { + if (i < set->i && set->a[i] == state) { if (i < set->i) { memmove(&set->a[i], &set->a[i + 1], (set->i - i - 1) * (sizeof *set->a)); } @@ -524,7 +533,7 @@ state_set_contains(const struct state_set *set, fsm_state_t state) } i = state_set_search(set, state); - if (set->a[i] == state) { + if (i < set->i && set->a[i] == state) { return 1; } From 709b8cc11c5f425e6962386cd43ad36ae6eb2d68 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Aug 2023 16:57:13 -0400 Subject: [PATCH 34/51] stateset: Avoid memmove of size 0. --- src/adt/stateset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/adt/stateset.c b/src/adt/stateset.c index 4d4de9438..645424555 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -273,8 +273,7 @@ state_set_add(struct state_set **setp, const struct fsm_alloc *alloc, set->n *= 2; } - - if (i <= set->i) { + if (i < set->i) { memmove(&set->a[i + 1], &set->a[i], (set->i - i) * (sizeof *set->a)); } From cead0d99915faad077e3e22dd09761a61556c8e0 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Aug 2023 17:09:01 -0400 Subject: [PATCH 35/51] stateset: Add note about potentially expensive assertion. In -O0 this can become pretty expensive (~25% of overall runtime for `time ./re -rpcre -C '^[ab]{0,2000}$'`), but when built with -O3 very little overhead remains. I'm adding this comment because every time I see this it seems to me like it should have `EXPENSIVE_CHECKS` around it, but profiling is telling me it really doesn't matter. --- src/adt/stateset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/adt/stateset.c b/src/adt/stateset.c index 645424555..7839234d2 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -284,6 +284,8 @@ state_set_add(struct state_set **setp, const struct fsm_alloc *alloc, set->i = 1; } + /* This assert can be pretty expensive in -O0 but in -O3 it has very + * little impact on the overall runtime. */ assert(state_set_contains(set, state)); return 1; From cbfeddd9c54178fe34aae0606a619e9592c6dd69 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Aug 2023 17:11:07 -0400 Subject: [PATCH 36/51] stateset: Comment struct fields. 
--- src/adt/stateset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/adt/stateset.c b/src/adt/stateset.c index 7839234d2..8d73a9b5f 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -44,8 +44,8 @@ struct state_set { const struct fsm_alloc *alloc; fsm_state_t *a; - size_t i; - size_t n; + size_t i; /* used */ + size_t n; /* ceil */ }; int From c3dab77fc7785e3ec4df9e4fb60c6c760ca66b0c Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Aug 2023 17:12:04 -0400 Subject: [PATCH 37/51] edgeset: Fix indentation for `#if`'d block. --- src/adt/edgeset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/adt/edgeset.c b/src/adt/edgeset.c index c718727ca..f1a9e9f31 100644 --- a/src/adt/edgeset.c +++ b/src/adt/edgeset.c @@ -223,9 +223,9 @@ edge_set_add_bulk(struct edge_set **pset, const struct fsm_alloc *alloc, assert(set->count <= set->ceil); #if LOG_BITSET - fprintf(stderr, " -- edge_set_add: symbols [0x%lx, 0x%lx, 0x%lx, 0x%lx] -> state %d on %p\n", - symbols[0], symbols[1], symbols[2], symbols[3], - state, (void *)set); + fprintf(stderr, " -- edge_set_add: symbols [0x%lx, 0x%lx, 0x%lx, 0x%lx] -> state %d on %p\n", + symbols[0], symbols[1], symbols[2], symbols[3], + state, (void *)set); #endif /* Linear search for a group with the same destination From 1ada07cce13fd2982056963e5b1e2bae59b235e9 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Aug 2023 17:12:31 -0400 Subject: [PATCH 38/51] edgeset: Switch from linear to binary searching in edge_set_add_bulk. This is a major hotspot when doing epsilon removal over large runs of potentially skipped states (as might appear from `^[ab]{0,2000}$`). Add a fast path for appending, which is also very common. Extract the edge set destination search into its own function, `find_state_position`, and add a `#define` to switch between linear search, binary search, or calling both and comparing the result. I will remove linear search in the next commit, but am checking this in as an intermediate step for checking & benchmarking. --- src/adt/edgeset.c | 182 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 165 insertions(+), 17 deletions(-) diff --git a/src/adt/edgeset.c b/src/adt/edgeset.c index f1a9e9f31..818b78493 100644 --- a/src/adt/edgeset.c +++ b/src/adt/edgeset.c @@ -11,6 +11,7 @@ #include #define LOG_BITSET 0 +#define LOG_BSEARCH 0 #include "libfsm/internal.h" /* XXX: for allocating struct fsm_edge, and the edges array */ @@ -184,6 +185,157 @@ edge_set_advise_growth(struct edge_set **pset, const struct fsm_alloc *alloc, return 1; } +enum fsp_res { + FSP_FOUND_INSERT_POSITION, + FSP_FOUND_VALUE_PRESENT, +}; + +/* Use binary search to find the first position N where set->groups[N].to >= state, + * which includes the position immediately following the last entry. Return an enum + * which indicates whether state is already present. 
*/ +static enum fsp_res +find_state_position_bsearch(const struct edge_set *set, fsm_state_t state, size_t *dst) +{ + size_t lo = 0, hi = set->count; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: looking for %d in %p (count %zu)\n", + __func__, state, (void *)set, set->count); + } + +#if EXPENSIVE_CHECKS + /* invariant: input is unique and sorted */ + for (size_t i = 1; i < set->count; i++) { + assert(set->groups[i - 1].to < set->groups[i].to); + } +#endif + + if (set->count == 0) { + if (LOG_BSEARCH) { + fprintf(stderr, "%s: empty, returning 0\n", __func__); + } + *dst = 0; + return FSP_FOUND_INSERT_POSITION; + } else { + if (LOG_BSEARCH) { + fprintf(stderr, "%s: fast path: looking for %d, set->groups[last].to %d\n", + __func__, state, set->groups[hi - 1].to); + } + + /* Check the last entry so we can append in constant time. */ + const fsm_state_t last = set->groups[hi - 1].to; + if (state > last) { + *dst = hi; + return FSP_FOUND_INSERT_POSITION; + } else if (state == last) { + *dst = hi - 1; + return FSP_FOUND_VALUE_PRESENT; + } + } + + size_t mid; + while (lo < hi) { /* lo <= mid < hi */ + mid = lo + (hi - lo)/2; /* avoid overflow */ + const struct edge_group *eg = &set->groups[mid]; + const fsm_state_t cur = eg->to; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: lo %zu, hi %zu, mid %zu, cur %d, looking for %d\n", + __func__, lo, hi, mid, cur, state); + } + + if (state == cur) { + *dst = mid; + return FSP_FOUND_VALUE_PRESENT; + } else if (state > cur) { + lo = mid + 1; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: new lo %zd\n", __func__, lo); + } + + /* Update mid if we're about to halt, because we're looking + * for the first position >= state, not the last position <=. */ + if (lo == hi) { + mid = lo; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: special case, updating mid to %zd\n", __func__, mid); + } + } + } else if (state < cur) { + hi = mid; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: new hi %zd\n", __func__, hi); + } + } + } + + if (LOG_BSEARCH) { + fprintf(stderr, "%s: halting at %zd (looking for %d, cur %d)\n", + __func__, mid, state, set->groups[mid].to); + } + + /* dst is now the first position > state (== case is handled above), + * which may be one past the end of the array. */ + assert(mid == set->count || set->groups[mid].to > state); + *dst = mid; + return FSP_FOUND_INSERT_POSITION; +} + +static enum fsp_res +find_state_position_linear(const struct edge_set *set, fsm_state_t state, size_t *dst) +{ + /* Linear search for a group with the same destination + * state, or the position where that group would go. */ + size_t i; + for (i = 0; i < set->count; i++) { + const struct edge_group *eg = &set->groups[i]; + if (eg->to == state) { + *dst = i; + return FSP_FOUND_VALUE_PRESENT; + } else if (eg->to > state) { + break; /* will shift down and insert below */ + } else { + continue; + } + } + + *dst = i; + return FSP_FOUND_INSERT_POSITION; +} + +/* Find the state in the edge set, or where it would be inserted if not present. 
*/ +static enum fsp_res +find_state_position(const struct edge_set *set, fsm_state_t state, size_t *dst) +{ + /* 0: linear, 1: bsearch, -1: call both, to check result */ +#define USE_BSEARCH 1 + + switch (USE_BSEARCH) { + case 0: + return find_state_position_linear(set, state, dst); + case 1: + return find_state_position_bsearch(set, state, dst); + case -1: + { + size_t dst_linear, dst_bsearch; + enum fsp_res res_linear = find_state_position_linear(set, state, &dst_linear); + enum fsp_res res_bsearch = find_state_position_bsearch(set, state, &dst_bsearch); + + if (res_linear != res_bsearch || dst_linear != dst_bsearch) { + fprintf(stderr, "%s: disagreement for state %d: linear res %d, dst %zu, bsearch res %d, dst %zu\n", + __func__, state, + res_linear, dst_linear, + res_bsearch, dst_bsearch); + for (size_t i = 0; i < set->count; i++) { + fprintf(stderr, "set->groups[%zu].to: %d\n", i, set->groups[i].to); + } + } + assert(res_linear == res_bsearch); + assert(dst_linear == dst_bsearch); + *dst = dst_linear; + return res_linear; + } + } +} + int edge_set_add_bulk(struct edge_set **pset, const struct fsm_alloc *alloc, uint64_t symbols[256/64], fsm_state_t state) @@ -228,25 +380,21 @@ edge_set_add_bulk(struct edge_set **pset, const struct fsm_alloc *alloc, state, (void *)set); #endif - /* Linear search for a group with the same destination - * state, or the position where that group would go. */ - for (i = 0; i < set->count; i++) { + switch (find_state_position(set, state, &i)) { + case FSP_FOUND_VALUE_PRESENT: + assert(i < set->count); + /* This API does not indicate whether that + * symbol -> to edge was already present. */ eg = &set->groups[i]; - - if (eg->to == state) { - /* This API does not indicate whether that - * symbol -> to edge was already present. */ - size_t i; - for (i = 0; i < 256/64; i++) { - eg->symbols[i] |= symbols[i]; - } - dump_edge_set(set); - return 1; - } else if (eg->to > state) { - break; /* will shift down and insert below */ - } else { - continue; + for (i = 0; i < 256/64; i++) { + eg->symbols[i] |= symbols[i]; } + dump_edge_set(set); + return 1; + + break; + case FSP_FOUND_INSERT_POSITION: + break; /* continue below */ } /* insert/append at i */ From 7122d2fe739e106be8b422206b15e681027acd11 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Aug 2023 17:19:24 -0400 Subject: [PATCH 39/51] edgeset: Commit to using binary search. When I run `time ./re -rpcre -C '^[ab]{0,2000}$'` locally for -O3: - linear search: 2.991s - binary search: 1.521s --- src/adt/edgeset.c | 59 +---------------------------------------------- 1 file changed, 1 insertion(+), 58 deletions(-) diff --git a/src/adt/edgeset.c b/src/adt/edgeset.c index 818b78493..dd3b8853c 100644 --- a/src/adt/edgeset.c +++ b/src/adt/edgeset.c @@ -194,7 +194,7 @@ enum fsp_res { * which includes the position immediately following the last entry. Return an enum * which indicates whether state is already present. */ static enum fsp_res -find_state_position_bsearch(const struct edge_set *set, fsm_state_t state, size_t *dst) +find_state_position(const struct edge_set *set, fsm_state_t state, size_t *dst) { size_t lo = 0, hi = set->count; if (LOG_BSEARCH) { @@ -279,63 +279,6 @@ find_state_position_bsearch(const struct edge_set *set, fsm_state_t state, size_ return FSP_FOUND_INSERT_POSITION; } -static enum fsp_res -find_state_position_linear(const struct edge_set *set, fsm_state_t state, size_t *dst) -{ - /* Linear search for a group with the same destination - * state, or the position where that group would go. 
*/ - size_t i; - for (i = 0; i < set->count; i++) { - const struct edge_group *eg = &set->groups[i]; - if (eg->to == state) { - *dst = i; - return FSP_FOUND_VALUE_PRESENT; - } else if (eg->to > state) { - break; /* will shift down and insert below */ - } else { - continue; - } - } - - *dst = i; - return FSP_FOUND_INSERT_POSITION; -} - -/* Find the state in the edge set, or where it would be inserted if not present. */ -static enum fsp_res -find_state_position(const struct edge_set *set, fsm_state_t state, size_t *dst) -{ - /* 0: linear, 1: bsearch, -1: call both, to check result */ -#define USE_BSEARCH 1 - - switch (USE_BSEARCH) { - case 0: - return find_state_position_linear(set, state, dst); - case 1: - return find_state_position_bsearch(set, state, dst); - case -1: - { - size_t dst_linear, dst_bsearch; - enum fsp_res res_linear = find_state_position_linear(set, state, &dst_linear); - enum fsp_res res_bsearch = find_state_position_bsearch(set, state, &dst_bsearch); - - if (res_linear != res_bsearch || dst_linear != dst_bsearch) { - fprintf(stderr, "%s: disagreement for state %d: linear res %d, dst %zu, bsearch res %d, dst %zu\n", - __func__, state, - res_linear, dst_linear, - res_bsearch, dst_bsearch); - for (size_t i = 0; i < set->count; i++) { - fprintf(stderr, "set->groups[%zu].to: %d\n", i, set->groups[i].to); - } - } - assert(res_linear == res_bsearch); - assert(dst_linear == dst_bsearch); - *dst = dst_linear; - return res_linear; - } - } -} - int edge_set_add_bulk(struct edge_set **pset, const struct fsm_alloc *alloc, uint64_t symbols[256/64], fsm_state_t state) From 937585a955d277231d558fe5ae20b0cc40c87eda Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 30 Aug 2023 12:08:44 -0400 Subject: [PATCH 40/51] determinise: Drastically reduce calls to qsort. After the other changes in this PR, calls to qsort from `sort_and_dedup_dst_buf` are one of the largest remaining hotspots in the profile. We can often avoid calling qsort, though: - If there is <= 1 entry, just return, it's sorted. - Otherwise, first do a sweep through the array noting the min and max values. Unless there is a huge range between them, it's much faster to build a bitset from them in a small (max 10KB) stack-allocated array and then unpack the bitset (now sorted and unique). Only the needed portion of the array is initialized. I have not done a lot of experimentation to find a cutoff point where the bitset becomes slower than qsort (it may be much larger), I picked 10KB because it's likely to be safe to stack-allocate. I tried changing the bitset unpacking to use an 8 or 16 bit mask and jump forward faster through large sub-word ranges of 0 bits, but any improvement was lost among random variation, so I decided it wasn't worth the extra complexity. We already skip whole words that are 0. 
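For the sizes involved: 81920 bits is 81920/64 = 1280 uint64_t words,
i.e. 10KB on the stack, and keeping the cutoff divisible by 64 makes
the word count exact. As a self-contained sketch of the trick -- using
a count-trailing-zeros builtin (GCC/Clang assumed) where the in-tree
version below walks the bits one at a time:

    #include <stdint.h>
    #include <string.h>

    /* Sort and dedup buf[0..used) in place; every entry must lie in
     * [min, min + 81920). Counting-sort style: O(n + range/64), no
     * comparisons. Returns the new length. */
    static size_t
    sort_dedup_small_range(uint32_t *buf, size_t used,
        uint32_t min, uint32_t max)
    {
        uint64_t bits[81920/64];
        const size_t words = (max - min)/64 + 1;     /* inclusive range */
        memset(bits, 0x00, words * sizeof(bits[0])); /* only what's needed */

        for (size_t i = 0; i < used; i++) {
            const uint32_t off = buf[i] - min;
            bits[off/64] |= (uint64_t)1 << (off & 63);
        }

        size_t out = 0;
        for (size_t w = 0; w < words; w++) {
            uint64_t word = bits[w];
            while (word != 0) {
                const int b = __builtin_ctzll(word); /* lowest set bit */
                buf[out++] = min + (uint32_t)(64*w) + (uint32_t)b;
                word &= word - 1;                    /* clear that bit */
            }
        }
        return out; /* entries are now sorted and unique */
    }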
--- src/libfsm/determinise.c | 95 ++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 18 deletions(-) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 56e135afd..ac15f05a8 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -2016,28 +2016,87 @@ static void sort_and_dedup_dst_buf(fsm_state_t *buf, size_t *used) { const size_t orig_used = *used; - qsort(buf, orig_used, sizeof(buf[0]), cmp_fsm_state_t); - - /* squash out duplicates */ - size_t rd = 1; - size_t wr = 1; - while (rd < orig_used) { - if (buf[rd - 1] == buf[rd]) { - rd++; /* skip */ - } else { - buf[wr] = buf[rd]; - rd++; - wr++; - } + + if (orig_used <= 1) { + return; /* no change */ } - *used = wr; -#if EXPENSIVE_CHECKS - assert(wr <= orig_used); - for (size_t i = 1; i < *used; i++) { - assert(buf[i - 1] < buf[i]); + /* Figure out what the min and max values are, because + * when the difference between them is not too large it + * can be significantly faster to avoid qsort here. */ + fsm_state_t min = (fsm_state_t)-1; + fsm_state_t max = 0; + for (size_t i = 0; i < orig_used; i++) { + const fsm_state_t cur = buf[i]; + if (cur < min) { min = cur; } + if (cur > max) { max = cur; } } + + /* If there's only one unique value, then we're done. */ + if (min == max) { + buf[0] = min; + *used = 1; + return; + } + +/* 81920 = 10 KB buffer on the stack. This must be divisible by 64. + * Set to 0 to disable. */ +#define QSORT_CUTOFF 81920 + + if (QSORT_CUTOFF == 0 || max - min > QSORT_CUTOFF) { + /* If the bitset would be very large but sparse due to + * extreme values, then fall back on using qsort and + * then sweeping over the array to squash out + * duplicates. */ + qsort(buf, orig_used, sizeof(buf[0]), cmp_fsm_state_t); + + /* squash out duplicates */ + size_t rd = 1; + size_t wr = 1; + while (rd < orig_used) { + if (buf[rd - 1] == buf[rd]) { + rd++; /* skip */ + } else { + buf[wr] = buf[rd]; + rd++; + wr++; + } + } + + *used = wr; +#if EXPENSIVE_CHECKS + assert(wr <= orig_used); + for (size_t i = 1; i < *used; i++) { + assert(buf[i - 1] < buf[i]); + } #endif + } else { + /* Convert the array into a bitset and back, which sorts + * and deduplicates in the process. Add 1 to avoid a zero- + * zero-length array error if QSORT_CUTOFF is 0. */ + uint64_t bitset[QSORT_CUTOFF/64 + 1]; + const size_t words = u64bitset_words(max - min); + memset(bitset, 0x00, words * sizeof(bitset[0])); + + for (size_t i = 0; i < orig_used; i++) { + u64bitset_set(bitset, buf[i] - min); + } + + size_t dst = 0; + for (size_t i = 0; i < words; i++) { + const uint64_t w = bitset[i]; + if (w != 0) { /* skip empty words */ + uint64_t bit = 0x1; + for (size_t b_i = 0; b_i < 64; b_i++, bit <<= 1) { + if (w & bit) { + buf[dst] = 64*i + b_i + min; + dst++; + } + } + } + } + *used = dst; + } } static int From 30e34ef9a739c78f788753919d68d6925171a0d5 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 30 Aug 2023 12:17:56 -0400 Subject: [PATCH 41/51] edgeset: Remove stale comment. --- src/adt/edgeset.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/adt/edgeset.c b/src/adt/edgeset.c index dd3b8853c..9658213c8 100644 --- a/src/adt/edgeset.c +++ b/src/adt/edgeset.c @@ -326,8 +326,6 @@ edge_set_add_bulk(struct edge_set **pset, const struct fsm_alloc *alloc, switch (find_state_position(set, state, &i)) { case FSP_FOUND_VALUE_PRESENT: assert(i < set->count); - /* This API does not indicate whether that - * symbol -> to edge was already present. 
*/ eg = &set->groups[i]; for (i = 0; i < 256/64; i++) { eg->symbols[i] |= symbols[i]; From cf6051f26135802784795465232369e5dfc66c66 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 30 Aug 2023 12:46:35 -0400 Subject: [PATCH 42/51] UBSan: Avoid implicit signed/unsigned conversion. --- src/libre/ast_compile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libre/ast_compile.c b/src/libre/ast_compile.c index 502faf8b4..aa0902d55 100644 --- a/src/libre/ast_compile.c +++ b/src/libre/ast_compile.c @@ -208,11 +208,11 @@ addedge_literal(struct comp_env *env, enum re_flags re_flags, assert(to < env->fsm->statecount); if (re_flags & RE_ICASE) { - if (!fsm_addedge_literal(fsm, from, to, tolower((unsigned char) c))) { + if (!fsm_addedge_literal(fsm, from, to, (char)tolower((unsigned char) c))) { return 0; } - if (!fsm_addedge_literal(fsm, from, to, toupper((unsigned char) c))) { + if (!fsm_addedge_literal(fsm, from, to, (char)toupper((unsigned char) c))) { return 0; } } else { From c1e12828b2e79a11b61978d809a0e020f5d8a6e6 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 30 Aug 2023 12:49:22 -0400 Subject: [PATCH 43/51] UBSan: Avoid implicit signed/unsigned conversion. --- src/retest/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/retest/main.c b/src/retest/main.c index b6b4c52f7..e01c93e7c 100644 --- a/src/retest/main.c +++ b/src/retest/main.c @@ -393,7 +393,7 @@ parse_escapes(char *s, char **errpos, int *lenp) ndig++; } else { - s[j++] = ccode; + s[j++] = (char)ccode; st = ST_BARE; if (!hexcurly) { From 6eff0f98ce625e93ba025f0f4c7358153708cc92 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 30 Aug 2023 13:17:12 -0400 Subject: [PATCH 44/51] bugfix: The range is min..max inclusive, so add 1. If min and max are exactly 64 states apart the upper value was getting silently dropped due to an incorrect `words` value here. One of the patterns in the PCRE suite triggers this: ./re -rpcre '(?:c|d)(?:)(?:aaaaaaaa(?:)(?:bbbbbbbb)(?:bbbbbbbb(?:))(?:bbbbbbbb(?:)(?:bbbbbbbb)))' "caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" This should match, but did not. --- src/libfsm/determinise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index ac15f05a8..3dbdc429e 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -2075,7 +2075,7 @@ sort_and_dedup_dst_buf(fsm_state_t *buf, size_t *used) * and deduplicates in the process. Add 1 to avoid a zero- * zero-length array error if QSORT_CUTOFF is 0. */ uint64_t bitset[QSORT_CUTOFF/64 + 1]; - const size_t words = u64bitset_words(max - min); + const size_t words = u64bitset_words(max - min + 1); memset(bitset, 0x00, words * sizeof(bitset[0])); for (size_t i = 0; i < orig_used; i++) { From 98ee906d15426150020070db975778bf5f97a565 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 15 Jun 2023 14:00:25 -0400 Subject: [PATCH 45/51] Address a couple warnings from scan-build. determinise: It's not possible to find a cached result in the hash table without allocating a to-set buffer first, so assert that it will be non-NULL. fsm_findmode: This should never be used on a state without edges. vm/v1.c and vm/v2.c: Free allocated return value on error. 
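The fsm_findmode change uses the standard way to make such an
invariant visible to the analyzer: start from an impossible sentinel,
then assert it was overwritten. Roughly (a distilled sketch, not the
exact mode.c code):

    fsm_state_t best = (fsm_state_t)-1; /* sentinel: no edge seen yet */

    /* ... scan the state's edge groups, updating best ... */

    /* Calling this on a state without edges is meaningless, and the
     * assertion documents that for scan-build as well as for humans. */
    assert(best != (fsm_state_t)-1);
    return best;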
--- src/libfsm/determinise.c | 2 ++ src/libfsm/mode.c | 4 ++++ src/libfsm/vm/v1.c | 4 +++- src/libfsm/vm/v2.c | 5 ++++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 3dbdc429e..9c77f3a42 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -1339,6 +1339,7 @@ to_set_htab_check(struct analyze_closures_env *env, if (b->count == 0) { return 0; /* empty bucket -> not found */ } else if (b->count == count) { + assert(env->to_sets.buf != NULL); assert(b->offset + count <= env->to_sets.used); const fsm_state_t *ids = &env->to_sets.buf[b->offset]; if (0 == memcmp(ids, dst, count * sizeof(dst[0]))) { @@ -1465,6 +1466,7 @@ save_to_set(struct analyze_closures_env *env, env->to_sets.ceil = nceil; env->to_sets.buf = nbuf; } + assert(env->to_sets.buf != NULL); #if LOG_TO_SET static size_t to_set_id; diff --git a/src/libfsm/mode.c b/src/libfsm/mode.c index 76c60b8ad..87af0bdf9 100644 --- a/src/libfsm/mode.c +++ b/src/libfsm/mode.c @@ -28,6 +28,7 @@ fsm_findmode(const struct fsm *fsm, fsm_state_t state, unsigned int *freq) } mode; mode.freq = 1; + mode.state = (fsm_state_t)-1; edge_set_group_iter_reset(fsm->states[state].edges, EDGE_GROUP_ITER_ALL, &iter); while (edge_set_group_iter_next(&iter, &info)) { @@ -46,6 +47,9 @@ fsm_findmode(const struct fsm *fsm, fsm_state_t state, unsigned int *freq) *freq = mode.freq; } + /* It's not meaningful to call this on a state without edges. */ + assert(mode.state != (fsm_state_t)-1); + assert(mode.freq >= 1); return mode.state; } diff --git a/src/libfsm/vm/v1.c b/src/libfsm/vm/v1.c index a326b88d8..de1f6ea93 100644 --- a/src/libfsm/vm/v1.c +++ b/src/libfsm/vm/v1.c @@ -217,7 +217,9 @@ encode_opasm_v1(const struct dfavm_vm_op *instr, size_t ninstr, size_t total_byt return ret; error: - /* XXX - cleanup */ + if (ret != NULL) { + free(ret); + } return NULL; } diff --git a/src/libfsm/vm/v2.c b/src/libfsm/vm/v2.c index c85edff98..07eb12ef4 100644 --- a/src/libfsm/vm/v2.c +++ b/src/libfsm/vm/v2.c @@ -155,7 +155,10 @@ encode_opasm_v2(const struct dfavm_vm_op *instr, size_t ninstr) return ret; error: - /* XXX - cleanup */ + if (ret != NULL) { + free(ret); + } + return NULL; } From 51892e3745fd642f6bb8a6acd31b69cf065a43b3 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 16 Feb 2023 13:52:31 -0500 Subject: [PATCH 46/51] Add src/adt/idmap.c, a state -> ID set map. --- Makefile | 1 + include/adt/idmap.h | 58 ++++++ src/adt/Makefile | 1 + src/adt/idmap.c | 392 ++++++++++++++++++++++++++++++++++++++ tests/idmap/Makefile | 19 ++ tests/idmap/idmap_basic.c | 136 +++++++++++++ 6 files changed, 607 insertions(+) create mode 100644 include/adt/idmap.h create mode 100644 src/adt/idmap.c create mode 100644 tests/idmap/Makefile create mode 100644 tests/idmap/idmap_basic.c diff --git a/Makefile b/Makefile index 8d742883e..f1f4f1396 100644 --- a/Makefile +++ b/Makefile @@ -108,6 +108,7 @@ SUBDIR += src SUBDIR += tests/capture SUBDIR += tests/complement SUBDIR += tests/gen +SUBDIR += tests/idmap SUBDIR += tests/intersect #SUBDIR += tests/ir # XXX: fragile due to state numbering SUBDIR += tests/eclosure diff --git a/include/adt/idmap.h b/include/adt/idmap.h new file mode 100644 index 000000000..064fd15d1 --- /dev/null +++ b/include/adt/idmap.h @@ -0,0 +1,58 @@ +#ifndef IDMAP_H +#define IDMAP_H + +/* Mapping between one fsm_state_t and a set of + * unsigned IDs. 
The implementation assumes that both + * IDs are sequentially assigned and don't need a sparse + * mapping -- it will handle 10 -> [1, 3, 47] well, but + * not 1000000 -> [14, 524288, 1073741823]. */ + +#include + +#include "fsm/fsm.h" +#include "fsm/alloc.h" + +struct idmap; /* Opaque handle. */ + +struct idmap * +idmap_new(const struct fsm_alloc *alloc); + +void +idmap_free(struct idmap *m); + +/* Associate a value with a state (if not already present.) + * Returns 1 on success, or 0 on allocation failure. */ +int +idmap_set(struct idmap *m, fsm_state_t state_id, unsigned value); + +/* How many values are associated with an ID? */ +size_t +idmap_get_value_count(const struct idmap *m, fsm_state_t state_id); + +/* Get the values associated with an ID. + * + * Returns 1 on success and writes them into the buffer, in ascending + * order, with the count in *written (if non-NULL). + * + * Returns 0 on error (insufficient buffer space). */ +int +idmap_get(const struct idmap *m, fsm_state_t state_id, + size_t buf_size, unsigned *buf, size_t *written); + +/* Iterator callback. */ +typedef void +idmap_iter_fun(fsm_state_t state_id, unsigned value, void *opaque); + +/* Iterate over the ID map. State IDs may be yielded out of order, + * values will be in ascending order. */ +void +idmap_iter(const struct idmap *m, + idmap_iter_fun *cb, void *opaque); + +/* Iterate over the values associated with a single state + * (in ascending order). */ +void +idmap_iter_for_state(const struct idmap *m, fsm_state_t state_id, + idmap_iter_fun *cb, void *opaque); + +#endif diff --git a/src/adt/Makefile b/src/adt/Makefile index 05199f2dc..64ad7429f 100644 --- a/src/adt/Makefile +++ b/src/adt/Makefile @@ -2,6 +2,7 @@ SRC += src/adt/alloc.c SRC += src/adt/bitmap.c +SRC += src/adt/idmap.c SRC += src/adt/internedstateset.c SRC += src/adt/priq.c SRC += src/adt/path.c diff --git a/src/adt/idmap.c b/src/adt/idmap.c new file mode 100644 index 000000000..ca169b71e --- /dev/null +++ b/src/adt/idmap.c @@ -0,0 +1,392 @@ +/* + * Copyright 2021 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#include "adt/idmap.h" + +#include "adt/alloc.h" +#include "adt/hash.h" +#include "adt/u64bitset.h" + +#include +#include +#include + +#define NO_STATE ((fsm_state_t)-1) + +#define DEF_BUCKET_COUNT 4 + +struct idmap { + const struct fsm_alloc *alloc; + unsigned bucket_count; + unsigned buckets_used; + + /* All buckets' values are assumed to be large + * enough to store this value, and they will all + * grow as necessary. */ + unsigned max_value; + + /* Basic linear-probing, add-only hash table. */ + struct idmap_bucket { + fsm_state_t state; /* Key. NO_STATE when empty. */ + + /* values[] is always either NULL or has at least + * max_value + 1 bits; all grow on demand. */ + uint64_t *values; + } *buckets; +}; + +static unsigned +value_words(unsigned max_value) { + if (max_value == 0) { + /* Still allocate one word, for storing 0. 
*/ + return 1; + } else { + return u64bitset_words(max_value); + } +} + +struct idmap * +idmap_new(const struct fsm_alloc *alloc) +{ + struct idmap *res = NULL; + struct idmap_bucket *buckets = NULL; + + res = f_malloc(alloc, sizeof(*res)); + if (res == NULL) { + goto cleanup; + } + + buckets = f_calloc(alloc, + DEF_BUCKET_COUNT, sizeof(buckets[0])); + if (buckets == NULL) { + goto cleanup; + } + + for (size_t i = 0; i < DEF_BUCKET_COUNT; i++) { + buckets[i].state = NO_STATE; + } + + res->alloc = alloc; + res->buckets_used = 0; + res->bucket_count = DEF_BUCKET_COUNT; + res->max_value = 0; + res->buckets = buckets; + + return res; + +cleanup: + f_free(alloc, res); + f_free(alloc, buckets); + return NULL; +} + +void +idmap_free(struct idmap *m) +{ + if (m == NULL) { + return; + } + + for (size_t i = 0; i < m->bucket_count; i++) { + if (m->buckets[i].state == NO_STATE) { + continue; + } + f_free(m->alloc, m->buckets[i].values); + } + + f_free(m->alloc, m->buckets); + f_free(m->alloc, m); +} + +static int +grow_bucket_values(struct idmap *m, unsigned old_words, unsigned new_words) +{ + assert(new_words > old_words); + + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + struct idmap_bucket *b = &m->buckets[b_i]; + if (b->state == NO_STATE) { + assert(b->values == NULL); + continue; + } + + uint64_t *nv = f_calloc(m->alloc, + new_words, sizeof(nv[0])); + if (nv == NULL) { + return 0; + } + + for (size_t w_i = 0; w_i < old_words; w_i++) { + nv[w_i] = b->values[w_i]; + } + f_free(m->alloc, b->values); + b->values = nv; + } + return 1; +} + +static int +grow_buckets(struct idmap *m) +{ + const size_t ocount = m->bucket_count; + const size_t ncount = 2*ocount; + assert(ncount > m->bucket_count); + + struct idmap_bucket *nbuckets = f_calloc(m->alloc, + ncount, sizeof(nbuckets[0])); + if (nbuckets == NULL) { + return 0; + } + for (size_t nb_i = 0; nb_i < ncount; nb_i++) { + nbuckets[nb_i].state = NO_STATE; + } + + const size_t nmask = ncount - 1; + + for (size_t ob_i = 0; ob_i < ocount; ob_i++) { + const struct idmap_bucket *ob = &m->buckets[ob_i]; + if (ob->state == NO_STATE) { + continue; + } + + const uint64_t h = hash_id(ob->state); + for (size_t nb_i = 0; nb_i < ncount; nb_i++) { + struct idmap_bucket *nb = &nbuckets[(h + nb_i) & nmask]; + if (nb->state == NO_STATE) { + nb->state = ob->state; + nb->values = ob->values; + break; + } else { + assert(nb->state != ob->state); + /* collision */ + continue; + } + } + } + + f_free(m->alloc, m->buckets); + + m->buckets = nbuckets; + m->bucket_count = ncount; + + return 1; +} + +int +idmap_set(struct idmap *m, fsm_state_t state_id, + unsigned value) +{ + assert(state_id != NO_STATE); + + const uint64_t h = hash_id(state_id); + if (value > m->max_value) { + const unsigned ovw = value_words(m->max_value); + const unsigned nvw = value_words(value); + /* If this value won't fit in the existing value + * arrays, then grow them all. We do not track the + * number of bits in each individual array. 
*/ + if (nvw > ovw && !grow_bucket_values(m, ovw, nvw)) { + return 0; + } + m->max_value = value; + } + + assert(m->max_value >= value); + + if (m->buckets_used >= m->bucket_count/2) { + if (!grow_buckets(m)) { + return 0; + } + } + + const uint64_t mask = m->bucket_count - 1; + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; + if (b->state == state_id) { + assert(b->values != NULL); + u64bitset_set(b->values, value); + return 1; + } else if (b->state == NO_STATE) { + b->state = state_id; + assert(b->values == NULL); + + const unsigned vw = value_words(m->max_value); + b->values = f_calloc(m->alloc, + vw, sizeof(b->values[0])); + if (b->values == NULL) { + return 0; + } + m->buckets_used++; + + u64bitset_set(b->values, value); + return 1; + } else { + continue; /* collision */ + } + + } + + assert(!"unreachable"); + return 0; +} + +static const struct idmap_bucket * +get_bucket(const struct idmap *m, fsm_state_t state_id) +{ + const uint64_t h = hash_id(state_id); + const uint64_t mask = m->bucket_count - 1; + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + const struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; + if (b->state == NO_STATE) { + return NULL; + } else if (b->state == state_id) { + return b; + } + } + + return NULL; +} + +size_t +idmap_get_value_count(const struct idmap *m, fsm_state_t state_id) +{ + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + return 0; + } + assert(b->values != NULL); + + size_t res = 0; + const size_t words = value_words(m->max_value); + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + /* This could use popcount64(w). */ + if (w == 0) { + continue; + } + for (uint64_t bit = 1; bit; bit <<= 1) { + if (w & bit) { + res++; + } + } + } + + return res; +} + +int +idmap_get(const struct idmap *m, fsm_state_t state_id, + size_t buf_size, unsigned *buf, size_t *written) +{ + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + if (written != NULL) { + *written = 0; + } + return 1; + } + + size_t buf_offset = 0; + const size_t words = value_words(m->max_value); + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + + for (uint64_t b_i = 0; b_i < 64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + if (buf_offset * sizeof(buf[0]) >= buf_size) { + return 0; + } + buf[buf_offset] = 64*w_i + b_i; + buf_offset++; + } + } + } + + if (written != NULL) { + *written = buf_offset; + } + return 1; +} + +void +idmap_iter(const struct idmap *m, + idmap_iter_fun *cb, void *opaque) +{ + const size_t words = value_words(m->max_value); + + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + const struct idmap_bucket *b = &m->buckets[b_i]; + if (b->state == NO_STATE) { + continue; + } + + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + for (uint64_t b_i = 0; b_i < 64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + const unsigned v = 64*w_i + b_i; + cb(b->state, v, opaque); + } + } + } + } +} + +void +idmap_iter_for_state(const struct idmap *m, fsm_state_t state_id, + idmap_iter_fun *cb, void *opaque) +{ + const size_t words = value_words(m->max_value); + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + return; + } + + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + /* if N contiguous bits are all zero, skip 
them all at once */ +#define BLOCK_BITS 16 + uint64_t block = ((uint64_t)1 << BLOCK_BITS) - 1; + size_t block_count = 0; + + uint64_t b_i = 0; + while (b_i < 64) { + if ((w & block) == 0) { + block <<= BLOCK_BITS; + b_i += BLOCK_BITS; + continue; + } + + if (w & ((uint64_t)1 << b_i)) { + const unsigned v = 64*w_i + b_i; + cb(b->state, v, opaque); + block_count++; + } + b_i++; + block <<= 1; + } + +#define CHECK 0 +#if CHECK + size_t check_count = 0; + for (uint64_t b_i = 0; b_i < 64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + check_count++; + } + } + assert(block_count == check_count); +#endif + } +} diff --git a/tests/idmap/Makefile b/tests/idmap/Makefile new file mode 100644 index 000000000..aee01f565 --- /dev/null +++ b/tests/idmap/Makefile @@ -0,0 +1,19 @@ +.include "../../share/mk/top.mk" + +TEST.tests/idmap != ls -1 tests/idmap/idmap*.c +TEST_SRCDIR.tests/idmap = tests/idmap +TEST_OUTDIR.tests/idmap = ${BUILD}/tests/idmap + +.for n in ${TEST.tests/idmap:T:R:C/^idmap//} +INCDIR.${TEST_SRCDIR.tests/idmap}/idmap${n}.c += src/adt +.endfor + +.for n in ${TEST.tests/idmap:T:R:C/^idmap//} +test:: ${TEST_OUTDIR.tests/idmap}/res${n} +SRC += ${TEST_SRCDIR.tests/idmap}/idmap${n}.c +CFLAGS.${TEST_SRCDIR.tests/idmap}/idmap${n}.c += -UNDEBUG -D_DEFAULT_SOURCE -std=c99 +${TEST_OUTDIR.tests/idmap}/run${n}: ${TEST_OUTDIR.tests/idmap}/idmap${n}.o ${BUILD}/lib/adt.o + ${CC} ${CFLAGS} ${CFLAGS.${TEST_SRCDIR.tests/idmap}/idmap${n}.c} -o ${TEST_OUTDIR.tests/idmap}/run${n} ${TEST_OUTDIR.tests/idmap}/idmap${n}.o ${BUILD}/lib/adt.o +${TEST_OUTDIR.tests/idmap}/res${n}: ${TEST_OUTDIR.tests/idmap}/run${n} + ( ${TEST_OUTDIR.tests/idmap}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/idmap}/res${n} +.endfor diff --git a/tests/idmap/idmap_basic.c b/tests/idmap/idmap_basic.c new file mode 100644 index 000000000..19f44d56e --- /dev/null +++ b/tests/idmap/idmap_basic.c @@ -0,0 +1,136 @@ +/* + * Copyright 2021 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#include +#include +#include + +#include + +#define DEF_LIMIT 10 +#define DEF_SEED 0 + +/* Thes numbers were chose to get a reasonable variety, + * but also some duplicated values as the input grows. */ +#define MAX_GEN_VALUES 23 +#define ID_MASK ((1 << 9) - 1) +#define VALUE_MASK ((1 << 10) - 1) + +static void +dump_cb(fsm_state_t state_id, unsigned value, void *opaque) +{ + /* fprintf(stderr, " -- state %d, value %u\n", state_id, value); */ + assert(state_id <= ID_MASK); + assert(value <= VALUE_MASK); + (void)opaque; +} + +static int +cmp_u(const void *pa, const void *pb) +{ + const unsigned a = *(unsigned *)pa; + const unsigned b = *(unsigned *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +int main(int argc, char **argv) { + const size_t limit = (argc > 1 ? atoi(argv[1]) : DEF_LIMIT); + const unsigned seed = (argc > 2 ? 
atoi(argv[2]) : DEF_SEED); + + (void)argc; + (void)argv; + struct idmap *m = idmap_new(NULL); + + srandom(seed); + + /* Fill the table with random data */ + for (size_t id_i = 0; id_i < limit; id_i++) { + const fsm_state_t id = (fsm_state_t)(random() & ID_MASK); + const size_t value_count = random() % MAX_GEN_VALUES; + + for (size_t v_i = 0; v_i < value_count; v_i++) { + const unsigned v = random() & VALUE_MASK; + if (!idmap_set(m, id, v)) { + assert(!"failed to set"); + } + } + } + + idmap_iter(m, dump_cb, NULL); + + srandom(seed); + + size_t got_buf_ceil = MAX_GEN_VALUES; + unsigned *got_buf = malloc(got_buf_ceil * sizeof(got_buf[0])); + assert(got_buf != NULL); + + /* Reset the PRNG and read back the same data. */ + for (size_t id_i = 0; id_i < limit; id_i++) { + const fsm_state_t id = (fsm_state_t)(random() & ID_MASK); + const size_t generated_value_count = random() % MAX_GEN_VALUES; + + /* Note: This can occasionally differ from + * generated_value_count, because the same id or values + * may have been generated more than once. As long as + * all the values match, it's fine. */ + const size_t value_count = idmap_get_value_count(m, id); + + if (value_count > got_buf_ceil) { + size_t nceil = got_buf_ceil; + while (nceil <= value_count) { + nceil *= 2; + } + free(got_buf); + got_buf = malloc(nceil * sizeof(got_buf[0])); + assert(got_buf != NULL); + got_buf_ceil = nceil; + } + + size_t written; + if (!idmap_get(m, id, + got_buf_ceil * sizeof(got_buf[0]), got_buf, + &written)) { + assert(!"failed to get"); + } + assert(written == value_count); + + unsigned gen_buf[MAX_GEN_VALUES]; + + for (size_t v_i = 0; v_i < generated_value_count; v_i++) { + const unsigned v = random() & VALUE_MASK; + gen_buf[v_i] = v; + } + qsort(gen_buf, generated_value_count, sizeof(gen_buf[0]), cmp_u); + + /* Every generated value should appear in the buffer. + * There may be more in the buffer; ignore them. */ + size_t v_i = 0; + for (size_t gen_i = 0; gen_i < generated_value_count; gen_i++) { + int found = 0; + const unsigned gv = gen_buf[gen_i]; + assert(value_count <= got_buf_ceil); + /* got_buf should be sorted, so we can pick up where we left off */ + while (v_i < value_count) { + if (gv == got_buf[v_i]) { + /* Intentionally don't increment v_i on match, + * because gen_buf can repeat values. */ + found = 1; + break; + } + v_i++; + } + if (!found) { + fprintf(stderr, "NOT FOUND: state %d -- value: %u\n", + id, gv); + return EXIT_FAILURE; + } + } + } + + free(got_buf); + idmap_free(m); + return EXIT_SUCCESS; +} From 7c6644f53b94c3be7246a67eaa985b03dfb0d526 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 10 Jul 2023 14:02:31 -0400 Subject: [PATCH 47/51] Remove theft test harness for deleted ADT (ipriq). --- theft/Makefile | 1 - theft/fuzz_adt_ipriq.c | 197 ----------------------------------------- 2 files changed, 198 deletions(-) delete mode 100644 theft/fuzz_adt_ipriq.c diff --git a/theft/Makefile b/theft/Makefile index 0d38d8cfc..921c482a9 100644 --- a/theft/Makefile +++ b/theft/Makefile @@ -6,7 +6,6 @@ SRC += theft/util.c SRC += theft/wrap.c SRC += theft/fuzz_adt_edge_set.c -SRC += theft/fuzz_adt_ipriq.c SRC += theft/fuzz_adt_priq.c SRC += theft/fuzz_capture_string_set.c SRC += theft/fuzz_literals.c diff --git a/theft/fuzz_adt_ipriq.c b/theft/fuzz_adt_ipriq.c deleted file mode 100644 index 1847ef6ce..000000000 --- a/theft/fuzz_adt_ipriq.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2021 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include "type_info_adt_ipriq.h" - -#include -#include - -struct model { - size_t used; - size_t entries[]; -}; - -static enum ipriq_cmp_res -cmp_size_t(size_t a, size_t b, void *opaque) -{ - (void)opaque; - return a < b ? IPRIQ_CMP_LT : - a > b ? IPRIQ_CMP_GT : IPRIQ_CMP_EQ; -} - -static int exec_add(size_t x, struct model *m, struct ipriq *pq) -{ - if (!ipriq_add(pq, x)) { - return 0; - } - - m->entries[m->used] = x; - m->used++; - return 1; -} - -static int find_min_pos(const struct model *m, size_t *pos) -{ - size_t i; - if (m->used == 0) { - return 0; - } - - size_t res, min; - res = 0; - min = m->entries[0]; - - for (i = 1; i < m->used; i++) { - if (m->entries[i] < min) { - res = i; - min = m->entries[i]; - } - } - *pos = res; - return 1; -} - -static int exec_peek(struct model *m, struct ipriq *pq) -{ - size_t res; - - if (!ipriq_peek(pq, &res)) { - return m->used == 0; - } - - size_t pos; - if (!find_min_pos(m, &pos)) { - assert(!"unreachable (peek)"); - } - - return res == m->entries[pos]; -} - -static int exec_pop(struct model *m, struct ipriq *pq) -{ - size_t res; - - if (!ipriq_pop(pq, &res)) { - return m->used == 0; - } - - size_t pos; - if (!find_min_pos(m, &pos)) { - assert(!"unreachable (pop)"); - } - - if (res != m->entries[pos]) { - return 0; - } - - assert(m->used > 0); - if (pos < m->used - 1) { - m->entries[pos] = m->entries[m->used - 1]; - } - m->used--; - return 1; -} - -static enum theft_trial_res -compare_against_model(const struct ipriq_scenario *scen) -{ - enum theft_trial_res res = THEFT_TRIAL_FAIL; - size_t i; - - struct model *m = malloc(sizeof(*m) - + scen->count * sizeof(m->entries[0])); - if (m == NULL) { - return THEFT_TRIAL_ERROR; - } - m->used = 0; - - struct ipriq *pq = ipriq_new(NULL, cmp_size_t, NULL); - if (pq == NULL) { - return THEFT_TRIAL_ERROR; - } - - for (i = 0; i < scen->count; i++) { - const struct ipriq_op *op = &scen->ops[i]; - - switch (op->t) { - case IPRIQ_OP_ADD: - if (!exec_add(op->u.add.x, m, pq)) { - goto cleanup; - } - break; - - case IPRIQ_OP_PEEK: - if (!exec_peek(m, pq)) { - goto cleanup; - } - break; - - case IPRIQ_OP_POP: - if (!exec_pop(m, pq)) { - goto cleanup; - } - break; - - default: - assert(false); break; - } - } - - res = THEFT_TRIAL_PASS; - -cleanup: - free(m); - - return res; -} - -static enum theft_trial_res -prop_ipriq_model(struct theft *t, void *arg1) -{ - const struct ipriq_scenario *scen = arg1; - (void)t; - return compare_against_model(scen); -} - -static bool -test_ipriq(theft_seed seed, uintptr_t limit) -{ - enum theft_run_res res; - - struct ipriq_hook_env env = { - .tag = 'I', - .limit = limit, - }; - - struct theft_run_config config = { - .name = __func__, - .prop1 = prop_ipriq_model, - .type_info = { &type_info_adt_ipriq }, - .trials = 1000, - .hooks = { - .trial_pre = theft_hook_first_fail_halt, - .env = &env, - }, - .fork = { - .enable = true, - }, - - .seed = seed, - }; - - (void)limit; - - res = theft_run(&config); - printf("%s: %s\n", __func__, theft_run_res_str(res)); - - return res == THEFT_RUN_PASS; -} - -void -register_test_adt_ipriq(void) -{ - reg_test1("adt_ipriq", test_ipriq, 10000); -} From c646868316ac8281ab38c33e2f9dae5cb7a5f7d8 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 2 Jun 2023 11:41:28 -0400 Subject: [PATCH 48/51] Add pcre-anchor test for anchoring edge case. 
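Roughly, the edge case: in `($x)*` nothing can follow `$` within an
iteration, so the group body can never match -- but the `*` still
permits zero iterations, so the pattern as a whole must match the
empty string rather than be rejected as unsatisfiable.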
--- tests/pcre-anchor/in81.re | 1 + tests/pcre-anchor/out81.fsm | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 tests/pcre-anchor/in81.re create mode 100644 tests/pcre-anchor/out81.fsm diff --git a/tests/pcre-anchor/in81.re b/tests/pcre-anchor/in81.re new file mode 100644 index 000000000..8b5fad7c3 --- /dev/null +++ b/tests/pcre-anchor/in81.re @@ -0,0 +1 @@ +($x)* \ No newline at end of file diff --git a/tests/pcre-anchor/out81.fsm b/tests/pcre-anchor/out81.fsm new file mode 100644 index 000000000..2cdc2f023 --- /dev/null +++ b/tests/pcre-anchor/out81.fsm @@ -0,0 +1,5 @@ +0 -> 0 ?; +0 -> 1 "\n"; + +start: 0; +end: 0, 1; \ No newline at end of file From 0789d614a22e2302132fb80f1fde4f6a056f793c Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 31 May 2023 17:14:35 -0400 Subject: [PATCH 49/51] fuzz/run_fuzzer: Run single seed file when given as argument. --- fuzz/run_fuzzer | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fuzz/run_fuzzer b/fuzz/run_fuzzer index be8ba1d95..429ffa961 100755 --- a/fuzz/run_fuzzer +++ b/fuzz/run_fuzzer @@ -4,6 +4,8 @@ BUILD=../build FUZZER=${BUILD}/fuzz/fuzzer SEEDS=${BUILD}/fuzz/fuzzer_seeds +ARG=$1 + SECONDS=${SECONDS:-60} WORKERS=${WORKERS:-4} SEEDS=${SEEDS:-seeds} @@ -25,5 +27,9 @@ if [ ! -d "${SEEDS}" ]; then mkdir -p "${SEEDS}" fi -echo "\n==== ${FUZZER}" -${FUZZER} -jobs=${WORKERS} -workers=${WORKERS} -max_total_time=${SECONDS} ${SEEDS} +if [ -z "${ARG}" ]; then + echo "\n==== ${FUZZER}" + exec ${FUZZER} -jobs=${WORKERS} -workers=${WORKERS} -max_total_time=${SECONDS} ${SEEDS} +else + exec ${FUZZER} ${ARG} +fi \ No newline at end of file From 1ca3726322abed8c02f7f16bffe8b32683189605 Mon Sep 17 00:00:00 2001 From: Kate F Date: Tue, 12 Sep 2023 12:14:20 +0100 Subject: [PATCH 50/51] Don't purge the seed cache for PRs syncing clones. --- .github/workflows/ci.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 429ec2523..27fdf78f7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -409,8 +409,15 @@ jobs: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} - # note we do the fuzzing unconditionally; each run adds to the corpus + # note we do the fuzzing unconditionally; each run adds to the corpus. + # + # We only run fuzzing for PRs in the base repo, this prevents attempting + # to purge the seed cache from a PR syncing a forked repo, which fails + # due to a permissions error (I'm unsure why, I think PRs from clones can't + # purge a cache in CI presumably for security/DoS reasons). PRs from clones + # still run fuzzing, just from empty, and do not save their seeds. - name: Restore seeds (mode ${{ matrix.mode }}) + if: github.repository == 'katef/libfsm' uses: actions/cache/restore@v3 id: cache-seeds with: From 239927c3696e6953e1a48840a01451addac32c2b Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 2 Oct 2023 09:43:22 -0400 Subject: [PATCH 51/51] ci.yml: Update fuzzer modes for CI. 
- 'd' (default) no longer exists, it's now 'r' - add 's', 'i', 'M' --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 27fdf78f7..3315fc025 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -374,7 +374,7 @@ jobs: cc: [ clang ] make: [ bmake ] debug: [ DEBUG, RELEASE ] # RELEASE=1 is a no-op - mode: [ m, p, d ] + mode: [ r, s, m, i, M, p ] exclude: - os: macos cc: gcc # it's clang anyway