diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 429ec2523..3315fc025 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -374,7 +374,7 @@ jobs: cc: [ clang ] make: [ bmake ] debug: [ DEBUG, RELEASE ] # RELEASE=1 is a no-op - mode: [ m, p, d ] + mode: [ r, s, m, i, M, p ] exclude: - os: macos cc: gcc # it's clang anyway @@ -409,8 +409,15 @@ jobs: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} - # note we do the fuzzing unconditionally; each run adds to the corpus + # note we do the fuzzing unconditionally; each run adds to the corpus. + # + # We only run fuzzing for PRs in the base repo, this prevents attempting + # to purge the seed cache from a PR syncing a forked repo, which fails + # due to a permissions error (I'm unsure why, I think PRs from clones can't + # purge a cache in CI presumably for security/DoS reasons). PRs from clones + # still run fuzzing, just from empty, and do not save their seeds. - name: Restore seeds (mode ${{ matrix.mode }}) + if: github.repository == 'katef/libfsm' uses: actions/cache/restore@v3 id: cache-seeds with: diff --git a/Makefile b/Makefile index 8d742883e..f1f4f1396 100644 --- a/Makefile +++ b/Makefile @@ -108,6 +108,7 @@ SUBDIR += src SUBDIR += tests/capture SUBDIR += tests/complement SUBDIR += tests/gen +SUBDIR += tests/idmap SUBDIR += tests/intersect #SUBDIR += tests/ir # XXX: fragile due to state numbering SUBDIR += tests/eclosure diff --git a/fuzz/Makefile b/fuzz/Makefile index 1d418cd97..2327a811b 100644 --- a/fuzz/Makefile +++ b/fuzz/Makefile @@ -7,6 +7,15 @@ ${BUILD}/fuzz/: ${BUILD} DIR += ${BUILD}/fuzz +# Uncomment to enable capture fuzzing using PCRE as a test oracle. 
+#PCRE_CMP=1 + +.if PCRE_CMP +PKG += libpcre2-8 +LFLAGS.fuzzer += ${LIBS.libpcre2-8} +CFLAGS.${SRC:Mfuzz/target.c} += -DCMP_PCRE=1 +.endif + .for src in ${SRC:Mfuzz/*.c} CFLAGS.${src} += -std=c99 .endfor @@ -15,7 +24,7 @@ CFLAGS.${src} += -std=c99 fuzz:: ${BUILD}/fuzz/fuzzer ${BUILD}/fuzz/fuzzer: mkdir - ${CC} -o $@ ${LFLAGS} ${.ALLSRC:M*.o} ${.ALLSRC:M*.a} + ${CC} -o $@ ${LFLAGS} ${LFLAGS.fuzzer} ${.ALLSRC:M*.o} ${.ALLSRC:M*.a} .for lib in ${LIB:Mlibfsm} ${LIB:Mlibre} ${BUILD}/fuzz/fuzzer: ${BUILD}/lib/${lib:R}.a diff --git a/fuzz/run_fuzzer b/fuzz/run_fuzzer index be8ba1d95..429ffa961 100755 --- a/fuzz/run_fuzzer +++ b/fuzz/run_fuzzer @@ -4,6 +4,8 @@ BUILD=../build FUZZER=${BUILD}/fuzz/fuzzer SEEDS=${BUILD}/fuzz/fuzzer_seeds +ARG=$1 + SECONDS=${SECONDS:-60} WORKERS=${WORKERS:-4} SEEDS=${SEEDS:-seeds} @@ -25,5 +27,9 @@ if [ ! -d "${SEEDS}" ]; then mkdir -p "${SEEDS}" fi -echo "\n==== ${FUZZER}" -${FUZZER} -jobs=${WORKERS} -workers=${WORKERS} -max_total_time=${SECONDS} ${SEEDS} +if [ -z "${ARG}" ]; then + echo "\n==== ${FUZZER}" + exec ${FUZZER} -jobs=${WORKERS} -workers=${WORKERS} -max_total_time=${SECONDS} ${SEEDS} +else + exec ${FUZZER} ${ARG} +fi \ No newline at end of file diff --git a/fuzz/target.c b/fuzz/target.c index 87086b929..736c2d889 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -23,17 +24,47 @@ #include "../src/libfsm/minimise_test_oracle.h" +/* for fsm_capture_dump */ +/* FIXME: should this be a public interface? */ +#include "../src/libfsm/capture.h" + +/* Buffer for sanitized fuzzer input */ +#define MAX_FUZZER_DATA (64 * 1024) +static uint8_t data_buf[MAX_FUZZER_DATA + 1]; + +/* Should fuzzer harness code be built that compares behavior + * with PCRE? (Obviously, this depends on PCRE.) 
*/ +#ifndef CMP_PCRE +#define CMP_PCRE 0 +#endif + +#if CMP_PCRE +#define PCRE2_CODE_UNIT_WIDTH 8 +#include + +static int +compare_with_pcre(const char *pattern, struct fsm *fsm); +#endif + /* 10 seconds */ #define TIMEOUT_USEC (10ULL * 1000 * 1000) +/* for TRACK_TIMES and EXPENSIVE_CHECKS */ +#include "../src/libfsm/internal.h" + enum run_mode { - MODE_DEFAULT, + MODE_REGEX, + MODE_REGEX_SINGLE_ONLY, + MODE_REGEX_MULTI_ONLY, + MODE_IDEMPOTENT_DET_MIN, MODE_SHUFFLE_MINIMISE, MODE_ALL_PRINT_FUNCTIONS, }; +static size_t +get_env_config(size_t default_value, const char *env_var_name); -/* This stuff will already exist elsewhere once other branches are merged. */ +/* TODO: These could be moved to a common file for test utils. */ #if 1 static void time_get(struct timeval *tv) @@ -75,7 +106,110 @@ scanner_next(void *opaque) } #endif -static const struct fsm_options opt; +/* This is used to track allocation during each fuzzer + * run. Note that hwm is not reduced when memory is + * free'd or realloc'd, because the size info is not + * passed to those calls. */ +#define MB(X) ((size_t)X * 1000 * 1000) +#define FH_ALLOCATOR_HWM_LIMIT (MB(50)) +struct fh_allocator_stats { + size_t hwm; /* high water mark */ +}; + +static void +fh_memory_hwm_limit_hook(const char *caller_name) +{ + /* It doesn't really help to exit here because libfuzzer will + * still treat it as a failure, but at least we can print a + * message about hitting the allocator limit and exit so we + * don't need to spend time investigating timeouts or ooms + * that are due to obvious resource exhaustion. 
*/ + fprintf(stderr, "%s: hit FH_ALLOCATOR_HWM_LIMIT (%zu), exiting\n", + caller_name, FH_ALLOCATOR_HWM_LIMIT); + exit(EXIT_SUCCESS); +} + +static void +fh_free(void *opaque, void *p) +{ + (void)opaque; + free(p); +} + +static void * +fh_calloc(void *opaque, size_t n, size_t sz) +{ + struct fh_allocator_stats *stats = opaque; + stats->hwm += sz; + if (stats->hwm > FH_ALLOCATOR_HWM_LIMIT) { + fh_memory_hwm_limit_hook(__func__); + return NULL; + } + + (void)opaque; + return calloc(n, sz); +} + +static void * +fh_malloc(void *opaque, size_t sz) +{ + struct fh_allocator_stats *stats = opaque; + stats->hwm += sz; + if (stats->hwm > FH_ALLOCATOR_HWM_LIMIT) { + fh_memory_hwm_limit_hook(__func__); + return NULL; + } + + return malloc(sz); +} + +static void * +fh_realloc(void *opaque, void *p, size_t sz) +{ + struct fh_allocator_stats *stats = opaque; + stats->hwm += sz; + if (stats->hwm > FH_ALLOCATOR_HWM_LIMIT) { + fh_memory_hwm_limit_hook(__func__); + return NULL; + } + + return realloc(p, sz); +} + +static struct fh_allocator_stats allocator_stats; + +/* fuzzer harness allocators */ +static struct fsm_alloc custom_allocators = { + .free = fh_free, + .calloc = fh_calloc, + .malloc = fh_malloc, + .realloc = fh_realloc, + .opaque = &allocator_stats, +}; + +static const struct fsm_options fsm_options = { + .group_edges = 1, /* make output readable */ + .alloc = &custom_allocators, +}; + +static void +dump_pattern(const char *pattern) +{ + const size_t pattern_length = strlen(pattern); + fprintf(stderr, "-- Pattern: %zu bytes\n", pattern_length); + for (size_t i = 0; i < pattern_length; i++) { + fprintf(stderr, " %02x", (uint8_t)pattern[i]); + if ((i & 31) == 31) { fprintf(stderr, "\n"); } + } + if ((pattern_length & 31) != 31) { + fprintf(stderr, "\n"); + } + for (size_t i = 0; i < pattern_length; i++) { + fprintf(stderr, "%c", isprint(pattern[i]) ? 
pattern[i] : '.'); + if ((i & 63) == 63) { fprintf(stderr, "\n"); } + } + fprintf(stderr, "\n"); +} static struct fsm * build(const char *pattern) @@ -95,7 +229,7 @@ build(const char *pattern) }; time_get(&pre); - fsm = re_comp(RE_PCRE, scanner_next, &s, &opt, RE_MULTI, &err); + fsm = re_comp(RE_PCRE, scanner_next, &s, &fsm_options, RE_MULTI, &err); time_get(&post); delta_usec = time_diff_usec(&pre, &post); total_usec += delta_usec; @@ -122,8 +256,10 @@ build(const char *pattern) if (total_usec > TIMEOUT_USEC) { #ifndef EXPENSIVE_CHECKS + dump_pattern(pattern); assert(!"timeout"); #else + (void)dump_pattern; fprintf(stderr, "exiting zero due to timeout under EXPENSIVE_CHECKS\n"); exit(0); #endif @@ -132,6 +268,17 @@ build(const char *pattern) return fsm; } +static size_t +get_env_config(size_t default_value, const char *env_var_name) +{ + const char *s = getenv(env_var_name); + if (s == NULL) { + return default_value; + } else { + return strtoul(s, NULL, 10); + } +} + static int codegen(const struct fsm *fsm) { @@ -143,14 +290,46 @@ codegen(const struct fsm *fsm) } static int -build_and_codegen(const char *pattern) +build_and_check_single(const char *pattern) { + const int verbosity = get_env_config(0, "VERBOSITY"); + if (verbosity > 1) { + fprintf(stderr, "pattern: \"%s\"\n", pattern); + } + + INIT_TIMERS(); + TIME(&pre); struct fsm *fsm = build(pattern); if (fsm == NULL) { return EXIT_SUCCESS; } + TIME(&post); + DIFF_MSEC("build", pre, post, NULL); + + if (getenv("DUMP")) { + fprintf(stderr,"==================================================\n"); + fsm_print_fsm(stderr, fsm); + fprintf(stderr,"==================================================\n"); + fsm_capture_dump(stderr, "CAPTURE", fsm); + fprintf(stderr,"==================================================\n"); + } + +#if CMP_PCRE + TIME(&pre); + const int cmp_res = compare_with_pcre(pattern, fsm); + TIME(&post); + DIFF_MSEC("cmp", pre, post, NULL); + if (!cmp_res) { + fsm_free(fsm); + return EXIT_SUCCESS; + } 
+#endif - if (!codegen(fsm)) { + TIME(&pre); + const int codegen_res = codegen(fsm); + TIME(&post); + DIFF_MSEC("codegen", pre, post, NULL); + if (!codegen_res) { return EXIT_SUCCESS; } @@ -158,6 +337,1022 @@ build_and_codegen(const char *pattern) return EXIT_SUCCESS; } +#define DEF_MAX_DEPTH 20 +#define DEF_MAX_LENGTH 10 +#define DEF_MAX_STEPS 10000 +#define DEF_MAX_MATCH_COUNT 1000 + +#if CMP_PCRE +/* These two are only used with PCRE2 */ +#define ANCHORED_PCRE 0 +#define FUZZ_RE_MATCH_LIMIT 10000 +#define FUZZ_RE_RECURSION_LIMIT 200 +#define MAX_OVEC_SIZE 512 + +static pcre2_match_context *pcre2_mc = NULL; + +struct cmp_pcre_env { + int verbosity; + const char *pattern; + const struct fsm *fsm; + pcre2_match_data *md; + pcre2_code *p; + + struct fsm_capture *captures; + size_t captures_length; + + size_t max_depth; + size_t max_steps; + size_t max_match_count; +}; + +struct test_pcre_match_info { + int res; + int pcre_error; + size_t ovector[MAX_OVEC_SIZE]; +}; + +static pcre2_code * +build_pcre2(const char *pattern, int verbosity) +{ + const uint32_t options = ANCHORED_PCRE ? 
PCRE2_ANCHORED : 0; + int errorcode; + PCRE2_SIZE erroffset = 0; + pcre2_compile_context *cctx = NULL; + + /* Set match limits */ + if (pcre2_mc == NULL) { + pcre2_mc = pcre2_match_context_create(NULL); + assert(pcre2_mc != NULL); + + pcre2_set_match_limit(pcre2_mc, FUZZ_RE_MATCH_LIMIT); + pcre2_set_recursion_limit(pcre2_mc, FUZZ_RE_RECURSION_LIMIT); + } + + pcre2_code *p = pcre2_compile((const unsigned char *)pattern, + PCRE2_ZERO_TERMINATED, + options, &errorcode, &erroffset, cctx); + if (verbosity > 0 && p == NULL && errorcode != 0) { +#define ERRSIZE 4096 + unsigned char errbuf[ERRSIZE] = {0}; + if (!pcre2_get_error_message(errorcode, + errbuf, ERRSIZE)) { + fprintf(stderr, "pcre2_get_error_message: failed\n"); + } + fprintf(stderr, "pcre2_compile: error: %s\n", errbuf); + } + return p; +} + +enum do_pcre_match_res { + DO_PCRE_MATCH_HIT, + DO_PCRE_MATCH_MISS, + DO_PCRE_MATCH_SKIP, /* an exceptional case we don't care about */ + DO_PCRE_MATCH_ERROR = -1, +}; +enum do_pcre_match_res +do_pcre_match(FILE *f, const pcre2_code *p, pcre2_match_data *md, int verbosity, + const char *input, struct test_pcre_match_info *match_info) +{ +#define MAX_BUF (64*1024) + const size_t input_len = strlen(input); + enum do_pcre_match_res mres; + + /* turn off the JIT because it can give inconsistent results while fuzzing */ + const uint32_t options = (ANCHORED_PCRE ? PCRE2_ANCHORED : 0) + | PCRE2_NO_JIT; + + assert(pcre2_mc != NULL); + + /* The value returned by pcre2_match() is one more than the + * highest numbered pair that has been set. */ + int res = pcre2_match(p, (const unsigned char *)input, input_len, + 0, options, md, pcre2_mc); + + if (res == PCRE2_ERROR_NOMATCH || res == PCRE2_ERROR_PARTIAL) { + if (f != NULL && verbosity > 1) { + fprintf(f, " -- no match (%s)\n", + res == PCRE2_ERROR_NOMATCH ? "NOMATCH" + : res == PCRE2_ERROR_PARTIAL ? 
"PARTIAL" + : ""); + } + mres = DO_PCRE_MATCH_MISS; + goto cleanup; + } else if (res == PCRE2_ERROR_MATCHLIMIT || res == PCRE2_ERROR_DEPTHLIMIT) { + /* It's possible to exhaust PCRE's internal limits with pathologically + * nested regexes like "(((((((((^.)?)*)?)?)?)*)+)+)*$" and + * "((((((((akbzaabdcOaa)|((((b*))))?|.|.|.*|.|.))+)+)+$)*)?)" , but + * as long as they don't cause it to block for excessively long or + * exhaust resources that's fine. */ + if (f != NULL) { + fprintf(f, " -- PCRE2_ERROR_MATCHLIMIT (returning SKIP)\n"); + } + mres = DO_PCRE_MATCH_SKIP; + } else if (res <= 0) { + if (f != NULL) { +#define ERR_MAX 4096 + unsigned char err_buf[ERR_MAX]; + if (pcre2_get_error_message(res, err_buf, ERR_MAX)) { + fprintf(f, " -- error %d: %s\n", res, err_buf); + } else { + fprintf(f, " -- error %d\n", res); + } +#undef ERR_MAX + } + if (match_info != NULL) { + match_info->pcre_error = res; + } + mres = DO_PCRE_MATCH_ERROR; + goto cleanup; + } else { + const uint32_t ovc = pcre2_get_ovector_count(md); + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md); + assert(res >= 0); + size_t ures = (size_t)res; + assert(ovc > ures); + + assert(ovector[1] >= ovector[0]); + const size_t mlen = ovector[1] - ovector[0]; + if (ANCHORED_PCRE && (ovector[0] != 0 || mlen != input_len)) { + mres = DO_PCRE_MATCH_MISS; + goto cleanup; + } + mres = DO_PCRE_MATCH_HIT; + + if (f != NULL && verbosity > 1) { + for (size_t i = 0; i < ures; i++) { + char buf[MAX_BUF] = { 0 }; + memcpy(buf, &input[ovector[2*i]], + ovector[2*i + 1U] - ovector[2*i]); + fprintf(f, " -- %zu: \"%s\"\n", i, buf); + } + } + + if (match_info != NULL && res < MAX_OVEC_SIZE) { + match_info->res = res; + assert(res >= 0); + const size_t ures = (size_t)res; + + for (size_t i = 0; i < 2*ures; i++) { + match_info->ovector[i] = ovector[i]; + } + } + } + +cleanup: + return mres; +#undef MAX_BUF +} + +static bool +exec_and_compare_captures(struct cmp_pcre_env *env, + const char *input, size_t input_size, + const struct 
test_pcre_match_info *match_info) +{ + bool matching = true; + fsm_state_t end_state; + const uint8_t *u8_input = (const uint8_t *)input; + int res = fsm_exec_with_captures(env->fsm, u8_input, input_size, + &end_state, env->captures, env->captures_length); + + if (res < 0) { + if (env->verbosity > 1) { + fprintf(stderr, "got res of %d\n", res); + } + + return false; + } + + if (res > 0) { + assert(match_info->res >= 0); + const size_t ures = (size_t)match_info->res; + + if (env->verbosity > 1) { + fprintf(stderr, "ures %zu\n", ures); + } + + for (size_t i = 0; i < ures; i++) { + if (env->verbosity > 1) { + fprintf(stderr, "%zu/%zu: pcre [%ld, %ld] <-> libfsm [%ld, %ld]\n", + i, ures, + match_info->ovector[2*i], match_info->ovector[2*i + 1], + env->captures[i].pos[0], env->captures[i].pos[1]); + } + if ((match_info->ovector[2*i] != env->captures[i].pos[0]) + || (match_info->ovector[2*i + 1] != env->captures[i].pos[1])) { + matching = false; + } + } + + if (!matching) { + for (size_t i = 0; i < ures; i++) { + fprintf(stderr, "%zu/%zu: pcre [%ld, %ld] <-> libfsm [%ld, %ld]\n", + i, ures, + match_info->ovector[2*i], match_info->ovector[2*i + 1], + env->captures[i].pos[0], env->captures[i].pos[1]); + } + } + } + + return matching; +} + +static void +dump_pattern_and_input(const char *pattern, const char *input, size_t input_length) +{ + dump_pattern(pattern); + + fprintf(stderr, "-- Input: %zu bytes\n", input_length); + for (size_t i = 0; i < input_length; i++) { + fprintf(stderr, " %02x", (uint8_t)input[i]); + if ((i & 31) == 31) { fprintf(stderr, "\n"); } + } + if ((input_length & 31) != 31) { + fprintf(stderr, "\n"); + } + for (size_t i = 0; i < input_length; i++) { + fprintf(stderr, "%c", isprint(input[i]) ? 
input[i] : '.'); + if ((i & 63) == 63) { fprintf(stderr, "\n"); } + } + fprintf(stderr, "\n"); +} + +static enum fsm_generate_matches_cb_res +cmp_pcre_gen_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + struct cmp_pcre_env *env = opaque; + assert(env != NULL); + + (void)fsm; + (void)depth; + (void)end_state; + + const size_t len = strlen(input); + + if (env->verbosity > 4) { + fprintf(stderr, "%s: depth %zu/%zu, match_count %zu/%zu, steps %zu/%zu\n", + __func__, + depth, env->max_depth, + match_count, env->max_match_count, + steps, env->max_steps); + } + + if (steps > env->max_steps) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (match_count > env->max_match_count) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (depth > env->max_depth) { + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + /* Completely avoid exploring inputs with embedded 0x00 bytes. */ + if (input_length != len) { + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + if (len > 0 && input[len - 1] == '\n') { + /* These will need to be handled properly, but PCRE has + * special cases for '\n' handling. 
*/ + /* fprintf(stderr, " -- skipping input ending with '\\n'.\n"); */ + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + struct test_pcre_match_info match_info = { .pcre_error = 0 }; + enum do_pcre_match_res mres = do_pcre_match(stderr, + env->p, env->md, env->verbosity, input, &match_info); + switch (mres) { + case DO_PCRE_MATCH_SKIP: + break; + case DO_PCRE_MATCH_MISS: + dump_pattern_and_input(env->pattern, input, input_length); + assert(!"matches libfsm but not with PCRE"); + return FSM_GENERATE_MATCHES_CB_RES_HALT; + case DO_PCRE_MATCH_ERROR: + fprintf(stderr, "FAIL: PCRE returned ERROR %d: pattern \"%s\"\n", + match_info.pcre_error, env->pattern); + return FSM_GENERATE_MATCHES_CB_RES_HALT; + case DO_PCRE_MATCH_HIT: + break; /* okay; continue below */ + } + + if (env->verbosity > 1) { + fprintf(stderr, "-- comparing captures for pattern \"%s\", input \"%s\" (len %zu)\n", + env->pattern, input, len); + } + + if (!exec_and_compare_captures(env, input, input_length, &match_info)) { + if (env->verbosity > 1 || 1) { + dump_pattern_and_input(env->pattern, input, input_length); + fsm_print_fsm(stderr, env->fsm); + fsm_capture_dump(stderr, "fsm", env->fsm); + } + assert(!"captures don't match"); + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; +} + +static int +compare_fixed_input(struct fsm *fsm, const char *pattern, const char *input, pcre2_match_data *md, pcre2_code *p) +{ + fsm_state_t end_state; + const size_t capture_ceil = fsm_capture_ceiling(fsm); + + struct fsm_capture *captures = malloc(capture_ceil * sizeof(captures[0])); + assert(captures != NULL); + for (size_t i = 0; i < capture_ceil; i++) { + /* clobber with meaningless but visually distinct values */ + captures[i].pos[0] = 88888888; + captures[i].pos[1] = 99999999; + }; + + const uint8_t *u8_input = (const uint8_t *)input; + const size_t input_len = strlen(input); + const int libfsm_res = fsm_exec_with_captures(fsm, u8_input, input_len, + &end_state, captures, capture_ceil); + + const bool 
libfsm_matching = libfsm_res > 0; + + int res = 1; + + struct test_pcre_match_info match_info = { .pcre_error = 0 }; + enum do_pcre_match_res mres = do_pcre_match(stderr, + p, md, 0, input, &match_info); + switch (mres) { + case DO_PCRE_MATCH_SKIP: + return 1; + case DO_PCRE_MATCH_MISS: + if (!libfsm_matching) { + goto cleanup; + } + dump_pattern_and_input(pattern, input, 0); + assert(!"matches libfsm but not with PCRE"); + return 0; + case DO_PCRE_MATCH_ERROR: + fprintf(stderr, "FAIL: PCRE returned ERROR %d: pattern \"%s\"\n", + match_info.pcre_error, pattern); + return 0; + case DO_PCRE_MATCH_HIT: + if (!libfsm_matching) { + dump_pattern_and_input(pattern, input, input_len); + assert(!"matches PCRE but not libfsm"); + res = 0; + goto cleanup; + } + + const size_t ures = (size_t)match_info.res; + if (ures > capture_ceil) { + dump_pattern_and_input(pattern, input, 0); + fprintf(stderr, "error: capture_ceil: %zu exceeded by ures: %zd\n", + capture_ceil, ures); + assert(!"both PCRE and libfsm match but with different capture counts"); + } + + bool matching = true; + for (size_t i = 0; i < ures; i++) { + if ((match_info.ovector[2*i] != captures[i].pos[0]) + || (match_info.ovector[2*i + 1] != captures[i].pos[1])) { + matching = false; + } + } + for (size_t i = 0; i < ures; i++) { + if (!matching) { + fprintf(stderr, "%zu/%zu: pcre [%ld, %ld] <-> libfsm [%ld, %ld]\n", + i, ures, + match_info.ovector[2*i], match_info.ovector[2*i + 1], + captures[i].pos[0], captures[i].pos[1]); + } + } + + if (!matching) { + dump_pattern_and_input(pattern, input, 0); + assert(!"both PCRE and libfsm match but with different captures"); + } + + goto cleanup; /* ok, both matched */ + } + + assert(!"unreachable"); + +cleanup: + free(captures); + return res; + +} + +static int +compare_with_pcre(const char *pattern, struct fsm *fsm) +{ + size_t verbosity = get_env_config(0, "VERBOSITY"); + size_t max_length = get_env_config(DEF_MAX_LENGTH, "MAX_LENGTH"); + size_t max_steps = 
get_env_config(DEF_MAX_STEPS, "MAX_STEPS"); + size_t max_depth = get_env_config(DEF_MAX_DEPTH, "MAX_DEPTH"); + size_t max_match_count = get_env_config(DEF_MAX_MATCH_COUNT, "MAX_MATCH_COUNT"); + int res = 1; + + pcre2_match_data *md; + + pcre2_code *p = build_pcre2(pattern, 0); + if (p == NULL) { + return 1; + } + + md = pcre2_match_data_create(MAX_OVEC_SIZE, NULL); + assert(md != NULL); + + /* Check the empty string and "\n", because PCRE has an awkward + * special case for "\n" that has complicated interactions + * with start and end anchoring. */ + if (!compare_fixed_input(fsm, pattern, "", md, p) + || !compare_fixed_input(fsm, pattern, "\n", md, p)) { + pcre2_match_data_free(md); + pcre2_code_free(p); + return res; + } + + struct fsm_capture captures[MAX_OVEC_SIZE/2] = { 0 }; + + const size_t pattern_length = strlen(pattern); + if (pattern_length >= max_length) { + max_length = pattern_length + 1; + static size_t max_max_length; + if (max_length > max_max_length) { + fprintf(stderr, "Note: increasing max_length to %zu\n", + pattern_length + 1); + max_max_length = max_length; + if (max_depth < max_length) { + max_depth = max_length + 1; + } + } + } + + struct cmp_pcre_env env = { + .verbosity = (int)verbosity, + .pattern = pattern, + .fsm = fsm, + .captures = captures, + .captures_length = MAX_OVEC_SIZE/2, + .md = md, + .p = p, + .max_steps = max_steps, + .max_depth = max_depth, + .max_match_count = max_match_count, + }; + + if (!fsm_generate_matches(fsm, max_length, cmp_pcre_gen_cb, &env)) { + res = 0; + } + + pcre2_match_data_free(md); + pcre2_code_free(p); + return res; +} +#endif + +/* Note: combined_fsm and fsms[] are non-const because fsm_generate_matches + * calls fsm_trim on them. 
*/ +static int +compare_separate_and_combined(int verbosity, size_t max_length, size_t count, + struct fsm *combined_fsm, const struct fsm_combined_base_pair *bases, + struct fsm **fsms); + +static enum fsm_generate_matches_cb_res +cmp_separate_and_combined_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +static int +build_and_check_multi(const char *input) +{ + int res = EXIT_FAILURE; + const int verbosity = get_env_config(0, "VERBOSITY"); +#define MAX_PATTERNS 8 +#define MAX_PATTERN_LEN 256 + char patterns[MAX_PATTERNS][MAX_PATTERN_LEN] = { 0 }; + size_t count = 0; + const size_t len = strlen(input); + size_t max_length = get_env_config(DEF_MAX_LENGTH, "MAX_LENGTH"); + INIT_TIMERS(); + + /* if nonzero, apply a timeout to the combined FSM det/min below */ + const size_t timeout = get_env_config(0, "TIMEOUT"); + + if (timeout > 0) { + if (TRACK_TIMES == 0) { + fprintf(stderr, "\n\n\n\n\nError: src/libfsm/internal.h:TRACK_TIMES needs to be nonzero for this use case, exiting.\n\n\n\n\n"); + exit(EXIT_FAILURE); + } else { + static bool printed_timeout_msg; + if (!printed_timeout_msg) { + fprintf(stderr, "Using timeout of %zu msec for fsm_determinise/fsm_minimise on combined FSM.\n", + timeout); + printed_timeout_msg = true; + } + } + } + + size_t i, j; + for (i = 0, j = 0; i < len; i++) { + const char c = input[i]; + if (c == '\n' || c == '\r') { + if (j > max_length) { + max_length = j; + } + count++; + if (count == MAX_PATTERNS) { + /* ignore: too many patterns */ + return EXIT_SUCCESS; + } + j = 0; + } else { + patterns[count][j] = c; + j++; + if (j == MAX_PATTERN_LEN) { + /* ignore: pattern too long */ + return EXIT_SUCCESS; + } + } + } + if (j > 0) { count++; } + + if (count == 1) { + /* multi mode with only one pattern is pointless */ + return EXIT_SUCCESS; + } + + struct re_err err; + const enum re_flags flags = 0; + + /* build each regex, combining 
them and keeping track of capture offsets */ + struct fsm *fsms[count]; + struct fsm *fsms_cp[count]; + struct fsm_combined_base_pair bases[count]; + struct fsm *combined_fsm = NULL; + for (size_t i = 0; i < count; i++) { + fsms[i] = NULL; + fsms_cp[i] = NULL; + + bases[i].state = 0; + bases[i].capture = 0; + } + + /* compile each individually */ + /* FIXME: apply and check endids */ + for (size_t i = 0; i < count; i++) { + if (verbosity > 1) { + fprintf(stderr, "%s: compiling \"%s\"\n", + __func__, patterns[i]); + } + + struct scanner s = { + .str = (const uint8_t *)patterns[i], + .size = strlen(patterns[i]), + }; + + struct fsm *fsm = re_comp(RE_PCRE, scanner_next, &s, &fsm_options, flags, &err); + if (fsm == NULL) { + res = EXIT_SUCCESS; /* invalid regex, so skip this batch */ + goto cleanup; + } + + /* set endid to associate each FSM with its pattern */ + if (!fsm_setendid(fsm, (fsm_end_id_t)i)) { + goto cleanup; + } + + char label_buf[100]; + snprintf(label_buf, 100, "single_determisise_%zu", i); + + TIME(&pre); + if (!fsm_determinise(fsm)) { + goto cleanup; + } + TIME(&post); + DIFF_MSEC(label_buf, pre, post, NULL); + + snprintf(label_buf, 100, "single_minimise_%zu", i); + TIME(&pre); + if (!fsm_minimise(fsm)) { + goto cleanup; + } + TIME(&post); + DIFF_MSEC(label_buf, pre, post, NULL); + + if (verbosity > 4) { + char tag_buf[16] = { 0 }; + snprintf(tag_buf, sizeof(tag_buf), "fsm[%zu]", i); + + fprintf(stderr, "==== fsm[%zu]\n", i); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, tag_buf, fsm); + } + + fsms[i] = fsm; + fsms_cp[i] = fsm_clone(fsm); /* save a copy for comparison */ + } + + combined_fsm = fsm_union_array(count, fsms, bases); + assert(combined_fsm != NULL); + if (verbosity > 1) { + fprintf(stderr, "%s: combined_fsm: %d states after fsm_union_array\n", + __func__, fsm_countstates(combined_fsm)); + } + if (verbosity > 1) { + for (size_t i = 0; i < count; i++) { + fprintf(stderr, "%s: base[%zu]: state %d, capture %u\n", + __func__, i, 
bases[i].state, bases[i].capture); + } + } + + TIME(&pre); + if (!fsm_determinise(combined_fsm)) { + goto cleanup; + } + TIME(&post); + size_t timeout_accum = 0; + if (timeout != 0) { + if (verbosity > 1) { + DIFF_MSEC_ALWAYS("combined_determinise", pre, post, &timeout_accum); + } else { + DIFF_MSEC("combined_determinise", pre, post, &timeout_accum); + } + assert(timeout_accum < timeout); + timeout_accum = 0; + } + + const unsigned states_after_determinise = fsm_countstates(combined_fsm); + if (verbosity > 1) { + fprintf(stderr, "%s: combined_fsm: %d states after determinise\n", + __func__, states_after_determinise); + } + + TIME(&pre); + if (!fsm_minimise(combined_fsm)) { + goto cleanup; + } + TIME(&post); + if (timeout != 0) { + if (verbosity > 1) { + DIFF_MSEC_ALWAYS("combined_minimise", pre, post, &timeout_accum); + } else { + DIFF_MSEC("combined_minimise", pre, post, &timeout_accum); + } + assert(timeout_accum < timeout); + timeout_accum = 0; + } + + const unsigned states_after_minimise = fsm_countstates(combined_fsm); + if (verbosity > 1) { + fprintf(stderr, "%s: combined_fsm: %d states after minimise\n", + __func__, states_after_minimise); + } + + if (verbosity > 4) { + fprintf(stderr, "==== combined\n"); + fsm_print_fsm(stderr, combined_fsm); + fsm_capture_dump(stderr, "combined", combined_fsm); + } + + res = compare_separate_and_combined(verbosity, max_length, + count, combined_fsm, bases, (struct fsm **)fsms_cp); + + for (i = 0; i < count; i++) { + fsm_free(fsms_cp[i]); + } + fsm_free(combined_fsm); + + if (res == EXIT_SUCCESS) { + static size_t pass_count; + if (verbosity == 1) { + fprintf(stderr, "%s: pass: %zu, %zu patterns\n", + __func__, ++pass_count, count); + } else if (verbosity > 1) { + fprintf(stderr, "%s: pass: %zu, %zu patterns\n", + __func__, ++pass_count, count); + for (i = 0; i < count; i++) { + fprintf(stderr, " -- %zu: \"%s\"\n", + i, patterns[i]); + } + } + } + + return res; + +cleanup: + for (i = 0; i < count; i++) { + if (fsms[i] != 
NULL) { + fsm_free(fsms[i]); + } + if (fsms_cp[i] != NULL) { + fsm_free(fsms_cp[i]); + } + } + if (combined_fsm != NULL) { + fsm_free(combined_fsm); + } + return res; +} + +struct cmp_combined_env { + bool ok; + int verbosity; + size_t count; + struct fsm *combined_fsm; + const struct fsm_combined_base_pair *bases; + size_t current_i; + struct fsm **fsms; + size_t max_depth; + size_t max_steps; + size_t max_match_count; +}; + +static enum fsm_generate_matches_cb_res +cmp_combined_with_separate_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +static int +compare_separate_and_combined(int verbosity, size_t max_length, size_t count, + struct fsm *combined_fsm, const struct fsm_combined_base_pair *bases, + struct fsm **fsms) +{ + const size_t max_steps = get_env_config(DEF_MAX_STEPS, "MAX_STEPS"); + const size_t max_depth = get_env_config(DEF_MAX_DEPTH, "MAX_DEPTH"); + const size_t max_match_count = get_env_config(DEF_MAX_MATCH_COUNT, "MAX_MATCH_COUNT"); + + struct cmp_combined_env env = { + .ok = true, + .verbosity = verbosity, + .count = count, + .combined_fsm = combined_fsm, + .bases = bases, + .fsms = fsms, + .max_steps = max_steps, + .max_depth = max_depth, + .max_match_count = max_match_count, + }; + + /* For each individual fsm, generate matching inputs and check that + * they match with the same captures in the combined fsm. */ + for (env.current_i = 0; env.current_i < count; env.current_i++) { + if (!fsm_generate_matches(env.fsms[env.current_i], max_length, + cmp_separate_and_combined_cb, &env)) { + env.ok = false; + } + if (!env.ok) { + break; + } + } + env.current_i = (size_t)-1; + + /* Also go in the other direction, generating matches with + * combined and check the individual ones match as expected. 
*/ + if (env.ok) { + if (!fsm_generate_matches(env.combined_fsm, max_length, + cmp_combined_with_separate_cb, &env)) { + env.ok = false; + } + } + + return env.ok ? EXIT_SUCCESS : EXIT_FAILURE; +} + +static enum fsm_generate_matches_cb_res +cmp_separate_and_combined_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + struct cmp_combined_env *env = opaque; + (void)end_state; + + if (steps > env->max_steps) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (depth > env->max_depth) { + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + if (match_count > env->max_match_count) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + +#define MAX_CAPTURES 256 + struct fsm_capture captures_single[MAX_CAPTURES]; + struct fsm_capture captures_combined[MAX_CAPTURES]; + + const fsm_end_id_t expected_end_id = (fsm_end_id_t)env->current_i; + + const uint8_t *u8_input = (const uint8_t *)input; + fsm_state_t end_state_combined, end_state_single; + + const int res_combined = fsm_exec_with_captures(env->combined_fsm, u8_input, input_length, + &end_state_combined, captures_combined, MAX_CAPTURES); + const int res_single = fsm_exec_with_captures(fsm, u8_input, input_length, + &end_state_single, captures_single, MAX_CAPTURES); + + if (res_combined != res_single) { + env->ok = false; + if (env->verbosity > 0) { + fprintf(stderr, "%s: res_combined %d != res_single %d\n", + __func__, res_combined, res_single); + } + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + fsm_end_id_t id_buf_combined[MAX_PATTERNS]; + size_t written_combined = 0; + if (res_combined > 0) { + const size_t exp_written = fsm_getendidcount(env->combined_fsm, end_state_combined); + assert(exp_written <= env->count); + const enum fsm_getendids_res gres = fsm_getendids(env->combined_fsm, + end_state_combined, MAX_PATTERNS, id_buf_combined, &written_combined); + assert(gres == FSM_GETENDIDS_FOUND); + 
assert(written_combined == exp_written); + } + + /* we got here, so we have a match */ + assert(res_single > 0); + + if (env->verbosity > 3) { + fprintf(stderr, "%s: res %d (single and combined)\n", __func__, res_single); + } + + /* Check that the end state's endid for the single DFA is among the + * endids for the combined DFA's end state. */ + assert(fsm_getendidcount(fsm, end_state_single) == 1); + assert(fsm_getendidcount(env->combined_fsm, end_state_combined) <= env->count); + + fsm_end_id_t id_buf_single[1]; + size_t written; + const enum fsm_getendids_res gres = fsm_getendids(fsm, + end_state_single, 1, id_buf_single, &written); + assert(gres == FSM_GETENDIDS_FOUND); + assert(written == 1); + assert(id_buf_single[0] == expected_end_id); + + bool found_single_id_in_combined = false; + for (size_t i = 0; i < written_combined; i++) { + if (id_buf_combined[i] == expected_end_id) { + found_single_id_in_combined = true; + break; + } + } + assert(found_single_id_in_combined); + + bool matching = true; + const unsigned base = env->bases[env->current_i].capture; + assert(base < MAX_CAPTURES); + for (int i = 0; i < res_single; i++) { + if (env->verbosity > 3) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + if ((captures_single[i].pos[0] != captures_combined[i + base].pos[0]) || + (captures_single[i].pos[1] != captures_combined[i + base].pos[1])) { + matching = false; + } + } + + if (!matching) { + for (int i = 0; i < res_single; i++) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; +} + +static enum 
fsm_generate_matches_cb_res +cmp_combined_with_separate_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + /* We have an input that matched the combined DFA, + * use the set of end IDs to check which of the + * single DFAs it should/should not match, and check + * the endid behavior. */ + + struct cmp_combined_env *env = opaque; + + if (steps > env->max_steps) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (depth > env->max_depth) { + return FSM_GENERATE_MATCHES_CB_RES_PRUNE; + } + + if (match_count > env->max_match_count) { + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + +#define MAX_CAPTURES 256 + struct fsm_capture captures_single[MAX_CAPTURES]; + struct fsm_capture captures_combined[MAX_CAPTURES]; + + const uint8_t *u8_input = (const uint8_t *)input; + + fsm_state_t end_state_combined; + assert(fsm == env->combined_fsm); + const int res_combined = fsm_exec_with_captures(env->combined_fsm, u8_input, input_length, + &end_state_combined, captures_combined, MAX_CAPTURES); + assert(res_combined > 0); /* we got here, so we have a match */ + assert(end_state_combined == end_state); + + fsm_end_id_t id_buf_combined[MAX_PATTERNS]; + size_t written_combined = 0; + { + const size_t exp_written = fsm_getendidcount(env->combined_fsm, end_state_combined); + assert(exp_written <= env->count); + const enum fsm_getendids_res gres = fsm_getendids(env->combined_fsm, + end_state_combined, MAX_PATTERNS, id_buf_combined, &written_combined); + assert(gres == FSM_GETENDIDS_FOUND); + assert(written_combined == exp_written); + } + + /* For each pattern, check if its endid is in the combined DFA's end state + * endids. If so, it should match, otherwise it should not. 
*/ + for (size_t pattern_i = 0; pattern_i < env->count; pattern_i++) { + const struct fsm *single_fsm = env->fsms[pattern_i]; + bool found = false; + for (size_t endid_i = 0; endid_i < written_combined; endid_i++) { + const fsm_end_id_t endid = id_buf_combined[endid_i]; + if (endid == pattern_i) { + found = true; + break; + } + } + fsm_state_t end_state_single; + + const int res_single = fsm_exec_with_captures(single_fsm, + u8_input, input_length, + &end_state_single, captures_single, MAX_CAPTURES); + + if (found) { + assert(res_single > 0); + fsm_end_id_t id_buf_single[1]; + size_t written; + const enum fsm_getendids_res gres = fsm_getendids(single_fsm, + end_state_single, 1, id_buf_single, &written); + assert(gres == FSM_GETENDIDS_FOUND); + assert(written == 1); + assert(id_buf_single[0] == pattern_i); + + /* check captures */ + bool matching = true; + const unsigned base = env->bases[pattern_i].capture; + assert(base < MAX_CAPTURES); + for (int i = 0; i < res_single; i++) { + if (env->verbosity > 3) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + if ((captures_single[i].pos[0] != captures_combined[i + base].pos[0]) || + (captures_single[i].pos[1] != captures_combined[i + base].pos[1])) { + matching = false; + } + } + + if (!matching) { + for (int i = 0; i < res_single; i++) { + fprintf(stderr, "%d/%d: single [%ld, %ld] <-> combined [%ld, %ld]\n", + i, res_single, + captures_single[i].pos[0], captures_single[i].pos[1], + captures_combined[i + base].pos[0], captures_combined[i + base].pos[1]); + } + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + } else { + assert(res_single == 0); /* no match */ + } + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; +} + + #define DEF_MAX_SHUFFLE 10 #define DEF_MAX_MINIMISE_ORACLE_STATE_COUNT 1000 @@ -176,7 +1371,7 @@ 
shuffle_minimise(const char *pattern) .offset = 0 }; - fsm = re_comp(RE_PCRE, scanner_next, &s, &opt, RE_MULTI, &err); + fsm = re_comp(RE_PCRE, scanner_next, &s, &fsm_options, RE_MULTI, &err); if (fsm == NULL) { /* ignore invalid regexp syntax, etc. */ @@ -335,23 +1530,108 @@ fuzz_all_print_functions(FILE *f, const char *pattern, bool det, bool min, const return EXIT_SUCCESS; } -#define MAX_FUZZER_DATA (64 * 1024) -static uint8_t data_buf[MAX_FUZZER_DATA + 1]; +static int +build_and_test_idempotent_det_and_min(const char *pattern) +{ + const int verbosity = get_env_config(0, "VERBOSITY"); + assert(pattern != NULL); + + struct re_err err; + struct fsm *fsm; + const size_t length = strlen(pattern); + + struct scanner s = { + .str = (const uint8_t *)pattern, + .size = length, + }; + + fsm = re_comp(RE_PCRE, scanner_next, &s, &fsm_options, RE_MULTI, &err); + if (fsm == NULL) { + return EXIT_SUCCESS; + } + + if (!fsm_determinise(fsm)) { + return EXIT_FAILURE; + } + if (verbosity >= 3) { + fprintf(stderr, "=== post_det_a\n"); + fsm_print_fsm(stderr, fsm); + } + const size_t post_det_a = fsm_countstates(fsm); + + if (!fsm_determinise(fsm)) { + return EXIT_FAILURE; + } + if (verbosity >= 3) { + fprintf(stderr, "=== post_det_b\n"); + fsm_print_fsm(stderr, fsm); + } + const size_t post_det_b = fsm_countstates(fsm); + assert(post_det_b == post_det_a); + + if (!fsm_minimise(fsm)) { + return EXIT_FAILURE; + } + if (verbosity >= 3) { + fprintf(stderr, "=== post_min_a\n"); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "post_a", fsm); + } + const size_t post_min_a = fsm_countstates(fsm); + + if (!fsm_minimise(fsm)) { + return EXIT_FAILURE; + } + if (verbosity >= 3) { + fprintf(stderr, "=== post_min_b\n"); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "post_b", fsm); + } + const size_t post_min_b = fsm_countstates(fsm); + assert(post_min_b == post_min_a); + + if (!fsm_determinise(fsm)) { + return EXIT_FAILURE; + } + const size_t post_det_c = 
fsm_countstates(fsm); + assert(post_det_c == post_min_b); + + if (!fsm_minimise(fsm)) { + return EXIT_FAILURE; + } + const size_t post_min_c = fsm_countstates(fsm); + assert(post_min_c == post_det_c); + + fsm_free(fsm); + return EXIT_SUCCESS; +} static enum run_mode get_run_mode(void) { const char *mode = getenv("MODE"); if (mode == NULL) { - return MODE_DEFAULT; + return MODE_REGEX; /* default */ } switch (mode[0]) { - case 'm': return MODE_SHUFFLE_MINIMISE; + case '\0': return MODE_REGEX; /* default */ + case 'r': return MODE_REGEX; + case 's': return MODE_REGEX_SINGLE_ONLY; + case 'm': return MODE_REGEX_MULTI_ONLY; + case 'i': return MODE_IDEMPOTENT_DET_MIN; + case 'M': return MODE_SHUFFLE_MINIMISE; case 'p': return MODE_ALL_PRINT_FUNCTIONS; - case 'd': default: - return MODE_DEFAULT; + fprintf(stderr, "Unrecognized mode '%c', expect one of:\n", mode[0]); + fprintf(stderr, " - r.egex (default)\n"); + fprintf(stderr, " - s.ingle regex only\n"); + fprintf(stderr, " - m.ulti regex only\n"); + fprintf(stderr, " - M.inimisation shuffling\n"); + fprintf(stderr, " - i.dempotent determinise/minimise\n"); + fprintf(stderr, " - p.rint functions\n"); + exit(EXIT_FAILURE); + break; } } @@ -369,12 +1649,87 @@ harness_fuzzer_target(const uint8_t *data, size_t size) size = MAX_FUZZER_DATA; } memcpy(data_buf, data, size); + /* ensure the buffer is 0-terminated */ + data_buf[size] = 0; + + /* truncate to a valid c string */ + size = strlen((const char *)data_buf); + data_buf[size] = 0; + + /* reset for each run */ + allocator_stats.hwm = 0; + + size_t dot_count = 0; + bool has_newline = false; + size_t first_newline; + + for (size_t i = 0; i < size; i++) { + const uint8_t c = data_buf[i]; + if (c == '.') { + dot_count++; + if (dot_count >= 4) { + /* Too many '.'s can lead to a regex that is + * very slow to determinise/minimise, but that + * failure mode is not interesting to this + * particular fuzzer. 
*/ + return EXIT_SUCCESS; + } + } + + if (c == '(') { + /* This triggers an "unreached" assertion in the parser. + * It's already been reported (issue #386), but once the + * fuzzer finds it, it will report it over and over. + * Exit here so that the fuzzer considers it uninteresting. */ + if (size - i >= 3 && 0 == memcmp("(*:", &data_buf[i], 3)) { + return EXIT_SUCCESS; + } + } + + if (c == '\\') { + /* Not supported yet. */ + return EXIT_SUCCESS; + } + + if (c == '\r' || c == '\n') { + if (!has_newline) { + first_newline = i; + } + has_newline = true; + } + } const char *pattern = (const char *)data_buf; switch (get_run_mode()) { - case MODE_DEFAULT: - return build_and_codegen(pattern); + case MODE_REGEX: + if (has_newline) { + return build_and_check_multi(pattern); + } else { + return build_and_check_single(pattern); + } + + case MODE_REGEX_SINGLE_ONLY: + if (has_newline) { + return EXIT_SUCCESS; /* ignore */ + } else { + return build_and_check_single(pattern); + } + + case MODE_REGEX_MULTI_ONLY: + if (has_newline) { + return build_and_check_multi(pattern); + } else { + return EXIT_SUCCESS; /* ignore */ + } + + case MODE_IDEMPOTENT_DET_MIN: + if (has_newline) { + assert(data_buf[first_newline] == '\n' + || data_buf[first_newline] == '\r'); + data_buf[first_newline] = '\0'; + } + return build_and_test_idempotent_det_and_min(pattern); case MODE_SHUFFLE_MINIMISE: return shuffle_minimise(pattern); @@ -390,11 +1745,14 @@ harness_fuzzer_target(const uint8_t *data, size_t size) const bool det = b0 & 0x1; const bool min = b0 & 0x2; const enum fsm_io io_mode = (b0 >> 2) % 3; - + const char *shifted_pattern = (const char *)&data_buf[1]; int res = fuzz_all_print_functions(dev_null, shifted_pattern, det, min, io_mode); return res; } + +default: + assert(!"match fail"); } assert(!"unreached"); diff --git a/include/adt/hashrec.h b/include/adt/hashrec.h index 54816286e..545a20960 100644 --- a/include/adt/hashrec.h +++ b/include/adt/hashrec.h @@ -7,7 +7,7 @@ #ifndef 
ADT_HASHREC_H #define ADT_HASHREC_H -unsigned long +uint64_t hashrec(const void *p, size_t n); #endif diff --git a/include/adt/idmap.h b/include/adt/idmap.h new file mode 100644 index 000000000..504fd382b --- /dev/null +++ b/include/adt/idmap.h @@ -0,0 +1,59 @@ +#ifndef IDMAP_H +#define IDMAP_H + +/* Mapping between one fsm_state_t and a set of + * unsigned IDs. The implementation assumes that both + * IDs are sequentially assigned and don't need a sparse + * mapping -- it will handle 10 -> [1, 3, 47] well, but + * not 1000000 -> [14, 524288, 1073741823]. */ + +#include + +#include "fsm/fsm.h" +#include "fsm/alloc.h" + +struct idmap; /* Opaque handle. */ + +struct idmap * +idmap_new(const struct fsm_alloc *alloc); + +void +idmap_free(struct idmap *m); + +/* Associate a value with a state (if not already present.) + * Returns 1 on success, or 0 on allocation failure. */ +int +idmap_set(struct idmap *m, fsm_state_t state_id, unsigned value); + +/* How many values are associated with an ID? */ +size_t +idmap_get_value_count(const struct idmap *m, fsm_state_t state_id); + +/* Get the values associated with an ID. + * + * Returns 1 on success and writes them into the buffer, in ascending + * order, with the count in *written (if non-NULL). + * + * Returns 0 on error (insufficient buffer space). */ +int +idmap_get(const struct idmap *m, fsm_state_t state_id, + size_t buf_size, unsigned *buf, size_t *written); + +/* Iterator callback. + * Return status indicates whether to continue. */ +typedef int +idmap_iter_fun(fsm_state_t state_id, unsigned value, void *opaque); + +/* Iterate over the ID map. State IDs may be yielded out of order, + * values will be in ascending order. */ +void +idmap_iter(const struct idmap *m, + idmap_iter_fun *cb, void *opaque); + +/* Iterate over the values associated with a single state + * (in ascending order). 
*/ +void +idmap_iter_for_state(const struct idmap *m, fsm_state_t state_id, + idmap_iter_fun *cb, void *opaque); + +#endif diff --git a/include/adt/mappingset.h b/include/adt/mappingset.h deleted file mode 100644 index d4f91105e..000000000 --- a/include/adt/mappingset.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2019 Shannon F. Stewman - * - * See LICENCE for the full copyright terms. - */ - -#ifndef ADT_MAPPINGSET_H -#define ADT_MAPPINGSET_H - -struct fsm_alloc; -struct mapping_set; -struct mapping; - -struct mapping_iter { - struct hashset_iter iter; -}; - -struct mapping_set * -mapping_set_create(const struct fsm_alloc *a, - unsigned long (*hash)(const struct mapping *a), - int (*cmp)(const void *a, const void *b)); - -void -mapping_set_free(struct mapping_set *set); - -struct mapping * -mapping_set_add(struct mapping_set *set, struct mapping *item); - -struct mapping * -mapping_set_contains(const struct mapping_set *set, const struct mapping *item); - -void -mapping_set_clear(struct mapping_set *set); - -struct mapping * -mapping_set_first(const struct mapping_set *set, struct mapping_iter *it); - -struct mapping * -mapping_set_next(struct mapping_iter *it); - -#endif - diff --git a/include/adt/stateset.h b/include/adt/stateset.h index 83e835467..becd263ad 100644 --- a/include/adt/stateset.h +++ b/include/adt/stateset.h @@ -7,6 +7,8 @@ #ifndef ADT_STATESET_H #define ADT_STATESET_H +#include + struct set; struct fsm_alloc; struct state_set; @@ -72,7 +74,7 @@ state_set_rebase(struct state_set **set, fsm_state_t base); void state_set_replace(struct state_set **set, fsm_state_t old, fsm_state_t new); -unsigned long +uint64_t state_set_hash(const struct state_set *set); #endif diff --git a/include/fsm/capture.h b/include/fsm/capture.h index d3c1aaa54..0fd418925 100644 --- a/include/fsm/capture.h +++ b/include/fsm/capture.h @@ -23,41 +23,32 @@ struct fsm_capture { size_t pos[2]; }; -/* How many captures does the FSM use? 
*/ +/* What is the max capture ID an FSM uses? */ unsigned -fsm_countcaptures(const struct fsm *fsm); +fsm_capture_ceiling(const struct fsm *fsm); /* Does a specific state have any capture actions? */ int fsm_capture_has_capture_actions(const struct fsm *fsm, fsm_state_t state); -/* Set a capture path on an FSM. This means that during matching, the - * portion of a match between the path's START and END states will be - * captured. As the FSM is transformed (determinisation, minimisation, - * unioning, etc.), the path will be converted to refer to the pair(s) - * of new states instead. If the path's END state is no longer reachable - * from its START state, then the capture path will be ignored. - * Multiple instances of the same capture_id and path are ignored. */ -int -fsm_capture_set_path(struct fsm *fsm, unsigned capture_id, - fsm_state_t start, fsm_state_t end); - -/* Increase the base capture ID for all captures in an fsm. - * This could be used before combining multiple FSMs -- for - * example, before unioning a and b, where a has 3 captures - * and b has 2, b may be rebase'd to 3 -- so a has captures - * 0-2 and b has 3-4. */ -void -fsm_capture_rebase_capture_id(struct fsm *fsm, unsigned base); +/* Allocate a capture buffer with enough space for + * the current FSM's captures. + * + * This is provided for convenience -- the necessary array + * count can be checked with fsm_capture_ceiling, and then + * the buffer can be allocated directly. */ +struct fsm_capture * +fsm_capture_alloc_capture_buffer(const struct fsm *fsm); -/* Same, but for capture action states. */ +/* Free a capture buffer. */ void -fsm_capture_rebase_capture_action_states(struct fsm *fsm, fsm_state_t base); +fsm_capture_free_capture_buffer(const struct fsm *fsm, struct fsm_capture *capture_buffer); -/* Allocate a capture buffer with enough space for - * the current FSM's captures. 
*/ -struct fsm_capture * -fsm_capture_alloc(const struct fsm *fsm); +/* Note that a capture is active for a particular end state. + * Using this for a non-end state is an unchecked error. */ +int +fsm_capture_set_active_for_end(struct fsm *fsm, + unsigned capture_id, fsm_state_t end_state); #ifndef NDEBUG #include diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index 7c3883749..b269a05db 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -192,6 +192,10 @@ fsm_findmode(const struct fsm *fsm, fsm_state_t state, unsigned int *freq); void fsm_setend(struct fsm *fsm, fsm_state_t state, int end); +/* Associate a numeric ID with a single end state in an FSM. */ +int +fsm_setendid_state(struct fsm *fsm, fsm_state_t s, fsm_end_id_t id); + /* Associate a numeric ID with the end states in an fsm. * This can be used to track which of the original fsms matched * input when multiple fsms are combined. @@ -462,8 +466,21 @@ fsm_shortest(const struct fsm *fsm, * The given FSM is expected to be a DFA. */ int -fsm_exec(const struct fsm *fsm, int (*fsm_getc)(void *opaque), void *opaque, - fsm_state_t *end, struct fsm_capture *captures); +fsm_exec(const struct fsm *fsm, + int (*fsm_getc)(void *opaque), void *opaque, fsm_state_t *end); + +/* Same as fsm_exec, but also populate information about captures if + * *captures is non-NULL and capture metadata is available for the DFA. + * Captures is expected to be large enough to fit captures from the FSM. + * To check, use `fsm_capture_ceiling`. + * + * The current implementation requires all input to be buffered ahead of + * time, so this takes a pointer to an input array rather than a + * character iterator. */ +int +fsm_exec_with_captures(const struct fsm *fsm, const unsigned char *input, + size_t input_length, fsm_state_t *end, + struct fsm_capture *captures, size_t capture_buf_length); /* * Callbacks which may be passed to fsm_exec(). 
These are conveniences for diff --git a/include/re/re.h b/include/re/re.h index deab6caed..ab5f09b39 100644 --- a/include/re/re.h +++ b/include/re/re.h @@ -20,16 +20,18 @@ enum re_dialect { }; enum re_flags { - RE_ICASE = 1 << 0, - RE_TEXT = 1 << 1, - RE_MULTI = 1 << 2, - RE_REVERSE = 1 << 3, - RE_SINGLE = 1 << 4, /* aka PCRE_DOTALL */ - RE_ZONE = 1 << 5, - RE_ANCHORED = 1 << 6, - RE_EXTENDED = 1 << 7, /* PCRE extended mode */ - RE_END_NL = 1 << 8, /* end anchor matches '\n' */ - RE_FLAGS_NONE = 0 + RE_ICASE = 1 << 0, + RE_TEXT = 1 << 1, + RE_MULTI = 1 << 2, + RE_REVERSE = 1 << 3, + RE_SINGLE = 1 << 4, /* aka PCRE_DOTALL */ + RE_ZONE = 1 << 5, + RE_ANCHORED = 1 << 6, + RE_EXTENDED = 1 << 7, /* PCRE extended mode */ + RE_NOCAPTURE = 1 << 8, /* disable captures */ + RE_END_NL = 1 << 9, /* end anchor matches '\n' */ + RE_END_NL_DISABLE = 1 << 10, /* disable end anchor matching '\n' */ + RE_FLAGS_NONE = 0 }; #define RE_ANCHOR (RE_TEXT | RE_MULTI | RE_ZONE) @@ -46,6 +48,8 @@ enum re_errno { RE_EERRNO = 1 | RE_MISC, RE_EBADDIALECT = 2 | RE_MISC, RE_EBADGROUP = 3 | RE_MISC, + RE_EUNSUPCAPTUR = 4 | RE_MISC, + RE_EUNSUPPPCRE = 5 | RE_MISC, RE_ENEGRANGE = 0 | RE_MARK | RE_GROUP, RE_ENEGCOUNT = 1 | RE_MARK | RE_GROUP, diff --git a/src/adt/Makefile b/src/adt/Makefile index 05199f2dc..6fae4e7ca 100644 --- a/src/adt/Makefile +++ b/src/adt/Makefile @@ -2,6 +2,7 @@ SRC += src/adt/alloc.c SRC += src/adt/bitmap.c +SRC += src/adt/idmap.c SRC += src/adt/internedstateset.c SRC += src/adt/priq.c SRC += src/adt/path.c @@ -19,12 +20,10 @@ CFLAGS.${src} += -I src # XXX: for internal.h DFLAGS.${src} += -I src # XXX: for internal.h .endfor -# not all concrete set interfaces use all static functions from set.inc -.if ${CC:T:Mgcc*} || ${CC:T:Mclang*} -.for src in ${SRC:Msrc/adt/stateset.c} ${SRC:Msrc/adt/tupleset.c} ${SRC:Msrc/adt/edgeset.c} -CFLAGS.${src} += -Wno-unused-function +.for src in ${SRC:Msrc/adt/siphash.c} ${SRC:Msrc/adt/edgeset.c} ${SRC:Msrc/adt/idmap.c} ${SRC:Msrc/adt/ipriq.c} 
${SRC:Msrc/adt/internedstateset.c} +CFLAGS.${src} += -std=c99 # XXX: for internal.h +DFLAGS.${src} += -std=c99 # XXX: for internal.h .endfor -.endif # I want to assert on things which are currently true for this platform, # but not true in general. diff --git a/src/adt/edgeset.c b/src/adt/edgeset.c index c718727ca..9658213c8 100644 --- a/src/adt/edgeset.c +++ b/src/adt/edgeset.c @@ -11,6 +11,7 @@ #include #define LOG_BITSET 0 +#define LOG_BSEARCH 0 #include "libfsm/internal.h" /* XXX: for allocating struct fsm_edge, and the edges array */ @@ -184,6 +185,100 @@ edge_set_advise_growth(struct edge_set **pset, const struct fsm_alloc *alloc, return 1; } +enum fsp_res { + FSP_FOUND_INSERT_POSITION, + FSP_FOUND_VALUE_PRESENT, +}; + +/* Use binary search to find the first position N where set->groups[N].to >= state, + * which includes the position immediately following the last entry. Return an enum + * which indicates whether state is already present. */ +static enum fsp_res +find_state_position(const struct edge_set *set, fsm_state_t state, size_t *dst) +{ + size_t lo = 0, hi = set->count; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: looking for %d in %p (count %zu)\n", + __func__, state, (void *)set, set->count); + } + +#if EXPENSIVE_CHECKS + /* invariant: input is unique and sorted */ + for (size_t i = 1; i < set->count; i++) { + assert(set->groups[i - 1].to < set->groups[i].to); + } +#endif + + if (set->count == 0) { + if (LOG_BSEARCH) { + fprintf(stderr, "%s: empty, returning 0\n", __func__); + } + *dst = 0; + return FSP_FOUND_INSERT_POSITION; + } else { + if (LOG_BSEARCH) { + fprintf(stderr, "%s: fast path: looking for %d, set->groups[last].to %d\n", + __func__, state, set->groups[hi - 1].to); + } + + /* Check the last entry so we can append in constant time. 
*/ + const fsm_state_t last = set->groups[hi - 1].to; + if (state > last) { + *dst = hi; + return FSP_FOUND_INSERT_POSITION; + } else if (state == last) { + *dst = hi - 1; + return FSP_FOUND_VALUE_PRESENT; + } + } + + size_t mid; + while (lo < hi) { /* lo <= mid < hi */ + mid = lo + (hi - lo)/2; /* avoid overflow */ + const struct edge_group *eg = &set->groups[mid]; + const fsm_state_t cur = eg->to; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: lo %zu, hi %zu, mid %zu, cur %d, looking for %d\n", + __func__, lo, hi, mid, cur, state); + } + + if (state == cur) { + *dst = mid; + return FSP_FOUND_VALUE_PRESENT; + } else if (state > cur) { + lo = mid + 1; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: new lo %zd\n", __func__, lo); + } + + /* Update mid if we're about to halt, because we're looking + * for the first position >= state, not the last position <=. */ + if (lo == hi) { + mid = lo; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: special case, updating mid to %zd\n", __func__, mid); + } + } + } else if (state < cur) { + hi = mid; + if (LOG_BSEARCH) { + fprintf(stderr, "%s: new hi %zd\n", __func__, hi); + } + } + } + + if (LOG_BSEARCH) { + fprintf(stderr, "%s: halting at %zd (looking for %d, cur %d)\n", + __func__, mid, state, set->groups[mid].to); + } + + /* dst is now the first position > state (== case is handled above), + * which may be one past the end of the array. 
*/ + assert(mid == set->count || set->groups[mid].to > state); + *dst = mid; + return FSP_FOUND_INSERT_POSITION; +} + int edge_set_add_bulk(struct edge_set **pset, const struct fsm_alloc *alloc, uint64_t symbols[256/64], fsm_state_t state) @@ -223,30 +318,24 @@ edge_set_add_bulk(struct edge_set **pset, const struct fsm_alloc *alloc, assert(set->count <= set->ceil); #if LOG_BITSET - fprintf(stderr, " -- edge_set_add: symbols [0x%lx, 0x%lx, 0x%lx, 0x%lx] -> state %d on %p\n", - symbols[0], symbols[1], symbols[2], symbols[3], - state, (void *)set); + fprintf(stderr, " -- edge_set_add: symbols [0x%lx, 0x%lx, 0x%lx, 0x%lx] -> state %d on %p\n", + symbols[0], symbols[1], symbols[2], symbols[3], + state, (void *)set); #endif - /* Linear search for a group with the same destination - * state, or the position where that group would go. */ - for (i = 0; i < set->count; i++) { + switch (find_state_position(set, state, &i)) { + case FSP_FOUND_VALUE_PRESENT: + assert(i < set->count); eg = &set->groups[i]; - - if (eg->to == state) { - /* This API does not indicate whether that - * symbol -> to edge was already present. 
*/ - size_t i; - for (i = 0; i < 256/64; i++) { - eg->symbols[i] |= symbols[i]; - } - dump_edge_set(set); - return 1; - } else if (eg->to > state) { - break; /* will shift down and insert below */ - } else { - continue; + for (i = 0; i < 256/64; i++) { + eg->symbols[i] |= symbols[i]; } + dump_edge_set(set); + return 1; + + break; + case FSP_FOUND_INSERT_POSITION: + break; /* continue below */ } /* insert/append at i */ diff --git a/src/adt/hashrec.c b/src/adt/hashrec.c index 6a341710b..7348cbc38 100644 --- a/src/adt/hashrec.c +++ b/src/adt/hashrec.c @@ -23,7 +23,7 @@ static const unsigned char hashk[] = { 0x14, 0xa8, 0xff, 0x36, 0x15, 0x16, 0x2c, 0xf7, 0xf4, 0xce, 0xb8, 0x66, 0x74, 0xf4, 0x3d, 0x64, }; -unsigned long +uint64_t hashrec(const void *p, size_t n) { uint64_t h = 0; diff --git a/src/adt/idmap.c b/src/adt/idmap.c new file mode 100644 index 000000000..d1a265861 --- /dev/null +++ b/src/adt/idmap.c @@ -0,0 +1,396 @@ +/* + * Copyright 2021 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#include "adt/idmap.h" + +#include "adt/alloc.h" +#include "adt/hash.h" +#include "adt/u64bitset.h" + +#include +#include +#include + +#define NO_STATE ((fsm_state_t)-1) + +#define DEF_BUCKET_COUNT 4 + +struct idmap { + const struct fsm_alloc *alloc; + unsigned bucket_count; + unsigned buckets_used; + + /* All buckets' values are assumed to be large + * enough to store this value, and they will all + * grow as necessary. */ + unsigned max_value; + + /* Basic linear-probing, add-only hash table. */ + struct idmap_bucket { + fsm_state_t state; /* Key. NO_STATE when empty. */ + + /* values[] is always either NULL or has at least + * max_value + 1 bits; all grow on demand. */ + uint64_t *values; + } *buckets; +}; + +static unsigned +value_words(unsigned max_value) { + if (max_value == 0) { + /* Still allocate one word, for storing 0. 
*/ + return 1; + } else { + return u64bitset_words(max_value); + } +} + +struct idmap * +idmap_new(const struct fsm_alloc *alloc) +{ + struct idmap *res = NULL; + struct idmap_bucket *buckets = NULL; + + res = f_malloc(alloc, sizeof(*res)); + if (res == NULL) { + goto cleanup; + } + + buckets = f_calloc(alloc, + DEF_BUCKET_COUNT, sizeof(buckets[0])); + if (buckets == NULL) { + goto cleanup; + } + + for (size_t i = 0; i < DEF_BUCKET_COUNT; i++) { + buckets[i].state = NO_STATE; + } + + res->alloc = alloc; + res->buckets_used = 0; + res->bucket_count = DEF_BUCKET_COUNT; + res->max_value = 0; + res->buckets = buckets; + + return res; + +cleanup: + f_free(alloc, res); + f_free(alloc, buckets); + return NULL; +} + +void +idmap_free(struct idmap *m) +{ + if (m == NULL) { + return; + } + + for (size_t i = 0; i < m->bucket_count; i++) { + if (m->buckets[i].state == NO_STATE) { + continue; + } + f_free(m->alloc, m->buckets[i].values); + } + + f_free(m->alloc, m->buckets); + f_free(m->alloc, m); +} + +static int +grow_bucket_values(struct idmap *m, unsigned old_words, unsigned new_words) +{ + assert(new_words > old_words); + + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + struct idmap_bucket *b = &m->buckets[b_i]; + if (b->state == NO_STATE) { + assert(b->values == NULL); + continue; + } + + uint64_t *nv = f_calloc(m->alloc, + new_words, sizeof(nv[0])); + if (nv == NULL) { + return 0; + } + + for (size_t w_i = 0; w_i < old_words; w_i++) { + nv[w_i] = b->values[w_i]; + } + f_free(m->alloc, b->values); + b->values = nv; + } + return 1; +} + +static int +grow_buckets(struct idmap *m) +{ + const size_t ocount = m->bucket_count; + const size_t ncount = 2*ocount; + assert(ncount > m->bucket_count); + + struct idmap_bucket *nbuckets = f_calloc(m->alloc, + ncount, sizeof(nbuckets[0])); + if (nbuckets == NULL) { + return 0; + } + for (size_t nb_i = 0; nb_i < ncount; nb_i++) { + nbuckets[nb_i].state = NO_STATE; + } + + const size_t nmask = ncount - 1; + + for (size_t ob_i = 0; 
ob_i < ocount; ob_i++) { + const struct idmap_bucket *ob = &m->buckets[ob_i]; + if (ob->state == NO_STATE) { + continue; + } + + const uint64_t h = hash_id(ob->state); + for (size_t nb_i = 0; nb_i < ncount; nb_i++) { + struct idmap_bucket *nb = &nbuckets[(h + nb_i) & nmask]; + if (nb->state == NO_STATE) { + nb->state = ob->state; + nb->values = ob->values; + break; + } else { + assert(nb->state != ob->state); + /* collision */ + continue; + } + } + } + + f_free(m->alloc, m->buckets); + + m->buckets = nbuckets; + m->bucket_count = ncount; + + return 1; +} + +int +idmap_set(struct idmap *m, fsm_state_t state_id, + unsigned value) +{ + assert(state_id != NO_STATE); + + const uint64_t h = hash_id(state_id); + if (value > m->max_value) { + const unsigned ovw = value_words(m->max_value); + const unsigned nvw = value_words(value); + /* If this value won't fit in the existing value + * arrays, then grow them all. We do not track the + * number of bits in each individual array. */ + if (nvw > ovw && !grow_bucket_values(m, ovw, nvw)) { + return 0; + } + m->max_value = value; + } + + assert(m->max_value >= value); + + if (m->buckets_used >= m->bucket_count/2) { + if (!grow_buckets(m)) { + return 0; + } + } + + const uint64_t mask = m->bucket_count - 1; + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; + if (b->state == state_id) { + assert(b->values != NULL); + u64bitset_set(b->values, value); + return 1; + } else if (b->state == NO_STATE) { + b->state = state_id; + assert(b->values == NULL); + + const unsigned vw = value_words(m->max_value); + b->values = f_calloc(m->alloc, + vw, sizeof(b->values[0])); + if (b->values == NULL) { + return 0; + } + m->buckets_used++; + + u64bitset_set(b->values, value); + return 1; + } else { + continue; /* collision */ + } + + } + + assert(!"unreachable"); + return 0; +} + +static const struct idmap_bucket * +get_bucket(const struct idmap *m, fsm_state_t state_id) +{ + const 
uint64_t h = hash_id(state_id); + const uint64_t mask = m->bucket_count - 1; + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + const struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; + if (b->state == NO_STATE) { + return NULL; + } else if (b->state == state_id) { + return b; + } + } + + return NULL; +} + +size_t +idmap_get_value_count(const struct idmap *m, fsm_state_t state_id) +{ + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + return 0; + } + assert(b->values != NULL); + + size_t res = 0; + const size_t words = value_words(m->max_value); + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + /* This could use popcount64(w). */ + if (w == 0) { + continue; + } + for (uint64_t bit = 1; bit; bit <<= 1) { + if (w & bit) { + res++; + } + } + } + + return res; +} + +int +idmap_get(const struct idmap *m, fsm_state_t state_id, + size_t buf_size, unsigned *buf, size_t *written) +{ + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + if (written != NULL) { + *written = 0; + } + return 1; + } + + size_t buf_offset = 0; + const size_t words = value_words(m->max_value); + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + + for (uint64_t b_i = 0; b_i < 64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + if (buf_offset * sizeof(buf[0]) >= buf_size) { + return 0; + } + buf[buf_offset] = 64*w_i + b_i; + buf_offset++; + } + } + } + + if (written != NULL) { + *written = buf_offset; + } + return 1; +} + +void +idmap_iter(const struct idmap *m, + idmap_iter_fun *cb, void *opaque) +{ + const size_t words = value_words(m->max_value); + + for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + const struct idmap_bucket *b = &m->buckets[b_i]; + if (b->state == NO_STATE) { + continue; + } + + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + for (uint64_t b_i = 0; b_i < 
64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + const unsigned v = 64*w_i + b_i; + if (!cb(b->state, v, opaque)) { + return; + } + } + } + } + } +} + +void +idmap_iter_for_state(const struct idmap *m, fsm_state_t state_id, + idmap_iter_fun *cb, void *opaque) +{ + const size_t words = value_words(m->max_value); + const struct idmap_bucket *b = get_bucket(m, state_id); + if (b == NULL) { + return; + } + + for (size_t w_i = 0; w_i < words; w_i++) { + const uint64_t w = b->values[w_i]; + if (w == 0) { + continue; + } + /* if N contiguous bits are all zero, skip them all at once */ +#define BLOCK_BITS 16 + uint64_t block = ((uint64_t)1 << BLOCK_BITS) - 1; + size_t block_count = 0; + + uint64_t b_i = 0; + while (b_i < 64) { + if ((w & block) == 0) { + block <<= BLOCK_BITS; + b_i += BLOCK_BITS; + continue; + } + + if (w & ((uint64_t)1 << b_i)) { + const unsigned v = 64*w_i + b_i; + if (!cb(b->state, v, opaque)) { + return; + } + block_count++; + } + b_i++; + block <<= 1; + } + +#define CHECK 0 +#if CHECK + size_t check_count = 0; + for (uint64_t b_i = 0; b_i < 64; b_i++) { + if (w & ((uint64_t)1 << b_i)) { + check_count++; + } + } + assert(block_count == check_count); +#endif + } +} diff --git a/src/adt/stateset.c b/src/adt/stateset.c index c1cff9933..fa3d0c54a 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -15,6 +15,11 @@ #include #include +/* This is used here because the calls to + * state_set_contains change the order of growth. */ +#include + + /* * TODO: now fsm_state_t is a numeric index, this could be a dynamically * allocated bitmap, instead of a set.inc's array of items. @@ -44,8 +49,8 @@ struct state_set { const struct fsm_alloc *alloc; fsm_state_t *a; - size_t i; - size_t n; + size_t i; /* used */ + size_t n; /* ceil */ }; int @@ -138,7 +143,8 @@ state_set_cmp(const struct state_set *a, const struct state_set *b) } /* - * Return where an item would be, if it were inserted + * Return where an item would be, if it were inserted. 
+ * When insertion would append this returns one past the array. */ static size_t state_set_search(const struct state_set *set, fsm_state_t state) @@ -150,6 +156,11 @@ state_set_search(const struct state_set *set, fsm_state_t state) assert(!IS_SINGLETON(set)); assert(set->a != NULL); + /* fast path: append case */ + if (set->i > 0 && state > set->a[set->i - 1]) { + return set->i; + } + start = mid = 0; end = set->i; @@ -161,6 +172,12 @@ state_set_search(const struct state_set *set, fsm_state_t state) end = mid; } else if (r > 0) { start = mid + 1; + /* update mid if we're about to halt, because + * we're looking for the first position >= state, + * not the last position <= */ + if (start == end) { + mid = start; + } } else { return mid; } @@ -242,7 +259,7 @@ state_set_add(struct state_set **setp, const struct fsm_alloc *alloc, */ if (!state_set_empty(set)) { i = state_set_search(set, state); - if (set->a[i] == state) { + if (i < set->i && set->a[i] == state) { return 1; } } @@ -261,11 +278,7 @@ state_set_add(struct state_set **setp, const struct fsm_alloc *alloc, set->n *= 2; } - if (state_set_cmpval(state, set->a[i]) > 0) { - i++; - } - - if (i <= set->i) { + if (i < set->i) { memmove(&set->a[i + 1], &set->a[i], (set->i - i) * (sizeof *set->a)); } @@ -276,6 +289,8 @@ state_set_add(struct state_set **setp, const struct fsm_alloc *alloc, set->i = 1; } + /* This assert can be pretty expensive in -O0 but in -O3 it has very + * little impact on the overall runtime. 
*/ assert(state_set_contains(set, state)); return 1; @@ -470,7 +485,7 @@ state_set_remove(struct state_set **setp, fsm_state_t state) } i = state_set_search(set, state); - if (set->a[i] == state) { + if (i < set->i && set->a[i] == state) { if (i < set->i) { memmove(&set->a[i], &set->a[i + 1], (set->i - i - 1) * (sizeof *set->a)); } @@ -478,7 +493,9 @@ state_set_remove(struct state_set **setp, fsm_state_t state) set->i--; } +#if EXPENSIVE_CHECKS assert(!state_set_contains(set, state)); +#endif } int @@ -524,7 +541,7 @@ state_set_contains(const struct state_set *set, fsm_state_t state) } i = state_set_search(set, state); - if (set->a[i] == state) { + if (i < set->i && set->a[i] == state) { return 1; } @@ -659,7 +676,7 @@ state_set_replace(struct state_set **setp, fsm_state_t old, fsm_state_t new) } } -unsigned long +uint64_t state_set_hash(const struct state_set *set) { if (set == NULL) { diff --git a/src/fsm/main.c b/src/fsm/main.c index 8e668b779..192ab7de3 100644 --- a/src/fsm/main.c +++ b/src/fsm/main.c @@ -599,7 +599,7 @@ main(int argc, char *argv[]) size_t n; struct state_iter it; - closures = epsilon_closure(fsm); + closures = fsm_epsilon_closure(fsm); if (closures == NULL) { return -1; } @@ -620,7 +620,7 @@ main(int argc, char *argv[]) printf("\n"); } - closure_free(closures, fsm->statecount); + fsm_closure_free(closures, fsm->statecount); return 0; } else { @@ -647,7 +647,7 @@ main(int argc, char *argv[]) f = xopen(argv[0]); - e = fsm_exec(fsm, fsm_fgetc, f, &state, NULL); + e = fsm_exec(fsm, fsm_fgetc, f, &state); fclose(f); } else { @@ -655,7 +655,7 @@ main(int argc, char *argv[]) s = argv[i]; - e = fsm_exec(fsm, fsm_sgetc, &s, &state, NULL); + e = fsm_exec(fsm, fsm_sgetc, &s, &state); } if (e != 1) { diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile index 9af51a5a4..bfa8e67db 100644 --- a/src/libfsm/Makefile +++ b/src/libfsm/Makefile @@ -1,6 +1,8 @@ .include "../../share/mk/top.mk" SRC += src/libfsm/capture.c +SRC += src/libfsm/capture_vm.c +SRC += 
src/libfsm/capture_vm_exec.c SRC += src/libfsm/collate.c SRC += src/libfsm/complete.c SRC += src/libfsm/consolidate.c diff --git a/src/libfsm/capture.c b/src/libfsm/capture.c index 806bb3b12..21f32d06e 100644 --- a/src/libfsm/capture.c +++ b/src/libfsm/capture.c @@ -6,31 +6,82 @@ #include -#include "capture_internal.h" +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "capture.h" +#include "capture_vm_program.h" +#include "capture_log.h" +#include "capture_vm.h" +#include "endids.h" + +#define DEF_PROGRAMS_CEIL 4 + +struct fsm_capture_info { + unsigned max_capture_id; + + /* For particular end states, which captures are active? */ + struct idmap *end_capture_map; + + /* Set of capture resolution programs associated with specific + * end states. */ + struct capvm_program_set { + uint32_t ceil; + uint32_t used; + struct capvm_program **set; + } programs; + + /* For particular end states, which capture programs are + * associtaed with them? 
*/ + struct idmap *end_capvm_program_map; +}; int fsm_capture_init(struct fsm *fsm) { struct fsm_capture_info *ci = NULL; - size_t i; + struct idmap *end_capture_map = NULL; + struct idmap *end_capvm_program_map = NULL; ci = f_calloc(fsm->opt->alloc, 1, sizeof(*ci)); if (ci == NULL) { goto cleanup; } - fsm->capture_info = ci; + end_capture_map = idmap_new(fsm->opt->alloc); + if (end_capture_map == NULL) { + goto cleanup; + } + ci->end_capture_map = end_capture_map; - for (i = 0; i < fsm->statealloc; i++) { - fsm->states[i].has_capture_actions = 0; + end_capvm_program_map = idmap_new(fsm->opt->alloc); + if (end_capvm_program_map == NULL) { + goto cleanup; } + ci->end_capvm_program_map = end_capvm_program_map; + + fsm->capture_info = ci; return 1; cleanup: - if (ci != NULL) { - f_free(fsm->opt->alloc, ci); - } + f_free(fsm->opt->alloc, ci); + idmap_free(end_capture_map); + idmap_free(end_capvm_program_map); return 0; } @@ -41,799 +92,570 @@ fsm_capture_free(struct fsm *fsm) if (ci == NULL) { return; } - f_free(fsm->opt->alloc, ci->buckets); + + idmap_free(ci->end_capture_map); + idmap_free(ci->end_capvm_program_map); + + for (size_t p_i = 0; p_i < ci->programs.used; p_i++) { + fsm_capvm_program_free(fsm->opt->alloc, ci->programs.set[p_i]); + } + f_free(fsm->opt->alloc, ci->programs.set); + f_free(fsm->opt->alloc, ci); fsm->capture_info = NULL; } unsigned -fsm_countcaptures(const struct fsm *fsm) +fsm_capture_ceiling(const struct fsm *fsm) { - (void)fsm; if (fsm->capture_info == NULL) { return 0; } - if (fsm->capture_info->buckets_used == 0) { - return 0; - } - /* check actual */ #if EXPENSIVE_CHECKS - { - struct fsm_capture_info *ci = fsm->capture_info; - size_t i; - for (i = 0; i < ci->bucket_count; i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[i]; - if (b->state == CAPTURE_NO_STATE) { /* empty */ - continue; - } - assert(ci->max_capture_id >= b->action.id); + /* check actual */ + unsigned res = 0; + for (size_t i = 0; i < 
fsm->capture_info->programs.used; i++) { + const unsigned id = fsm_capvm_program_get_max_capture_id(fsm->capture_info->programs.set[i]); + if (id > res) { + res = id; } } + assert(res == fsm->capture_info->max_capture_id); #endif return fsm->capture_info->max_capture_id + 1; } +struct fsm_capture * +fsm_capture_alloc_capture_buffer(const struct fsm *fsm) +{ + assert(fsm != NULL); + const size_t len = fsm_capture_ceiling(fsm); + struct fsm_capture *res = f_malloc(fsm->opt->alloc, + len * sizeof(res[0])); + return res; +} + +void +fsm_capture_free_capture_buffer(const struct fsm *fsm, + struct fsm_capture *capture_buffer) +{ + assert(fsm != NULL); + f_free(fsm->opt->alloc, capture_buffer); +} + + int fsm_capture_has_captures(const struct fsm *fsm) { return fsm->capture_info - ? fsm->capture_info->buckets_used > 0 + ? fsm->capture_info->programs.used > 0 : 0; } -int -fsm_capture_has_capture_actions(const struct fsm *fsm, fsm_state_t state) +void +fsm_capture_dump_programs(FILE *f, const struct fsm *fsm) { - assert(state < fsm->statecount); - return fsm->states[state].has_capture_actions; + fprintf(f, "\n==== %s:\n", __func__); + struct fsm_capture_info *ci = fsm->capture_info; + for (uint32_t i = 0; i < ci->programs.used; i++) { + const struct capvm_program *p = ci->programs.set[i]; + fprintf(f, "# program %u, capture_count %u, base %u\n", + i, p->capture_count, p->capture_base); + fsm_capvm_program_dump(f, p); + fprintf(f, "\n"); + } } int -fsm_capture_set_path(struct fsm *fsm, unsigned capture_id, - fsm_state_t start, fsm_state_t end) +fsm_capture_set_active_for_end(struct fsm *fsm, + unsigned capture_id, fsm_state_t end_state) { - struct fsm_capture_info *ci; - struct capture_set_path_env env; - size_t seen_words; - int res = 0; - - assert(fsm != NULL); - assert(start < fsm->statecount); - assert(end < fsm->statecount); - - ci = fsm->capture_info; + struct fsm_capture_info *ci = fsm->capture_info; assert(ci != NULL); + struct idmap *m = ci->end_capture_map; + 
assert(m != NULL); - /* captures should no longer be stored as paths -- instead, set - * the info on the states _here_, and convert it as necessary. */ - -#if LOG_CAPTURE > 0 - fprintf(stderr, "fsm_capture_set_path: capture %u: <%u, %u>\n", - capture_id, start, end); -#endif - - if (capture_id > FSM_CAPTURE_MAX) { - return 0; /* ID out of range */ - } - - if (!init_capture_action_htab(fsm, ci)) { - return 0; - } - - /* This will create a trail and do a depth-first search from the - * start state, marking every unique path to the end state. */ - env.fsm = fsm; - env.capture_id = capture_id; - env.start = start; - env.end = end; - - env.trail_ceil = 0; - env.trail = NULL; - env.seen = NULL; + #if EXPENSIVE_CHECKS + assert(fsm_isend(fsm, end_state)); + #endif - env.trail = f_malloc(fsm->opt->alloc, - DEF_TRAIL_CEIL * sizeof(env.trail[0])); - if (env.trail == NULL) { - goto cleanup; - } - env.trail_ceil = DEF_TRAIL_CEIL; - - seen_words = fsm->statecount/64 + 1; - env.seen = f_malloc(fsm->opt->alloc, - seen_words * sizeof(env.seen[0])); + return idmap_set(m, end_state, capture_id); +} - if (!mark_capture_path(&env)) { - goto cleanup; - } +void +fsm_capture_iter_active_for_end_state(const struct fsm *fsm, fsm_state_t state, + fsm_capture_iter_active_for_end_cb *cb, void *opaque) +{ + /* These types should be the same. */ + idmap_iter_fun *idmap_cb = cb; + idmap_iter_for_state(fsm->capture_info->end_capture_map, state, + idmap_cb, opaque); +} - if (capture_id >= ci->max_capture_id) { - ci->max_capture_id = capture_id; - } +void +fsm_capture_iter_active_for_all_end_states(const struct fsm *fsm, + fsm_capture_iter_active_for_end_cb *cb, void *opaque) +{ + /* These types should be the same. 
*/ + idmap_iter_fun *idmap_cb = cb; + idmap_iter(fsm->capture_info->end_capture_map, + idmap_cb, opaque); +} - res = 1; - /* fall through */ +void +fsm_capture_iter_program_ids_for_end_state(const struct fsm *fsm, fsm_state_t state, + fsm_capture_iter_program_ids_for_end_state_cb *cb, void *opaque) +{ + /* These types should be the same. */ + idmap_iter_fun *idmap_cb = cb; + idmap_iter_for_state(fsm->capture_info->end_capvm_program_map, state, + idmap_cb, opaque); +} -cleanup: - f_free(fsm->opt->alloc, env.trail); - f_free(fsm->opt->alloc, env.seen); - return res; +void +fsm_capture_iter_program_ids_for_all_end_states(const struct fsm *fsm, + fsm_capture_iter_program_ids_for_end_state_cb *cb, void *opaque) +{ + /* These types should be the same. */ + idmap_iter_fun *idmap_cb = cb; + idmap_iter(fsm->capture_info->end_capvm_program_map, + idmap_cb, opaque); } static int -init_capture_action_htab(struct fsm *fsm, struct fsm_capture_info *ci) +dump_active_for_ends_cb(fsm_state_t state_id, unsigned value, void *opaque) { - size_t count, i; - assert(fsm != NULL); - assert(ci != NULL); + FILE *f = opaque; + fprintf(f, " -- state %d: value %u\n", state_id, value); + return 1; +} - if (ci->bucket_count > 0) { - assert(ci->buckets != NULL); - return 1; /* done */ - } +void +fsm_capture_dump_active_for_ends(FILE *f, const struct fsm *fsm) +{ + fprintf(f, "%s:\n", __func__); + idmap_iter(fsm->capture_info->end_capture_map, dump_active_for_ends_cb, f); +} - assert(ci->buckets == NULL); - assert(ci->buckets_used == 0); +void +fsm_capture_dump_program_end_mapping(FILE *f, const struct fsm *fsm) +{ + fprintf(f, "%s:\n", __func__); + idmap_iter(fsm->capture_info->end_capvm_program_map, dump_active_for_ends_cb, f); +} - count = DEF_CAPTURE_ACTION_BUCKET_COUNT; - ci->buckets = f_malloc(fsm->opt->alloc, - count * sizeof(ci->buckets[0])); - if (ci->buckets == NULL) { - return 0; - } +/* Dump capture metadata about an FSM. 
*/ +void +fsm_capture_dump(FILE *f, const char *tag, const struct fsm *fsm) +{ + struct fsm_capture_info *ci; - /* Init buckets to CAPTURE_NO_STATE -> empty. */ - for (i = 0; i < count; i++) { - ci->buckets[i].state = CAPTURE_NO_STATE; + assert(fsm != NULL); + ci = fsm->capture_info; + if (ci == NULL) { + fprintf(f, "==== %s -- no captures\n", tag); + return; } - ci->bucket_count = count; - return 1; + fsm_endid_dump(f, fsm); + fsm_capture_dump_active_for_ends(f, fsm); + fsm_capture_dump_programs(f, fsm); + fsm_capture_dump_program_end_mapping(f, fsm); } +struct carry_active_captures_env { + fsm_state_t dst; + struct idmap *dst_m; + int ok; +}; + static int -mark_capture_path(struct capture_set_path_env *env) +copy_active_captures_cb(fsm_state_t state_id, unsigned value, void *opaque) { - const size_t seen_words = env->fsm->statecount/64 + 1; - -#if LOG_CAPTURE > 0 - fprintf(stderr, "mark_capture_path: path [id %u, %u - %u]\n", - env->capture_id, env->start, env->end); -#endif + (void)state_id; - if (env->start == env->end) { - struct fsm_capture_action action; - action.type = CAPTURE_ACTION_COMMIT_ZERO_STEP; - action.id = env->capture_id; - action.to = CAPTURE_NO_STATE; - if (!add_capture_action(env->fsm, env->fsm->capture_info, - env->start, &action)) { - return 0; - } - return 1; - } - - memset(env->seen, 0x00, - seen_words * sizeof(env->seen[0])); - - /* initialize to starting node */ - env->trail_i = 1; - env->trail[0].state = env->start; - env->trail[0].step = TRAIL_STEP_START; - env->trail[0].has_self_edge = 0; - - while (env->trail_i > 0) { - const enum trail_step step = env->trail[env->trail_i - 1].step; -#if LOG_CAPTURE > 0 - fprintf(stderr, "mark_capture_path: trail %u/%u, cur %u, step %d\n", - env->trail_i, env->trail_ceil, - env->trail[env->trail_i - 1].state, - step); -#endif - - switch (step) { - case TRAIL_STEP_START: - if (!step_trail_start(env)) { - return 0; - } - break; - case TRAIL_STEP_ITER_EDGES: - if (!step_trail_iter_edges(env)) { - return 
0; - } - break; - case TRAIL_STEP_ITER_EPSILONS: - if (!step_trail_iter_epsilons(env)) { - return 0; - } - break; - case TRAIL_STEP_DONE: - if (!step_trail_done(env)) { - return 0; - } - break; - default: - assert(!"match fail"); - } + struct carry_active_captures_env *env = opaque; + if (!idmap_set(env->dst_m, env->dst, value)) { + env->ok = false; + return 0; } - return 1; } static int -cmp_action(const struct fsm_capture_action *a, - const struct fsm_capture_action *b) { - /* could use memcmp here, provided padding is always zeroed. */ - return a->id < b->id ? -1 - : a->id > b->id ? 1 - : a->type < b->type ? -1 - : a->type > b->type ? 1 - : a->to < b->to ? -1 - : a->to > b->to ? 1 - : 0; -} - -int -fsm_capture_add_action(struct fsm *fsm, - fsm_state_t state, enum capture_action_type type, - unsigned id, fsm_state_t to) +copy_program_associations_cb(fsm_state_t state_id, unsigned value, void *opaque) { - struct fsm_capture_action action; - assert(fsm->capture_info != NULL); + (void)state_id; - action.type = type; - action.id = id; - action.to = to; - return add_capture_action(fsm, fsm->capture_info, - state, &action); + struct carry_active_captures_env *env = opaque; + if (!idmap_set(env->dst_m, env->dst, value)) { + env->ok = false; + return 0; + } + return 1; } -static int -add_capture_action(struct fsm *fsm, struct fsm_capture_info *ci, - fsm_state_t state, const struct fsm_capture_action *action) +int +fsm_capture_copy_active_for_ends(const struct fsm *src_fsm, + const struct state_set *states, + struct fsm *dst_fsm, fsm_state_t dst_state) { - uint64_t h; - size_t b_i, mask; - - assert(state < fsm->statecount); - assert(action->to == CAPTURE_NO_STATE || action->to < fsm->statecount); - -#if LOG_CAPTURE > 0 - fprintf(stderr, "add_capture_action: state %u, type %s, ID %u, TO %d\n", - state, fsm_capture_action_type_name[action->type], - action->id, action->to); -#endif - - if (ci->bucket_count == 0) { - if (!init_capture_action_htab(fsm, ci)) { - return 0; - } - 
} else if (ci->buckets_used >= ci->bucket_count/2) { /* grow */ - if (!grow_capture_action_buckets(fsm->opt->alloc, ci)) { - return 0; + struct state_iter it; + fsm_state_t s; + + assert(src_fsm != NULL); + assert(src_fsm->capture_info != NULL); + assert(src_fsm->capture_info->end_capture_map != NULL); + assert(dst_fsm != NULL); + assert(dst_fsm->capture_info != NULL); + assert(dst_fsm->capture_info->end_capture_map != NULL); + struct idmap *src_m = src_fsm->capture_info->end_capture_map; + struct idmap *dst_m = dst_fsm->capture_info->end_capture_map; + + struct carry_active_captures_env env = { + .dst_m = dst_m, + .dst = dst_state, + .ok = true, + }; + + state_set_reset(states, &it); + while (state_set_next(&it, &s)) { + if (!fsm_isend(src_fsm, s)) { + continue; } - } - h = hash_id(state); - mask = ci->bucket_count - 1; - - for (b_i = 0; b_i < ci->bucket_count; b_i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[(h + b_i) & mask]; - if (b->state == CAPTURE_NO_STATE) { /* empty */ - b->state = state; - memcpy(&b->action, action, sizeof(*action)); - ci->buckets_used++; - fsm->states[state].has_capture_actions = 1; - if (action->id > ci->max_capture_id) { - ci->max_capture_id = action->id; - } - return 1; - } else if (b->state == state && - 0 == cmp_action(action, &b->action)) { - /* already present, ignore duplicate */ - assert(fsm->states[state].has_capture_actions); - assert(ci->max_capture_id >= action->id); - return 1; - } else { - continue; /* skip past collision */ + idmap_iter_for_state(src_m, s, copy_active_captures_cb, &env); + if (!env.ok) { + goto cleanup; } } - assert(!"unreachable"); - return 0; +cleanup: + return env.ok; } -static int -grow_capture_action_buckets(const struct fsm_alloc *alloc, - struct fsm_capture_info *ci) +int +fsm_capture_copy_program_end_state_associations(const struct fsm *src_fsm, + const struct state_set *states, + struct fsm *dst_fsm, fsm_state_t dst_state) { - const size_t ncount = 2 * ci->bucket_count; - struct 
fsm_capture_action_bucket *nbuckets; - size_t nused = 0; - size_t i; + struct state_iter it; + fsm_state_t s; + + assert(src_fsm != NULL); + assert(src_fsm->capture_info != NULL); + assert(src_fsm->capture_info->end_capvm_program_map != NULL); + assert(dst_fsm != NULL); + assert(dst_fsm->capture_info != NULL); + assert(dst_fsm->capture_info->end_capvm_program_map != NULL); + struct idmap *src_m = src_fsm->capture_info->end_capvm_program_map; + struct idmap *dst_m = dst_fsm->capture_info->end_capvm_program_map; + + struct carry_active_captures_env env = { + .dst_m = dst_m, + .dst = dst_state, + .ok = true, + }; + + state_set_reset(states, &it); + while (state_set_next(&it, &s)) { + if (!fsm_isend(src_fsm, s)) { + continue; + } - assert(ncount != 0); - nbuckets = f_malloc(alloc, ncount * sizeof(nbuckets[0])); - if (nbuckets == NULL) { - return 0; - } + LOG(5 - LOG_CAPTURE_COMBINING_ANALYSIS, + "%s: dst_state %d, state_set_next => %d\n", + __func__, dst_state, s); - for (i = 0; i < ncount; i++) { - nbuckets[i].state = CAPTURE_NO_STATE; + idmap_iter_for_state(src_m, s, copy_program_associations_cb, &env); + if (!env.ok) { + goto cleanup; + } } - for (i = 0; i < ci->bucket_count; i++) { - const struct fsm_capture_action_bucket *src_b = &ci->buckets[i]; - uint64_t h; - const size_t mask = ncount - 1; - size_t b_i; +cleanup: + return env.ok; +} - if (src_b->state == CAPTURE_NO_STATE) { - continue; +int +fsm_capture_copy_programs(const struct fsm *src_fsm, + struct fsm *dst_fsm) +{ + const struct fsm_alloc *alloc = src_fsm->opt->alloc; + assert(alloc == dst_fsm->opt->alloc); + const struct fsm_capture_info *src_ci = src_fsm->capture_info; + + for (uint32_t p_i = 0; p_i < src_ci->programs.used; p_i++) { + const struct capvm_program *p = src_ci->programs.set[p_i]; + struct capvm_program *cp = capvm_program_copy(alloc, p); + if (cp == NULL) { + return 0; } - h = hash_id(src_b->state); - for (b_i = 0; b_i < ncount; b_i++) { - struct fsm_capture_action_bucket *dst_b; - dst_b = 
&nbuckets[(h + b_i) & mask]; - if (dst_b->state == CAPTURE_NO_STATE) { - memcpy(dst_b, src_b, sizeof(*src_b)); - nused++; - break; - } else { - continue; - } + /* unused: because this is an in-order copy, it's assumed + * the programs will retain their order. */ + uint32_t prog_id; + if (!fsm_capture_add_program(dst_fsm, cp, &prog_id)) { + return 0; } } - - assert(nused == ci->buckets_used); - f_free(alloc, ci->buckets); - ci->buckets = nbuckets; - ci->bucket_count = ncount; return 1; } -static int -grow_trail(struct capture_set_path_env *env) +size_t +fsm_capture_program_count(const struct fsm *fsm) { - struct trail_cell *ntrail; - unsigned nceil; - assert(env != NULL); - - nceil = 2 * env->trail_ceil; - assert(nceil > env->trail_ceil); + return fsm->capture_info->programs.used; +} - ntrail = f_realloc(env->fsm->opt->alloc, env->trail, - nceil * sizeof(env->trail[0])); - if (ntrail == NULL) { - return 0; - } +struct check_program_mappings_env { + const struct fsm *fsm; +}; - env->trail = ntrail; - env->trail_ceil = nceil; +static int +check_program_mappings_cb(fsm_state_t state_id, unsigned value, void *opaque) +{ + const uint32_t prog_id = (uint32_t)value; + struct check_program_mappings_env *env = opaque; + assert(state_id < env->fsm->statecount); + assert(prog_id < env->fsm->capture_info->programs.used); return 1; } -static int -step_trail_start(struct capture_set_path_env *env) -{ - struct trail_cell *tc = &env->trail[env->trail_i - 1]; - const fsm_state_t cur = tc->state; - size_t i; - struct edge_set *edge_set = NULL; - - /* check if node is endpoint, if so mark trail, - * then pop trail and continue */ - if (cur == env->end) { - struct fsm_capture_action action; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- GOT END at %u\n", cur); -#endif - action.id = env->capture_id; +void +fsm_capture_integrity_check(const struct fsm *fsm) +{ + if (!EXPENSIVE_CHECKS) { return; } - for (i = 0; i < env->trail_i; i++) { - fsm_state_t state = env->trail[i].state; -#if 
LOG_CAPTURE > 0 - fprintf(stderr, " -- %lu: %d\n", - i, state); -#endif + /* check that all program mappings are in range */ + struct check_program_mappings_env env = { + .fsm = fsm, + }; + idmap_iter(fsm->capture_info->end_capvm_program_map, check_program_mappings_cb, &env); +} - /* Special case: if this is marked as having - * a self-edge on the path, then also add an - * extend for that. */ - if (env->trail[i].has_self_edge) { - struct fsm_capture_action self_action; - self_action.type = CAPTURE_ACTION_EXTEND; - self_action.id = env->capture_id; - self_action.to = state; - - if (!add_capture_action(env->fsm, - env->fsm->capture_info, - state, &self_action)) { - return 0; - } - } - - - if (i == 0) { - action.type = CAPTURE_ACTION_START; - } else { - action.type = (i < env->trail_i - 1 - ? CAPTURE_ACTION_EXTEND - : CAPTURE_ACTION_COMMIT); - } - - if (i < env->trail_i - 1) { - action.to = env->trail[i + 1].state; - } else { - action.to = CAPTURE_NO_STATE; - } - - if (!add_capture_action(env->fsm, - env->fsm->capture_info, - state, &action)) { - return 0; - } - } +struct capture_idmap_compact_env { + int ok; + struct idmap *dst; + const fsm_state_t *mapping; + size_t orig_statecount; +}; - tc->step = TRAIL_STEP_DONE; - return 1; - } +static int +copy_with_mapping_cb(fsm_state_t state_id, unsigned value, void *opaque) +{ + fsm_state_t dst_id; + struct capture_idmap_compact_env *env = opaque; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- resetting edge iterator\n"); -#endif - edge_set = env->fsm->states[cur].edges; + assert(state_id < env->orig_statecount); + dst_id = env->mapping[state_id]; - MARK_SEEN(env, cur); -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- marking %u as seen\n", cur); -#endif + if (dst_id == FSM_STATE_REMAP_NO_STATE) { + return 1; /* discard */ + } + + if (!idmap_set(env->dst, dst_id, value)) { + env->ok = 0; + return 0; + } - edge_set_reset(edge_set, &tc->iter); - tc->step = TRAIL_STEP_ITER_EDGES; return 1; } -static int -step_trail_iter_edges(struct 
capture_set_path_env *env) +int +fsm_capture_id_compact(struct fsm *fsm, const fsm_state_t *mapping, + size_t orig_statecount) { - struct trail_cell *tc = &env->trail[env->trail_i - 1]; - struct trail_cell *next_tc = NULL; - - struct fsm_edge e; + struct capture_idmap_compact_env env; + struct idmap *old_idmap = fsm->capture_info->end_capture_map; + struct idmap *new_idmap = idmap_new(fsm->opt->alloc); - if (!edge_set_next(&tc->iter, &e)) { -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- ITER_EDGE_NEXT: DONE %u\n", tc->state); -#endif - tc->step = TRAIL_STEP_ITER_EPSILONS; - return 1; + if (new_idmap == NULL) { + return 0; } -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- ITER_EDGE_NEXT: %u -- NEXT %u\n", - tc->state, e.state); -#endif + env.ok = 1; + env.dst = new_idmap; + env.mapping = mapping; + env.orig_statecount = orig_statecount; - if (tc->state == e.state) { -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- special case, self-edge\n"); -#endif - /* Mark this state as having a self-edge, then continue - * the iterator. An EXTEND action will be added for the - * self-edge later, if necessary. 
*/ - tc->has_self_edge = 1; - return 1; - } else if (CHECK_SEEN(env, e.state)) { -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- seen, skipping\n"); -#endif - return 1; /* continue */ - } - - if (env->trail_i == env->trail_ceil) { - if (!grow_trail(env)) { - return 0; - } + idmap_iter(old_idmap, copy_with_mapping_cb, &env); + if (!env.ok) { + idmap_free(new_idmap); + return 0; } -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- marking %u as seen\n", e.state); -#endif - MARK_SEEN(env, e.state); + idmap_free(old_idmap); + fsm->capture_info->end_capture_map = new_idmap; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- not seen (%u), exploring\n", e.state); -#endif - env->trail_i++; - next_tc = &env->trail[env->trail_i - 1]; - next_tc->state = e.state; - next_tc->step = TRAIL_STEP_START; - next_tc->has_self_edge = 0; return 1; } -static int -step_trail_iter_epsilons(struct capture_set_path_env *env) +int +fsm_capture_program_association_compact(struct fsm *fsm, const fsm_state_t *mapping, + size_t orig_statecount) { - struct trail_cell *tc = &env->trail[env->trail_i - 1]; + struct capture_idmap_compact_env env; + struct idmap *old_idmap = fsm->capture_info->end_capvm_program_map; + struct idmap *new_idmap = idmap_new(fsm->opt->alloc); - /* skipping this for now */ - -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- ITER_EPSILONS: %u\n", tc->state); -#endif - - tc->step = TRAIL_STEP_DONE; - return 1; -} + if (new_idmap == NULL) { + return 0; + } -static int -step_trail_done(struct capture_set_path_env *env) -{ - struct trail_cell *tc; + env.ok = 1; + env.dst = new_idmap; + env.mapping = mapping; + env.orig_statecount = orig_statecount; - /* 0-step paths already handled outside loop */ - assert(env->trail_i > 0); + idmap_iter(old_idmap, copy_with_mapping_cb, &env); + if (!env.ok) { + idmap_free(new_idmap); + return 0; + } - tc = &env->trail[env->trail_i - 1]; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- DONE: %u\n", tc->state); -#endif - CLEAR_SEEN(env, tc->state); + idmap_free(old_idmap); + 
fsm->capture_info->end_capvm_program_map = new_idmap; - env->trail_i--; return 1; } void -fsm_capture_rebase_capture_id(struct fsm *fsm, unsigned base) +fsm_capture_update_max_capture_id(struct fsm_capture_info *ci, + unsigned capture_id) { - size_t i; - struct fsm_capture_info *ci = fsm->capture_info; assert(ci != NULL); - - for (i = 0; i < ci->bucket_count; i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[i]; - if (b->state == CAPTURE_NO_STATE) { - continue; - } - - b->action.id += base; - if (b->action.id > ci->max_capture_id) { - ci->max_capture_id = b->action.id; - } + if (capture_id >= ci->max_capture_id) { + ci->max_capture_id = capture_id; } } -void -fsm_capture_rebase_capture_action_states(struct fsm *fsm, fsm_state_t base) +int +fsm_capture_add_program(struct fsm *fsm, + struct capvm_program *program, uint32_t *prog_id) { - size_t i; + assert(program != NULL); + assert(prog_id != NULL); + struct fsm_capture_info *ci = fsm->capture_info; - assert(ci != NULL); - for (i = 0; i < ci->bucket_count; i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[i]; - if (b->state == CAPTURE_NO_STATE) { - continue; + if (ci->programs.used == ci->programs.ceil) { + const size_t nceil = (ci->programs.ceil == 0 + ? 
DEF_PROGRAMS_CEIL + : 2*ci->programs.ceil); + assert(nceil > ci->programs.ceil); + struct capvm_program **nset = f_realloc(fsm->opt->alloc, + ci->programs.set, nceil * sizeof(nset[0])); + if (nset == NULL) { + return 0; } - b->state += base; - if (b->action.to != CAPTURE_NO_STATE) { - b->action.to += base; - } + ci->programs.ceil = nceil; + ci->programs.set = nset; } -} + assert(ci->programs.used < ci->programs.ceil); -struct fsm_capture * -fsm_capture_alloc(const struct fsm *fsm) -{ - (void)fsm; - assert(!"todo"); - return NULL; -} - -void -fsm_capture_update_captures(const struct fsm *fsm, - fsm_state_t cur_state, fsm_state_t next_state, size_t offset, - struct fsm_capture *captures) -{ - const struct fsm_capture_info *ci; - uint64_t h; - size_t b_i, mask; - - assert(cur_state < fsm->statecount); - assert(fsm->states[cur_state].has_capture_actions); - - ci = fsm->capture_info; - assert(ci != NULL); - - h = hash_id(cur_state); - mask = ci->bucket_count - 1; - -#if LOG_CAPTURE > 0 - fprintf(stderr, "-- updating captures at state %u, to %d, offset %lu\n", - cur_state, next_state, offset); -#endif - - for (b_i = 0; b_i < ci->bucket_count; b_i++) { - const size_t b_id = (h + b_i) & mask; - struct fsm_capture_action_bucket *b = &ci->buckets[b_id]; - unsigned capture_id; - -#if LOG_CAPTURE > 3 - fprintf(stderr, " -- update_captures: bucket %lu, state %d\n", b_id, b->state); -#endif - - - if (b->state == CAPTURE_NO_STATE) { -#if LOG_CAPTURE > 3 - fprintf(stderr, " -- no more actions for this state\n"); -#endif - break; /* no more for this state */ - } else if (b->state != cur_state) { - continue; /* skip collision */ - } - - assert(b->state == cur_state); - capture_id = b->action.id; - - switch (b->action.type) { - case CAPTURE_ACTION_START: -#if LOG_CAPTURE > 0 - fprintf(stderr, "START [%u, %u]\n", - b->action.id, b->action.to); -#endif - if (next_state == b->action.to && captures[capture_id].pos[0] == FSM_CAPTURE_NO_POS) { - captures[capture_id].pos[0] = offset; -#if 
LOG_CAPTURE > 0 - fprintf(stderr, " -- set capture[%u].[0] to %lu\n", b->action.id, offset); -#endif - } else { - /* filtered, ignore */ - } - break; - case CAPTURE_ACTION_EXTEND: -#if LOG_CAPTURE > 0 - fprintf(stderr, "EXTEND [%u, %u]\n", - b->action.id, b->action.to); -#endif - if (captures[capture_id].pos[0] != FSM_CAPTURE_NO_POS - && (0 == (captures[capture_id].pos[1] & COMMITTED_CAPTURE_FLAG))) { - if (next_state == b->action.to) { - captures[capture_id].pos[1] = offset; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- set capture[%u].[1] to %lu\n", b->action.id, offset); -#endif - } else { - /* filtered, ignore */ - } - } - break; - case CAPTURE_ACTION_COMMIT_ZERO_STEP: -#if LOG_CAPTURE > 0 - fprintf(stderr, "COMMIT_ZERO_STEP [%u]\n", - b->action.id); -#endif - - if (captures[capture_id].pos[0] == FSM_CAPTURE_NO_POS) { - captures[capture_id].pos[0] = offset; - captures[capture_id].pos[1] = offset | COMMITTED_CAPTURE_FLAG; - } else { /* extend */ - captures[capture_id].pos[1] = offset | COMMITTED_CAPTURE_FLAG; - } - -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- set capture[%u].[0] and [1] to %lu (with COMMIT flag)\n", b->action.id, offset); -#endif - break; - case CAPTURE_ACTION_COMMIT: -#if LOG_CAPTURE > 0 - fprintf(stderr, "COMMIT [%u]\n", - b->action.id); -#endif - captures[capture_id].pos[1] = offset | COMMITTED_CAPTURE_FLAG; -#if LOG_CAPTURE > 0 - fprintf(stderr, " -- set capture[%u].[1] to %lu (with COMMIT flag)\n", b->action.id, offset); -#endif - break; - default: - assert(!"matchfail"); - } + const unsigned max_prog_capture_id = fsm_capvm_program_get_max_capture_id(program); + if (max_prog_capture_id > ci->max_capture_id) { + fsm_capture_update_max_capture_id(ci, max_prog_capture_id); } -} -void -fsm_capture_finalize_captures(const struct fsm *fsm, - size_t capture_count, struct fsm_capture *captures) -{ - size_t i; - - /* If either pos[] is FSM_CAPTURE_NO_POS or the - * COMMITTED_CAPTURE_FLAG isn't set on pos[1], then the capture - * wasn't finalized; clear 
it. Otherwise, clear that bit so the - * pos[1] offset is meaningful. */ - - /* FIXME: this should also take the end state(s) associated - * with a capture into account, when that information is available; - * otherwise there will be false positives for zero-width captures - * where the paths have a common prefix. */ - (void)fsm; - - for (i = 0; i < capture_count; i++) { -#if LOG_CAPTURE > 1 - fprintf(stderr, "finalize[%lu]: pos[0]: %ld, pos[1]: %ld\n", - i, captures[i].pos[0], captures[i].pos[1]); -#endif + *prog_id = ci->programs.used; + ci->programs.set[ci->programs.used] = program; + ci->programs.used++; + return 1; +} - if (captures[i].pos[0] == FSM_CAPTURE_NO_POS - || captures[i].pos[1] == FSM_CAPTURE_NO_POS - || (0 == (captures[i].pos[1] & COMMITTED_CAPTURE_FLAG))) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; -#if LOG_CAPTURE > 1 - fprintf(stderr, "finalize: discard %lu\n", i); -#endif - } else if (captures[i].pos[1] & COMMITTED_CAPTURE_FLAG) { - captures[i].pos[1] &=~ COMMITTED_CAPTURE_FLAG; - } +const struct capvm_program * +fsm_capture_get_program_by_id(const struct fsm *fsm, uint32_t prog_id) +{ + struct fsm_capture_info *ci = fsm->capture_info; + if (prog_id >= ci->programs.used) { + return NULL; } + return ci->programs.set[prog_id]; } -void -fsm_capture_action_iter(const struct fsm *fsm, - fsm_capture_action_iter_cb *cb, void *opaque) +int +fsm_capture_associate_program_with_end_state(struct fsm *fsm, + uint32_t prog_id, fsm_state_t end_state) { - size_t i; struct fsm_capture_info *ci = fsm->capture_info; - assert(ci != NULL); - - for (i = 0; i < ci->bucket_count; i++) { - struct fsm_capture_action_bucket *b = &ci->buckets[i]; - if (b->state == CAPTURE_NO_STATE) { - continue; - } + assert(end_state < fsm->statecount); + assert(prog_id < ci->programs.used); - if (!cb(b->state, b->action.type, - b->action.id, b->action.to, opaque)) { - break; - } + if (!idmap_set(ci->end_capvm_program_map, end_state, prog_id)) { 
+ return 0; } + return 1; } -const char *fsm_capture_action_type_name[] = { - "START", "EXTEND", - "COMMIT_ZERO_STEP", "COMMIT" +struct capture_resolve_env { + const struct fsm_capture_info *ci; + const unsigned char *input; + const size_t length; + + int res; + struct fsm_capture *captures; + size_t captures_len; }; static int -dump_iter_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) +exec_capvm_program_cb(fsm_state_t state_id, unsigned prog_id, void *opaque) { - FILE *f = opaque; - fprintf(f, " - state %u, %s [capture_id: %u, to: %d]\n", - state, fsm_capture_action_type_name[type], capture_id, to); + struct capture_resolve_env *env = opaque; + (void)state_id; + + /* TODO: idmap_iter could take a halt return value */ + if (env->res != 1) { return 0; } + + assert(prog_id < env->ci->programs.used); + struct capvm_program *p = env->ci->programs.set[prog_id]; + + LOG(5 - LOG_EVAL, "%s: evaluating prog_id %u for state %d\n", + __func__, prog_id, state_id); + +#define EXEC_COUNT 1 /* can be increased for benchmarking */ + + for (size_t i = 0; i < EXEC_COUNT; i++) { + const enum fsm_capvm_program_exec_res exec_res = + fsm_capvm_program_exec(p, + (const uint8_t *)env->input, env->length, + env->captures, env->captures_len); + if (exec_res != FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN) { + env->res = 0; + return 0; + } + } return 1; } -/* Dump capture metadata about an FSM. 
*/ -void -fsm_capture_dump(FILE *f, const char *tag, const struct fsm *fsm) +int +fsm_capture_resolve_during_exec(const struct fsm *fsm, + fsm_state_t end_state, const unsigned char *input, size_t input_offset, + struct fsm_capture *captures, size_t captures_len) { - struct fsm_capture_info *ci; - assert(fsm != NULL); - ci = fsm->capture_info; - if (ci == NULL || ci->bucket_count == 0) { - fprintf(f, "==== %s -- no captures\n", tag); - return; - } - - fprintf(f, "==== %s -- capture action hash table (%u buckets)\n", - tag, ci->bucket_count); - fsm_capture_action_iter(fsm, dump_iter_cb, f); + assert(input != NULL); + assert(captures != NULL); + + const struct fsm_capture_info *ci = fsm->capture_info; + + struct capture_resolve_env capture_env = { + .res = 1, + .ci = ci, + .input = input, + .length = input_offset, + .captures = captures, + .captures_len = captures_len, + }; + + LOG(5 - LOG_EVAL, "%s: ended on state %d\n", + __func__, end_state); + idmap_iter_for_state(ci->end_capvm_program_map, + end_state, exec_capvm_program_cb, &capture_env); + + return capture_env.res; } diff --git a/src/libfsm/capture.h b/src/libfsm/capture.h index 4c0ba4722..16588060c 100644 --- a/src/libfsm/capture.h +++ b/src/libfsm/capture.h @@ -2,28 +2,26 @@ #define LIBFSM_CAPTURE_H #include +#include #include #include +#include -#define NEXT_STATE_END ((fsm_state_t)-1) - +/* Internal state IDs that are out of range for valid state IDs. + * + * CAPTURE_NO_STATE is used to represent the absence of a state, such as + * when remapping a state to a dead state (removing it) or empty + * hash table buckets. + * + * NEXT_STATE_END is used as a destination for capture actions that + * trigger when ending on a state. */ #define CAPTURE_NO_STATE ((fsm_state_t)-1) /* Capture interface -- functions internal to libfsm. * The public interface should not depend on any of these details. */ -enum capture_action_type { - /* Start an active capture if transitioning to TO. 
*/ - CAPTURE_ACTION_START, - /* Continue an active capture if transitioning to TO, - * otherwise deactivate it. */ - CAPTURE_ACTION_EXTEND, - /* Write a zero-step capture (i.e., the start and - * end state are the same). */ - CAPTURE_ACTION_COMMIT_ZERO_STEP, - /* Write an active capture's endpoints. */ - CAPTURE_ACTION_COMMIT -}; +struct fsm_capture_info; +struct capvm_program; int fsm_capture_init(struct fsm *fsm); @@ -31,41 +29,110 @@ fsm_capture_init(struct fsm *fsm); void fsm_capture_free(struct fsm *fsm); +void +fsm_capture_dump_active_for_ends(FILE *f, const struct fsm *fsm); + +void +fsm_capture_dump_program_end_mapping(FILE *f, const struct fsm *fsm); + /* Does the FSM have captures? */ int fsm_capture_has_captures(const struct fsm *fsm); -/* Update captures, called when exiting or ending on a state. - * If ending on a state, use NEXT_STATE_END for next_state. */ void -fsm_capture_update_captures(const struct fsm *fsm, - fsm_state_t cur_state, fsm_state_t next_state, size_t offset, - struct fsm_capture *captures); +fsm_capture_dump(FILE *f, const char *tag, const struct fsm *fsm); + +void +fsm_capture_dump_programs(FILE *f, const struct fsm *fsm); +/* If EXPENSIVE_CHECKS is non-zero, assert that all capture metadata on + * an FSM is internally consistent. */ void -fsm_capture_finalize_captures(const struct fsm *fsm, - size_t capture_count, struct fsm_capture *captures); +fsm_capture_integrity_check(const struct fsm *fsm); + +int +fsm_capture_id_compact(struct fsm *fsm, const fsm_state_t *mapping, + size_t orig_statecount); -/* Add a capture action. This is used to update capture actions - * in the destination FSM when combining/transforming other FSMs. */ int -fsm_capture_add_action(struct fsm *fsm, - fsm_state_t state, enum capture_action_type type, - unsigned id, fsm_state_t to); - -/* Callback for iterating over capture actions. - * Return 1 to continue, return 0 to halt. 
- * If TO is not meaningful for a particular type, it will be - * set to NEXT_STATE_END. */ +fsm_capture_program_association_compact(struct fsm *fsm, const fsm_state_t *mapping, + size_t orig_statecount); + +/* Iterator callback for capture IDs that are active for a particular + * end state. Returns whether iteration should continue. */ typedef int -fsm_capture_action_iter_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, +fsm_capture_iter_active_for_end_cb(fsm_state_t state, unsigned capture_id, void *opaque); void -fsm_capture_action_iter(const struct fsm *fsm, - fsm_capture_action_iter_cb *cb, void *opaque); +fsm_capture_iter_active_for_end_state(const struct fsm *fsm, fsm_state_t state, + fsm_capture_iter_active_for_end_cb *cb, void *opaque); + +void +fsm_capture_iter_active_for_all_end_states(const struct fsm *fsm, + fsm_capture_iter_active_for_end_cb *cb, void *opaque); -extern const char *fsm_capture_action_type_name[]; +/* Iterator callback for program IDs that are active for a particular + * end state. Returns whether iteration should continue. 
*/ +typedef int +fsm_capture_iter_program_ids_for_end_state_cb(fsm_state_t state, unsigned prog_id, + void *opaque); + +void +fsm_capture_iter_program_ids_for_end_state(const struct fsm *fsm, fsm_state_t state, + fsm_capture_iter_program_ids_for_end_state_cb *cb, void *opaque); +void +fsm_capture_iter_program_ids_for_all_end_states(const struct fsm *fsm, + fsm_capture_iter_program_ids_for_end_state_cb *cb, void *opaque); + +/* TODO: combine/rename */ +int +fsm_capture_copy_active_for_ends(const struct fsm *src_fsm, + const struct state_set *states, + struct fsm *dst_fsm, fsm_state_t dst_state); +int +fsm_capture_copy_program_end_state_associations(const struct fsm *src_fsm, + const struct state_set *states, + struct fsm *dst_fsm, fsm_state_t dst_state); + +int +fsm_capture_copy_programs(const struct fsm *src_fsm, + struct fsm *dst_fsm); + +size_t +fsm_capture_program_count(const struct fsm *fsm); + +void +fsm_capture_update_max_capture_id(struct fsm_capture_info *ci, + unsigned capture_id); + +int +fsm_capture_add_program(struct fsm *fsm, + struct capvm_program *program, uint32_t *prog_id); + +const struct capvm_program * +fsm_capture_get_program_by_id(const struct fsm *fsm, uint32_t prog_id); + +int +fsm_capture_associate_program_with_end_state(struct fsm *fsm, + uint32_t prog_id, fsm_state_t end_state); + +/* Resolve captures. + * + * FIXME: With the current implementation, if enough memory + * was passed in then it couldn't fail, but it may be worth + * changing the interface so that it doesn't assume there was + * already a successful match in order to support one-pass + * matching & capture resolution attempts from a stream. + * + * TODO: This should pass in a size for captures[]. + * TODO: An alternate interface that allows passing in + * preallocated buffers for working memory. + * + * TODO: describe return value. 
*/ +int +fsm_capture_resolve_during_exec(const struct fsm *fsm, + fsm_state_t end_state, const unsigned char *input, size_t input_offset, + struct fsm_capture *captures, size_t captures_len); #endif diff --git a/src/libfsm/capture_internal.h b/src/libfsm/capture_internal.h deleted file mode 100644 index 70418b988..000000000 --- a/src/libfsm/capture_internal.h +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef CAPTURE_INTERNAL_H -#define CAPTURE_INTERNAL_H - -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "capture.h" - -/* Bucket count for capture action hash table. - * Must be a power of 2. */ - -#define DEF_CAPTURE_ACTION_BUCKET_COUNT 32 -#define DEF_TRAIL_CEIL 8 - -#define LOG_CAPTURE 0 - -/* Most significant bit of a size_t. */ -#define COMMITTED_CAPTURE_FLAG ((SIZE_MAX) ^ (SIZE_MAX >> 1)) - -struct fsm_capture_info { - unsigned max_capture_id; - - /* Add-only hash table. */ - unsigned bucket_count; - unsigned buckets_used; /* grow if >= 1/2 used */ - - /* Hash buckets. If state is CAPTURE_NO_STATE, - * the bucket is empty. */ - struct fsm_capture_action_bucket { - fsm_state_t state; /* key */ - struct fsm_capture_action { - enum capture_action_type type; - unsigned id; - /* only used by START and EXTEND */ - fsm_state_t to; - } action; - } *buckets; -}; - -enum trail_step { - TRAIL_STEP_START, - TRAIL_STEP_ITER_EDGES, - TRAIL_STEP_ITER_EPSILONS, - TRAIL_STEP_DONE -}; - -/* env->seen is used as a bit set for tracking which states have already - * been processed. These macros set/check/clear the bits. 
*/ -#define SEEN_BITOP(ENV, STATE, OP) ENV->seen[STATE/64] OP ((uint64_t)1 << (STATE&63)) -#define MARK_SEEN(ENV, STATE) SEEN_BITOP(ENV, STATE, |=) -#define CHECK_SEEN(ENV, STATE) SEEN_BITOP(ENV, STATE, &) -#define CLEAR_SEEN(ENV, STATE) SEEN_BITOP(ENV, STATE, &=~) - -struct capture_set_path_env { - struct fsm *fsm; - unsigned capture_id; - fsm_state_t start; - fsm_state_t end; - - unsigned trail_i; - unsigned trail_ceil; - struct trail_cell { - fsm_state_t state; - enum trail_step step; - char has_self_edge; - struct edge_iter iter; - } *trail; - - /* bitset for which states have already been seen. */ - uint64_t *seen; -}; - -static int -init_capture_action_htab(struct fsm *fsm, struct fsm_capture_info *ci); - -static int -mark_capture_path(struct capture_set_path_env *env); - -static int -add_capture_action(struct fsm *fsm, struct fsm_capture_info *ci, - fsm_state_t state, const struct fsm_capture_action *action); - -static int -grow_capture_action_buckets(const struct fsm_alloc *alloc, - struct fsm_capture_info *ci); - -static int -grow_trail(struct capture_set_path_env *env); - -static int -step_trail_start(struct capture_set_path_env *env); -static int -step_trail_iter_edges(struct capture_set_path_env *env); -static int -step_trail_iter_epsilons(struct capture_set_path_env *env); -static int -step_trail_done(struct capture_set_path_env *env); - -static int -cmp_action(const struct fsm_capture_action *a, - const struct fsm_capture_action *b); - -#endif diff --git a/src/libfsm/capture_log.h b/src/libfsm/capture_log.h new file mode 100644 index 000000000..c850460bb --- /dev/null +++ b/src/libfsm/capture_log.h @@ -0,0 +1,27 @@ +/* + * Copyright 2020 Scott Vokes + * + * See LICENCE for the full copyright terms. 
+ */ + +#ifndef CAPTURE_LOG_H +#define CAPTURE_LOG_H + +/* Log levels */ +#define LOG_CAPTURE 0 +#define LOG_CAPTURE_COMBINING_ANALYSIS 0 +#define LOG_EVAL 0 +#define LOG_APPEND_ACTION 0 +#define LOG_PRINT_FSM 0 +#define LOG_MARK_PATH 0 + +#include + +#define LOG(LEVEL, ...) \ + do { \ + if ((LEVEL) <= LOG_CAPTURE) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while(0) + +#endif diff --git a/src/libfsm/capture_vm.c b/src/libfsm/capture_vm.c new file mode 100644 index 000000000..e6a1f0539 --- /dev/null +++ b/src/libfsm/capture_vm.c @@ -0,0 +1,194 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +/* Virtual machine for resolving captures while executing regular + * expressions from a subset of PCRE. This is based on the approach + * described in Russ Cox's "Regular Expression Matching: the Virtual + * Machine Approach" (https://swtch.com/~rsc/regexp/regexp2.html), but + * has a couple major modifications, mainly to keep memory usage low and + * predictable, and to be more consistent (arguably, bug-compatible...) + * with PCRE's behavior for libfsm's supported subset of PCRE. + * + * Instead of giving each green thread its own copy of the capture + * buffers, which uses a prohibitive amount of memory when combining DFAs + * with several captures each, operate in two passes: + * + * In the first pass, each thread keeps track of its execution path, + * appending a bit for each branch: 1 for the greedy option, 0 for the + * non-greedy. Since there can be at most one live thread per program + * instruction, and all of them are either on the current or next input + * character, there's a bounded window for diverging paths during execution. + * After a certain distance back all paths either have a common prefix + * or consist entirely of 0 bits (for continually looping at an unanchored + * start). 
The path bits are stored in chunks in a backwards linked list, + * so nodes for common path prefixes can be shared by multiple threads, + * and the prefix of all 0 bits is instead stored as a counter. This + * keeps memory usage substantially lower. This search runs threads in + * parallel, breadth-first, halting any threads that duplicate work of + * a greedier search path (since PCRE's results match the greediest). + * + * In the second pass, replay the execution path for just the single + * greediest thread, which represents the "correct" match (according to + * PCRE semantics), and write capture offsets into buffers passed in by + * the caller. + * + * Most of the other differences have to do with matching PCRE + * edge cases, particularly interactions between newlines and start/end + * anchors. */ + +#include "capture_vm.h" +#include "capture_vm_program.h" + +#include + +#include +#include +#include + +void +fsm_capvm_program_free(const struct fsm_alloc *alloc, + struct capvm_program *program) +{ + if (program == NULL) { return; } + f_free(alloc, program->ops); + f_free(alloc, program->char_classes.sets); + f_free(alloc, program); +} + +struct capvm_program * +capvm_program_copy(const struct fsm_alloc *alloc, + const struct capvm_program *src) +{ + assert(src != NULL); + struct capvm_program *p = NULL; + struct capvm_opcode *ops = NULL; + struct capvm_char_class *sets = NULL; + + p = f_calloc(alloc, 1, sizeof(*p)); + if (p == NULL) { goto cleanup; } + + /* This allocates exactly as many instructions and char_classes + * as necessary, rather than a power-of-2 buffer, because + * they are only added during compilation in libre. */ + + ops = f_calloc(alloc, src->used, sizeof(ops[0])); + if (ops == NULL) { goto cleanup; } + + sets = f_calloc(alloc, + /* do non-zero allocation to silence EFENCE */ + src->char_classes.count == 0 ? 
1 : src->char_classes.count, + sizeof(src->char_classes.sets[0])); + if (sets == NULL) { goto cleanup; } + + memcpy(ops, src->ops, src->used * sizeof(src->ops[0])); + + assert(src->char_classes.sets != NULL || src->char_classes.count == 0); + if (src->char_classes.count > 0) { + memcpy(sets, src->char_classes.sets, + src->char_classes.count * sizeof(src->char_classes.sets[0])); + } + + struct capvm_program np = { + .capture_count = src->capture_count, + .capture_base = src->capture_base, + + .used = src->used, + .ceil = src->used, + .ops = ops, + + .char_classes = { + .count = src->char_classes.count, + .ceil = src->char_classes.count, + .sets = sets, + }, + }; + memcpy(p, &np, sizeof(np)); + return p; + +cleanup: + f_free(alloc, p); + f_free(alloc, ops); + f_free(alloc, sets); + return NULL; +} + +void +capvm_program_rebase(struct capvm_program *program, unsigned capture_offset) +{ + assert(program->capture_base + capture_offset > program->capture_base); + program->capture_base += capture_offset; +} + +void +fsm_capvm_program_dump(FILE *f, + const struct capvm_program *p) +{ + for (size_t i = 0; i < p->used; i++) { + const struct capvm_opcode *op = &p->ops[i]; + switch (op->t) { + case CAPVM_OP_CHAR: + fprintf(f, "%zu: char 0x%02x (%c)\n", + i, op->u.chr, isprint(op->u.chr) ? 
op->u.chr : '.'); + break; + case CAPVM_OP_CHARCLASS: + { + const uint32_t id = op->u.charclass_id; + assert(id < p->char_classes.count); + const struct capvm_char_class *cc = &p->char_classes.sets[id]; + fprintf(f, "%zu: charclass %u -> [", i, id); + for (size_t i = 0; i < 4; i++) { + fprintf(f, "%016lx", cc->octets[i]); + } + fprintf(f, "]\n"); + break; + } + case CAPVM_OP_MATCH: + fprintf(f, "%zu: match\n", i); + break; + case CAPVM_OP_JMP: + fprintf(f, "%zu: jmp %u\n", i, op->u.jmp); + break; + case CAPVM_OP_JMP_ONCE: + fprintf(f, "%zu: jmp_once %u\n", i, op->u.jmp_once); + break; + case CAPVM_OP_SPLIT: + fprintf(f, "%zu: split greedy %u nongreedy %u\n", i, op->u.split.greedy, op->u.split.nongreedy); + break; + case CAPVM_OP_SAVE: + fprintf(f, "%zu: save %u (cap %u, %s)\n", + i, op->u.save, + op->u.save / 2, (op->u.save & (uint32_t)0x01) ? "end" : "start"); + break; + case CAPVM_OP_ANCHOR: + fprintf(f, "%zu: anchor %s\n", i, + op->u.anchor == CAPVM_ANCHOR_START ? "start" : "end"); + break; + default: + assert(!"matchfail"); + } + } + for (size_t i = 0; i < p->char_classes.count; i++) { + const uint64_t *octets = p->char_classes.sets[i].octets; + fprintf(f, "char_class %zu: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", + i, octets[0], octets[1], octets[2], octets[3]); + } +} + +unsigned +fsm_capvm_program_get_capture_count(const struct capvm_program *program) +{ + assert(program != NULL); + return program->capture_count; +} + +unsigned +fsm_capvm_program_get_max_capture_id(const struct capvm_program *program) +{ + assert(program != NULL); + return (program->capture_count == 0 + ? 0 + : program->capture_base + program->capture_count - 1); +} diff --git a/src/libfsm/capture_vm.h b/src/libfsm/capture_vm.h new file mode 100644 index 000000000..02c198dab --- /dev/null +++ b/src/libfsm/capture_vm.h @@ -0,0 +1,68 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. 
+ */ + +#ifndef CAPTURE_VM_H +#define CAPTURE_VM_H + +#include +#include + +#include +#include + +/* Interface for the virtual machine used to resolve captures. + * These interfaces are exposed to libre but should not be + * used directly. */ + +/* Opaque struct, details in capture_vm_program.h. */ +struct capvm_program; + +void +fsm_capvm_program_free(const struct fsm_alloc *alloc, + struct capvm_program *program); + +struct capvm_program * +capvm_program_copy(const struct fsm_alloc *alloc, + const struct capvm_program *program); + +/* Add an offset to the capture ID base for a program. + * Used when FSMs are merged, one of the source FSMs' capture IDs + * will be shifted to appear after the others. */ +void +capvm_program_rebase(struct capvm_program *program, unsigned capture_offset); + +void +fsm_capvm_program_dump(FILE *f, + const struct capvm_program *program); + +enum fsm_capvm_program_exec_res { + FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN, + FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND, + FSM_CAPVM_PROGRAM_EXEC_STEP_LIMIT_REACHED, + FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC = -1, +}; + +/* Execute a capture program with the given input and populate + * the capture buffer. + * + * This asserts that the capture buffer is at least as large as + * necessary. This is an internal interface, and the buffer size + * should have already been checked by the caller. */ +enum fsm_capvm_program_exec_res +fsm_capvm_program_exec(const struct capvm_program *program, + const uint8_t *input, size_t length, + struct fsm_capture *capture_buf, size_t capture_buf_length); + +/* Get the capture count from the program. */ +unsigned +fsm_capvm_program_get_capture_count(const struct capvm_program *program); + +/* Get the max capture ID from the program. + * If there are no captures (which is pointless) it will return 0. 
*/ +unsigned +fsm_capvm_program_get_max_capture_id(const struct capvm_program *program); + +#endif diff --git a/src/libfsm/capture_vm_exec.c b/src/libfsm/capture_vm_exec.c new file mode 100644 index 000000000..9d4be066a --- /dev/null +++ b/src/libfsm/capture_vm_exec.c @@ -0,0 +1,2076 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#include "capture_vm.h" +#include "capture_vm_program.h" +#include "capture_vm_log.h" + +#include +#include +#include +#include + +#include + +/* for EXPENSIVE_CHECKS and TRACK_TIMES */ +#include "internal.h" + +#if EXPENSIVE_CHECKS +#include +#endif + +/* Special handling for a path node that has a long prefix of all 0 + * bits, as is common when the regex is unanchored at the start. */ +#define USE_COLLAPSED_ZERO_PREFIX 1 + +/* Special out-of-range NONE values. */ +#define NO_POS ((uint32_t)-1) +#define NO_ID ((uint32_t)-1) +#define COLLAPSED_ZERO_PREFIX_ID ((uint32_t)-2) +#define NO_POS_SIZE_T ((size_t)-1) + +/* Max number of bits each path link can store. + * This value cannot be changed without reworking the data structures. */ +#define PATH_LINK_BITS 32 + +/* This enables extra debugging/testing output in an easily scraped format */ +#ifndef TESTING_OPTIONS +#define TESTING_OPTIONS 0 +#endif + +/* Write the solution to stdout (used for testing). */ +#define LOG_SOLUTION_TO_STDOUT (0 || TESTING_OPTIONS) + +/* Enable extra fields for debugging/performance tuning, most notably + * a 'uniq_id' field that helps to see the various execution paths. */ +#define CAPVM_STATS (0 || TESTING_OPTIONS) +#define CAPVM_PATH_STATS (0 && CAPVM_STATS) + +/* Allow the path table to grow on demand. + * In theory it should be possible to determine the worst case + * based on compile-time analysis and the input length; if an + * appropriately sized buffer was passed in capture resolution + * would not need dynamic allocation at all. 
*/ +#define ALLOW_PATH_TABLE_RESIZING 1 + +/* Set to non-zero to trap runaway path table growth */ +#define PATH_TABLE_CEIL_LIMIT 0 + +/* Specialized logging that can be scraped to reconstruct non-interleaved + * execution paths per thread. */ +#define LOG_EXECUTION 0 +#define LOG_EXECUTION_FILE stderr +#if LOG_EXECUTION + +#if CAPVM_STATS == 0 +#error CAPVM_STATS must be 1 for uniq_id +#endif + +/* Various execution log messages, in an easily scraped format */ +#define LOG_EXEC_OP(UNIQ_ID, INPUT_POS, OP_ID, OP_NAME) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC OP %u %u %u %s\n", \ + UNIQ_ID, INPUT_POS, OP_ID, OP_NAME) + +#define LOG_EXEC_CHAR(UNIQ_ID, CHAR) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC CHAR %u %c 0x%02x\n", UNIQ_ID, isprint(CHAR) ? CHAR : '.', CHAR) + +#define LOG_EXEC_HALT(UNIQ_ID) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC HALT %u\n", UNIQ_ID) + +#define LOG_EXEC_PATH_FIND_SOLUTION(UNIQ_ID, BIT) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC PATH_FIND_SOLUTION %u %u\n", UNIQ_ID, BIT) + +#define LOG_EXEC_PATH_SAVE_CAPTURES(UNIQ_ID, BIT) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC PATH_SAVE_CAPTURES %u %u\n", UNIQ_ID, BIT) + +#define LOG_EXEC_SPLIT(PARENT_UNIQ_ID, CHILD_UNIQ_ID) \ + fprintf(LOG_EXECUTION_FILE, \ + "LOG_EXEC SPLIT %u %u\n", PARENT_UNIQ_ID, CHILD_UNIQ_ID) +#else +#define LOG_EXEC_OP(UNIQ_ID, INPUT_POS, OP_ID, OP_NAME) /* no-op */ +#define LOG_EXEC_CHAR(UNIQ_ID, CHAR) /* no-op */ +#define LOG_EXEC_HALT(UNIQ_ID) /* no-op */ +#define LOG_EXEC_PATH_FIND_SOLUTION(UNIQ_ID, BIT) /* no-op */ +#define LOG_EXEC_PATH_SAVE_CAPTURES(UNIQ_ID, BIT) /* no-op */ +#define LOG_EXEC_SPLIT(PARENT_UNIQ_ID, CHILD_UNIQ_ID) /* no-op */ +#endif + +/* Bitset backed by an array of 32-bit words */ +#define GET_BIT32(BITARRAY, BIT) (BITARRAY[BIT/32] & ((uint32_t)1 << (BIT & 31))) +#define SET_BIT32(BITARRAY, BIT) (BITARRAY[BIT/32] |= ((uint32_t)1 << (BIT & 31))) + +static const char * +op_name[] = { + [CAPVM_OP_CHAR] = "CHAR", + [CAPVM_OP_CHARCLASS] = 
"CHARCLASS", + [CAPVM_OP_MATCH] = "MATCH", + [CAPVM_OP_JMP] = "JMP", + [CAPVM_OP_JMP_ONCE] = "JMP_ONCE", + [CAPVM_OP_SPLIT] = "SPLIT", + [CAPVM_OP_SAVE] = "SAVE", + [CAPVM_OP_ANCHOR] = "ANCHOR", +}; + +enum pair_id { PAIR_ID_CURRENT = 0, PAIR_ID_NEXT = 1 }; + +struct capvm { + const struct capvm_program *p; + const uint8_t *input; + const uint32_t input_len; + struct fsm_capture *capture_buf; + const size_t capture_buf_length; + size_t step_limit; + +#if CAPVM_STATS + uint32_t uniq_id_counter; +#endif + + /* Two stacks, used to track which execution instruction should + * be advanced next. The current stack is + * run_stacks[PAIR_ID_CURRENT], run_stacks[PAIR_ID_NEXT] is the + * stack for the next input position, and when the current stack + * is completed the next stack is copied over (and reversed, so + * the greediest threads end up on top and resume first). + * Same with run_stacks_h, the height for each stack, and the + * other fields with [2] below. */ + uint32_t *run_stacks[2]; + uint32_t run_stacks_h[2]; + + /* Similarly, two columns of bits and two arrays of path_info + * node IDs and uniq_ids for the execution at a particular + * opcode. + * + * evaluated bit array[]: Has the instruction n already been + * evaluated at the current input position? */ + uint32_t *evaluated[2]; + uint32_t *path_info_heads[2]; /* path for thread on instruction */ +#if CAPVM_STATS + uint32_t *uniq_ids[2]; +#endif + + struct capvm_thread_stats { + uint32_t live; + uint32_t max_live; + } threads; + + /* Pool of nodes for linked lists of path segments. */ + struct capvm_path_info_pool { + uint32_t ceil; + uint32_t live; + uint32_t max_live; + uint32_t freelist_head; + struct capvm_path_info { + union { + struct capvm_path_freelist_link { + uint16_t refcount; /* == 0: tag for freelist node */ + uint32_t freelist; + } freelist_node; + struct capvm_path_info_link { + /* refcount: When > 0 this is a path node. 
+ * This could be sticky at UINT16_MAX, but in order + * to get there it would need a regex whose compiled + * program has well over 2**16 instructions that all + * share the same path info node. */ + uint16_t refcount; + uint8_t used; /* .bits used, <= PATH_LINK_BITS */ + uint32_t bits; /* buffer for this link's path bits */ + uint32_t offset; /* offset into the path bit array */ + /* Linked list to earlier path nodes, with common + * nodes shared until paths diverge. + * + * This can be either a valid path node ID, NO_ID + * for end of list, or COLLAPSED_ZERO_PREFIX_ID + * to indicate that the node is preceded by + * (offset) zero bits. */ + uint32_t backlink; +#if CAPVM_PATH_STATS + uint32_t bits_added_per_input_character; +#endif + } path; + } u; + } *pool; + } paths; + + struct capvm_solution_info { + uint32_t best_path_id; +#if CAPVM_STATS + uint32_t best_path_uniq_id; +#endif + uint32_t zeros_evaluated_up_to; + } solution; + + struct { + size_t steps; +#if CAPVM_STATS + uint32_t matches; + uint32_t path_prefixes_shared; + uint32_t collapsed_zero_prefixes; +#endif +#if CAPVM_PATH_STATS + uint32_t max_bits_added_per_input_character; + uint32_t max_path_length_memory; +#endif + } stats; + + enum fsm_capvm_program_exec_res res; +}; + +/* Type identifier macros */ +#define IS_THREAD_FREELIST(T) (T->u.thread.path_info_head == NO_ID) +#define IS_PATH_FREELIST(P) (P->u.path.refcount == 0) +#define IS_PATH_NODE(P) (P->u.path.refcount > 0 && P->u.path.used <= PATH_LINK_BITS) + +static void +release_path_info_link(struct capvm *vm, uint32_t *pi_id); + +static void +dump_path_table(FILE *f, const struct capvm *vm); + +static void +set_max_threads_live(struct capvm *vm, uint32_t new_max_live) +{ + vm->threads.max_live = new_max_live; + if (LOG_CAPVM >= 6) { + LOG(0, "==== new vm->threads.max_live: %u\n", vm->threads.max_live); + dump_path_table(stderr, vm); + } +} + + +/*********************** + * path_info functions * + ***********************/ + +static void 
+set_max_paths_live(struct capvm *vm) +{ + vm->paths.max_live = vm->paths.live; + if (LOG_CAPVM >= 6) { + LOG(0, "==== new vm->paths.max_live: %u\n", vm->paths.max_live); + dump_path_table(stderr, vm); + } +} + +static uint32_t +get_path_node_refcount(const struct capvm *vm, uint32_t p_id) +{ + assert(p_id < vm->paths.ceil); + const struct capvm_path_info *pi = &vm->paths.pool[p_id]; + if (IS_PATH_FREELIST(pi)) { + return pi->u.freelist_node.refcount; + } else { + assert(IS_PATH_NODE(pi)); + return pi->u.path.refcount; + } +} + +static void +inc_path_node_refcount(struct capvm *vm, uint32_t p_id) +{ + /* TODO: sticky refcount handling is not currently implemented */ + if (p_id == COLLAPSED_ZERO_PREFIX_ID) { return; } + assert(p_id < vm->paths.ceil); + struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(IS_PATH_NODE(pi)); + LOG(5, "%s: p_id %u: refcnt %u -> %u\n", + __func__, p_id, pi->u.path.refcount, pi->u.path.refcount + 1); + pi->u.path.refcount++; +} + +static uint32_t +get_path_node_offset(const struct capvm *vm, uint32_t p_id) +{ + assert(p_id < vm->paths.ceil); + const struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(IS_PATH_NODE(pi)); + return pi->u.path.offset; +} + +static uint32_t +get_path_node_backlink(const struct capvm *vm, uint32_t p_id) +{ + assert(p_id < vm->paths.ceil); + const struct capvm_path_info *pi = &vm->paths.pool[p_id]; + if (IS_PATH_FREELIST(pi)) { + return pi->u.freelist_node.freelist; + } else { + assert(IS_PATH_NODE(pi)); + return pi->u.path.backlink; + } +} + +static void +set_path_node_backlink(struct capvm *vm, uint32_t p_id, uint32_t backlink) +{ + assert(p_id < vm->paths.ceil); + assert(backlink < vm->paths.ceil || (backlink == NO_ID || backlink == COLLAPSED_ZERO_PREFIX_ID)); + struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(IS_PATH_NODE(pi)); + pi->u.path.backlink = backlink; +} + +static void +dump_path_table(FILE *f, const struct capvm *vm) +{ + fprintf(f, "=== path table, %u/%u live\n", + 
vm->paths.live, vm->paths.ceil); + for (uint32_t i = 0; i < vm->paths.ceil; i++) { + struct capvm_path_info *pi = &vm->paths.pool[i]; + if (IS_PATH_FREELIST(pi)) { + if (LOG_CAPVM >= 5) { + fprintf(f, "paths[%u]: freelist -> %d\n", + i, (int)pi->u.freelist_node.freelist); + } + } else { + assert(IS_PATH_NODE(pi)); + fprintf(f, "paths[%u]: refcount %u, used %u, bits 0x%08x, offset %u, backlink %d%s\n", + i, pi->u.path.refcount, pi->u.path.used, pi->u.path.bits, + pi->u.path.offset, (int)pi->u.path.backlink, + pi->u.path.backlink == COLLAPSED_ZERO_PREFIX_ID + ? " (collapsed zero prefix)" + : pi->u.path.backlink == NO_ID + ? " (none)" + : " (link)"); + } + } +} + +static void +check_path_table(const struct capvm *vm) +{ +#if EXPENSIVE_CHECKS + uint32_t *refcounts = calloc(vm->paths.ceil, sizeof(refcounts[0])); + assert(refcounts); + + if (LOG_CAPVM >= 4) { + dump_path_table(stderr, vm); + } + + LOG(4, "%s: stack heights %u, %u\n", __func__, + vm->run_stacks_h[PAIR_ID_CURRENT], vm->run_stacks_h[PAIR_ID_NEXT]); + + for (uint32_t pair_id = 0; pair_id < 2; pair_id++) { + for (uint32_t h = 0; h < vm->run_stacks_h[pair_id]; h++) { + const uint32_t op_id = vm->run_stacks[pair_id][h]; + if (op_id == NO_ID) { continue; } +#if CAPVM_STATS + const uint32_t uniq_id = vm->uniq_ids[pair_id][op_id]; +#else + const uint32_t uniq_id = 0; +#endif + + LOG(4, "%s: run_stacks[%u][%u/%u]: op_id %u (uniq_id %u) -> path_info_head %u\n", + __func__, pair_id, h, vm->run_stacks_h[pair_id], op_id, + uniq_id, vm->path_info_heads[pair_id][op_id]); + if (op_id == NO_ID) { continue; } + const uint32_t p_id = vm->path_info_heads[pair_id][op_id]; + if (p_id != NO_ID) { + refcounts[p_id]++; + } + } + } + + for (uint32_t p_id = 0; p_id < vm->paths.ceil; p_id++) { + const struct capvm_path_info *pi = &vm->paths.pool[p_id]; + if (IS_PATH_FREELIST(pi)) { + continue; + } + const uint32_t backlink = get_path_node_backlink(vm, p_id); + if (backlink != NO_ID && backlink != COLLAPSED_ZERO_PREFIX_ID) { + 
assert(backlink < vm->paths.ceil); + refcounts[backlink]++; + } + } + + if (vm->solution.best_path_id != NO_ID) { + assert(vm->solution.best_path_id < vm->paths.ceil); + refcounts[vm->solution.best_path_id]++; + } + + for (uint32_t p_id = 0; p_id < vm->paths.ceil; p_id++) { + const struct capvm_path_info *pi = &vm->paths.pool[p_id]; + if (IS_PATH_FREELIST(pi)) { continue; } + bool ok; + const uint32_t refcount = get_path_node_refcount(vm, p_id); + ok = refcounts[p_id] == refcount; + + if (!ok) { + dump_path_table(stderr, vm); + + fprintf(stderr, "BAD REFCOUNT: pi[%u], expected %u, got %u\n", + p_id, refcounts[p_id], refcount); + assert(ok); + } + } + + free(refcounts); + LOG(6, "%s: passed\n", __func__); +#else + (void)vm; +#endif +} + +static bool +reserve_path_info_link(struct capvm *vm, uint32_t *pi_id) +{ + if (vm->paths.live == vm->paths.ceil) { +#if ALLOW_PATH_TABLE_RESIZING + if (LOG_CAPVM >= 4) { + fprintf(stderr, "\n"); + dump_path_table(stderr, vm); + check_path_table(vm); + fprintf(stderr, "\n"); + } + + const uint32_t nceil = 2*vm->paths.ceil; + LOG(1, "%s: growing path table %u -> %u\n", + __func__, vm->paths.ceil, nceil); + + /* This can legitimitely be reached with very long inputs, but + * if PATH_TABLE_CEIL_LIMIT is non-zero and this is hit then + * it's most likely a sign of an infinite loop. 
*/ + if (PATH_TABLE_CEIL_LIMIT != 0 && nceil > PATH_TABLE_CEIL_LIMIT) { + assert(!"reached PATH_TABLE_CEIL_LIMIT"); + } + + assert(nceil > vm->paths.ceil); + struct capvm_path_info *npool = realloc(vm->paths.pool, + nceil * sizeof(npool[0])); + if (npool == NULL) { + return false; + } + + for (size_t i = vm->paths.ceil; i < nceil; i++) { + npool[i].u.freelist_node.refcount = 0; + npool[i].u.freelist_node.freelist = i + 1; + } + npool[nceil - 1].u.freelist_node.refcount = 0; + npool[nceil - 1].u.freelist_node.freelist = NO_POS; + vm->paths.freelist_head = vm->paths.ceil; + vm->paths.ceil = nceil; + vm->paths.pool = npool; +#else + assert(!"shouldn't need to grow path pool"); +#endif + } + + assert(vm->paths.live < vm->paths.ceil); + assert(vm->paths.freelist_head != NO_POS); + + *pi_id = vm->paths.freelist_head; + LOG(3, "%s: returning %u\n", __func__, *pi_id); + return true; +} + +/* Release a reference to a path_info_link. Consume the argument. + * If the reference count reaches 0, repool the node and release + * its backlink. 
*/ +static void +release_path_info_link(struct capvm *vm, uint32_t *pi_id) +{ +#define LOG_RELEASE_PI 0 + size_t count = 0; + assert(pi_id != NULL); + uint32_t cur_id = *pi_id; + LOG(4 - LOG_RELEASE_PI, "%s: pi_id %u\n", __func__, cur_id); + *pi_id = NO_ID; + + while (cur_id != NO_ID) { + struct capvm_path_info *pi = &vm->paths.pool[cur_id]; + uint32_t refcount = get_path_node_refcount(vm, cur_id); + LOG(4 - LOG_RELEASE_PI, "-- checking path_info[%u]: refcount %u\n", + cur_id, refcount); + assert(refcount > 0); + LOG(4 - LOG_RELEASE_PI, "release: pi[%u] refcount %u -> %u\n", + cur_id, refcount, refcount - 1); + + const uint32_t backlink = get_path_node_backlink(vm, cur_id); + assert(IS_PATH_NODE(pi)); + pi->u.path.refcount--; + refcount = pi->u.path.refcount; + + if (refcount > 0) { + break; + } + + count++; + LOG(3 - LOG_RELEASE_PI, "-- repooling path_info %u, now %u live\n", + cur_id, vm->paths.live - 1); + LOG(3 - LOG_RELEASE_PI, "-- backlink: %d\n", backlink); + + pi->u.freelist_node.freelist = vm->paths.freelist_head; + vm->paths.freelist_head = cur_id; + assert(vm->paths.live > 0); + vm->paths.live--; + + cur_id = backlink; + if (cur_id == COLLAPSED_ZERO_PREFIX_ID) { + break; + } + } +} + +static void +print_path(FILE *f, const struct capvm *vm, uint32_t p_id) +{ + if (p_id == NO_ID) { + fprintf(f, "/0"); + return; + } + + /* reverse links to the root node */ + uint32_t zero_prefix = 0; + uint32_t next = NO_ID; + uint32_t first = NO_ID; + uint32_t prev; + + while (p_id != NO_ID) { + assert(p_id < vm->paths.ceil); + struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(!IS_PATH_FREELIST(pi)); + + uint32_t bl; + assert(IS_PATH_NODE(pi)); + bl = pi->u.path.backlink; + pi->u.path.backlink = next; + + if (bl == NO_ID) { + prev = bl; + first = p_id; + break; + } else if (bl == COLLAPSED_ZERO_PREFIX_ID) { + prev = bl; + first = p_id; + zero_prefix = pi->u.path.offset; + break; + } + + next = p_id; + p_id = bl; + } + + if (zero_prefix > 0) { + fprintf(f, 
"0/%u", zero_prefix); + } + + /* iterate forward, printing and restoring link order */ + p_id = first; + while (p_id != NO_ID) { + assert(p_id < vm->paths.ceil); + struct capvm_path_info *pi = &vm->paths.pool[p_id]; + assert(!IS_PATH_FREELIST(pi)); + + uint32_t fl; /* now a forward link */ + assert(IS_PATH_NODE(pi)); + fl = pi->u.path.backlink; + pi->u.path.backlink = prev; + prev = p_id; + fprintf(f, "%s%08x/%u", prev == NO_ID ? "" : " ", + pi->u.path.bits, pi->u.path.used); + + p_id = fl; + } +} + +#if EXPENSIVE_CHECKS +SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() +#endif +static int +cmp_paths(struct capvm *vm, uint32_t p_a, uint32_t p_b) +{ +#if EXPENSIVE_CHECKS + /* When EXPENSIVE_CHECKS is set, walk the chains + * before and after and compare incremental hashing of node IDs, + * to ensure the chains are restored properly. */ + uint64_t hash_a_before = 0; + uint64_t hash_b_before = 0; +#endif + +#define LOG_CMP_PATHS 0 + LOG(3 - LOG_CMP_PATHS, "%s: p_a %d, p_b %d\n", __func__, p_a, p_b); + + if (p_a == NO_ID) { + return p_b == NO_ID ? 0 : -1; + } else if (p_b == NO_ID) { + return 1; + } + + assert(p_a != p_b); + + if (LOG_CAPVM >= 5) { + fprintf(stderr, "A: "); + print_path(stderr, vm, p_a); + fprintf(stderr, "\n"); + + fprintf(stderr, "B: "); + print_path(stderr, vm, p_b); + fprintf(stderr, "\n"); + } + + /* walk both paths backward until they reach a beginning + * or the common prefix node, reversing links along the + * way, then compare forward and restore link order. */ + uint32_t link_a = p_a; + uint32_t link_b = p_b; + + uint32_t fwd_a = NO_ID; + uint32_t fwd_b = NO_ID; + + /* Walk both paths backward, individually until reaching a + * common offset, then back until reaching a common prefix + * (including the start). While iterating backward, replace + * the .backlink field with a forward link, which will be + * reverted when iterating forward and comparing from the + * common prefix. 
*/ + bool common_prefix_found = false; + uint32_t first_a = NO_ID; + uint32_t first_b = NO_ID; + uint32_t common_prefix_link; /* can be NO_ID */ + +#if EXPENSIVE_CHECKS + uint32_t hash_step = 0; /* added so ordering matters */ + while (link_a != NO_ID) { + assert(link_a < vm->paths.ceil); + const uint32_t prev = get_path_node_backlink(vm, link_a); + hash_a_before += hash_id(link_a + hash_step); + link_a = prev; + hash_step++; + } + hash_step = 0; + while (link_b != NO_ID) { + assert(link_b < vm->paths.ceil); + const uint32_t prev = get_path_node_backlink(vm, link_b); + hash_b_before += hash_id(link_b + hash_step); + link_b = prev; + hash_step++; + } + + link_a = p_a; + link_b = p_b; +#endif + + while (!common_prefix_found) { + assert(link_a != NO_ID); + assert(link_b != NO_ID); + assert(link_a < vm->paths.ceil); + assert(link_b < vm->paths.ceil); + + const uint32_t prev_a = get_path_node_backlink(vm, link_a); + const uint32_t prev_b = get_path_node_backlink(vm, link_b); + const uint32_t offset_a = get_path_node_offset(vm, link_a); + const uint32_t offset_b = get_path_node_offset(vm, link_b); + const uint32_t backlink_a = get_path_node_backlink(vm, link_a); + const uint32_t backlink_b = get_path_node_backlink(vm, link_b); + + /* These are only used for logging, which may compile out. 
*/ + (void)backlink_a; + (void)backlink_b; + + LOG(3 - LOG_CMP_PATHS, + "%s: backward loop: link_a %d (offset %u, prev %d), link_b %d (offset %u, prev %d)\n", + __func__, link_a, offset_a, prev_a, link_b, offset_b, prev_b); + + assert((offset_a & (PATH_LINK_BITS - 1)) == 0); /* multiple of 32 */ + assert((offset_b & (PATH_LINK_BITS - 1)) == 0); /* multiple of 32 */ + if (offset_a > offset_b) { + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: a longer than b\n", __func__); + set_path_node_backlink(vm, link_a, fwd_a); + fwd_a = link_a; + link_a = prev_a; + } else if (offset_b > offset_a) { + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: b longer than a\n", __func__); + set_path_node_backlink(vm, link_b, fwd_b); + fwd_b = link_b; + link_b = prev_b; + } else { + assert(offset_b == offset_a); + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: comparing backlinks: a: %d, b: %d\n", + __func__, backlink_a, backlink_b); + assert(fwd_a != link_a); + set_path_node_backlink(vm, link_a, fwd_a); + assert(fwd_b != link_b); + set_path_node_backlink(vm, link_b, fwd_b); + + if (prev_a == prev_b) { + /* if == NO_ID, empty prefix */ + common_prefix_found = true; + common_prefix_link = prev_a; + first_a = link_a; + first_b = link_b; + + LOG(3 - LOG_CMP_PATHS, "%s: backward loop: common_prefix_found: %d\n", + __func__, common_prefix_link); + } else { + fwd_a = link_a; + fwd_b = link_b; + + link_a = prev_a; + link_b = prev_b; + } + } + } + + assert(first_a != NO_ID); + assert(first_b != NO_ID); + link_a = first_a; + link_b = first_b; + + bool cmp_done = false; + int res; + bool done_restoring_link_order = false; + uint32_t prev_a = common_prefix_link; + uint32_t prev_b = common_prefix_link; + while (!done_restoring_link_order) { + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, link_a %d, link_b %d, cmp_done %d\n", + __func__, link_a, link_b, cmp_done); + if (!cmp_done) { + if (link_a == NO_ID) { /* b is longer */ + cmp_done = true; + if (link_b == NO_ID) { + res = 0; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd 
loop, equal length, res %d\n", __func__, res); + } else { + res = -1; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, b is longer, res %d\n", __func__, res); + } + } else if (link_b == NO_ID) { /* a is longer */ + cmp_done = true; + res = 1; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, a is longer, res %d\n", __func__, res); + } else { + assert(link_a < vm->paths.ceil); + assert(link_b < vm->paths.ceil); + struct capvm_path_info *pi_a = &vm->paths.pool[link_a]; + struct capvm_path_info *pi_b = &vm->paths.pool[link_b]; + + const uint32_t offset_a = get_path_node_offset(vm, link_a); + const uint32_t offset_b = get_path_node_offset(vm, link_b); + + assert(offset_a == offset_b); + + if (pi_a->u.path.bits != pi_b->u.path.bits) { + res = pi_a->u.path.bits < pi_b->u.path.bits ? -1 : 1; + cmp_done = true; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, different bits (0x%08x, 0x%08x) => res %d\n", + __func__, pi_a->u.path.bits, pi_b->u.path.bits, res); + } + } + } + + /* Check if both have reached the original head node. */ + if (link_a == NO_ID && link_b == NO_ID) { + done_restoring_link_order = true; + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop: reached end of both paths, prev_a %d (p_a %d), prev_b %d (p_b %d)\n", + __func__, prev_a, p_a, prev_b, p_b); + assert(prev_a == p_a); + assert(prev_b == p_b); + } + + /* Whether or not comparison has finished, iterate forward, + * restoring forward links. 
*/ + if (link_a != NO_ID) { + assert(link_a < vm->paths.ceil); + const uint32_t fwd_a = get_path_node_backlink(vm, link_a); + LOG(3 - LOG_CMP_PATHS, "%s: fwd loop: link_a %d, fwd_a %d\n", + __func__, link_a, fwd_a); + assert(fwd_a != link_a); + + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, restoring a's backlink: pi[%u].backlink <- %d\n", + __func__, link_a, prev_a); + set_path_node_backlink(vm, link_a, prev_a); + prev_a = link_a; + link_a = fwd_a; + } + + if (link_b != NO_ID) { + assert(link_b < vm->paths.ceil); + const uint32_t fwd_b = get_path_node_backlink(vm, link_b); + LOG(3 - LOG_CMP_PATHS, "%s: fwd loop: link_b %d, fwd_b %d\n", + __func__, link_b, fwd_b); + assert(fwd_b != link_b); + + LOG(3 - LOG_CMP_PATHS, + "%s: fwd loop, restoring b's backlink: pi[%u].backlink <- %d\n", + __func__, link_b, prev_b); + set_path_node_backlink(vm, link_b, prev_b); + prev_b = link_b; + link_b = fwd_b; + } + } + + LOG(3 - LOG_CMP_PATHS, "%s: res %d\n", __func__, res); + +#if EXPENSIVE_CHECKS + uint64_t hash_a_after = 0; + uint64_t hash_b_after = 0; + hash_step = 0; + link_a = p_a; + while (link_a != NO_ID) { + assert(link_a < vm->paths.ceil); + const uint32_t prev = get_path_node_backlink(vm, link_a); + hash_a_after += hash_id(link_a + hash_step); + link_a = prev; + hash_step++; + } + link_b = p_b; + hash_step = 0; + while (link_b != NO_ID) { + assert(link_b < vm->paths.ceil); + const uint32_t prev = get_path_node_backlink(vm, link_b); + hash_b_after += hash_id(link_b + hash_step); + link_b = prev; + hash_step++; + } + + assert(hash_a_after == hash_a_before); + assert(hash_b_after == hash_b_before); +#endif + + return res; +#undef LOG_CMP_PATHS +} + +static void +handle_possible_matching_path(struct capvm *vm, uint32_t path_info_head, uint32_t uniq_id); + +static bool +copy_path_info(struct capvm *vm, uint32_t path_info_head, + uint32_t *new_path_info_head); + +static bool +extend_path_info(struct capvm *vm, uint32_t path_info_head, bool greedy, uint32_t uniq_id, + uint32_t 
*new_path_info_head); + +/* Push the next execution step onto the stack, if it hasn't already + * been run by a greedier path. Calling this hands off ownership of the + * path_info_head, so it is released if execution will not be resumed + * later. */ +static void +schedule_possible_next_step(struct capvm *vm, enum pair_id pair_id, + uint32_t input_pos, uint32_t op_id, + uint32_t path_info_head, uint32_t uniq_id) +{ + assert(path_info_head != NO_ID); + uint32_t *stack = vm->run_stacks[pair_id]; + uint32_t *stack_h = &vm->run_stacks_h[pair_id]; + assert((*stack_h) < vm->p->used); + + /* If that instruction has already been evaluated, skip the + * redundant execution by a less greedy path. */ + const uint32_t *evaluated = vm->evaluated[pair_id]; + const bool already_evaluated = GET_BIT32(evaluated, op_id) != 0; + LOG(3, "%s: pair_id %u, input_pos %u, op_id %u, path_info_head %u, uniq_id %u, already_evaluated %d, stack_h %u\n", + __func__, pair_id, input_pos, op_id, path_info_head, uniq_id, already_evaluated, *stack_h); + + if (already_evaluated) { + LOG_EXEC_HALT(uniq_id); + release_path_info_link(vm, &path_info_head); + assert(vm->threads.live > 0); + vm->threads.live--; + } else { + + /* If the work being scheduled by the current greediest + * thread pre-empts work scheduled by a less greedy + * thread, release that thread's path link and clear its + * op ID on the run stack. + * + * TODO: Ideally, avoid the linear scan here, but the + * total stack height is bounded by the generated program size + * and should be fairly small in practice. Wait to change this + * untill there are benchmarks in place showing it's necessary. + * + * An extra two bits per opcode (one for each stack) could + * be used to track whether the stack already contains + * op_id, so the linear scan could be avoided except when + * actually necessary. 
*/ + uint32_t cur_pih = vm->path_info_heads[pair_id][op_id]; + if (cur_pih != NO_ID) { + release_path_info_link(vm, &cur_pih); + vm->path_info_heads[pair_id][op_id] = NO_ID; + const size_t h = *stack_h; + for (size_t i = 0; i < h; i++) { + if (stack[i] == op_id) { + stack[i] = NO_ID; /* cancel thread */ + vm->threads.live--; + } + } + } + stack[(*stack_h)++] = op_id; + vm->path_info_heads[pair_id][op_id] = path_info_head; +#if CAPVM_STATS + vm->uniq_ids[pair_id][op_id] = uniq_id; +#endif + + if (*stack_h > vm->threads.max_live) { + vm->threads.max_live = *stack_h; + if (LOG_CAPVM >= 6) { + LOG(0, "==== new vm->threads.max_live: %u\n", vm->threads.max_live); + dump_path_table(stderr, vm); + } + } + } +} + +/* returns whether the vm should continue. */ +static bool +eval_vm_advance_greediest(struct capvm *vm, uint32_t input_pos, + uint32_t path_info_head, uint32_t uniq_id, uint32_t op_id) +{ + LOG(5, "%s: input_pos %u, input_len %u, op_id %u, threads_live %u\n", + __func__, input_pos, vm->input_len, op_id, vm->threads.live); + + assert(op_id < vm->p->used); + + if (vm->stats.steps == vm->step_limit) { + LOG(1, "%s: halting, steps == step_limit %zu\n", + __func__, vm->step_limit); + vm->res = FSM_CAPVM_PROGRAM_EXEC_STEP_LIMIT_REACHED; + return false; + } + vm->stats.steps++; + + const struct capvm_opcode *op = &vm->p->ops[op_id]; + LOG(2, "%s: op_id[%u]: input_pos %u, path_info_head %u, uniq_id %u, op %s\n", + __func__, op_id, input_pos, path_info_head, uniq_id, op_name[op->t]); + LOG_EXEC_OP(uniq_id, input_pos, op_id, op_name[op->t]); + + switch (op->t) { + case CAPVM_OP_CHAR: + if (input_pos == vm->input_len) { + goto halt_thread; /* past end of input */ + } + + LOG(3, "OP_CHAR: input_pos %u, exp char '%c', got '%c'\n", + input_pos, op->u.chr, vm->input[input_pos]); + + if (vm->input[input_pos] != op->u.chr) { + goto halt_thread; /* character mismatch */ + } + LOG_EXEC_CHAR(uniq_id, vm->input[input_pos]); + + schedule_possible_next_step(vm, PAIR_ID_NEXT, input_pos + 
1, op_id + 1, + path_info_head, uniq_id); + break; + + case CAPVM_OP_CHARCLASS: + { + if (input_pos == vm->input_len) { + goto halt_thread; /* past end of input */ + } + + const uint8_t c = vm->input[input_pos]; + const uint32_t cc_id = op->u.charclass_id; + assert(cc_id < vm->p->char_classes.count); + const struct capvm_char_class *class = &vm->p->char_classes.sets[cc_id]; + + if (!(class->octets[c/64] & ((uint64_t)1 << (c&63)))) { + goto halt_thread; /* character not in class */ + } + LOG_EXEC_CHAR(uniq_id, vm->input[input_pos]); + + schedule_possible_next_step(vm, PAIR_ID_NEXT, input_pos + 1, op_id + 1, + path_info_head, uniq_id); + break; + } + + case CAPVM_OP_MATCH: + if (input_pos == vm->input_len) { + handle_possible_matching_path(vm, path_info_head, uniq_id); + } else if (vm->input_len > 0 && input_pos == vm->input_len - 1 + && vm->input[input_pos] == '\n') { + LOG(3, "OP_MATCH: special case for trailing newline\n"); + handle_possible_matching_path(vm, path_info_head, uniq_id); + } + goto halt_thread; + + case CAPVM_OP_JMP: + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op->u.jmp, + path_info_head, uniq_id); + break; + + case CAPVM_OP_JMP_ONCE: + { + /* If the destination for this jump has already been visited + * without advancing input, then skip the jump. This is necessary + * for edge cases like the first branch in `^(^|.$)*`, which would + * otherwise have a backward jump to before the first case, due to + * the repetition, and would effectively be treated as an infinite + * loop and ignored, leading to incorrect match bounds for "x". + * + * Replaying the capture path does not track what has been evaluated, + * so this needs to record the branch in the path. This will make + * repetition more expensive in some cases, but compilation could + * emit a JMP when it's safe to do so. 
*/ + const bool greedy = GET_BIT32(vm->evaluated[PAIR_ID_CURRENT], op->u.jmp_once); + if (greedy) { + /* non-greedy branch -- fall through */ + uint32_t new_path_info_head = NO_ID; + if (!extend_path_info(vm, path_info_head, 0, uniq_id, &new_path_info_head)) { + release_path_info_link(vm, &path_info_head); + goto alloc_error; + } + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + new_path_info_head, uniq_id); + } else { + /* greedy branch -- loop back and potentially match more */ + uint32_t new_path_info_head = NO_ID; + if (!extend_path_info(vm, path_info_head, 1, uniq_id, &new_path_info_head)) { + release_path_info_link(vm, &path_info_head); + goto alloc_error; + } + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op->u.jmp_once, + new_path_info_head, uniq_id); + } + break; + } + + case CAPVM_OP_SPLIT: + { + const uint32_t dst_greedy = op->u.split.greedy; + const uint32_t dst_nongreedy = op->u.split.nongreedy; + + /* destinations must be in range and not self-referential */ + assert(dst_greedy < vm->p->used); + assert(dst_nongreedy < vm->p->used); + assert(dst_greedy != op_id); + assert(dst_nongreedy != op_id); + + uint32_t nongreedy_path_info_head; + if (!copy_path_info(vm, path_info_head, &nongreedy_path_info_head)) { + goto alloc_error; + } + + if (!extend_path_info(vm, path_info_head, 1, uniq_id, &path_info_head)) { + release_path_info_link(vm, &path_info_head); + goto alloc_error; + } + + /* nongreedy is the non-greedy branch */ + if (!extend_path_info(vm, nongreedy_path_info_head, 0, uniq_id, &nongreedy_path_info_head)) { + release_path_info_link(vm, &path_info_head); + goto alloc_error; + } + +#if CAPVM_STATS + const uint32_t nongreedy_uniq_id = ++vm->uniq_id_counter; +#else + const uint32_t nongreedy_uniq_id = 0; +#endif + + vm->threads.live++; + if (vm->threads.live > vm->threads.max_live) { + set_max_threads_live(vm, vm->threads.live); + } + + /* Push the split.nongreedy destination, and then the + * split.greedy 
destination on top of it, so that the + * greedier branch will be fully evaluated first. */ + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, dst_nongreedy, + nongreedy_path_info_head, nongreedy_uniq_id); + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, dst_greedy, + path_info_head, uniq_id); + LOG_EXEC_SPLIT(uniq_id, nongreedy_uniq_id); + + break; + } + + case CAPVM_OP_SAVE: + /* no-op, during this stage */ + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + path_info_head, uniq_id); + break; + + case CAPVM_OP_ANCHOR: + if (op->u.anchor == CAPVM_ANCHOR_START) { + LOG(3, "%s: ^ anchor\n", __func__); + /* ignore a single trailing newline, because PCRE does. + * For ^ this affects the capture position. */ + if (input_pos == 0 + && vm->input_len == 1 + && vm->input[0] == '\n') { + /* allowed */ + LOG(3, "%s: special case: ^ ignoring trailing newline\n", __func__); + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + path_info_head, uniq_id); + break; + } else if (input_pos == 1 + && vm->input_len == 1 + && vm->input[0] == '\n') { + /* allowed */ + } else if (input_pos != 0) { goto halt_thread; } + } else { + assert(op->u.anchor == CAPVM_ANCHOR_END); + LOG(3, "%s: $ anchor: input_len %u, input_pos %u\n", + __func__, vm->input_len, input_pos); + + /* ignore a single trailing newline, because PCRE does */ + if (vm->input_len > 0 && input_pos == vm->input_len - 1) { + if (vm->input[input_pos] != '\n') { + goto halt_thread; + } + LOG(3, "%s: special case: $ allowing trailing newline\n", __func__); + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + path_info_head, uniq_id); + break; + } else if (input_pos != vm->input_len) { + goto halt_thread; + } + } + + schedule_possible_next_step(vm, PAIR_ID_CURRENT, input_pos, op_id + 1, + path_info_head, uniq_id); + break; + + default: + assert(!"unreachable"); + return false; + } + + if (EXPENSIVE_CHECKS) { /* postcondition */ + 
check_path_table(vm); + } + + return true; + +halt_thread: + /* do not push further execution on the run stack */ + LOG_EXEC_HALT(uniq_id); + + release_path_info_link(vm, &path_info_head); + assert(vm->threads.live > 0); + vm->threads.live--; + return true; + +alloc_error: + release_path_info_link(vm, &path_info_head); + vm->res = FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC; + return false; +} + +static void +handle_possible_matching_path(struct capvm *vm, uint32_t pi_id, uint32_t uniq_id) +{ + LOG(3, "\n%s: HIT, pi_id %u, uniq_id %u\n", __func__, pi_id, uniq_id); + + if (LOG_CAPVM >= 3) { + LOG(3, "--- current_live: %u, max_live: %u\n", + vm->threads.live, vm->threads.max_live); + dump_path_table(stderr, vm); + LOG(3, "=====\n"); + } + +#if CAPVM_STATS + vm->stats.matches++; +#endif + + assert(pi_id < vm->paths.ceil); + + if (vm->solution.best_path_id == NO_ID) { + struct capvm_path_info *pi = &vm->paths.pool[pi_id]; + assert(!IS_PATH_FREELIST(pi)); + if (LOG_CAPVM >= 5) { + const uint32_t refcount = get_path_node_refcount(vm, pi_id); + LOG(5, "MATCH: pi_id %u refcount %u -> %u\n", + pi_id, refcount, refcount + 1); + } + inc_path_node_refcount(vm, pi_id); + vm->solution.best_path_id = pi_id; +#if CAPVM_STATS + vm->solution.best_path_uniq_id = uniq_id; +#endif + LOG(3, "MATCH: saved current best solution path (pi_id %u)\n", pi_id); + } else { + /* Compare path info and only keep the path associated + * with the greediest match so far. 
*/ + const int res = cmp_paths(vm, pi_id, vm->solution.best_path_id); + if (res > 0) { + /* replace current best solution */ + struct capvm_path_info *pi = &vm->paths.pool[pi_id]; + assert(!IS_PATH_FREELIST(pi)); + if (LOG_CAPVM >= 5) { + const uint32_t refcount = get_path_node_refcount(vm, pi_id); + LOG(5, "MATCH: pi_id %u refcount %u -> %u\n", + pi_id, refcount, refcount + 1); + } + inc_path_node_refcount(vm, pi_id); + + LOG(3, "MATCH: replacing current best solution path %u with %u\n", + vm->solution.best_path_id, pi_id); + + release_path_info_link(vm, &vm->solution.best_path_id); + vm->solution.best_path_id = pi_id; +#if CAPVM_STATS + vm->solution.best_path_uniq_id = uniq_id; +#endif + } else { + /* keep the current best solution */ + LOG(3, "MATCH: ignoring new solution path %u, keeping %u\n", + pi_id, vm->solution.best_path_id); + } + } +} + +static bool +eval_vm(struct capvm *vm) +{ + uint32_t i_i; + + /* init the path_info_heads tables to NO_ID, except for cell 0 + * in next, which contains the starting point. 
*/ + for (size_t op_i = 0; op_i < vm->p->used; op_i++) { + vm->path_info_heads[PAIR_ID_CURRENT][op_i] = NO_ID; +#if CAPVM_STATS + vm->uniq_ids[PAIR_ID_CURRENT][op_i] = NO_ID; +#endif + } + for (size_t op_i = 1; op_i < vm->p->used; op_i++) { + vm->path_info_heads[PAIR_ID_NEXT][op_i] = NO_ID; +#if CAPVM_STATS + vm->uniq_ids[PAIR_ID_NEXT][op_i] = NO_ID; +#endif + } + + for (i_i = 0; i_i <= vm->input_len; i_i++) { + if (vm->threads.live == 0 + || vm->stats.steps == vm->step_limit) { + LOG(3, "%s: breaking, live %u, steps %zu/%zd\n", + __func__, vm->threads.live, vm->stats.steps, vm->step_limit); + break; + } + LOG(3, "\n###### i_i %u\n", i_i); + + LOG(4, "-- clearing evaluated\n"); + const size_t evaluated_bit_words = vm->p->used/32 + 1; + for (size_t i = 0; i < evaluated_bit_words; i++) { + vm->evaluated[PAIR_ID_CURRENT][i] = 0; + vm->evaluated[PAIR_ID_NEXT][i] = 0; + } + + uint32_t *stack_h = &vm->run_stacks_h[PAIR_ID_CURRENT]; + uint32_t *run_stack = vm->run_stacks[PAIR_ID_CURRENT]; + + /* Copy everything from the next run stack to the + * current. Copy in reverse, so items that were pushed + * earlier by greedier paths end up on the top of the + * stack and evalated first, preserving greedy + * ordering. 
*/ + { + const uint32_t next_stack_h = vm->run_stacks_h[PAIR_ID_NEXT]; + const uint32_t *next_stack = vm->run_stacks[PAIR_ID_NEXT]; + uint32_t *next_path_info_heads = vm->path_info_heads[PAIR_ID_NEXT]; + uint32_t *cur_path_info_heads = vm->path_info_heads[PAIR_ID_CURRENT]; + + uint32_t discarded = 0; + for (size_t i = 0; i < next_stack_h; i++) { + const uint32_t op_id = next_stack[i]; + if (op_id == NO_ID) { + assert(!"unreachable"); + discarded++; + continue; + } + + cur_path_info_heads[op_id] = next_path_info_heads[op_id]; + LOG(3, "%s: run_stack[%zd] <- %u, path_info_head %u\n", + __func__, i, op_id, cur_path_info_heads[op_id]); + assert(next_path_info_heads[op_id] < vm->paths.ceil); + next_path_info_heads[op_id] = NO_ID; /* move reference */ +#if CAPVM_STATS + vm->uniq_ids[PAIR_ID_CURRENT][op_id] = + vm->uniq_ids[PAIR_ID_NEXT][op_id]; +#endif + run_stack[next_stack_h - i - 1 - discarded] = op_id; + } + *stack_h = next_stack_h - discarded; + vm->run_stacks_h[PAIR_ID_NEXT] = 0; + +#if CAPVM_PATH_STATS + /* reset counters */ + for (size_t i = 0; i < vm->paths.ceil; i++) { + struct capvm_path_info *pi = &vm->paths.pool[i]; + if (IS_PATH_NODE(pi)) { + pi->u.path.bits_added_per_input_character = 0; + } + } +#endif + } + + uint32_t *path_info_heads = vm->path_info_heads[PAIR_ID_CURRENT]; + while (vm->run_stacks_h[PAIR_ID_CURRENT] > 0) { + /* Do this here, before popping, so that the reference + * on the stack can be counted properly. 
*/ + if (EXPENSIVE_CHECKS) { + check_path_table(vm); + } + + const uint32_t h = --(*stack_h); + assert(h < vm->p->used); + const uint32_t op_id = run_stack[h]; + LOG(4, "%s: popped op_id %d off stack\n", __func__, op_id); + if (op_id == NO_ID) { + LOG(4, "%s: ignoring halted pending execution\n", __func__); + continue; + } + assert(op_id < vm->p->used); + + if (GET_BIT32(vm->evaluated[PAIR_ID_CURRENT], op_id)) { + LOG(2, "%s: evaluated[current] already set for op_id %u (popped off stack), skipping\n", + __func__, op_id); + assert(!"unreachable"); + continue; + } + + LOG(4, "%s: setting evaluated[current] for op_id %u (popped off stack)\n", __func__, op_id); + SET_BIT32(vm->evaluated[PAIR_ID_CURRENT], op_id); + + const uint32_t path_info_head = path_info_heads[op_id]; + LOG(4, "%s: op_id %d's path_info_head: %d\n", __func__, op_id, path_info_head); + path_info_heads[op_id] = NO_ID; + + +#if CAPVM_STATS + const uint32_t uniq_id = vm->uniq_ids[PAIR_ID_CURRENT][op_id]; + assert(uniq_id != NO_ID); +#else + const uint32_t uniq_id = 0; +#endif + if (!eval_vm_advance_greediest(vm, i_i, path_info_head, uniq_id, op_id)) { + return false; + } + } + + +#if CAPVM_PATH_STATS + uint32_t max_path_bits_added = 0; + for (size_t i = 0; i < vm->paths.ceil; i++) { + const struct capvm_path_info *pi = &vm->paths.pool[i]; + if (IS_PATH_NODE(pi)) { + if (pi->u.path.bits_added_per_input_character > max_path_bits_added) { + max_path_bits_added = pi->u.path.bits_added_per_input_character; + } + } + } + LOG(2, "%s: input_i %u: max_path_bits_added: %u\n", + __func__, i_i, max_path_bits_added); + if (max_path_bits_added > vm->stats.max_bits_added_per_input_character) { + vm->stats.max_bits_added_per_input_character = max_path_bits_added; + } + + if (CAPVM_PATH_STATS > 1) { + dump_path_table(stderr, vm); + } +#endif + } + + return vm->solution.best_path_id != NO_ID; +} + +static bool +copy_path_info(struct capvm *vm, uint32_t path_info_head, + uint32_t *new_path_info_head) +{ + if 
(!reserve_path_info_link(vm, new_path_info_head)) { + return false; + } + + assert(path_info_head != NO_ID); + assert(path_info_head < vm->paths.ceil); + assert(*new_path_info_head < vm->paths.ceil); + assert(*new_path_info_head != path_info_head); + + /* Since this is the path head, it can never be a collapsed + * zero prefix node. */ + const struct capvm_path_info *pi = &vm->paths.pool[path_info_head]; + assert(IS_PATH_NODE(pi)); + + struct capvm_path_info *npi = &vm->paths.pool[*new_path_info_head]; + assert(IS_PATH_FREELIST(npi)); + + /* unlink from freelist */ + vm->paths.freelist_head = npi->u.freelist_node.freelist; + vm->paths.live++; + if (vm->paths.live > vm->paths.max_live) { + set_max_paths_live(vm); + } + + *npi = (struct capvm_path_info){ + .u.path = { + .refcount = 1, + .used = pi->u.path.used, + .bits = pi->u.path.bits, + .offset = pi->u.path.offset, + .backlink = pi->u.path.backlink, + } + }; + if (pi->u.path.backlink != NO_ID) { + inc_path_node_refcount(vm, pi->u.path.backlink); + } + return true; +} + +#if CAPVM_PATH_STATS +static void +update_max_path_length_memory(struct capvm *vm, const struct capvm_path_info *pi) +{ + const uint32_t len = pi->u.path.used + + (pi->u.path.backlink == COLLAPSED_ZERO_PREFIX_ID + ? 
0 /* not actually stored, so don't count it */ + : pi->u.path.offset); + + if (len > vm->stats.max_path_length_memory) { + vm->stats.max_path_length_memory = len; + } +} +#endif + +static bool +extend_path_info(struct capvm *vm, uint32_t pi_id, bool greedy, uint32_t uniq_id, + uint32_t *new_path_info_head) +{ + assert(pi_id < vm->paths.ceil); + struct capvm_path_info *pi = &vm->paths.pool[pi_id]; + assert(IS_PATH_NODE(pi)); + + (void)uniq_id; + LOG_EXEC_PATH_FIND_SOLUTION(uniq_id, greedy); + +#define LOG_EPI 0 + LOG(5 - LOG_EPI, "%s: pi_id %u, greedy %d\n", + __func__, pi_id, greedy); + + + if (pi->u.path.used == PATH_LINK_BITS) { /* full */ + uint32_t npi_id; + if (!reserve_path_info_link(vm, &npi_id)) { + assert(!"alloc fail"); + } + pi = &vm->paths.pool[pi_id]; /* refresh stale pointer */ + LOG(5 - LOG_EPI, "%s: new head at %u (%u is full)\n", __func__, npi_id, pi_id); + assert(npi_id < vm->paths.ceil); + struct capvm_path_info *npi = &vm->paths.pool[npi_id]; + vm->paths.freelist_head = npi->u.freelist_node.freelist; + vm->paths.live++; + if (vm->paths.live > vm->paths.max_live) { + set_max_paths_live(vm); + } + + LOG(5 - LOG_EPI, "%s: npi_id %u refcount 1 (new link)\n", + __func__, npi_id); + npi->u.path.refcount = 1; + npi->u.path.offset = pi->u.path.offset + pi->u.path.used; + npi->u.path.bits = (greedy ? ((uint32_t)1 << 31) : 0); + LOG(5 - LOG_EPI, "%s: bits after: 0x%08x\n", __func__, npi->u.path.bits); + npi->u.path.used = 1; + +#if CAPVM_PATH_STATS + npi->u.path.bits_added_per_input_character = pi->u.path.bits_added_per_input_character + 1; +#endif + + /* If the path node is full of zero bits and it's either at the start, + * or its backlink is a COLLAPSED_ZERO_PREFIX_ID, then extend the + * backlink to a collapsed run of zeroes. The node's offset field + * indicates the prefix length. Long prefixes of zero bits tend to + * occur with an unanchored start loop. 
*/ + if (pi->u.path.bits == (uint32_t)0 && USE_COLLAPSED_ZERO_PREFIX + && (pi->u.path.offset == 0 || pi->u.path.backlink == COLLAPSED_ZERO_PREFIX_ID)) { + release_path_info_link(vm, &pi_id); + pi_id = COLLAPSED_ZERO_PREFIX_ID; + +#if CAPVM_STATS + vm->stats.collapsed_zero_prefixes++; +#endif + } else { + /* Check if there's an existing full path node with + * exactly the same bits. If so, link backward to that + * and free the old full one, rather than saving it as + * a duplicate. */ + const uint32_t old_path_bits = pi->u.path.bits; + const uint32_t old_path_offset = pi->u.path.offset; + const uint32_t old_path_backlink = pi->u.path.backlink; + + for (uint32_t epi_id = 0; epi_id < vm->paths.ceil; epi_id++) { + if (epi_id == pi_id) { continue; } + struct capvm_path_info *epi = &vm->paths.pool[epi_id]; + if (IS_PATH_FREELIST(epi)) { + continue; + } + + assert(IS_PATH_NODE(epi)); + if (epi->u.path.used == PATH_LINK_BITS + && epi->u.path.bits == old_path_bits + && epi->u.path.offset == old_path_offset + && epi->u.path.backlink == old_path_backlink) { + + if (LOG_CAPVM >= 4 || 1) { + const uint32_t refcount = get_path_node_refcount(vm, epi_id); + (void)refcount; + LOG(4 - LOG_EPI, "%s: pi[%u] refcount %u -> %u (reusing identical path backlink %u instead of %u)\n", + __func__, epi_id, refcount, refcount + 1, + epi_id, pi_id); + } + inc_path_node_refcount(vm, epi_id); + release_path_info_link(vm, &pi_id); + pi_id = epi_id; +#if CAPVM_STATS + vm->stats.path_prefixes_shared++; +#endif + break; + } + } + } + + assert(IS_PATH_NODE(npi)); + npi->u.path.backlink = pi_id; + /* transfer pi_id's reference to npi_id */ + *new_path_info_head = npi_id; + +#if CAPVM_PATH_STATS + update_max_path_length_memory(vm, npi); +#endif + + return true; + } else { + assert(IS_PATH_NODE(pi)); + assert(pi->u.path.used < PATH_LINK_BITS); + + LOG(5 - LOG_EPI, "%s: appending to head node %u, %u -> %u used\n", + __func__, pi_id, pi->u.path.used, pi->u.path.used + 1); + assert(pi->u.path.used < 
PATH_LINK_BITS); + if (greedy) { + LOG(5 - LOG_EPI, "%s: bits before: 0x%08x (greedy: %d)\n", + __func__, pi->u.path.bits, greedy); + pi->u.path.bits |= (uint32_t)1 << (31 - pi->u.path.used); + LOG(5 - LOG_EPI, "%s: bits after: 0x%08x\n", + __func__, pi->u.path.bits); + } + pi->u.path.used++; +#if CAPVM_PATH_STATS + pi->u.path.bits_added_per_input_character++; +#endif + +#if CAPVM_PATH_STATS + update_max_path_length_memory(vm, pi); +#endif + + *new_path_info_head = pi_id; + return true; + } +#undef LOG_EPI +} + +static void +populate_solution(struct capvm *vm) +{ + if (LOG_CAPVM >= 3) { + fsm_capvm_program_dump(stderr, vm->p); + LOG(0, "%s: best_path_id %d, tables:\n", __func__, vm->solution.best_path_id); + dump_path_table(stderr, vm); + check_path_table(vm); + fprintf(stderr, "SOLUTION_PATH: "); + print_path(stderr, vm, vm->solution.best_path_id); + fprintf(stderr, "\n"); + } + +#if CAPVM_PATH_STATS + LOG(1, "%s: prog_size %u, max_path_length_memory %u (bits)\n", + __func__, vm->p->used, vm->stats.max_path_length_memory); + const uint32_t uniq_id = vm->solution.best_path_uniq_id; +#else + const uint32_t uniq_id = NO_ID; +#endif + (void)uniq_id; + + /* Interpret the program again, but rather than using the input to + * drive execution, use the saved path for the primary solution. */ + + /* Walk the solution path, reversing the edges temporarily so it + * can be executed start to finish, and truncate any bits appended + * after branches on the path. 
*/ + assert(vm->solution.best_path_id != NO_ID); + assert(vm->solution.best_path_id < vm->paths.ceil); + + uint32_t path_link = vm->solution.best_path_id; + uint32_t next_link = NO_ID; + uint32_t first_link = NO_ID; + + size_t split_count = 0; + uint32_t zero_prefix_length = 0; + + if (LOG_CAPVM >= 3) { + const struct capvm_path_info *pi = &vm->paths.pool[path_link]; + assert(!IS_PATH_FREELIST(pi)); + LOG(3, "%s: best_path %d, path_length %u\n", + __func__, vm->solution.best_path_id, pi->u.path.offset + pi->u.path.used); + if (LOG_CAPVM > 4) { + dump_path_table(stderr, vm); + } + } + + uint32_t prev; + do { + struct capvm_path_info *pi = &vm->paths.pool[path_link]; + assert(!IS_PATH_FREELIST(pi)); + const uint32_t prev_link = get_path_node_backlink(vm, path_link); + + if (LOG_CAPVM >= 3) { + if (IS_PATH_NODE(pi)) { + LOG(3, "%s (moving back), node %u: refcount %u, used %u, offset %u, backlink %d, bits '", + __func__, path_link, pi->u.path.refcount, pi->u.path.used, + pi->u.path.offset, pi->u.path.backlink); + for (uint8_t i = 0; i < pi->u.path.used; i++) { + const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); + LOG(3, "%c", bit ? 
'1' : '0'); + } + LOG(3, "'\n"); + } + } + + split_count += pi->u.path.used; + + if (next_link != NO_ID) { + LOG(3, "-- setting backlink to %d\n", next_link); + set_path_node_backlink(vm, path_link, next_link); /* point fwd */ + } else { + LOG(3, "-- setting backlink to %d\n", NO_ID); + set_path_node_backlink(vm, path_link, NO_ID); /* now EOL */ + } + + if (prev_link == NO_ID) { + first_link = path_link; + prev = prev_link; + } else if (prev_link == COLLAPSED_ZERO_PREFIX_ID) { + first_link = path_link; + split_count += pi->u.path.offset; + zero_prefix_length = pi->u.path.offset; + prev = prev_link; + } + + next_link = path_link; + assert(path_link != prev_link); + path_link = prev_link; + } while (path_link != NO_ID && path_link != COLLAPSED_ZERO_PREFIX_ID); + + /* iter forward */ + if (LOG_CAPVM >= 3) { + uint32_t cur = first_link; + do { + struct capvm_path_info *pi = &vm->paths.pool[cur]; + + assert(IS_PATH_NODE(pi)); + LOG(3, "%s (moving fwd): node %u: refcount %u, used %u, offset %u, fwdlink %d, bits '", + __func__, cur, get_path_node_refcount(vm, cur), + pi->u.path.used, + get_path_node_offset(vm, cur), + get_path_node_backlink(vm, cur)); + for (uint8_t i = 0; i < pi->u.path.used; i++) { + const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); + LOG(3, "%c", bit ? '1' : '0'); + } + LOG(3, "'\n"); + + const uint32_t next_cur = get_path_node_backlink(vm, cur); + assert(cur != next_cur); + cur = next_cur; /* fwd link */ + } while (cur != NO_ID); + } + + /* evaluate program with forward path */ + LOG(3, "%s: split_count %zu\n", __func__, split_count); + size_t split_i = 0; + uint32_t prog_i = 0; + uint32_t input_i = 0; + size_t capture_lookup_steps = 0; + bool done = false; + + /* This flag tracks whether an explicit newline was matched at + * the end of input. Normally a trailing newline is implicitly + * ignored in the bounds for captures, but when the regex + * matches a newline at the end, it must still be included. 
An + * example case where this matters is `^[^x]$` for "\n", because + * the character class matches the newline this should capture + * as (0,1). */ + bool explicitly_matched_nl_at_end = false; + + uint32_t cur = first_link; + while (split_i < split_count || !done) { + assert(prog_i < vm->p->used); + const uint32_t cur_prog_i = prog_i; + const struct capvm_opcode *op = &vm->p->ops[cur_prog_i]; + LOG(3, "%s: i_i %u, p_i %u, s_i %zu/%zu, op %s\n", + __func__, input_i, cur_prog_i, split_i, split_count, op_name[op->t]); + + prog_i++; + capture_lookup_steps++; + switch (op->t) { + case CAPVM_OP_CHAR: + assert(input_i < vm->input_len); + LOG(3, "OP_CHAR: input_i %u, exp char '%c', got '%c'\n", + input_i, op->u.chr, vm->input[input_i]); + assert(vm->input[input_i] == op->u.chr); + if (vm->input_len > 0 + && input_i == vm->input_len - 1 + && vm->input[input_i] == '\n') { + explicitly_matched_nl_at_end = true; + } + input_i++; + break; + case CAPVM_OP_CHARCLASS: + assert(input_i < vm->input_len); + if (vm->input_len > 0 + && input_i == vm->input_len - 1 + && vm->input[input_i] == '\n') { + explicitly_matched_nl_at_end = true; + } + input_i++; + break; + case CAPVM_OP_MATCH: + LOG(2, "split_i %zu, split_count %zu\n", split_i, split_count); + assert(split_i == split_count); + done = true; + break; + case CAPVM_OP_JMP: + prog_i = op->u.jmp; + break; + case CAPVM_OP_JMP_ONCE: + { + /* look at next bit of path and jmp or fall through */ + const uint32_t offset = get_path_node_offset(vm, cur); + const struct capvm_path_info *pi = &vm->paths.pool[cur]; + + assert(IS_PATH_NODE(pi)); + bool next_bit; + LOG(3, "%s: OP_JMP_ONCE: split_i %zu, zpl %u, offset %u, pi->u.path.used %u\n", + __func__, split_i, zero_prefix_length, offset, pi->u.path.used); + if (split_i < zero_prefix_length) { + next_bit = 0; + } else { + assert(split_i >= offset && + split_i <= offset + pi->u.path.used); + const uint32_t shift = 31 - (split_i & 31); + assert(shift < PATH_LINK_BITS); + next_bit = 
(pi->u.path.bits & ((uint32_t)1 << shift)) != 0; + } + LOG(3, "jmp_once: next_bit %d\n", next_bit); + LOG_EXEC_PATH_SAVE_CAPTURES(uniq_id, next_bit); + if (next_bit) { /* greedy edge */ + prog_i = op->u.jmp_once; + } else { /* non-greedy edge */ + /* fall through */ + } + split_i++; + if (split_i >= offset && + split_i - offset == pi->u.path.used && split_i < split_count) { + const uint32_t backlink = get_path_node_backlink(vm, cur); + assert(backlink != NO_ID); + cur = backlink; + } + LOG(3, "%s: prog_i now %u, split_i %zu/%zu\n", + __func__, prog_i, split_i, split_count); + assert(split_i <= split_count); + break; + } + case CAPVM_OP_SPLIT: + { + /* look at next bit of path and act accordingly */ + const uint32_t offset = get_path_node_offset(vm, cur); + const struct capvm_path_info *pi = &vm->paths.pool[cur]; + + const uint32_t dst_greedy = op->u.split.greedy; + const uint32_t dst_nongreedy = op->u.split.nongreedy; + + assert(IS_PATH_NODE(pi)); + bool next_bit; + LOG(3, "%s: OP_SPLIT_CONT: split_i %zu, zpl %u, offset %u, pi->u.path.used %u\n", + __func__, split_i, zero_prefix_length, offset, pi->u.path.used); + if (split_i < zero_prefix_length) { + next_bit = 0; + } else { + assert(split_i >= offset && + split_i <= offset + pi->u.path.used); + const uint32_t shift = 31 - (split_i & 31); + assert(shift < PATH_LINK_BITS); + next_bit = (pi->u.path.bits & ((uint32_t)1 << shift)) != 0; + } + LOG(3, "split: next_bit %d\n", next_bit); + LOG_EXEC_PATH_SAVE_CAPTURES(uniq_id, next_bit); + if (next_bit) { /* greedy edge */ + prog_i = dst_greedy; + } else { /* non-greedy edge */ + prog_i = dst_nongreedy; + } + split_i++; + if (split_i >= offset && + split_i - offset == pi->u.path.used && split_i < split_count) { + const uint32_t backlink = get_path_node_backlink(vm, cur); + assert(backlink != NO_ID); + cur = backlink; + } + LOG(3, "%s: prog_i now %u, split_i %zu/%zu\n", + __func__, prog_i, split_i, split_count); + assert(split_i <= split_count); + + break; + } + case 
CAPVM_OP_SAVE: + { + const unsigned capture_id = op->u.save/2; + const bool is_end = (op->u.save & 1) == 1; + + LOG(5, "%s: input_i %u, save %d -> capture %d pos %d, cur value %zd, prev char 0x%02x\n", + __func__, + input_i, op->u.save, + capture_id, is_end, + vm->capture_buf[op->u.save/2].pos[op->u.save & 1], + input_i > 0 ? vm->input[input_i - 1] : 0xff); + + /* Special case to ignore a trailing + * newline when capturing, unless the + * newline was explicitly matched as the + * last character of input. */ + if (input_i > 0 + && !explicitly_matched_nl_at_end + && input_i == vm->input_len + && vm->input[input_i - 1] == '\n') { + LOG(3, "%s: updating capture[%u].pos[1] to ignore trailing '\\n' at %u\n", + __func__, capture_id, input_i); + vm->capture_buf[capture_id].pos[is_end] = input_i - 1; + } else { + /* Save current position to appropriate capture buffer endpoint */ + vm->capture_buf[op->u.save/2].pos[op->u.save & 1] = input_i; + LOG(3, "%s: saved capture[%d].pos[%d] <- %u\n", + __func__, op->u.save/2, op->u.save&1, input_i); + } + break; + } + case CAPVM_OP_ANCHOR: + if (op->u.anchor == CAPVM_ANCHOR_START) { + assert(input_i == 0 + || (input_i == 1 + && vm->input_len == 1 + && vm->input[0] == '\n')); + } else { + assert(op->u.anchor == CAPVM_ANCHOR_END); + LOG(3, "%s: $ anchor: input_len %u, input_i %u\n", + __func__, vm->input_len, input_i); + + if (vm->input_len > 0 && input_i == vm->input_len - 1) { + /* special hack to not include trailing newline + * in match group zero */ + if (vm->p->capture_count > 0) { + vm->capture_buf[0].pos[1] = input_i; + } + + assert(vm->input[input_i] == '\n'); + input_i++; + } else { + assert(input_i == vm->input_len); + } + } + break; + + default: + assert(!"match fail"); + } + } + + /* write solution into caller's buffers and print */ + if (LOG_SOLUTION_TO_STDOUT) { + /* fprintf(stderr, "capture_count %u\n", vm->p->capture_count); */ + printf("HIT:"); + for (unsigned i = 0; i < vm->p->capture_count; i++) { + printf(" %zd 
%zd", + vm->capture_buf[i].pos[0], vm->capture_buf[i].pos[1]); + } + printf("\n"); + } + + /* restore original link order */ + cur = first_link; + do { + struct capvm_path_info *pi = &vm->paths.pool[cur]; + assert(!IS_PATH_FREELIST(pi)); + const uint32_t backlink = get_path_node_backlink(vm, cur); + + LOG(3, "%s (moving fwd again): node %u: refcount %u, used %u, offset %u, fwdlink %d, bits '", + __func__, cur, get_path_node_refcount(vm, cur), + pi->u.path.used, + get_path_node_offset(vm, cur), + backlink); + for (uint8_t i = 0; i < pi->u.path.used; i++) { + const uint32_t bit = (pi->u.path.bits & ((uint32_t)1 << (31 - i))); + LOG(3, "%c", (pi->u.path.bits & bit) ? '1' : '0'); + (void)bit; + } + LOG(3, "'\n"); + + LOG(3, "-- setting node %u's backlink to %d\n", cur, prev); + const uint32_t next = backlink; + set_path_node_backlink(vm, cur, prev); + + prev = cur; + cur = next; /* fwd link */ + } while (cur != NO_ID); +} + +/* TODO: It should be possible to avoid dynamic allocation here + * by calculating the max space needed upfront and passing in a + * uint32_t or uint64_t-aligned array for working space. */ + +enum fsm_capvm_program_exec_res +fsm_capvm_program_exec(const struct capvm_program *program, + const uint8_t *input, size_t length, + struct fsm_capture *capture_buf, size_t capture_buf_length) +{ + assert(program != NULL); + assert(input != NULL || length == 0); + assert(capture_buf != NULL); + + const size_t thread_max = program->used; + + /* FIXME: The path node table can grow beyond this, but in + * practice will usually stay fairly small. The worst case + * should be decidable based on the compiled program and input + * length. 
*/ +#if ALLOW_PATH_TABLE_RESIZING + const size_t path_info_max = thread_max; +#else + const size_t path_info_max = 3 * thread_max; +#endif + + struct capvm_path_info *path_info_pool = malloc(path_info_max + * sizeof(path_info_pool[0])); + if (path_info_pool == NULL) { + return FSM_CAPVM_PROGRAM_EXEC_ERROR_ALLOC; + } + assert(path_info_pool != NULL); + + /* link path_info freelist */ + for (size_t i = 1; i < path_info_max - 1; i++) { + struct capvm_path_info *pi = &path_info_pool[i]; + pi->u.freelist_node.refcount = 0; + pi->u.freelist_node.freelist = i + 1; + } + struct capvm_path_info *piZ = &path_info_pool[path_info_max - 1]; + piZ->u.freelist_node.refcount = 0; + piZ->u.freelist_node.freelist = NO_ID; + + /* init an empty path descriptor for initial execution */ + struct capvm_path_info *pi0 = &path_info_pool[0]; + pi0->u.path.refcount = 1; + pi0->u.path.used = 0; + pi0->u.path.bits = 0; + pi0->u.path.offset = 0; + pi0->u.path.backlink = NO_ID; + + uint32_t stack_a[thread_max]; + uint32_t stack_b[thread_max]; + + const size_t evaluated_bit_words = program->used/32 + 1; + uint32_t evaluated_a[evaluated_bit_words]; + uint32_t evaluated_b[evaluated_bit_words]; + uint32_t path_info_head_a[thread_max]; + uint32_t path_info_head_b[thread_max]; +#if CAPVM_STATS + uint32_t uniq_ids_a[thread_max]; + uint32_t uniq_ids_b[thread_max]; +#endif + + assert(capture_buf_length >= program->capture_base + program->capture_count); + + struct fsm_capture *offset_capture_buf = &capture_buf[program->capture_base]; + + struct capvm vm = { + .res = FSM_CAPVM_PROGRAM_EXEC_NO_SOLUTION_FOUND, + .p = program, + .input = input, + .input_len = length, + .capture_buf = offset_capture_buf, + .capture_buf_length = capture_buf_length, + .step_limit = SIZE_MAX, +#if CAPVM_STATS + .uniq_id_counter = 0, +#endif + + .run_stacks = { stack_a, stack_b }, + .evaluated = { evaluated_a, evaluated_b }, + .path_info_heads = { path_info_head_a, path_info_head_b }, +#if CAPVM_STATS + .uniq_ids = { uniq_ids_a, 
uniq_ids_b }, +#endif + + .paths = { + .ceil = path_info_max, + .live = 1, + .max_live = 1, + .freelist_head = 1, + .pool = path_info_pool, + }, + .solution = { + .best_path_id = NO_ID, + }, + }; + + /* enqueue execution at first opcode */ + vm.run_stacks[PAIR_ID_NEXT][0] = 0; + vm.run_stacks_h[PAIR_ID_NEXT] = 1; + vm.threads.live = 1; + vm.threads.max_live = 1; + vm.path_info_heads[PAIR_ID_NEXT][0] = 0; + +#if CAPVM_STATS + vm.uniq_ids[PAIR_ID_NEXT][0] = 0; +#endif + + INIT_TIMERS(); + TIME(&pre); + if (eval_vm(&vm)) { + assert(vm.threads.live == 0); + assert(vm.paths.live > 0); + + populate_solution(&vm); + release_path_info_link(&vm, &vm.solution.best_path_id); + vm.res = FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN; + + /* TODO: This assert will not work if refcounts are + * sticky at the max value, but if the number of paths + * and threads is bounded then it shouldn't be possible + * to overflow the refcount anyway. If sticky refcounts + * are used then reaching one should probably set a + * flag, which would skip this assertion. 
*/ + assert(vm.paths.live == 0); + } else { + assert(vm.res != FSM_CAPVM_PROGRAM_EXEC_SOLUTION_WRITTEN); + } + + TIME(&post); + DIFF_MSEC(__func__, pre, post, NULL); + +#if CAPVM_STATS + LOG(2, "%s: %zu steps, max_threads %u, max_paths %u, matches %u, path_prefixes_shared %u, collapsed_zero_prefixes %u\n", + __func__, vm.stats.steps, vm.threads.max_live, vm.paths.max_live, vm.stats.matches, + vm.stats.path_prefixes_shared, vm.stats.collapsed_zero_prefixes); +#if CAPVM_PATH_STATS + LOG(2, "%s: prog_size %u, max_path_length_memory %u (bits), input length %zu, max_paths * %zu bytes/path => %zu bytes\n", + __func__, vm.p->used, vm.stats.max_path_length_memory, length, + sizeof(vm.paths.pool[0]), + vm.paths.max_live * sizeof(vm.paths.pool[0])); +#endif +#endif + + free(vm.paths.pool); + return vm.res; +} diff --git a/src/libfsm/capture_vm_log.h b/src/libfsm/capture_vm_log.h new file mode 100644 index 000000000..8ff51d8b4 --- /dev/null +++ b/src/libfsm/capture_vm_log.h @@ -0,0 +1,21 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#ifndef CAPTURE_VM_LOG_H +#define CAPTURE_VM_LOG_H + +#include + +#define LOG_CAPVM (1+0) +#define LOG(LEVEL, ...) \ + do { \ + if ((LEVEL) <= LOG_CAPVM) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while(0) + + +#endif diff --git a/src/libfsm/capture_vm_program.h b/src/libfsm/capture_vm_program.h new file mode 100644 index 000000000..0b24ffb5b --- /dev/null +++ b/src/libfsm/capture_vm_program.h @@ -0,0 +1,74 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. 
+ */ + +#ifndef CAPTURE_VM_PROGRAM_H +#define CAPTURE_VM_PROGRAM_H + +#include + +struct capvm_program { + uint32_t capture_count; + uint32_t capture_base; + + uint32_t used; + uint32_t ceil; + struct capvm_opcode { + enum capvm_opcode_type { + /* Next character of input == .u.chr */ + CAPVM_OP_CHAR, + /* Next character of input is in char class */ + CAPVM_OP_CHARCLASS, + /* Input has matched */ + CAPVM_OP_MATCH, + /* Unconditional jump */ + CAPVM_OP_JMP, + /* If destination has already been evaluated + * since advancing the input position, fall + * through to next instruction, otherwise jmp. */ + CAPVM_OP_JMP_ONCE, + /* Split execution to two paths, where .u.split.greedy + * is the greedier destination of the two. */ + CAPVM_OP_SPLIT, + /* Save current input position as capture bound */ + CAPVM_OP_SAVE, + /* Check if current input position is at start/end + * of input, after accounting for PCRE's special + * cases for a trailing newline. */ + CAPVM_OP_ANCHOR, + } t; + union { + uint8_t chr; + uint32_t charclass_id; + uint32_t jmp; /* absolute */ + uint32_t jmp_once; /* absolute */ + struct { + uint32_t greedy; + uint32_t nongreedy; + } split; + /* (save >> 1): capture ID, + * (save & 0x01): save pos to start (0b0) or end (0b1). */ + uint32_t save; + enum capvm_anchor_type { + CAPVM_ANCHOR_START, + CAPVM_ANCHOR_END, + } anchor; + } u; + } *ops; + + /* Most compiled programs only use a few distinct character + * classes (if any), and the data is much larger than the + * other instructions, so they are stored in a separate + * table and referred to by op->u.charclass_id. 
*/ + struct capvm_char_classes { + uint32_t count; + uint32_t ceil; + struct capvm_char_class { + uint64_t octets[4]; /* 256-bitset */ + } *sets; + } char_classes; +}; + +#endif diff --git a/src/libfsm/clone.c b/src/libfsm/clone.c index bec16bb0f..8b7b606c8 100644 --- a/src/libfsm/clone.c +++ b/src/libfsm/clone.c @@ -21,10 +21,7 @@ #define LOG_CLONE_ENDIDS 0 static int -copy_capture_actions(struct fsm *dst, const struct fsm *src); - -static int -copy_end_ids(struct fsm *dst, const struct fsm *src); +copy_end_metadata(struct fsm *dst, const struct fsm *src); struct fsm * fsm_clone(const struct fsm *fsm) @@ -70,12 +67,7 @@ fsm_clone(const struct fsm *fsm) } { - if (!copy_capture_actions(new, fsm)) { - fsm_free(new); - return NULL; - } - - if (!copy_end_ids(new, fsm)) { + if (!copy_end_metadata(new, fsm)) { fsm_free(new); return NULL; } @@ -84,75 +76,81 @@ fsm_clone(const struct fsm *fsm) return new; } -struct copy_capture_actions_env { +struct copy_end_ids_env { + char tag; struct fsm *dst; int ok; }; static int -copy_capture_actions_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) +copy_end_ids_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, const fsm_end_id_t id, void *opaque) { - struct copy_capture_actions_env *env = opaque; - assert(env->dst); + struct copy_end_ids_env *env = opaque; + enum fsm_endid_set_res sres; + assert(env->tag == 'c'); + (void)fsm; + (void)nth; - if (!fsm_capture_add_action(env->dst, - state, type, capture_id, to)) { +#if LOG_CLONE_ENDIDS + fprintf(stderr, "clone[%d] <- %d\n", state, id); +#endif + + sres = fsm_endid_set(env->dst, state, id); + if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { env->ok = 0; + return 0; } - return env->ok; + return 1; } static int -copy_capture_actions(struct fsm *dst, const struct fsm *src) +copy_active_capture_ids_cb(fsm_state_t state, unsigned capture_id, void *opaque) { - struct copy_capture_actions_env env = { NULL, 1 }; - env.dst = dst; + 
struct copy_end_ids_env *env = opaque; - fsm_capture_action_iter(src, - copy_capture_actions_cb, &env); - return env.ok; + if (!fsm_capture_set_active_for_end(env->dst, + capture_id, + state)) { + env->ok = 0; + return 0; + } + return 1; } -struct copy_end_ids_env { - char tag; - struct fsm *dst; - const struct fsm *src; - int ok; -}; - static int -copy_end_ids_cb(fsm_state_t state, const fsm_end_id_t id, void *opaque) +associate_capture_programs_cb(fsm_state_t state, unsigned prog_id, void *opaque) { struct copy_end_ids_env *env = opaque; - enum fsm_endid_set_res sres; - assert(env->tag == 'c'); -#if LOG_CLONE_ENDIDS - fprintf(stderr, "clone[%d] <- %d\n", state, id); -#endif - - sres = fsm_endid_set(env->dst, state, id); - if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { + if (!fsm_capture_associate_program_with_end_state(env->dst, + prog_id, state)) { env->ok = 0; return 0; } - return 1; } static int -copy_end_ids(struct fsm *dst, const struct fsm *src) +copy_end_metadata(struct fsm *dst, const struct fsm *src) { struct copy_end_ids_env env; env.tag = 'c'; /* for clone */ env.dst = dst; - env.src = src; env.ok = 1; fsm_endid_iter(src, copy_end_ids_cb, &env); + fsm_capture_iter_active_for_all_end_states(src, + copy_active_capture_ids_cb, &env); + + if (!fsm_capture_copy_programs(src, dst)) { + return 0; + } + + fsm_capture_iter_program_ids_for_all_end_states(src, + associate_capture_programs_cb, &env); + return env.ok; } diff --git a/src/libfsm/closure.c b/src/libfsm/closure.c index fa2d0c783..9ebf48eb9 100644 --- a/src/libfsm/closure.c +++ b/src/libfsm/closure.c @@ -128,7 +128,7 @@ epsilon_closure_single(const struct fsm *fsm, struct state_set **closures, fsm_s } struct state_set ** -epsilon_closure(struct fsm *fsm) +fsm_epsilon_closure(struct fsm *fsm) { struct state_set **closures; fsm_state_t s; @@ -190,7 +190,7 @@ epsilon_closure(struct fsm *fsm) } void -closure_free(struct state_set **closures, size_t n) +fsm_closure_free(struct state_set **closures, size_t n) 
{ fsm_state_t s; diff --git a/src/libfsm/consolidate.c b/src/libfsm/consolidate.c index 4518d3926..a5fb98878 100644 --- a/src/libfsm/consolidate.c +++ b/src/libfsm/consolidate.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" #include "capture.h" @@ -26,27 +27,15 @@ #define LOG_MAPPING 0 #define LOG_CONSOLIDATE_CAPTURES 0 -#define LOG_CONSOLIDATE_ENDIDS 0 +#define LOG_CONSOLIDATE_END_METADATA 0 struct mapping_closure { size_t count; const fsm_state_t *mapping; }; -struct consolidate_copy_capture_actions_env { - char tag; - struct fsm *dst; - size_t mapping_count; - const fsm_state_t *mapping; - int ok; -}; - -static int -consolidate_copy_capture_actions(struct fsm *dst, const struct fsm *src, - const fsm_state_t *mapping, size_t mapping_count); - static int -consolidate_end_ids(struct fsm *dst, const struct fsm *src, +consolidate_end_metadata(struct fsm *dst, const struct fsm *src, const fsm_state_t *mapping, size_t mapping_count); static fsm_state_t @@ -67,7 +56,16 @@ fsm_consolidate(const struct fsm *src, struct mapping_closure closure; size_t max_used = 0; +#if LOG_CONSOLIDATE_END_METADATA > 1 + fprintf(stderr, "==== fsm_consolidate -- endid_info before:\n"); + fsm_endid_dump(stderr, src); + fsm_capture_dump_active_for_ends(stderr, src); +#endif + assert(src != NULL); + if (mapping_count == 0) { + return fsm_clone(src); + } assert(src->opt != NULL); dst = fsm_new(src->opt); @@ -76,12 +74,14 @@ fsm_consolidate(const struct fsm *src, } for (src_i = 0; src_i < mapping_count; src_i++) { + const fsm_state_t dst_i = mapping[src_i]; #if LOG_MAPPING fprintf(stderr, "consolidate_mapping[%u]: %u\n", src_i, mapping[src_i]); #endif - if (mapping[src_i] >= max_used) { - max_used = mapping[src_i]; + if (dst_i > max_used) { + assert(dst_i != FSM_STATE_REMAP_NO_STATE); + max_used = dst_i; } } @@ -96,8 +96,8 @@ fsm_consolidate(const struct fsm *src, goto cleanup; } -#define DST_SEEN(I) (seen[I/64] & ((uint64_t)1 << (I&63))) -#define SET_DST_SEEN(I) 
(seen[I/64] |= ((uint64_t)1 << (I&63))) +#define DST_SEEN(I) u64bitset_get(seen, I) +#define SET_DST_SEEN(I) u64bitset_set(seen, I) /* map N states to M states, where N >= M. * if it's the first time state[M] is seen, @@ -110,6 +110,9 @@ fsm_consolidate(const struct fsm *src, for (src_i = 0; src_i < mapping_count; src_i++) { const fsm_state_t dst_i = mapping[src_i]; + /* fsm_consolidate does not currently support discarding states. */ + assert(dst_i != FSM_STATE_REMAP_NO_STATE); + if (!DST_SEEN(dst_i)) { SET_DST_SEEN(dst_i); @@ -134,11 +137,11 @@ fsm_consolidate(const struct fsm *src, } } - if (!consolidate_copy_capture_actions(dst, src, mapping, mapping_count)) { + if (!fsm_capture_copy_programs(src, dst)) { goto cleanup; } - if (!consolidate_end_ids(dst, src, mapping, mapping_count)) { + if (!consolidate_end_metadata(dst, src, mapping, mapping_count)) { goto cleanup; } @@ -161,97 +164,89 @@ fsm_consolidate(const struct fsm *src, return NULL; } +struct consolidate_end_ids_env { + char tag; + struct fsm *dst; + const struct fsm *src; + const fsm_state_t *mapping; + size_t mapping_count; + int ok; +}; + static int -consolidate_copy_capture_actions_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, +consolidate_active_captures_cb(fsm_state_t state, unsigned capture_id, void *opaque) { - struct consolidate_copy_capture_actions_env *env = opaque; - fsm_state_t s, t; - + struct consolidate_end_ids_env *env = opaque; + fsm_state_t dst_s; assert(env->tag == 'C'); -#if LOG_CONSOLIDATE_CAPTURES - fprintf(stderr, "consolidate_copy_capture_actions_cb: state %u, type %s, ID %u, TO %d\n", - state, - fsm_capture_action_type_name[type], - capture_id, to); -#endif - assert(state < env->mapping_count); - assert(to == CAPTURE_NO_STATE || to < env->mapping_count); - s = env->mapping[state]; - t = to == CAPTURE_NO_STATE - ? 
CAPTURE_NO_STATE : env->mapping[to]; + dst_s = env->mapping[state]; - if (!fsm_capture_add_action(env->dst, - s, type, capture_id, t)) { +#if LOG_CONSOLIDATE_END_METADATA + fprintf(stderr, "consolidate_active_captures_cb: state %d -> dst_s %d, capture_id %u\n", + state, dst_s, capture_id); +#endif + + if (!fsm_capture_set_active_for_end(env->dst, capture_id, dst_s)) { env->ok = 0; return 0; } - return 1; } static int -consolidate_copy_capture_actions(struct fsm *dst, const struct fsm *src, - const fsm_state_t *mapping, size_t mapping_count) +consolidate_capture_programs_cb(fsm_state_t state, unsigned program_id, + void *opaque) { - size_t i; + struct consolidate_end_ids_env *env = opaque; + fsm_state_t dst_s; + assert(env->tag == 'C'); - struct consolidate_copy_capture_actions_env env; - env.tag = 'C'; - env.dst = dst; - env.mapping_count = mapping_count; - env.mapping = mapping; - env.ok = 1; + assert(state < env->mapping_count); + dst_s = env->mapping[state]; -#if LOG_MAPPING - for (i = 0; i < mapping_count; i++) { - fprintf(stderr, "mapping[%lu]: %u\n", i, mapping[i]); + if (!fsm_capture_associate_program_with_end_state(env->dst, + (uint32_t)program_id, dst_s)) { + env->ok = 0; } -#else - (void)i; -#endif - fsm_capture_action_iter(src, - consolidate_copy_capture_actions_cb, &env); - return env.ok; + return 1; } -struct consolidate_end_ids_env { - char tag; - struct fsm *dst; - const struct fsm *src; - const fsm_state_t *mapping; - size_t mapping_count; -}; - static int consolidate_end_ids_cb(fsm_state_t state, const fsm_end_id_t *ids, size_t num_ids, void *opaque) { struct consolidate_end_ids_env *env = opaque; - enum fsm_endid_set_res sres; - fsm_state_t s; + fsm_state_t dst_s; assert(env->tag == 'C'); assert(state < env->mapping_count); - s = env->mapping[state]; + dst_s = env->mapping[state]; - sres = fsm_endid_set_bulk(env->dst, s, num_ids, ids, FSM_ENDID_BULK_APPEND); +#if LOG_CONSOLIDATE_END_METADATA > 1 + fprintf(stderr, "consolidate_end_ids_cb: state %u, 
dst %u, IDs [", + state, dst_s); + for (size_t i = 0; i < num_ids; i++) { + fprintf(stderr, "%s%d", i > 0 ? " " : "", ids[i]); + } + fprintf(stderr, "]\n"); +#endif + + enum fsm_endid_set_res sres = fsm_endid_set_bulk(env->dst, + dst_s, num_ids, ids, FSM_ENDID_BULK_APPEND); if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { return 0; } - return 1; } static int -consolidate_end_ids(struct fsm *dst, const struct fsm *src, +consolidate_end_metadata(struct fsm *dst, const struct fsm *src, const fsm_state_t *mapping, size_t mapping_count) { struct consolidate_end_ids_env env; - int ret; env.tag = 'C'; /* for Consolidate */ env.dst = dst; @@ -259,12 +254,31 @@ consolidate_end_ids(struct fsm *dst, const struct fsm *src, env.mapping = mapping; env.mapping_count = mapping_count; - ret = fsm_endid_iter_bulk(src, consolidate_end_ids_cb, &env); + env.ok = fsm_endid_iter_bulk(src, consolidate_end_ids_cb, &env); -#if LOG_CONSOLIDATE_ENDIDS > 1 + if (env.ok) { + fsm_state_t s; + const size_t src_state_count = fsm_countstates(src); + for (s = 0; s < src_state_count; s++) { + fsm_capture_iter_active_for_end_state(src, s, + consolidate_active_captures_cb, &env); + if (!env.ok) { + break; + } + + fsm_capture_iter_program_ids_for_end_state(src, s, + consolidate_capture_programs_cb, &env); + if (!env.ok) { + break; + } + } + } + +#if LOG_CONSOLIDATE_END_METADATA > 1 fprintf(stderr, "==== fsm_consolidate -- endid_info after:\n"); fsm_endid_dump(stderr, dst); + fsm_capture_dump_active_for_ends(stderr, dst); #endif - return ret; + return env.ok; } diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 56e135afd..7aa3fdb66 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -6,6 +6,8 @@ #include "determinise_internal.h" +#define LOG_DETERMINISATION_COUNTERS 0 + static void dump_labels(FILE *f, const uint64_t labels[4]) { @@ -29,6 +31,8 @@ fsm_determinise(struct fsm *nfa) size_t dfacount = 0; struct analyze_closures_env ac_env = { 0 }; + INIT_TIMERS(); + 
INIT_TIMERS_NAMED(overall); assert(nfa != NULL); map.alloc = nfa->opt->alloc; @@ -39,9 +43,12 @@ fsm_determinise(struct fsm *nfa) * faster where we can start with an epsilon-free NFA in the first place. */ if (fsm_has(nfa, fsm_hasepsilons)) { + TIME(&pre); if (!fsm_remove_epsilons(nfa)) { return 0; } + TIME(&post); + DIFF_MSEC("det_remove_eps", pre, post, NULL); } #if LOG_DETERMINISE_CAPTURES || LOG_INPUT @@ -49,6 +56,7 @@ fsm_determinise(struct fsm *nfa) fsm_print_fsm(stderr, nfa); fsm_capture_dump(stderr, "#### post_remove_epsilons", nfa); #endif + TIME(&overall_pre); issp = interned_state_set_pool_alloc(nfa->opt->alloc); if (issp == NULL) { @@ -104,6 +112,17 @@ fsm_determinise(struct fsm *nfa) ac_env.fsm = nfa; ac_env.issp = issp; +#if LOG_DETERMINISATION_STATS + fprintf(stderr, "%s: determinising FSM with %d states\n", __func__, fsm_countstates(nfa)); +#endif + + INIT_TIMERS_NAMED(iss); + size_t iss_accum = 0; + size_t iss_calls = 0; + size_t stack_pushes = 0; + size_t inner_steps = 0; + + TIME(&pre); do { size_t o_i; @@ -114,18 +133,25 @@ fsm_determinise(struct fsm *nfa) assert(curr != NULL); + TIME(&iss_pre); if (!analyze_closures__pairwise_grouping(&ac_env, curr->iss)) { goto cleanup; } + TIME(&iss_post); + DIFF_MSEC("det_iss", iss_pre, iss_post, &iss_accum); + (void)iss_accum; + iss_calls++; if (!edge_set_advise_growth(&curr->edges, nfa->opt->alloc, ac_env.output_count)) { goto cleanup; } + /* each output is an outgoing (label set) -> interned_state_set pair */ for (o_i = 0; o_i < ac_env.output_count; o_i++) { struct mapping *m; struct ac_output *output = &ac_env.outputs[o_i]; interned_state_set_id iss = output->iss; + inner_steps++; #if LOG_DETERMINISE_CLOSURES fprintf(stderr, "fsm_determinise: output %zu/%zu: cur (dfa %zu) label [", @@ -157,6 +183,7 @@ fsm_determinise(struct fsm *nfa) if (!stack_push(stack, m)) { goto cleanup; } + stack_pushes++; } #if LOG_SYMBOL_CLOSURE @@ -174,6 +201,13 @@ fsm_determinise(struct fsm *nfa) /* All elements in sclosures[] 
are interned, so they will be freed later. */ } while ((curr = stack_pop(stack))); + TIME(&post); + DIFF_MSEC("det_stack_loop", pre, post, NULL); + + if (LOG_DETERMINISATION_COUNTERS) { + fprintf(stderr, "%s: iss_accum total %zu (%zu calls, %g usec avg.), %zu stack pushes, %zu iterations, %zu inner_steps\n", + __func__, iss_accum, iss_calls, iss_accum / (1.0 * iss_calls), stack_pushes, iss_calls, inner_steps); + } { struct map_iter it; @@ -185,6 +219,13 @@ fsm_determinise(struct fsm *nfa) goto cleanup; } + TIME(&pre); + if (!fsm_capture_copy_programs(nfa, dfa)) { + goto cleanup; + } + TIME(&post); + DIFF_MSEC("det_copy_captures", pre, post, NULL); + #if DUMP_MAPPING { fprintf(stderr, "#### fsm_determinise: mapping\n"); @@ -192,10 +233,10 @@ fsm_determinise(struct fsm *nfa) /* build reverse mappings table: for every NFA state X, if X is part * of the new DFA state Y, then add Y to a list for X */ for (m = map_first(&map, &it); m != NULL; m = map_next(&it)) { - struct state_iter si; interned_state_set_id iss_id = m->iss; + struct state_iter si; fsm_state_t state; - struct state_set *ss = interned_state_set_get_state_set(ac_env.issp, iss_id); + struct state_set *ss = interned_state_set_get_state_set(issp, iss_id); fprintf(stderr, "%zu:", m->dfastate); for (state_set_reset(ss, &si); state_set_next(&si, &state); ) { @@ -238,24 +279,41 @@ fsm_determinise(struct fsm *nfa) fsm_setend(dfa, m->dfastate, 1); /* - * Carry through end IDs, if present. This isn't anything to do - * with the DFA conversion; it's meaningful only to the caller. + * Copy over metadata associated with end + * states, if present. This isn't anything to do + * with the DFA conversion; it's meaningful only + * to the caller. * * The closure may contain non-end states, but at least one state is * known to have been an end state. 
*/ - if (!fsm_endid_carry(nfa, ss, dfa, m->dfastate)) { + if (!remap_end_metadata(nfa, ss, dfa, m->dfastate)) { goto cleanup; } } + TIME(&post); + DIFF_MSEC("det_map_loop", pre, post, NULL); - if (!remap_capture_actions(&map, issp, dfa, nfa)) { - goto cleanup; - } + fsm_capture_integrity_check(dfa); fsm_move(nfa, dfa); } +#if LOG_DETERMINISE_CAPTURES + fprintf(stderr, "# post_determinise\n"); + fsm_print_fsm(stderr, nfa); + fsm_capture_dump(stderr, "#### post_determinise", nfa); +#endif + + TIME(&overall_post); + DIFF_MSEC("det_overall", overall_pre, overall_post, NULL); + +#if LOG_DETERMINISATION_STATS + fprintf(stderr, "%s: created DFA with %d states\n", __func__, fsm_countstates(nfa)); + fprintf(stderr, "%s: analyze_closures_env.analyze_usec: %zu\n", + __func__, ac_env.analyze_usec); +#endif + #if EXPENSIVE_CHECKS assert(fsm_all(nfa, fsm_isdfa)); #endif @@ -311,85 +369,6 @@ fsm_determinise(struct fsm *nfa) return res; } -/* Add DFA_state to the list for NFA_state. */ -static int -add_reverse_mapping(const struct fsm_alloc *alloc, - struct reverse_mapping *reverse_mappings, - fsm_state_t dfastate, fsm_state_t nfa_state) -{ - struct reverse_mapping *rm = &reverse_mappings[nfa_state]; - if (rm->count == rm->ceil) { - const unsigned nceil = (rm->ceil ? 
2*rm->ceil : 2); - fsm_state_t *nlist = f_realloc(alloc, - rm->list, nceil * sizeof(rm->list)); - if (nlist == NULL) { - return 0; - } - rm->list = nlist; - rm->ceil = nceil; - } - - rm->list[rm->count] = dfastate; - rm->count++; - return 1; -} - -static int -det_copy_capture_actions_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) -{ - struct reverse_mapping *rm_s; - size_t s_i, t_i; - struct det_copy_capture_actions_env *env = opaque; - assert(env->tag == 'D'); - -#if LOG_DETERMINISE_CAPTURES - fprintf(stderr, "det_copy_capture_actions_cb: state %u, type %s, ID %u, TO %d\n", - state, fsm_capture_action_type_name[type], - capture_id, to); -#endif - - rm_s = &env->reverse_mappings[state]; - - for (s_i = 0; s_i < rm_s->count; s_i++) { - const fsm_state_t s = rm_s->list[s_i]; - - if (to == CAPTURE_NO_STATE) { - if (!fsm_capture_add_action(env->dst, - s, type, capture_id, CAPTURE_NO_STATE)) { - env->ok = 0; - return 0; - } - } else { - struct reverse_mapping *rm_t = &env->reverse_mappings[to]; - for (t_i = 0; t_i < rm_t->count; t_i++) { - const fsm_state_t t = rm_t->list[t_i]; - - if (!fsm_capture_add_action(env->dst, - s, type, capture_id, t)) { - env->ok = 0; - return 0; - } - } - } - } - - return 1; -} - -static int -det_copy_capture_actions(struct reverse_mapping *reverse_mappings, - struct fsm *dst, struct fsm *src) -{ - struct det_copy_capture_actions_env env = { 'D', NULL, NULL, 1 }; - env.dst = dst; - env.reverse_mappings = reverse_mappings; - - fsm_capture_action_iter(src, det_copy_capture_actions_cb, &env); - return env.ok; -} - SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static uint64_t hash_iss(interned_state_set_id iss) @@ -636,83 +615,6 @@ stack_pop(struct mappingstack *stack) return item; } -static int -remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, - struct fsm *dst_dfa, struct fsm *src_nfa) -{ - struct map_iter it; - struct state_iter si; - struct mapping *m; - 
struct reverse_mapping *reverse_mappings; - fsm_state_t state; - const size_t capture_count = fsm_countcaptures(src_nfa); - size_t i, j; - int res = 0; - - if (capture_count == 0) { - return 1; - } - - /* This is not 1 to 1 -- if state X is now represented by multiple - * states Y in the DFA, and state X has action(s) when transitioning - * to state Z, this needs to be added on every Y, for every state - * representing Z in the DFA. - * - * We could probably filter this somehow, at the very least by - * checking reachability from every X, but the actual path - * handling later will also check reachability. */ - reverse_mappings = f_calloc(dst_dfa->opt->alloc, src_nfa->statecount, sizeof(reverse_mappings[0])); - if (reverse_mappings == NULL) { - return 0; - } - - /* build reverse mappings table: for every NFA state X, if X is part - * of the new DFA state Y, then add Y to a list for X */ - for (m = map_first(map, &it); m != NULL; m = map_next(&it)) { - struct state_set *ss; - interned_state_set_id iss_id = m->iss; - assert(m->dfastate < dst_dfa->statecount); - ss = interned_state_set_get_state_set(issp, iss_id); - - for (state_set_reset(ss, &si); state_set_next(&si, &state); ) { - if (!add_reverse_mapping(dst_dfa->opt->alloc, - reverse_mappings, - m->dfastate, state)) { - goto cleanup; - } - } - } - -#if LOG_DETERMINISE_CAPTURES - fprintf(stderr, "#### reverse mapping for %zu states\n", src_nfa->statecount); - for (i = 0; i < src_nfa->statecount; i++) { - struct reverse_mapping *rm = &reverse_mappings[i]; - fprintf(stderr, "%lu:", i); - for (j = 0; j < rm->count; j++) { - fprintf(stderr, " %u", rm->list[j]); - } - fprintf(stderr, "\n"); - } -#else - (void)j; -#endif - - if (!det_copy_capture_actions(reverse_mappings, dst_dfa, src_nfa)) { - goto cleanup; - } - - res = 1; -cleanup: - for (i = 0; i < src_nfa->statecount; i++) { - if (reverse_mappings[i].list != NULL) { - f_free(dst_dfa->opt->alloc, reverse_mappings[i].list); - } - } - f_free(dst_dfa->opt->alloc, 
reverse_mappings); - - return res; -} - static int group_labels_overlap(const struct ac_group *a, const struct ac_group *b) { @@ -730,6 +632,25 @@ group_labels_overlap(const struct ac_group *a, const struct ac_group *b) return 0; } +static int +remap_end_metadata(const struct fsm *src_fsm, const struct state_set *src_set, + struct fsm *dst_fsm, fsm_state_t dst_state) +{ + if (!fsm_endid_carry(src_fsm, src_set, dst_fsm, dst_state)) { + return 0; + } + + if (!fsm_capture_copy_active_for_ends(src_fsm, src_set, dst_fsm, dst_state)) { + return 0; + } + + if (!fsm_capture_copy_program_end_state_associations(src_fsm, src_set, dst_fsm, dst_state)) { + return 0; + } + + return 1; +} + static void intersect_with(uint64_t *a, const uint64_t *b) { @@ -1339,6 +1260,7 @@ to_set_htab_check(struct analyze_closures_env *env, if (b->count == 0) { return 0; /* empty bucket -> not found */ } else if (b->count == count) { + assert(env->to_sets.buf != NULL); assert(b->offset + count <= env->to_sets.used); const fsm_state_t *ids = &env->to_sets.buf[b->offset]; if (0 == memcmp(ids, dst, count * sizeof(dst[0]))) { @@ -1465,6 +1387,7 @@ save_to_set(struct analyze_closures_env *env, env->to_sets.ceil = nceil; env->to_sets.buf = nbuf; } + assert(env->to_sets.buf != NULL); #if LOG_TO_SET static size_t to_set_id; @@ -2016,28 +1939,87 @@ static void sort_and_dedup_dst_buf(fsm_state_t *buf, size_t *used) { const size_t orig_used = *used; - qsort(buf, orig_used, sizeof(buf[0]), cmp_fsm_state_t); - - /* squash out duplicates */ - size_t rd = 1; - size_t wr = 1; - while (rd < orig_used) { - if (buf[rd - 1] == buf[rd]) { - rd++; /* skip */ - } else { - buf[wr] = buf[rd]; - rd++; - wr++; - } + + if (orig_used <= 1) { + return; /* no change */ } - *used = wr; -#if EXPENSIVE_CHECKS - assert(wr <= orig_used); - for (size_t i = 1; i < *used; i++) { - assert(buf[i - 1] < buf[i]); + /* Figure out what the min and max values are, because + * when the difference between them is not too large it + * can be 
significantly faster to avoid qsort here. */ + fsm_state_t min = (fsm_state_t)-1; + fsm_state_t max = 0; + for (size_t i = 0; i < orig_used; i++) { + const fsm_state_t cur = buf[i]; + if (cur < min) { min = cur; } + if (cur > max) { max = cur; } } + + /* If there's only one unique value, then we're done. */ + if (min == max) { + buf[0] = min; + *used = 1; + return; + } + +/* 81920 = 10 KB buffer on the stack. This must be divisible by 64. + * Set to 0 to disable. */ +#define QSORT_CUTOFF 81920 + + if (QSORT_CUTOFF == 0 || max - min > QSORT_CUTOFF) { + /* If the bitset would be very large but sparse due to + * extreme values, then fall back on using qsort and + * then sweeping over the array to squash out + * duplicates. */ + qsort(buf, orig_used, sizeof(buf[0]), cmp_fsm_state_t); + + /* squash out duplicates */ + size_t rd = 1; + size_t wr = 1; + while (rd < orig_used) { + if (buf[rd - 1] == buf[rd]) { + rd++; /* skip */ + } else { + buf[wr] = buf[rd]; + rd++; + wr++; + } + } + + *used = wr; +#if EXPENSIVE_CHECKS + assert(wr <= orig_used); + for (size_t i = 1; i < *used; i++) { + assert(buf[i - 1] < buf[i]); + } #endif + } else { + /* Convert the array into a bitset and back, which sorts + * and deduplicates in the process. Add 1 to avoid a zero- + * zero-length array error if QSORT_CUTOFF is 0. 
*/ + uint64_t bitset[QSORT_CUTOFF/64 + 1]; + const size_t words = u64bitset_words(max - min + 1); + memset(bitset, 0x00, words * sizeof(bitset[0])); + + for (size_t i = 0; i < orig_used; i++) { + u64bitset_set(bitset, buf[i] - min); + } + + size_t dst = 0; + for (size_t i = 0; i < words; i++) { + const uint64_t w = bitset[i]; + if (w != 0) { /* skip empty words */ + uint64_t bit = 0x1; + for (size_t b_i = 0; b_i < 64; b_i++, bit <<= 1) { + if (w & bit) { + buf[dst] = 64*i + b_i + min; + dst++; + } + } + } + } + *used = dst; + } } static int diff --git a/src/libfsm/determinise_internal.h b/src/libfsm/determinise_internal.h index 8fe35fcd8..856f8baf8 100644 --- a/src/libfsm/determinise_internal.h +++ b/src/libfsm/determinise_internal.h @@ -75,19 +75,6 @@ struct map_iter { size_t i; }; -struct reverse_mapping { - unsigned count; - unsigned ceil; - fsm_state_t *list; -}; - -struct det_copy_capture_actions_env { - char tag; - struct fsm *dst; - struct reverse_mapping *reverse_mappings; - int ok; -}; - #define MAPPINGSTACK_DEF_CEIL 16 struct mappingstack { const struct fsm_alloc *alloc; @@ -289,6 +276,10 @@ analyze_closures__grow_dst(struct analyze_closures_env *env); static int analyze_closures__grow_outputs(struct analyze_closures_env *env); +static int +remap_end_metadata(const struct fsm *src_fsm, const struct state_set *src_set, + struct fsm *dst_fsm, fsm_state_t dst_state); + static int map_add(struct map *map, fsm_state_t dfastate, interned_state_set_id iss, struct mapping **new_mapping); @@ -306,22 +297,9 @@ map_first(struct map *map, struct map_iter *iter); static struct mapping * map_next(struct map_iter *iter); -static int -add_reverse_mapping(const struct fsm_alloc *alloc, - struct reverse_mapping *reverse_mappings, - fsm_state_t dfastate, fsm_state_t nfa_state); - -static int -det_copy_capture_actions(struct reverse_mapping *reverse_mappings, - struct fsm *dst, struct fsm *src); - static int grow_map(struct map *map); -static int 
-remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, - struct fsm *dst_dfa, struct fsm *src_nfa); - static struct mappingstack * stack_init(const struct fsm_alloc *alloc); diff --git a/src/libfsm/endids.c b/src/libfsm/endids.c index 444ccbc2e..8da8e3371 100644 --- a/src/libfsm/endids.c +++ b/src/libfsm/endids.c @@ -4,10 +4,47 @@ * See LICENCE for the full copyright terms. */ +#include +#include + #include #include -#include "endids_internal.h" +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "endids.h" + +#define BUCKET_NO_STATE ((fsm_state_t)-1) +#define DEF_BUCKET_COUNT 4 +#define DEF_BUCKET_ID_COUNT 16 + +struct endid_info { + /* Add-only hash table, with a state ID and an associated + * non-empty ordered array of unique end IDs. The state is the + * key. Grows when the buckets are more than half full. */ + unsigned bucket_count; + unsigned buckets_used; + + struct endid_info_bucket { + fsm_state_t state; + struct end_info_ids { + unsigned count; + unsigned ceil; + fsm_end_id_t ids[1]; + } *ids; + } *buckets; +}; #define LOG_ENDIDS 0 @@ -84,6 +121,14 @@ fsm_setendid(struct fsm *fsm, fsm_end_id_t id) return 1; } +int +fsm_setendid_state(struct fsm *fsm, fsm_state_t s, fsm_end_id_t id) +{ + enum fsm_endid_set_res sres; + sres = fsm_endid_set(fsm, s, id); + return sres != FSM_ENDID_SET_ERROR_ALLOC_FAIL; +} + enum fsm_getendids_res fsm_getendids(const struct fsm *fsm, fsm_state_t end_state, size_t id_buf_count, fsm_end_id_t *id_buf, @@ -778,13 +823,16 @@ struct carry_env { }; static int -carry_iter_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) +carry_iter_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, fsm_end_id_t id, void *opaque) { enum fsm_endid_set_res sres; struct carry_env *env = opaque; assert(env->tag == 'C'); + (void)fsm; (void)state; + (void)nth; sres = fsm_endid_set(env->dst, env->dst_state, id); if (sres == 
FSM_ENDID_SET_ERROR_ALLOC_FAIL) { @@ -839,6 +887,72 @@ fsm_endid_carry(const struct fsm *src_fsm, const struct state_set *src_set, return 1; } +/* Make a new hash table, copying over converted entries and/or discarding. */ +int +fsm_endid_compact(struct fsm *fsm, + const fsm_state_t *mapping, size_t mapping_count) +{ + struct endid_info *info = fsm->endid_info; + const size_t ocount = info->bucket_count; + const size_t ncount = ocount; /* does not need to grow */ + struct endid_info_bucket *obuckets = info->buckets; + struct endid_info_bucket *nbuckets = f_malloc(fsm->opt->alloc, + ncount * sizeof(nbuckets[0])); + const size_t nmask = ncount - 1; + size_t ob_i, nb_i; + size_t moved = 0; + +#if LOG_ENDIDS > 3 + fprintf(stderr, "fsm_endid_compact: rehashing mapped entries\n"); +#endif + + if (nbuckets == NULL) { + return 0; + } + + for (nb_i = 0; nb_i < ncount; nb_i++) { /* clear table */ + nbuckets[nb_i].state = BUCKET_NO_STATE; + } + + for (ob_i = 0; ob_i < ocount; ob_i++) { + const struct endid_info_bucket *ob = &obuckets[ob_i]; + uint64_t hash; + fsm_state_t nstate; + + if (ob->state == BUCKET_NO_STATE) { + continue; + } + + assert(ob->state < mapping_count); + nstate = mapping[ob->state]; + + if (nstate == FSM_STATE_REMAP_NO_STATE) { + info->buckets_used--; /* discarded */ + continue; + } + + hash = hash_id(nstate); + for (nb_i = 0; nb_i < ncount; nb_i++) { + struct endid_info_bucket *nb = &nbuckets[(hash + nb_i) & nmask]; + if (nb->state == BUCKET_NO_STATE) { + nb->state = nstate; + nb->ids = ob->ids; + moved++; + break; + } else { + continue; /* collision */ + } + } + } + + assert(moved == info->buckets_used); + + f_free(fsm->opt->alloc, info->buckets); + info->bucket_count = ncount; + info->buckets = nbuckets; + return 1; +} + void fsm_endid_iter(const struct fsm *fsm, fsm_endid_iter_cb *cb, void *opaque) @@ -867,7 +981,7 @@ fsm_endid_iter(const struct fsm *fsm, count = b->ids->count; for (id_i = 0; id_i < count; id_i++) { - if (!cb(b->state, 
b->ids->ids[id_i], opaque)) { + if (!cb(fsm, b->state, id_i, b->ids->ids[id_i], opaque)) { break; } @@ -969,7 +1083,7 @@ fsm_endid_iter_state(const struct fsm *fsm, fsm_state_t state, fprintf(stderr, "fsm_endid_iter_state[%d], ids[%ld] -> %d\n", b->state, id_i, b->ids->ids[id_i]); #endif - if (!cb(b->state, b->ids->ids[id_i], opaque)) { + if (!cb(fsm, b->state, id_i, b->ids->ids[id_i], opaque)) { return; } id_i++; @@ -991,10 +1105,13 @@ struct dump_env { }; static int -dump_cb(fsm_state_t state, const fsm_end_id_t id, void *opaque) +dump_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, const fsm_end_id_t id, void *opaque) { struct dump_env *env = opaque; - fprintf(env->f, "state[%u]: %u\n", state, id); + fprintf(env->f, "endids: state[%u]: %u\n", state, id); + (void)fsm; + (void)nth; return 1; } diff --git a/src/libfsm/endids.h b/src/libfsm/endids.h index 6c46567b3..c43e3ffdf 100644 --- a/src/libfsm/endids.h +++ b/src/libfsm/endids.h @@ -58,10 +58,15 @@ int fsm_endid_carry(const struct fsm *src_fsm, const struct state_set *src_set, struct fsm *dst_fsm, fsm_state_t dst_state); +int +fsm_endid_compact(struct fsm *fsm, + const fsm_state_t *mapping, size_t mapping_count); + /* Callback when iterating over the endids. * Return 0 to halt, or non-zero to continue. 
*/ typedef int -fsm_endid_iter_cb(fsm_state_t state, const fsm_end_id_t id, void *opaque); +fsm_endid_iter_cb(const struct fsm *fsm, fsm_state_t state, + size_t nth, const fsm_end_id_t id, void *opaque); void fsm_endid_iter(const struct fsm *fsm, diff --git a/src/libfsm/endids_internal.h b/src/libfsm/endids_internal.h deleted file mode 100644 index 27450af3b..000000000 --- a/src/libfsm/endids_internal.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef ENDIDS_INTERNAL_H -#define ENDIDS_INTERNAL_H - -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "endids.h" - -#define BUCKET_NO_STATE ((fsm_state_t)-1) -#define DEF_BUCKET_COUNT 4 -#define DEF_BUCKET_ID_COUNT 16 - -struct endid_info { - /* Add-only hash table, with a state ID and an associated - * non-empty ordered array of unique end IDs. The state is the - * key. Grows when the buckets are more than half full. */ - unsigned bucket_count; - unsigned buckets_used; - - struct endid_info_bucket { - fsm_state_t state; - struct end_info_ids { - unsigned count; - unsigned ceil; - fsm_end_id_t ids[1]; - } *ids; - } *buckets; -}; - -#endif diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index e87d9d974..52b73db6c 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -21,52 +22,79 @@ #include "endids.h" #define DUMP_EPSILON_CLOSURES 0 -#define DEF_PENDING_CAPTURE_ACTIONS_CEIL 2 #define LOG_RM_EPSILONS_CAPTURES 0 -#define DEF_CARRY_ENDIDS_COUNT 2 +#define LOG_COPYING 0 +#define LOG_RESULT 0 -struct remap_env { - char tag; - const struct fsm_alloc *alloc; - struct state_set **rmap; - int ok; - - size_t count; - size_t ceil; - struct remap_action { - fsm_state_t state; - enum capture_action_type type; - unsigned capture_id; - fsm_state_t to; - } *actions; -}; +/* #define DEF_CARRY_ENDIDS_COUNT 2 */ +/* #define DEF_CARRY_CAPTUREIDS_COUNT 2 */ 
-static int -remap_capture_actions(struct fsm *nfa, struct state_set **eclosures); +#if LOG_RESULT +#include +#endif -static int -remap_capture_action_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque); +#define DEF_END_METADATA_ENDIDS_CEIL 4 +#define DEF_END_METADATA_CAPTUREIDS_CEIL 4 +#define DEF_END_METADATA_PROGRAMIDS_CEIL 4 +struct carry_end_metadata_env { + struct fsm *fsm; + const struct fsm_alloc *alloc; + + struct { + size_t ceil; + fsm_end_id_t *ids; + } end; + struct { + int ok; + size_t count; + size_t ceil; + unsigned *ids; + } capture; + struct { + int ok; + size_t count; + size_t ceil; + uint32_t *ids; + } program; +}; static int -carry_endids(struct fsm *fsm, struct state_set *states, - fsm_state_t s); +carry_end_metadata(struct carry_end_metadata_env *env, + fsm_state_t end_state, fsm_state_t dst_state); int fsm_remove_epsilons(struct fsm *nfa) { +#if LOG_RESULT + fprintf(stderr, "==== before\n"); + fsm_print_fsm(stderr, nfa); + fsm_capture_dump(stderr, "#### before_remove_epsilons", nfa); + fprintf(stderr, "====\n"); +#endif + const size_t state_count = fsm_countstates(nfa); int res = 0; struct state_set **eclosures = NULL; - fsm_state_t s; + fsm_state_t s, start_id; + const struct fsm_alloc *alloc = nfa->opt->alloc; INIT_TIMERS(); + struct carry_end_metadata_env em_env = { 0 }; + em_env.fsm = nfa; + em_env.alloc = alloc; + assert(nfa != NULL); + if (!fsm_getstart(nfa, &start_id)) { + goto cleanup; + } + + /* TODO: This could successfully exit early if none of the + * states have epsilon edges. */ + TIME(&pre); - eclosures = epsilon_closure(nfa); + eclosures = fsm_epsilon_closure(nfa); TIME(&post); DIFF_MSEC("epsilon_closure", pre, post, NULL); @@ -107,20 +135,14 @@ fsm_remove_epsilons(struct fsm *nfa) * end states. * * Similarly, any end state metadata on states - * in its epsilon-closure is copied to it. - * - * Capture actions are copied in a later pass. 
*/ + * in its epsilon-closure is copied to it. */ if (fsm_isend(nfa, es_id)) { #if LOG_COPYING fprintf(stderr, "remove_epsilons: setting end on %d (due to %d)\n", s, es_id); #endif fsm_setend(nfa, s, 1); - /* - * Carry through end IDs, if present. This isn't anything to do - * with the NFA conversion; it's meaningful only to the caller. - */ - if (!carry_endids(nfa, eclosures[s], s)) { + if (!carry_end_metadata(&em_env, es_id, s)) { goto cleanup; } } @@ -150,14 +172,7 @@ fsm_remove_epsilons(struct fsm *nfa) state->epsilons = NULL; } -#if LOG_RESULT - fprintf(stderr, "=== %s: about to update capture actions\n", __func__); - fsm_print_fsm(stderr, nfa); -#endif - - if (!remap_capture_actions(nfa, eclosures)) { - goto cleanup; - } + fsm_capture_integrity_check(nfa); #if LOG_RESULT fsm_print_fsm(stderr, nfa); @@ -167,255 +182,144 @@ fsm_remove_epsilons(struct fsm *nfa) res = 1; cleanup: if (eclosures != NULL) { - closure_free(eclosures, state_count); - } - - return res; -} - -static int -remap_capture_actions(struct fsm *nfa, struct state_set **eclosures) -{ - int res = 0; - fsm_state_t s, i; - struct state_set **rmap; - struct state_iter si; - fsm_state_t si_s; - struct remap_env env = { 'R', NULL, NULL, 1, 0, 0, NULL }; - env.alloc = nfa->opt->alloc; - - /* build a reverse mapping */ - rmap = f_calloc(nfa->opt->alloc, nfa->statecount, sizeof(rmap[0])); - if (rmap == NULL) { - goto cleanup; + fsm_closure_free(eclosures, state_count); } - - for (s = 0; s < nfa->statecount; s++) { - if (eclosures[s] == NULL) { continue; } - for (state_set_reset(eclosures[s], &si); state_set_next(&si, &si_s); ) { - if (si_s == s) { - continue; /* ignore identical states */ - } -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, "remap_capture_actions: %u <- %u\n", - s, si_s); -#endif - if (!state_set_add(&rmap[si_s], nfa->opt->alloc, s)) { - goto cleanup; - } - } + if (em_env.end.ids != NULL) { + f_free(alloc, em_env.end.ids); } - env.rmap = rmap; - - /* Iterate over the current set of 
actions with the reverse - * mapping (containing only states which will be skipped, - * collecting info about every new capture action that will need - * to be added. - * - * It can't be added during the iteration, because that would - * modify the hash table as it's being iterated over. */ - fsm_capture_action_iter(nfa, remap_capture_action_cb, &env); - - /* Now that we're done iterating, add those actions. */ - for (i = 0; i < env.count; i++) { - const struct remap_action *a = &env.actions[i]; - if (!fsm_capture_add_action(nfa, a->state, a->type, - a->capture_id, a->to)) { - goto cleanup; - } + if (em_env.program.ids != NULL) { + f_free(alloc, em_env.program.ids); } - - res = 1; - -cleanup: - if (env.actions != NULL) { - f_free(nfa->opt->alloc, env.actions); + if (em_env.capture.ids != NULL) { + f_free(alloc, em_env.capture.ids); } - if (rmap != NULL) { - for (i = 0; i < nfa->statecount; i++) { - state_set_free(rmap[i]); - } - f_free(nfa->opt->alloc, rmap); - } return res; - } static int -add_pending_capture_action(struct remap_env *env, - fsm_state_t state, enum capture_action_type type, - unsigned capture_id, fsm_state_t to) +collect_captureid_cb(fsm_state_t state, unsigned id, void *opaque) { - struct remap_action *a; - if (env->count == env->ceil) { - struct remap_action *nactions; - const size_t nceil = (env->ceil == 0 - ? DEF_PENDING_CAPTURE_ACTIONS_CEIL : 2*env->ceil); - assert(nceil > 0); - nactions = f_realloc(env->alloc, - env->actions, - nceil * sizeof(nactions[0])); - if (nactions == NULL) { - return 0; - } - - env->ceil = nceil; - env->actions = nactions; - } + struct carry_end_metadata_env *env = opaque; + (void)state; - a = &env->actions[env->count]; -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, "add_pending_capture_action: state %d, type %s, capture_id %u, to %d\n", - state, fsm_capture_action_type_name[type], capture_id, to); -#endif + if (env->capture.count == env->capture.ceil) { + const size_t nceil = (env->capture.ceil == 0) + ? 
DEF_END_METADATA_CAPTUREIDS_CEIL + : 2 * env->capture.ceil; + unsigned *nids; + assert(nceil > env->capture.ceil); + nids = f_realloc(env->alloc, env->capture.ids, + nceil * sizeof(env->capture.ids[0])); + if (nids == NULL) { + env->capture.ok = 0; + return 0; + } + env->capture.ceil = nceil; + env->capture.ids = nids; + } - a->state = state; - a->type = type; - a->capture_id = capture_id; - a->to = to; - env->count++; + env->capture.ids[env->capture.count] = id; + env->capture.count++; return 1; } static int -remap_capture_action_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, - void *opaque) +collect_progid_cb(fsm_state_t state, unsigned id, void *opaque) { - struct state_iter si; - fsm_state_t si_s; - struct remap_env *env = opaque; - assert(env->tag == 'R'); - -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, "remap_capture_action_cb: state %d, type %s, capture_id %u, to %d\n", - state, fsm_capture_action_type_name[type], capture_id, to); -#endif - - for (state_set_reset(env->rmap[state], &si); state_set_next(&si, &si_s); ) { - struct state_iter si_to; - fsm_state_t si_tos; - -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, " -- rcac: state %d -> %d\n", state, si_s); -#endif - - if (!add_pending_capture_action(env, si_s, type, capture_id, to)) { - goto fail; - } - - if (to == CAPTURE_NO_STATE) { - continue; - } - - for (state_set_reset(env->rmap[to], &si_to); state_set_next(&si, &si_tos); ) { -#if LOG_RM_EPSILONS_CAPTURES - fprintf(stderr, " -- rcac: to %d -> %d\n", to, si_tos); -#endif - - if (!add_pending_capture_action(env, si_tos, type, capture_id, to)) { - goto fail; - } - - } - } + struct carry_end_metadata_env *env = opaque; + uint32_t prog_id = (uint32_t)id; + (void)state; + + if (env->program.count == env->program.ceil) { + const size_t nceil = (env->program.ceil == 0) + ? 
DEF_END_METADATA_PROGRAMIDS_CEIL + : 2 * env->program.ceil; + unsigned *nids; + assert(nceil > env->program.ceil); + nids = f_realloc(env->alloc, env->program.ids, + nceil * sizeof(env->program.ids[0])); + if (nids == NULL) { + env->program.ok = 0; + return 0; + } + env->program.ceil = nceil; + env->program.ids = nids; + } + env->program.ids[env->program.count] = prog_id; + env->program.count++; return 1; - -fail: - env->ok = 0; - return 0; } -struct collect_env { - char tag; - const struct fsm_alloc *alloc; - size_t count; - size_t ceil; - fsm_end_id_t *ids; - int ok; -}; - +/* Because we're modifying the FSM in place, we can't iterate and add + * new entries -- it could lead to the underlying hash table resizing. + * Instead, collect, then add in a second pass. */ static int -collect_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) +carry_end_metadata(struct carry_end_metadata_env *env, + fsm_state_t end_state, fsm_state_t dst_state) { - struct collect_env *env = opaque; - assert(env->tag == 'E'); - - (void)state; - - if (env->count == env->ceil) { - const size_t nceil = 2 * env->ceil; - fsm_end_id_t *nids; - assert(nceil > env->ceil); - nids = f_realloc(env->alloc, env->ids, - nceil * sizeof(*env->ids)); - if (nids == NULL) { - env->ok = 0; - return 0; + size_t i; + const size_t id_count = fsm_getendidcount(env->fsm, end_state); + if (id_count > 0) { /* copy end IDs */ + enum fsm_getendids_res id_res; + size_t written; + if (id_count > env->end.ceil) { /* grow buffer */ + size_t nceil = (env->end.ceil == 0) + ? 
DEF_END_METADATA_ENDIDS_CEIL + : 2*env->end.ceil; + while (nceil < id_count) { + nceil *= 2; + } + assert(nceil > 0); + fsm_end_id_t *nids = f_realloc(env->alloc, + env->end.ids, nceil * sizeof(env->end.ids[0])); + if (nids == NULL) { + return 0; + } + env->end.ids = nids; + env->end.ceil = nceil; } - env->ceil = nceil; - env->ids = nids; - } - env->ids[env->count] = id; - env->count++; + id_res = fsm_getendids(env->fsm, end_state, + id_count, env->end.ids, &written); + assert(id_res == FSM_GETENDIDS_FOUND); + assert(written == id_count); - return 1; -} - -/* fsm_remove_epsilons can't use fsm_endid_carry directly, because the src - * and dst FSMs are the same -- that would lead to adding entries to a - * hash table, possibly causing it to resize, while iterating over it. - * - * Instead, collect entries that need to be added (if not already - * present), and then add them in a second pass. */ -static int -carry_endids(struct fsm *fsm, struct state_set *states, - fsm_state_t dst_state) -{ - struct state_iter it; - fsm_state_t s; - size_t i; + for (i = 0; i < id_count; i++) { +#if LOG_COPYING + fprintf(stderr, "carry_end_metadata: setting end ID %u on %d (due to %d)\n", + env->end.ids[i], dst_state, end_state); +#endif + if (!fsm_setendid_state(env->fsm, dst_state, env->end.ids[i])) { + return 0; + } + } + } - struct collect_env env; - env.tag = 'E'; /* for fsm_remove_epsilons */ - env.alloc = fsm->opt->alloc; - env.count = 0; - env.ceil = DEF_CARRY_ENDIDS_COUNT; - env.ids = f_malloc(fsm->opt->alloc, - env.ceil * sizeof(*env.ids)); - if (env.ids == NULL) { + env->capture.ok = 1; + env->capture.count = 0; + fsm_capture_iter_active_for_end_state(env->fsm, end_state, + collect_captureid_cb, env); + if (!env->capture.ok) { return 0; } - env.ok = 1; - - /* collect from states */ - for (state_set_reset(states, &it); state_set_next(&it, &s); ) { - if (!fsm_isend(fsm, s)) { - continue; - } - - fsm_endid_iter_state(fsm, s, collect_cb, &env); - if (!env.ok) { - goto cleanup; + 
for (i = 0; i < env->capture.count; i++) { + if (!fsm_capture_set_active_for_end(env->fsm, + env->capture.ids[i], dst_state)) { + return 0; } } - /* add them */ - for (i = 0; i < env.count; i++) { - enum fsm_endid_set_res sres; - sres = fsm_endid_set(fsm, dst_state, env.ids[i]); - if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { - env.ok = 0; - goto cleanup; + env->program.count = 0; + fsm_capture_iter_program_ids_for_end_state(env->fsm, end_state, + collect_progid_cb, env); + for (i = 0; i < env->program.count; i++) { + if (!fsm_capture_associate_program_with_end_state(env->fsm, + env->program.ids[i], dst_state)) { + return 0; } } -cleanup: - f_free(fsm->opt->alloc, env.ids); - - return env.ok; + return 1; } - diff --git a/src/libfsm/exec.c b/src/libfsm/exec.c index 9f7b21802..47d27a50e 100644 --- a/src/libfsm/exec.c +++ b/src/libfsm/exec.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,6 @@ static int transition(const struct fsm *fsm, fsm_state_t state, int c, - size_t offset, struct fsm_capture *captures, fsm_state_t *next) { assert(state < fsm->statecount); @@ -35,31 +35,72 @@ transition(const struct fsm *fsm, fsm_state_t state, int c, return 0; } - if (captures != NULL && fsm_capture_has_capture_actions(fsm, state)) { - fsm_capture_update_captures(fsm, state, *next, - offset, captures); - } - return 1; } int fsm_exec(const struct fsm *fsm, - int (*fsm_getc)(void *opaque), void *opaque, - fsm_state_t *end, struct fsm_capture *captures) + int (*fsm_getc)(void *opaque), void *opaque, fsm_state_t *end) { fsm_state_t state; int c; size_t offset = 0; - unsigned i; - size_t capture_count; assert(fsm != NULL); assert(fsm_getc != NULL); assert(end != NULL); - capture_count = fsm_countcaptures(fsm); + /* TODO: check prerequisites; that it has literal edges, DFA, etc */ + + /* TODO: pass struct of callbacks to call during each event; transitions etc */ + + if (!fsm_all(fsm, fsm_isdfa)) { + errno = EINVAL; + return -1; + } + + if 
(!fsm_getstart(fsm, &state)) { + errno = EINVAL; + return -1; + } + +#if LOG_EXEC + fprintf(stderr, "fsm_exec: starting at %d\n", state); +#endif + + while (c = fsm_getc(opaque), c != EOF) { + if (!transition(fsm, state, c, &state)) { +#if LOG_EXEC + fprintf(stderr, "fsm_exec: edge not found\n"); +#endif + return 0; + } +#if LOG_EXEC + fprintf(stderr, "fsm_exec: @ %zu, input '%c', new state %u\n", + offset, c, state); +#endif + offset++; + } + + if (!fsm_isend(fsm, state)) { + return 0; + } + + *end = state; + return 1; +} + +int +fsm_exec_with_captures(const struct fsm *fsm, const unsigned char *input, + size_t input_length, fsm_state_t *end, + struct fsm_capture *captures, size_t capture_buf_length) +{ + fsm_state_t state; + size_t offset = 0; + + assert(fsm != NULL); + assert(end != NULL); /* TODO: check prerequisites; that it has literal edges, DFA, etc */ /* TODO: pass struct of callbacks to call during each event; transitions etc */ @@ -74,17 +115,26 @@ fsm_exec(const struct fsm *fsm, return -1; } - for (i = 0; i < capture_count; i++) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; + if (captures != NULL) { + const size_t capture_ceil = fsm_capture_ceiling(fsm); + if (capture_buf_length < capture_ceil) { + errno = EINVAL; + return -1; + } + + for (size_t i = 0; i < capture_ceil; i++) { + captures[i].pos[0] = FSM_CAPTURE_NO_POS; + captures[i].pos[1] = FSM_CAPTURE_NO_POS; + } } #if LOG_EXEC fprintf(stderr, "fsm_exec: starting at %d\n", state); #endif - while (c = fsm_getc(opaque), c != EOF) { - if (!transition(fsm, state, c, offset, captures, &state)) { + while (offset < input_length) { + const unsigned char c = input[offset]; + if (!transition(fsm, state, c, &state)) { #if LOG_EXEC fprintf(stderr, "fsm_exec: edge not found\n"); #endif @@ -102,15 +152,15 @@ fsm_exec(const struct fsm *fsm, return 0; } - /* Check for capture actions on end state */ - if (captures != NULL && fsm_capture_has_capture_actions(fsm, state)) { - 
fsm_capture_update_captures(fsm, state, NEXT_STATE_END, - offset, captures); + /* Resolve captures associated with the end state. */ + if (captures != NULL) { + if (!fsm_capture_resolve_during_exec(fsm, state, + input, offset, captures, capture_buf_length)) { + assert(errno != 0); + return -1; + } } - fsm_capture_finalize_captures(fsm, capture_count, captures); - *end = state; return 1; } - diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index 6e77510a7..5d372e30f 100644 --- a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -52,10 +52,6 @@ struct fsm_edge { struct fsm_state { unsigned int end:1; - /* If 0, then this state has no need for checking - * the fsm->capture_info struct. */ - unsigned int has_capture_actions:1; - /* meaningful within one particular transformation only */ unsigned int visited:1; @@ -90,10 +86,10 @@ state_hasnondeterminism(const struct fsm *fsm, fsm_state_t state, struct bm *bm) * for states, with wrapper to populate malloced array of user-facing structs. 
*/ struct state_set ** -epsilon_closure(struct fsm *fsm); +fsm_epsilon_closure(struct fsm *fsm); void -closure_free(struct state_set **closures, size_t n); +fsm_closure_free(struct state_set **closures, size_t n); /* * Internal free function that invokes free(3) by default, or a user-provided diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index 415bffbea..a8a0976ca 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -108,6 +108,7 @@ fsm_shortest fsm_example fsm_exec +fsm_exec_with_captures # exec fsm_fgetc @@ -117,8 +118,8 @@ fsm_print_cfrag # XXX: workaround for lx make_ir # XXX: workaround for lx free_ir # XXX: workaround for lx -epsilon_closure # XXX: workaround for fsm -closure_free # XXX: workaround for fsm +fsm_epsilon_closure # XXX: workaround for fsm +fsm_closure_free # XXX: workaround for fsm fsm_mergeab @@ -129,11 +130,19 @@ fsm_vm_match_buffer fsm_vm_match_file # -fsm_countcaptures -fsm_capture_has_capture_actions +fsm_capture_ceiling fsm_capture_set_path fsm_capture_rebase_capture_id -fsm_capture_alloc +fsm_capture_alloc_capture_buffer +fsm_capture_free_capture_buffer fsm_capture_dump +fsm_capture_set_active_for_end +fsm_capture_add_program +fsm_capture_associate_program_with_end_state + +# capture_vm* +fsm_capvm_program_free +fsm_capvm_program_dump +fsm_capvm_program_exec fsm_minimise_test_oracle diff --git a/src/libfsm/merge.c b/src/libfsm/merge.c index fd4d35ee5..7abe8cf25 100644 --- a/src/libfsm/merge.c +++ b/src/libfsm/merge.c @@ -17,25 +17,27 @@ #include #include #include +#include #include "capture.h" +#include "capture_vm.h" #include "internal.h" #include "endids.h" #define LOG_MERGE_ENDIDS 0 - -struct copy_capture_env { - char tag; - struct fsm *dst; - int ok; -}; +#define LOG_COPY_CAPTURE_PROGRAMS 0 static int -copy_capture_actions(struct fsm *dst, struct fsm *src); +copy_end_metadata(struct fsm *dst, struct fsm *src, + fsm_state_t base_src, unsigned capture_base_src); static int copy_end_ids(struct fsm *dst, 
struct fsm *src, fsm_state_t base_src); +static int +copy_active_capture_ids(struct fsm *dst, struct fsm *src, + fsm_state_t base_src, unsigned capture_base_src); + static struct fsm * merge(struct fsm *dst, struct fsm *src, fsm_state_t *base_dst, fsm_state_t *base_src, @@ -72,17 +74,12 @@ merge(struct fsm *dst, struct fsm *src, *base_dst = 0; *base_src = dst->statecount; *capture_base_dst = 0; - *capture_base_src = fsm_countcaptures(dst); + *capture_base_src = fsm_capture_ceiling(dst); for (i = 0; i < src->statecount; i++) { state_set_rebase(&src->states[i].epsilons, *base_src); edge_set_rebase(&src->states[i].edges, *base_src); } - - /* FIXME: instead of rebasing these here, they could - * also be updated in copy_capture_actions below. */ - fsm_capture_rebase_capture_id(src, *capture_base_src); - fsm_capture_rebase_capture_action_states(src, *base_src); } memcpy(dst->states + dst->statecount, src->states, @@ -90,22 +87,10 @@ merge(struct fsm *dst, struct fsm *src, dst->statecount += src->statecount; dst->endcount += src->endcount; - /* We need to explicitly copy over the capture actions and end - * ID info here because they're stored on the FSMs as a whole, - * rather than individual states; `memcpy`ing the states alone - * won't transfer them. - * - * They're stored separately because they are likely to only - * be on a small portion of the states, and adding two extra - * NULL pointers to `struct fsm_state` increases memory usage - * significantly. */ - - if (!copy_capture_actions(dst, src)) { - /* non-recoverable -- destructive operation */ - return NULL; - } - - if (!copy_end_ids(dst, src, *base_src)) { + /* We need to explicitly copy over end metadata here. They're + * stored separately because they are likely to only be on a + * small portion of the states. 
*/ + if (!copy_end_metadata(dst, src, *base_src, *capture_base_src)) { /* non-recoverable -- destructive operation */ return NULL; } @@ -123,16 +108,91 @@ merge(struct fsm *dst, struct fsm *src, return dst; } +struct copy_capture_programs_env { + const struct fsm_alloc *alloc; + const struct fsm *src; + struct fsm *dst; + int ok; + fsm_state_t state_base_src; + unsigned capture_base_src; + +#define DEF_MAPPING_CEIL 1 + size_t mapping_used; + size_t mapping_ceil; + /* TODO: could cache last_map to check first if this becomes expensive */ + struct prog_mapping { + unsigned src_prog_id; + unsigned dst_prog_id; + } *mappings; +}; + static int -copy_capture_cb(fsm_state_t state, - enum capture_action_type type, unsigned capture_id, fsm_state_t to, +copy_capture_programs_cb(fsm_state_t src_state, unsigned src_prog_id, void *opaque) { - struct copy_capture_env *env = opaque; - assert(env->tag == 'C'); + struct copy_capture_programs_env *env = opaque; + + const fsm_state_t dst_state = src_state + env->state_base_src; + assert(dst_state < fsm_countstates(env->dst)); + +#if LOG_COPY_CAPTURE_PROGRAMS + fprintf(stderr, "%s: src %p, dst %p, src_prog_id %u, src_state %d, dst_state %d, capture_base_src %u\n", + __func__, (void *)env->src, (void *)env->dst, + src_prog_id, src_state, dst_state, env->capture_base_src); +#endif + int found = 0; + uint32_t dst_prog_id; + + for (size_t i = 0; i < env->mapping_used; i++) { + const struct prog_mapping *m = &env->mappings[i]; + if (m->src_prog_id == src_prog_id) { + dst_prog_id = m->dst_prog_id; + found = 1; + } + } + + if (!found) { + if (env->mapping_used == env->mapping_ceil) { /* grow */ + const size_t nceil = 2*env->mapping_ceil; + struct prog_mapping *nmappings = f_realloc(env->alloc, + env->mappings, nceil * sizeof(nmappings[0])); + if (nmappings == NULL) { + env->ok = 0; + return 0; + } + + env->mapping_ceil = nceil; + env->mappings = nmappings; + } + + const struct capvm_program *p = fsm_capture_get_program_by_id(env->src, + 
src_prog_id); + assert(p != NULL); + + struct capvm_program *cp = capvm_program_copy(env->alloc, p); + if (cp == NULL) { + env->ok = 0; + return 0; + } + capvm_program_rebase(cp, env->capture_base_src); + + /* add program, if not present */ + if (!fsm_capture_add_program(env->dst, + cp, &dst_prog_id)) { + f_free(env->alloc, cp); + env->ok = 0; + return 0; + } - if (!fsm_capture_add_action(env->dst, state, type, - capture_id, to)) { + struct prog_mapping *m = &env->mappings[env->mapping_used]; + m->src_prog_id = src_prog_id; + m->dst_prog_id = dst_prog_id; + env->mapping_used++; + } + + /* associate with end states */ + if (!fsm_capture_associate_program_with_end_state(env->dst, + dst_prog_id, dst_state)) { env->ok = 0; return 0; } @@ -141,18 +201,55 @@ copy_capture_cb(fsm_state_t state, } static int -copy_capture_actions(struct fsm *dst, struct fsm *src) +copy_capture_programs(struct fsm *dst, const struct fsm *src, + fsm_state_t state_base_src, unsigned capture_base_src) { - struct copy_capture_env env; - env.tag = 'C'; - env.dst = dst; - env.ok = 1; + const struct fsm_alloc *alloc = src->opt->alloc; + struct prog_mapping *mappings = f_malloc(alloc, + DEF_MAPPING_CEIL * sizeof(mappings[0])); + if (mappings == NULL) { + return 0; + } - fsm_capture_action_iter(src, copy_capture_cb, &env); + struct copy_capture_programs_env env = { + .alloc = alloc, + .src = src, + .dst = dst, + .ok = 1, + .state_base_src = state_base_src, + .capture_base_src = capture_base_src, + .mapping_ceil = DEF_MAPPING_CEIL, + .mappings = mappings, + }; + fsm_capture_iter_program_ids_for_all_end_states(src, + copy_capture_programs_cb, &env); + + f_free(alloc, env.mappings); return env.ok; } +static int +copy_end_metadata(struct fsm *dst, struct fsm *src, + fsm_state_t base_src, unsigned capture_base_src) +{ + /* TODO: inline */ + + if (!copy_end_ids(dst, src, base_src)) { + return 0; + } + + if (!copy_active_capture_ids(dst, src, base_src, capture_base_src)) { + return 0; + } + + if 
(!copy_capture_programs(dst, src, base_src, capture_base_src)) { + return 0; + } + + return 1; +} + struct copy_end_ids_env { char tag; struct fsm *dst; @@ -186,12 +283,50 @@ copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) struct copy_end_ids_env env; env.tag = 'M'; /* for Merge */ env.dst = dst; - env.src = src; env.base_src = base_src; return fsm_endid_iter_bulk(src, copy_end_ids_cb, &env); } +struct copy_active_capture_ids_env { + char tag; + struct fsm *dst; + fsm_state_t base_src; + unsigned capture_base_src; + int ok; +}; + +static int +copy_active_capture_ids_cb(fsm_state_t state, unsigned capture_id, void *opaque) +{ + struct copy_active_capture_ids_env *env = opaque; + assert(env->tag == 'A'); + + if (!fsm_capture_set_active_for_end(env->dst, + capture_id + env->capture_base_src, + state + env->base_src)) { + env->ok = 0; + return 0; + } + return 1; +} + +static int +copy_active_capture_ids(struct fsm *dst, struct fsm *src, + fsm_state_t base_src, unsigned capture_base_src) +{ + struct copy_active_capture_ids_env env; + env.tag = 'A'; + env.dst = dst; + env.base_src = base_src; + env.capture_base_src = capture_base_src; + env.ok = 1; + + fsm_capture_iter_active_for_all_end_states(src, + copy_active_capture_ids_cb, &env); + return env.ok; +} + struct fsm * fsm_mergeab(struct fsm *a, struct fsm *b, fsm_state_t *base_b) diff --git a/src/libfsm/minimise.c b/src/libfsm/minimise.c index 60cec48c5..1ca4a36ac 100644 --- a/src/libfsm/minimise.c +++ b/src/libfsm/minimise.c @@ -22,6 +22,7 @@ #endif #include +#include #include #include #include @@ -38,6 +39,45 @@ #include "minimise_internal.h" #include "minimise_test_oracle.h" +static int +label_sets_match(const uint64_t a[256/64], const uint64_t b[256/64]); + +static int +split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm); + +#if EXPENSIVE_CHECKS +#include + +static void +check_done_ec_offset(const struct min_env *env); + +static int +all_end_states_are_currently_together(const 
struct min_env *env); +#endif + +#define DEF_CAPTURE_ID_CEIL 4 +struct end_metadata { + struct end_metadata_capture { + unsigned count; + unsigned ceil; + unsigned *ids; + } capture; + + struct end_metadata_program { + unsigned count; + unsigned ceil; + unsigned *ids; + } program; +}; + +static int +collect_capture_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_capture *c); + +static int +collect_capture_program_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_program *p); + int fsm_minimise(struct fsm *fsm) { @@ -55,19 +95,36 @@ fsm_minimise(struct fsm *fsm) assert(fsm != NULL); assert(fsm_all(fsm, fsm_isdfa)); +#if LOG_INIT > 1 + fprintf(stderr, "=== BEFORE TRIM, %d states\n", fsm_countstates(fsm)); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "#### pre_minimise", fsm); + fprintf(stderr, "=== BEFORE TRIM\n"); +#endif + /* The algorithm used below won't remove states without a path * to an end state, because it cannot prove they're * unnecessary, so they must be trimmed away first. */ + TIME(&pre); if (fsm_trim(fsm, FSM_TRIM_START_AND_END_REACHABLE, &shortest_end_distance) < 0) { return 0; } + TIME(&post); + DIFF_MSEC("trim", pre, post, NULL); if (fsm->statecount == 0) { r = 1; goto cleanup; } +#if LOG_INIT > 1 + fprintf(stderr, "=== AFTER TRIM, %d states\n", fsm_countstates(fsm)); + fprintf(stderr, "# pre_minimise\n"); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "#### pre_minimise", fsm); +#endif + TIME(&pre); collect_labels(fsm, labels, &label_count); TIME(&post); @@ -113,6 +170,8 @@ fsm_minimise(struct fsm *fsm) goto cleanup; } + fsm_capture_integrity_check(dst); + #if EXPENSIVE_CHECKS if (!fsm_capture_has_captures(fsm)) { struct fsm *oracle = fsm_minimise_test_oracle(fsm); @@ -253,6 +312,12 @@ build_minimised_mapping(const struct fsm *fsm, goto cleanup; } + /* This only needs to be run once, but must run before the main + * fixpoint loop below, because it potentially refines ECs. 
*/ + if (!split_ecs_by_end_metadata(&env, fsm)) { + goto cleanup; + } + #if LOG_INIT for (i = 0; i < env.ec_count; i++) { fprintf(stderr, "# --ec[%lu]: %d\n", i, env.ecs[i]); @@ -329,7 +394,7 @@ build_minimised_mapping(const struct fsm *fsm, } } -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS check_done_ec_offset(&env); #endif } @@ -365,6 +430,12 @@ build_minimised_mapping(const struct fsm *fsm, } #endif +#if EXPENSIVE_CHECKS + for (i = 0; i < fsm->statecount; i++) { + assert(mapping[i] < fsm->statecount); + } +#endif + #if LOG_STEPS fprintf(stderr, "# done in %lu iteration(s), %lu step(s), %ld -> %ld states, label_count %lu\n", env.iter, env.steps, fsm->statecount, @@ -403,7 +474,7 @@ dump_ecs(FILE *f, const struct min_env *env) #endif } -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS static void check_descending_EC_counts(const struct min_env *env) { @@ -611,7 +682,7 @@ populate_initial_ecs(struct min_env *env, const struct fsm *fsm, /* The dead state is not a member of any EC. 
*/ env->state_ecs[env->dead_state] = NO_ID; -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS check_descending_EC_counts(env); #endif @@ -646,7 +717,394 @@ populate_initial_ecs(struct min_env *env, const struct fsm *fsm, #endif } -#if EXPENSIVE_INTEGRITY_CHECKS +SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() +static void +incremental_hash_of_ids(uint64_t *accum, fsm_end_id_t id) +{ + (*accum) += hash_id(id); +} + +static int +same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) +{ + if (a->capture.count != b->capture.count) { + return 0; + } + + if (a->program.count != b->program.count) { + return 0; + } + + /* compare -- these must be sorted */ + + for (size_t i = 0; i < a->capture.count; i++) { + if (a->capture.ids[i] != b->capture.ids[i]) { + return 0; + } + } + for (size_t i = 0; i < a->program.count; i++) { + if (a->program.ids[i] != b->program.ids[i]) { + return 0; + } + } + + return 1; +} + +static int +split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) +{ + int res = 0; + + struct end_metadata *end_md; + fsm_state_t *htab = NULL; + + const size_t state_count = fsm_countstates(fsm); + +#if EXPENSIVE_CHECKS + /* Invariant: For each EC, either all or none of the states + * are end states. We only partition the set(s) of end states + * here. */ + assert(all_end_states_are_currently_together(env)); +#endif + + /* Use the hash table to assign to new groups. 
*/ + + end_md = f_calloc(fsm->opt->alloc, + state_count, sizeof(end_md[0])); + if (end_md == NULL) { + goto cleanup; + } + + size_t bucket_count = 1; + while (bucket_count < state_count) { + bucket_count *= 2; /* power of 2 ceiling */ + } + const size_t mask = bucket_count - 1; + + htab = f_malloc(fsm->opt->alloc, + bucket_count * sizeof(htab[0])); + if (htab == NULL) { + goto cleanup; + } + + /* First pass: collect end state metadata */ + for (size_t ec_i = 0; ec_i < env->ec_count; ec_i++) { + fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); +#if LOG_ECS + fprintf(stderr, "## EC %zu\n", ec_i); +#endif + while (s != NO_ID) { + struct end_metadata *e = &end_md[s]; + if (!fsm_isend(fsm, s)) { + break; /* this EC has non-end states, skip */ + } + + if (!collect_capture_ids(fsm, s, &e->capture)) { + goto cleanup; + } + + if (!collect_capture_program_ids(fsm, s, &e->program)) { + goto cleanup; + } + + s = env->jump[s]; + } + } + +#if LOG_ECS + fprintf(stderr, "==== BEFORE PARTITIONING BY END METADATA\n"); + dump_ecs(stderr, env); + fprintf(stderr, "====\n"); +#endif + + /* FIXME: is this actually the right behavior? */ + /* Second pass: partition ECs into groups with identical end IDs. + * for each group with different end IDs, unlink them. 
*/ + const size_t max_ec = env->ec_count; + for (size_t ec_i = 0; ec_i < max_ec; ec_i++) { + fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); + fsm_state_t prev = NO_ID; + + for (size_t i = 0; i < bucket_count; i++) { + htab[i] = NO_ID; /* reset hash table */ + } + + while (s != NO_ID) { + const struct end_metadata *s_md = &end_md[s]; + + uint64_t hash = 0; + const fsm_state_t next = env->jump[s]; + + for (size_t pid_i = 0; pid_i < s_md->program.count; pid_i++) { + incremental_hash_of_ids(&hash, s_md->program.ids[pid_i]); + } + + for (size_t b_i = 0; b_i < bucket_count; b_i++) { + fsm_state_t *b = &htab[(b_i + hash) & mask]; + const fsm_state_t other = *b; + const struct end_metadata *other_md = &end_md[other]; + + if (other == NO_ID) { /* empty hash bucket */ + *b = s; + if (prev == NO_ID) { + /* keep the first state, along with other states + * with matching end IDs, in this EC. no-op. */ +#if LOG_ECS + fprintf(stderr, " -- keeping state s %d in EC %u\n", + s, env->state_ecs[s]); +#endif + prev = s; + } else { /* not first (prev is set), so it landed somewhere else */ + /* unlink and assign new EC */ +#if LOG_ECS + fprintf(stderr, " -- moving state s %d from EC %u to EC %u\n", + s, env->state_ecs[s], env->ec_count); +#endif + env->jump[prev] = env->jump[s]; /* unlink */ + env->ecs[env->ec_count] = s; /* head of new EC */ + env->state_ecs[s] = env->ec_count; + env->jump[s] = NO_ID; + env->ec_count++; + } + break; + } else if (same_end_metadata(s_md, other_md)) { + if (env->state_ecs[other] == ec_i) { + /* keep in the current EC -- no-op */ +#if LOG_ECS + fprintf(stderr, " -- keeping state s %d in EC %u\n", + s, env->state_ecs[s]); +#endif + prev = s; + } else { + /* unlink and link to other state's EC */ +#if LOG_ECS + fprintf(stderr, " -- appending s %d to EC %u, after state %d, before %d\n", + s, env->state_ecs[other], other, env->jump[other]); +#endif + assert(prev != NO_ID); + env->jump[prev] = env->jump[s]; /* unlink */ + env->state_ecs[s] = 
env->state_ecs[other]; + env->jump[s] = env->jump[other]; + env->jump[other] = s; /* link after other */ + } + break; + } else { + continue; /* collision */ + } + } + + s = next; + } + + /* If this EC only has one entry and it's before the + * done_ec_offset, then set that here so that invariants + * will be restored while sweeping forward after this loop. */ + + if (env->jump[MASK_EC_HEAD(env->ecs[ec_i])] == NO_ID && ec_i < env->done_ec_offset) { + env->done_ec_offset = ec_i; /* will be readjusted later */ + } + +#if LOG_ECS + fprintf(stderr, "==== AFTER PARTITIONING BY END METADATA -- EC %zu\n", ec_i); + dump_ecs(stderr, env); + fprintf(stderr, "==== (done_ec_offset: %d)\n", env->done_ec_offset); +#endif + } + +#if LOG_ECS + fprintf(stderr, "==== AFTER PARTITIONING BY END IDs\n"); + dump_ecs(stderr, env); + fprintf(stderr, "==== (done_ec_offset: %d)\n", env->done_ec_offset); +#endif + + /* Sweep forward and swap ECs as necessary so all single-entry + * ECs are at the end -- they're done. */ + size_t ec_i = env->done_ec_offset; + + while (ec_i < env->ec_count) { + const fsm_state_t head = MASK_EC_HEAD(env->ecs[ec_i]); + if (env->jump[head] == NO_ID) { + /* offset stays where it is */ +#if LOG_ECS + fprintf(stderr, "ec_i: %zu / %u -- branch a\n", ec_i, env->ec_count); +#endif + env->ecs[ec_i] = SET_SMALL_EC_FLAG(head); + } else { + /* this EC has more than one state, but is after + * the done_ec_offset, so swap it with an EC at + * the boundary. */ + const fsm_state_t n_ec_i = env->done_ec_offset; +#if LOG_ECS + fprintf(stderr, "ec_i: %zu / %u -- branch b -- swap %ld and %d\n", + ec_i, env->ec_count, ec_i, n_ec_i); +#endif + + /* swap ec[n_ec_i] and ec[ec_i] */ + const fsm_state_t tmp = env->ecs[ec_i]; + env->ecs[ec_i] = env->ecs[n_ec_i]; + env->ecs[n_ec_i] = tmp; + /* note: this may set the SMALL_EC_FLAG. 
*/ + update_ec_links(env, ec_i); + update_ec_links(env, n_ec_i); + env->done_ec_offset++; + } + ec_i++; + } + +#if LOG_ECS + fprintf(stderr, "==== (done_ec_offset is now: %d, ec_count %u)\n", env->done_ec_offset, env->ec_count); + dump_ecs(stderr, env); +#endif + + /* check that all ECs are before/after done_ec_offset */ + for (size_t ec_i = 0; ec_i < env->ec_count; ec_i++) { + const fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); +#if LOG_ECS + fprintf(stderr, " -- ec_i %zu: s %d\n", ec_i, s); +#endif + if (ec_i < env->done_ec_offset) { + assert(env->jump[s] != NO_ID); + } else { + assert(env->jump[s] == NO_ID); + } + } + + res = 1; + +cleanup: + if (htab != NULL) { + f_free(fsm->opt->alloc, htab); + } + if (end_md != NULL) { + size_t i; + for (i = 0; i < state_count; i++) { + struct end_metadata *e = &end_md[i]; + if (e->capture.ids != NULL) { + f_free(fsm->opt->alloc, e->capture.ids); + } + if (e->program.ids != NULL) { + f_free(fsm->opt->alloc, e->program.ids); + } + } + f_free(fsm->opt->alloc, end_md); + } + + return res; +} + +static int +cmp_unsigned(const void *pa, const void *pb) +{ + const unsigned a = *(unsigned *)pa; + const unsigned b = *(unsigned *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +struct collect_capture_env { + int ok; + const struct fsm_alloc *alloc; + struct end_metadata_capture *c; + struct end_metadata_program *p; +}; + +static int +collect_capture_cb(fsm_state_t state, unsigned capture_id, + void *opaque) +{ + struct collect_capture_env *env = opaque; + struct end_metadata_capture *c = env->c; + (void)state; + if (c->count == c->ceil) { + const size_t nceil = (c->count == 0) + ? 
DEF_CAPTURE_ID_CEIL + : 2*c->ceil; + unsigned *nids = f_realloc(env->alloc, c->ids, nceil * sizeof(nids[0])); + if (nids == NULL) { + env->ok = 0; + return 0; + } + c->ids = nids; + c->ceil = nceil; + } + + c->ids[c->count] = capture_id; + c->count++; + return 1; +} + +static int +collect_capture_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_capture *c) +{ + struct collect_capture_env env = { + .ok = 1, + .alloc = fsm->opt->alloc, + .c = c, + }; + fsm_capture_iter_active_for_end_state(fsm, s, + collect_capture_cb, &env); + + if (env.ok) { + if (c->ids == NULL) { + assert(c->count == 0); + } else { + qsort(c->ids, c->count, sizeof(c->ids[0]), cmp_unsigned); + } + } + + return env.ok; +} + +static int +collect_capture_program_ids_cb(fsm_state_t state, unsigned prog_id, + void *opaque) +{ + struct collect_capture_env *env = opaque; + struct end_metadata_program *p = env->p; + (void)state; + if (p->count == p->ceil) { + const size_t nceil = (p->count == 0) + ? DEF_CAPTURE_ID_CEIL + : 2*p->ceil; + unsigned *nids = f_realloc(env->alloc, p->ids, nceil * sizeof(nids[0])); + if (nids == NULL) { + env->ok = 0; + return 0; + } + p->ids = nids; + p->ceil = nceil; + } + + p->ids[p->count] = prog_id; + p->count++; + return 1; +} + +static int +collect_capture_program_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_program *p) +{ + struct collect_capture_env env = { + .ok = 1, + .alloc = fsm->opt->alloc, + .p = p, + }; + fsm_capture_iter_program_ids_for_end_state(fsm, s, + collect_capture_program_ids_cb, &env); + + if (env.ok) { + if (p->ids == NULL) { + assert(p->count == 0); + } else { + qsort(p->ids, p->count, sizeof(p->ids[0]), cmp_unsigned); + } + } + + return env.ok; +} + +#if EXPENSIVE_CHECKS static void check_done_ec_offset(const struct min_env *env) { @@ -661,13 +1119,34 @@ check_done_ec_offset(const struct min_env *env) * worth the added complexity to avoid checking ECs 0 and 1. 
*/ for (i = 0; i < env->ec_count; i++) { const fsm_state_t head = MASK_EC_HEAD(env->ecs[i]); - if (i >= done_ec_offset) { + if (i >= env->done_ec_offset) { assert(head == NO_ID || env->jump[head] == NO_ID); } else if (i >= 2) { assert(env->jump[head] != NO_ID); } } } + +static int +all_end_states_are_currently_together(const struct min_env *env) +{ + /* For each EC, either all or none of the states in it + * are end states. */ + for (size_t i = 0; i < env->ec_count; i++) { + const fsm_state_t head = MASK_EC_HEAD(env->ecs[i]); + const int ec_first_is_end = fsm_isend(env->fsm, head); + + fsm_state_t s = env->jump[head]; + while (s != NO_ID) { + if (fsm_isend(env->fsm, s) != ec_first_is_end) { + return 0; + } + s = env->jump[s]; + } + } + + return 1; +} #endif static int @@ -817,7 +1296,7 @@ try_partition(struct min_env *env, unsigned char label, const unsigned dead_state_ec = env->state_ecs[env->dead_state]; const struct fsm_state *states = env->fsm->states; -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS /* Count states here, to compare against the partitioned * EC' counts later. 
*/ size_t state_count = 0, psrc_count, pdst_count; @@ -857,7 +1336,7 @@ try_partition(struct min_env *env, unsigned char label, first_ec = dead_state_ec; } #if LOG_PARTITIONS > 1 - fprintf(stderr, "# --- try_partition: label '%c' -> EC %d\n", label, first_ec); + fprintf(stderr, "# --- try_partition: label '%c' -> first_ec %d\n", label, first_ec); #endif partition_counts[0] = 1; @@ -897,7 +1376,7 @@ try_partition(struct min_env *env, unsigned char label, partition_counts[0]++; prev = cur; cur = env->jump[cur]; - } else { /* unlink, split */ + } else if (to_ec != first_ec) { /* definitely different destination EC: unlink, split */ fsm_state_t next; #if LOG_PARTITIONS > 1 fprintf(stderr, "# try_partition: unlinking -- label '%c', src %u, dst %u, first_ec %d, cur %u -> to_ec %d\n", label, ec_src, ec_dst, first_ec, cur, to_ec); @@ -912,10 +1391,21 @@ try_partition(struct min_env *env, unsigned char label, env->ecs[ec_dst] = cur; cur = next; partition_counts[1]++; + } else { + /* Restrict the ones that will be marked as checked + * to the common subset before continuing, so that any + * other labels will still be checked in a later pass. */ + for (size_t i = 0; i < 4; i++) { + checked_labels[i] &= cur_label_set[i]; + } + + partition_counts[0]++; + prev = cur; + cur = env->jump[cur]; } } -#if EXPENSIVE_INTEGRITY_CHECKS +#if EXPENSIVE_CHECKS /* Count how many states were split into each EC * and check that the sum matches the original count. 
*/ psrc_count = 0; diff --git a/src/libfsm/mode.c b/src/libfsm/mode.c index 76c60b8ad..87af0bdf9 100644 --- a/src/libfsm/mode.c +++ b/src/libfsm/mode.c @@ -28,6 +28,7 @@ fsm_findmode(const struct fsm *fsm, fsm_state_t state, unsigned int *freq) } mode; mode.freq = 1; + mode.state = (fsm_state_t)-1; edge_set_group_iter_reset(fsm->states[state].edges, EDGE_GROUP_ITER_ALL, &iter); while (edge_set_group_iter_next(&iter, &info)) { @@ -46,6 +47,9 @@ fsm_findmode(const struct fsm *fsm, fsm_state_t state, unsigned int *freq) *freq = mode.freq; } + /* It's not meaningful to call this on a state without edges. */ + assert(mode.state != (fsm_state_t)-1); + assert(mode.freq >= 1); return mode.state; } diff --git a/src/libfsm/print/Makefile b/src/libfsm/print/Makefile index c2911318b..9c6f42bbf 100644 --- a/src/libfsm/print/Makefile +++ b/src/libfsm/print/Makefile @@ -20,6 +20,9 @@ SRC += src/libfsm/print/vmasm.c .for src in ${SRC:Msrc/libfsm/print/*.c} CFLAGS.${src} += -I src # XXX: for internal.h DFLAGS.${src} += -I src # XXX: for internal.h + +CFLAGS.${src} += -std=c99 +DFLAGS.${src} += -std=c99 .endfor .for src in ${SRC:Msrc/libfsm/print/*.c} diff --git a/src/libfsm/state.c b/src/libfsm/state.c index acf2bff25..c4425077a 100644 --- a/src/libfsm/state.c +++ b/src/libfsm/state.c @@ -17,6 +17,8 @@ #include #include "internal.h" +#include "capture.h" +#include "endids.h" int fsm_addstate(struct fsm *fsm, fsm_state_t *state) @@ -33,17 +35,12 @@ fsm_addstate(struct fsm *fsm, fsm_state_t *state) const size_t factor = 2; /* a guess */ const size_t n = fsm->statealloc * factor; struct fsm_state *tmp; - size_t i; tmp = f_realloc(fsm->opt->alloc, fsm->states, n * sizeof *fsm->states); if (tmp == NULL) { return 0; } - for (i = fsm->statealloc; i < n; i++) { - tmp[i].has_capture_actions = 0; - } - fsm->statealloc = n; fsm->states = tmp; } @@ -253,6 +250,18 @@ fsm_compact_states(struct fsm *fsm, } } + if (!fsm_endid_compact(fsm, mapping, orig_statecount)) { + goto error; + } + + if 
(!fsm_capture_id_compact(fsm, mapping, orig_statecount)) { + goto error; + } + + if (!fsm_capture_program_association_compact(fsm, mapping, orig_statecount)) { + goto error; + } + assert(dst == kept); assert(kept == fsm->statecount); @@ -278,4 +287,9 @@ fsm_compact_states(struct fsm *fsm, *removed = removed_count; } return 1; + +error: + f_free(fsm->opt->alloc, mapping); + + return 0; } diff --git a/src/libfsm/trim.c b/src/libfsm/trim.c index 6a9a25f09..36bf9145d 100644 --- a/src/libfsm/trim.c +++ b/src/libfsm/trim.c @@ -10,13 +10,16 @@ #include #include +#include #include #include +#include #include #include #include "internal.h" +#include "capture.h" #define DEF_EDGES_CEIL 8 #define DEF_ENDS_CEIL 8 @@ -42,17 +45,18 @@ save_edge(const struct fsm_alloc *alloc, size_t *count, size_t *ceil, struct edge **edges, fsm_state_t from, fsm_state_t to); -static int -cmp_edges_by_to(const void *pa, const void *pb) +static fsm_state_t +get_max_to(const struct edge *edges, size_t edge_count) { - const struct edge *a = (const struct edge *)pa; - const struct edge *b = (const struct edge *)pb; - - return a->to < b->to ? -1 - : a->to > b->to ? 1 - : a->from < b->from ? -1 - : a->from > b->from ? 1 - : 0; + size_t i; + fsm_state_t res = edges[0].to; + for (i = 1; i < edge_count; i++) { + const fsm_state_t to = edges[i].to; + if (to > res) { + res = to; + } + } + return res; } static int @@ -61,7 +65,7 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, { /* Use a queue to walk breath-first over all states reachable * from the start state. Note all end states. Collect all the - * edges, then sort them by the note they lead to, to convert it + * edges, then sort them by the node they lead to, to convert it * to a reverse edge index. 
Then, enqueue all the end states, * and again use the queue to walk the graph breadth-first, but * this time iterating bottom-up from the end states, and mark @@ -81,10 +85,13 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, fsm_state_t max_end; const size_t state_count = fsm->statecount; + fsm_state_t max_to; + unsigned *pv = NULL; size_t *offsets = NULL; + INIT_TIMERS(); - if (!fsm_getstart(fsm, &start)) { + if (!fsm_getstart(fsm, &start) || state_count == 0) { return 1; /* nothing is reachable */ } @@ -225,7 +232,33 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, } /* Sort edges by state they lead to, inverting the index. */ - qsort(edges, edge_count, sizeof(edges[0]), cmp_edges_by_to); + max_to = edge_count == 0 ? 0 : get_max_to(edges, edge_count); +#if LOG_TRIM + fprintf(stderr, " -- edge count %zu, got max_to %u\n", edge_count, max_to); +#endif + TIME(&pre); + pv = permutation_vector_with_size_and_offset(fsm->opt->alloc, + edge_count, max_to, edges, sizeof(edges[0]), offsetof(struct edge, to)); + TIME(&post); + DIFF_MSEC("trim_pv_so", pre, post, NULL); + + if (EXPENSIVE_CHECKS) { + size_t i; + int ok = 1; +#if LOG_TRIM + fprintf(stderr, "\n#i\tedge\tpv\tsorted, max_to %u\n", max_to); +#endif + for (i = 0; i < edge_count; i++) { +#if LOG_TRIM + fprintf(stderr, "%zu\t%u\t%u\t%u\n", + i, edges[i].to, pv[i], edges[pv[i]].to); +#endif + if (i > 0 && edges[pv[i]].to < edges[pv[i - 1]].to) { + ok = 0; + } + } + assert(ok); + } max_end = 0; @@ -282,7 +315,6 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, * offsets[i - 1], to represent zero entries. 
*/ { size_t i; - const fsm_state_t max_to = edges[edge_count - 1].to; const size_t offset_count = fsm_countstates(fsm); offsets = f_calloc(fsm->opt->alloc, @@ -292,7 +324,7 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, } for (i = 0; i < edge_count; i++) { - const fsm_state_t to = edges[i].to; + const fsm_state_t to = edges[pv[i]].to; offsets[to] = i + 1; } @@ -312,8 +344,8 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, if (LOG_TRIM > 1) { size_t i; for (i = 0; i < edge_count; i++) { - fprintf(stderr, "mark_states: edges[%zu]: %d -> %d\n", - i, edges[i].from, edges[i].to); + fprintf(stderr, "mark_states: edges[pv[%zu]]: %d -> %d\n", + i, edges[pv[i]].from, edges[pv[i]].to); } } @@ -331,13 +363,13 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, } for (e_i = base; e_i < limit; e_i++) { - const fsm_state_t from = edges[e_i].from; + const fsm_state_t from = edges[pv[e_i]].from; const unsigned end_distance = (sed == NULL ? 0 : sed[s_id]); assert(from < state_count); if (LOG_TRIM > 0) { - fprintf(stderr, "mark_states: edges[%zu]: from: %d, visited? %d\n", + fprintf(stderr, "mark_states: edges[pv[%zu]]: from: %d, visited? 
%d\n", e_i, from, fsm->states[from].visited); } @@ -370,6 +402,7 @@ mark_states(struct fsm *fsm, enum fsm_trim_mode mode, if (ends != NULL) { f_free(fsm->opt->alloc, ends); } if (offsets != NULL) { f_free(fsm->opt->alloc, offsets); } if (q != NULL) { queue_free(q); } + if (pv != NULL) { f_free(fsm->opt->alloc, pv); } return res; } @@ -457,7 +490,7 @@ integrity_check(const char *descr, const struct fsm *fsm) struct edge_iter edge_iter; struct fsm_edge e; -#ifdef NDEBUG +#if defined(NDEBUG) || !EXPENSIVE_CHECKS return; #endif @@ -484,8 +517,14 @@ integrity_check(const char *descr, const struct fsm *fsm) } } + fsm_capture_integrity_check(fsm); + if (LOG_TRIM > 1) { fprintf(stderr, "integrity check: %s...PASS\n", descr); + if (LOG_TRIM > 2) { + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, "post_trim", fsm); + } } } @@ -504,6 +543,8 @@ fsm_trim(struct fsm *fsm, enum fsm_trim_mode mode, return 1; } + integrity_check("pre", fsm); + if (shortest_end_distance != NULL && mode == FSM_TRIM_START_AND_END_REACHABLE) { size_t s_i; diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 60e1fbaff..736292a8b 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -32,6 +32,7 @@ fsm_union(struct fsm *a, struct fsm *b, if (combine_info == NULL) { combine_info = &combine_info_internal; } + memset(combine_info, 0x00, sizeof(*combine_info)); memset(combine_info, 0x00, sizeof(*combine_info)); @@ -105,6 +106,7 @@ fsm_union_array(size_t fsm_count, for (i = 1; i < fsm_count; i++) { struct fsm_combine_info ci; + struct fsm *combined = fsm_union(res, fsms[i], &ci); fsms[i] = NULL; if (combined == NULL) { diff --git a/src/libfsm/vm/v1.c b/src/libfsm/vm/v1.c index a326b88d8..de1f6ea93 100644 --- a/src/libfsm/vm/v1.c +++ b/src/libfsm/vm/v1.c @@ -217,7 +217,9 @@ encode_opasm_v1(const struct dfavm_vm_op *instr, size_t ninstr, size_t total_byt return ret; error: - /* XXX - cleanup */ + if (ret != NULL) { + free(ret); + } return NULL; } diff --git a/src/libfsm/vm/v2.c 
b/src/libfsm/vm/v2.c index c85edff98..07eb12ef4 100644 --- a/src/libfsm/vm/v2.c +++ b/src/libfsm/vm/v2.c @@ -155,7 +155,10 @@ encode_opasm_v2(const struct dfavm_vm_op *instr, size_t ninstr) return ret; error: - /* XXX - cleanup */ + if (ret != NULL) { + free(ret); + } + return NULL; } diff --git a/src/libre/Makefile b/src/libre/Makefile index a88a92418..508b77a76 100644 --- a/src/libre/Makefile +++ b/src/libre/Makefile @@ -10,13 +10,17 @@ SRC += src/libre/ast_new_from_fsm.c SRC += src/libre/ast_rewrite.c SRC += src/libre/ac.c SRC += src/libre/re_strings.c +SRC += src/libre/re_capvm_compile.c # generated SRC += src/libre/class_name.c -.for src in ${SRC:Msrc/libre/ast_compile.c} +.for src in ${SRC:Msrc/libre/ast*.c} ${SRC:Msrc/libre/re*.c} CFLAGS.${src} += -I src # XXX: for internal.h DFLAGS.${src} += -I src # XXX: for internal.h + +CFLAGS.${src} += -std=c99 +DFLAGS.${src} += -std=c99 .endfor LIB += libre diff --git a/src/libre/ast.c b/src/libre/ast.c index 2e1d21817..6bd6063f5 100644 --- a/src/libre/ast.c +++ b/src/libre/ast.c @@ -494,6 +494,7 @@ ast_expr_cmp(const struct ast_expr *a, const struct ast_expr *b) case AST_EXPR_GROUP: if (a->u.group.id < b->u.group.id) { return -1; } if (a->u.group.id > b->u.group.id) { return +1; } + /* .repeated flag is ignored here */ return ast_expr_cmp(a->u.group.e, b->u.group.e); @@ -753,6 +754,7 @@ ast_make_expr_group(struct ast_expr_pool **poolp, enum re_flags re_flags, struct res->re_flags = re_flags; res->u.group.e = e; res->u.group.id = id; + res->u.group.repeated = 0; /* may be set during analysis */ return res; } @@ -770,6 +772,7 @@ ast_make_expr_anchor(struct ast_expr_pool **poolp, enum re_flags re_flags, enum res->type = AST_EXPR_ANCHOR; res->re_flags = re_flags; res->u.anchor.type = type; + res->u.anchor.is_end_nl = 0; /* may be set later */ return res; } diff --git a/src/libre/ast.h b/src/libre/ast.h index 233744847..3ef0c1f5f 100644 --- a/src/libre/ast.h +++ b/src/libre/ast.h @@ -7,6 +7,11 @@ #ifndef RE_AST_H #define 
RE_AST_H +#include +#include +#include +#include + /* * This is a duplicate of struct lx_pos, but since we're linking to * code with several distinct lexers, there isn't a clear lexer.h @@ -62,7 +67,9 @@ enum ast_anchor_type { * followed by nullable nodes. * * - AST_FLAG_UNSATISFIABLE - * The node caused the regex to become unsatisfiable. + * The node is unsatisfiable (can never match anything). + * This can cause AST subtrees to be pruned, or for the + * entire regex to become unsatisfiable. * * - AST_FLAG_NULLABLE * The node is not always evaluated, such as nodes that @@ -159,6 +166,8 @@ struct ast_expr { size_t count; /* used */ size_t alloc; /* allocated */ struct ast_expr **n; + int contains_empty_groups; + int nullable_alt_inside_plus_repeat; } alt; struct { @@ -172,12 +181,14 @@ struct ast_expr { struct ast_expr_repeat { struct ast_expr *e; unsigned min; - unsigned max; + unsigned max; /* can be AST_COUNT_UNBOUNDED */ + int contains_empty_groups; } repeat; struct { struct ast_expr *e; unsigned id; + int repeated; /* set during analysis */ } group; struct { @@ -235,9 +246,12 @@ ast_pool_free(struct ast_expr_pool *pool); struct ast_expr_pool * ast_expr_pool_save(void); +#define AST_NO_MAX_CAPTURE_ID ((long)-1) + struct ast { struct ast_expr_pool *pool; struct ast_expr *expr; + long max_capture_id; int has_unanchored_start; int has_unanchored_end; }; diff --git a/src/libre/ast_analysis.c b/src/libre/ast_analysis.c index 3298f62c2..df9e8ce54 100644 --- a/src/libre/ast_analysis.c +++ b/src/libre/ast_analysis.c @@ -21,7 +21,7 @@ #define LOG_ANALYSIS 0 #define LOG_FIRST_ANALYSIS (0 + LOG_ANALYSIS) -#define LOG_REPEATED_GROUPS (0 + LOG_ANALYSIS) +#define LOG_REPETITION_CASES (0 + LOG_ANALYSIS) #define LOG_FORKING (0 + LOG_ANALYSIS) #define LOG_ANCHORING (0 + LOG_ANALYSIS) #define LOG_CONCAT_FLAGS (0 + LOG_ANALYSIS) @@ -37,6 +37,11 @@ /* Mask for end-anchor flags */ #define END_ANCHOR_FLAG_MASK (AST_FLAG_ANCHORED_END | AST_FLAG_END_NL) +struct capture_env { + long 
max_capture_id; + int use_captures; +}; + static int is_nullable(const struct ast_expr *n) { @@ -239,7 +244,7 @@ analysis_iter(struct ast_expr *n) } case AST_EXPR_ANCHOR: - /* anchor flags will be handled on the second pass */ + /* anchor flags will be handled on the next pass */ break; case AST_EXPR_SUBTRACT: @@ -267,23 +272,280 @@ analysis_iter(struct ast_expr *n) return AST_ANALYSIS_OK; } -/* Analysis for unanchored starts/ends uses three values, because some - * details decide the whole subtree is/isn't, others should defer to - * analysis elsewhere it the tree, but an overall result of undecided - * still decides yes. */ -enum unanchored_analysis_res { - UA_NO = 'N', - UA_YES = 'Y', - UA_UNDECIDED = 'U', +static int +is_only_anchors(struct ast_expr *expr) +{ + if (can_consume_input(expr)) { return 0; } + + switch (expr->type) { + case AST_EXPR_ANCHOR: + return 1; + + case AST_EXPR_CONCAT: + if (expr->u.concat.count == 0) { return 0; } + for (size_t i = 0; i < expr->u.concat.count; i++) { + if (!is_only_anchors(expr->u.concat.n[i]) + && can_consume_input(expr->u.concat.n[i])) { + return 0; + } + } + return 1; + + case AST_EXPR_ALT: + assert(expr->u.alt.count > 0); + for (size_t i = 0; i < expr->u.alt.count; i++) { + /* earlier matches will shadow later ones */ + if (is_only_anchors(expr->u.alt.n[i])) { + return 1; + } + } + return 0; + + case AST_EXPR_REPEAT: + if (expr->u.repeat.min == 0 && expr->u.repeat.max == 0) { + return 0; + } + return is_only_anchors(expr->u.repeat.e); + + case AST_EXPR_GROUP: + return is_only_anchors(expr->u.group.e); + + default: + break; + } + + return 0; +} + +static enum ast_analysis_res +analysis_iter_repetition(struct ast_expr *n, struct ast_expr *outermost_repeat_parent, + int shadowed_by_previous_alt_case, struct ast_expr *repeat_plus_ancestor) +{ + enum ast_analysis_res res = AST_ANALYSIS_OK; + + LOG(3 - LOG_REPETITION_CASES, "%s: node %p, type %s, shadowed_by_previous_alt_case %d\n", + __func__, (void *)n, 
ast_node_type_name(n->type), shadowed_by_previous_alt_case); + + if (shadowed_by_previous_alt_case) { + assert(outermost_repeat_parent == NULL + || outermost_repeat_parent->type == AST_EXPR_ALT); + } + + switch (n->type) { + case AST_EXPR_EMPTY: + case AST_EXPR_TOMBSTONE: + case AST_EXPR_ANCHOR: + case AST_EXPR_LITERAL: + case AST_EXPR_CODEPOINT: + case AST_EXPR_RANGE: + break; + + case AST_EXPR_CONCAT: { + /* If this CONCAT array of nodes always consumes input, then + * it cannot be repeated empty, so it cannot produce the + * special case that needs outermost_repeat_parent for + * AST_EXPR_REPEAT's case below. + * + * An example input that needs this is 'x(()x)*' for "xx", + * because the 'x' prevents the outermost group + * from repeating and matching empty again after consuming + * a run of "x"s. */ + if (always_consumes_input(n)) { + outermost_repeat_parent = NULL; + } + + for (size_t i = 0; i < n->u.concat.count; i++) { + res = analysis_iter_repetition(n->u.concat.n[i], outermost_repeat_parent, + shadowed_by_previous_alt_case, repeat_plus_ancestor); + if (res != AST_ANALYSIS_OK) { return res; } + } + break; + } + + case AST_EXPR_ALT: { + /* See AST_EXPR_GROUP below for why this matters. */ + int new_shadowed_by_previous_alt_case = shadowed_by_previous_alt_case; + + /* FIXME: check nesting of this construct */ + + LOG(3 - LOG_REPETITION_CASES, + "%s: ALT node %p, repeat_plus_ancestor %p\n", + __func__, (void *)n, (void *)repeat_plus_ancestor); + + for (size_t i = 0; i < n->u.alt.count; i++) { + /* If this is an ALT inside of a repeated subtree that contains + * a capture, this will need special handling. 
*/ + if (outermost_repeat_parent != NULL) { + LOG(3 - LOG_REPETITION_CASES, + "%s: setting outermost_repeat_parent to %p for alt branch %zu, repeat_plus_ancestor %p\n", + __func__, (void *)n, i, (void *)repeat_plus_ancestor); + outermost_repeat_parent = n; + } + + if (is_nullable(n->u.alt.n[i]) || is_only_anchors(n->u.alt.n[i])) { + LOG(3 - LOG_REPETITION_CASES, + "%s: setting new_shadowed_by_previous_alt_case for alt branch %zu, repeat_plus_ancestor %p\n", + __func__, i, (void *)repeat_plus_ancestor); + new_shadowed_by_previous_alt_case = 1; + if (repeat_plus_ancestor != NULL) { + n->u.alt.nullable_alt_inside_plus_repeat = 1; + assert(repeat_plus_ancestor->type == AST_EXPR_REPEAT); + assert(repeat_plus_ancestor->u.repeat.min == 1); + assert(repeat_plus_ancestor->u.repeat.max == AST_COUNT_UNBOUNDED); + + /* Repetition of an alt subtree which has a capture group child that + * only contains only* anchors is not handled properly yet. This + * isn't actually _useful_, it's just something that comes up + * in fuzzing, so reject it as an unsupported PCRE construct. + * + * An example input that triggers this is '^(($)|)+$' . */ + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } + } + + res = analysis_iter_repetition(n->u.alt.n[i], + outermost_repeat_parent, + new_shadowed_by_previous_alt_case, + repeat_plus_ancestor); + if (res != AST_ANALYSIS_OK) { return res; } + } + break; + } + + case AST_EXPR_REPEAT: + { + struct ast_expr *child = n->u.repeat.e; + + LOG(3 - LOG_REPETITION_CASES, "%s: REPEAT node %p, min %u max %u nullable? 
%d, !cci %d\n", + __func__, (void *)n, n->u.repeat.min, n->u.repeat.max, + is_nullable(child), can_consume_input(child)); + + if (n->u.repeat.min == 1 && + n->u.repeat.max == AST_COUNT_UNBOUNDED) { + LOG(3 - LOG_REPETITION_CASES, "%s: setting repeat_plus_ancestor to %p\n", + __func__, (void *)n); + repeat_plus_ancestor = n; + } else { + repeat_plus_ancestor = NULL; + } + + /* Special cases for a repeated group that contains possibly empty captures, + * in order to correctly reflect their repeating one more time and capture + * at the end (but without an infinite loop). + * + * For example, '^((x?))*$' will always end up with capture groups 1 and 2 + * at the end of the input for any number of "x"s, since the outermost ()* + * can always repeat once more time, consuming nothing, and clobber the + * existing captures. We mark repeated groups so that the compiled capture + * program can move saving the captures after the repetition, instead + * behaving like `^((?:x?)*(())$`. + * + * However, if the repeated subtree always consumes input, such as with + * '^(()a)+b$', then clear any passed in outermost_repeat_parent, because + * having to consume input will prevent that extra repetition of the + * empty captures. 
*/ + if (always_consumes_input(n)) { + res = analysis_iter_repetition(child, NULL, shadowed_by_previous_alt_case, + repeat_plus_ancestor); + } else if (outermost_repeat_parent == NULL && n->u.repeat.max > 1) { + LOG(3 - LOG_REPETITION_CASES, "%s: recursing with outermost_repeat_parent set to %p\n", + __func__, (void *)n); + res = analysis_iter_repetition(child, n, 0, + repeat_plus_ancestor); + } else { + LOG(3 - LOG_REPETITION_CASES, "%s: recursing with outermost_repeat_parent %p\n", + __func__, (void *)outermost_repeat_parent); + res = analysis_iter_repetition(child, outermost_repeat_parent, shadowed_by_previous_alt_case, + repeat_plus_ancestor); + } + if (res != AST_ANALYSIS_OK) { return res; } + break; + } + + case AST_EXPR_GROUP: + LOG(3 - LOG_REPETITION_CASES, + "%s: GROUP %p, repeat_plus_ancestor %p\n", + __func__, (void *)n, (void *)repeat_plus_ancestor); + + + if (outermost_repeat_parent != NULL && (is_nullable(n) || !can_consume_input(n))) { + int should_mark_repeated = 1; + /* If the outermost_repeat_parent is an ALT node and a previous ALT subtree + * matching the empty string is shadowing this group, then do not mark it + * as repeated, because that can lead to incorrect handling in somewhat + * contrived regexes like '^(?:|(|x))*$'. 
*/ + if (outermost_repeat_parent->type == AST_EXPR_ALT && shadowed_by_previous_alt_case) { + LOG(3 - LOG_REPETITION_CASES, + "%s: hit group shadowed_by_previous_alt_case, skipping\n", __func__); + should_mark_repeated = 0; + } + + if (n->flags & (AST_FLAG_ANCHORED_START | AST_FLAG_ANCHORED_END)) { + LOG(3 - LOG_REPETITION_CASES, + "%s: hit repeating anchor, skipping\n", __func__); + should_mark_repeated = 0; + } + + if (should_mark_repeated) { + LOG(3 - LOG_REPETITION_CASES, "%s: setting group %u to repeated\n", + __func__, n->u.group.id); + n->u.group.repeated = 1; + assert(outermost_repeat_parent->type == AST_EXPR_REPEAT || + outermost_repeat_parent->type == AST_EXPR_ALT); + LOG(3 - LOG_REPETITION_CASES, "%s: setting contains_empty_groups on outermost_repeat_parent %p\n", + __func__, (void *)outermost_repeat_parent); + if (outermost_repeat_parent->type == AST_EXPR_REPEAT) { + outermost_repeat_parent->u.repeat.contains_empty_groups = 1; + } else if (outermost_repeat_parent->type == AST_EXPR_ALT) { + outermost_repeat_parent->u.alt.contains_empty_groups = 1; + } else { + assert(!"type mismatch"); + } + } + } + + if (repeat_plus_ancestor != NULL && (is_nullable(n) || !can_consume_input(n))) { + assert(repeat_plus_ancestor->type == AST_EXPR_REPEAT + && repeat_plus_ancestor->u.repeat.min == 1 + && repeat_plus_ancestor->u.repeat.max == AST_COUNT_UNBOUNDED); + LOG(3 - LOG_REPETITION_CASES, + "%s: not yet implemented, skipping\n", __func__); + /* return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; */ + } + + res = analysis_iter_repetition(n->u.group.e, outermost_repeat_parent, + shadowed_by_previous_alt_case, repeat_plus_ancestor); + if (res != AST_ANALYSIS_OK) { return res; } + break; + + case AST_EXPR_SUBTRACT: + res = analysis_iter_repetition(n->u.subtract.a, outermost_repeat_parent, shadowed_by_previous_alt_case, + repeat_plus_ancestor); + if (res != AST_ANALYSIS_OK) { return res; } + res = analysis_iter_repetition(n->u.subtract.b, outermost_repeat_parent, 
shadowed_by_previous_alt_case, + repeat_plus_ancestor); + break; + + default: + assert(!"unreached"); + } + return res; +} + +enum anchoring_analysis_res { + ANCHOR_ANALYSIS_ANCHORED = 'a', + ANCHOR_ANALYSIS_UNANCHORED = 'u', + ANCHOR_ANALYSIS_UNDECIDED = '_', }; -static enum unanchored_analysis_res +static enum anchoring_analysis_res analysis_iter_unanchored_start(const struct ast_expr *n) { if (n->flags & AST_FLAG_ANCHORED_START) { LOG(4 - LOG_UNANCHORED_FLAGS, "%s: n (%p)->flags & AST_FLAG_ANCHORED_START -> N\n", __func__, (void *)n); - return UA_NO; + return ANCHOR_ANALYSIS_ANCHORED; } LOG(4 - LOG_UNANCHORED_FLAGS, "%s: node %p, type %s\n", @@ -293,7 +555,7 @@ analysis_iter_unanchored_start(const struct ast_expr *n) case AST_EXPR_EMPTY: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> U\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; case AST_EXPR_TOMBSTONE: break; @@ -302,12 +564,12 @@ analysis_iter_unanchored_start(const struct ast_expr *n) switch (n->u.anchor.type) { case AST_ANCHOR_START: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ^ anchor -> N\n", __func__); - return UA_NO; + return ANCHOR_ANALYSIS_ANCHORED; case AST_ANCHOR_END: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: $ anchor -> U\n", __func__); /* will be handled by other cases */ - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; default: assert(!"unreached"); @@ -323,7 +585,7 @@ analysis_iter_unanchored_start(const struct ast_expr *n) case AST_EXPR_RANGE: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> Y\n", __func__, ast_node_type_name(n->type)); - return UA_YES; + return ANCHOR_ANALYSIS_UNANCHORED; case AST_EXPR_CONCAT: { size_t i; @@ -331,8 +593,8 @@ analysis_iter_unanchored_start(const struct ast_expr *n) /* has unanchored start if first non-nullable child does */ for (i = 0; i < n->u.concat.count; i++) { const struct ast_expr *child = n->u.concat.n[i]; - const enum unanchored_analysis_res child_res = analysis_iter_unanchored_start(child); - if (child_res != 
UA_UNDECIDED) { + const enum anchoring_analysis_res child_res = analysis_iter_unanchored_start(child); + if (child_res != ANCHOR_ANALYSIS_UNDECIDED) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> %c (child res)\n", __func__, ast_node_type_name(n->type), child_res); return child_res; @@ -350,19 +612,19 @@ analysis_iter_unanchored_start(const struct ast_expr *n) size_t i; /* if all children agree, return that result, otherwise undecided */ - const enum unanchored_analysis_res first_child_res = analysis_iter_unanchored_start(n->u.alt.n[0]); + const enum anchoring_analysis_res first_child_res = analysis_iter_unanchored_start(n->u.alt.n[0]); LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ALT child 0 -- %s -> %c (child res)\n", __func__, ast_node_type_name(n->type), first_child_res); for (i = 1; i < n->u.alt.count; i++) { const struct ast_expr *child = n->u.alt.n[i]; - const enum unanchored_analysis_res child_res = analysis_iter_unanchored_start(child); + const enum anchoring_analysis_res child_res = analysis_iter_unanchored_start(child); LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ALT child %zd -- %s -> %c (child res)\n", __func__, i, ast_node_type_name(n->type), child_res); if (child_res != first_child_res) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> %c (child result) contracts first, returning U\n", __func__, ast_node_type_name(n->type), child_res); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } } @@ -375,7 +637,7 @@ analysis_iter_unanchored_start(const struct ast_expr *n) if (n->u.repeat.min == 0) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> U (repeat.min == 0)\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } return analysis_iter_unanchored_start(n->u.repeat.e); @@ -389,16 +651,16 @@ analysis_iter_unanchored_start(const struct ast_expr *n) assert(!"unreached"); } - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } -static enum unanchored_analysis_res +static enum anchoring_analysis_res 
analysis_iter_unanchored_end(const struct ast_expr *n) { if (n->flags & AST_FLAG_ANCHORED_END) { LOG(4 - LOG_UNANCHORED_FLAGS, "%s: node (%p)->flags & AST_FLAG_ANCHORED_END -> N\n", __func__, (void *)n); - return UA_NO; + return ANCHOR_ANALYSIS_ANCHORED; } LOG(4 - LOG_UNANCHORED_FLAGS, "%s: node %p, type %s\n", @@ -408,7 +670,7 @@ analysis_iter_unanchored_end(const struct ast_expr *n) case AST_EXPR_EMPTY: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> U\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; case AST_EXPR_TOMBSTONE: break; @@ -418,12 +680,12 @@ analysis_iter_unanchored_end(const struct ast_expr *n) case AST_ANCHOR_START: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ^ %s -> U\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; case AST_ANCHOR_END: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: $ %s -> N\n", __func__, ast_node_type_name(n->type)); - return UA_NO; + return ANCHOR_ANALYSIS_ANCHORED; default: assert(!"unreached"); @@ -439,7 +701,7 @@ analysis_iter_unanchored_end(const struct ast_expr *n) case AST_EXPR_RANGE: LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> Y\n", __func__, ast_node_type_name(n->type)); - return UA_YES; + return ANCHOR_ANALYSIS_UNANCHORED; case AST_EXPR_CONCAT: { size_t i; @@ -447,8 +709,8 @@ analysis_iter_unanchored_end(const struct ast_expr *n) /* has unanchored end if last non-nullable child does */ for (i = n->u.concat.count; i > 0; i--) { const struct ast_expr *child = n->u.concat.n[i - 1]; - const enum unanchored_analysis_res child_res = analysis_iter_unanchored_end(child); - if (child_res != UA_UNDECIDED) { + const enum anchoring_analysis_res child_res = analysis_iter_unanchored_end(child); + if (child_res != ANCHOR_ANALYSIS_UNDECIDED) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> %c (child res)\n", __func__, ast_node_type_name(n->type), child_res); return child_res; @@ -466,19 +728,19 @@ analysis_iter_unanchored_end(const struct ast_expr *n) size_t i; /* 
if all children agree, return that result, otherwise undecided */ - const enum unanchored_analysis_res first_child_res = analysis_iter_unanchored_end(n->u.alt.n[0]); + const enum anchoring_analysis_res first_child_res = analysis_iter_unanchored_end(n->u.alt.n[0]); LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ALT child 0 -- %s -> %c (child res)\n", __func__, ast_node_type_name(n->type), first_child_res); for (i = 1; i < n->u.alt.count; i++) { const struct ast_expr *child = n->u.alt.n[i]; - const enum unanchored_analysis_res child_res = analysis_iter_unanchored_end(child); + const enum anchoring_analysis_res child_res = analysis_iter_unanchored_end(child); LOG(3 - LOG_UNANCHORED_FLAGS, "%s: ALT child %zd -- %s -> %c (child res)\n", __func__, i, ast_node_type_name(n->type), child_res); if (child_res != first_child_res) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> %c (child result) contracts first, returning U\n", __func__, ast_node_type_name(n->type), child_res); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } } @@ -491,7 +753,7 @@ analysis_iter_unanchored_end(const struct ast_expr *n) if (n->u.repeat.min == 0) { LOG(3 - LOG_UNANCHORED_FLAGS, "%s: %s -> U (repeat.min == 0)\n", __func__, ast_node_type_name(n->type)); - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } return analysis_iter_unanchored_end(n->u.repeat.e); @@ -505,7 +767,7 @@ analysis_iter_unanchored_end(const struct ast_expr *n) assert(!"unreached"); } - return UA_UNDECIDED; + return ANCHOR_ANALYSIS_UNDECIDED; } static void @@ -563,9 +825,68 @@ struct anchoring_env { /* Corresponding flag for end anchors while sweeping backward. */ int followed_by_consuming; + /* Special case for detecting '$[^a]', which matches "\n" with + * a capture group 0 of (0,1) in PCRE. */ + int followed_by_consuming_newline; + + /* Flag for tracking whether we're in a part of the subtree that + * is always before a start anchor. This influences satisfiability + * and edge cases like '()*^'. 
*/ int before_start_anchor; + + /* Flag used to detect and reject the awkward case in '$[^a]', + * where (according to PCRE) the character class after the '$' + * should match the literal newline, but nothing else, and only + * once. Because $ is actually a zero-width assertion that + * execution is either at the end of input or a trailing + * newline, it has the rather surprising result that '$[^a]' + * will not match "x" but *will* match "x\n" (because it has a $ + * before a trailing newline, and because the newline matches + * the non-skippable [^a]). We just return an unsupported + * error for this case. */ + enum newline_after_end_anchor_state { + NAEAS_NONE, + NAEAS_WOULD_MATCH_ONCE, + } newline_after_end_anchor_state; + + int after_end_anchor; }; +/* Does the subtree match a literal '\n'? */ +static int +matches_newline(const struct ast_expr *n) +{ + switch (n->type) { + case AST_EXPR_LITERAL: + return n->u.literal.c == '\n'; + + case AST_EXPR_SUBTRACT: + return matches_newline(n->u.subtract.a) + && !matches_newline(n->u.subtract.b); + + case AST_EXPR_RANGE: + { + const struct ast_endpoint *f = &n->u.range.from; + const struct ast_endpoint *t = &n->u.range.to; + if (f->type != AST_ENDPOINT_LITERAL + || t->type != AST_ENDPOINT_LITERAL) { + /* not implemented */ + LOG(1 - LOG_ANCHORING, "%s: not implemented\n", __func__); + return 0; + } + + const int res = f->u.literal.c <= '\n' && t->u.literal.c >= '\n'; + LOG(1 - LOG_ANCHORING, "%s: RANGE res %d\n", __func__, res); + return res; + } + + default: + break; + } + + return 0; +} + /* Tree walker that analyzes the AST, marks which nodes and subtrees are * anchored at the start and/or end, and determines which subtrees are * unsatisfiable due to start anchoring. 
@@ -600,7 +921,7 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) case AST_ANCHOR_START: /* * If it's not possible to get here without consuming - * any input and there's a start anchor, the regex is + * any input and there's a start anchor, the subtree is * inherently unsatisfiable. */ set_flags(n, AST_FLAG_ANCHORED_START); @@ -630,6 +951,9 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) set_flags(n, AST_FLAG_ANCHORED_END); if (n->u.anchor.is_end_nl && !(env->re_flags & RE_ANCHORED)) { set_flags(n, AST_FLAG_END_NL); + if (env->newline_after_end_anchor_state == NAEAS_NONE) { + env->newline_after_end_anchor_state = NAEAS_WOULD_MATCH_ONCE; + } } break; @@ -642,6 +966,15 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) * These are the types that actually consume input. */ case AST_EXPR_LITERAL: + if (n->u.literal.c == '\n' && + (env->newline_after_end_anchor_state == NAEAS_WOULD_MATCH_ONCE)) { + LOG(3 - LOG_ANCHORING, + "%s: LITERAL: rejecting non-optional newline match after $ as unsupported\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } + break; case AST_EXPR_CODEPOINT: case AST_EXPR_RANGE: break; /* handled outside switch/case */ @@ -673,6 +1006,13 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) if (res != AST_ANALYSIS_OK && res != AST_ANALYSIS_UNSATISFIABLE) { /* unsat is handled below */ + if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + + /* FIXME: check this */ + assert(child->flags & AST_FLAG_UNSATISFIABLE); + + set_flags(n, AST_FLAG_UNSATISFIABLE); + } return res; } @@ -703,12 +1043,11 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) env->past_always_consuming = 1; } + env->newline_after_end_anchor_state = child_env.newline_after_end_anchor_state; } /* flow ANCHORED_START and ANCHORED_END flags upward */ { - int after_always_consumes = 0; - for (i = 0; i < 
n->u.concat.count; i++) { struct ast_expr *child = n->u.concat.n[i]; if (child->flags & AST_FLAG_ANCHORED_START) { @@ -716,20 +1055,11 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) "%s: setting ANCHORED_START due to child %zu (%p)'s analysis\n", __func__, i, (void *)child); set_flags(n, AST_FLAG_ANCHORED_START); - - if (after_always_consumes) { - LOG(3 - LOG_ANCHORING, - "%s: setting %p and child %p UNSATISFIABLE due to ^-anchored child that always consumes input\n", - __func__, (void *)n, (void *)child); - set_flags(n, AST_FLAG_UNSATISFIABLE); - set_flags(child, AST_FLAG_UNSATISFIABLE); - } } if (always_consumes_input(child)) { LOG(3 - LOG_ANCHORING, "%s: child %zd always consumes input\n", __func__, i); - after_always_consumes = 1; } } } @@ -786,16 +1116,21 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) for (i = 0; i < n->u.concat.count; i++) { struct ast_expr *child = n->u.concat.n[i]; if (after_end_anchor) { - if (always_consumes_input(child)) { + if (child->type == AST_EXPR_REPEAT + && (child->flags & AST_FLAG_UNSATISFIABLE) + && child->u.repeat.min == 0) { LOG(3 - LOG_ANCHORING, - "%s: after_end_anchor & ALWAYS_CONSUMES on child %p -> UNSATISFIABLE\n", - __func__, (void *)child); - set_flags(child, AST_FLAG_UNSATISFIABLE); + "%s: setting unsatisfiable {0,*} repeat after $ anchor to {0,0}\n", + __func__); + child->u.repeat.max = 0; } if (child->type == AST_EXPR_REPEAT - && (child->flags & AST_FLAG_UNSATISFIABLE) + && (child->flags & AST_FLAG_END_NL) && child->u.repeat.min == 0) { + LOG(3 - LOG_ANCHORING, + "%s: setting {0,*} repeat subtree matching \n after $ anchor to {0,0}\n", + __func__); child->u.repeat.max = 0; } } else if (!after_end_anchor @@ -846,6 +1181,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) } else if (res == AST_ANALYSIS_OK) { all_set_past_always_consuming &= child_env.past_always_consuming; any_sat = 1; + } else if (res == 
AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE + || res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + assert(child->flags & AST_FLAG_UNSATISFIABLE); + continue; } else { return res; } @@ -883,7 +1222,7 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) if (all_end_anchored) { LOG(3 - LOG_ANCHORING, "%s: ALT: all_end_anchored -> ANCHORED_END\n", __func__); - /* FIXME: AST_FLAG_END_NL: need to determine how this interacts + /* TODO: AST_FLAG_END_NL: need to determine how this interacts * with alt nodes. `^(?:(a)\z|(a)$)` */ set_flags(n, AST_FLAG_ANCHORED_END); } @@ -925,15 +1264,26 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) return res; } - if (n->u.repeat.e->flags & AST_FLAG_ANCHORED_END && n->u.repeat.min > 0) { - /* FIXME: if repeating something that is always - * anchored at the end, repeat.max could be - * capped at 1, but I have not yet found any - * inputs where that change is necessary to - * produce a correct result. */ - LOG(3 - LOG_ANCHORING, - "%s: REPEAT: repeating ANCHORED_END subtree >0 times -> ANCHORED_END\n", __func__); - set_flags(n, n->u.repeat.e->flags & END_ANCHOR_FLAG_MASK); + /* If the child subtree is anchored at the start and/or end, then this + * node can be repeated at most once. 
 */ + const int child_is_anchored = n->u.repeat.e->flags & (AST_FLAG_ANCHORED_START | AST_FLAG_ANCHORED_END); + if (child_is_anchored) { + if (n->u.repeat.min >= 1) { + LOG(3 - LOG_ANCHORING, + "%s: REPEAT: copying anchor flags from child subtree since we cannot repeat it 0 times\n", __func__); + set_flags(n, n->u.repeat.e->flags & END_ANCHOR_FLAG_MASK); + } + + if (n->u.repeat.min > 1) { + LOG(3 - LOG_ANCHORING, + "%s: REPEAT: anchored, so clamping min repeat to at most once\n", __func__); + n->u.repeat.min = 1; + } + if (n->u.repeat.max > 1) { + LOG(3 - LOG_ANCHORING, + "%s: REPEAT: anchored, so clamping max repeat to at most once\n", __func__); + n->u.repeat.max = 1; + } } break; @@ -943,27 +1293,22 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) /* This flows anchoring flags upward even when the node * is unsatisfiable, because that info can impact how * the node's unsatisfiability is handled. */ -#define PROPAGATE_CHILD_FLAGS(TAG, N, CHILD) \ - do { \ - struct ast_expr *child = CHILD; \ - if (child->flags & AST_FLAG_ANCHORED_START) { \ - set_flags(N, AST_FLAG_ANCHORED_START); \ - } \ - if (child->flags & AST_FLAG_ANCHORED_END) { \ - set_flags(N, AST_FLAG_ANCHORED_END); \ - } \ - if (res == AST_ANALYSIS_UNSATISFIABLE) { \ - LOG(3 - LOG_ANCHORING, \ - "%s: %s: setting UNSATISFIABLE due to unsatisfiable child\n", \ - __func__, TAG); \ - set_flags(N, AST_FLAG_UNSATISFIABLE); \ - } \ - if (res != AST_ANALYSIS_OK) { \ - return res; \ - } \ - } while(0) - - PROPAGATE_CHILD_FLAGS("GROUP", n, n->u.group.e); + struct ast_expr *child = n->u.group.e; + if (child->flags & AST_FLAG_ANCHORED_START) { + set_flags(n, AST_FLAG_ANCHORED_START); + } + if (child->flags & AST_FLAG_ANCHORED_END) { + set_flags(n, AST_FLAG_ANCHORED_END); + } + if (res == AST_ANALYSIS_UNSATISFIABLE) { + LOG(3 - LOG_ANCHORING, + "%s: GROUP: setting UNSATISFIABLE due to unsatisfiable child\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + } + if (res != AST_ANALYSIS_OK) { 
+ return res; + } break; case AST_EXPR_SUBTRACT: @@ -974,6 +1319,15 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) if (n->u.subtract.a->flags & AST_FLAG_ANCHORED_END) { set_flags(n, n->u.subtract.a->flags & END_ANCHOR_FLAG_MASK); } + + if (env->newline_after_end_anchor_state == NAEAS_WOULD_MATCH_ONCE) { + LOG(3 - LOG_ANCHORING, + "%s: SUBTRACT: rejecting non-optional newline match after $ as unsupported\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } + if (res != AST_ANALYSIS_OK) { if (res == AST_ANALYSIS_UNSATISFIABLE) { set_flags(n, AST_FLAG_UNSATISFIABLE); @@ -1017,9 +1371,9 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) enum ast_analysis_res res; /* - * Second pass, in reverse, specifically checking for end-anchored - * subtrees that are unsatisfiable because they are followed by - * nodes that always consume input. + * Second anchoring pass, in reverse, specifically checking for + * end-anchored subtrees that are unsatisfiable because they are + * followed by nodes that always consume input. * * Also check for subtrees that always consume input appearing * before a start anchor and mark them as unsatisfiable. @@ -1031,6 +1385,14 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) switch (n->type) { case AST_EXPR_EMPTY: + if (env->before_start_anchor) { + /* Needed for cases like '()*^' matching "". 
 */ + LOG(3 - LOG_ANCHORING, "%s: skipping EMPTY before ^\n", __func__); + + set_flags(n, AST_FLAG_ANCHORED_START); + } + break; + case AST_EXPR_TOMBSTONE: break; @@ -1044,10 +1406,19 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) break; case AST_ANCHOR_END: - /* should already be set during forward pass */ - assert(n->flags & AST_FLAG_ANCHORED_END); + /* Normally this will have been set during the forward pass, + * but if it's inside an unsatisfiable CONCAT node whose earlier + * children caused it to be rejected, forward analysis will not + * have reached it. */ + set_flags(n, AST_FLAG_ANCHORED_END); - if (env->followed_by_consuming) { + if (env->followed_by_consuming_newline) { + LOG(3 - LOG_ANCHORING, + "%s: RANGE: rejecting possible newline match after $ as unsupported\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } else if (env->followed_by_consuming) { LOG(3 - LOG_ANCHORING, "%s: END anchor & followed_by_consuming, setting UNSATISFIABLE\n", @@ -1113,6 +1484,7 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) set_flags(n, AST_FLAG_UNSATISFIABLE); } } else if (res != AST_ANALYSIS_OK) { + set_flags(n, AST_FLAG_UNSATISFIABLE); return res; } @@ -1135,6 +1507,13 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) __func__, (void *)child); env->before_start_anchor = 1; } + + if (!env->followed_by_consuming_newline && child_env.followed_by_consuming_newline) { + LOG(3 - LOG_ANCHORING, + "%s: setting followed_by_consuming_newline due to child %p's analysis\n", + __func__, (void *)child); + env->followed_by_consuming_newline = 1; + } } break; @@ -1143,6 +1522,7 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) case AST_EXPR_ALT: { int any_sat = 0; int all_set_followed_by_consuming = 1; + int any_set_followed_by_consuming_newline = 0; int all_set_before_start_anchor = 1; 
assert(n->u.alt.count > 0); @@ -1168,7 +1548,14 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) } else if (res == AST_ANALYSIS_OK) { all_set_followed_by_consuming &= child_env.followed_by_consuming; all_set_before_start_anchor &= child_env.before_start_anchor; + + any_set_followed_by_consuming_newline |= child_env.followed_by_consuming_newline; + any_sat = 1; + } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE + || res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + assert(child->flags & AST_FLAG_UNSATISFIABLE); + continue; } else { return res; } @@ -1181,6 +1568,13 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) env->followed_by_consuming = 1; } + if (!env->followed_by_consuming_newline && any_set_followed_by_consuming_newline) { + LOG(3 - LOG_ANCHORING, + "%s: ALT: any_set_followed_by_consuming_newline -> setting env->followed_by_consuming_newline for feature PCRE rejection\n", + __func__); + env->followed_by_consuming_newline = 1; + } + if (!env->before_start_anchor && all_set_before_start_anchor) { LOG(3 - LOG_ANCHORING, "%s: ALT: all_set_before_start_anchor -> setting env->before_start_anchor\n", @@ -1205,6 +1599,8 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) if (n->u.repeat.min == 0) { LOG(3 - LOG_ANCHORING, "%s: REPEAT: UNSATISFIABLE but can be repeated 0 times, ignoring\n", __func__); + /* Set the REPEAT node to repeat 0 times (to + * prune it) rather than marking it as unsatisfiable. 
*/ n->u.repeat.max = 0; /* skip */ break; } else if (n->u.repeat.min > 0) { @@ -1269,6 +1665,10 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) return AST_ANALYSIS_UNSATISFIABLE; } + if (n->flags & AST_FLAG_CAN_CONSUME && matches_newline(n)) { + env->followed_by_consuming_newline = 1; + } + return AST_ANALYSIS_OK; } @@ -1333,7 +1733,7 @@ assign_firsts(struct ast_expr *n) * subexpression is compiled, the links to the global self-loop * are created, which the REPEAT node then copies. * - * It probably makes sense to not go further + * FIXME: needs tests. It probably makes sense to not go further * here because the top layer of the repeated section * should only link to the global start once. */ @@ -1416,7 +1816,7 @@ assign_lasts(struct ast_expr *n) * subexpression is compiled, the links to the global self-loop * are created, which the REPEAT node then copies. * - * It probably makes sense to not go further + * FIXME: needs tests. It probably makes sense to not go further * here because the top layer of the repeated section * should only link to the global start once. 
*/ @@ -1437,6 +1837,112 @@ assign_lasts(struct ast_expr *n) } } +static enum ast_analysis_res +analysis_iter_captures(struct capture_env *env, struct ast_expr *n) +{ + enum ast_analysis_res res; + + switch (n->type) { + case AST_EXPR_EMPTY: + case AST_EXPR_TOMBSTONE: + break; + + case AST_EXPR_ANCHOR: + if (env->use_captures && n->u.anchor.type == AST_ANCHOR_END && !n->u.anchor.is_end_nl) { + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } + break; + + case AST_EXPR_LITERAL: + case AST_EXPR_CODEPOINT: + case AST_EXPR_RANGE: + break; + + case AST_EXPR_CONCAT: { + size_t i; + + for (i = 0; i < n->u.concat.count; i++) { + res = analysis_iter_captures(env, n->u.concat.n[i]); + if (res != AST_ANALYSIS_OK) { + return res; + } + } + + break; + } + + case AST_EXPR_ALT: { + size_t i; + + for (i = 0; i < n->u.alt.count; i++) { + res = analysis_iter_captures(env, n->u.alt.n[i]); + if (res != AST_ANALYSIS_OK) { + return res; + } + } + + break; + } + + case AST_EXPR_REPEAT: { + res = analysis_iter_captures(env, n->u.repeat.e); + if (res != AST_ANALYSIS_OK) { + return res; + } + +/* Set this to 1 when running the fuzzer, so that it ignores + * uninteresting failures from regexes like '.{1000000}' that use + * repetition to hit memory limits. + * + * This should be set by the build system when building for fuzzing. 
*/ +#ifndef FUZZING_LIMITS +#define FUZZING_LIMITS 0 +#endif + +#if FUZZING_LIMITS + if ((n->u.repeat.max != AST_COUNT_UNBOUNDED && n->u.repeat.max >= 10) + || (n->u.repeat.min >= 10)) { + fprintf(stderr, "%s: rejecting regex with {count} >= 10 (%u)\n", + __func__, n->u.repeat.max); + return AST_ANALYSIS_ERROR_MEMORY; + } +#endif + + break; + } + + case AST_EXPR_GROUP: + if (env->max_capture_id == AST_NO_MAX_CAPTURE_ID + || n->u.group.id > env->max_capture_id) { + env->max_capture_id = n->u.group.id; + } + + res = analysis_iter_captures(env, n->u.group.e); + if (res != AST_ANALYSIS_OK) { + return res; + } + break; + + case AST_EXPR_SUBTRACT: + res = analysis_iter_captures(env, n->u.subtract.a); + if (res != AST_ANALYSIS_OK) { + return res; + } + + res = analysis_iter_captures(env, n->u.subtract.b); + if (res != AST_ANALYSIS_OK) { + return res; + } + break; + + default: + assert(!"unreached"); + } + + return AST_ANALYSIS_OK; +} + enum ast_analysis_res ast_analysis(struct ast *ast, enum re_flags flags) { @@ -1449,14 +1955,26 @@ ast_analysis(struct ast *ast, enum re_flags flags) assert(ast->expr != NULL); /* - * First pass -- track nullability, clean up some artifacts from - * parsing. + * First pass -- track nullability, which subtrees can/always consume + * input, and clean up some artifacts from parsing. */ res = analysis_iter(ast->expr); if (res != AST_ANALYSIS_OK) { return res; } + /* Next pass, check for capture IDs. */ + { + struct capture_env env = { .max_capture_id = AST_NO_MAX_CAPTURE_ID }; + env.use_captures = !(flags & RE_NOCAPTURE); + + res = analysis_iter_captures(&env, ast->expr); + if (res != AST_ANALYSIS_OK) { + return res; + } + ast->max_capture_id = env.max_capture_id; + } + /* * Next pass: set anchoring, now that nullability info from * the first pass is in place and some other things have been @@ -1464,8 +1982,10 @@ ast_analysis(struct ast *ast, enum re_flags flags) * start anchors. 
*/ { - /* first anchoring analysis pass, sweeping forward */ - struct anchoring_env env = { .re_flags = flags }; + struct anchoring_env env = { + .re_flags = flags, + .newline_after_end_anchor_state = NAEAS_NONE, + }; res = analysis_iter_anchoring(&env, ast->expr); if (res != AST_ANALYSIS_OK) { return res; } @@ -1473,20 +1993,30 @@ ast_analysis(struct ast *ast, enum re_flags flags) res = analysis_iter_reverse_anchoring(&env, ast->expr); if (res != AST_ANALYSIS_OK) { return res; } - /* - * Next passes, mark all nodes in a first and/or last - * position. This is informed by the anchoring flags, so - * that needs to happen first. - */ - assign_firsts(ast->expr); - assign_lasts(ast->expr); - - ast->has_unanchored_start = (analysis_iter_unanchored_start(ast->expr) != UA_NO); - ast->has_unanchored_end = (analysis_iter_unanchored_end(ast->expr) != UA_NO); - LOG(2 - LOG_UNANCHORED_FLAGS, - "%s: has_unanchored_start %d, has_unanchored_end %d\n", - __func__, ast->has_unanchored_start, ast->has_unanchored_end); } + /* Next pass, mark some cases that need special handling + * due to repetition. For example, with cases like + * ^((x?))*$ the inner capture will always need to repeat + * one more time to match () after any 'x's. + * + * This needs to happen after the anchoring passes. */ + res = analysis_iter_repetition(ast->expr, NULL, 0, NULL); + if (res != AST_ANALYSIS_OK) { return res; } + + /* + * Next passes, mark all nodes in a first and/or last + * position. This is informed by the anchoring flags, so + * that needs to happen first. 
+ */ + assign_firsts(ast->expr); + assign_lasts(ast->expr); + + ast->has_unanchored_start = (analysis_iter_unanchored_start(ast->expr) != ANCHOR_ANALYSIS_ANCHORED); + ast->has_unanchored_end = (analysis_iter_unanchored_end(ast->expr) != ANCHOR_ANALYSIS_ANCHORED); + LOG(2 - LOG_UNANCHORED_FLAGS, + "%s: has_unanchored_start %d, has_unanchored_end %d\n", + __func__, ast->has_unanchored_start, ast->has_unanchored_end); + return res; } diff --git a/src/libre/ast_analysis.h b/src/libre/ast_analysis.h index f9ec8ebbb..5390cce57 100644 --- a/src/libre/ast_analysis.h +++ b/src/libre/ast_analysis.h @@ -30,7 +30,9 @@ enum ast_analysis_res { AST_ANALYSIS_UNSATISFIABLE, AST_ANALYSIS_ERROR_NULL = -1, - AST_ANALYSIS_ERROR_MEMORY = -2 + AST_ANALYSIS_ERROR_MEMORY = -2, + AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE = -3, + AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE = -4 }; enum ast_analysis_res diff --git a/src/libre/ast_compile.c b/src/libre/ast_compile.c index 502faf8b4..b376aa144 100644 --- a/src/libre/ast_compile.c +++ b/src/libre/ast_compile.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,18 +16,25 @@ #include #include +#include #include #include #include +#include +#include + #include "class.h" #include "ast.h" #include "ast_compile.h" +#include "re_capvm_compile.h" +#include "libfsm/capture.h" #include "libfsm/internal.h" /* XXX */ #define LOG_LINKAGE 0 +#define LOG_TRAMPOLINE 0 #if LOG_LINKAGE #include "print.h" @@ -44,12 +52,12 @@ enum link_side { * Use the passed in start/end states (x and y) * * - LINK_GLOBAL - * Link to the global start/end state (env->start or env->end), + * Link to the global start/end state (env->start_inner or env->end_inner), * because this node has a ^ or $ anchor * * - LINK_GLOBAL_SELF_LOOP * Link to the unanchored self loop adjacent to the start/end - * state (env->start_any_loop or env->end_any_loop), because + * states (env->start_any_inner or env->end_any_inner), because * this node is in a FIRST or LAST position, 
but unanchored. */ enum link_types { @@ -58,7 +66,12 @@ enum link_types { LINK_GLOBAL_SELF_LOOP, }; +/* Call stack for AST -> NFA conversion. */ +#define DEF_COMP_STACK_CEIL 4 +#define NO_MAX_CAPTURE_IDS ((unsigned)-1) + struct comp_env { + const struct fsm_alloc *alloc; struct fsm *fsm; enum re_flags re_flags; struct re_err *err; @@ -72,21 +85,90 @@ struct comp_env { * Also, some states in a first/last context need to link * directly to the overall start/end states, either in * place of or along with the adjacent states. + * + * The inner start and end states are considered inside of + * match group 0, outer are not. */ - fsm_state_t start; - fsm_state_t end; + fsm_state_t start_outer; + fsm_state_t start_inner; fsm_state_t start_any_loop; + fsm_state_t start_any_inner; + int have_start_any_loop; + + /* States leading to the end, with and without an unanchored + * `.*` loop that consumes any trailing characters. */ + fsm_state_t end_inner; + fsm_state_t end_outer; + fsm_state_t end_nl_inner; fsm_state_t end_any_loop; - fsm_state_t end_nl; - int has_start_any_loop; + fsm_state_t end_any_inner; int has_end_any_loop; - int has_end_nl; + int has_end_nl_inner; + + /* bitset for active capture IDs */ + uint64_t *active_capture_ids; + long max_capture_id; /* upper bound */ + + /* Evaluation stack */ + struct comp_stack { + size_t ceil; + size_t depth; /* 0 -> empty */ + + struct comp_stack_frame { + struct ast_expr *n; + fsm_state_t x; + fsm_state_t y; + unsigned step; + + union { + struct { + fsm_state_t link; + } concat; + struct { + unsigned count; + } alt; + struct { + struct fsm_subgraph subgraph; + fsm_state_t na; + fsm_state_t nz; + } repeat; + } u; + } *frames; + } stack; }; static int -comp_iter(struct comp_env *env, - fsm_state_t x, fsm_state_t y, - struct ast_expr *n, const struct ast_expr *parent); +comp_iter(struct comp_env *env, fsm_state_t x, const struct ast *ast); + +static int +eval_stack_frame(struct comp_env *env); + +static int +eval_EMPTY(struct 
comp_env *env); +static int +eval_CONCAT(struct comp_env *env); +static int +eval_ALT(struct comp_env *env); +static int +eval_LITERAL(struct comp_env *env); +static int +eval_CODEPOINT(struct comp_env *env); +static int +eval_REPEAT(struct comp_env *env); +static int +eval_GROUP(struct comp_env *env); +static int +eval_ANCHOR(struct comp_env *env); +static int +eval_SUBTRACT(struct comp_env *env); +static int +eval_RANGE(struct comp_env *env); +static int +eval_TOMBSTONE(struct comp_env *env); + +static int +compile_capvm_program_for_stack_end_states(struct comp_env *env, + const struct ast *ast, uint32_t *prog_id); static int utf8(uint32_t cp, char c[]) @@ -193,6 +275,9 @@ expr_compile(struct ast_expr *e, enum re_flags flags, struct ast ast; ast.expr = e; + ast.max_capture_id = 0; + ast.has_unanchored_start = 0; + ast.has_unanchored_end = 0; return ast_compile(&ast, flags, opt, err); } @@ -208,11 +293,11 @@ addedge_literal(struct comp_env *env, enum re_flags re_flags, assert(to < env->fsm->statecount); if (re_flags & RE_ICASE) { - if (!fsm_addedge_literal(fsm, from, to, tolower((unsigned char) c))) { + if (!fsm_addedge_literal(fsm, from, to, (char)tolower((unsigned char) c))) { return 0; } - - if (!fsm_addedge_literal(fsm, from, to, toupper((unsigned char) c))) { + + if (!fsm_addedge_literal(fsm, from, to, (char)toupper((unsigned char) c))) { return 0; } } else { @@ -220,38 +305,50 @@ addedge_literal(struct comp_env *env, enum re_flags re_flags, return 0; } } - + return 1; } static int intern_start_any_loop(struct comp_env *env) { - fsm_state_t loop; + fsm_state_t loop, inner; assert(env != NULL); - if (env->has_start_any_loop) { + if (env->have_start_any_loop) { return 1; } assert(~env->re_flags & RE_ANCHORED); - assert(env->start < env->fsm->statecount); + assert(env->start_outer < env->fsm->statecount); if (!fsm_addstate(env->fsm, &loop)) { return 0; } + if (!fsm_addstate(env->fsm, &inner)) { + return 0; + } + +#if LOG_LINKAGE + fprintf(stderr, "%s: start_any: 
loop %d, inner: %d\n", __func__, loop, inner); +#endif + if (!fsm_addedge_any(env->fsm, loop, loop)) { return 0; } - if (!fsm_addedge_epsilon(env->fsm, env->start, loop)) { + if (!fsm_addedge_epsilon(env->fsm, env->start_outer, loop)) { + return 0; + } + if (!fsm_addedge_epsilon(env->fsm, loop, inner)) { return 0; } env->start_any_loop = loop; - env->has_start_any_loop = 1; + env->start_any_inner = inner; + env->have_start_any_loop = 1; return 1; } @@ -259,7 +356,7 @@ intern_start_any_loop(struct comp_env *env) static int intern_end_any_loop(struct comp_env *env) { - fsm_state_t loop; + fsm_state_t loop, inner; assert(env != NULL); @@ -268,21 +365,32 @@ intern_end_any_loop(struct comp_env *env) } assert(~env->re_flags & RE_ANCHORED); - assert(env->end < env->fsm->statecount); + assert(env->end_outer < env->fsm->statecount); if (!fsm_addstate(env->fsm, &loop)) { return 0; } + if (!fsm_addstate(env->fsm, &inner)) { + return 0; + } + +#if LOG_LINKAGE + fprintf(stderr, "%s: end_any: %d, inner: %d\n", __func__, loop, inner); +#endif if (!fsm_addedge_any(env->fsm, loop, loop)) { return 0; } - if (!fsm_addedge_epsilon(env->fsm, loop, env->end)) { + if (!fsm_addedge_epsilon(env->fsm, inner, loop)) { + return 0; + } + if (!fsm_addedge_epsilon(env->fsm, loop, env->end_outer)) { return 0; } env->end_any_loop = loop; + env->end_any_inner = inner; env->has_end_any_loop = 1; return 1; @@ -291,37 +399,39 @@ intern_end_any_loop(struct comp_env *env) static int intern_end_nl(struct comp_env *env) { - /* PCRE's end anchor $ matches a single optional newline. + /* PCRE's end anchor $ matches a single optional newline, + * which should exist outside of match group 0. * - * Intern states for a `\n?` that links to the global end. 
*/ + * Intern states for a `\n?` that links to */ assert(env != NULL); - if (env->has_end_nl) { + if (env->has_end_nl_inner) { return 1; } assert(~env->re_flags & RE_ANCHORED); assert(env->re_flags & RE_END_NL); - assert(env->end < env->fsm->statecount); + assert(~env->re_flags & RE_END_NL_DISABLE); + assert(env->end_outer < env->fsm->statecount); - fsm_state_t end_nl; - if (!fsm_addstate(env->fsm, &end_nl)) { + fsm_state_t inner; + if (!fsm_addstate(env->fsm, &inner)) { return 0; } #if LOG_LINKAGE - fprintf(stderr, "%s: end_nl: %d\n", __func__, end_nl); + fprintf(stderr, "%s: end_nl_inner: %d\n", __func__, inner); #endif - if (!fsm_addedge_epsilon(env->fsm, end_nl, env->end)) { + if (!fsm_addedge_epsilon(env->fsm, inner, env->end_outer)) { return 0; } - if (!fsm_addedge_literal(env->fsm, end_nl, env->end, (char)'\n')) { + if (!fsm_addedge_literal(env->fsm, inner, env->end_outer, (char)'\n')) { return 0; } - env->end_nl = end_nl; - env->has_end_nl = 1; + env->end_nl_inner = inner; + env->has_end_nl_inner = 1; return 1; } @@ -338,8 +448,8 @@ can_have_backward_epsilon_edge(const struct ast_expr *e) return 0; case AST_EXPR_SUBTRACT: - /* XXX: not sure */ - return 1; + /* Single character class */ + return 0; case AST_EXPR_REPEAT: /* 0 and 1 don't have backward epsilon edges */ @@ -397,21 +507,32 @@ can_skip_concat_state_and_epsilon(const struct ast_expr *l, return 0; } +static const struct ast_expr * +get_parent_node_from_stack(const struct comp_stack *stack) +{ + if (stack->depth < 2) { return NULL; } + return stack->frames[stack->depth - 2].n; +} + static enum link_types -decide_linking(struct comp_env *env, - struct ast_expr *n, const struct ast_expr *parent, enum link_side side) +decide_linking(struct comp_env *env, fsm_state_t x, fsm_state_t y, + struct ast_expr *n, enum link_side side) { assert(n != NULL); assert(env != NULL); + (void)x; + (void)y; + + struct comp_stack *stack = &env->stack; + /* If the regex is implicitly anchored and the dialect does * not 
support anchoring, linking is always top-down. */ if ((env->re_flags & RE_ANCHORED)) { return LINK_TOP_DOWN; } - /* parent can be NULL, if we're at the root node, but it must - * never be the same node. */ + const struct ast_expr *parent = get_parent_node_from_stack(stack); assert(parent != n); /* Note: any asymmetry here should be due to special cases @@ -479,183 +600,71 @@ print_linkage(enum link_types t) #define EPSILON(FROM, TO) \ assert((FROM) != (TO)); \ if (!fsm_addedge_epsilon(env->fsm, (FROM), (TO))) { return 0; } - + #define ANY(FROM, TO) \ if (!fsm_addedge_any(env->fsm, (FROM), (TO))) { return 0; } #define LITERAL(FROM, TO, C) \ if (!addedge_literal(env, n->re_flags, (FROM), (TO), ((char)C))) { return 0; } -#define RECURSE(FROM, TO, NODE, PARENT) \ - if (!comp_iter(env, (FROM), (TO), (NODE), (PARENT))) { return 0; } - -static int -comp_iter_repeated(struct comp_env *env, - fsm_state_t x, fsm_state_t y, - struct ast_expr *n) -{ - fsm_state_t a, b; - fsm_state_t na, nz; - unsigned i; - - assert(n->type == AST_EXPR_REPEAT); - const unsigned min = n->u.repeat.min; - const unsigned max = n->u.repeat.max; - struct ast_expr *e = n->u.repeat.e; - - assert(min <= max); - - if (min == 0 && max == 0) { /* {0,0} */ - EPSILON(x, y); - } else if (min == 0 && max == 1) { /* '?' */ - RECURSE(x, y, e, n); - EPSILON(x, y); - } else if (min == 1 && max == 1) { /* {1,1} */ - RECURSE(x, y, e, n); - } else if (min == 0 && max == AST_COUNT_UNBOUNDED) { /* '*' */ - NEWSTATE(na); - NEWSTATE(nz); - EPSILON(x,na); - EPSILON(nz,y); - - EPSILON(na, nz); - RECURSE(na, nz, e, n); - EPSILON(nz, na); - } else if (min == 1 && max == AST_COUNT_UNBOUNDED) { /* '+' */ - NEWSTATE(na); - NEWSTATE(nz); - EPSILON(x,na); - EPSILON(nz,y); - - RECURSE(na, nz, e, n); - EPSILON(nz, na); - } else { - /* - * Make new beginning/end states for the repeated section, - * build its NFA, and link to its head. 
- */ - - struct fsm_subgraph subgraph; - fsm_state_t tail; - - fsm_subgraph_start(env->fsm, &subgraph); - - NEWSTATE(na); - NEWSTATE(nz); - RECURSE(na, nz, e, n); - EPSILON(x, na); /* link head to repeated NFA head */ - - b = nz; /* set the initial tail */ - - /* can be skipped */ - if (min == 0) { - EPSILON(na, nz); - } - fsm_subgraph_stop(env->fsm, &subgraph); - tail = nz; - - if (max != AST_COUNT_UNBOUNDED) { - for (i = 1; i < max; i++) { - /* copies the original subgraph; need to set b to the - * original tail - */ - b = tail; +#define RETURN(ENV) comp_stack_pop(ENV) - if (!fsm_subgraph_duplicate(env->fsm, &subgraph, &b, &a)) { - return 0; - } +#define RECURSE(ENV, FROM, TO, NODE) \ + if (!comp_stack_push(ENV, (FROM), (TO), (NODE))) { return 0; } - EPSILON(nz, a); - - /* To the optional part of the repeated count */ - if (i >= min) { - EPSILON(nz, b); - } - - na = a; /* advance head for next duplication */ - nz = b; /* advance tail for concenation */ - } - } else { - for (i = 1; i < min; i++) { - /* copies the original subgraph; need to set b to the - * original tail - */ - b = tail; - - if (!fsm_subgraph_duplicate(env->fsm, &subgraph, &b, &a)) { - return 0; - } - - EPSILON(nz, a); - - na = a; /* advance head for next duplication */ - nz = b; /* advance tail for concenation */ - } - - /* back link to allow for infinite repetition */ - EPSILON(nz,na); - } - - /* tail to last repeated NFA tail */ - EPSILON(nz, y); - } - - return 1; -} +#define TAILCALL(ENV, FROM, TO, NODE) \ + comp_stack_tailcall(ENV, (FROM), (TO), (NODE)); static int -comp_iter(struct comp_env *env, - fsm_state_t x, fsm_state_t y, - struct ast_expr *n, const struct ast_expr *parent) +set_linking(struct comp_env *env, struct ast_expr *n, + enum link_types link_start, enum link_types link_end, + fsm_state_t *px, fsm_state_t *py) { - enum link_types link_start, link_end; - - if (n == NULL) { - return 1; - } - - link_start = decide_linking(env, n, parent, LINK_START); - link_end = 
decide_linking(env, n, parent, LINK_END); + fsm_state_t x = *px; + fsm_state_t y = *py; #if LOG_LINKAGE - fprintf(stderr, "%s: decide_linking %p: start ", __func__, (void *) n); + fprintf(stderr, "%s: decide_linking %p [%s]: start ", + __func__, (void *) n, ast_node_type_name(n->type)); print_linkage(link_start); fprintf(stderr, ", end "); print_linkage(link_end); fprintf(stderr, ", x %d, y %d\n", x, y); #else (void) print_linkage; + (void)n; #endif switch (link_start) { case LINK_TOP_DOWN: break; case LINK_GLOBAL: - x = env->start; + x = env->start_inner; break; case LINK_GLOBAL_SELF_LOOP: if (!intern_start_any_loop(env)) { return 0; } - assert(env->has_start_any_loop); + assert(env->have_start_any_loop); - x = env->start_any_loop; + x = env->start_any_inner; break; default: - assert(!"unreachable"); + assert(!"match fail"); /* these should be mutually exclusive now */ } switch (link_end) { case LINK_TOP_DOWN: break; case LINK_GLOBAL: - if (env->re_flags & RE_END_NL && (n->flags & AST_FLAG_END_NL)) { + if (env->re_flags & RE_END_NL && !(env->re_flags & RE_END_NL_DISABLE) + && (n->flags & AST_FLAG_END_NL)) { if (!intern_end_nl(env)) { return 0; } - y = env->end_nl; + y = env->end_nl_inner; } else { - y = env->end; + y = env->end_inner; } break; case LINK_GLOBAL_SELF_LOOP: @@ -664,243 +673,784 @@ comp_iter(struct comp_env *env, } assert(env->has_end_any_loop); - y = env->end_any_loop; + y = env->end_any_inner; break; default: - assert(!"unreachable"); + assert(!"match fail"); /* these should be mutually exclusive now */ } #if LOG_LINKAGE fprintf(stderr, " ---> x: %d, y: %d\n", x, y); #endif + *px = x; + *py = y; + return 1; +} - switch (n->type) { - case AST_EXPR_EMPTY: - /* skip these, when possible */ - EPSILON(x, y); - break; +static void +comp_stack_pop(struct comp_env *env) +{ + assert(env->stack.depth > 0); + env->stack.depth--; +} - case AST_EXPR_CONCAT: - { - fsm_state_t curr_x; - size_t i; +static int +comp_stack_push(struct comp_env *env, fsm_state_t x, 
fsm_state_t y, struct ast_expr *n) +{ + struct comp_stack *stack = &env->stack; + assert(n != NULL); - const size_t count = n->u.concat.count; + if (stack->depth == stack->ceil) { + const size_t nceil = 2*stack->ceil; + struct comp_stack_frame *nframes = f_realloc(env->alloc, + stack->frames, nceil * sizeof(stack->frames[0])); +#if LOG_LINKAGE || LOG_TRAMPOLINE + fprintf(stderr, "comp_stack_push: reallocating comp_stack, %zu -> %zu frames\n", + stack->ceil, nceil); +#endif + if (nframes == NULL) { + return 0; + } + stack->ceil = nceil; + stack->frames = nframes; + } - curr_x = x; + assert(stack->depth < stack->ceil); - assert(count >= 1); + struct comp_stack_frame *sf = &stack->frames[stack->depth]; + memset(sf, 0x00, sizeof(*sf)); + sf->n = n; + sf->x = x; + sf->y = y; - if (!fsm_addstate_bulk(env->fsm, count - 1)) { - return 0; - } + stack->depth++; + return 1; +} - for (i = 0; i < count; i++) { - struct ast_expr *curr = n->u.concat.n[i]; +static void +comp_stack_tailcall(struct comp_env *env, + fsm_state_t x, fsm_state_t y, struct ast_expr *n) +{ + struct comp_stack *stack = &env->stack; - /* If a subtree is unsatisfiable but also nullable, ignore it. */ - const enum ast_flags nullable_and_unsat = AST_FLAG_NULLABLE - | AST_FLAG_UNSATISFIABLE; - if ((curr->flags & nullable_and_unsat) == nullable_and_unsat) { - /* if necessary, link the end */ - if (i == count - 1) { - EPSILON(curr_x, y); - } - continue; - } + assert(stack->depth > 0); - struct ast_expr *next = i == count - 1 - ? NULL - : n->u.concat.n[i + 1]; + /* Replace current stack frame. 
*/ + struct comp_stack_frame *sf = &stack->frames[stack->depth - 1]; + memset(sf, 0x00, sizeof(*sf)); + sf->n = n; + sf->x = x; + sf->y = y; +} - fsm_state_t z; - if (i + 1 < count) { - if (!fsm_addstate(env->fsm, &z)) { - return 0; - } -#if LOG_LINKAGE - fprintf(stderr, "%s: added state z %d\n", __func__, z); -#endif - } else { - z = y; /* connect to right parent to close off subtree */ - } +#define JUST_ONE_PROG 1 - /* - * If nullable, add an extra state & epsilion as a one-way gate - */ - if (!can_skip_concat_state_and_epsilon(curr, next)) { - fsm_state_t diode; +static int +comp_iter(struct comp_env *env, + fsm_state_t x, const struct ast *ast) +{ + int res = 1; + assert(ast != NULL); + assert(ast->expr != NULL); - NEWSTATE(diode); - EPSILON(curr_x, diode); - curr_x = diode; - } + struct comp_stack_frame *frames = NULL; + uint64_t *active_capture_ids = NULL; + const bool use_captures = (env->re_flags & RE_NOCAPTURE) == 0; - RECURSE(curr_x, z, curr, n); + frames = f_calloc(env->alloc, + DEF_COMP_STACK_CEIL, sizeof(env->stack.frames[0])); + if (frames == NULL) { + goto alloc_fail; + } - curr_x = z; + { + const size_t capture_id_words = (env->max_capture_id == AST_NO_MAX_CAPTURE_ID) + ? 1 /* do non-zero allocation to silence EFENCE */ + : ((env->max_capture_id)/64 + 1); + active_capture_ids = f_calloc(env->alloc, capture_id_words, + sizeof(active_capture_ids[0])); + if (active_capture_ids == NULL) { + goto alloc_fail; } + } - break; + /* Add inner and outer end states. Like start_outer and start_inner, + * these represent the boundary between match group 0 (inner) and + * states outside it (the unanchored end loop). 
*/ + if (!fsm_addstate(env->fsm, &env->end_inner)) { + goto alloc_fail; + } + if (!fsm_addstate(env->fsm, &env->end_outer)) { + goto alloc_fail; + } + if (!fsm_addedge_epsilon(env->fsm, env->end_inner, env->end_outer)) { + goto alloc_fail; } - case AST_EXPR_ALT: - { - size_t i; + fsm_setend(env->fsm, env->end_outer, 1); - const size_t count = n->u.alt.count; +#if LOG_LINKAGE + fprintf(stderr, "end: outer %d, inner %d\n", + env->end_outer, env->end_inner); +#endif - assert(count >= 1); +#if LOG_TRAMPOLINE + fprintf(stderr, "%s: x %d, y %d\n", __func__, x, env->end_inner); +#endif - for (i = 0; i < count; i++) { - /* skip unsatisfiable ALT subtrees */ - if (n->u.alt.n[i]->flags & AST_FLAG_UNSATISFIABLE) { + env->stack.ceil = DEF_COMP_STACK_CEIL; + env->stack.depth = 1; + env->stack.frames = frames; + env->active_capture_ids = active_capture_ids; + + { /* set up the first stack frame */ + struct comp_stack_frame *sf = &env->stack.frames[0]; + sf->n = ast->expr; + sf->x = x; + sf->y = env->end_inner; + sf->step = 0; + } + +#if JUST_ONE_PROG + uint32_t prog_id; + if (use_captures) { + if (!compile_capvm_program_for_stack_end_states(env, ast, &prog_id)) { + goto alloc_fail; + } + } +#endif + + /* evaluate call stack until termination */ + while (res && env->stack.depth > 0) { + if (!eval_stack_frame(env)) { +#if LOG_TRAMPOLINE + fprintf(stderr, "%s: res -> 0\n", __func__); +#endif + res = 0; + break; + } + } + + if (use_captures && res && env->max_capture_id != AST_NO_MAX_CAPTURE_ID) { + /* Set the active captures on the end state. */ + for (unsigned i = 0; i <= (unsigned)env->max_capture_id; i++) { + if (!u64bitset_get(env->active_capture_ids, i)) { continue; } + if (!fsm_capture_set_active_for_end(env->fsm, i, env->end_outer)) { + goto alloc_fail; + } + } - /* - * CONCAT handles adding extra states and - * epsilons when necessary, so there isn't much - * more to do here. 
- */ - RECURSE(x, y, n->u.alt.n[i], n); +#if !JUST_ONE_PROG + uint32_t prog_id; + if (!compile_capvm_program_for_stack_end_states(env, stack, ast, &prog_id)) { + goto alloc_fail; + } +#endif + +#if LOG_TRAMPOLINE + fprintf(stderr, "%s: associated prog_id %u with state %d\n", + __func__, prog_id, stack->end_outer); +#endif + if (!fsm_capture_associate_program_with_end_state(env->fsm, + prog_id, env->end_outer)) { + goto alloc_fail; } - break; } - case AST_EXPR_LITERAL: - LITERAL(x, y, n->u.literal.c); - break; + f_free(env->alloc, env->stack.frames); + f_free(env->alloc, env->active_capture_ids); - case AST_EXPR_CODEPOINT: { - fsm_state_t a, b; - char c[4]; - int r, i; + return res; - r = utf8(n->u.codepoint.u, c); - if (!r) { - if (env->err != NULL) { - env->err->e = RE_EBADCP; - env->err->cp = n->u.codepoint.u; - } +alloc_fail: + /* TODO: set env->err to indicate alloc failure */ + if (frames != NULL) { + f_free(env->alloc, frames); + } + if (active_capture_ids != NULL) { + f_free(env->alloc, active_capture_ids); + } + return 0; +} + +static struct comp_stack_frame * +get_comp_stack_top(struct comp_env *env) +{ + struct comp_stack *stack = &env->stack; + assert(stack->depth > 0); + struct comp_stack_frame *sf = &stack->frames[stack->depth - 1]; + assert(sf->n != NULL); + return sf; +} + +static int +eval_stack_frame(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + +#if LOG_TRAMPOLINE + fprintf(stderr, "%s: depth %zu/%zu, type %s, step %u\n", __func__, + stack->depth, stack->ceil, ast_node_type_name(sf->n->type), sf->step); +#endif + + /* If this is the first time the trampoline has called this + * state, decide the linking. Some of the states below (such as + * AST_EXPR_CONCAT) can have multiple child nodes, so they will + * increment step and use it to resume where they left off as + * the trampoline returns execution to them. 
*/ + enum link_types link_end; + if (sf->step == 0) { /* entering state */ + enum link_types link_start; + link_start = decide_linking(env, sf->x, sf->y, sf->n, LINK_START); + link_end = decide_linking(env, sf->x, sf->y, sf->n, LINK_END); + if (!set_linking(env, sf->n, link_start, link_end, &sf->x, &sf->y)) { return 0; } + } + +#if LOG_TRAMPOLINE > 1 + fprintf(stderr, "%s: x %d, y %d\n", __func__, sf->x, sf->y); +#endif + + switch (sf->n->type) { + case AST_EXPR_EMPTY: + return eval_EMPTY(env); + case AST_EXPR_CONCAT: + return eval_CONCAT(env); + case AST_EXPR_ALT: + return eval_ALT(env); + case AST_EXPR_LITERAL: + return eval_LITERAL(env); + case AST_EXPR_CODEPOINT: + return eval_CODEPOINT(env); + case AST_EXPR_REPEAT: + return eval_REPEAT(env); + case AST_EXPR_GROUP: + return eval_GROUP(env); + case AST_EXPR_ANCHOR: + return eval_ANCHOR(env); + case AST_EXPR_SUBTRACT: + return eval_SUBTRACT(env); + case AST_EXPR_RANGE: + return eval_RANGE(env); + case AST_EXPR_TOMBSTONE: + return eval_TOMBSTONE(env); + default: + assert(!"unreached"); + return 0; + } +} - a = x; +static int +eval_EMPTY(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); +#if LOG_LINKAGE + fprintf(stderr, "eval_EMPTY: step %u, x %d -> y %d\n", + sf->step, sf->x, sf->y); +#endif - for (i = 0; i < r; i++) { - if (i + 1 < r) { - NEWSTATE(b); - } else { - b = y; + EPSILON(sf->x, sf->y); + RETURN(env); + return 1; +} + +static int +eval_CONCAT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + struct ast_expr *n = sf->n; + const size_t count = n->u.concat.count; + assert(count >= 1); + +#if LOG_LINKAGE + fprintf(stderr, "comp_iter: eval_CONCAT: x %d, y %d, step %d\n", + sf->x, sf->y, sf->step); +#endif + + if (sf->step == 0) { + sf->u.concat.link = sf->x; + } + + while (sf->step < count) { + fsm_state_t curr_x = sf->u.concat.link; + struct ast_expr *curr = n->u.concat.n[sf->step]; + + /* If a subtree is unsatisfiable but also nullable, 
ignore it. */ + const enum ast_flags nullable_and_unsat = AST_FLAG_NULLABLE + | AST_FLAG_UNSATISFIABLE; + if ((curr->flags & nullable_and_unsat) == nullable_and_unsat) { + sf->step++; + + /* if necessary, link the end */ + if (sf->step == count) { + EPSILON(curr_x, sf->y); } + return 1; + } - LITERAL(a, b, c[i]); + struct ast_expr *next = sf->step == count - 1 + ? NULL + : n->u.concat.n[sf->step + 1]; - a = b; + fsm_state_t z; + if (sf->step + 1 < count) { + if (!fsm_addstate(env->fsm, &z)) { + return 0; + } +#if LOG_LINKAGE + fprintf(stderr, "%s: added state z %d\n", __func__, z); +#endif + } else { + z = sf->y; /* connect to right parent to close off subtree */ } - break; - } +#if LOG_LINKAGE + fprintf(stderr, "%s: curr_x %d, z %d\n", + __func__, curr_x, z); +#endif - case AST_EXPR_REPEAT: /* - * REPEAT breaks out into its own function, because - * there are several special cases + * If nullable, add an extra state & epsilon as a one-way gate */ - if (!comp_iter_repeated(env, x, y, n)) { - return 0; + if (!can_skip_concat_state_and_epsilon(curr, next)) { + fsm_state_t diode; + + NEWSTATE(diode); + EPSILON(curr_x, diode); + curr_x = diode; +#if LOG_LINKAGE + fprintf(stderr, "comp_iter: added diode %d\n", diode); +#endif } - break; - case AST_EXPR_GROUP: - RECURSE(x, y, n->u.group.e, n); - break; +#if LOG_LINKAGE + fprintf(stderr, "comp_iter: recurse CONCAT[%u/%zu]: link %d, z %d\n", + sf->step, count, sf->u.concat.link, z); +#endif + /* Set the right side link, which will become the + * left side link for the next step (if any). 
*/ + sf->u.concat.link = z; + sf->step++; + RECURSE(env, curr_x, z, curr); + return 1; + } - case AST_EXPR_TOMBSTONE: - /* do not link -- intentionally pruned */ - break; + RETURN(env); + return 1; +} - case AST_EXPR_ANCHOR: - EPSILON(x, y); - break; +static int +eval_ALT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + const size_t count = sf->n->u.alt.count; + assert(count >= 1); - case AST_EXPR_SUBTRACT: { - struct fsm *a, *b; - struct fsm *q; - enum re_flags re_flags; +#if LOG_LINKAGE + fprintf(stderr, "eval_ALT: step %u\n", sf->step); +#endif - re_flags = n->re_flags; + if (sf->step < count) { + struct ast_expr *n; - /* wouldn't want to reverse twice! */ - re_flags &= ~(unsigned)RE_REVERSE; + /* + * CONCAT handles adding extra states and + * epsilons when necessary, so there isn't much + * more to do here. + */ +#if LOG_LINKAGE + fprintf(stderr, "eval_ALT: recurse ALT[%u/%zu]: x %d, y %d\n", + sf->step, count, sf->x, sf->y); +#endif - a = expr_compile(n->u.subtract.a, re_flags, - fsm_getoptions(env->fsm), env->err); - if (a == NULL) { - return 0; + n = sf->n->u.alt.n[sf->step]; + assert(n != NULL); + sf->step++; /* RECURSE can realloc the stack and make sf stale. */ + + if (!(n->flags & AST_FLAG_UNSATISFIABLE)) { + RECURSE(env, sf->x, sf->y, n); } + return 1; + } - b = expr_compile(n->u.subtract.b, re_flags, - fsm_getoptions(env->fsm), env->err); - if (b == NULL) { - fsm_free(a); - return 0; + RETURN(env); + return 1; +} + +static int +eval_LITERAL(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + struct ast_expr *n = sf->n; +#if LOG_LINKAGE + fprintf(stderr, "%s: linking %d -> %d with literal '%c' (0x%02x)\n", + __func__, sf->x, sf->y, isprint(n->u.literal.c) ? 
n->u.literal.c : '.', + n->u.literal.c); +#endif + + LITERAL(sf->x, sf->y, n->u.literal.c); + + RETURN(env); + return 1; +} + +static int +eval_CODEPOINT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + struct ast_expr *n = sf->n; + fsm_state_t a, b; + char c[4]; + int r, i; + + r = utf8(n->u.codepoint.u, c); + if (!r) { + if (env->err != NULL) { + env->err->e = RE_EBADCP; + env->err->cp = n->u.codepoint.u; } - q = fsm_subtract(a, b); - if (q == NULL) { - return 0; + return 0; + } + + a = sf->x; + + for (i = 0; i < r; i++) { + if (i + 1 < r) { + NEWSTATE(b); + } else { + b = sf->y; } + LITERAL(a, b, c[i]); + + a = b; + } + + RETURN(env); + return 1; +} + +static int +eval_REPEAT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + fsm_state_t a, b; + unsigned i, min, max; + + assert(sf->n->type == AST_EXPR_REPEAT); + struct ast_expr_repeat *n = &sf->n->u.repeat; + + min = n->min; + max = n->max; + + assert(min <= max); + + if (min == 0 && max == 0) { /* {0,0} */ + EPSILON(sf->x, sf->y); + RETURN(env); + return 1; + } else if (min == 0 && max == 1) { /* '?' */ + EPSILON(sf->x, sf->y); + TAILCALL(env, sf->x, sf->y, n->e); + return 1; + } else if (min == 1 && max == 1) { /* {1,1} */ + TAILCALL(env, sf->x, sf->y, n->e); + return 1; + } else if (min == 0 && max == AST_COUNT_UNBOUNDED) { /* '*' */ + fsm_state_t na, nz; + NEWSTATE(na); + NEWSTATE(nz); + EPSILON(sf->x,na); + EPSILON(nz,sf->y); + + EPSILON(na, nz); + EPSILON(nz, na); + TAILCALL(env, na, nz, n->e); + return 1; + } else if (min == 1 && max == AST_COUNT_UNBOUNDED) { /* '+' */ + fsm_state_t na, nz; + NEWSTATE(na); + NEWSTATE(nz); + EPSILON(sf->x, na); + EPSILON(nz, sf->y); + + EPSILON(nz, na); + TAILCALL(env, na, nz, n->e); + return 1; + } else if (sf->step == 0) { /* - * Subtraction produces quite a mess. We could trim or minimise here - * while q is self-contained, which might work out better than doing it - * in the larger FSM after merge. 
I'm not sure if it works out better - * overall or not. + * Make new beginning/end states for the repeated section, + * build its NFA, and link to its head. */ - if (fsm_empty(q)) { - EPSILON(x, y); - break; + fsm_subgraph_start(env->fsm, &sf->u.repeat.subgraph); + + sf->step++; /* resume after RECURSE */ + NEWSTATE(sf->u.repeat.na); + NEWSTATE(sf->u.repeat.nz); + RECURSE(env, sf->u.repeat.na, sf->u.repeat.nz, n->e); + return 1; + } else { + fsm_state_t tail; + assert(sf->step == 1); + EPSILON(sf->x, sf->u.repeat.na); /* link head to repeated NFA head */ + + b = sf->u.repeat.nz; /* set the initial tail */ + + /* can be skipped */ + if (min == 0) { + EPSILON(sf->u.repeat.na, sf->u.repeat.nz); } + fsm_subgraph_stop(env->fsm, &sf->u.repeat.subgraph); + tail = sf->u.repeat.nz; - if (!fsm_unionxy(env->fsm, q, x, y)) { - return 0; + if (max != AST_COUNT_UNBOUNDED) { + for (i = 1; i < max; i++) { + /* copies the original subgraph; need to set b to the + * original tail + */ + b = tail; + + if (!fsm_subgraph_duplicate(env->fsm, &sf->u.repeat.subgraph, &b, &a)) { + return 0; + } + + EPSILON(sf->u.repeat.nz, a); + + /* To the optional part of the repeated count */ + if (i >= min) { + EPSILON(sf->u.repeat.nz, b); + } + + sf->u.repeat.na = a; /* advance head for next duplication */ + sf->u.repeat.nz = b; /* advance tail for concenation */ + } + } else { + for (i = 1; i < min; i++) { + /* copies the original subgraph; need to set b to the + * original tail + */ + b = tail; + + if (!fsm_subgraph_duplicate(env->fsm, &sf->u.repeat.subgraph, &b, &a)) { + return 0; + } + + EPSILON(sf->u.repeat.nz, a); + + sf->u.repeat.na = a; /* advance head for next duplication */ + sf->u.repeat.nz = b; /* advance tail for concenation */ + } + + /* back link to allow for infinite repetition */ + EPSILON(sf->u.repeat.nz, sf->u.repeat.na); } - break; + /* tail to last repeated NFA tail */ + EPSILON(sf->u.repeat.nz, sf->y); + RETURN(env); + return 1; } +} + +static void +set_active_capture_ids(struct 
comp_env *env, unsigned id) +{ +#if LOG_LINKAGE + fprintf(stderr, "set_active_capture_ids: id %u\n", id); +#endif + assert(env->active_capture_ids != NULL); + u64bitset_set(env->active_capture_ids, id); +} - case AST_EXPR_RANGE: { - unsigned int i; +static int +eval_GROUP(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); - if (n->u.range.from.type != AST_ENDPOINT_LITERAL || n->u.range.to.type != AST_ENDPOINT_LITERAL) { - /* not yet supported */ - return 0; + if (env->re_flags & RE_NOCAPTURE) { + /* passthrough, disable captures */ + if (sf->step == 0) { + sf->step++; + RECURSE(env, sf->x, sf->y, sf->n->u.group.e); + } else { + RETURN(env); } + return 1; + } - assert(n->u.range.from.u.literal.c <= n->u.range.to.u.literal.c); + if (sf->step == 0) { + struct ast_expr *n = sf->n; + set_active_capture_ids(env, n->u.group.id); - if (n->u.range.from.u.literal.c == 0x00 && - n->u.range.to.u.literal.c == 0xff) - { - ANY(x, y); +#if LOG_LINKAGE + fprintf(stderr, "comp_iter: recurse GROUP: x %d, y %d\n", + sf->x, sf->y); +#endif + sf->step++; + RECURSE(env, sf->x, sf->y, n->u.group.e); + return 1; + } else { + assert(sf->step == 1); + + RETURN(env); + return 1; + } +} + +static int +eval_ANCHOR(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); +#if 1 + +#if LOG_LINKAGE + fprintf(stderr, "%s: ignoring anchor node %p, epsilon %d -> %d\n", + __func__, (void *)sf->n, sf->x, sf->y); +#endif + EPSILON(sf->x, sf->y); +#else + switch (sf->n->u.anchor.type) { + case AST_ANCHOR_START: + if (!(sf->n->flags & AST_FLAG_FIRST)) { +#if LOG_LINKAGE + fprintf(stderr, "%s: ignoring START anchor in non-FIRST location\n", + __func__); +#endif + EPSILON(sf->x, sf->y); break; } - for (i = n->u.range.from.u.literal.c; i <= n->u.range.to.u.literal.c; i++) { - LITERAL(x, y, i); +#if LOG_LINKAGE + fprintf(stderr, "%s: START anchor %p epsilon-linking %d -> %d\n", + __func__, (void *)sf->n, env->start_inner, sf->y); +#endif + 
EPSILON(env->start_inner, sf->y); + break; + + case AST_ANCHOR_END: + if (!(sf->n->flags & AST_FLAG_LAST)) { +#if LOG_LINKAGE + fprintf(stderr, "%s: ignoring END anchor in non-LAST location\n", + __func__); +#endif + EPSILON(sf->x, sf->y); + break; } +#if LOG_LINKAGE + fprintf(stderr, "%s: END anchor %p epsilon-linking %d -> %d\n", + __func__, (void *)sf->n, sf->x, stack->end_inner); +#endif + EPSILON(sf->x, stack->end_inner); break; - } default: assert(!"unreached"); + return 0; } +#endif + RETURN(env); + return 1; +} + +static int +eval_SUBTRACT(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + + struct fsm *a, *b; + struct fsm *q; + enum re_flags re_flags = sf->n->re_flags; + + /* wouldn't want to reverse twice! */ + re_flags &= ~(unsigned)RE_REVERSE; + + /* Don't compile capture resolution programs again for the + * subtrees, just ignore capture behavior. */ + re_flags |= RE_NOCAPTURE; + + a = expr_compile(sf->n->u.subtract.a, re_flags, + fsm_getoptions(env->fsm), env->err); + if (a == NULL) { + return 0; + } + + b = expr_compile(sf->n->u.subtract.b, re_flags, + fsm_getoptions(env->fsm), env->err); + if (b == NULL) { + fsm_free(a); + return 0; + } + + q = fsm_subtract(a, b); + if (q == NULL) { + return 0; + } + + /* + * Subtraction produces quite a mess. We could trim or minimise here + * while q is self-contained, which might work out better than doing it + * in the larger FSM after merge. I'm not sure if it works out better + * overall or not. 
+ */ + + if (fsm_empty(q)) { + EPSILON(sf->x, sf->y); + RETURN(env); + return 1; + } + + if (!fsm_unionxy(env->fsm, q, sf->x, sf->y)) { + return 0; + } + + RETURN(env); + return 1; +} + +static int +eval_RANGE(struct comp_env *env) +{ + struct comp_stack_frame *sf = get_comp_stack_top(env); + struct ast_expr *n = sf->n; + unsigned int i; + + if (n->u.range.from.type != AST_ENDPOINT_LITERAL || n->u.range.to.type != AST_ENDPOINT_LITERAL) { + /* not yet supported */ + return 0; + } + + assert(n->u.range.from.u.literal.c <= n->u.range.to.u.literal.c); + + if (n->u.range.from.u.literal.c == 0x00 && + n->u.range.to.u.literal.c == 0xff) + { + ANY(sf->x, sf->y); + RETURN(env); + return 1; + } + + for (i = n->u.range.from.u.literal.c; i <= n->u.range.to.u.literal.c; i++) { + LITERAL(sf->x, sf->y, i); + } + + RETURN(env); + return 1; +} + +static int +eval_TOMBSTONE(struct comp_env *env) +{ + /* do not link -- intentionally pruned */ + (void)env; + RETURN(env); return 1; } @@ -909,6 +1459,8 @@ comp_iter(struct comp_env *env, #undef NEWSTATE #undef LITERAL #undef RECURSE +#undef RETURN +#undef TAILCALL struct fsm * ast_compile(const struct ast *ast, @@ -916,40 +1468,64 @@ ast_compile(const struct ast *ast, const struct fsm_options *opt, struct re_err *err) { - fsm_state_t x, y; + /* Start states inside and outside of match group 0, + * which represents the entire matched input, but does not + * include the implied /.*?/ loop at the start or end when + * a regex is unanchored. */ + fsm_state_t start_outer, start_inner; struct fsm *fsm; assert(ast != NULL); +#if LOG_LINKAGE + ast_print_tree(stderr, opt, re_flags, ast); +#endif + fsm = fsm_new(opt); if (fsm == NULL) { return NULL; } - if (!fsm_addstate(fsm, &x)) { + /* TODO: move these to the call stack, for symmetry? + * Or possibly combine comp_env and stack. 
*/ + if (!fsm_addstate(fsm, &start_outer)) { goto error; } - if (!fsm_addstate(fsm, &y)) { + if (!fsm_addstate(fsm, &start_inner)) { goto error; } - fsm_setstart(fsm, x); - fsm_setend(fsm, y, 1); + if (!fsm_addedge_epsilon(fsm, start_outer, start_inner)) { + goto error; + } + + fsm_setstart(fsm, start_outer); + +#if LOG_LINKAGE + fprintf(stderr, "start: outer %d, inner %d\n", + start_outer, start_inner); +#endif { struct comp_env env; memset(&env, 0x00, sizeof(env)); + env.alloc = fsm->opt->alloc; env.fsm = fsm; env.re_flags = re_flags; env.err = err; - env.start = x; - env.end = y; + env.start_inner = start_inner; + env.start_outer = start_outer; - if (!comp_iter(&env, x, y, ast->expr, NULL)) { + env.max_capture_id = ast->max_capture_id; + + if (!comp_iter(&env, start_inner, ast)) { + if (err != NULL && err->e == 0) { + err->e = RE_EBADGROUP; + } goto error; } } @@ -981,3 +1557,25 @@ ast_compile(const struct ast *ast, return NULL; } +static int +compile_capvm_program_for_stack_end_states(struct comp_env *env, + const struct ast *ast, uint32_t *prog_id) +{ + /* compile and save program in ^, associate its id w/ end state */ + enum re_capvm_compile_ast_res res; + struct capvm_program *prog; + res = re_capvm_compile_ast(env->alloc, + ast, env->re_flags, &prog); + if (res != RE_CAPVM_COMPILE_AST_OK) { + if (env->err != NULL && env->err->e == 0 && errno != 0) { + env->err->e = RE_EERRNO; + } + return 0; + } + + if (!fsm_capture_add_program(env->fsm, prog, prog_id)) { + return 0; + } + + return 1; +} diff --git a/src/libre/ast_rewrite.c b/src/libre/ast_rewrite.c index d05fc0a82..adb0690d2 100644 --- a/src/libre/ast_rewrite.c +++ b/src/libre/ast_rewrite.c @@ -146,7 +146,7 @@ compile_subexpr(struct ast_expr *e, enum re_flags flags) return 0; } - fsm = ast_compile(&ast, flags | RE_ANCHORED, NULL, NULL); + fsm = ast_compile(&ast, flags | RE_ANCHORED | RE_NOCAPTURE, NULL, NULL); if (fsm == NULL) { return 0; } diff --git a/src/libre/print/tree.c b/src/libre/print/tree.c index 
5d2f78691..58e1d6050 100644 --- a/src/libre/print/tree.c +++ b/src/libre/print/tree.c @@ -147,7 +147,9 @@ pp_iter(FILE *f, const struct fsm_options *opt, size_t indent, enum re_flags re_ case AST_EXPR_ALT: { size_t i, count = n->u.alt.count; - fprintf(f, "ALT (%u):\n", (unsigned)count); + fprintf(f, "ALT (%u):%s\n", + (unsigned)count, + n->u.alt.contains_empty_groups ? " (contains_empty_groups)" : ""); for (i = 0; i < count; i++) { pp_iter(f, opt, indent + 1 * IND, re_flags, n->u.alt.n[i]); } @@ -155,7 +157,9 @@ pp_iter(FILE *f, const struct fsm_options *opt, size_t indent, enum re_flags re_ } case AST_EXPR_LITERAL: - fprintf(f, "LITERAL '%c'\n", n->u.literal.c); + fprintf(f, "LITERAL '"); + print_char_or_esc(f, n->u.literal.c); + fprintf(f, "'\n"); break; case AST_EXPR_CODEPOINT: @@ -167,18 +171,27 @@ pp_iter(FILE *f, const struct fsm_options *opt, size_t indent, enum re_flags re_ fprintf_count(f, n->u.repeat.min); fprintf(f, ","); fprintf_count(f, n->u.repeat.max); - fprintf(f, "}\n"); + fprintf(f, "}%s\n", n->u.repeat.contains_empty_groups ? " (contains_empty_groups)" : ""); pp_iter(f, opt, indent + 1 * IND, re_flags, n->u.repeat.e); break; case AST_EXPR_GROUP: - fprintf(f, "GROUP %p: %u\n", (void *) n, n->u.group.id); + fprintf(f, "GROUP: %u%s\n", n->u.group.id, + n->u.group.repeated ? " (repeated)" : ""); pp_iter(f, opt, indent + 1 * IND, re_flags, n->u.group.e); break; case AST_EXPR_ANCHOR: assert(n->u.anchor.type == AST_ANCHOR_START || n->u.anchor.type == AST_ANCHOR_END); - fprintf(f, "ANCHOR %s\n", n->u.anchor.type == AST_ANCHOR_START ? "^" : "$"); + if (n->u.anchor.type == AST_ANCHOR_START) { + fprintf(f, "ANCHOR ^\n"); + } else if (n->u.anchor.type == AST_ANCHOR_END) { + assert(n->u.anchor.type == AST_ANCHOR_START || n->u.anchor.type == AST_ANCHOR_END); + fprintf(f, "ANCHOR $%s\n", + n->u.anchor.is_end_nl ? 
" (with \\n)" : ""); + } else { + assert(!"unreachable"); + } break; case AST_EXPR_SUBTRACT: diff --git a/src/libre/re.c b/src/libre/re.c index 6c423dc36..6474e2db6 100644 --- a/src/libre/re.c +++ b/src/libre/re.c @@ -37,12 +37,12 @@ re_dialect(enum re_dialect dialect) size_t i; static const struct dialect a[] = { - { RE_LIKE, parse_re_like, 0, RE_SINGLE | RE_ANCHORED }, - { RE_LITERAL, parse_re_literal, 0, RE_SINGLE | RE_ANCHORED }, - { RE_GLOB, parse_re_glob, 0, RE_SINGLE | RE_ANCHORED }, + { RE_LIKE, parse_re_like, 0, RE_SINGLE | RE_ANCHORED | RE_NOCAPTURE }, + { RE_LITERAL, parse_re_literal, 0, RE_SINGLE | RE_ANCHORED | RE_NOCAPTURE }, + { RE_GLOB, parse_re_glob, 0, RE_SINGLE | RE_ANCHORED | RE_NOCAPTURE }, { RE_NATIVE, parse_re_native, 0, 0 }, { RE_PCRE, parse_re_pcre, 0, RE_END_NL }, - { RE_SQL, parse_re_sql, 1, RE_SINGLE | RE_ANCHORED } + { RE_SQL, parse_re_sql, 1, RE_SINGLE | RE_ANCHORED | RE_NOCAPTURE } }; for (i = 0; i < sizeof a / sizeof *a; i++) { @@ -125,7 +125,15 @@ re_parse(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque, if (res < 0) { ast_free(ast); - if (err != NULL) { err->e = RE_EERRNO; } + if (err != NULL) { + if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + err->e = RE_EUNSUPPPCRE; + } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE) { + err->e = RE_EUNSUPCAPTUR; + } else if (err->e == RE_ESUCCESS) { + err->e = RE_EERRNO; + } + } return NULL; } diff --git a/src/libre/re_capvm_compile.c b/src/libre/re_capvm_compile.c new file mode 100644 index 000000000..068c00d32 --- /dev/null +++ b/src/libre/re_capvm_compile.c @@ -0,0 +1,1575 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. 
+ */ + +#include "re_capvm_compile.h" +#include "../libfsm/capture_vm.h" +#include "../libfsm/capture_vm_program.h" +#include "../libfsm/capture_vm_log.h" + +/* for EXPENSIVE_CHECKS */ +#include "adt/common.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +#include "ast.h" + +#define DEF_OPCODE_CEIL 8 +#define DEF_CHARCLASS_BUCKETS 8 +#define DEF_CHARCLASS_CEIL 4 +#define DEF_REPEATED_ALT_BACKPATCH_CEIL 1 +#define NO_BUCKET_ID ((uint32_t)-1) +#define NO_CAPTURE_ID ((uint32_t)-1) + +#define LOG_REPETITION_CASES 0 + +/* Placeholder markers for pending offset values (which would + * otherwise temporarily be uninitialized memory), chosen so + * they stand out visually in a debugger. */ +enum pending_offset { + PENDING_OFFSET_REPEAT_OPTIONAL_NEW = 11111111, + PENDING_OFFSET_REPEAT_OPTIONAL_CONT = 22222222, + PENDING_OFFSET_ALT_BACKPATCH_JMP = 33333333, + PENDING_OFFSET_ALT_BACKPATCH_NEW = 44444444, + PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS = 55555555, +}; + +struct capvm_compile_env { + const struct fsm_alloc *alloc; + enum re_flags re_flags; + struct capvm_program *program; + + uint32_t max_capture_seen; + + /* Hash table for interning character classes. + * Doubles and rehashes when half full. */ + struct charclass_htab { + uint32_t bucket_count; + uint32_t buckets_used; + uint32_t ids_used; + struct charclass_htab_bucket { + uint32_t id; /* or NO_BUCKET_ID for unused */ + struct capvm_char_class bitset; + } *buckets; + } charclass_htab; + +#define DEF_REPEATED_GROUPS_CEIL 8 + /* Linked list of nodes used at compile time to compile regexes + * such as '^(a((b*)*)*)$' as if they were '^(a(?:b*)(()))$'. + * Since the inner body of the repeated subexpression with the + * capture groups can be empty, it will always repeat after + * its body matches any input. 
We move the group captures to + * the end of the repeated subexpression to explicitly represent + * them always capturing afterward, because otherwise the + * infinite loop protection skips them. */ + struct repeated_group_info { + /* Ancestor node that should emit the SAVE opcodes; can + * be either a REPEAT or ALT. */ + const struct ast_expr *outermost_ancestor; + size_t ceil; + size_t count; + const struct ast_expr **groups; + /* linked list */ + struct repeated_group_info *prev; + } *repeated_groups; +}; + +static bool +ensure_program_capacity(const struct fsm_alloc *alloc, + struct capvm_program *p, uint32_t count) +{ +#define STRESS_GROWING (EXPENSIVE_CHECKS && 1) + + const uint32_t capacity = p->used + count; + + if (capacity > p->ceil) { +#if STRESS_GROWING + const uint32_t nceil = (p->ceil + 1 < capacity + ? capacity : p->ceil + 1); +#else + const uint32_t nceil = (p->ceil == 0 + ? DEF_OPCODE_CEIL + : 2*p->ceil); + /* This should always be enough for any capacity + * requested during compilation. 
*/ + assert(nceil >= p->used + count); +#endif + LOG(3, "%s: growing %u -> %u (count %u)\n", + __func__, p->ceil, nceil, count); + struct capvm_opcode *nops = f_realloc(alloc, + p->ops, nceil * sizeof(p->ops[0])); + if (nops == NULL) { + return false; + } + +#if EXPENSIVE_CHECKS + for (size_t i = p->ceil; i < nceil; i++) { + /* out of range, will trigger asserts */ + nops[i].t = 'X'; + } +#endif + + p->ceil = nceil; + p->ops = nops; + } + return true; +} + +static void +check_program_for_invalid_labels(const struct capvm_program *p) +{ + for (uint32_t op_i = 0; op_i < p->used; op_i++) { + const struct capvm_opcode *op = &p->ops[op_i]; + switch (op->t) { + case CAPVM_OP_JMP: + assert(op->u.jmp != op_i); + break; + case CAPVM_OP_JMP_ONCE: + assert(op->u.jmp_once != op_i); + break; + case CAPVM_OP_SPLIT: + assert(op->u.split.greedy < p->used); + assert(op->u.split.greedy != op_i); + assert(op->u.split.nongreedy < p->used); + assert(op->u.split.nongreedy != op_i); + break; + + case CAPVM_OP_CHAR: + case CAPVM_OP_CHARCLASS: + case CAPVM_OP_MATCH: + case CAPVM_OP_SAVE: + case CAPVM_OP_ANCHOR: + break; + default: + assert(!"out of range"); + break; + } + } +} + +static uint32_t +get_program_offset(const struct capvm_program *p) +{ + assert(p->used < p->ceil); + +#if EXPENSIVE_CHECKS + struct capvm_opcode *op = &p->ops[p->used]; + op->t = 'X'; /* out of range */ +#endif + + return p->used; +} + +static uint32_t +reserve_program_opcode(struct capvm_program *p) +{ + assert(p->used < p->ceil); + const uint32_t res = p->used; + p->used++; + +#if EXPENSIVE_CHECKS + struct capvm_opcode *op = &p->ops[res]; + op->t = 'X'; /* out of range */ +#endif + + return res; +} + +static bool +grow_program_char_classes(const struct fsm_alloc *alloc, + struct capvm_program *p) +{ + const uint32_t nceil = (p->char_classes.ceil == 0 + ? 
DEF_CHARCLASS_CEIL + : 2*p->char_classes.ceil); + struct capvm_char_class *nsets = f_realloc(alloc, + p->char_classes.sets, nceil * sizeof(nsets[0])); + if (nsets == NULL) { + return false; + } + + p->char_classes.sets = nsets; + p->char_classes.ceil = nceil; + return true; +} + +static bool +intern_char_class(struct capvm_compile_env *env, + struct capvm_program *p, uint64_t chars[4], + uint32_t *id) +{ + LOG(5, "%s: used %u/%u\n", __func__, + env->charclass_htab.buckets_used, env->charclass_htab.bucket_count); + if (env->charclass_htab.buckets_used >= env->charclass_htab.bucket_count/2) { + const uint32_t ocount = env->charclass_htab.bucket_count; + const uint32_t ncount = (ocount == 0 + ? DEF_CHARCLASS_BUCKETS + : 2*env->charclass_htab.bucket_count); + LOG(3, "%s: growing from %u -> %u\n", __func__, ocount, ncount); + struct charclass_htab_bucket *nbuckets = + f_malloc(env->alloc, ncount * sizeof(nbuckets[0])); + if (nbuckets == NULL) { + return false; + } + for (uint32_t n_i = 0; n_i < ncount; n_i++) { + nbuckets[n_i].id = NO_BUCKET_ID; + } + + const uint32_t nmask = ncount - 1; + assert((ncount & nmask) == 0); + + struct charclass_htab_bucket *obuckets = env->charclass_htab.buckets; + for (uint32_t o_i = 0; o_i < ocount; o_i++) { + if (obuckets[o_i].id == NO_BUCKET_ID) { + continue; + } + const uint64_t h = hash_fnv1a_64((const uint8_t *)obuckets[o_i].bitset.octets, + sizeof(obuckets[o_i].bitset)); + + for (uint32_t n_i = 0; n_i < ncount; n_i++) { + const uint64_t b = (h + n_i) & nmask; + if (nbuckets[b].id == NO_BUCKET_ID) { + memcpy(&nbuckets[b].bitset, + &obuckets[o_i].bitset, + sizeof(obuckets[o_i].bitset)); + nbuckets[b].id = obuckets[o_i].id; + break; + } + } + } + f_free(env->alloc, obuckets); + env->charclass_htab.bucket_count = ncount; + env->charclass_htab.buckets = nbuckets; + } + + assert(env->charclass_htab.buckets_used < env->charclass_htab.bucket_count/2); + + const uint32_t count = env->charclass_htab.bucket_count; + const uint32_t mask = count 
- 1; + struct charclass_htab_bucket *buckets = env->charclass_htab.buckets; + + const uint64_t h = hash_fnv1a_64((const uint8_t *)chars, + sizeof(buckets[0].bitset)); + for (uint32_t i = 0; i < count; i++) { + const uint64_t b = (h + i) & mask; + LOG(5, "%s: buckets[%lu].id == %d\n", + __func__, b, buckets[b].id); + if (buckets[b].id == NO_BUCKET_ID) { + memcpy(&buckets[b].bitset, chars, sizeof(buckets[b].bitset)); + if (p->char_classes.count == p->char_classes.ceil) { + if (!grow_program_char_classes(env->alloc, p)) { + return false; + } + } + + memcpy(&p->char_classes.sets[p->char_classes.count], + chars, sizeof(buckets[b].bitset)); + p->char_classes.count++; + buckets[b].id = env->charclass_htab.ids_used; + env->charclass_htab.ids_used++; + env->charclass_htab.buckets_used++; + *id = buckets[b].id; + + return true; + } else if (0 == memcmp(chars, &buckets[b].bitset, sizeof(buckets[b].bitset))) { + *id = buckets[b].id; + return true; /* already present, reuse */ + } else { + /* collision */ + } + } + + assert(!"unreachable"); + return false; +} + +static void +dump_endpoint(const struct ast_endpoint *e) +{ + switch (e->type) { + case AST_ENDPOINT_LITERAL: + fprintf(stderr, "endpoint[LITERAL]: 0x%02x '%c'\n", + e->u.literal.c, + isprint(e->u.literal.c) ? 
e->u.literal.c : '.'); + break; + case AST_ENDPOINT_CODEPOINT: + fprintf(stderr, "endpoint[CODEPOINT]: 0x%x\n", + e->u.codepoint.u); + break; + case AST_ENDPOINT_NAMED: + assert(!"todo?"); + break; + } +} + +static void +dump_pos(const struct ast_pos *p) +{ + fprintf(stderr, "pos: byte %u, line %u, col %u\n", + p->byte, p->line, p->col); +} + +static bool +active_node(const struct ast_expr *n) +{ + assert(n != NULL); + + switch (n->type) { + case AST_EXPR_TOMBSTONE: + return false; + default: + return !(n->flags & AST_FLAG_UNSATISFIABLE); + } +} + +static bool +subtree_represents_character_class(const struct ast_expr *expr, uint64_t cc[4]) +{ + for (size_t i = 0; i < 4; i++) { + cc[i] = 0; + } + + switch (expr->type) { + case AST_EXPR_EMPTY: + /* empty set */ + return false; + + case AST_EXPR_LITERAL: + u64bitset_set(cc, (uint8_t)expr->u.literal.c); + return true; + + case AST_EXPR_RANGE: + { + const struct ast_endpoint *f = &expr->u.range.from; + const struct ast_endpoint *t = &expr->u.range.to; + if (f->type != AST_ENDPOINT_LITERAL + || t->type != AST_ENDPOINT_LITERAL) { + return false; + } + for (uint64_t c = (uint8_t)f->u.literal.c; c <= (uint8_t)t->u.literal.c; c++) { + u64bitset_set(cc, (uint8_t)c); + } + return true; + } + + case AST_EXPR_ALT: + { + /* union character classes from children */ + assert(expr->u.alt.count > 0); + for (size_t c_i = 0; c_i < expr->u.alt.count; c_i++) { + uint64_t child_cc[4]; + const struct ast_expr *child = expr->u.alt.n[c_i]; + if (subtree_represents_character_class(child, child_cc)) { + for (size_t cc_i = 0; cc_i < 4; cc_i++) { + cc[cc_i] |= child_cc[cc_i]; + } + } else { + return false; + } + } + return true; + } + + case AST_EXPR_SUBTRACT: + { + /* Only support AST_EXPR_SUBTRACT nodes where .a is a + * RANGE:0x00-0xff and .b is either a LITERAL, RANGE, EMPTY, + * or an ALT that itself represents a character class, */ + + const struct ast_expr *sub_a = expr->u.subtract.a; + if (sub_a->type != AST_EXPR_RANGE) { + return false; 
+ } + + const struct ast_endpoint *f = &sub_a->u.range.from; + const struct ast_endpoint *t = &sub_a->u.range.to; + if (f->type != AST_ENDPOINT_LITERAL || t->type != AST_ENDPOINT_LITERAL) { + return false; + } + + for (uint64_t i = 0; i < 256; i++) { + if (i >= (uint8_t)f->u.literal.c && i <= (uint8_t)f->u.literal.c) { + u64bitset_set(cc, i); + } + } + + for (size_t i = 0; i < 4; i++) { + cc[i] = ~(uint64_t)0; + } + + uint64_t neg_cc[4]; + if (expr->u.subtract.b->type == AST_EXPR_EMPTY) { + for (size_t cc_i = 0; cc_i < 4; cc_i++) { + neg_cc[cc_i] = (uint64_t)0; + } + } else if (subtree_represents_character_class(expr->u.subtract.b, neg_cc)) { + for (size_t cc_i = 0; cc_i < 4; cc_i++) { + cc[cc_i] &=~ neg_cc[cc_i]; + } + } else { + return false; + } + return true; + } + + default: + return false; + } +} + +static void +make_charclass_case_insensitive(uint64_t *cc) +{ + for (size_t i = 0; i < 256; i++) { + if (isalpha(i) && u64bitset_get(cc, i)) { + const char c = (char)i; + const uint64_t cl = (uint64_t)tolower(c); + const uint64_t cu = (uint64_t)toupper(c); + u64bitset_set(cc, cl); + u64bitset_set(cc, cu); + } + } +} + +static bool +can_safely_skip_JMP_ONCE(const struct ast_expr *expr) +{ + /* There are potentially cases where it's safe to skip the + * JMP_ONCE special case, which would save memory by not + * expanding the path an extra bit per iteration, but the + * criteria are subtle enough that it can probably wait. 
*/ + (void)expr; + return false; +} + +static bool +push_repeated_group_info(struct capvm_compile_env *env, const struct ast_expr *expr) +{ + LOG(3 - LOG_REPETITION_CASES, + "%s: setting env->repeated_groups.outermost_ancestor <- %p\n", + __func__, (void *)expr); + + assert(expr != NULL); + assert(expr->type == AST_EXPR_REPEAT || expr->type == AST_EXPR_ALT); + + struct repeated_group_info *rgi = f_calloc(env->alloc, 1, sizeof(*rgi)); + if (rgi == NULL) { + return false; + } + rgi->outermost_ancestor = expr; + rgi->prev = env->repeated_groups; + env->repeated_groups = rgi; + LOG(3 - LOG_REPETITION_CASES, + "%s: push rgi, allocated %p, prev %p\n", + __func__, (void *)rgi, (void *)rgi->prev); + return true; +} + +static void +pop_repeated_group_info(struct capvm_compile_env *env, const struct ast_expr *expr) +{ + assert(expr != NULL); + assert(expr->type == AST_EXPR_REPEAT || expr->type == AST_EXPR_ALT); + struct repeated_group_info *rgi = env->repeated_groups; + LOG(3 - LOG_REPETITION_CASES, + "%s: pop rgi, expecting %p, got %p\n", + __func__, (void *)expr, (void *)rgi->outermost_ancestor); + assert(rgi->outermost_ancestor == expr); + struct repeated_group_info *prev = rgi->prev; + LOG(3 - LOG_REPETITION_CASES, + "%s: pop rgi, freeing %p, prev %p\n", + __func__, (void *)rgi, (void *)prev); + + env->repeated_groups = prev; + if (rgi->groups != NULL) { + f_free(env->alloc, rgi->groups); + } + f_free(env->alloc, rgi); +} + +static bool +emit_repeated_groups(struct capvm_compile_env *env, struct capvm_program *p); + +static bool +capvm_compile_iter_save_groups_in_skipped_subtree(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr); + +static bool +compile_kleene_star(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr); + +static bool +capvm_compile_iter(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr) +{ + LOG(4, "%s: expr %p, type %s, %u/%u used, re_flags 
0x%02x\n", + __func__, (void *)expr, ast_node_type_name(expr->type), + p->used, p->ceil, expr->re_flags); + + switch (expr->type) { + case AST_EXPR_EMPTY: + case AST_EXPR_TOMBSTONE: + break; + case AST_EXPR_CONCAT: + for (size_t i = 0; i < expr->u.concat.count; i++) { + /* append instructions from each consecutive node */ + const struct ast_expr *n = expr->u.concat.n[i]; + if (!capvm_compile_iter(env, p, n)) { return false; } + } + break; + case AST_EXPR_ALT: + { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + assert(expr->u.alt.count > 0); + + if (expr->u.alt.contains_empty_groups) { + if (!push_repeated_group_info(env, expr)) { + return false; + } + } + + /* If this ALT node represents a character class (such as a + * rewritten . character's ALT[0x00 - 0x09, 0x0b - 0xff] or + * a rewritten [abc-ef]'s ... , then produce the corresponding + * character class literal. The direct representation of the + * subtree would take several instructions and introduce + * unnecessary splits, increasing memory usage at runtime. 
*/ + uint64_t cc[4]; + if (subtree_represents_character_class(expr, cc)) { + const uint32_t pos = reserve_program_opcode(p); + struct capvm_opcode *op_cc = &p->ops[pos]; + op_cc->t = CAPVM_OP_CHARCLASS; + + if (expr->re_flags & RE_ICASE) { + make_charclass_case_insensitive(cc); + } + if (!intern_char_class(env, p, cc, &op_cc->u.charclass_id)) { + return false; + } + + if (expr->u.alt.contains_empty_groups) { + pop_repeated_group_info(env, expr); + } + break; + } + + uint32_t active_count = 0; + uint32_t last_active; + struct alt_flow_info { + bool is_active; + uint32_t backpatch; + }; + struct alt_flow_info *flow_info = f_calloc(env->alloc, + expr->u.alt.count, sizeof(flow_info[0])); + assert(flow_info != NULL); + + for (uint64_t i = 0; i < expr->u.alt.count; i++) { + const struct ast_expr *n = expr->u.alt.n[i]; + if (active_node(n)) { + last_active = i; + active_count++; + flow_info[i].is_active = true; + } + } + + /* If there are no children active this should terminate + * with an empty program. */ + LOG(3, "%s: active_count == %d\n", __func__, active_count); + if (active_count == 0) { + LOG(3, "%s: active_count == 0, skipping\n", __func__); + + for (uint64_t i = 0; i < expr->u.alt.count; i++) { + const struct ast_expr *n = expr->u.alt.n[i]; + capvm_compile_iter_save_groups_in_skipped_subtree(env, p, n); + if (n->flags & AST_FLAG_NULLABLE) { + break; + } + } + + f_free(env->alloc, flow_info); + if (expr->u.alt.contains_empty_groups) { + pop_repeated_group_info(env, expr); + } + + /* FIXME: May need distinct error case to not + * leak. There is currently no test reaching + * this and the fuzzer has not produced an input + * that reaches it -- unsatisfiability has probably + * already pruned subtrees that would get here. */ + return true; + } else if (active_count == 1) { + /* even if one of the later subtrees is active, an earlier + * subtree can still shadow it. 
*/ + bool shadowed = false; + + for (uint64_t i = 0; i < expr->u.alt.count; i++) { + if (i != last_active) { /* evaluate for empty groups */ + const struct ast_expr *n = expr->u.alt.n[i]; + capvm_compile_iter_save_groups_in_skipped_subtree(env, p, n); + if (n->flags & AST_FLAG_NULLABLE) { + shadowed = true; + break; + } + } + } + + if (!shadowed) { + LOG(5, "narrowing to last_active %u\n", last_active); + assert(last_active < expr->u.alt.count); + const struct ast_expr *n = expr->u.alt.n[last_active]; + if (!capvm_compile_iter(env, p, n)) { + return false; + } + f_free(env->alloc, flow_info); + if (expr->u.alt.contains_empty_groups) { + pop_repeated_group_info(env, expr); + } + break; + } else { + f_free(env->alloc, flow_info); + if (expr->u.alt.contains_empty_groups) { + pop_repeated_group_info(env, expr); + } + return true; /* may need distinct error case to not leak */ + } + } + + LOG(3, "%s: compiling AST_EXPR_ALT with %u active nodes, last_active %u\n", + __func__, active_count, last_active); + + /* note: binarized split: for a|b|c, treat this like "a else (b else c)", + * leading to generated code like: + * + * // note: trying each case in order, earlier cases are more greedy + * - split_cont j1 + * - split_new j2 + * j1: + * - + * - jmp pos_after_all // or split pos_after_all, PLUS_BACKPATCH, see below + * j2: + * - split_cont j3 + * - split_new j4 + * j3: + * - + * - jmp pos_after_all + * j4: + * //// DO NOT EMIT split instructions here, treat like a final else + * - + * // fall through to pos_after_all + * pos_after_all: + * + * + * When an ALT case: + * - is nullable (can match the empty string) + * - is the first nullable case (shadowing cases after) + * - is in a subtree of a REPEAT{1,inf} (+) node whose entire subtree is nullable + * then that case's `jmp pos_after_all` should be replaced with + * `split pos_after_all pos_after_repeat_backjmp`, which will need a special + * label for batch-patching by the REPEAT later. 
This is necessary for cases + * like '^(?:($|x))+$', where the `jmp pos_after_all` would lead to code after + * the ALT that has already been executed at the current input position. + * */ + for (uint32_t c_i = 0; c_i < expr->u.alt.count; c_i++) { + const bool is_final_else_case = c_i == last_active; + LOG(3, "%s: %p c_i %u/%zu, is_final_else_case %d\n", + __func__, (void *)expr, c_i, expr->u.alt.count, is_final_else_case); + if (!flow_info[c_i].is_active) { continue; } + + if (is_final_else_case) { + /* Just add the case for the child node and + * then fall through to pos_after_all. */ + const struct ast_expr *n = expr->u.alt.n[c_i]; + LOG(3, "%s: %p recursing...\n", __func__, (void *)expr); + if (!capvm_compile_iter(env, p, n)) { + return false; + } + LOG(3, "%s: %p recursing...done (final-else-case)\n", __func__, (void *)expr); + + struct repeated_group_info *rgi = env->repeated_groups; + LOG(3 - LOG_REPETITION_CASES, + "%s: ALT %p: contains_empty_groups: %d, outermost_ancestor: %p == %p ? %d\n", + __func__, (void *)expr, + expr->u.alt.contains_empty_groups, + (void *)(rgi ? rgi->outermost_ancestor : NULL), + (void *)expr, + (rgi ? rgi->outermost_ancestor == expr : 0)); + if (expr->u.alt.contains_empty_groups) { + assert(rgi != NULL); + LOG(3 - LOG_REPETITION_CASES, + "%s: outermost_ancestor match, count %zu\n", __func__, rgi->count); + if (!emit_repeated_groups(env, p)) { + return false; + } + } + } else { + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + const uint32_t pos_split_before_case = reserve_program_opcode(p); + struct capvm_opcode *op_split_before = &p->ops[pos_split_before_case]; + op_split_before->t = CAPVM_OP_SPLIT; + + /* greedier branch: trying the next case, in order */ + op_split_before->u.split.greedy = get_program_offset(p); + + /* less greedy branch: moving on to the next case. 
+ * will backpatch .new to after this case's JMP later */ + op_split_before->u.split.nongreedy = PENDING_OFFSET_ALT_BACKPATCH_NEW; + + const struct ast_expr *n = expr->u.alt.n[c_i]; + LOG(3, "%s: %p recursing...\n", __func__, (void *)expr); + if (!capvm_compile_iter(env, p, n)) { + return false; + } + LOG(3, "%s: %p recursing...done (non-final)\n", __func__, (void *)expr); + + struct repeated_group_info *rgi = env->repeated_groups; + LOG(3 - LOG_REPETITION_CASES, + "%s: ALT %p: contains_empty_groups: %d, outermost_ancestor: %p == %p ? %d\n", + __func__, (void *)expr, expr->u.alt.contains_empty_groups, + (void *)(rgi ? rgi->outermost_ancestor : NULL), + (void *)expr, + (rgi ? rgi->outermost_ancestor == expr : 0)); + if (expr->u.alt.contains_empty_groups) { + assert(rgi != NULL); + LOG(3 - LOG_REPETITION_CASES, + "%s: outermost_ancestor match, count %zu\n", __func__, rgi->count); + if (!emit_repeated_groups(env, p)) { + return false; + } + } + + /* JMP or SPLIT, plus space after */ + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + + /* Based on analysis, either emit a JMP or SPLIT. 
*/ + if (n->u.alt.nullable_alt_inside_plus_repeat) { + const uint32_t pos_split_after = reserve_program_opcode(p); + flow_info[c_i].backpatch = pos_split_after; + struct capvm_opcode *op_split_after = &p->ops[pos_split_after]; + op_split_after->t = CAPVM_OP_SPLIT; + op_split_after->u.split.greedy = PENDING_OFFSET_ALT_BACKPATCH_JMP; + op_split_after->u.split.nongreedy = PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS; + } else { + const uint32_t pos_jmp_after = reserve_program_opcode(p); + flow_info[c_i].backpatch = pos_jmp_after; + struct capvm_opcode *op_jmp = &p->ops[pos_jmp_after]; + op_jmp->t = CAPVM_OP_JMP; + op_jmp->u.jmp = PENDING_OFFSET_ALT_BACKPATCH_JMP; + } + + /* refresh pointer after possible realloc */ + op_split_before = &p->ops[pos_split_before_case]; + + /* and the original split jumps to after + * this case's JMP */ + op_split_before->u.split.nongreedy = get_program_offset(p); + } + } + + /* Ensure there's space for the next instruction, and then + * set every case's JMP suffix to it. */ + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos_after_all = get_program_offset(p); + + for (size_t i = 0; i < expr->u.alt.count - 1; i++) { + const bool is_final_else_case = i == last_active; + assert(flow_info[i].backpatch < p->used); + if (is_final_else_case || !flow_info[i].is_active) { + continue; + } + + struct capvm_opcode *op_patch = &p->ops[flow_info[i].backpatch]; + if (op_patch->t == CAPVM_OP_JMP) { + assert(op_patch->u.jmp == PENDING_OFFSET_ALT_BACKPATCH_JMP); + op_patch->u.jmp = pos_after_all; + } else if (op_patch->t == CAPVM_OP_SPLIT) { + assert(op_patch->u.split.greedy == PENDING_OFFSET_ALT_BACKPATCH_JMP); + op_patch->u.split.greedy = pos_after_all; + /* This will be patched by an ancestor repeat node after returning. 
*/ + assert(op_patch->u.split.greedy == PENDING_OFFSET_ALT_BACKPATCH_AFTER_REPEAT_PLUS); + } else { + assert(!"type mismatch"); + } + } + + f_free(env->alloc, flow_info); + if (expr->u.alt.contains_empty_groups) { + pop_repeated_group_info(env, expr); + } + break; + } + case AST_EXPR_LITERAL: + { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos]; + + if (expr->re_flags & RE_ICASE) { + uint64_t cc[4] = { 0 }; + u64bitset_set(cc, (uint8_t)expr->u.literal.c); + + op->t = CAPVM_OP_CHARCLASS; + make_charclass_case_insensitive(cc); + if (!intern_char_class(env, p, cc, &op->u.charclass_id)) { + return false; + } + } else { + op->t = CAPVM_OP_CHAR; + op->u.chr = (uint8_t)expr->u.literal.c; + } + break; + } + case AST_EXPR_CODEPOINT: + assert(!"not implemented, unreachable"); + break; + case AST_EXPR_REPEAT: + { + const unsigned min = expr->u.repeat.min; + const unsigned max = expr->u.repeat.max; + const struct ast_expr *e = expr->u.repeat.e; + + /* collect groups to emit */ + if (expr->u.repeat.contains_empty_groups) { + if (!push_repeated_group_info(env, expr)) { + return false; + } + } + + if (min == 1 && max == 1) { /* {1,1} */ + /* if repeating exactly once, just defer to subtree, + * but still do the repeated_group_info cleanup below */ + if (!capvm_compile_iter(env, p, e)) { + return false; + } + } else if (min == 0 && max == 1) { /* ? 
*/ + /* split l1, l2 + * l1: + * l2: */ + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + + const uint32_t pos_split = reserve_program_opcode(p); + const uint32_t pos_l1 = get_program_offset(p); + + struct capvm_opcode *op_split = &p->ops[pos_split]; + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.greedy = pos_l1; + op_split->u.split.nongreedy = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + + if (!capvm_compile_iter(env, p, e)) { return false; } + + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + op_split = &p->ops[pos_split]; /* refresh pointer */ + + const uint32_t after_expr = get_program_offset(p); + op_split->u.split.nongreedy = after_expr; + } else if (min == 0 && max == AST_COUNT_UNBOUNDED) { /* * */ + if (!compile_kleene_star(env, p, expr)) { + return false; + } + } else if (min == 1 && max == AST_COUNT_UNBOUNDED) { /* + */ + /* l1: + * split l1, l2 + * l2: */ + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos_l1 = get_program_offset(p); + + if (!capvm_compile_iter(env, p, e)) { return false; } + + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + + /* Only emit the backwards jump for repetition branching + * if the subtree added any instructions. */ + if (get_program_offset(p) != pos_l1) { + if (!ensure_program_capacity(env->alloc, p, 3)) { + return false; + } + const uint32_t pos_split = reserve_program_opcode(p); + const uint32_t pos_l2 = get_program_offset(p); + + struct capvm_opcode *op_split = &p->ops[pos_split]; + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.greedy = pos_l1; + op_split->u.split.nongreedy = pos_l2; + } + } else if (min == 0 && max == 0) { /* {0,0} */ + /* ignored, except any groups contained within that could match + * empty input still get emitted (unless unsatisfiable). 
*/ + if (e->flags & AST_FLAG_UNSATISFIABLE) { + LOG(3, "%s: repeat{0,0} && UNSATISFIABILE -> skipping\n", __func__); + break; + } + + /* Unreachable group captures still need to be counted, otherwise + * subsequent ones would get shifted down. */ + if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, e)) { return false; } + break; + } else { /* other bounded count */ + /* repeat the minimum number of times */ + for (size_t i = 0; i < min; i++) { + if (!capvm_compile_iter(env, p, e)) { return false; } + } + + if (max == AST_COUNT_UNBOUNDED) { + /* A repeat of {x,inf} should be treated like + * (?:subtree){x} (?:subtree)* , where any numbered + * capture groups inside have the same group ID in + * both copies of the subtree. */ + if (!compile_kleene_star(env, p, expr)) { + return false; + } + } else { + /* then repeat up to the max as ? + * + * split_cont l1 + * split_new l2 + * l1: + * l2: */ + for (size_t i = min; i < max; i++) { + if (!ensure_program_capacity(env->alloc, p, 3)) { + return false; + } + + const uint32_t pos_split = reserve_program_opcode(p); + const uint32_t pos_l1 = get_program_offset(p); + + struct capvm_opcode *op_split = &p->ops[pos_split]; + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.greedy = pos_l1; + op_split->u.split.nongreedy = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + + if (!capvm_compile_iter(env, p, e)) { return false; } + + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + op_split = &p->ops[pos_split]; /* refresh pointer */ + + const uint32_t after_expr = get_program_offset(p); + op_split->u.split.nongreedy = after_expr; + } + } + } + + struct repeated_group_info *rgi = env->repeated_groups; + LOG(3 - LOG_REPETITION_CASES, + "%s: REPEAT %p: contains_empty_groups: %d, outermost_ancestor: %p == %p ? %d\n", + __func__, (void *)expr, expr->u.repeat.contains_empty_groups, + (void *)(rgi ? rgi->outermost_ancestor : NULL), + (void *)expr, + (rgi ? 
rgi->outermost_ancestor == expr : 0)); + if (expr->u.repeat.contains_empty_groups + && rgi != NULL + && rgi->outermost_ancestor == expr) { + LOG(3 - LOG_REPETITION_CASES, + "%s: outermost_ancestor match, count %zu\n", __func__, rgi->count); + if (!emit_repeated_groups(env, p)) { + return false; + } + pop_repeated_group_info(env, expr); + } + + break; + } + case AST_EXPR_GROUP: + { + const uint32_t id = expr->u.group.id; + const int is_repeated = expr->u.group.repeated; + + /* If the group is nullable and repeated, then move its save + * instructions to the end, since the final iteration matching + * nothing will always clobber any earlier saves. This is a + * workaround for cases that would otherwise incorrectly be + * halted by infinite loop prevention at runtime. */ + if (is_repeated && ((expr->flags & AST_FLAG_NULLABLE) + || !(expr->flags & AST_FLAG_CAN_CONSUME))) { + + struct repeated_group_info *rgi = env->repeated_groups; + + LOG(3 - LOG_REPETITION_CASES, + "%s: checking repeated group %u (capvm_compile_iter recurse), parent %p\n", + __func__, id, (void *)(rgi ? rgi->outermost_ancestor : NULL)); + if (!capvm_compile_iter(env, p, expr->u.group.e)) { return false; } + LOG(3 - LOG_REPETITION_CASES, + "%s: checking repeated group %u (capvm_compile_iter done), parent %p\n", + __func__, id, (void *)(rgi ? rgi->outermost_ancestor : NULL)); + + /* don't emit these here, parent repeat node will add them after. */ + if (rgi && rgi->outermost_ancestor != NULL) { + if (rgi->count == rgi->ceil) { + const size_t nceil = (rgi->ceil == 0 + ? 
DEF_REPEATED_GROUPS_CEIL + : 2*rgi->ceil); + const struct ast_expr **ngroups = f_realloc(env->alloc, + rgi->groups, + nceil * sizeof(ngroups[0])); + if (ngroups == NULL) { + return false; + } + rgi->groups = ngroups; + rgi->ceil = nceil; + } + + LOG(3 - LOG_REPETITION_CASES, + "%s: adding group %u (%p) to outermost_ancestor %p\n", + __func__, id, (void *)expr, + (void *)rgi->outermost_ancestor); + rgi->groups[rgi->count] = expr; + rgi->count++; + } + } else { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos_start = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos_start]; + op->t = CAPVM_OP_SAVE; /* save capture start */ + op->u.save = 2*id; + + if (!capvm_compile_iter(env, p, expr->u.group.e)) { return false; } + + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos_end = reserve_program_opcode(p); + op = &p->ops[pos_end]; + op->t = CAPVM_OP_SAVE; /* save capture end */ + op->u.save = 2*id + 1; + } + + if (id > env->max_capture_seen || env->max_capture_seen == NO_CAPTURE_ID) { + env->max_capture_seen = id; + } + + break; + } + + case AST_EXPR_ANCHOR: + { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos]; + op->t = CAPVM_OP_ANCHOR; + op->u.anchor = (expr->u.anchor.type == AST_ANCHOR_START + ? 
CAPVM_ANCHOR_START : CAPVM_ANCHOR_END); + break; + } + case AST_EXPR_SUBTRACT: + { + uint64_t cc[4]; + for (size_t i = 0; i < 4; i++) { + cc[i] = ~(uint64_t)0; + } + if (subtree_represents_character_class(expr, cc)) { + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos = reserve_program_opcode(p); + struct capvm_opcode *op_cc = &p->ops[pos]; + op_cc->t = CAPVM_OP_CHARCLASS; + + if (expr->re_flags & RE_ICASE) { + make_charclass_case_insensitive(cc); + } + + if (!intern_char_class(env, p, cc, &op_cc->u.charclass_id)) { + return false; + } + } else { + /* FIXME: should return UNSUPPORTED */ + assert(!"unreachable"); + return false; + } + break; + } + case AST_EXPR_RANGE: + { + uint64_t cc[4] = { 0 }; + if (!subtree_represents_character_class(expr, cc)) { + dump_endpoint(&expr->u.range.from); + dump_pos(&expr->u.range.start); + dump_endpoint(&expr->u.range.to); + dump_pos(&expr->u.range.end); + assert(!"unreachable"); + return false; + } + + if (!ensure_program_capacity(env->alloc, p, 1)) { + return false; + } + const uint32_t pos = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos]; + + op->t = CAPVM_OP_CHARCLASS; + if (expr->re_flags & RE_ICASE) { + make_charclass_case_insensitive(cc); + } + + if (!intern_char_class(env, p, cc, &op->u.charclass_id)) { + return false; + } + break; + } + default: + assert(!"matchfail"); + } + + return true; +} + +static bool +compile_kleene_star(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr) +{ + /* Note: min count may be > 0 because this is also + * used for unbounded repetition with a lower count, + * as in `a{3,}`, but in that case the {min} + * repetitions have already been handled by the caller. 
*/ + assert(expr && expr->type == AST_EXPR_REPEAT && + expr->u.repeat.max == AST_COUNT_UNBOUNDED); + + /* l1: split l2, l3 + * l2: + * jmp_once l1 OR jmp l1 + * l3: */ + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + + const uint32_t pos_l1 = reserve_program_opcode(p); + const uint32_t pos_l2 = get_program_offset(p); + + struct capvm_opcode *op_split = &p->ops[pos_l1]; + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.greedy = PENDING_OFFSET_REPEAT_OPTIONAL_CONT; + op_split->u.split.nongreedy = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + + if (!capvm_compile_iter(env, p, expr->u.repeat.e)) { return false; } + + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + + /* It's more expensive to always emit JMP_ONCE because it + * extends the path each iteration, so we could detect when + * it would be safe to use a JMP instead. */ + if (can_safely_skip_JMP_ONCE(expr)) { + const uint32_t pos_jmp = reserve_program_opcode(p); + struct capvm_opcode *op_jmp = &p->ops[pos_jmp]; + op_jmp->t = CAPVM_OP_JMP; + op_jmp->u.jmp = pos_l1; + } else { + const uint32_t pos_jmp_once = reserve_program_opcode(p); + struct capvm_opcode *op_jmp_once = &p->ops[pos_jmp_once]; + op_jmp_once->t = CAPVM_OP_JMP_ONCE; + op_jmp_once->u.jmp_once = pos_l1; + } + + const uint32_t pos_l3 = get_program_offset(p); + op_split = &p->ops[pos_l1]; /* refresh pointer */ + op_split->u.split.greedy = pos_l2; + op_split->u.split.nongreedy = pos_l3; + return true; +} + +static bool +emit_repeated_groups(struct capvm_compile_env *env, struct capvm_program *p) +{ + struct repeated_group_info *rgi = env->repeated_groups; + for (size_t i = 0; i < rgi->count; i++) { + const struct ast_expr *group = rgi->groups[i]; + assert(group->u.group.repeated); + const unsigned id = group->u.group.id; + LOG(3 - LOG_REPETITION_CASES, + "%s: checking %zu/%zu: group_id %u\n", + __func__, i, rgi->count, id); + + if (group->flags & (AST_FLAG_ANCHORED_START | AST_FLAG_ANCHORED_END)) { + /* if the 
otherwise empty group contains any anchors, + * then emit a subtree like (^)? so that its capture + * is only set when the anchors would match. */ + if (!ensure_program_capacity(env->alloc, p, 6)) { + return false; + } + + /* split l1, l2 + * l1: + * + * l2: save (start) + * save (end) + * */ + const uint32_t pos_split = reserve_program_opcode(p); + const uint32_t pos_l1 = get_program_offset(p); + + struct capvm_opcode *op_split = &p->ops[pos_split]; + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.greedy = pos_l1; + op_split->u.split.nongreedy = PENDING_OFFSET_REPEAT_OPTIONAL_NEW; + + if (group->flags & AST_FLAG_ANCHORED_START) { + const uint32_t pos_start = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos_start]; + op->t = CAPVM_OP_ANCHOR; + op->u.anchor = CAPVM_ANCHOR_START; + } + + if (group->flags & AST_FLAG_ANCHORED_END) { + const uint32_t pos_end = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos_end]; + op->t = CAPVM_OP_ANCHOR; + op->u.anchor = CAPVM_ANCHOR_END; + } + + const uint32_t pos_start = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos_start]; + op->t = CAPVM_OP_SAVE; /* save capture start */ + op->u.save = 2*id; + + const uint32_t pos_end = reserve_program_opcode(p); + op = &p->ops[pos_end]; + op->t = CAPVM_OP_SAVE; /* save capture end */ + op->u.save = 2*group->u.group.id; + op->u.save = 2*id + 1; + + const uint32_t after_expr = get_program_offset(p); + op_split = &p->ops[pos_split]; /* refresh pointer */ + op_split->u.split.nongreedy = after_expr; + } else { + /* simple case, emit SAVE pair */ + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + const uint32_t pos_start = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos_start]; + op->t = CAPVM_OP_SAVE; /* save capture start */ + op->u.save = 2*id; + + const uint32_t pos_end = reserve_program_opcode(p); + op = &p->ops[pos_end]; + op->t = CAPVM_OP_SAVE; /* save capture end */ + op->u.save = 
2*group->u.group.id; + op->u.save = 2*id + 1; + } + } + + /* clear, because an ALT's subtrees can have distinct repeated groups */ + rgi->count = 0; + + return true; +} + +static bool +capvm_compile_iter_save_groups_in_skipped_subtree(struct capvm_compile_env *env, + struct capvm_program *p, const struct ast_expr *expr) +{ + /* Follow the subtree as far as any expressions that could + * contain GROUPs. Emit any empty groups. This is necessary for + * regexes like /()*^/ and /(x|(x|))^/ whose subtrees are + * otherwise pruned but would still match the empty string + * before ^. */ + switch (expr->type) { + case AST_EXPR_EMPTY: + case AST_EXPR_LITERAL: + case AST_EXPR_CODEPOINT: + case AST_EXPR_ANCHOR: + case AST_EXPR_SUBTRACT: + case AST_EXPR_RANGE: + case AST_EXPR_TOMBSTONE: + /* none of these can contain groups */ + break; + + case AST_EXPR_CONCAT: + if (expr->flags & AST_FLAG_UNSATISFIABLE) { + return true; /* skip */ + } + for (size_t i = 0; i < expr->u.concat.count; i++) { + if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, expr->u.concat.n[i])) { + return false; + } + } + break; + case AST_EXPR_ALT: + for (size_t i = 0; i < expr->u.alt.count; i++) { + if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, expr->u.alt.n[i])) { + return false; + } + } + break; + + case AST_EXPR_REPEAT: + return capvm_compile_iter_save_groups_in_skipped_subtree(env, p, expr->u.repeat.e); + + case AST_EXPR_GROUP: + { + const uint32_t id = expr->u.group.id; + LOG(5, "%s: recording otherwise skipped group %u\n", __func__, id); + + if (!ensure_program_capacity(env->alloc, p, 2)) { + return false; + } + + if (id > env->max_capture_seen || env->max_capture_seen == NO_CAPTURE_ID) { + env->max_capture_seen = id; + } + + const uint32_t pos_start = reserve_program_opcode(p); + struct capvm_opcode *op = &p->ops[pos_start]; + op->t = CAPVM_OP_SAVE; /* save capture start */ + op->u.save = 2*id; + + const uint32_t pos_end = reserve_program_opcode(p); + op = 
&p->ops[pos_end]; + op->t = CAPVM_OP_SAVE; /* save capture end */ + op->u.save = 2*id + 1; + + if (!capvm_compile_iter_save_groups_in_skipped_subtree(env, p, expr->u.group.e)) { + return false; + } + + break; + } + default: + assert(!"match fail"); + + } + return true; +} + +static enum re_capvm_compile_ast_res +capvm_compile(struct capvm_compile_env *env, + const struct ast *ast) +{ + struct capvm_program *p = f_calloc(env->alloc, 1, sizeof(*p)); + if (p == NULL) { + return RE_CAPVM_COMPILE_AST_ERROR_ALLOC; + } + + LOG(3, "%s: has_unanchored: start? %d, end? %d\n", __func__, + ast->has_unanchored_start, + ast->has_unanchored_end); + + /* If the regex has an unanchored start, it gets a `.*` prefix, + * but with the labels swapped so that the unanchored start + * loop is NOT greedy. */ + if (ast->has_unanchored_start) { + if (!ensure_program_capacity(env->alloc, p, 4)) { + return RE_CAPVM_COMPILE_AST_ERROR_ALLOC; + } + + /* l1: split l3, l2 + * l2: . + * jmp l1 + * l3: */ + const uint32_t l1 = get_program_offset(p); + const uint32_t split_pos = reserve_program_opcode(p); + struct capvm_opcode *op_split = &p->ops[split_pos]; + + const uint32_t l2 = get_program_offset(p); + const uint32_t op_cc_pos = reserve_program_opcode(p); + struct capvm_opcode *op_cc = &p->ops[op_cc_pos]; + + const uint32_t op_jmp_pos = reserve_program_opcode(p); + struct capvm_opcode *op_jmp = &p->ops[op_jmp_pos]; + + const uint32_t l3 = get_program_offset(p); + + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.greedy = l3; + op_split->u.split.nongreedy = l2; + + op_cc->t = CAPVM_OP_CHARCLASS; + uint64_t any[4]; + for (size_t i = 0; i < 4; i++) { + any[i] = ~(uint64_t)0; + } + if (!intern_char_class(env, p, any, &op_cc->u.charclass_id)) { + goto cleanup; + } + + op_jmp->t = CAPVM_OP_JMP; + op_jmp->u.jmp = l1; + } + + /* Compile the regex AST, assuming match group 0 is + * explicitly represented. 
*/ + if (!capvm_compile_iter(env, p, ast->expr)) { + goto cleanup; + } + + /* Add the unanchored end loop, outside of match group 0 */ + if (ast->has_unanchored_end) { + if (!ensure_program_capacity(env->alloc, p, 4)) { + return RE_CAPVM_COMPILE_AST_ERROR_ALLOC; + } + + /* l1: split l3, l2 + * l2: . + * jmp l1 + * l3: */ + const uint32_t l1 = reserve_program_opcode(p); + const uint32_t l2 = reserve_program_opcode(p); + const uint32_t l_jmp = reserve_program_opcode(p); + const uint32_t l3 = get_program_offset(p); + + struct capvm_opcode *op_split = &p->ops[l1]; + + struct capvm_opcode *op_any = &p->ops[l2]; + struct capvm_opcode *op_jmp = &p->ops[l_jmp]; + + op_split->t = CAPVM_OP_SPLIT; + op_split->u.split.greedy = l3; + op_split->u.split.nongreedy = l2; + + op_any->t = CAPVM_OP_CHARCLASS; + uint64_t any[4]; + for (size_t i = 0; i < 4; i++) { + any[i] = ~(uint64_t)0; + } + if (!intern_char_class(env, p, any, &op_any->u.charclass_id)) { + goto cleanup; + } + + op_jmp->t = CAPVM_OP_JMP; + op_jmp->u.jmp = l1; + } + + /* add MATCH opcode at end */ + if (!ensure_program_capacity(env->alloc, p, 1)) { + return RE_CAPVM_COMPILE_AST_ERROR_ALLOC; + } + const uint32_t pos_m = reserve_program_opcode(p); + struct capvm_opcode *op_m = &p->ops[pos_m]; + op_m->t = CAPVM_OP_MATCH; + + /* TODO: populate info about max threads, etc. in p, + * because it should be possible to calculate runtime + * memory limits at compile time. */ + env->program = p; + p->capture_count = (env->max_capture_seen == NO_CAPTURE_ID + ? 
0 : env->max_capture_seen + 1); + + if (LOG_CAPVM > 2) { + LOG(0, "====\n"); + fsm_capvm_program_dump(stderr, p); + LOG(0, "====\n"); + } + + /* TODO: it may be worth exposing these static checks as + * something the caller can run at load-time */ + check_program_for_invalid_labels(p); + + return RE_CAPVM_COMPILE_AST_OK; + +cleanup: + fsm_capvm_program_free(env->alloc, p); + return RE_CAPVM_COMPILE_AST_ERROR_ALLOC; +} + +#define DUMP_AST 0 +#define DUMP_RESULT 0 /* should be 0 in production */ + +#if DUMP_AST || DUMP_RESULT +#include +#include "print.h" +static struct fsm_options opt = { .group_edges = 1 }; + +static unsigned +get_max_capture_id(const struct capvm_program *program) +{ + assert(program != NULL); + return (program->capture_count == 0 + ? 0 + : program->capture_base + program->capture_count - 1); +} + +#endif + +enum re_capvm_compile_ast_res +re_capvm_compile_ast(const struct fsm_alloc *alloc, + const struct ast *ast, + enum re_flags re_flags, + struct capvm_program **program) +{ +#if DUMP_AST + if (LOG_CAPVM > 2) { + ast_print_dot(stderr, &opt, re_flags, ast); + ast_print_tree(stderr, &opt, re_flags, ast); + } +#endif + + struct capvm_compile_env env = { + .alloc = alloc, + .re_flags = re_flags, + .max_capture_seen = NO_CAPTURE_ID, + }; + + enum re_capvm_compile_ast_res res; + res = capvm_compile(&env, ast); + + + struct repeated_group_info *rgi = env.repeated_groups; + while (rgi != NULL) { + struct repeated_group_info *prev = rgi->prev; + LOG(3 - LOG_REPETITION_CASES, + "%s: rgi cleanup, freeing %p, prev %p\n", + __func__, (void *)rgi, (void *)prev); + + if (rgi->groups != NULL) { + f_free(alloc, rgi->groups); + } + f_free(alloc, rgi); + rgi = prev; + } + + if (res == RE_CAPVM_COMPILE_AST_OK) { +#if DUMP_RESULT > 0 + if (DUMP_RESULT > 1 || getenv("DUMP")) { + ast_print_tree(stderr, &opt, re_flags, ast); + fsm_capvm_program_dump(stderr, env.program); + fprintf(stderr, "%s: max_capture_id %u\n", __func__, + get_max_capture_id(env.program)); + + } 
+#endif + + *program = env.program; + } + + free(env.charclass_htab.buckets); + + return res; +} diff --git a/src/libre/re_capvm_compile.h b/src/libre/re_capvm_compile.h new file mode 100644 index 000000000..b98ac9196 --- /dev/null +++ b/src/libre/re_capvm_compile.h @@ -0,0 +1,32 @@ +/* + * Copyright 2022 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#ifndef RE_CAPVM_COMPILE_H +#define RE_CAPVM_COMPILE_H + +/* The part of the capture VM interface that belongs in + * libre rather than libfsm, mostly related to compiling + * a libre AST into a capvm_program. */ + +#include + +#include "ast.h" +#include + +struct capvm_program; + +enum re_capvm_compile_ast_res { + RE_CAPVM_COMPILE_AST_OK, + RE_CAPVM_COMPILE_AST_ERROR_ALLOC = -1, +}; + +enum re_capvm_compile_ast_res +re_capvm_compile_ast(const struct fsm_alloc *alloc, + const struct ast *ast, + enum re_flags re_flags, + struct capvm_program **program); + +#endif diff --git a/src/libre/strerror.c b/src/libre/strerror.c index 009d61df2..d66e750a4 100644 --- a/src/libre/strerror.c +++ b/src/libre/strerror.c @@ -20,6 +20,8 @@ re_strerror(enum re_errno e) case RE_EERRNO: return strerror(errno); case RE_EBADDIALECT: return "Bad dialect"; case RE_EBADGROUP: return "Bad group"; + case RE_EUNSUPCAPTUR: return "Cannot support captures in this case"; + case RE_EUNSUPPPCRE: return "Unsupported PCRE edge case"; case RE_ENEGRANGE: return "Negative group range"; case RE_ENEGCOUNT: return "Negative count range"; diff --git a/src/lx/parser.act b/src/lx/parser.act index 78254e123..66d0591f0 100644 --- a/src/lx/parser.act +++ b/src/lx/parser.act @@ -388,7 +388,7 @@ assert(@a != NULL); assert(@a->p != NULL); - @r = re_comp(RE_NATIVE, act_agetc, @a, act_state->opt, @f, &err); + @r = re_comp(RE_NATIVE, act_agetc, @a, act_state->opt, @f | RE_NOCAPTURE, &err); if (@r == NULL) { assert(err.e != RE_EBADDIALECT); /* TODO: pass filename for .lx source */ diff --git a/src/re/main.c b/src/re/main.c index 77f147518..6e44d675a 
100644 --- a/src/re/main.c +++ b/src/re/main.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -48,6 +49,10 @@ * TODO: flags; -r for RE_REVERSE, etc */ +static int +exec_with_captures(struct fsm *fsm, + int (*fsm_getc)(void *opaque), void *opaque, fsm_state_t *end); + struct match { fsm_end_id_t i; const char *s; @@ -613,6 +618,10 @@ parse_flags(const char *arg, enum re_flags *flags) *flags = *flags | RE_EXTENDED; break; + case 'C': + *flags = *flags | RE_NOCAPTURE; + break; + /* others? */ default: @@ -642,6 +651,7 @@ main(int argc, char *argv[]) int patterns; int ambig; int makevm; + int resolve_captures; size_t generate_bounds = 0; struct fsm_dfavm *vm; @@ -665,6 +675,7 @@ main(int argc, char *argv[]) patterns = 0; ambig = 0; makevm = 0; + resolve_captures = 0; print_fsm = NULL; print_ast = NULL; query = NULL; @@ -675,7 +686,7 @@ main(int argc, char *argv[]) { int c; - while (c = getopt(argc, argv, "h" "acCwXe:E:G:k:" "bi" "sq:r:l:F:" "upMmnftxyz"), c != -1) { + while (c = getopt(argc, argv, "h" "acCwXe:E:G:k:" "bi" "sq:r:l:F:" "upMmnRftxyz"), c != -1) { switch (c) { case 'a': opt.anonymous_states = 0; break; case 'c': opt.consolidate_edges = 0; break; @@ -714,6 +725,7 @@ main(int argc, char *argv[]) case 't': isliteral = 1; break; case 'z': patterns = 1; break; case 'M': makevm = 1; break; + case 'R': resolve_captures = 1; break; case 'G': generate_bounds = strtoul(optarg, NULL, 10); @@ -1242,8 +1254,10 @@ main(int argc, char *argv[]) if (vm != NULL) { e = fsm_vm_match_file(vm, f); + } else if (resolve_captures) { + assert(!"todo"); } else { - e = fsm_exec(fsm, fsm_fgetc, f, &state, NULL); + e = fsm_exec(fsm, fsm_fgetc, f, &state); } fclose(f); @@ -1254,8 +1268,10 @@ main(int argc, char *argv[]) if (vm != NULL) { e = fsm_vm_match_buffer(vm, s, strlen(s)); + } else if (resolve_captures) { + e = exec_with_captures(fsm, fsm_sgetc, &s, &state); } else { - e = fsm_exec(fsm, fsm_sgetc, &s, &state, NULL); + e = fsm_exec(fsm, fsm_sgetc, &s, 
&state); } } @@ -1288,3 +1304,56 @@ main(int argc, char *argv[]) return r; } } + +static int +exec_with_captures(struct fsm *fsm, + int (*fsm_getc)(void *opaque), void *opaque, fsm_state_t *end) +{ + int c; + size_t ceil = 16; + size_t used = 0; + unsigned char *buf = malloc(ceil); + size_t i; + size_t capture_ceil; + struct fsm_capture *captures; + int res; + + while (c = fsm_getc(opaque), c != EOF) { + if (used == ceil - 1) { + const size_t nceil = 2*ceil; + unsigned char *nbuf = realloc(buf, nceil); + if (nbuf == NULL) { + free(buf); + return -1; + } + ceil = nceil; + buf = nbuf; + } + buf[used] = c; + used++; + } + buf[used] = '\0'; + + capture_ceil = fsm_capture_ceiling(fsm); + + captures = malloc(capture_ceil * sizeof(captures[0])); + if (captures == NULL) { + free(buf); + return -1; + } + + res = fsm_exec_with_captures(fsm, buf, used, + end, captures, capture_ceil); + if (res == 1) { + for (i = 0; i < capture_ceil; i++) { + printf("-- %zu: %zd,%zd\n", + i, captures[i].pos[0], captures[i].pos[1]); + } + } else { + printf("-- no match\n"); + } + + free(buf); + free(captures); + return res; +} diff --git a/src/retest/main.c b/src/retest/main.c index b6b4c52f7..e01c93e7c 100644 --- a/src/retest/main.c +++ b/src/retest/main.c @@ -393,7 +393,7 @@ parse_escapes(char *s, char **errpos, int *lenp) ndig++; } else { - s[j++] = ccode; + s[j++] = (char)ccode; st = ST_BARE; if (!hexcurly) { diff --git a/tests/aho_corasick/Makefile b/tests/aho_corasick/Makefile index 5748ddd5c..6eef421bf 100644 --- a/tests/aho_corasick/Makefile +++ b/tests/aho_corasick/Makefile @@ -33,7 +33,7 @@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}a.re: ${TEST_SRCDIR.tests/aho_corasick} < ${.ALLSRC:M*.txt} > $@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}a.fsm: ${RE} ${TEST_OUTDIR.tests/aho_corasick}/out${n}a.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/aho_corasick}/got${n}a.fsm: ${AC_TEST} ${TEST_SRCDIR.tests/aho_corasick}/in${n}.txt 
@@ -48,7 +48,7 @@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}l.re: ${TEST_SRCDIR.tests/aho_corasick} < ${.ALLSRC:M*.txt} > $@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}l.fsm: ${RE} ${TEST_OUTDIR.tests/aho_corasick}/out${n}l.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/aho_corasick}/got${n}l.fsm: ${AC_TEST} ${TEST_SRCDIR.tests/aho_corasick}/in${n}.txt @@ -63,7 +63,7 @@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}r.re: ${TEST_SRCDIR.tests/aho_corasick} < ${.ALLSRC:M*.txt} > $@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}r.fsm: ${RE} ${TEST_OUTDIR.tests/aho_corasick}/out${n}r.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/aho_corasick}/got${n}r.fsm: ${AC_TEST} ${TEST_SRCDIR.tests/aho_corasick}/in${n}.txt @@ -79,7 +79,7 @@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}u.re: ${TEST_SRCDIR.tests/aho_corasick} < ${.ALLSRC:M*.txt} > $@ ${TEST_OUTDIR.tests/aho_corasick}/out${n}u.fsm: ${RE} ${TEST_OUTDIR.tests/aho_corasick}/out${n}u.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/aho_corasick}/got${n}u.fsm: ${AC_TEST} ${TEST_SRCDIR.tests/aho_corasick}/in${n}.txt diff --git a/tests/capture/Makefile b/tests/capture/Makefile index 53d63ff2b..bdee42df4 100644 --- a/tests/capture/Makefile +++ b/tests/capture/Makefile @@ -7,18 +7,20 @@ TEST_OUTDIR.tests/capture = ${BUILD}/tests/capture .for n in ${TEST.tests/capture:T:R:C/^capture//} test:: ${TEST_OUTDIR.tests/capture}/res${n} SRC += ${TEST_SRCDIR.tests/capture}/capture${n}.c -CFLAGS.${TEST_SRCDIR.tests/capture}/capture${n}.c = -UNDEBUG +CFLAGS.${TEST_SRCDIR.tests/capture}/capture${n}.c = -UNDEBUG -std=c99 ${TEST_OUTDIR.tests/capture}/run${n}: ${TEST_OUTDIR.tests/capture}/capture${n}.o ${TEST_OUTDIR.tests/capture}/captest.o - ${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/capture}/run${n} ${TEST_OUTDIR.tests/capture}/capture${n}.o 
${TEST_OUTDIR.tests/capture}/captest.o ${BUILD}/lib/libfsm.a + ${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/capture}/run${n} ${TEST_OUTDIR.tests/capture}/capture${n}.o ${TEST_OUTDIR.tests/capture}/captest.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a ${TEST_OUTDIR.tests/capture}/res${n}: ${TEST_OUTDIR.tests/capture}/run${n} ( ${TEST_OUTDIR.tests/capture}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/capture}/res${n} -.for lib in ${LIB:Mlibfsm} +.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre} ${TEST_OUTDIR.tests/capture}/run${n}: ${BUILD}/lib/${lib:R}.a .endfor .endfor +CFLAGS.${TEST_SRCDIR.tests/capture}/captest.c += -std=c99 + ${TEST_OUTDIR.tests/capture}/captest.o: tests/capture/captest.c ${CC} ${CFLAGS} -c -o ${TEST_OUTDIR.tests/capture}/captest.o tests/capture/captest.c diff --git a/tests/capture/captest.c b/tests/capture/captest.c index dc66f81d4..bf453e3ad 100644 --- a/tests/capture/captest.c +++ b/tests/capture/captest.c @@ -1,19 +1,19 @@ #include "captest.h" -#include -#include -#include - -#if CAPTEST_RUN_SINGLE_LOG #include -#endif +#include + +#include + +/* for fsm_capvm_program_exec */ +#include "../../src/libfsm/capture_vm.h" -#define FAIL(MSG) \ - fprintf(stderr, "FAIL: %s:%d -- %s\n", \ - __FILE__, __LINE__, MSG); \ - exit(EXIT_FAILURE) +struct captest_input { + const char *string; + size_t pos; +}; -int +static int captest_getc(void *opaque) { struct captest_input *input = opaque; @@ -22,189 +22,483 @@ captest_getc(void *opaque) return res == 0 ? 
EOF : res; } -int -captest_run_single(const struct captest_single_fsm_test_info *info) -{ - size_t i; - struct captest_input input; - fsm_state_t end; - int exec_res; - struct fsm_capture got_captures[MAX_TEST_CAPTURES]; - struct fsm_capture exp_captures[MAX_TEST_CAPTURES]; - size_t capture_count = 0; - struct fsm *fsm = captest_fsm_of_string(info->string, 0); +static struct fsm_options options = { + .group_edges = 1, +}; - input.string = info->string; - input.pos = 0; +#define MAX_INPUT_WITH_NL_LENGTH 1000 +static char +input_with_nl[MAX_INPUT_WITH_NL_LENGTH]; - if (fsm == NULL) { - FAIL("fsm_of_string"); +enum captest_run_case_res +captest_run_case(const struct captest_case_single *testcase, + int verbosity, bool trailing_newline, FILE *prog_output) +{ + bool dump_captures = false; + enum captest_run_case_res res = CAPTEST_RUN_CASE_ERROR; + struct re_err err; + + if (verbosity == DUMP_PROGRAMS_VERBOSITY) { + assert(prog_output != NULL); + } else { + assert(prog_output == NULL); } - for (i = 0; i < MAX_TEST_CAPTURES; i++) { - exp_captures[i].pos[0] = FSM_CAPTURE_NO_POS; - exp_captures[i].pos[1] = FSM_CAPTURE_NO_POS; + if (verbosity > 0) { + printf("/%s/ <- \"%s%s\": ", + testcase->regex, testcase->input, + trailing_newline ? 
"\\n" : ""); } - for (i = 0; i < MAX_SINGLE_FSM_TEST_PATHS; i++) { - const struct captest_single_fsm_test_path *path = - &info->paths[i]; - if (path->start == 0 && path->end == 0 && i > 0) { - break; /* end of list */ - } + /* build regex */ + const enum re_flags flags = 0; + struct captest_input comp_input = { + .string = testcase->regex, + }; - /* no zero-width captures */ - assert(path->end > path->start); + struct fsm *fsm = re_comp(RE_PCRE, + captest_getc, &comp_input, + &options, flags, &err); - if (!fsm_capture_set_path(fsm, i, - path->start, path->end)) { - fprintf(stderr, - "failed to set capture path %lu\n", i); - FAIL("fsm_capture_set_path"); + if (testcase->match == SHOULD_REJECT_AS_UNSUPPORTED) { + if (fsm != NULL) { + fsm_free(fsm); + return CAPTEST_RUN_CASE_FAIL; } + return CAPTEST_RUN_CASE_PASS; + } - exp_captures[i].pos[0] = path->start; - exp_captures[i].pos[1] = path->end; + assert(fsm != NULL); - capture_count = i + 1; + if (!fsm_determinise(fsm)) { + return CAPTEST_RUN_CASE_ERROR; } - { - const unsigned count = fsm_countcaptures(fsm); - const unsigned expected = capture_count; - if (count != expected) { - fprintf(stderr, "expected %u, got %u\n", - expected, count); - FAIL("countcaptures"); - } + if (!fsm_minimise(fsm)) { + return CAPTEST_RUN_CASE_ERROR; } -#if CAPTEST_RUN_SINGLE_LOG - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, "fsm", fsm); -#endif + if (verbosity > 3) { + fsm_print_fsm(stdout, fsm); + } - exec_res = fsm_exec(fsm, captest_getc, &input, &end, got_captures); - if (exec_res != 1) { FAIL("exec_res"); } - if (end != strlen(info->string)) { FAIL("exec end pos"); } + if (trailing_newline) { + const size_t length = strlen(testcase->input); + assert(length + 1 < MAX_INPUT_WITH_NL_LENGTH); + memcpy(input_with_nl, testcase->input, + length); + input_with_nl[length] = '\n'; + input_with_nl[length + 1] = '\0'; + } - { - fsm_end_id_t id_buf[1] = { ~0 }; - enum fsm_getendids_res gres; - size_t written; - if (1 != 
fsm_getendidcount(fsm, end)) { - FAIL("did not have exactly one end ID"); + const char *input = trailing_newline + ? input_with_nl + : testcase->input; + assert(input != NULL); + const size_t length = strlen(input); + + fsm_state_t end; /* unused but required by API */ + struct fsm_capture capture_buf[MAX_CAPTEST_SINGLE_CAPTURE_PAIRS]; + const size_t capture_buf_length = MAX_CAPTEST_SINGLE_CAPTURE_PAIRS; + + /* Initialize with values that are distinct from FSM_CAPTURE_NO_POS + * and will stand out visually. Should never see these. */ + for (size_t i = 0; i < MAX_CAPTEST_SINGLE_CAPTURE_PAIRS; i++) { + capture_buf[i].pos[0] = 88888888; + capture_buf[i].pos[1] = 99999999; + } + + /* If verbosity is exactly DUMP_PROGRAMS_VERBOSITY, then print out capture info and pass. */ + if (verbosity == DUMP_PROGRAMS_VERBOSITY) { + assert(prog_output != NULL); + if (!trailing_newline) { + const char *match_str = testcase->match == SHOULD_MATCH ? "SHOULD_MATCH" + : testcase->match == SHOULD_NOT_MATCH ? "SHOULD_NOT_MATCH" + : testcase->match == SHOULD_REJECT_AS_UNSUPPORTED ? 
"SHOULD_REJECT_AS_UNSUPPORTED" + : "ERROR"; + fprintf(prog_output, "regex \"%s\", input \"%s\", match %s, no_nl %d, count %zu:", + testcase->regex, testcase->input, match_str, testcase->no_nl, + testcase->count); + for (size_t i = 0; i < testcase->count; i++) { + fprintf(prog_output, " %zu:[%zd, %zd]", + i, testcase->expected[i].pos[0], testcase->expected[i].pos[1]); + } + fprintf(prog_output, "\n"); + fsm_capture_dump(prog_output, "capture_info", fsm); } + fsm_free(fsm); + return CAPTEST_RUN_CASE_PASS; + } + + /* first, execute with a capture buffer that is one cell too small and check for an error */ + const size_t capture_ceil = fsm_capture_ceiling(fsm); + assert(capture_ceil > 0); + const size_t insufficient_capture_buf_length = capture_ceil - 1; + errno = 0; + int exec_res = fsm_exec_with_captures(fsm, + (const unsigned char *)input, length, &end, capture_buf, insufficient_capture_buf_length); + assert(exec_res == -1); + assert(errno == EINVAL); + errno = 0; + + /* then, execute and check result & captures */ + exec_res = fsm_exec_with_captures(fsm, + (const unsigned char *)input, length, &end, capture_buf, capture_buf_length); + if (exec_res == -1) { + perror("fsm_exec_with_captures"); + return CAPTEST_RUN_CASE_ERROR; + } - gres = fsm_getendids(fsm, end, 1, id_buf, &written); - if (gres != FSM_GETENDIDS_FOUND) { - FAIL("failed to get end IDs"); + if (testcase->match == SHOULD_NOT_MATCH) { /* expect match failure */ + res = (exec_res == 0 + ? 
CAPTEST_RUN_CASE_PASS + : CAPTEST_RUN_CASE_FAIL); + } else if (exec_res == 0) { + res = CAPTEST_RUN_CASE_FAIL; /* didn't match, should have */ + } else { + res = CAPTEST_RUN_CASE_PASS; + if (verbosity > 1) { + dump_captures = true; } - if (0 != id_buf[0]) { - FAIL("failed to get end ID of 0"); + /* check captures against expected */ + for (size_t i = 0; i < testcase->count; i++) { + if (testcase->expected[i].pos[0] != capture_buf[i].pos[0] || + testcase->expected[i].pos[1] != capture_buf[i].pos[1]) { + res = CAPTEST_RUN_CASE_FAIL; + dump_captures = true; + } } } - for (i = 0; i < capture_count; i++) { -#if CAPTEST_RUN_SINGLE_LOG - fprintf(stderr, "captest: capture %lu: exp (%ld, %ld), got (%ld, %ld)\n", - i, exp_captures[i].pos[0], exp_captures[i].pos[1], - got_captures[i].pos[0], got_captures[i].pos[1]); -#endif - if (got_captures[i].pos[0] != exp_captures[i].pos[0]) { - fprintf(stderr, "capture[%lu].pos[0]: exp %lu, got %lu\n", - i, exp_captures[i].pos[0], - got_captures[i].pos[0]); - FAIL("capture mismatch"); + switch (res) { + case CAPTEST_RUN_CASE_PASS: + if (verbosity > 0) { + printf("pass\n"); } - if (got_captures[i].pos[1] != exp_captures[i].pos[1]) { - fprintf(stderr, "capture[%lu].pos[1]: exp %lu, got %lu\n", - i, exp_captures[i].pos[1], - got_captures[i].pos[1]); - FAIL("capture mismatch"); + break; + case CAPTEST_RUN_CASE_FAIL: + if (verbosity == 0) { + printf("/%s/ <- \"%s%s\": FAIL\n", + testcase->regex, testcase->input, + trailing_newline ? "\\n" : ""); + } + if (verbosity > 0) { + printf("FAIL\n"); + } + break; + case CAPTEST_RUN_CASE_ERROR: + printf("ERROR\n"); + break; + } + + if (dump_captures) { + for (size_t i = 0; i < testcase->count; i++) { + printf("exp %zd, %zd, got %zd, %zd%s\n", + testcase->expected[i].pos[0], testcase->expected[i].pos[1], + capture_buf[i].pos[0], capture_buf[i].pos[1], + (testcase->expected[i].pos[0] != capture_buf[i].pos[0] || + testcase->expected[i].pos[1] != capture_buf[i].pos[1]) + ? 
" *" : ""); } } fsm_free(fsm); - return 0; + return res; } -struct fsm * -captest_fsm_of_string(const char *string, unsigned end_id) +enum captest_run_case_res +captest_run_case_multi(const struct captest_case_multi *testcase, + int verbosity, bool trailing_newline, FILE *prog_output, + struct captest_case_multi_result *result) { - struct fsm *fsm = captest_fsm_with_options(); - const size_t length = strlen(string); - size_t i; + struct re_err err; + const enum re_flags flags = 0; - if (fsm == NULL) { - return NULL; + struct captest_case_multi_result ignored_result; + if (result == NULL) { + result = &ignored_result; } + memset(result, 0x00, sizeof(*result)); - if (!fsm_addstate_bulk(fsm, length + 1)) { - goto cleanup; + if (verbosity == DUMP_PROGRAMS_VERBOSITY) { + assert(prog_output != NULL); + } else { + assert(prog_output == NULL); } - fsm_setstart(fsm, 0); - for (i = 0; i < length; i++) { - if (!fsm_addedge_literal(fsm, i, i + 1, string[i])) { + /* build each regex, combining them and keeping track of capture offsets */ + struct fsm *fsms[testcase->regex_count]; + struct fsm_combined_base_pair bases[testcase->regex_count]; + struct fsm *combined_fsm = NULL; + + for (size_t i = 0; i < testcase->regex_count; i++) { + fsms[i] = NULL; + } + + /* compile each individually */ + for (size_t i = 0; i < testcase->regex_count; i++) { + struct captest_input comp_input = { + .string = testcase->regexes[i], + }; + + if (verbosity > 1) { + fprintf(stderr, "%s: compiling \"%s\"\n", + __func__, comp_input.string); + } + + struct fsm *fsm = re_comp(RE_PCRE, + captest_getc, &comp_input, + &options, flags, &err); + assert(fsm != NULL); + + if (!fsm_determinise(fsm)) { goto cleanup; } + + if (!fsm_minimise(fsm)) { + goto cleanup; + } + + if (verbosity > 3) { + char tag_buf[16] = { 0 }; + snprintf(tag_buf, sizeof(tag_buf), "fsm[%zu]", i); + + fprintf(stderr, "==== fsm[%zu]\n", i); + fsm_print_fsm(stderr, fsm); + fsm_capture_dump(stderr, tag_buf, fsm); + } + + fsms[i] = fsm; + } + 
+ combined_fsm = fsm_union_array(testcase->regex_count, fsms, bases); + assert(combined_fsm != NULL); + if (verbosity > 0) { + fprintf(stderr, "%s: combined_fsm: %d states after fsm_union_array\n", + __func__, fsm_countstates(combined_fsm)); + } + if (verbosity > 1) { + for (size_t i = 0; i < testcase->regex_count; i++) { + fprintf(stderr, "%s: base[%zu]: state %d, capture %u\n", + __func__, i, bases[i].state, bases[i].capture); + } } - fsm_setend(fsm, length, 1); - if (!fsm_setendid(fsm, end_id)) { + if (!fsm_determinise(combined_fsm)) { goto cleanup; } + if (verbosity > 0) { + fprintf(stderr, "%s: combined_fsm: %d states after determinise\n", + __func__, fsm_countstates(combined_fsm)); + } + + if (!fsm_minimise(combined_fsm)) { + goto cleanup; + } + if (verbosity > 0) { + fprintf(stderr, "%s: combined_fsm: %d states after minimise\n", + __func__, fsm_countstates(combined_fsm)); + } + + /* If verbosity is exactly 9, then print out capture info and pass. */ + if (verbosity == DUMP_PROGRAMS_VERBOSITY) { + fsm_capture_dump(prog_output, "capture_info", combined_fsm); + fsm_free(combined_fsm); + return CAPTEST_RUN_CASE_PASS; + } + + if (verbosity > 3) { + fprintf(stderr, "==== combined\n"); + fsm_print_fsm(stderr, combined_fsm); + fsm_capture_dump(stderr, "combined", combined_fsm); + } + + /* for each input, execute and check result */ + const struct multi_case_input_info *info; + for (info = &testcase->inputs[0]; info->input != NULL; info++) { + if (trailing_newline) { + const size_t length = strlen(info->input); + assert(length + 1 < MAX_INPUT_WITH_NL_LENGTH); + memcpy(input_with_nl, info->input, + length); + input_with_nl[length] = '\n'; + input_with_nl[length + 1] = '\0'; + } + + const char *input = trailing_newline + ? 
input_with_nl + : info->input; + assert(input != NULL); + const size_t length = strlen(input); + + if (verbosity > 1) { + fprintf(stderr, "%s: input: %s\n", __func__, input); + } + + fsm_state_t end; /* unused but required by API */ + struct fsm_capture capture_buf[MAX_CAPTEST_MULTI_CAPTURE_PAIRS]; + const size_t capture_buf_length = MAX_CAPTEST_MULTI_CAPTURE_PAIRS; + for (size_t i = 0; i < capture_buf_length; i++) { + capture_buf[i].pos[0] = (size_t)-2; + capture_buf[i].pos[1] = (size_t)-3; + } + + /* execute and check result & captures */ + int exec_res = fsm_exec_with_captures(combined_fsm, + (const unsigned char *)input, length, &end, capture_buf, capture_buf_length); + if (exec_res == -1) { + perror("fsm_exec_with_captures"); + return CAPTEST_RUN_CASE_ERROR; + } + + /* The .regex field should be in ascending order so we know + * when we've reached the all-0 suffix of expected[]. */ + uint8_t prev_regex = 0; + for (const struct case_multi_expected *exp = &info->expected[0]; + exp->regex >= prev_regex; exp++) { + prev_regex = exp->regex; + bool match = true; + const unsigned capture_base = bases[exp->regex].capture; + const unsigned capture_id = capture_base + exp->capture; + assert(capture_id < MAX_CAPTEST_MULTI_CAPTURE_PAIRS); + const size_t exp_s = exp->pos[0]; + const size_t exp_e = exp->pos[1]; + const size_t got_s = capture_buf[capture_id].pos[0]; + const size_t got_e = capture_buf[capture_id].pos[1]; + if (exp_s == got_s && exp_e == got_e) { + result->pass++; + } else { + match = false; + result->fail++; + } + + if (!match || verbosity > 2) { + fprintf(stderr, "%s: regex %u, capture %u (%u + base %u), exp (%zd, %zd), got (%zd, %zd)%s\n", + __func__, exp->regex, + capture_id, exp->capture, capture_base, + exp_s, exp_e, got_s, got_e, + match ? "" : " *** mismatch ***"); + } + } + } + + fsm_free(combined_fsm); - return fsm; + /* this could populate a result struct so it can pass/fail multiple inputs */ + + return result->fail == 0 + ? 
CAPTEST_RUN_CASE_PASS + : CAPTEST_RUN_CASE_FAIL; cleanup: - fsm_free(fsm); - return NULL; -} + if (combined_fsm != NULL) { + fsm_free(combined_fsm); + } else { + for (size_t i = 0; i < testcase->regex_count; i++) { + if (fsms[i] != NULL) { + fsm_free(fsms[i]); + } + } + } -static struct fsm_options options; + return CAPTEST_RUN_CASE_ERROR; +} -struct fsm * -captest_fsm_with_options(void) +static struct capvm_program * +get_program_copy(const struct captest_case_program *testcase) { - struct fsm *fsm = NULL; + static struct capvm_program prog; + static struct capvm_opcode ops[MAX_PROGRAM_OPS + 1] = { 0 }; + static struct capvm_char_class cc_sets[MAX_PROGRAM_CHAR_CLASSES] = { 0 }; + + memset(&prog, 0x00, sizeof(prog)); + + memcpy(ops, testcase->ops, + MAX_PROGRAM_OPS * sizeof(testcase->ops[0])); + memcpy(cc_sets, testcase->char_class, + MAX_PROGRAM_CHAR_CLASSES * sizeof(testcase->char_class[0])); + + assert(testcase->expected.count < MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS); + prog.capture_count = testcase->expected.count; + prog.capture_base = testcase->expected.base; + + uint32_t max_cc_used = (uint32_t)-1; + + prog.used = MAX_PROGRAM_OPS; + for (size_t i = 0; i < MAX_PROGRAM_OPS; i++) { + const struct capvm_opcode *op = &testcase->ops[i]; + if (op->t == CAPVM_OP_CHAR && op->u.chr == 0x00) { + prog.used = i; + break; + } else if (op->t == CAPVM_OP_CHARCLASS) { + if (max_cc_used == (uint32_t)-1 || op->u.charclass_id > max_cc_used) { + assert(op->u.charclass_id < MAX_PROGRAM_CHAR_CLASSES); + max_cc_used = op->u.charclass_id; + } + } + } + + prog.ceil = MAX_PROGRAM_OPS; + prog.ops = ops; + + prog.char_classes.sets = cc_sets; + prog.char_classes.count = max_cc_used == (uint32_t)-1 ? 0 : max_cc_used + 1; + prog.char_classes.ceil = MAX_PROGRAM_CHAR_CLASSES; - /* We currently don't need to set anything custom on this. 
*/ - fsm = fsm_new(&options); - return fsm; + return &prog; } -int -captest_check_single_end_id(const struct fsm *fsm, fsm_state_t end_state, - unsigned expected_end_id, const char **msg) +enum captest_run_case_res +captest_run_case_program(const struct captest_case_program *testcase, + int verbosity) { - fsm_end_id_t id_buf[1] = { ~0 }; - enum fsm_getendids_res gres; - size_t written; - const char *unused; + (void)verbosity; - if (msg == NULL) { - msg = &unused; + /* copy program */ + const size_t input_length = strlen(testcase->input); + struct fsm_capture capture_buf[MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS]; + const size_t capture_buf_length = MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS; + + /* Initialize with FSM_CAPTURE_NO_POS, as the caller would */ + for (size_t i = 0; i < capture_buf_length; i++) { + capture_buf[i].pos[0] = FSM_CAPTURE_NO_POS; + capture_buf[i].pos[1] = FSM_CAPTURE_NO_POS; } - if (1 != fsm_getendidcount(fsm, end_state)) { - *msg = "did not have exactly one end ID"; - return 0; + struct capvm_program *program = get_program_copy(testcase); + + if (verbosity > 2) { + fsm_capvm_program_dump(stderr, program); } - gres = fsm_getendids(fsm, end_state, 1, id_buf, &written); - if (gres != FSM_GETENDIDS_FOUND) { - *msg = "failed to get end IDs"; - return 0; + fsm_capvm_program_exec(program, (const uint8_t *)testcase->input, input_length, + capture_buf, capture_buf_length); + + bool dump_captures = false; + enum captest_run_case_res res = CAPTEST_RUN_CASE_PASS; + + /* check captures against expected */ + for (size_t i = 0; i < testcase->expected.count; i++) { + if (testcase->expected.captures[i].pos[0] != capture_buf[i].pos[0] || + testcase->expected.captures[i].pos[1] != capture_buf[i].pos[1]) { + res = CAPTEST_RUN_CASE_FAIL; + dump_captures = true; + } } - if (expected_end_id != id_buf[0]) { - *msg = "failed to get expected end ID"; - return 0; + if (dump_captures) { + for (size_t i = 0; i < testcase->expected.count; i++) { + printf("exp %zd, %zd, got %zd, %zd%s\n", 
+ testcase->expected.captures[i].pos[0], + testcase->expected.captures[i].pos[1], + capture_buf[i].pos[0], capture_buf[i].pos[1], + (testcase->expected.captures[i].pos[0] != capture_buf[i].pos[0] || + testcase->expected.captures[i].pos[1] != capture_buf[i].pos[1]) + ? " *" : ""); + } } - return 1; + return res; } diff --git a/tests/capture/captest.h b/tests/capture/captest.h index a9debff6c..53b30cec0 100644 --- a/tests/capture/captest.h +++ b/tests/capture/captest.h @@ -1,56 +1,123 @@ /* - * Copyright 2020 Scott Vokes + * Copyright 2022 Scott Vokes * * See LICENCE for the full copyright terms. */ + #ifndef CAPTEST_H #define CAPTEST_H #include #include #include +#include +#include #include #include #include #include +#include -#define MAX_SINGLE_FSM_TEST_PATHS 8 -#define MAX_TEST_CAPTURES 8 +#include -#define CAPTEST_RUN_SINGLE_LOG 0 +/* for captest_run_case_program, to evaluate handwritten programs */ +#include "../../src/libfsm/capture_vm_program.h" +#include "../../src/libfsm/capture_vm.h" -#ifndef LOG_INTERMEDIATE_FSMS -#define LOG_INTERMEDIATE_FSMS 0 -#endif +#define MAX_CAPTEST_SINGLE_CAPTURE_PAIRS 16 +#define MAX_CAPTEST_MULTI_CAPTURE_PAIRS 16 +#define MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS 16 + +/* position representing no match */ +#define POS_NONE { (size_t)-1, (size_t)-1 } + +/* If verbosity is set to this (with -vvvvvvvvv) then dump all the + * compiled programs to 'prog_output'. 
*/ +#define DUMP_PROGRAMS_VERBOSITY 9 -struct captest_single_fsm_test_info { - const char *string; - struct captest_single_fsm_test_path { - fsm_state_t start; - fsm_state_t end; - } paths[MAX_SINGLE_FSM_TEST_PATHS]; +enum captest_match { + SHOULD_MATCH = 0, /* implied, set by designated initializer */ + SHOULD_NOT_MATCH = 1, + SHOULD_REJECT_AS_UNSUPPORTED = 2, + SHOULD_SKIP = 3, }; -struct captest_input { - const char *string; - size_t pos; +struct captest_case_single { + const char *regex; + const char *input; + enum captest_match match; + bool no_nl; /* do not retry with trailing newline */ + + size_t count; + struct fsm_capture expected[MAX_CAPTEST_SINGLE_CAPTURE_PAIRS]; }; -int -captest_run_single(const struct captest_single_fsm_test_info *info); +/* Same as captest_case_single, but these expect multiple (possibly overlapping) + * regexes to be combined before checking the match/capture behavior. */ +#define MAX_REGEXES 4 +#define MAX_INPUTS 8 +#define MAX_CAPTEST_MULTI_EXPECTED 8 +struct captest_case_multi { + uint8_t regex_count; + const char *regexes[MAX_REGEXES]; + enum captest_match match; + bool no_nl; + + struct multi_case_input_info { + const char *input; /* first NULL input = end of list */ + struct case_multi_expected { + uint8_t regex; /* expected: ascending order */ + uint8_t capture; /* 0 is default */ + size_t pos[2]; + } expected[MAX_CAPTEST_MULTI_EXPECTED]; + } inputs[MAX_INPUTS]; +}; -int -captest_getc(void *opaque); +struct captest_case_multi_result { + size_t pass; + size_t fail; +}; + +#define MAX_PROGRAM_CHAR_CLASSES 4 +#define MAX_PROGRAM_OPS 32 +struct captest_case_program { + const char *input; + + struct capvm_char_class char_class[MAX_PROGRAM_CHAR_CLASSES]; + + struct { + uint32_t count; + uint32_t base; + struct fsm_capture captures[MAX_CAPTEST_PROGRAM_CAPTURE_PAIRS]; + } expected; + + /* termined by 0'd record, { .t == CAPVM_OP_CHAR, .u.chr = 0x00 } */ + struct capvm_opcode ops[MAX_PROGRAM_OPS]; +}; + +enum captest_run_case_res { + 
CAPTEST_RUN_CASE_PASS, + CAPTEST_RUN_CASE_FAIL, + CAPTEST_RUN_CASE_ERROR, +}; +enum captest_run_case_res +captest_run_case(const struct captest_case_single *testcase, + int verbosity, bool trailing_newline, FILE *prog_output); -struct fsm * -captest_fsm_with_options(void); +enum captest_run_case_res +captest_run_case_multi(const struct captest_case_multi *testcase, + int verbosity, bool trailing_newline, FILE *prog_output, + struct captest_case_multi_result *result); -struct fsm * -captest_fsm_of_string(const char *string, unsigned end_id); +/* This should probably only be used for evaluating specific + * hand-written programs for development, because we only care + * about supporting the kinds of programs that could be produced + * by compiling from valid regexes. In other words, this is not + * a stable public interface. */ +enum captest_run_case_res +captest_run_case_program(const struct captest_case_program *testcase, + int verbosity); -int -captest_check_single_end_id(const struct fsm *fsm, fsm_state_t end_state, - unsigned expected_end_id, const char **msg); #endif diff --git a/tests/capture/capture0.c b/tests/capture/capture0.c deleted file mode 100644 index 4e7d0e3fa..000000000 --- a/tests/capture/capture0.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include - -#include "captest.h" - -/* /a(bcd)e/ */ - -int main(void) { - struct captest_single_fsm_test_info test_info = { - "abcde", - { - { 1, 4 }, - } - }; - return captest_run_single(&test_info); -} diff --git a/tests/capture/capture1.c b/tests/capture/capture1.c deleted file mode 100644 index e9fe53ab9..000000000 --- a/tests/capture/capture1.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include -#include -#include -#include - -#include -#include - -#include "captest.h" -/* (a(b(c))) */ - -int main(void) { - struct captest_single_fsm_test_info test_info = { - "abc", - { - { 0, 3 }, - { 1, 3 }, - { 2, 3 }, - } - }; - return captest_run_single(&test_info); -} diff --git a/tests/capture/capture2.c b/tests/capture/capture2.c deleted file mode 100644 index 20a1c1bac..000000000 --- a/tests/capture/capture2.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include - -#include "captest.h" - -/* (a(b((c))(d))) */ - -int main(void) { - struct captest_single_fsm_test_info test_info = { - "abcd", - { - { 0, 4 }, - { 1, 4 }, - { 2, 3 }, - { 2, 3 }, - { 3, 4 }, - } - }; - return captest_run_single(&test_info); -} diff --git a/tests/capture/capture3.c b/tests/capture/capture3.c deleted file mode 100644 index 9d4d284ab..000000000 --- a/tests/capture/capture3.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "captest.h" - -/* Combine 3 fully disjoint FSMs: - * - * - 0: "(a(b))" - * - 1: "(cd(e))" - * - 2: "(fgh(i))" - * - * Shift the captures for 1 and 2 forward and use/combine - * opaques on them to track which one(s) matched. - * - * This tracking of which DFA matched should be more directly - * supported by the API later. 
*/ - -static void -check(const struct fsm *fsm, const char *string, - unsigned end_id, unsigned capture_base); - -static void -det_and_min(const char *tag, struct fsm *fsm); - -int main(void) { - struct fsm *f_ab = captest_fsm_of_string("ab", 0); - struct fsm *f_cde = captest_fsm_of_string("cde", 1); - struct fsm *f_fghi = captest_fsm_of_string("fghi", 2); - struct fsm *f_all = NULL; - unsigned captures; - - struct fsm_combined_base_pair bases[3]; - struct fsm *fsms[3]; - - assert(f_ab); - assert(f_cde); - assert(f_fghi); - - /* set captures */ -#define SET_CAPTURE(FSM, STATE, CAPTURE, TYPE) \ - if (!fsm_set_capture_action(FSM, STATE, CAPTURE, TYPE)) { \ - fprintf(stderr, "failed to set capture on line %d\n", \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } - - /* (a(b)) */ - if (!fsm_capture_set_path(f_ab, 0, 0, 2)) { - exit(EXIT_FAILURE); - } - if (!fsm_capture_set_path(f_ab, 1, 1, 2)) { - exit(EXIT_FAILURE); - } - - /* (cd(e)) */ - if (!fsm_capture_set_path(f_cde, 0, 0, 3)) { - exit(EXIT_FAILURE); - } - if (!fsm_capture_set_path(f_cde, 1, 2, 3)) { - exit(EXIT_FAILURE); - } - - /* (fgh(i)) */ - if (!fsm_capture_set_path(f_fghi, 0, 0, 4)) { - exit(EXIT_FAILURE); - } - if (!fsm_capture_set_path(f_fghi, 1, 3, 4)) { - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "\n=== f_ab...\n"); - fsm_print_fsm(stderr, f_ab); - fsm_capture_dump(stderr, "#### f_ab", f_ab); - - fprintf(stderr, "\n=== f_cde...\n"); - fsm_print_fsm(stderr, f_cde); - fsm_capture_dump(stderr, "#### f_cde", f_cde); - - fprintf(stderr, "\n=== f_fghi...\n"); - fsm_print_fsm(stderr, f_fghi); - fsm_capture_dump(stderr, "#### f_fghi", f_fghi); -#endif - - /* determinise and minimise each before unioning */ - det_and_min("ab", f_ab); - det_and_min("cde", f_cde); - det_and_min("fghi", f_fghi); - - /* union them */ - fsms[0] = f_ab; - fsms[1] = f_cde; - fsms[2] = f_fghi; - - f_all = fsm_union_array(3, fsms, bases); - assert(f_all != NULL); - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "=== 
unioned f_ab with f_cde... (CB ab: %u, cde: %u)\n", - bases[0].capture, bases[1].capture); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "#### f_all", f_all); -#endif - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "=== unioned f_all with f_fghi... (CB fghi: %u), %u captures\n", - bases[2].capture, fsm_countcaptures(f_all)); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "#### f_all #2", f_all); -#endif - - if (!fsm_determinise(f_all)) { - fprintf(stderr, "NOPE %d\n", __LINE__); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after determinise\n"); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "#### f_all", f_all); -#endif - - captures = fsm_countcaptures(f_all); - if (captures != 6) { - fprintf(stderr, "expected 6 captures, got %u\n", captures); - exit(EXIT_FAILURE); - } - - check(f_all, "ab", 0, bases[0].capture); - check(f_all, "cde", 1, bases[1].capture); - check(f_all, "fghi", 2, bases[2].capture); - - - fsm_free(f_all); - - return 0; -} - -static void -det_and_min(const char *tag, struct fsm *fsm) -{ - if (!fsm_determinise(fsm)) { - fprintf(stderr, "Failed to determise '%s'\n", tag); - exit(EXIT_FAILURE); - } - - if (!fsm_minimise(fsm)) { - fprintf(stderr, "Failed to minimise '%s'\n", tag); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after det_and_min: '%s'\n", tag); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, tag, fsm); -#endif - -} - -static void -check(const struct fsm *fsm, const char *string, - unsigned end_id, unsigned capture_base) -{ - int exec_res; - size_t i; - struct captest_input input; - fsm_state_t end; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - const size_t length = strlen(string); - const unsigned cb = capture_base; /* alias */ - - input.string = string; - input.pos = 0; - - for (i = 0; i < MAX_TEST_CAPTURES; i++) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; - } - - exec_res = 
fsm_exec(fsm, captest_getc, &input, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "fsm_exec: %d for '%s', expected 1\n", - exec_res, string); - exit(EXIT_FAILURE); - } - - /* check end ID */ - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - /* check captures */ - if (0) { - fprintf(stderr, "captures for '%s' (cb %u): [%ld, %ld], [%ld, %ld]\n", - string, capture_base, - captures[0 + cb].pos[0], captures[0 + cb].pos[1], - captures[1 + cb].pos[0], captures[1 + cb].pos[1]); - } - - assert(captures[0 + cb].pos[0] == 0); - assert(captures[0 + cb].pos[1] == length); - assert(captures[1 + cb].pos[0] == length - 1); - assert(captures[1 + cb].pos[1] == length); -} diff --git a/tests/capture/capture4.c b/tests/capture/capture4.c deleted file mode 100644 index 170cbe8b0..000000000 --- a/tests/capture/capture4.c +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "captest.h" - -/* Combine 2 mostly overlapping FSMs: - * - 0: "(abc)" - * - 1: "(ab*c)" - * and check for false positives in the match. 
- */ - -static struct fsm * -build_and_combine(unsigned *cb_a, unsigned *cb_b); - -static void -det_and_min(const char *tag, struct fsm *fsm); - -static struct fsm * -build_ab_c(void); - -static void -check(const struct fsm *fsm, const char *string, - unsigned expected_ends, - unsigned cb_a, size_t pa_0, size_t pa_1, - unsigned cb_b, size_t pb_0, size_t pb_1); - -int main(void) { - unsigned cb_abc, cb_ab_c; - struct fsm *f_all = build_and_combine(&cb_abc, &cb_ab_c); - unsigned captures; - const unsigned exp_0 = 1U << 0; - const unsigned exp_1 = 1U << 1; - - captures = fsm_countcaptures(f_all); - if (captures != 2) { - fprintf(stderr, "expected 2 captures, got %u\n", captures); - exit(EXIT_FAILURE); - } - - #define NO_POS FSM_CAPTURE_NO_POS - check(f_all, "abc", /* captures 0 and 1 */ - exp_0 | exp_1, - cb_abc, 0, 3, - cb_ab_c, 0, 3); - check(f_all, "ac", /* only capture 1 */ - exp_1, - cb_abc, NO_POS, NO_POS, - cb_ab_c, 0, 2); - check(f_all, "abbc", /* only capture 1 */ - exp_1, - cb_abc, NO_POS, NO_POS, - cb_ab_c, 0, 4); - - fsm_free(f_all); - - return 0; -} - -static struct fsm * -build_and_combine(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *f_abc = captest_fsm_of_string("abc", 0); - struct fsm *f_ab_c = build_ab_c(); - struct fsm *f_all; - struct fsm_combine_info ci; - - assert(f_abc); - assert(f_ab_c); - - if (!fsm_capture_set_path(f_abc, 0, 0, 3)) { - exit(EXIT_FAILURE); - } - if (!fsm_capture_set_path(f_ab_c, 0, 0, 3)) { - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==================== abc \n"); - fsm_print_fsm(stderr, f_abc); - fsm_capture_dump(stderr, "abc", f_abc); - - fprintf(stderr, "==================== ab*c \n"); - fsm_print_fsm(stderr, f_ab_c); - fsm_capture_dump(stderr, "ab*c", f_ab_c); -#endif - - det_and_min("abc", f_abc); - det_and_min("ab*c", f_ab_c); - - /* union them */ - f_all = fsm_union(f_abc, f_ab_c, &ci); - assert(f_all != NULL); - - *cb_a = ci.capture_base_a; - *cb_b = ci.capture_base_b; - -#if 
LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==================== post-union \n"); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "capture_actions", f_all); - fprintf(stderr, "====================\n"); -#endif - - if (!fsm_determinise(f_all)) { - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==================== post-det \n"); - fsm_print_fsm(stderr, f_all); - fsm_capture_dump(stderr, "capture_actions", f_all); - fprintf(stderr, "====================\n"); -#endif - - return f_all; -} - -static void -det_and_min(const char *tag, struct fsm *fsm) -{ - if (!fsm_determinise(fsm)) { - fprintf(stderr, "Failed to determise '%s'\n", tag); - exit(EXIT_FAILURE); - } - - if (!fsm_minimise(fsm)) { - fprintf(stderr, "Failed to minimise '%s'\n", tag); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after det_and_min: '%s'\n", tag); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, tag, fsm); -#endif - -} - -static struct fsm * -build_ab_c(void) -{ - struct fsm *fsm = captest_fsm_with_options(); - assert(fsm != NULL); - - if (!fsm_addstate_bulk(fsm, 4)) { goto fail; } - - fsm_setstart(fsm, 0); - if (!fsm_addedge_literal(fsm, 0, 1, 'a')) { goto fail; } - - if (!fsm_addedge_literal(fsm, 1, 2, 'b')) { goto fail; } - if (!fsm_addedge_literal(fsm, 1, 3, 'c')) { goto fail; } - - if (!fsm_addedge_literal(fsm, 2, 2, 'b')) { goto fail; } - if (!fsm_addedge_literal(fsm, 2, 3, 'c')) { goto fail; } - - fsm_setend(fsm, 3, 1); - if (!fsm_setendid(fsm, 1)) { - goto fail; - } - - return fsm; - -fail: - exit(EXIT_FAILURE); -} - -static void -check(const struct fsm *fsm, const char *string, - unsigned expected_ends, - unsigned cb_a, size_t pa_0, size_t pa_1, - unsigned cb_b, size_t pb_0, size_t pb_1) -{ - int exec_res; - size_t i; - struct captest_input input; - fsm_state_t end; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - - fprintf(stderr, "#### check '%s', exp: ends 0x%u, c%u: (%ld, %ld), c%u: %ld, %ld)\n", - string, 
expected_ends, - cb_a, pa_0, pa_1, - cb_b, pb_0, pb_1); - - input.string = string; - input.pos = 0; - - for (i = 0; i < MAX_TEST_CAPTURES; i++) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; - } - - exec_res = fsm_exec(fsm, captest_getc, &input, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "fsm_exec: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - /* check captures */ - fprintf(stderr, "captures for '%s': [%ld, %ld], [%ld, %ld]\n", - string, - captures[0].pos[0], captures[0].pos[1], - captures[1].pos[0], captures[1].pos[1]); - assert(captures[cb_a].pos[0] == pa_0); - assert(captures[cb_a].pos[1] == pa_1); - assert(captures[cb_b].pos[0] == pb_0); - assert(captures[cb_b].pos[1] == pb_1); - - { - enum fsm_getendids_res gres; - fsm_end_id_t id_buf[2]; - size_t written; - gres = fsm_getendids(fsm, end, 2, id_buf, &written); - if (gres != FSM_GETENDIDS_FOUND) { - assert(!"fsm_getendids failed"); - } - - if (expected_ends == 0x2) { - assert(written == 1); - assert(id_buf[0] == 1); - } else if (expected_ends == 0x3) { - assert(written == 2); - assert(id_buf[0] == 0); - assert(id_buf[1] == 1); - } else { - assert(!"test not handled"); - } - } -} diff --git a/tests/capture/capture5.c b/tests/capture/capture5.c deleted file mode 100644 index b3a4be3ee..000000000 --- a/tests/capture/capture5.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define LOG_INTERMEDIATE_FSMS 0 -#include "captest.h" - -/* Check that self edges are handled properly in the - * capture action analysis. - * - * The DFA corresponds to /a(b*)(c)/. 
*/ - -static struct fsm * -build(void); - -static void -check(struct fsm *f, const char *input, - unsigned pa_0, unsigned pa_1, - unsigned pb_0, unsigned pb_1); - -int main(void) { - struct fsm *f = build(); - unsigned captures; - assert(f != NULL); - - captures = fsm_countcaptures(f); - if (captures != 2) { - fprintf(stderr, "expected 2 captures, got %u\n", captures); - exit(EXIT_FAILURE); - } - - check(f, "ac", - 1, 1, - 1, 2); - check(f, "abc", - 1, 2, - 2, 3); - check(f, "abbc", - 1, 3, - 3, 4); - - fsm_free(f); - - return 0; -} - -static struct fsm * -build(void) -{ - struct fsm *fsm = captest_fsm_with_options(); - - if (!fsm_addstate_bulk(fsm, 4)) { goto fail; } - - fsm_setstart(fsm, 0); - if (!fsm_addedge_literal(fsm, 0, 1, 'a')) { goto fail; } - - if (!fsm_addedge_literal(fsm, 1, 1, 'b')) { goto fail; } - if (!fsm_addedge_literal(fsm, 1, 2, 'c')) { goto fail; } - - fsm_setend(fsm, 2, 1); - - if (!fsm_capture_set_path(fsm, 0, 1, 1)) { goto fail; } - if (!fsm_capture_set_path(fsm, 1, 1, 2)) { goto fail; } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== built\n"); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, "built", fsm); -#endif - - if (!fsm_determinise(fsm)) { - fprintf(stderr, "Failed to determise\n"); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after det\n"); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, "after det", fsm); -#endif - - if (!fsm_minimise(fsm)) { - fprintf(stderr, "Failed to minimise\n"); - exit(EXIT_FAILURE); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after min\n"); - fsm_print_fsm(stderr, fsm); - fsm_capture_dump(stderr, "after min", fsm); -#endif - return fsm; - -fail: - exit(EXIT_FAILURE); -} - -static void -check(struct fsm *fsm, const char *string, - unsigned pa_0, unsigned pa_1, - unsigned pb_0, unsigned pb_1) -{ - int exec_res; - size_t i; - struct captest_input input; - fsm_state_t end; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - - fprintf(stderr, 
"#### check '%s', exp: c%u: (%u, %u), c%u: %u, %u)\n", - string, - 0, pa_0, pa_1, - 1, pb_0, pb_1); - - input.string = string; - input.pos = 0; - - for (i = 0; i < MAX_TEST_CAPTURES; i++) { - captures[i].pos[0] = FSM_CAPTURE_NO_POS; - captures[i].pos[1] = FSM_CAPTURE_NO_POS; - } - - exec_res = fsm_exec(fsm, captest_getc, &input, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "fsm_exec: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - /* check captures */ - fprintf(stderr, "captures for '%s': [%ld, %ld], [%ld, %ld]\n", - string, - captures[0].pos[0], captures[0].pos[1], - captures[1].pos[0], captures[1].pos[1]); - assert(captures[0].pos[0] == pa_0); - assert(captures[0].pos[1] == pa_1); - assert(captures[1].pos[0] == pb_0); - assert(captures[1].pos[1] == pb_1); -} diff --git a/tests/capture/capture_concat1.c b/tests/capture/capture_concat1.c deleted file mode 100644 index ee9c8aaab..000000000 --- a/tests/capture/capture_concat1.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "captest.h" - -/* concat /(ab)/ and /(cde)/ */ - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b); - -static void -check(const struct fsm *fsm, const char *input, unsigned end_id, - unsigned cb_ab, size_t exp_start_ab, size_t exp_end_ab, - unsigned cb_cde, size_t exp_start_cde, size_t exp_end_cde); - -int main(void) { - unsigned cb_ab, cb_cde; /* capture base */ - struct fsm *abcde = build(&cb_ab, &cb_cde); - - check(abcde, "abcde", 1, - cb_ab, 0, 2, - cb_cde, 2, 5); - - fsm_free(abcde); - - return EXIT_SUCCESS; -} - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *ab = captest_fsm_of_string("ab", 0); - struct fsm *cde = captest_fsm_of_string("cde", 1); - struct fsm *abcde; - struct fsm_combine_info ci; - size_t cc_ab, cc_cde, cc_abcde; - - assert(ab); - assert(cde); - - if (!fsm_capture_set_path(ab, 0, 0, 2)) { - assert(!"path 0"); - } - if (!fsm_capture_set_path(cde, 0, 0, 3)) { - assert(!"path 1"); - } - - cc_ab = fsm_countcaptures(ab); - assert(cc_ab == 1); - - cc_cde = fsm_countcaptures(cde); - assert(cc_cde == 1); - - abcde = fsm_concat(ab, cde, &ci); - assert(abcde); - *cb_a = ci.capture_base_a; - *cb_b = ci.capture_base_b; - - cc_abcde = fsm_countcaptures(abcde); - assert(cc_abcde == cc_ab + cc_cde); - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after concat: cb_ab %u, cb_cde %u\n", - *cb_a, *cb_b); - fsm_print_fsm(stderr, abcde); - - fsm_capture_dump(stderr, "#### after concat", abcde); - - fprintf(stderr, "==== determinise\n"); -#endif - - if (!fsm_determinise(abcde)) { - assert(!"determinise"); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after determinise\n"); - fsm_print_fsm(stderr, abcde); - - assert(fsm_countcaptures(abcde) == cc_abcde); - - fsm_capture_dump(stderr, "#### after det", abcde); -#endif - - assert(fsm_countcaptures(abcde) == cc_abcde); - return abcde; -} - -static void -check(const struct 
fsm *fsm, const char *input, unsigned end_id, - unsigned cb_ab, size_t exp_start_ab, size_t exp_end_ab, - unsigned cb_cde, size_t exp_start_cde, size_t exp_end_cde) -{ - struct captest_input ci; - fsm_state_t end; - int exec_res; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - - ci.string = input; - ci.pos = 0; - - exec_res = fsm_exec(fsm, captest_getc, &ci, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "exec_res: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - assert(captures[cb_ab].pos[0] == exp_start_ab); - assert(captures[cb_ab].pos[1] == exp_end_ab); - - assert(captures[cb_cde].pos[0] == exp_start_cde); - assert(captures[cb_cde].pos[1] == exp_end_cde); -} diff --git a/tests/capture/capture_concat2.c b/tests/capture/capture_concat2.c deleted file mode 100644 index a8f070c7e..000000000 --- a/tests/capture/capture_concat2.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "captest.h" - -/* concat /(abc)/ and /(de)/ */ - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b); - -static void -check(const struct fsm *fsm, const char *input, unsigned end_id, - unsigned cb_ab, size_t exp_start_ab, size_t exp_end_ab, - unsigned cb_cde, size_t exp_start_cde, size_t exp_end_cde); - -int main(void) { - unsigned cb_abc, cb_de; /* capture base */ - struct fsm *abcde = build(&cb_abc, &cb_de); - - check(abcde, "abcde", 1, - cb_abc, 0, 3, - cb_de, 3, 5); - - fsm_free(abcde); - - return EXIT_SUCCESS; -} - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *abc = captest_fsm_of_string("abc", 0); - struct fsm *de = captest_fsm_of_string("de", 1); - struct fsm *abcde; - struct fsm_combine_info ci; - size_t cc_abc, cc_de, cc_abcde; - - assert(abc); - assert(de); - - if (!fsm_capture_set_path(abc, 0, 0, 3)) { - assert(!"path 0"); - } - if (!fsm_capture_set_path(de, 0, 0, 2)) { - assert(!"path 1"); - } - - cc_abc = fsm_countcaptures(abc); - assert(cc_abc == 1); - - cc_de = fsm_countcaptures(de); - assert(cc_de == 1); - - abcde = fsm_concat(abc, de, &ci); - assert(abcde); - *cb_a = ci.capture_base_a; - *cb_b = ci.capture_base_b; - - cc_abcde = fsm_countcaptures(abcde); - assert(cc_abcde == cc_abc + cc_de); - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after concat: cb_abc %u, cb_de %u\n", - *cb_a, *cb_b); - fsm_print_fsm(stderr, abcde); - - fsm_capture_dump(stderr, "#### after concat", abcde); - - fprintf(stderr, "==== determinise\n"); -#endif - - if (!fsm_determinise(abcde)) { - assert(!"determinise"); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after determinise\n"); - fsm_print_fsm(stderr, abcde); - - assert(fsm_countcaptures(abcde) == cc_abcde); - - fsm_capture_dump(stderr, "#### after det", abcde); -#endif - - assert(fsm_countcaptures(abcde) == cc_abcde); - return abcde; -} - -static void -check(const struct 
fsm *fsm, const char *input, unsigned end_id, - unsigned cb_abc, size_t exp_start_abc, size_t exp_end_abc, - unsigned cb_de, size_t exp_start_de, size_t exp_end_de) -{ - struct captest_input ci; - fsm_state_t end; - int exec_res; - struct fsm_capture captures[MAX_TEST_CAPTURES]; - - ci.string = input; - ci.pos = 0; - - exec_res = fsm_exec(fsm, captest_getc, &ci, &end, captures); - if (exec_res != 1) { - fprintf(stderr, "exec_res: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - assert(captures[cb_abc].pos[0] == exp_start_abc); - assert(captures[cb_abc].pos[1] == exp_end_abc); - - assert(captures[cb_de].pos[0] == exp_start_de); - assert(captures[cb_de].pos[1] == exp_end_de); -} diff --git a/tests/capture/capture_long_trail.c b/tests/capture/capture_long_trail.c deleted file mode 100644 index 349717b0f..000000000 --- a/tests/capture/capture_long_trail.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include -#include -#include -#include - -#include -#include - -#include "captest.h" -/* a(bcdefghijklmnopqrstuvwxy)z - * This is long enough to exercise growing the trail for - * capture action analysis. 
*/ - -int main(void) { - struct captest_single_fsm_test_info test_info = { - "abcdefghijklmnopqrstuvwxyz", - { - { 1, 25 }, - } - }; - return captest_run_single(&test_info); -} diff --git a/tests/capture/capture_test_case_list.c b/tests/capture/capture_test_case_list.c new file mode 100644 index 000000000..6b72d1018 --- /dev/null +++ b/tests/capture/capture_test_case_list.c @@ -0,0 +1,1944 @@ +#include "captest.h" + +#include + +#define NO_POS FSM_CAPTURE_NO_POS + +const struct captest_case_single single_cases[] = { + { + .regex = "^", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$^", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$^", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "()*", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()*", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^$", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^($|($)|(($))|((($))))", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(((($)))|(($))|($)|$)", + .input = "", + .count = 5, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^((((a$)))|((b$))|(c$)|d$)", + .input = "a", + .count = 5, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^((((a$)))|((b$))|(c$)|d$)", + .input = "b", + .count = 7, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, }, + { .pos = {0, 1}, }, + { .pos 
= {0, 1}, }, + }, + }, + { + .regex = "^((((b$)))|((b$))|(c$)|d$)", + .input = "b", + .count = 5, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^((x?))*$", + .input = "x", + .count = 3, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^((x?)*)*$", + .input = "", + .count = 3, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^((x?)*)*$", + .input = "xxxxx", + .count = 3, .expected = { + { .pos = {0, 5}, }, + { .pos = {5, 5}, }, + { .pos = {5, 5}, }, + }, + }, + { + .regex = "xx*x", + .input = "xx", + .count = 1, .expected = { + { .pos = {0, 2}, }, + }, + }, + { + .regex = "^(x?)*$", + .input = "xx", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {2, 2}, }, + }, + }, + { + .regex = "^(x?)*$", + .input = "xxx", + .count = 2, .expected = { + { .pos = {0, 3}, }, + { .pos = {3, 3}, }, + }, + }, + { + .regex = "^(x?)+$", + .input = "xx", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {2, 2}, }, + }, + }, + { + .regex = "^(x?)+$", + .input = "xxx", + .count = 2, .expected = { + { .pos = {0, 3}, }, + { .pos = {3, 3}, }, + }, + }, + { + .regex = "^x(z?)*y$", + .input = "xy", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "()|x", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()|x", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "x|()", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x|()", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$|", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = ".|$^", + .input 
= "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = ".|$^", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "$^|.", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$^|.", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "$$$^|...", + .input = "xxx", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "x?$x?^x?|x?$x?^x?", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "[^x]", .input = "", + .no_nl = true, + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "[^x]", + .input = "\n", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = ".$()", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = ".$()", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^.$()", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^.$()", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$(x?)(y?)(z?)", + .input = "a", + .count = 4, .expected = { + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = ".$(x?)(y?)(z?)", + .input = "a", + .count = 4, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "[^y]", + .input = "xx", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = ".", + .input = "xx", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "(x)+", + .input = "xxx", + .count = 2, .expected = { + { .pos = {0, 3}, }, + { .pos = {2, 3}, }, + }, + }, + { + .regex = "^(x)*.", + .input = "xx", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^(x)*.", + .input = "xy", + .count = 2, 
.expected = { + { .pos = {0, 2}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "a.b(c)*", + .input = "axbc", + .count = 2, .expected = { + { .pos = {0, 4}, }, + { .pos = {3, 4}, }, + }, + }, + { + .regex = "^x?^", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^x?^", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$(^)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "($)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "($$$)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$x?^", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$(^)*", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$(^)*", + .input = "x", + .count = 1, .expected = { + { .pos = {1, 1}, }, + }, + }, + { + .regex = "$()*", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$()*", + .input = "x", + .count = 2, .expected = { + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^$^", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$^$", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$y?^x*", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "x|$^", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x|$^", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "x|$^$^", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x|$^$^", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$^|x", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "$^|x", .input = "y", + .match = 
SHOULD_NOT_MATCH, + }, + { + .regex = "$^$^|x", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "$^$^|x", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^$|.", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x|^$^$", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^$^$|x", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "$|^|a$", + .input = "x", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "[^a]x", .input = "x", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "[^a]x", + .input = "xx", + .count = 1, .expected = { + { .pos = {0, 2}, }, + }, + }, + { + .regex = "a(b|c$)d", .input = "ac", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "a(^b|c)d", .input = "bd", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "(a|b|)*", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "xx*y$", + .input = "x_xxy", + .count = 1, .expected = { + { .pos = {2, 5}, }, + }, + }, + { + .regex = "(|.$)*", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(.$)*x", .input = "y", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "(.$)*", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^(|.$)*", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(|.$)*$", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "x|y(^)", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "(?:x*.|^$).", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "(?:x|^$)x", .input = "", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "()+x", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 0}, }, + 
}, + }, + { + .regex = "($$)^", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "$($|$a)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(?i)abc$", + .input = "AbC", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(?i)ab(?-i)c$", .input = "AbC", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^(?i)ab(?-i)c$", + .input = "Abc", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(?i)a[b]c$", + .input = "ABC", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(?i)a[^b]c$", .input = "ABC", + .match = SHOULD_NOT_MATCH, + }, + { + .regex = "^(?i)a[bx]c$", + .input = "ABC", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(?i)a[b-c]c$", + .input = "ABC", + .count = 1, .expected = { + { .pos = {0, 3}, }, + }, + }, + { + .regex = "(a()b)+a", + .input = "a!aba", + .count = 3, .expected = { + { .pos = {2, 5}, }, + { .pos = {2, 4}, }, + { .pos = {3, 3}, }, + }, + }, + { + .regex = "^^[^]]", + .input = "\n", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "x(x()y)*", + .input = "xxy", + .count = 3, .expected = { + { .pos = {0, 3}, }, + { .pos = {1, 3}, }, + { .pos = {2, 2}, }, + }, + }, + { + .regex = "x(()x)*", + .input = "xx", + .count = 3, .expected = { + { .pos = {0, 2}, }, + { .pos = {1, 2}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "b(x*x*a()*y)*(a)a*", + .input = "ba", + .count = 4, .expected = { + { .pos = {0, 2}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, }, + { .pos = {1, 2}, }, + }, + }, + { + .regex = "a(().x)*ab", + .input = "a.a.aaxab", + .count = 3, .expected = { + { .pos = {4, 9}, }, + { .pos = {5, 7}, }, + { .pos = {5, 5}, }, + }, + }, + { + .regex = "ab(b()*()*)*()*z", + .input = "a!abz", + .count = 5, .expected = { + { .pos = {2, 5}, }, + { .pos = {-1, -1}, }, + { .pos = {-1, -1}, 
}, + { .pos = {-1, -1}, }, + { .pos = {4, 4}, }, + }, + }, + { + .regex = "^x(y?z*)*$", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^(y?z*)*$", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(x|$x?)*$", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(^|$x)*$", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "((x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*(x?)*)*y$", + .input = "xxxxxxxxxxy", + .count = 15, .expected = { + { .pos = {0, 11}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + { .pos = {10, 10}, }, + }, + }, + { + .regex = "^a$", + .input = "a", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^a(bcd)e$", + .input = "abcde", + .count = 2, .expected = { + { .pos = {0, 5}, }, + { .pos = {1, 4}, }, + }, + }, + { + .regex = "^(a(b((c))(d)))$", + .input = "abcd", + .count = 6, .expected = { + { .pos = {0, 4}, }, + { .pos = {0, 4}, }, + { .pos = {1, 4}, }, + { .pos = {2, 3}, }, + { .pos = {2, 3}, }, + { .pos = {3, 4}, }, + }, + }, + { + .regex = "^(a(b(c)))$", + .input = "abc", + .count = 4, .expected = { + { .pos = {0, 3}, }, + { .pos = {0, 3}, }, + { .pos = {1, 3}, }, + { .pos = {2, 3}, }, + }, + }, + { + .regex = "^a(b*)(c)$", + .input = "ac", + .count = 3, .expected = { + { .pos = {0, 2}, }, + { .pos = {1, 1}, }, + { .pos = {1, 2}, }, + }, + }, + { + .regex = "^a(b*)(c)$", + .input = "abc", + .count = 3, .expected = { + { .pos = {0, 3}, }, + { .pos = {1, 2}, }, + { .pos = {2, 3}, }, + }, + 
}, + { + .regex = "^a(b*)(c)$", + .input = "abbc", + .count = 3, .expected = { + { .pos = {0, 4}, }, + { .pos = {1, 3}, }, + { .pos = {3, 4}, }, + }, + }, + { + .regex = "^(ab*c)$", + .input = "ac", + .count = 2, .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 2}, }, + }, + }, + { + .regex = "^(ab*c)$", + .input = "abc", + .count = 2, .expected = { + { .pos = {0, 3}, }, + { .pos = {0, 3}, }, + }, + }, + { + .regex = "^(ab*c)$", + .input = "abbc", + .count = 2, .expected = { + { .pos = {0, 4}, }, + { .pos = {0, 4}, }, + }, + }, + { + .regex = "^(a*)", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(a*)", + .input = "x", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(a*)", + .input = "a", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^(a*)", + .input = "ax", + .count = 2, .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^a*", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^a*", + .input = "a", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = "^a*", + .input = "ax", + .count = 1, .expected = { + { .pos = {0, 1}, }, + }, + }, + { + .regex = ".|", + .input = "", + .count = 1, .expected = { + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()*^", + .input = "", + .count = 2, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(((())))*^", + .input = "", + .count = 5, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + + { + .regex = "(x|(x|))^", + .input = "", + .count = 3, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = ".*(x|())^", + .input = "", + .count = 3, .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos 
= {0, 0}, }, + }, + }, + { + .regex = "(()|(()|x)^|x)^", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + + { + .regex = "x^()()|()", + .input = "", + .count = 4, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "y^()|()^x", + .input = "x", + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()$a|()", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "()$z|(x)$", + .input = "x", + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 1}, }, + }, + }, + + { + /* long enough to exercise the USE_COLLAPSED_ZERO_PREFIX optimization */ + .regex = "a*(ba*)c$", + .input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaac", + .count = 2, + .expected = { + { .pos = {101, 303}, }, + { .pos = {201, 302}, }, + }, + }, + + /* regression: losing the first character on the transition from + * the unanchored start loop to the capture */ + { + .regex = "aa+b$", + .input = "aXaXaaab", + .count = 1, + .expected = { + { .pos = {4, 8}, }, + }, + }, + { + .regex = "aa*b$", + .input = "aXaXaaab", + .count = 1, + .expected = { + { .pos = {4, 8}, }, + }, + }, + { + .regex = "!!!+$", + .input = "!\"!\"!\"!!!!", + .count = 1, + .expected = { + { .pos = {6, 10}, }, + }, + }, + + /* new fuzzer regressions */ + { + /* PCRE does not set the first capture, which is unsatisfiable */ + .regex = "^(.^)*^(a*)", + .input = "", + .count = 3, + .expected = { + { .pos = 
{0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + /* similar to the previous case, but with different anchoring */ + .regex = "(a)*(^)*^", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(.a)*^(.a)", + .input = "!a", + .count = 3, + .expected = { + { .pos = {0, 2}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 2}, }, + }, + }, + { + .regex = "(A)*^()*^", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + + { + .regex = "(a(b*)*|)*bc", + .input = "b!bc", + .count = 3, + .expected = { + { .pos = {2, 4}, }, + { .pos = {2, 2}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "^(a(b*)*|)*bc$", + .input = "bc", + .count = 3, + .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "(|a((b*)*b*))*", + .input = "", + .count = 4, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + /* simplified version of the above */ + .regex = "^(|a(b*)*)*$", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + /* zero repetitions should not set the capture */ + .regex = "^(a)*$", + .input = "", + .count = 2, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "^(a)*(^)$", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + /* raw fuzzer output */ + .regex = "()((()(^|$|$^|^|$|$^^|$|$^|^|$|$^^^^|^|(|)($)|)+|^^|^|(|)($)|)+|)($)()+", + .input = "", + .count = 12, + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos 
= {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(^|())+()", + .input = "", + .count = 4, + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "(?:(^|^$)+|)+", + .input = "", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 2, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^((|)($)|)+a$", + .input = "a", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 4, + .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "^(($)|)+a$", + .input = "a", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + { + .regex = "^(|(|x))*$", + .input = "x", + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + /* same as the previous but without outer capture */ + .regex = "^(?:|(|x))*$", + .input = "x", + .count = 2, + .expected = { + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + { + .regex = "(((($)|)+|)a|)+", + .input = "", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 5, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {NO_POS, NO_POS}, }, + }, + }, + + { + .regex = "^(|(|(|x)))*$", + .input = "x", + .count = 4, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {0, 1}, }, + { .pos = {0, 1}, }, + }, + }, + + + { + .regex = "^(?:(?:(x?)^)y?)+$", + .input = "", + .count = 2, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + 
{ + .regex = "^(?:^())+$", + .input = "", + .count = 2, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(?:($|x))+$", + .input = "x", + + .match = SHOULD_REJECT_AS_UNSUPPORTED, + + .count = 2, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^(($)|x)+$", + .input = "x", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + { + .regex = "^(?:()?^()?)+$", + .input = "", + .count = 3, + .expected = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + }, + }, + { + .regex = "^(?:($|x)())+$", + .input = "x", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 3, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + + { + .regex = "()~((|)($)|%)+", + .input = "~%", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 5, + .expected = { + { .pos = {0, 2}, }, + { .pos = {0, 0}, }, + { .pos = {2, 2}, }, + { .pos = {2, 2}, }, + { .pos = {2, 2}, }, + }, + }, + + { + /* (slightly) reduced version of the previous */ + .regex = "^(()($)|x)+$", + .input = "x", + .match = SHOULD_REJECT_AS_UNSUPPORTED, + .count = 4, + .expected = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + { .pos = {1, 1}, }, + }, + }, + + { + .regex = "a|_$[^b]", + .input = "a", + .count = 1, + .expected = { + { .pos = {0, 1}, }, + }, + }, + + { + .regex = "\\z", + .input = "", + .count = 1, + .match = SHOULD_REJECT_AS_UNSUPPORTED, + }, +}; + +const struct captest_case_multi multi_cases[] = { + { + .regex_count = 4, + .regexes = { + "^aa$", /* exactly two 'a's */ + "^a*", /* zero or more 'a's followed by anything */ + "^ab?$", /* 'a' and optionally 'b' */ + "a*$", /* anything ending in zero or more 'a's */ + }, + .inputs = { + { + .input = "", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 0 } }, + { .regex = 2, .pos = POS_NONE }, + { 
.regex = 3, .pos = { 0, 0 } }, + }, + }, + + { + .input = "a", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 1 } }, + { .regex = 2, .pos = { 0, 1 } }, + { .regex = 3, .pos = { 0, 1 } }, + }, + }, + + { + .input = "aa", + .expected = { + { .regex = 0, .pos = { 0, 2 } }, + { .regex = 1, .pos = { 0, 2 } }, + { .regex = 2, .pos = POS_NONE }, + { .regex = 3, .pos = { 0, 2 } }, + }, + }, + + { + .input = "aaa", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 3 } }, + { .regex = 2, .pos = POS_NONE }, + { .regex = 3, .pos = { 0, 3 } }, + }, + }, + + { + .input = "ba", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 0 } }, + { .regex = 2, .pos = POS_NONE }, + { .regex = 3, .pos = { 1, 2 } }, + }, + }, + + { + .input = "ab", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = { 0, 1 } }, + { .regex = 2, .pos = { 0, 2 } }, + { .regex = 3, .pos = { 2, 2 } }, + }, + }, + + { + .input = NULL, + }, + }, + }, + + { + .regex_count = 3, + .regexes = { + "a(b?)*c", + "(ab)(c)", + "ab+(c)", + }, + .inputs = { + { + .input = "", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = POS_NONE }, + { .regex = 1, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 2, .pos = POS_NONE }, + { .regex = 2, .capture = 0, .pos = POS_NONE }, + { .regex = 2, .capture = 1, .pos = POS_NONE }, + }, + }, + { + .input = "abc", + .expected = { + { .regex = 0, .capture = 0, .pos = {0, 3} }, + { .regex = 0, .capture = 1, .pos = {2, 2} }, + { .regex = 1, .capture = 0, .pos = {0, 3} }, + { .regex = 1, .capture = 1, .pos = {0, 2} }, + { .regex = 1, .capture = 2, .pos = {2, 3} }, + { .regex = 2, .capture = 0, .pos = {0, 3} }, + { .regex = 2, .capture = 1, .pos = {2, 3} }, + }, + }, + }, + }, + { + /* fuzzer regression: This led to an execution path in fsm_union_array, + * fsm_union, 
fsm_merge, merge that did not init or otherwise set the + * `struct fsm_combine_info`, leading to an out of range offset for + * the capture base. */ + .regex_count = 3, + .regexes = { + ".", + ".^", + "^^_", + }, + .inputs = { + { + .input = "", + .expected = { + { .regex = 0, .pos = POS_NONE }, + { .regex = 1, .pos = POS_NONE }, + { .regex = 2, .pos = POS_NONE }, + }, + }, + { + .input = "_", + .expected = { + { .regex = 0, .pos = { 0, 1 } }, + { .regex = 1, .pos = { 0, 1 } }, + { .regex = 2, .pos = { 0, 1 } }, + }, + }, + }, + }, + + { + /* This checks that minimisation doesn't incorrectly + * merge these and lead to capture false positives. */ + .regex_count = 2, + .regexes = { + "^a(b)c$", /* exactly one 'b' */ + "^a(b*)c$", /* any number of 'b's */ + }, + .inputs = { + { + .input = "", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = POS_NONE }, + { .regex = 1, .capture = 1, .pos = POS_NONE }, + }, + }, + { + .input = "a", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = POS_NONE }, + { .regex = 1, .capture = 1, .pos = POS_NONE }, + }, + }, + { + .input = "ab", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = POS_NONE }, + { .regex = 1, .capture = 1, .pos = POS_NONE }, + }, + }, + { + .input = "ac", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = { 0, 2 } }, + { .regex = 1, .capture = 1, .pos = { 1, 1 } }, + }, + }, + { + .input = "abc", + .expected = { + { .regex = 0, .capture = 0, .pos = {0, 3 } }, + { .regex = 0, .capture = 1, .pos = {1, 2 } }, + { .regex = 1, .capture = 0, .pos = { 0, 3 } }, + { .regex = 1, .capture = 1, .pos = { 1, 2 } }, + }, + }, + { 
+ .input = "abbc", + .expected = { + { .regex = 0, .capture = 0, .pos = POS_NONE }, + { .regex = 0, .capture = 1, .pos = POS_NONE }, + { .regex = 1, .capture = 0, .pos = { 0, 4 } }, + { .regex = 1, .capture = 1, .pos = { 1, 3 } }, + }, + }, + + { + .input = NULL, + }, + }, + } +}; + + +static struct captest_case_program program_cases[] = { + { + .input = "", + .char_class = { + { .octets = { ~0, ~0, ~0, ~0 }}, /* 0x00 <= x <= 0xff */ + }, + .expected = { + .count = 4, + .captures = { + { .pos = {0, 0}, }, + { .pos = {0, 0}, }, + { .pos = {NO_POS, NO_POS}, }, + { .pos = {0, 0}, }, + }, + }, + + .ops = { + { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 3, .nongreedy = 1 }}, + { .t = CAPVM_OP_CHARCLASS, .u.charclass_id = 0 }, + { .t = CAPVM_OP_JMP, .u.jmp = 0 }, + { .t = CAPVM_OP_SAVE, .u.save = 0 }, + { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 5, .nongreedy = 7 }}, + { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_START }, + + { .t = CAPVM_OP_JMP, .u.jmp = 9 }, /* jump after |() */ + { .t = CAPVM_OP_SAVE, .u.save = 4 }, + { .t = CAPVM_OP_SAVE, .u.save = 5 }, + + { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 4, .nongreedy = 10 }}, + + { .t = CAPVM_OP_SAVE, .u.save = 2 }, + { .t = CAPVM_OP_SAVE, .u.save = 3 }, + { .t = CAPVM_OP_SAVE, .u.save = 6 }, + { .t = CAPVM_OP_SAVE, .u.save = 7 }, + { .t = CAPVM_OP_SAVE, .u.save = 1 }, + { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 18, .nongreedy = 16 }}, + { .t = CAPVM_OP_CHARCLASS, .u.charclass_id = 0 }, + { .t = CAPVM_OP_JMP, .u.jmp = 15 }, + { .t = CAPVM_OP_MATCH }, + }, + }, + + + { + /* correcting compilation of '^(?:($|x))+$' */ + .input = "x", + .expected = { + .count = 2, + .captures = { + { .pos = {0, 1}, }, + { .pos = {1, 1}, }, + }, + }, + + .ops = { + [0] = { .t = CAPVM_OP_SAVE, .u.save = 0 }, + [1] = { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_START }, + [2] = { .t = CAPVM_OP_SAVE, .u.save = 2 }, + [3] = { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 4, .nongreedy = 6 }}, + [4] = { .t = CAPVM_OP_ANCHOR, 
.u.anchor = CAPVM_ANCHOR_END }, + + /* [5] = { .t = CAPVM_OP_JMP, .u.jmp = 7 }, */ + [5] = { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 7, .nongreedy = 9 }}, + + [6] = { .t = CAPVM_OP_CHAR, .u.chr = 'x' }, + [7] = { .t = CAPVM_OP_SAVE, .u.save = 3 }, + [8] = { .t = CAPVM_OP_SPLIT, .u.split = { .greedy = 2, .nongreedy = 9 }}, + [9] = { .t = CAPVM_OP_ANCHOR, .u.anchor = CAPVM_ANCHOR_END }, + [10] = { .t = CAPVM_OP_SAVE, .u.save = 1 }, + [11] = { .t = CAPVM_OP_MATCH }, + }, + }, +}; + +#define NO_FILTER ((size_t)-1) +struct options { + size_t filter; + int verbosity; + bool track_timing; + FILE *prog_output; + enum groups { + GROUP_SINGLE = 0x01, + GROUP_MULTI = 0x02, + GROUP_PROGRAMS = 0x04, + GROUP_ALL = 0xff, + } group; +}; + +static void +print_usage(FILE *f, const char *progname) +{ + fprintf(f, "%s: [-h] [-v] [-s | -m | -p] [-f ] [-t]\n", progname); + fprintf(f, " -h: print this usage info\n"); + fprintf(f, " -v: increase verbosity (can repeat: -vvv)\n"); + fprintf(f, " -f : just run a specific test, by numeric ID\n"); + fprintf(f, " -s: only single casse\n"); + fprintf(f, " -m: only multi cases\n"); + fprintf(f, " -p: only program cases\n"); + fprintf(f, " -t: print timing info\n"); +} + +static void +get_options(struct options *opt, int argc, char **argv) +{ + const char *progname = argv[0]; + int c; + while (c = getopt(argc, argv, "hf:mpstv"), c != -1) { + switch (c) { + case 'h': + print_usage(stdout, progname); + exit(EXIT_SUCCESS); + break; + case 'v': + opt->verbosity++; + break; + case 'f': + opt->filter = atol(optarg); + break; + case 't': + opt->track_timing = true; + break; + case 'p': + opt->group = GROUP_PROGRAMS; + break; + case 's': + opt->group = GROUP_SINGLE; + break; + case 'm': + opt->group = GROUP_MULTI; + break; + case '?': + default: + print_usage(stderr, progname); + exit(EXIT_FAILURE); + } + } +} + +int main(int argc, char **argv) { + size_t pass = 0; + size_t fail = 0; + size_t skip = 0; + size_t nth = 0; + + struct options options = { 
+ .filter = NO_FILTER, + .verbosity = 0, + .group = GROUP_ALL, + }; + get_options(&options, argc, argv); + + if (options.verbosity == DUMP_PROGRAMS_VERBOSITY) { + options.prog_output = fopen("prog_output", "w"); + assert(options.prog_output != NULL); + } + + /* avoid an extra layer of indentation here */ + if (!(options.group & GROUP_SINGLE)) { goto after_single; } + + printf("-- single cases without trailing newline\n"); + const size_t single_case_count = sizeof(single_cases)/sizeof(single_cases[0]); + for (size_t c_i = 0; c_i < single_case_count; c_i++) { + const size_t cur = nth++; + if (options.filter != NO_FILTER && options.filter != cur) { + continue; + } + + if (options.verbosity > 0) { + printf("%zu: ", cur); + if (options.verbosity > 2) { + fflush(stdout); + } + } + + if (options.verbosity == DUMP_PROGRAMS_VERBOSITY) { + fprintf(options.prog_output, "\n\n==== test_case %zu\n", c_i); + } + + const struct captest_case_single *t = &single_cases[c_i]; + + if (t->match == SHOULD_SKIP) { + printf("%zd: SKIP (regex \"%s\", input \"%s\")\n", + cur, t->regex, t->input); + skip++; + continue; + } + + enum captest_run_case_res res = captest_run_case(t, options.verbosity, false, options.prog_output); + + switch (res) { + case CAPTEST_RUN_CASE_PASS: + pass++; + break; + case CAPTEST_RUN_CASE_FAIL: + if (options.verbosity == 0) { + printf("-- test case %zd (regex \"%s\", input \"%s\")\n", cur, t->regex, t->input); + } + fail++; + break; + case CAPTEST_RUN_CASE_ERROR: + assert(!"error"); + return EXIT_FAILURE; + } + } + + /* second pass, adding a trailing newline to input */ + printf("-- single cases with trailing newline\n"); + for (size_t c_i = 0; c_i < single_case_count; c_i++) { + const size_t cur = nth++; + if (options.filter != NO_FILTER && options.filter != cur) { + continue; + } + + const struct captest_case_single *t = &single_cases[c_i]; + if (t->no_nl) { continue; } + if (t->match == SHOULD_SKIP) { + printf("%zd: SKIP (regex \"%s\", input \"%s\\n\")\n", + cur, 
t->regex, t->input); + skip++; + continue; + } + + if (options.verbosity > 0) { + printf("%zu: ", cur); + if (options.verbosity > 2) { + fflush(stdout); + } + } + + enum captest_run_case_res res = captest_run_case(t, options.verbosity, true, options.prog_output); + + switch (res) { + case CAPTEST_RUN_CASE_PASS: + pass++; + break; + case CAPTEST_RUN_CASE_FAIL: + if (options.verbosity == 0) { + printf("-- test case %zd (regex \"%s\", input \"%s\\n\")\n", cur, t->regex, t->input); + } + fail++; + break; + case CAPTEST_RUN_CASE_ERROR: + assert(!"error"); + return EXIT_FAILURE; + } + } +after_single: + + /* multi-regex tests */ + if (!(options.group & GROUP_MULTI)) { goto after_multi; } + + printf("-- multi-regex cases\n"); + const size_t multi_case_count = sizeof(multi_cases)/sizeof(multi_cases[0]); + for (size_t c_i = 0; c_i < multi_case_count; c_i++) { + const size_t cur = nth++; + if ((options.filter != NO_FILTER && options.filter != cur)) { + continue; + } + + const struct captest_case_multi *t = &multi_cases[c_i]; + if (t->match == SHOULD_SKIP) { + printf("%zu: SKIP (multi)\n", c_i); + skip++; + continue; + } + + if (options.verbosity > 0) { + printf("%zu: ", cur); + } + + struct captest_case_multi_result result; + enum captest_run_case_res res = captest_run_case_multi(t, + options.verbosity, false, options.prog_output, &result); + + pass += result.pass; + fail += result.fail; + + switch (res) { + case CAPTEST_RUN_CASE_PASS: + if (options.verbosity > 0) { + printf("pass\n"); + } + break; + case CAPTEST_RUN_CASE_FAIL: + if (options.verbosity > 0) { + printf("FAIL\n"); + } else { + printf("-- test case %zd\n", cur); + } + break; + case CAPTEST_RUN_CASE_ERROR: + assert(!"error"); + return EXIT_FAILURE; + } + } +after_multi: + + /* hardcoded programs */ + if (!(options.group & GROUP_PROGRAMS)) { goto after_programs; } + + const size_t prog_case_count = sizeof(program_cases)/sizeof(program_cases[0]); + for (size_t c_i = 0; c_i < prog_case_count; c_i++) { + const size_t 
cur = nth++; + if ((options.filter != NO_FILTER && options.filter != cur)) { + continue; + } + + const struct captest_case_program *t = &program_cases[c_i]; + + if (options.verbosity > 0) { + printf("%zu: ", cur); + } + + enum captest_run_case_res res = captest_run_case_program(t, + options.verbosity); + + switch (res) { + case CAPTEST_RUN_CASE_PASS: + if (options.verbosity > 0) { + printf("pass\n"); + } + pass++; + break; + case CAPTEST_RUN_CASE_FAIL: + fail++; + if (options.verbosity > 0) { + printf("FAIL\n"); + } else if (options.verbosity == 0) { + printf("-- test case %zd\n", cur); + } + break; + case CAPTEST_RUN_CASE_ERROR: + assert(!"error"); + return EXIT_FAILURE; + } + } +after_programs: + + printf("-- pass %zu, fail %zu, skip %zu\n", pass, fail, skip); + + return fail > 0 + ? EXIT_FAILURE + : EXIT_SUCCESS; +} diff --git a/tests/capture/capture_union1.c b/tests/capture/capture_union1.c deleted file mode 100644 index 5d9bd2920..000000000 --- a/tests/capture/capture_union1.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "captest.h" - -/* union /(ab)/ and /(cde)/ */ - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b); - -static void -check(const struct fsm *fsm, const char *input, - unsigned end_id, unsigned exp_capture_id, - size_t exp_start, size_t exp_end); - -int main(void) { - unsigned cb_ab, cb_cde; /* capture base */ - struct fsm *abcde = build(&cb_ab, &cb_cde); - - check(abcde, "ab", 0, cb_ab, 0, 2); - check(abcde, "cde", 1, cb_cde, 0, 3); - - fsm_free(abcde); - - return EXIT_SUCCESS; -} - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *ab = captest_fsm_of_string("ab", 0); - struct fsm *cde = captest_fsm_of_string("cde", 1); - struct fsm *abcde; - struct fsm_combine_info ci; - size_t cc_ab, cc_cde, cc_abcde; - - assert(ab); - assert(cde); - - if (!fsm_capture_set_path(ab, 0, 0, 2)) { - assert(!"path 0"); - } - if (!fsm_capture_set_path(cde, 0, 0, 3)) { - assert(!"path 1"); - } - - cc_ab = fsm_countcaptures(ab); - assert(cc_ab == 1); - - cc_cde = fsm_countcaptures(cde); - assert(cc_cde == 1); - - abcde = fsm_union(ab, cde, &ci); - assert(abcde); - *cb_a = ci.capture_base_a; - *cb_b = ci.capture_base_b; - - cc_abcde = fsm_countcaptures(abcde); - assert(cc_abcde == cc_ab + cc_cde); - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after union: cb_ab %u, cb_cde %u\n", - *cb_a, *cb_b); - fsm_print_fsm(stderr, abcde); - - fsm_capture_dump(stderr, "#### after union", abcde); - - fprintf(stderr, "==== determinise\n"); -#endif - - if (!fsm_determinise(abcde)) { - assert(!"determinise"); - } - -#if LOG_INTERMEDIATE_FSMS - fprintf(stderr, "==== after determinise\n"); - fsm_print_fsm(stderr, abcde); - - assert(fsm_countcaptures(abcde) == cc_abcde); - - fsm_capture_dump(stderr, "#### after det", abcde); -#endif - - assert(fsm_countcaptures(abcde) == cc_abcde); - return abcde; -} - -static void -check(const struct fsm *fsm, const char *input, - 
unsigned end_id, unsigned exp_capture_id, - size_t exp_start, size_t exp_end) -{ - struct captest_input ci; - fsm_state_t end; - int exec_res; - struct fsm_capture got_captures[MAX_TEST_CAPTURES]; - - ci.string = input; - ci.pos = 0; - - exec_res = fsm_exec(fsm, captest_getc, &ci, &end, got_captures); - if (exec_res != 1) { - fprintf(stderr, "exec_res: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - if (got_captures[exp_capture_id].pos[0] != exp_start) { - fprintf(stderr, "capture[%u].pos[0]: exp %lu, got %lu\n", - exp_capture_id, exp_start, - got_captures[exp_capture_id].pos[0]); - exit(EXIT_FAILURE); - } - if (got_captures[exp_capture_id].pos[1] != exp_end) { - fprintf(stderr, "capture[%u].pos[1]: exp %lu, got %lu\n", - exp_capture_id, exp_end, - got_captures[exp_capture_id].pos[1]); - exit(EXIT_FAILURE); - } -} diff --git a/tests/capture/capture_union2.c b/tests/capture/capture_union2.c deleted file mode 100644 index 7fab2f18d..000000000 --- a/tests/capture/capture_union2.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright 2020 Scott Vokes - * - * See LICENCE for the full copyright terms. 
- */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "captest.h" - -/* union /(abcd)/ and /(abed)/ */ - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b); - -static void -check(const struct fsm *fsm, const char *input, - unsigned end_id, unsigned exp_capture_id, - size_t exp_start, size_t exp_end); - -int main(void) { - unsigned cb_abcd, cb_abed; - struct fsm *fsm = build(&cb_abcd, &cb_abed); - - check(fsm, "abcd", 0, cb_abcd, 0, 4); - check(fsm, "abed", 1, cb_abed, 0, 4); - - fsm_free(fsm); - - return EXIT_SUCCESS; -} - -static struct fsm * -build(unsigned *cb_a, unsigned *cb_b) -{ - struct fsm *abcd = captest_fsm_of_string("abcd", 0); - struct fsm *abed = captest_fsm_of_string("abed", 1); - struct fsm *res; - - assert(abcd); - assert(abed); - - if (!fsm_capture_set_path(abcd, 0, 0, 4)) { - assert(!"path 0"); - } - if (!fsm_capture_set_path(abed, 0, 0, 4)) { - assert(!"path 1"); - } - - { - struct fsm *fsms[2]; - struct fsm_combined_base_pair bases[2]; - fsms[0] = abcd; - fsms[1] = abed; - res = fsm_union_array(2, fsms, bases); - assert(res); - *cb_a = bases[0].capture; - *cb_b = bases[1].capture; - } - - if (!fsm_determinise(res)) { - assert(!"determinise"); - } - - assert(fsm_countcaptures(res) == 2); - - return res; -} - -static void -check(const struct fsm *fsm, const char *input, - unsigned end_id, unsigned exp_capture_id, - size_t exp_start, size_t exp_end) -{ - struct captest_input ci; - fsm_state_t end; - int exec_res; - struct fsm_capture got_captures[MAX_TEST_CAPTURES]; - - ci.string = input; - ci.pos = 0; - - exec_res = fsm_exec(fsm, captest_getc, &ci, &end, got_captures); - if (exec_res != 1) { - fprintf(stderr, "exec_res: %d\n", exec_res); - exit(EXIT_FAILURE); - } - - { - const char *msg; - if (!captest_check_single_end_id(fsm, end, end_id, &msg)) { - fprintf(stderr, "%s\n", msg); - exit(EXIT_FAILURE); - } - } - - if (got_captures[exp_capture_id].pos[0] != exp_start) { - fprintf(stderr, 
"capture[%u].pos[0]: exp %lu, got %lu\n", - exp_capture_id, exp_start, - got_captures[exp_capture_id].pos[0]); - exit(EXIT_FAILURE); - } - if (got_captures[exp_capture_id].pos[1] != exp_end) { - fprintf(stderr, "capture[%u].pos[1]: exp %lu, got %lu\n", - exp_capture_id, exp_end, - got_captures[exp_capture_id].pos[1]); - exit(EXIT_FAILURE); - } -} diff --git a/tests/endids/endids2_union_many_endids.c b/tests/endids/endids2_union_many_endids.c index 8e39ca93d..47af96dc9 100644 --- a/tests/endids/endids2_union_many_endids.c +++ b/tests/endids/endids2_union_many_endids.c @@ -167,6 +167,7 @@ int main(void) if (fsm == NULL) { fsm = new; } else { + /* TODO: this could use fsm_union_array instead */ fsm = fsm_union(fsm, new, NULL); assert(fsm != NULL); } @@ -283,5 +284,3 @@ int main(void) return EXIT_SUCCESS; } - - diff --git a/tests/endids/utils.c b/tests/endids/utils.c index 79777e825..85ff7a660 100644 --- a/tests/endids/utils.c +++ b/tests/endids/utils.c @@ -9,7 +9,7 @@ match_string(const struct fsm *fsm, const char *s, fsm_state_t *end_ptr, fsm_end fsm_state_t end = 0; int ret; - ret = fsm_exec(fsm, fsm_sgetc, &s, &end, NULL); + ret = fsm_exec(fsm, fsm_sgetc, &s, &end); if (ret == 1) { size_t num_endids; diff --git a/tests/idmap/Makefile b/tests/idmap/Makefile new file mode 100644 index 000000000..aee01f565 --- /dev/null +++ b/tests/idmap/Makefile @@ -0,0 +1,19 @@ +.include "../../share/mk/top.mk" + +TEST.tests/idmap != ls -1 tests/idmap/idmap*.c +TEST_SRCDIR.tests/idmap = tests/idmap +TEST_OUTDIR.tests/idmap = ${BUILD}/tests/idmap + +.for n in ${TEST.tests/idmap:T:R:C/^idmap//} +INCDIR.${TEST_SRCDIR.tests/idmap}/idmap${n}.c += src/adt +.endfor + +.for n in ${TEST.tests/idmap:T:R:C/^idmap//} +test:: ${TEST_OUTDIR.tests/idmap}/res${n} +SRC += ${TEST_SRCDIR.tests/idmap}/idmap${n}.c +CFLAGS.${TEST_SRCDIR.tests/idmap}/idmap${n}.c += -UNDEBUG -D_DEFAULT_SOURCE -std=c99 +${TEST_OUTDIR.tests/idmap}/run${n}: ${TEST_OUTDIR.tests/idmap}/idmap${n}.o ${BUILD}/lib/adt.o + ${CC} 
${CFLAGS} ${CFLAGS.${TEST_SRCDIR.tests/idmap}/idmap${n}.c} -o ${TEST_OUTDIR.tests/idmap}/run${n} ${TEST_OUTDIR.tests/idmap}/idmap${n}.o ${BUILD}/lib/adt.o +${TEST_OUTDIR.tests/idmap}/res${n}: ${TEST_OUTDIR.tests/idmap}/run${n} + ( ${TEST_OUTDIR.tests/idmap}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/idmap}/res${n} +.endfor diff --git a/tests/idmap/idmap_basic.c b/tests/idmap/idmap_basic.c new file mode 100644 index 000000000..c7a18856b --- /dev/null +++ b/tests/idmap/idmap_basic.c @@ -0,0 +1,137 @@ +/* + * Copyright 2021 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#include +#include +#include + +#include + +#define DEF_LIMIT 10 +#define DEF_SEED 0 + +/* Thes numbers were chose to get a reasonable variety, + * but also some duplicated values as the input grows. */ +#define MAX_GEN_VALUES 23 +#define ID_MASK ((1 << 9) - 1) +#define VALUE_MASK ((1 << 10) - 1) + +static int +dump_cb(fsm_state_t state_id, unsigned value, void *opaque) +{ + /* fprintf(stderr, " -- state %d, value %u\n", state_id, value); */ + assert(state_id <= ID_MASK); + assert(value <= VALUE_MASK); + (void)opaque; + return 1; +} + +static int +cmp_u(const void *pa, const void *pb) +{ + const unsigned a = *(unsigned *)pa; + const unsigned b = *(unsigned *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +int main(int argc, char **argv) { + const size_t limit = (argc > 1 ? atoi(argv[1]) : DEF_LIMIT); + const unsigned seed = (argc > 2 ? 
atoi(argv[2]) : DEF_SEED); + + (void)argc; + (void)argv; + struct idmap *m = idmap_new(NULL); + + srandom(seed); + + /* Fill the table with random data */ + for (size_t id_i = 0; id_i < limit; id_i++) { + const fsm_state_t id = (fsm_state_t)(random() & ID_MASK); + const size_t value_count = random() % MAX_GEN_VALUES; + + for (size_t v_i = 0; v_i < value_count; v_i++) { + const unsigned v = random() & VALUE_MASK; + if (!idmap_set(m, id, v)) { + assert(!"failed to set"); + } + } + } + + idmap_iter(m, dump_cb, NULL); + + srandom(seed); + + size_t got_buf_ceil = MAX_GEN_VALUES; + unsigned *got_buf = malloc(got_buf_ceil * sizeof(got_buf[0])); + assert(got_buf != NULL); + + /* Reset the PRNG and read back the same data. */ + for (size_t id_i = 0; id_i < limit; id_i++) { + const fsm_state_t id = (fsm_state_t)(random() & ID_MASK); + const size_t generated_value_count = random() % MAX_GEN_VALUES; + + /* Note: This can occasionally differ from + * generated_value_count, because the same id or values + * may have been generated more than once. As long as + * all the values match, it's fine. */ + const size_t value_count = idmap_get_value_count(m, id); + + if (value_count > got_buf_ceil) { + size_t nceil = got_buf_ceil; + while (nceil <= value_count) { + nceil *= 2; + } + free(got_buf); + got_buf = malloc(nceil * sizeof(got_buf[0])); + assert(got_buf != NULL); + got_buf_ceil = nceil; + } + + size_t written; + if (!idmap_get(m, id, + got_buf_ceil * sizeof(got_buf[0]), got_buf, + &written)) { + assert(!"failed to get"); + } + assert(written == value_count); + + unsigned gen_buf[MAX_GEN_VALUES]; + + for (size_t v_i = 0; v_i < generated_value_count; v_i++) { + const unsigned v = random() & VALUE_MASK; + gen_buf[v_i] = v; + } + qsort(gen_buf, generated_value_count, sizeof(gen_buf[0]), cmp_u); + + /* Every generated value should appear in the buffer. + * There may be more in the buffer; ignore them. 
*/ + size_t v_i = 0; + for (size_t gen_i = 0; gen_i < generated_value_count; gen_i++) { + int found = 0; + const unsigned gv = gen_buf[gen_i]; + assert(value_count <= got_buf_ceil); + /* got_buf should be sorted, so we can pick up where we left off */ + while (v_i < value_count) { + if (gv == got_buf[v_i]) { + /* Intentionally don't increment v_i on match, + * because gen_buf can repeat values. */ + found = 1; + break; + } + v_i++; + } + if (!found) { + fprintf(stderr, "NOT FOUND: state %d -- value: %u\n", + id, gv); + return EXIT_FAILURE; + } + } + } + + free(got_buf); + idmap_free(m); + return EXIT_SUCCESS; +} diff --git a/tests/ir/Makefile b/tests/ir/Makefile index 0009c45ec..566d1add8 100755 --- a/tests/ir/Makefile +++ b/tests/ir/Makefile @@ -9,7 +9,7 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/ir:T:Mout*.json:R:C/^out//} ${TEST_OUTDIR.tests/ir}/got${n}.json: ${TEST_SRCDIR.tests/ir}/in${n}.re - ${RE} -pl irjson -y ${.ALLSRC:M*.re} \ + ${RE} -FC -pl irjson -y ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/ir}/res${n}: \ diff --git a/tests/minimise/minimise_test_case_list.c b/tests/minimise/minimise_test_case_list.c index 1386f0dcc..7299d2fa1 100644 --- a/tests/minimise/minimise_test_case_list.c +++ b/tests/minimise/minimise_test_case_list.c @@ -22,7 +22,6 @@ const char *test_cases[] = { "(?:a+|b)a+", "(?:a*ba)+", "(?:a|cd)+e?x", - "-> 1 'a';", "(?:abc|def)+", "(?:abc|def)*", "(?:b|a*)", @@ -81,7 +80,7 @@ check_minimisation(const char *pattern) .offset = 0 }; - fsm = re_comp(RE_PCRE, scanner_next, &s, &opt, RE_MULTI, &err); + fsm = re_comp(RE_PCRE, scanner_next, &s, &opt, RE_MULTI | RE_NOCAPTURE, &err); assert(fsm != NULL); if (!fsm_determinise(fsm)) { return 0; diff --git a/tests/native/Makefile b/tests/native/Makefile index 8712e1588..fbca0ca69 100755 --- a/tests/native/Makefile +++ b/tests/native/Makefile @@ -9,11 +9,11 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/native:T:Mout*.fsm:R:C/^out//} ${TEST_OUTDIR.tests/native}/got${n}.fsm: 
${TEST_SRCDIR.tests/native}/in${n}.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/native}/nfa${n}.fsm: ${TEST_SRCDIR.tests/native}/in${n}.re - ${RE} -r native -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -n -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/native}/res${n}: \ @@ -27,7 +27,7 @@ FSMTEST_RESULT += ${TEST_OUTDIR.tests/native}/res${n} .for n in ${TEST.tests/native:T:Mout*.err:R:C/^out//} ${TEST_OUTDIR.tests/native}/got${n}.err: ${TEST_SRCDIR.tests/native}/in${n}.re - ${RE} -r native -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r native -py ${.ALLSRC:M*.re} \ 2> $@; [ $$? -ne 0 ] ${TEST_OUTDIR.tests/native}/res${n}: \ diff --git a/tests/pcre-anchor/Makefile b/tests/pcre-anchor/Makefile index bb9954554..1dc4a77bc 100644 --- a/tests/pcre-anchor/Makefile +++ b/tests/pcre-anchor/Makefile @@ -9,11 +9,11 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/pcre-anchor:T:Mout*.fsm:R:C/^out//} ${TEST_OUTDIR.tests/pcre-anchor}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-anchor}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-anchor}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre-anchor}/in${n}.re - ${RE} -r pcre -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -n -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-anchor}/res${n}: \ diff --git a/tests/pcre-anchor/in81.re b/tests/pcre-anchor/in81.re new file mode 100644 index 000000000..8b5fad7c3 --- /dev/null +++ b/tests/pcre-anchor/in81.re @@ -0,0 +1 @@ +($x)* \ No newline at end of file diff --git a/tests/pcre-anchor/out81.fsm b/tests/pcre-anchor/out81.fsm new file mode 100644 index 000000000..2cdc2f023 --- /dev/null +++ b/tests/pcre-anchor/out81.fsm @@ -0,0 +1,5 @@ +0 -> 0 ?; +0 -> 1 "\n"; + +start: 0; +end: 0, 1; \ No newline at end of file diff --git a/tests/pcre-classes/Makefile b/tests/pcre-classes/Makefile index 0d9809d76..0d459e256 100755 --- a/tests/pcre-classes/Makefile +++ 
b/tests/pcre-classes/Makefile @@ -16,7 +16,7 @@ RE=${BUILD}/bin/re FSM=${BUILD}/bin/fsm ${TEST_OUTDIR.tests/pcre-classes}/dot-all.fsm: - ${RE} -r pcre -p '^[\x00-\xff]$$' | ${FSM} -pm \ + ${RE} -FC -r pcre -p '^[\x00-\xff]$$' | ${FSM} -pm \ > $@ # compl.re tests @@ -32,7 +32,7 @@ ${TEST_OUTDIR.tests/pcre-classes}/dot-all.fsm: .for n in ${TEST.tests/pcre-classes:M*/compl*.re:T:R:C/^compl//} ${TEST_OUTDIR.tests/pcre-classes}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-classes}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*/in*.re} | ${FSM} -pm \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*/in*.re} | ${FSM} -pm \ > $@ ${TEST_OUTDIR.tests/pcre-classes}/got-compl${n}.fsm: ${TEST_OUTDIR.tests/pcre-classes}/got${n}.fsm ${TEST_OUTDIR.tests/pcre-classes}/dot-all.fsm @@ -40,7 +40,7 @@ ${TEST_OUTDIR.tests/pcre-classes}/got-compl${n}.fsm: ${TEST_OUTDIR.tests/pcre-cl > $@ ${TEST_OUTDIR.tests/pcre-classes}/expect-compl${n}.fsm: ${TEST_SRCDIR.tests/pcre-classes}/compl${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*/compl*.re} | ${FSM} -pm \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*/compl*.re} | ${FSM} -pm \ > $@ ${TEST_OUTDIR.tests/pcre-classes}/res${n}: \ @@ -66,11 +66,11 @@ FSMTEST_RESULT += ${TEST_OUTDIR.tests/pcre-classes}/res${n} @echo x: ${n} ${TEST_OUTDIR.tests/pcre-classes}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-classes}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*/in*.re} | ${FSM} -pm \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*/in*.re} | ${FSM} -pm \ > $@ ${TEST_OUTDIR.tests/pcre-classes}/out${n}.fsm: ${TEST_SRCDIR.tests/pcre-classes}/equal${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*/equal*.re} | ${FSM} -pm \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*/equal*.re} | ${FSM} -pm \ > $@ ${TEST_OUTDIR.tests/pcre-classes}/res${n}: \ diff --git a/tests/pcre-flags/Makefile b/tests/pcre-flags/Makefile index 67e70fbbb..308571395 100755 --- a/tests/pcre-flags/Makefile +++ b/tests/pcre-flags/Makefile @@ -13,17 +13,17 @@ RE=${BUILD}/bin/re TEST_OUTDIR.tests/pcre-flags/mode${n} != cat ${TEST_SRCDIR.tests/pcre-flags}/mode${n} 
${TEST_OUTDIR.tests/pcre-flags}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-flags}/in${n}.re - ${RE} -F "${TEST_OUTDIR.tests/pcre-flags/mode${n}}" -b -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -F "${TEST_OUTDIR.tests/pcre-flags/mode${n}}" -b -r pcre -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-flags}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre-flags}/in${n}.re - ${RE} -F "${TEST_OUTDIR.tests/pcre-flags/mode${n}}" -b -r pcre -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -F "${TEST_OUTDIR.tests/pcre-flags/mode${n}}" -b -r pcre -n -py ${.ALLSRC:M*.re} \ > $@ .else ${TEST_OUTDIR.tests/pcre-flags}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-flags}/in${n}.re - ${RE} -b -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -b -r pcre -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-flags}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre-flags}/in${n}.re diff --git a/tests/pcre-repeat/Makefile b/tests/pcre-repeat/Makefile index c325d2f8e..97535b2c3 100755 --- a/tests/pcre-repeat/Makefile +++ b/tests/pcre-repeat/Makefile @@ -12,11 +12,11 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/pcre-repeat:T:Mout*.fsm:R:C/^out//} ${TEST_OUTDIR.tests/pcre-repeat}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre-repeat}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-repeat}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre-repeat}/in${n}.re - ${RE} -r pcre -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -n -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre-repeat}/res${n}: \ @@ -30,7 +30,7 @@ FSMTEST_RESULT += ${TEST_OUTDIR.tests/pcre-repeat}/res${n} .for n in ${TEST.tests/pcre-repeat:T:Mout*.err:R:C/^out//} ${TEST_OUTDIR.tests/pcre-repeat}/got${n}.err: ${TEST_SRCDIR.tests/pcre-repeat}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ 2> $@; [ $$? 
-ne 0 ] ${TEST_OUTDIR.tests/pcre-repeat}/res${n}: \ diff --git a/tests/pcre/Makefile b/tests/pcre/Makefile index 23f879a04..239d2c93b 100755 --- a/tests/pcre/Makefile +++ b/tests/pcre/Makefile @@ -21,7 +21,7 @@ PCREGREP ?= pcregrep # bit of a hack. # 2) removes any trailing \n at the end of the input ${TEST_OUTDIR.tests/pcre-pcregrep}/in${n}.txt: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -mr pcre -y ${.ALLSRC:M*.re} \ + ${RE} -FC -mr pcre -y ${.ALLSRC:M*.re} \ | perl -0pe 's/\\x([0-9a-zA-z]{2})/chr(hex($$1))/ge;' -e 's/\n\Z//' \ > $@ @@ -41,16 +41,16 @@ test:: ${TEST_OUTDIR.tests/pcre-pcregrep}/res${n} .if exists(${TEST_SRCDIR.tests/pcre}/mode${n}) TEST_OUTDIR.tests/pcre/mode${n} != cat ${TEST_SRCDIR.tests/pcre}/mode$n ${TEST_OUTDIR.tests/pcre}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -F "${TEST_OUTDIR.tests/pcre/mode${n}}" -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -F "${TEST_OUTDIR.tests/pcre/mode${n}}" -r pcre -py ${.ALLSRC:M*.re} \ > $@ .else ${TEST_OUTDIR.tests/pcre}/got${n}.fsm: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ > $@ .endif ${TEST_OUTDIR.tests/pcre}/nfa${n}.fsm: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -r pcre -n -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -n -py ${.ALLSRC:M*.re} \ > $@ ${TEST_OUTDIR.tests/pcre}/res${n}: \ @@ -64,7 +64,7 @@ FSMTEST_RESULT += ${TEST_OUTDIR.tests/pcre}/res${n} .for n in ${TEST.tests/pcre:T:Mout*.err:R:C/^out//} ${TEST_OUTDIR.tests/pcre}/got${n}.err: ${TEST_SRCDIR.tests/pcre}/in${n}.re - ${RE} -r pcre -py ${.ALLSRC:M*.re} \ + ${RE} -FC -r pcre -py ${.ALLSRC:M*.re} \ 2> $@; [ $$? 
-ne 0 ] ${TEST_OUTDIR.tests/pcre}/res${n}: \ diff --git a/tests/re_literal/Makefile b/tests/re_literal/Makefile index 1d1333491..9227a5699 100755 --- a/tests/re_literal/Makefile +++ b/tests/re_literal/Makefile @@ -9,7 +9,7 @@ RE=${BUILD}/bin/re .for n in ${TEST.tests/re_literal:T:Mout*.txt:R:C/^out//} ${TEST_OUTDIR.tests/re_literal}/got${n}.txt: ${TEST_SRCDIR.tests/re_literal}/in${n}.re - ( ${RE} -r pcre -t -y ${.ALLSRC:M*.re} || echo non-literal ) \ + ( ${RE} -FC -r pcre -t -y ${.ALLSRC:M*.re} || echo non-literal ) \ > $@ ${TEST_OUTDIR.tests/re_literal}/res${n}: \ diff --git a/theft/Makefile b/theft/Makefile index 0d38d8cfc..921c482a9 100644 --- a/theft/Makefile +++ b/theft/Makefile @@ -6,7 +6,6 @@ SRC += theft/util.c SRC += theft/wrap.c SRC += theft/fuzz_adt_edge_set.c -SRC += theft/fuzz_adt_ipriq.c SRC += theft/fuzz_adt_priq.c SRC += theft/fuzz_capture_string_set.c SRC += theft/fuzz_literals.c diff --git a/theft/fuzz_adt_ipriq.c b/theft/fuzz_adt_ipriq.c deleted file mode 100644 index 1847ef6ce..000000000 --- a/theft/fuzz_adt_ipriq.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2021 Scott Vokes - * - * See LICENCE for the full copyright terms. - */ - -#include "type_info_adt_ipriq.h" - -#include -#include - -struct model { - size_t used; - size_t entries[]; -}; - -static enum ipriq_cmp_res -cmp_size_t(size_t a, size_t b, void *opaque) -{ - (void)opaque; - return a < b ? IPRIQ_CMP_LT : - a > b ? 
IPRIQ_CMP_GT : IPRIQ_CMP_EQ; -} - -static int exec_add(size_t x, struct model *m, struct ipriq *pq) -{ - if (!ipriq_add(pq, x)) { - return 0; - } - - m->entries[m->used] = x; - m->used++; - return 1; -} - -static int find_min_pos(const struct model *m, size_t *pos) -{ - size_t i; - if (m->used == 0) { - return 0; - } - - size_t res, min; - res = 0; - min = m->entries[0]; - - for (i = 1; i < m->used; i++) { - if (m->entries[i] < min) { - res = i; - min = m->entries[i]; - } - } - *pos = res; - return 1; -} - -static int exec_peek(struct model *m, struct ipriq *pq) -{ - size_t res; - - if (!ipriq_peek(pq, &res)) { - return m->used == 0; - } - - size_t pos; - if (!find_min_pos(m, &pos)) { - assert(!"unreachable (peek)"); - } - - return res == m->entries[pos]; -} - -static int exec_pop(struct model *m, struct ipriq *pq) -{ - size_t res; - - if (!ipriq_pop(pq, &res)) { - return m->used == 0; - } - - size_t pos; - if (!find_min_pos(m, &pos)) { - assert(!"unreachable (pop)"); - } - - if (res != m->entries[pos]) { - return 0; - } - - assert(m->used > 0); - if (pos < m->used - 1) { - m->entries[pos] = m->entries[m->used - 1]; - } - m->used--; - return 1; -} - -static enum theft_trial_res -compare_against_model(const struct ipriq_scenario *scen) -{ - enum theft_trial_res res = THEFT_TRIAL_FAIL; - size_t i; - - struct model *m = malloc(sizeof(*m) - + scen->count * sizeof(m->entries[0])); - if (m == NULL) { - return THEFT_TRIAL_ERROR; - } - m->used = 0; - - struct ipriq *pq = ipriq_new(NULL, cmp_size_t, NULL); - if (pq == NULL) { - return THEFT_TRIAL_ERROR; - } - - for (i = 0; i < scen->count; i++) { - const struct ipriq_op *op = &scen->ops[i]; - - switch (op->t) { - case IPRIQ_OP_ADD: - if (!exec_add(op->u.add.x, m, pq)) { - goto cleanup; - } - break; - - case IPRIQ_OP_PEEK: - if (!exec_peek(m, pq)) { - goto cleanup; - } - break; - - case IPRIQ_OP_POP: - if (!exec_pop(m, pq)) { - goto cleanup; - } - break; - - default: - assert(false); break; - } - } - - res = 
THEFT_TRIAL_PASS; - -cleanup: - free(m); - - return res; -} - -static enum theft_trial_res -prop_ipriq_model(struct theft *t, void *arg1) -{ - const struct ipriq_scenario *scen = arg1; - (void)t; - return compare_against_model(scen); -} - -static bool -test_ipriq(theft_seed seed, uintptr_t limit) -{ - enum theft_run_res res; - - struct ipriq_hook_env env = { - .tag = 'I', - .limit = limit, - }; - - struct theft_run_config config = { - .name = __func__, - .prop1 = prop_ipriq_model, - .type_info = { &type_info_adt_ipriq }, - .trials = 1000, - .hooks = { - .trial_pre = theft_hook_first_fail_halt, - .env = &env, - }, - .fork = { - .enable = true, - }, - - .seed = seed, - }; - - (void)limit; - - res = theft_run(&config); - printf("%s: %s\n", __func__, theft_run_res_str(res)); - - return res == THEFT_RUN_PASS; -} - -void -register_test_adt_ipriq(void) -{ - reg_test1("adt_ipriq", test_ipriq, 10000); -} diff --git a/theft/fuzz_capture_string_set.c b/theft/fuzz_capture_string_set.c index 7326356c2..f225bb326 100644 --- a/theft/fuzz_capture_string_set.c +++ b/theft/fuzz_capture_string_set.c @@ -158,7 +158,7 @@ check_capstring_set(struct capture_env *env, return THEFT_TRIAL_ERROR; } - const size_t capture_count = fsm_countcaptures(dfa); + const size_t capture_count = fsm_capture_ceiling(dfa); if (verbosity > 2) { fprintf(stderr, "==== cs '%s'\n", cs->string); @@ -172,7 +172,7 @@ check_capstring_set(struct capture_env *env, assert(cp != NULL); fsm_copies[cs_i] = cp; - const size_t cp_capture_count = fsm_countcaptures(cp); + const size_t cp_capture_count = fsm_capture_ceiling(cp); if (verbosity > 2) { fprintf(stderr, "==== min(det(cp))\n"); fsm_print_fsm(stderr, cp); @@ -196,7 +196,7 @@ check_capstring_set(struct capture_env *env, return THEFT_TRIAL_FAIL; } - combined_capture_count = fsm_countcaptures(combined); + combined_capture_count = fsm_capture_ceiling(combined); for (size_t cs_i = 0; cs_i < css->count; cs_i++) { total_captures += capture_counts[cs_i]; } @@ -295,7 +295,7 
@@ check_fsms_for_single_input(struct check_env *env, struct fsm_capture *captures, assert(exec_res >= 0); if (exec_res == 1) { if (LOG_LEVEL > 0) { - const size_t combined_capture_count = fsm_countcaptures(env->combined); + const size_t combined_capture_count = fsm_capture_ceiling(env->combined); for (size_t i = 0; i < combined_capture_count; i++) { fprintf(stderr, "capture[%zu/%zu]: (%ld, %ld)\n", i, combined_capture_count, @@ -415,7 +415,7 @@ compare_captures(const struct check_env *env, const struct fsm_capture *captures_combined, size_t nth_fsm, const struct fsm_capture *captures) { - const size_t combined_capture_count = fsm_countcaptures(env->combined); + const size_t combined_capture_count = fsm_capture_ceiling(env->combined); if (combined_capture_count == 0) { return true; /* no captures */ } @@ -639,7 +639,7 @@ build_capstring_dfa(const struct capstring *cs, uint8_t end_id) goto cleanup; } - if (fsm_countcaptures(fsm) != cs->capture_count) { + if (fsm_capture_ceiling(fsm) != cs->capture_count) { goto cleanup; }