Skip to content

Commit

Permalink
Merge branch 'sv/determine-which-characters-must-appear-in-input-to-m…
Browse files Browse the repository at this point in the history
…atch' into sv/tmp-integration-branch-to-vendor-for-da2lx-subside-development
  • Loading branch information
silentbicycle committed Sep 24, 2024
2 parents 19245f0 + aab4422 commit a8229c2
Show file tree
Hide file tree
Showing 33 changed files with 657 additions and 2 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ SUBDIR += tests/intersect
SUBDIR += tests/eclosure
SUBDIR += tests/equals
SUBDIR += tests/subtract
SUBDIR += tests/detect_required
SUBDIR += tests/determinise
SUBDIR += tests/eager_output
SUBDIR += tests/endids
Expand Down
18 changes: 18 additions & 0 deletions include/adt/bitmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
#define ADT_BITMAP_H

#include <stdint.h>
#include <limits.h>

#include <stdio.h>
#include "print/esc.h"

struct fsm_state;
Expand All @@ -23,6 +26,9 @@ bm_get(const struct bm *bm, size_t i);
void
bm_set(struct bm *bm, size_t i);

/* Clear bit i in the bitmap via u64bitset_clear.
 * The implementation asserts that bm is non-NULL and i <= UCHAR_MAX. */
void
bm_unset(struct bm *bm, size_t i);

/* Get a writeable pointer to the Nth word of the char set bitmap,
* or NULL if out of bounds. */
uint64_t *
Expand Down Expand Up @@ -51,5 +57,17 @@ bm_snprint(const struct bm *bm, const struct fsm_options *opt,
int boxed,
escputc *escputc);

/* Overwrite dst with a byte-for-byte copy of src. */
void
bm_copy(struct bm *dst, const struct bm *src);

/* In-place intersection: each word of dst is ANDed with the
 * corresponding word of src. src is not modified. */
void
bm_intersect(struct bm *dst, const struct bm *src);

/* In-place union: each word of dst is ORed with the
 * corresponding word of src. src is not modified. */
void
bm_union(struct bm *dst, const struct bm *src);

/* Return 1 if any word of the bitmap is non-zero, otherwise 0. */
int
bm_any(const struct bm *bm);

#endif

31 changes: 31 additions & 0 deletions include/fsm/walk.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#ifndef FSM_WALK_H
#define FSM_WALK_H

#include <adt/bitmap.h>

struct fsm;
struct fsm_state;

Expand Down Expand Up @@ -128,5 +130,34 @@ fsm_generate_matches_cb fsm_generate_cb_printf;
* to escape all characters or just nonprintable ones. */
fsm_generate_matches_cb fsm_generate_cb_printf_escaped;

/* Walk a DFA and detect which characters MUST appear in the input for a
* match to be possible. For example, if input for the DFA corresponding
* to /^(abc|dbe)$/ does not contain 'b' at all, there's no way it can
* ever match, so executing the regex is unnecessary. This does not detect
* which characters must appear before/after others or how many times, just
* which must be present.
*
* The input must be a DFA. When run with EXPENSIVE_CHECKS this will
* check and return ERROR_MISUSE if it is not, otherwise this is an
* unchecked error.
*
* The character map will be cleared before populating. If count is
* non-NULL, *count will be updated with how many required characters
* were found.
*
* There is an optional step_limit -- if this is reached, then it will
* return FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED and a
* cleared bitmap, because any partial information could still have been
* contradicted later. If the step_limit is 0 it will be ignored. */
/* Result codes for fsm_detect_required_characters().
 * Non-error results are >= 0; error results are negative. */
enum fsm_detect_required_characters_res {
/* Normal completion; the fsm(1) caller reads charmap and *count
 * only after seeing this result. */
FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN,
/* A non-zero step_limit was reached; the bitmap is returned cleared. */
FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED,
/* The input was not a DFA (only detected with EXPENSIVE_CHECKS). */
FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE = -1,
/* NOTE(review): presumably an allocation failure -- confirm against
 * the implementation in src/libfsm/detect_required.c. */
FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC = -2,
};
/* dfa:        automaton to analyse; must be a DFA.
 * step_limit: optional bound on work; 0 means no limit.
 * charmap:    256-bit map, one bit per byte value; the fsm(1) caller
 *             tests bits 0..255 with u64bitset_get.
 * count:      receives the number of required characters. */
enum fsm_detect_required_characters_res
fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit,
uint64_t charmap[4], size_t *count);

#endif

16 changes: 16 additions & 0 deletions man/fsm.1/fsm.1.xml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
<!ENTITY G.opt "<option>-G</option>&nbsp;&length.arg;">
<!ENTITY k.opt "<option>-k</option>&nbsp;&io.arg;">
<!ENTITY i.opt "<option>-i</option>&nbsp;&iterations.arg;">
<!ENTITY S.opt "<option>-S</option>&nbsp;&limit.arg;">
<!ENTITY U.opt "<option>-U</option>&nbsp;&charset.arg;">
<!ENTITY X.opt "<option>-X</option>">

Expand Down Expand Up @@ -325,6 +326,14 @@
</listitem>
</varlistentry>

<varlistentry>
<term>&S.opt;</term>

<listitem>
<para>Set a step limit for long-running operations.</para>
</listitem>
</varlistentry>

<varlistentry>
<term>&t.opt;</term>

Expand Down Expand Up @@ -487,6 +496,13 @@
of each state in the &fsm;.
Printed to &stdout.lit;; exit status is always true.</td>
</tr>
<tr>
<td><code>requiredchars</code></td>
<td rowspan="1" role="na">&ndash;</td>
<td rowspan="1">Determine characters that must appear in any
inputs that could match the &fsm;. Exit status is true
unless the step limit was reached.</td>
</tr>
</tbody>
</table>
</listitem>
Expand Down
41 changes: 41 additions & 0 deletions src/adt/bitmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <limits.h>
#include <ctype.h>
#include <stdint.h>
#include <string.h>

#include <adt/bitmap.h>
#include <adt/u64bitset.h>
Expand All @@ -34,6 +35,15 @@ bm_set(struct bm *bm, size_t i)
u64bitset_set(bm->map, i);
}

/* Clear bit i of the bitmap.
 * Counterpart to bm_set; i indexes a single byte value, so it is
 * asserted to be no larger than UCHAR_MAX. */
void
bm_unset(struct bm *bm, size_t i)
{
assert(bm != NULL);
assert(i <= UCHAR_MAX);

/* delegate the word/bit arithmetic to the shared u64bitset helper */
u64bitset_clear(bm->map, i);
}

uint64_t *
bm_nth_word(struct bm *bm, size_t n)
{
Expand Down Expand Up @@ -325,3 +335,34 @@ bm_snprint(const struct bm *bm, const struct fsm_options *opt,

return -1;
}

/* Overwrite dst with a byte-for-byte copy of src. */
void
bm_copy(struct bm *dst, const struct bm *src)
{
	const size_t nbytes = sizeof *src;

	memcpy(dst, src, nbytes);
}

/* In-place intersection: after the call, dst holds only the bits that
 * were set in both dst and src. src is left untouched. */
void
bm_intersect(struct bm *dst, const struct bm *src)
{
	const size_t word_count = sizeof src->map / sizeof src->map[0];
	size_t w;

	for (w = 0; w < word_count; w++) {
		dst->map[w] = dst->map[w] & src->map[w];
	}
}

/* In-place union: after the call, dst holds every bit that was set in
 * either dst or src. src is left untouched. */
void
bm_union(struct bm *dst, const struct bm *src)
{
	const size_t word_count = sizeof src->map / sizeof src->map[0];
	size_t w;

	for (w = 0; w < word_count; w++) {
		dst->map[w] = dst->map[w] | src->map[w];
	}
}

/* Report whether the bitmap has at least one bit set.
 * Returns 1 as soon as a non-zero word is found, 0 if every word is zero. */
int
bm_any(const struct bm *bm)
{
	const size_t word_count = sizeof bm->map / sizeof bm->map[0];
	size_t w = 0;

	while (w < word_count) {
		if (bm->map[w] != 0) {
			return 1;
		}
		w++;
	}

	return 0;
}
50 changes: 48 additions & 2 deletions src/fsm/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <fsm/parser.h>

#include <adt/stateset.h> /* XXX */
#include <adt/u64bitset.h>

#include "libfsm/internal.h" /* XXX */

Expand Down Expand Up @@ -101,6 +102,16 @@ query_epsilonclosure(const struct fsm *fsm, fsm_state_t state)
abort();
}

/* Sentinel for the "requiredchars" / "chars" query.
 * The query table registers this function with a NULL walk callback, and
 * main() compares the selected query pointer against it and handles the
 * request itself. The body is therefore never expected to execute and
 * aborts if it does. */
static int
query_required_chars(const struct fsm *fsm, fsm_state_t state)
{
/* parameters exist only to match the query callback signature */
(void) fsm;
(void) state;

/* never called */
abort();
}

static void
usage(void)
{
Expand Down Expand Up @@ -227,7 +238,9 @@ static int
{ "hasambiguity", fsm_has, fsm_hasnondeterminism },
{ "hasnondeterminism", fsm_has, fsm_hasnondeterminism },
{ "hasepsilons", fsm_has, fsm_hasepsilons },
{ "epsilons", fsm_has, fsm_hasepsilons }
{ "epsilons", fsm_has, fsm_hasepsilons },
{ "requiredchars", NULL, query_required_chars },
{ "chars", NULL, query_required_chars },
};

assert(name != NULL);
Expand Down Expand Up @@ -378,6 +391,7 @@ main(int argc, char *argv[])
int xfiles;
int r;
size_t generate_bounds = 0;
size_t step_limit = 0;

int (*query)(const struct fsm *, fsm_state_t);
int (*walk )(const struct fsm *,
Expand All @@ -404,7 +418,7 @@ main(int argc, char *argv[])
{
int c;

while (c = getopt(argc, argv, "h" "aCcgwXe:k:i:" "xpq:l:dG:mrt:EU:W:"), c != -1) {
while (c = getopt(argc, argv, "h" "aCcgwXe:k:i:" "xpq:l:dG:mrt:ES:U:W:"), c != -1) {
switch (c) {
case 'a': opt.anonymous_states = 1; break;
case 'c': opt.consolidate_edges = 1; break;
Expand Down Expand Up @@ -451,6 +465,10 @@ main(int argc, char *argv[])
}
break;

case 'S':
step_limit = strtoul(optarg, NULL, 10);
break; /* can be 0 */

case 'h':
usage();
exit(EXIT_SUCCESS);
Expand Down Expand Up @@ -669,6 +687,34 @@ main(int argc, char *argv[])
closure_free(fsm, closures, fsm->statecount);

return 0;
} else if (query == query_required_chars) {
assert(walk == NULL);
uint64_t charmap[4];
size_t count;
enum fsm_detect_required_characters_res res;
res = fsm_detect_required_characters(fsm, step_limit, charmap, &count);
if (res == FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED) {
fprintf(stderr, "fsm_detect_required_characters: step limit reached (%zd)\n", step_limit);
exit(EXIT_FAILURE);
} else {
assert(res == FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN);
char buf[257] = {0};
size_t used = 0;
for (size_t i = 0; i < 256; i++) {
if (u64bitset_get(charmap, i)) {
buf[used++] = (char)i;
}
}
printf("%zd ", count);
for (size_t i = 0; i < used; i++) {
c_escputc_str(stdout, &opt, buf[i]);
}
printf("\n");

fsm_free(fsm);
fsm_to_cleanup = NULL;
return EXIT_SUCCESS;
}
} else {
assert(walk != NULL);
r |= !walk(fsm, query);
Expand Down
1 change: 1 addition & 0 deletions src/libfsm/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ SRC += src/libfsm/complete.c
SRC += src/libfsm/consolidate.c
SRC += src/libfsm/clone.c
SRC += src/libfsm/closure.c
SRC += src/libfsm/detect_required.c
SRC += src/libfsm/eager_output.c
SRC += src/libfsm/edge.c
SRC += src/libfsm/empty.c
Expand Down
Loading

0 comments on commit a8229c2

Please sign in to comment.