From 7abff83de06bf50d73d6219e0f4d281f16e1987a Mon Sep 17 00:00:00 2001 From: Kate F Date: Thu, 8 Jun 2023 16:35:33 -0700 Subject: [PATCH] Disallow \C --- src/libre/dialect/pcre/lexer.c | 581 ++++++++++++++++---------------- src/libre/dialect/pcre/lexer.lx | 4 + tests/pcre/in47.re | 1 + tests/pcre/out47.err | 1 + 4 files changed, 297 insertions(+), 290 deletions(-) create mode 100644 tests/pcre/in47.re create mode 100644 tests/pcre/out47.err diff --git a/src/libre/dialect/pcre/lexer.c b/src/libre/dialect/pcre/lexer.c index 177566b9b..b26096785 100644 --- a/src/libre/dialect/pcre/lexer.c +++ b/src/libre/dialect/pcre/lexer.c @@ -188,7 +188,7 @@ z0(struct lx_pcre_lx *lx) } break; - case S2: /* e.g. "\\x01" */ + case S2: /* e.g. "a" */ lx_pcre_ungetc(lx, c); return TOK_CHAR; case S3: /* e.g. "\\E" */ @@ -351,7 +351,7 @@ z2(struct lx_pcre_lx *lx) } break; - case S2: /* e.g. "\\x01" */ + case S2: /* e.g. "a" */ lx_pcre_ungetc(lx, c); return TOK_CHAR; case S3: /* e.g. "\\E" */ @@ -441,8 +441,22 @@ z3(struct lx_pcre_lx *lx) case 'h': case 's': case 'v': - case 'w': state = S23; break; - case 'c': state = S55; break; + case 'w': state = S24; break; + case 'Q': state = S55; break; + case 'E': state = S56; break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': state = S57; break; + case 'x': state = S58; break; + case '0': state = S59; break; + case 'o': state = S60; break; + case 'c': state = S61; break; case '$': case '(': case '*': @@ -461,21 +475,7 @@ z3(struct lx_pcre_lx *lx) case 'r': case 't': case '{': - case '|': state = S56; break; - case 'E': state = S57; break; - case 'Q': state = S58; break; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': state = S59; break; - case 'x': state = S60; break; - case 'o': state = S61; break; - case '0': state = S63; break; + case '|': state = S63; break; default: state = S62; break; } break; @@ -487,7 +487,7 @@ z3(struct lx_pcre_lx *lx) } break; - case S3: /* e.g. "\\x01" */ + case S3: /* e.g. "a" */ lx_pcre_ungetc(lx, c); return TOK_CHAR; case S4: /* e.g. "-" */ @@ -506,16 +506,16 @@ z3(struct lx_pcre_lx *lx) case S7: /* e.g. "[:" */ switch ((unsigned char) c) { case 'd': state = S8; break; - case 'p': state = S9; break; - case 'x': state = S10; break; - case 'c': state = S11; break; - case 'l': state = S12; break; - case 'g': state = S13; break; - case 's': state = S14; break; + case 'u': state = S9; break; + case 'w': state = S10; break; + case 'x': state = S11; break; + case 'b': state = S12; break; + case 'c': state = S13; break; + case 'l': state = S14; break; case 'a': state = S15; break; - case 'b': state = S16; break; - case 'u': state = S17; break; - case 'w': state = S18; break; + case 's': state = S16; break; + case 'p': state = S17; break; + case 'g': state = S18; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -527,309 +527,309 @@ z3(struct lx_pcre_lx *lx) } break; - case S9: /* e.g. "[:p" */ + case S9: /* e.g. "[:u" */ switch ((unsigned char) c) { - case 'r': state = S48; break; - case 'u': state = S49; break; + case 'p': state = S52; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S10: /* e.g. "[:x" */ + case S10: /* e.g. "[:w" */ switch ((unsigned char) c) { - case 'd': state = S8; break; + case 'o': state = S50; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S11: /* e.g. "[:c" */ + case S11: /* e.g. "[:x" */ switch ((unsigned char) c) { - case 'n': state = S45; break; + case 'd': state = S8; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S12: /* e.g. "[:l" */ + case S12: /* e.g. "[:b" */ switch ((unsigned char) c) { - case 'o': state = S44; break; + case 'l': state = S47; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S13: /* e.g. "[:g" */ + case S13: /* e.g. "[:c" */ switch ((unsigned char) c) { - case 'r': state = S41; break; + case 'n': state = S44; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S14: /* e.g. "[:s" */ + case S14: /* e.g. "[:l" */ switch ((unsigned char) c) { - case 'p': state = S38; break; + case 'o': state = S41; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S15: /* e.g. "[:a" */ switch ((unsigned char) c) { - case 's': state = S30; break; - case 'l': state = S31; break; + case 's': state = S33; break; + case 'l': state = S34; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S16: /* e.g. "[:b" */ + case S16: /* e.g. "[:s" */ switch ((unsigned char) c) { - case 'l': state = S27; break; + case 'p': state = S30; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S17: /* e.g. "[:u" */ + case S17: /* e.g. "[:p" */ switch ((unsigned char) c) { - case 'p': state = S24; break; + case 'r': state = S25; break; + case 'u': state = S26; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S18: /* e.g. "[:w" */ + case S18: /* e.g. "[:g" */ switch ((unsigned char) c) { - case 'o': state = S19; break; + case 'r': state = S19; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S19: /* e.g. "[:wo" */ + case S19: /* e.g. "[:gr" */ switch ((unsigned char) c) { - case 'r': state = S20; break; + case 'a': state = S20; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S20: /* e.g. "[:wor" */ + case S20: /* e.g. "[:gra" */ switch ((unsigned char) c) { - case 'd': state = S21; break; + case 'p': state = S21; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S21: /* e.g. "[:word" */ + case S21: /* e.g. "[:grap" */ switch ((unsigned char) c) { - case ':': state = S22; break; + case 'h': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S22: /* e.g. "[:word:" */ + case S22: /* e.g. "[:word" */ switch ((unsigned char) c) { - case ']': state = S23; break; + case ':': state = S23; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S23: /* e.g. "\\D" */ - lx_pcre_ungetc(lx, c); return TOK_NAMED__CLASS; - - case S24: /* e.g. "[:up" */ + case S23: /* e.g. "[:word:" */ switch ((unsigned char) c) { - case 'p': state = S25; break; + case ']': state = S24; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S25: /* e.g. "[:low" */ + case S24: /* e.g. "\\d" */ + lx_pcre_ungetc(lx, c); return TOK_NAMED__CLASS; + + case S25: /* e.g. "[:pr" */ switch ((unsigned char) c) { - case 'e': state = S26; break; + case 'i': state = S29; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S26: /* e.g. "[:lowe" */ + case S26: /* e.g. "[:pu" */ switch ((unsigned char) c) { - case 'r': state = S21; break; + case 'n': state = S27; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S27: /* e.g. "[:bl" */ + case S27: /* e.g. "[:pun" */ switch ((unsigned char) c) { - case 'a': state = S28; break; + case 'c': state = S28; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S28: /* e.g. "[:bla" */ + case S28: /* e.g. "[:digi" */ switch ((unsigned char) c) { - case 'n': state = S29; break; + case 't': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S29: /* e.g. "[:blan" */ + case S29: /* e.g. "[:pri" */ switch ((unsigned char) c) { - case 'k': state = S21; break; + case 'n': state = S28; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S30: /* e.g. "[:as" */ + case S30: /* e.g. "[:sp" */ switch ((unsigned char) c) { - case 'c': state = S36; break; + case 'a': state = S31; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S31: /* e.g. "[:al" */ + case S31: /* e.g. "[:spa" */ switch ((unsigned char) c) { - case 'p': state = S32; break; - case 'n': state = S33; break; + case 'c': state = S32; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S32: /* e.g. "[:alp" */ + case S32: /* e.g. "[:spac" */ switch ((unsigned char) c) { - case 'h': state = S35; break; + case 'e': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S33: /* e.g. "[:aln" */ + case S33: /* e.g. "[:as" */ switch ((unsigned char) c) { - case 'u': state = S34; break; + case 'c': state = S39; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S34: /* e.g. "[:alnu" */ + case S34: /* e.g. "[:al" */ switch ((unsigned char) c) { - case 'm': state = S21; break; + case 'n': state = S35; break; + case 'p': state = S36; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S35: /* e.g. "[:alph" */ + case S35: /* e.g. "[:aln" */ switch ((unsigned char) c) { - case 'a': state = S21; break; + case 'u': state = S38; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S36: /* e.g. "[:asc" */ + case S36: /* e.g. "[:alp" */ switch ((unsigned char) c) { - case 'i': state = S37; break; + case 'h': state = S37; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S37: /* e.g. "[:asci" */ + case S37: /* e.g. "[:alph" */ switch ((unsigned char) c) { - case 'i': state = S21; break; + case 'a': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S38: /* e.g. "[:sp" */ + case S38: /* e.g. "[:alnu" */ switch ((unsigned char) c) { - case 'a': state = S39; break; + case 'm': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S39: /* e.g. "[:spa" */ + case S39: /* e.g. "[:asc" */ switch ((unsigned char) c) { - case 'c': state = S40; break; + case 'i': state = S40; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S40: /* e.g. "[:spac" */ + case S40: /* e.g. "[:asci" */ switch ((unsigned char) c) { - case 'e': state = S21; break; + case 'i': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S41: /* e.g. "[:gr" */ + case S41: /* e.g. "[:lo" */ switch ((unsigned char) c) { - case 'a': state = S42; break; + case 'w': state = S42; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S42: /* e.g. "[:gra" */ + case S42: /* e.g. "[:low" */ switch ((unsigned char) c) { - case 'p': state = S43; break; + case 'e': state = S43; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S43: /* e.g. "[:grap" */ + case S43: /* e.g. "[:lowe" */ switch ((unsigned char) c) { - case 'h': state = S21; break; + case 'r': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S44: /* e.g. "[:lo" */ + case S44: /* e.g. "[:cn" */ switch ((unsigned char) c) { - case 'w': state = S25; break; + case 't': state = S45; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S45: /* e.g. "[:cn" */ + case S45: /* e.g. "[:cnt" */ switch ((unsigned char) c) { - case 't': state = S46; break; + case 'r': state = S46; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S46: /* e.g. "[:cnt" */ + case S46: /* e.g. "[:cntr" */ switch ((unsigned char) c) { - case 'r': state = S47; break; + case 'l': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S47: /* e.g. "[:cntr" */ + case S47: /* e.g. "[:bl" */ switch ((unsigned char) c) { - case 'l': state = S21; break; + case 'a': state = S48; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S48: /* e.g. "[:pr" */ + case S48: /* e.g. "[:bla" */ switch ((unsigned char) c) { - case 'i': state = S52; break; + case 'n': state = S49; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S49: /* e.g. "[:pu" */ + case S49: /* e.g. "[:blan" */ switch ((unsigned char) c) { - case 'n': state = S50; break; + case 'k': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S50: /* e.g. "[:pun" */ + case S50: /* e.g. "[:wo" */ switch ((unsigned char) c) { - case 'c': state = S51; break; + case 'r': state = S51; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S51: /* e.g. "[:digi" */ + case S51: /* e.g. "[:wor" */ switch ((unsigned char) c) { - case 't': state = S21; break; + case 'd': state = S22; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S52: /* e.g. "[:pri" */ + case S52: /* e.g. "[:up" */ switch ((unsigned char) c) { - case 'n': state = S51; break; + case 'p': state = S42; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -843,24 +843,18 @@ z3(struct lx_pcre_lx *lx) case S54: /* e.g. "[:dig" */ switch ((unsigned char) c) { - case 'i': state = S51; break; + case 'i': state = S28; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S55: /* e.g. "\\c" */ - state = S72; break; - - case S56: /* e.g. "\\$" */ - lx_pcre_ungetc(lx, c); return TOK_ESC; + case S55: /* e.g. "\\Q" */ + lx_pcre_ungetc(lx, c); return lx->z = z2, lx->z(lx); - case S57: /* e.g. "\\E" */ + case S56: /* e.g. "\\E" */ lx_pcre_ungetc(lx, c); return lx->z(lx); - case S58: /* e.g. "\\Q" */ - lx_pcre_ungetc(lx, c); return lx->z = z2, lx->z(lx); - - case S59: /* e.g. "\\1" */ + case S57: /* e.g. "\\1" */ switch ((unsigned char) c) { case '0': case '1': @@ -876,9 +870,9 @@ z3(struct lx_pcre_lx *lx) } break; - case S60: /* e.g. "\\x" */ + case S58: /* e.g. "\\x" */ switch ((unsigned char) c) { - case '{': state = S68; break; + case '{': state = S69; break; case '0': case '1': case '2': @@ -900,22 +894,12 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S69; break; + case 'f': state = S70; break; default: lx_pcre_ungetc(lx, c); return TOK_HEX; } break; - case S61: /* e.g. "\\o" */ - switch ((unsigned char) c) { - case '{': state = S66; break; - default: lx_pcre_ungetc(lx, c); return TOK_NOESC; - } - break; - - case S62: /* e.g. "\\\\x00" */ - lx_pcre_ungetc(lx, c); return TOK_NOESC; - - case S63: /* e.g. "\\0" */ + case S59: /* e.g. "\\0" */ switch ((unsigned char) c) { case '0': case '1': @@ -924,12 +908,31 @@ z3(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S64; break; + case '7': state = S68; break; default: lx_pcre_ungetc(lx, c); return TOK_OCT; } break; - case S64: /* e.g. "\\00" */ + case S60: /* e.g. "\\o" */ + switch ((unsigned char) c) { + case '{': state = S65; break; + default: lx_pcre_ungetc(lx, c); return TOK_NOESC; + } + break; + + case S61: /* e.g. "\\c" */ + state = S64; break; + + case S62: /* e.g. "\\g" */ + lx_pcre_ungetc(lx, c); return TOK_NOESC; + + case S63: /* e.g. "\\a" */ + lx_pcre_ungetc(lx, c); return TOK_ESC; + + case S64: /* e.g. "\\ca" */ + lx_pcre_ungetc(lx, c); return TOK_CONTROL; + + case S65: /* e.g. "\\o{" */ switch ((unsigned char) c) { case '0': case '1': @@ -938,15 +941,12 @@ z3(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S65; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; + case '7': state = S66; break; + default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S65: /* e.g. "\\000" */ - lx_pcre_ungetc(lx, c); return TOK_OCT; - - case S66: /* e.g. "\\o{" */ + case S66: /* e.g. "\\o{0" */ switch ((unsigned char) c) { case '0': case '1': @@ -955,14 +955,17 @@ z3(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S67; break; + case '7': break; + case '}': state = S67; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S67: /* e.g. "\\o{0" */ + case S67: /* e.g. "\\000" */ + lx_pcre_ungetc(lx, c); return TOK_OCT; + + case S68: /* e.g. "\\00" */ switch ((unsigned char) c) { - case '}': state = S65; break; case '0': case '1': case '2': @@ -970,12 +973,12 @@ z3(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case '7': state = S67; break; + default: lx_pcre_ungetc(lx, c); return TOK_OCT; } break; - case S68: /* e.g. "\\x{" */ + case S69: /* e.g. "\\x{" */ switch ((unsigned char) c) { case '0': case '1': @@ -998,12 +1001,12 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S71; break; + case 'f': state = S72; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S69: /* e.g. "\\x0" */ + case S70: /* e.g. "\\xa" */ switch ((unsigned char) c) { case '0': case '1': @@ -1026,17 +1029,17 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S70; break; + case 'f': state = S71; break; default: lx_pcre_ungetc(lx, c); return TOK_HEX; } break; - case S70: /* e.g. "\\x00" */ + case S71: /* e.g. "\\xaa" */ lx_pcre_ungetc(lx, c); return TOK_HEX; - case S71: /* e.g. "\\x{0" */ + case S72: /* e.g. "\\x{a" */ switch ((unsigned char) c) { - case '}': state = S70; break; + case '}': state = S71; break; case '0': case '1': case '2': @@ -1063,16 +1066,13 @@ z3(struct lx_pcre_lx *lx) } break; - case S72: /* e.g. "\\c\\x00" */ - lx_pcre_ungetc(lx, c); return TOK_CONTROL; - default: ; /* unreached */ } switch (state) { - case S57: - case S58: + case S55: + case S56: break; default: @@ -1096,21 +1096,21 @@ z3(struct lx_pcre_lx *lx) case S4: return TOK_RANGE; case S5: return TOK_CLOSEGROUP; case S6: return TOK_CLOSEGROUPRANGE; - case S23: return TOK_NAMED__CLASS; - case S55: return TOK_NOESC; - case S56: return TOK_ESC; - case S57: return TOK_EOF; - case S58: return TOK_EOF; - case S59: return TOK_UNSUPPORTED; - case S60: return TOK_HEX; + case S24: return TOK_NAMED__CLASS; + case S55: return TOK_EOF; + case S56: return TOK_EOF; + case S57: return TOK_UNSUPPORTED; + case S58: return TOK_HEX; + case S59: return TOK_OCT; + case S60: return TOK_NOESC; case S61: return TOK_NOESC; case S62: return TOK_NOESC; - case S63: return TOK_OCT; - case S64: return TOK_OCT; - case S65: return TOK_OCT; - case S69: return TOK_HEX; + case S63: return TOK_ESC; + case S64: return TOK_CONTROL; + case S67: return TOK_OCT; + case S68: return TOK_OCT; case S70: return TOK_HEX; - case S72: return TOK_CONTROL; + case S71: return TOK_HEX; default: errno = EINVAL; return TOK_ERROR; } } @@ -1188,7 +1188,7 @@ z4(struct lx_pcre_lx *lx) } break; - case S1: /* e.g. "+" */ + case S1: /* e.g. "R" */ lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; case S2: /* e.g. "0" */ @@ -1297,7 +1297,7 @@ z5(struct lx_pcre_lx *lx) } break; - case S1: /* e.g. "\\x00" */ + case S1: /* e.g. "a" */ switch ((unsigned char) c) { case '(': case ')': lx_pcre_ungetc(lx, c); return lx->z(lx); @@ -1932,7 +1932,7 @@ z6(struct lx_pcre_lx *lx) } break; - case S78: /* e.g. ":\\x00" */ + case S78: /* e.g. ":a" */ switch ((unsigned char) c) { case ')': lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; default: break; @@ -2028,7 +2028,7 @@ z7(struct lx_pcre_lx *lx) } break; - case S1: /* e.g. "\\x01" */ + case S1: /* e.g. "a" */ lx_pcre_ungetc(lx, c); return TOK_CHAR; case S2: /* e.g. "\\" */ @@ -2036,12 +2036,30 @@ z7(struct lx_pcre_lx *lx) case 'Z': state = S14; break; case 'A': state = S15; break; case 'B': + case 'C': case 'G': case 'K': case 'X': case 'b': case 'g': - case 'k': state = S21; break; + case 'k': state = S20; break; + case 'Q': state = S28; break; + case 'o': state = S29; break; + case 'c': state = S30; break; + case 'x': state = S32; break; + case '0': state = S33; break; + case 'R': state = S34; break; + case 'D': + case 'H': + case 'N': + case 'S': + case 'V': + case 'W': + case 'd': + case 'h': + case 's': + case 'v': + case 'w': state = S35; break; case '$': case '(': case ')': @@ -2059,9 +2077,8 @@ z7(struct lx_pcre_lx *lx) case 'r': case 't': case '{': - case '|': state = S28; break; - case 'E': state = S29; break; - case 'z': state = S30; break; + case '|': state = S36; break; + case 'E': state = S37; break; case '1': case '2': case '3': @@ -2070,25 +2087,9 @@ z7(struct lx_pcre_lx *lx) case '6': case '7': case '8': - case '9': state = S31; break; - case 'Q': state = S32; break; - case 'c': state = S33; break; - case 'o': state = S34; break; - case 'x': state = S36; break; - case 'D': - case 'H': - case 'N': - case 'S': - case 'V': - case 'W': - case 'd': - case 'h': - case 's': - case 'v': - case 'w': state = S37; break; - case 'R': state = S38; break; - case '0': state = S39; break; - default: state = S35; break; + case '9': state = S38; break; + case 'z': state = S39; break; + default: state = S31; break; } break; @@ -2147,11 +2148,11 @@ z7(struct lx_pcre_lx *lx) case S17: /* e.g. "(?" */ switch ((unsigned char) c) { case '#': state = S19; break; - case '<': state = S20; break; case '!': case '&': - case '=': state = S21; break; - case 'P': state = S22; break; + case '=': state = S20; break; + case 'P': state = S21; break; + case '<': state = S22; break; default: lx_pcre_ungetc(lx, c); return lx->z = z4, TOK_FLAGS; } break; @@ -2162,10 +2163,20 @@ z7(struct lx_pcre_lx *lx) case S19: /* e.g. "(?#" */ lx_pcre_ungetc(lx, c); return lx->z = z5, lx->z(lx); - case S20: /* e.g. "(?<" */ + case S20: /* e.g. "\\b" */ + lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + + case S21: /* e.g. "(?P" */ + switch ((unsigned char) c) { + case '>': state = S20; break; + default: lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S22: /* e.g. "(?<" */ switch ((unsigned char) c) { case '!': - case '=': state = S21; break; + case '=': state = S20; break; case 'A': case 'B': case 'C': @@ -2223,17 +2234,7 @@ z7(struct lx_pcre_lx *lx) } break; - case S21: /* e.g. "\\B" */ - lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; - - case S22: /* e.g. "(?P" */ - switch ((unsigned char) c) { - case '>': state = S21; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; - } - break; - - case S23: /* e.g. "(?" */ + case S24: /* e.g. "(?" */ lx_pcre_ungetc(lx, c); return TOK_OPENCAPTURE; case S25: /* e.g. "[^" */ @@ -2319,48 +2320,23 @@ z7(struct lx_pcre_lx *lx) case S27: /* e.g. "[^]" */ lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUPINVCB; - case S28: /* e.g. "\\$" */ - lx_pcre_ungetc(lx, c); return TOK_ESC; - - case S29: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z(lx); - - case S30: /* e.g. "\\z" */ - lx_pcre_ungetc(lx, c); return TOK_END; - - case S31: /* e.g. "\\1" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; - } - break; - - case S32: /* e.g. "\\Q" */ + case S28: /* e.g. "\\Q" */ lx_pcre_ungetc(lx, c); return lx->z = z0, lx->z(lx); - case S33: /* e.g. "\\c" */ - state = S48; break; - - case S34: /* e.g. "\\o" */ + case S29: /* e.g. "\\o" */ switch ((unsigned char) c) { - case '{': state = S46; break; + case '{': state = S47; break; default: lx_pcre_ungetc(lx, c); return TOK_NOESC; } break; - case S35: /* e.g. "\\\\x00" */ + case S30: /* e.g. "\\c" */ + state = S46; break; + + case S31: /* e.g. "\\i" */ lx_pcre_ungetc(lx, c); return TOK_NOESC; - case S36: /* e.g. "\\x" */ + case S32: /* e.g. "\\x" */ switch ((unsigned char) c) { case '{': state = S42; break; case '0': @@ -2389,13 +2365,33 @@ z7(struct lx_pcre_lx *lx) } break; - case S37: /* e.g. "\\D" */ - lx_pcre_ungetc(lx, c); return TOK_NAMED__CLASS; + case S33: /* e.g. "\\0" */ + switch ((unsigned char) c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': state = S40; break; + default: lx_pcre_ungetc(lx, c); return TOK_OCT; + } + break; - case S38: /* e.g. "\\R" */ + case S34: /* e.g. "\\R" */ lx_pcre_ungetc(lx, c); return TOK_EOL; - case S39: /* e.g. "\\0" */ + case S35: /* e.g. "\\d" */ + lx_pcre_ungetc(lx, c); return TOK_NAMED__CLASS; + + case S36: /* e.g. "\\a" */ + lx_pcre_ungetc(lx, c); return TOK_ESC; + + case S37: /* e.g. "\\E" */ + lx_pcre_ungetc(lx, c); return lx->z(lx); + + case S38: /* e.g. "\\1" */ switch ((unsigned char) c) { case '0': case '1': @@ -2404,11 +2400,16 @@ z7(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S40; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; + case '7': + case '8': + case '9': break; + default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; } break; + case S39: /* e.g. "\\z" */ + lx_pcre_ungetc(lx, c); return TOK_END; + case S40: /* e.g. "\\00" */ switch ((unsigned char) c) { case '0': @@ -2454,7 +2455,7 @@ z7(struct lx_pcre_lx *lx) } break; - case S43: /* e.g. "\\x0" */ + case S43: /* e.g. "\\xa" */ switch ((unsigned char) c) { case '0': case '1': @@ -2482,10 +2483,10 @@ z7(struct lx_pcre_lx *lx) } break; - case S44: /* e.g. "\\x00" */ + case S44: /* e.g. "\\xaa" */ lx_pcre_ungetc(lx, c); return TOK_HEX; - case S45: /* e.g. "\\x{0" */ + case S45: /* e.g. "\\x{a" */ switch ((unsigned char) c) { case '}': state = S44; break; case '0': @@ -2514,7 +2515,10 @@ z7(struct lx_pcre_lx *lx) } break; - case S46: /* e.g. "\\o{" */ + case S46: /* e.g. "\\ca" */ + lx_pcre_ungetc(lx, c); return TOK_CONTROL; + + case S47: /* e.g. "\\o{" */ switch ((unsigned char) c) { case '0': case '1': @@ -2523,12 +2527,12 @@ z7(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S47; break; + case '7': state = S48; break; default: lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S47: /* e.g. "\\o{0" */ + case S48: /* e.g. "\\o{0" */ switch ((unsigned char) c) { case '}': state = S41; break; case '0': @@ -2543,9 +2547,6 @@ z7(struct lx_pcre_lx *lx) } break; - case S48: /* e.g. "\\c\\x00" */ - lx_pcre_ungetc(lx, c); return TOK_CONTROL; - default: ; /* unreached */ } @@ -2553,8 +2554,8 @@ z7(struct lx_pcre_lx *lx) switch (state) { case S18: case S19: - case S29: - case S32: + case S28: + case S37: break; default: @@ -2591,28 +2592,28 @@ z7(struct lx_pcre_lx *lx) case S17: return TOK_FLAGS; case S18: return TOK_EOF; case S19: return TOK_EOF; - case S21: return TOK_UNSUPPORTED; + case S20: return TOK_UNSUPPORTED; case S24: return TOK_OPENCAPTURE; case S25: return TOK_OPENGROUPINV; case S26: return TOK_OPENGROUPCB; case S27: return TOK_OPENGROUPINVCB; - case S28: return TOK_ESC; - case S29: return TOK_EOF; - case S30: return TOK_END; - case S31: return TOK_UNSUPPORTED; - case S32: return TOK_EOF; - case S33: return TOK_NOESC; - case S34: return TOK_NOESC; - case S35: return TOK_NOESC; - case S36: return TOK_HEX; - case S37: return TOK_NAMED__CLASS; - case S38: return TOK_EOL; - case S39: return TOK_OCT; + case S28: return TOK_EOF; + case S29: return TOK_NOESC; + case S30: return TOK_NOESC; + case S31: return TOK_NOESC; + case S32: return TOK_HEX; + case S33: return TOK_OCT; + case S34: return TOK_EOL; + case S35: return TOK_NAMED__CLASS; + case S36: return TOK_ESC; + case S37: return TOK_EOF; + case S38: return TOK_UNSUPPORTED; + case S39: return TOK_END; case S40: return TOK_OCT; case S41: return TOK_OCT; case S43: return TOK_HEX; case S44: return TOK_HEX; - case S48: return TOK_CONTROL; + case S46: return TOK_CONTROL; default: errno = EINVAL; return TOK_ERROR; } } diff --git a/src/libre/dialect/pcre/lexer.lx b/src/libre/dialect/pcre/lexer.lx index 09ad7e0b6..17e314800 100644 --- a/src/libre/dialect/pcre/lexer.lx +++ b/src/libre/dialect/pcre/lexer.lx @@ -30,6 +30,10 @@ '\Z' -> $end__nl; '\z' -> $end; +# unicode crime, one "code unit, whether or not a UTF mode is set" +# not supporting this is equivalent to PCRE2_NEVER_BACKSLASH_C +'\C' -> $unsupported; + # unicode extended grapheme cluster '\X' -> $unsupported; diff --git a/tests/pcre/in47.re b/tests/pcre/in47.re new file mode 100644 index 000000000..39b6043ef --- /dev/null +++ b/tests/pcre/in47.re @@ -0,0 +1 @@ +\C \ No newline at end of file diff --git a/tests/pcre/out47.err b/tests/pcre/out47.err new file mode 100644 index 000000000..5b957e8df --- /dev/null +++ b/tests/pcre/out47.err @@ -0,0 +1 @@ +tests/pcre/in47.re:1: Unsupported operator