From 99af25fa825407f802c1430a5c881f9bde4aca77 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 14 Nov 2017 10:16:46 +0100 Subject: [PATCH 01/27] pcre: applied andreas patch "268_tk85.patch_final.diff" [f40de304f7992e77] from [1520767fff]; rebased to sebres-8-5-timerate, conflicts resolved (fixes of Tcl_RegexpObjCmd from tclCmdMZ.c going to tclRegexp.c#TclRegexpClassic, etc) --- generic/tcl.h | 2 + generic/tclBasic.c | 7 + generic/tclCmdMZ.c | 321 ++++++------------- generic/tclExecute.c | 11 +- generic/tclInt.h | 13 + generic/tclInterp.c | 43 ++- generic/tclRegexp.c | 700 +++++++++++++++++++++++++++++++++++++++--- generic/tclRegexp.h | 8 + tests/interp.test | 34 +- tests/reg.test | 6 +- tests/regexp.test | 20 +- tests/regexpComp.test | 22 +- unix/Makefile.in | 4 +- unix/configure | 127 +++++++- unix/configure.in | 6 + unix/tcl.m4 | 122 ++++++++ 16 files changed, 1154 insertions(+), 292 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 64c4683d1d53..ecba3f851c6f 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -559,6 +559,8 @@ typedef void (Tcl_ThreadCreateProc) _ANSI_ARGS_((ClientData clientData)); #define TCL_REG_NEWLINE 000300 /* Newlines are line terminators. */ #define TCL_REG_CANMATCH 001000 /* Report details on partial/limited * matches. */ +#define TCL_REG_PCRE 0x08000000 /* Make sure it doesn't conflict with + * existing TCL_REG_* or PCRE_* bits */ /* * Flags values passed to Tcl_RegExpExecObj. diff --git a/generic/tclBasic.c b/generic/tclBasic.c index 83e1a7522914..2cd92bd2cea9 100644 --- a/generic/tclBasic.c +++ b/generic/tclBasic.c @@ -500,6 +500,13 @@ Tcl_CreateInterp(void) iPtr->evalFlags = 0; iPtr->scriptFile = NULL; iPtr->flags = 0; +#ifdef HAVE_PCRE +#ifdef USE_DEFAULT_PCRE + if (getenv("TCL_REGEXP_CLASSIC") == NULL) { iPtr->flags |= INTERP_PCRE; } +#else + if (getenv("TCL_REGEXP_PCRE") != NULL) { iPtr->flags |= INTERP_PCRE; } +#endif +#endif iPtr->tracePtr = NULL; iPtr->tracesForbiddingInline = 0; iPtr->activeCmdTracePtr = NULL; diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 2b96c5a8dff1..66eaf87a220b 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -87,26 +87,35 @@ Tcl_RegexpObjCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - int i, indices, match, about, offset, all, doinline, numMatchesSaved; - int cflags, eflags, stringLength, matchLength; + int i, indices, about, offset, all, doinline; + int cflags, re_type; + Tcl_Obj *startIndex = NULL; Tcl_RegExp regExpr; - Tcl_Obj *objPtr, *startIndex = NULL, *resultPtr = NULL; - Tcl_RegExpInfo info; static const char *options[] = { "-all", "-about", "-indices", "-inline", "-expanded", "-line", "-linestop", "-lineanchor", - "-nocase", "-start", "--", NULL + "-nocase", "-start", "-type", "--", NULL }; enum options { REGEXP_ALL, REGEXP_ABOUT, REGEXP_INDICES, REGEXP_INLINE, REGEXP_EXPANDED,REGEXP_LINE, REGEXP_LINESTOP,REGEXP_LINEANCHOR, - REGEXP_NOCASE, REGEXP_START, REGEXP_LAST + REGEXP_NOCASE,REGEXP_START, REGEXP_TYPE, REGEXP_LAST + }; + static CONST char *re_type_opts[] = { + "classic", "pcre", NULL + }; + enum re_type_opts { + RETYPE_CLASSIC, RETYPE_PCRE, }; indices = 0; about = 0; +#ifdef USE_DEFAULT_PCRE + re_type = RETYPE_PCRE; +#else + re_type = RETYPE_CLASSIC; +#endif cflags = TCL_REG_ADVANCED; - eflags = 0; offset = 0; all = 0; doinline = 0; @@ -166,6 +175,15 @@ Tcl_RegexpObjCmd( Tcl_IncrRefCount(startIndex); break; } + case REGEXP_TYPE: + if (++i >= objc) { + goto endOfForLoop; + } + if (Tcl_GetIndexFromObj(interp, objv[i], re_type_opts, "type", + 0, &re_type) != TCL_OK) { + goto optionError; + } + break; case REGEXP_LAST: i++; goto endOfForLoop; @@ -189,23 +207,11 @@ Tcl_RegexpObjCmd( if (doinline && ((objc - 2) != 0)) { Tcl_AppendResult(interp, "regexp match variables not allowed" " when using -inline", NULL); - goto optionError; - } - - /* - * Handle the odd about case separately. - */ - - if (about) { - regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); - if ((regExpr == NULL) || (TclRegAbout(interp, regExpr) < 0)) { - optionError: - if (startIndex) { - Tcl_DecrRefCount(startIndex); - } - return TCL_ERROR; + optionError: + if (startIndex) { + Tcl_DecrRefCount(startIndex); } - return TCL_OK; + return TCL_ERROR; } /* @@ -214,10 +220,19 @@ Tcl_RegexpObjCmd( * regexp to avoid shimmering problems. */ - objPtr = objv[1]; - stringLength = Tcl_GetCharLength(objPtr); - if (startIndex) { + int stringLength; + + if ((enum re_type_opts) re_type == RETYPE_CLASSIC) { + stringLength = Tcl_GetCharLength(objv[1]); + } else { + if (objv[1]->typePtr == &tclByteArrayType) { + (void) Tcl_GetByteArrayFromObj(objv[1], &stringLength); + } else { + /* XXX validate offset by char length */ + (void) Tcl_GetStringFromObj(objv[1], &stringLength); + } + } TclGetIntForIndexM(NULL, startIndex, stringLength, &offset); Tcl_DecrRefCount(startIndex); if (offset < 0) { @@ -225,203 +240,38 @@ Tcl_RegexpObjCmd( } } + /* + * Handle the odd about case separately, otherwise pass of to appropriate + * RE engine. + */ + + if ((enum re_type_opts) re_type == RETYPE_PCRE) { + cflags |= TCL_REG_PCRE; + } regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); if (regExpr == NULL) { return TCL_ERROR; } - objc -= 2; - objv += 2; - - if (doinline) { - /* - * Save all the subexpressions, as we will return them as a list - */ - - numMatchesSaved = -1; - } else { - /* - * Save only enough subexpressions for matches we want to keep, expect - * in the case of -all, where we need to keep at least one to know - * where to move the offset. - */ - - numMatchesSaved = (objc == 0) ? all : objc; - } - - /* - * The following loop is to handle multiple matches within the same source - * string; each iteration handles one match. If "-all" hasn't been - * specified then the loop body only gets executed once. We terminate the - * loop when the starting offset is past the end of the string. - */ - - while (1) { - /* - * Pass either 0 or TCL_REG_NOTBOL in the eflags. Passing - * TCL_REG_NOTBOL indicates that the character at offset should not be - * considered the start of the line. If for example the pattern {^} is - * passed and -start is positive, then the pattern will not match the - * start of the string unless the previous character is a newline. - */ - - if (offset == 0) { - eflags = 0; - } else if (offset > stringLength) { - eflags = TCL_REG_NOTBOL; - } else if (Tcl_GetUniChar(objPtr, offset-1) == (Tcl_UniChar)'\n') { - eflags = 0; - } else { - eflags = TCL_REG_NOTBOL; - } - - match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, - numMatchesSaved, eflags); - if (match < 0) { - return TCL_ERROR; - } - - if (match == 0) { - /* - * We want to set the value of the intepreter result only when - * this is the first time through the loop. - */ - - if (all <= 1) { - /* - * If inlining, the interpreter's object result remains an - * empty list, otherwise set it to an integer object w/ value - * 0. - */ - - if (!doinline) { - Tcl_SetObjResult(interp, Tcl_NewIntObj(0)); - } - return TCL_OK; - } - break; - } - - /* - * If additional variable names have been specified, return index - * information in those variables. - */ - - Tcl_RegExpGetInfo(regExpr, &info); - if (doinline) { - /* - * It's the number of substitutions, plus one for the matchVar at - * index 0 - */ - - objc = info.nsubs + 1; - if (all <= 1) { - resultPtr = Tcl_NewObj(); - } - } - for (i = 0; i < objc; i++) { - Tcl_Obj *newPtr; - - if (indices) { - int start, end; - Tcl_Obj *objs[2]; - - /* - * Only adjust the match area if there was a match for that - * area. (Scriptics Bug 4391/SF Bug #219232) - */ - - if (i <= info.nsubs && info.matches[i].start >= 0) { - start = offset + info.matches[i].start; - end = offset + info.matches[i].end; - - /* - * Adjust index so it refers to the last character in the - * match instead of the first character after the match. - */ - - if (end >= offset) { - end--; - } - } else { - start = -1; - end = -1; - } - - objs[0] = Tcl_NewLongObj(start); - objs[1] = Tcl_NewLongObj(end); - - newPtr = Tcl_NewListObj(2, objs); - } else { - if (i <= info.nsubs) { - newPtr = Tcl_GetRange(objPtr, - offset + info.matches[i].start, - offset + info.matches[i].end - 1); - } else { - newPtr = Tcl_NewObj(); - } - } - if (doinline) { - if (Tcl_ListObjAppendElement(interp, resultPtr, newPtr) - != TCL_OK) { - Tcl_DecrRefCount(newPtr); - Tcl_DecrRefCount(resultPtr); - return TCL_ERROR; - } - } else { - Tcl_Obj *valuePtr; - valuePtr = Tcl_ObjSetVar2(interp, objv[i], NULL, newPtr, 0); - if (valuePtr == NULL) { - Tcl_AppendResult(interp, "couldn't set variable \"", - TclGetString(objv[i]), "\"", NULL); - return TCL_ERROR; - } + if ((enum re_type_opts) re_type == RETYPE_CLASSIC) { + if (about) { + if (TclRegAbout(interp, regExpr) < 0) { + return TCL_ERROR; } + return TCL_OK; } - if (all == 0) { - break; - } - - /* - * Adjust the offset to the character just after the last one in the - * matchVar and increment all to count how many times we are making a - * match. We always increment the offset by at least one to prevent - * endless looping (as in the case: regexp -all {a*} a). Otherwise, - * when we match the NULL string at the end of the input string, we - * will loop indefinately (because the length of the match is 0, so - * offset never changes). - */ - - matchLength = info.matches[0].end - info.matches[0].start; - offset += info.matches[0].end; - - /* - * A match of length zero could happen for {^} {$} or {.*} and in - * these cases we always want to bump the index up one. - */ - - if (matchLength == 0) { - offset++; - } - all++; - if (offset >= stringLength) { - break; + return TclRegexpClassic(interp, objc, objv, regExpr, + all, indices, doinline, offset); + } else { + if (about) { + /* XXX: implement PCRE about */ + return TCL_OK; } - } - /* - * Set the interpreter's object result to an integer object with value 1 - * if -all wasn't specified, otherwise it's all-1 (the number of times - * through the while - 1). - */ - - if (doinline) { - Tcl_SetObjResult(interp, resultPtr); - } else { - Tcl_SetObjResult(interp, Tcl_NewIntObj(all ? all-1 : 1)); + return TclRegexpPCRE(interp, objc, objv, regExpr, + all, indices, doinline, offset); } - return TCL_OK; } /* @@ -449,7 +299,7 @@ Tcl_RegsubObjCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int idx, result, cflags, all, wlen, wsublen, numMatches, offset; - int start, end, subStart, subEnd, match; + int start, end, subStart, subEnd, match, re_type; Tcl_RegExp regExpr; Tcl_RegExpInfo info; Tcl_Obj *resultPtr, *subPtr, *objPtr, *startIndex = NULL; @@ -458,14 +308,25 @@ Tcl_RegsubObjCmd( static const char *options[] = { "-all", "-nocase", "-expanded", "-line", "-linestop", "-lineanchor", "-start", - "--", NULL + "-type", "--", NULL }; enum options { REGSUB_ALL, REGSUB_NOCASE, REGSUB_EXPANDED, REGSUB_LINE, REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_START, - REGSUB_LAST + REGSUB_TYPE, REGSUB_LAST + }; + static CONST char *re_type_opts[] = { + "classic", "pcre", NULL + }; + enum re_type_opts { + RETYPE_CLASSIC, RETYPE_PCRE, }; +#ifdef USE_DEFAULT_PCRE + re_type = RETYPE_PCRE; +#else + re_type = RETYPE_CLASSIC; +#endif cflags = TCL_REG_ADVANCED; all = 0; offset = 0; @@ -517,6 +378,15 @@ Tcl_RegsubObjCmd( Tcl_IncrRefCount(startIndex); break; } + case REGSUB_TYPE: + if (++idx >= objc) { + goto endOfForLoop; + } + if (Tcl_GetIndexFromObj(interp, objv[idx], re_type_opts, "type", + 0, &re_type) != TCL_OK) { + goto optionError; + } + break; case REGSUB_LAST: idx++; goto endOfForLoop; @@ -538,8 +408,18 @@ Tcl_RegsubObjCmd( objv += idx; if (startIndex) { - int stringLength = Tcl_GetCharLength(objv[1]); + int stringLength; + if ((enum re_type_opts) re_type == RETYPE_CLASSIC) { + stringLength = Tcl_GetCharLength(objv[1]); + } else { + if (objv[1]->typePtr == &tclByteArrayType) { + (void) Tcl_GetByteArrayFromObj(objv[1], &stringLength); + } else { + /* XXX validate offset by char length */ + (void) Tcl_GetStringFromObj(objv[1], &stringLength); + } + } TclGetIntForIndexM(NULL, startIndex, stringLength, &offset); Tcl_DecrRefCount(startIndex); if (offset < 0) { @@ -618,6 +498,9 @@ Tcl_RegsubObjCmd( goto regsubDone; } + if ((enum re_type_opts) re_type == RETYPE_PCRE) { + cflags |= TCL_REG_PCRE; + } regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); if (regExpr == NULL) { return TCL_ERROR; @@ -1723,14 +1606,14 @@ UniCharIsAscii( { return (character >= 0) && (character < 0x80); } - -static int + static int UniCharIsHexDigit( int character) { return (character >= 0) && (character < 0x80) && isxdigit(character); } - + + /* *---------------------------------------------------------------------- * diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 61d0ddc8d36e..8eebea51271f 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -16,6 +16,7 @@ #include "tclInt.h" #include "tclCompile.h" +#include "tclRegexp.h" #include "tommath.h" #include @@ -4490,6 +4491,14 @@ TclExecuteByteCode( valuePtr = OBJ_AT_TOS; /* String */ value2Ptr = OBJ_UNDER_TOS; /* Pattern */ + /* + * cflags won't use PCRE flag indicator during compilation + * XXX may use TCL_REG_ADVANCED to indicate -type classic for + * XXX compilation, but currently -type isn't compiled + */ + if (((Interp *)interp)->flags & INTERP_PCRE) { + cflags |= TCL_REG_PCRE; + } regExpr = Tcl_GetRegExpFromObj(interp, value2Ptr, cflags); if (regExpr == NULL) { match = -1; @@ -4498,7 +4507,7 @@ TclExecuteByteCode( } /* - * Adjustment is 2 due to the nocase byte + * Adjustment is 2 due to the cflags byte */ if (match < 0) { diff --git a/generic/tclInt.h b/generic/tclInt.h index a184950247d5..f848c0ed1ff5 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2011,6 +2011,7 @@ typedef struct Interp { * of the wrong-num-args string in Tcl_WrongNumArgs. * Makes it append instead of replacing and uses * different intermediate text. + * INTERP_PCRE Non-zero means use PCRE engine by default for REs * * WARNING: For the sake of some extensions that have made use of former * internal values, do not re-use the flag values 2 (formerly ERR_IN_PROGRESS) @@ -2023,6 +2024,7 @@ typedef struct Interp { #define DONT_COMPILE_CMDS_INLINE 0x20 #define RAND_SEED_INITIALIZED 0x40 #define SAFE_INTERP 0x80 +#define INTERP_PCRE 0x100 #define INTERP_TRACE_IN_PROGRESS 0x200 #define INTERP_ALTERNATE_WRONG_ARGS 0x400 #define ERR_LEGACY_COPY 0x800 @@ -3358,6 +3360,17 @@ MODULE_SCOPE int TclObjCallVarTraces(Interp *iPtr, Var *arrayPtr, Var *varPtr, Tcl_Obj *part1Ptr, Tcl_Obj *part2Ptr, int flags, int leaveErrMsg, int index); +/* + * The variant RE engines + */ + +MODULE_SCOPE int TclRegexpClassic(Tcl_Interp *interp, int objc, + Tcl_Obj *CONST objv[], Tcl_RegExp regExpr, + int all, int indices, int doinline, int offset); +MODULE_SCOPE int TclRegexpPCRE(Tcl_Interp *interp, int objc, + Tcl_Obj *CONST objv[], Tcl_RegExp regExpr, + int all, int indices, int doinline, int offset); + /* * So tclObj.c and tclDictObj.c can share these implementations. */ diff --git a/generic/tclInterp.c b/generic/tclInterp.c index dbbf10ab174c..d50b6643e953 100644 --- a/generic/tclInterp.c +++ b/generic/tclInterp.c @@ -592,16 +592,16 @@ Tcl_InterpObjCmd( "alias", "aliases", "bgerror", "create", "debug", "delete", "eval", "exists", "expose", "hide", "hidden", "issafe", "invokehidden", - "limit", "marktrusted", "recursionlimit","slaves", - "share", "target", "transfer", + "limit", "marktrusted", "recursionlimit","regexp", + "slaves", "share", "target", "transfer", NULL }; enum option { OPT_ALIAS, OPT_ALIASES, OPT_BGERROR, OPT_CREATE, OPT_DEBUG, OPT_DELETE, OPT_EVAL, OPT_EXISTS, OPT_EXPOSE, OPT_HIDE, OPT_HIDDEN, OPT_ISSAFE, OPT_INVOKEHID, - OPT_LIMIT, OPT_MARKTRUSTED,OPT_RECLIMIT, OPT_SLAVES, - OPT_SHARE, OPT_TARGET, OPT_TRANSFER + OPT_LIMIT, OPT_MARKTRUSTED,OPT_RECLIMIT, OPT_REGEXP, + OPT_SLAVES, OPT_SHARE, OPT_TARGET, OPT_TRANSFER }; if (objc < 2) { @@ -949,6 +949,41 @@ Tcl_InterpObjCmd( } return SlaveRecursionLimit(interp, slaveInterp, objc - 3, objv + 3); } + case OPT_REGEXP: { + int re_type; + Interp *slaveInterp; + static CONST char *re_type_opts[] = { + "classic", "pcre", NULL + }; + enum re_type_opts { + RETYPE_CLASSIC, RETYPE_PCRE, + }; + if (objc != 3 && objc != 4) { + Tcl_WrongNumArgs(interp, 2, objv, "path ?type?"); + return TCL_ERROR; + } + slaveInterp = (Interp *) GetInterp(interp, objv[2]); + if (slaveInterp == NULL) { + return TCL_ERROR; + } + if (objc == 4) { + if (Tcl_GetIndexFromObj(interp, objv[3], re_type_opts, "type", + 0, &re_type) != TCL_OK) { + return TCL_ERROR; + } + if ((enum re_type_opts) re_type == RETYPE_PCRE) { + slaveInterp->flags |= INTERP_PCRE; + } else { + slaveInterp->flags &= ~(INTERP_PCRE); + } + } + if (slaveInterp->flags & INTERP_PCRE) { + Tcl_SetObjResult(interp, Tcl_NewStringObj("pcre", -1)); + } else { + Tcl_SetObjResult(interp, Tcl_NewStringObj("classic", -1)); + } + return TCL_OK; + } case OPT_SLAVES: { Tcl_Interp *slaveInterp; InterpInfo *iiPtr; diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index dac6aba27270..17be0f89d7dd 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -75,6 +75,12 @@ typedef struct ThreadSpecificData { struct TclRegexp *regexps[NUM_REGEXPS]; /* Compiled forms of above strings. Also * malloc-ed, or NULL if not in use yet. */ +#ifdef HAVE_PCRE + Tcl_RegExpIndices *matches; /* To support PCRE in Tcl_RegExpGetInfo, we + * need a classic info matches area to store + * data in. */ + int matchelems; /* length of matches */ +#endif } ThreadSpecificData; static Tcl_ThreadDataKey dataKey; @@ -433,7 +439,6 @@ Tcl_RegExpExecObj( int flags) /* Regular expression execution flags. */ { TclRegexp *regexpPtr = (TclRegexp *) re; - Tcl_UniChar *udata; int length; int reflags = regexpPtr->flags; #define TCL_REG_GLOBOK_FLAGS (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE) @@ -464,15 +469,75 @@ Tcl_RegExpExecObj( regexpPtr->string = NULL; regexpPtr->objPtr = textObj; - udata = Tcl_GetUnicodeFromObj(textObj, &length); + if (reflags & TCL_REG_PCRE) { +#ifdef HAVE_PCRE + const char *matchstr; + int match, pcreeflags, nm = (regexpPtr->re.re_nsub + 1) * 3; - if (offset > length) { - offset = length; - } - udata += offset; - length -= offset; + if (textObj->typePtr == &tclByteArrayType) { + matchstr = Tcl_GetByteArrayFromObj(textObj, &length); + } else { + matchstr = Tcl_GetStringFromObj(textObj, &length); + } + + if (offset > length) { + offset = length; + } + + pcreeflags = 0; + if (flags & TCL_REG_NOTBOL) { + pcreeflags |= PCRE_NOTBOL; + } + + match = pcre_exec(regexpPtr->pcre, regexpPtr->study, + matchstr, length, offset, pcreeflags, + (int *) regexpPtr->matches, nm); - return RegExpExecUniChar(interp, re, udata, length, nmatches, flags); + /* + * Store last offset to support Tcl_RegExpGetInfo translation. + */ + regexpPtr->details.rm_extend.rm_so = offset; + + /* + * Check for errors. + */ + + if (match == PCRE_ERROR_NOMATCH) { + return 0; + } else if (match == 0) { + if (interp != NULL) { + Tcl_AppendResult(interp, + "pcre_exec had insufficient capture space", NULL); + } + return -1; + } else if (match < -1) { + if (interp != NULL) { + char buf[32 + TCL_INTEGER_SPACE]; + sprintf(buf, "pcre_exec returned error code %d", match); + Tcl_AppendResult(interp, buf, NULL); + } + return -1; + } + return 1; +#else + if (interp != NULL) { + Tcl_AppendResult(interp, "PCRE not available", NULL); + } + return -1; +#endif + } else { + Tcl_UniChar *udata; + + udata = Tcl_GetUnicodeFromObj(textObj, &length); + + if (offset > length) { + offset = length; + } + udata += offset; + length -= offset; + + return RegExpExecUniChar(interp, re, udata, length, nmatches, flags); + } } /* @@ -535,8 +600,34 @@ Tcl_RegExpGetInfo( TclRegexp *regexpPtr = (TclRegexp *) regexp; infoPtr->nsubs = regexpPtr->re.re_nsub; - infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; - infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so; + if (regexpPtr->flags & TCL_REG_PCRE) { +#ifdef HAVE_PCRE + ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); + int i, last, *matches = (int *) regexpPtr->matches; + + /* + * This works both to initialize and extend matches as necessary + */ + if (tsdPtr->matchelems <= infoPtr->nsubs) { + tsdPtr->matchelems = infoPtr->nsubs + 1; + tsdPtr->matches = (Tcl_RegExpIndices *) + ckrealloc((char *) tsdPtr->matches, + sizeof(Tcl_RegExpIndices) * tsdPtr->matchelems); + } + last = regexpPtr->details.rm_extend.rm_so; /* last offset */ + for (i = 0; i <= infoPtr->nsubs; i++) { + tsdPtr->matches[i].start = matches[i*2] - last; + tsdPtr->matches[i].end = matches[i*2+1] - last; + } + infoPtr->matches = tsdPtr->matches; + infoPtr->extendStart = 0; /* XXX support? */ +#else + Tcl_Panic("Cannot get info for PCRE match"); +#endif + } else { + infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; + infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so; + } } /* @@ -580,6 +671,10 @@ Tcl_GetRegExpFromObj( regexpPtr = (TclRegexp *) objPtr->internalRep.twoPtrValue.ptr1; + /* XXX Need to have case where -type classic isn't ignored in regexp/sub */ + if ((interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE)) { + flags |= TCL_REG_PCRE; + } if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { pattern = TclGetStringFromObj(objPtr, &length); @@ -904,38 +999,121 @@ CompileRegexp( */ regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp)); - regexpPtr->objPtr = NULL; - regexpPtr->string = NULL; + memset(regexpPtr, 0, sizeof(TclRegexp)); + + regexpPtr->flags = flags; regexpPtr->details.rm_extend.rm_so = -1; regexpPtr->details.rm_extend.rm_eo = -1; - /* - * Get the up-to-date string representation and map to unicode. - */ + if (flags & TCL_REG_PCRE) { +#ifdef HAVE_PCRE + pcre *pcre; + char *p, *cstring = (char *) string; + const char *errstr; + int erroffset, rc, nsubs, pcrecflags; - Tcl_DStringInit(&stringBuf); - uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf); - numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar); + /* + * Convert from Tcl classic to PCRE cflags + */ - /* - * Compile the string and check for errors. - */ + /* XXX Should enable PCRE_UTF8 selectively on non-ByteArray Tcl_Obj */ + pcrecflags = PCRE_NO_UTF8_CHECK | PCRE_DOLLAR_ENDONLY; + for (i = 0, p = cstring; i < length; i++) { + if (UCHAR(*p++) > 0x80) { + pcrecflags |= PCRE_UTF8; + break; + } + } + if (flags & TCL_REG_NOCASE) { + pcrecflags |= PCRE_CASELESS; + } + if (flags & TCL_REG_EXPANDED) { + pcrecflags |= PCRE_EXTENDED; + } + if (flags & (TCL_REG_NEWLINE|TCL_REG_NLSTOP|TCL_REG_NLANCH)) { + pcrecflags |= PCRE_MULTILINE; + } - regexpPtr->flags = flags; - status = TclReComp(®expPtr->re, uniString, (size_t) numChars, flags); - Tcl_DStringFree(&stringBuf); + if (cstring[length] != 0) { + cstring = (char *) ckalloc(length + 1); + memcpy(cstring, string, length); + cstring[length] = 0; + } + pcre = pcre_compile(cstring, pcrecflags, &errstr, &erroffset, NULL); + regexpPtr->pcre = pcre; + if (cstring != (char *) string) { + ckfree(cstring); + } + + if (pcre == NULL) { + ckfree((char *)regexpPtr); + Tcl_AppendResult(interp, + "couldn't compile pcre pattern: ", errstr, NULL); + return NULL; + } + + regexpPtr->study = pcre_study(pcre, 0, &errstr); + if (errstr != NULL) { + pcre_free(pcre); + ckfree((char *)regexpPtr); + Tcl_AppendResult(interp, + "error studying pcre pattern: ", errstr, NULL); + return NULL; + } - if (status != REG_OKAY) { /* - * Clean up and report errors in the interpreter, if possible. + * Allocate enough space for all of the subexpressions, plus one extra + * for the entire pattern. */ - ckfree((char *)regexpPtr); - if (interp) { - TclRegError(interp, - "couldn't compile regular expression pattern: ", status); + rc = pcre_fullinfo(pcre, NULL, PCRE_INFO_CAPTURECOUNT, &nsubs); + if (rc == 0) { + regexpPtr->re.re_nsub = nsubs; + regexpPtr->matches = (regmatch_t *) + ckalloc(sizeof(int) * (nsubs+1)*3); } +#else + Tcl_AppendResult(interp, + "couldn't compile pcre pattern: pcre unavailabe", NULL); return NULL; +#endif + } else { + /* + * Get the up-to-date string representation and map to unicode. + */ + + Tcl_DStringInit(&stringBuf); + uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf); + numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar); + + /* + * Compile the string and check for errors. + */ + + status = TclReComp(®expPtr->re, uniString, (size_t) numChars, flags); + Tcl_DStringFree(&stringBuf); + + if (status != REG_OKAY) { + /* + * Clean up and report errors in the interpreter, if possible. + */ + + ckfree((char *)regexpPtr); + if (interp) { + TclRegError(interp, + "couldn't compile regular expression pattern: ", + status); + } + return NULL; + } + + /* + * Allocate enough space for all of the subexpressions, plus one extra + * for the entire pattern. + */ + + regexpPtr->matches = (regmatch_t *) ckalloc( + sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); } /* @@ -953,14 +1131,6 @@ CompileRegexp( regexpPtr->globObjPtr = NULL; } - /* - * Allocate enough space for all of the subexpressions, plus one extra for - * the entire pattern. - */ - - regexpPtr->matches = (regmatch_t *) ckalloc( - sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); - /* * Initialize the refcount to one initially, since it is in the cache. */ @@ -1012,6 +1182,14 @@ static void FreeRegexp( TclRegexp *regexpPtr) /* Compiled regular expression to free. */ { +#ifdef HAVE_PCRE + if (regexpPtr->flags & TCL_REG_PCRE) { + pcre_free(regexpPtr->pcre); + if (regexpPtr->study) { + pcre_free(regexpPtr->study); + } + } else +#endif TclReFree(®expPtr->re); if (regexpPtr->globObjPtr) { TclDecrRefCount(regexpPtr->globObjPtr); @@ -1054,6 +1232,9 @@ FinalizeRegexp( ckfree(tsdPtr->patterns[i]); tsdPtr->patterns[i] = NULL; } + if (tsdPtr->matches != NULL) { + ckfree((char *) tsdPtr->matches); + } /* * We may find ourselves reinitialized if another finalization routine * invokes regexps. @@ -1061,6 +1242,449 @@ FinalizeRegexp( tsdPtr->initialized = 0; } +/* + *---------------------------------------------------------------------- + * + * TclRegexpClassic -- + * + * This procedure processes a classic "regexp". + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + +int +TclRegexpClassic( + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *CONST objv[], /* Argument objects. */ + Tcl_RegExp regExpr, + int all, + int indices, + int doinline, + int offset) +{ + int i, match, numMatchesSaved; + int eflags, stringLength, matchLength; + Tcl_Obj *objPtr, *resultPtr = NULL; + Tcl_RegExpInfo info; + + objPtr = objv[1]; + stringLength = Tcl_GetCharLength(objPtr); + + eflags = 0; + objc -= 2; + objv += 2; + + if (doinline) { + /* + * Save all the subexpressions, as we will return them as a list + */ + + numMatchesSaved = -1; + } else { + /* + * Save only enough subexpressions for matches we want to keep, expect + * in the case of -all, where we need to keep at least one to know + * where to move the offset. + */ + + numMatchesSaved = (objc == 0) ? all : objc; + } + + /* + * The following loop is to handle multiple matches within the same source + * string; each iteration handles one match. If "-all" hasn't been + * specified then the loop body only gets executed once. We terminate the + * loop when the starting offset is past the end of the string. + */ + + while (1) { + /* + * Pass either 0 or TCL_REG_NOTBOL in the eflags. Passing + * TCL_REG_NOTBOL indicates that the character at offset should not be + * considered the start of the line. If for example the pattern {^} is + * passed and -start is positive, then the pattern will not match the + * start of the string unless the previous character is a newline. + */ + + if (offset == 0) { + eflags = 0; + } else if (offset > stringLength) { + eflags = TCL_REG_NOTBOL; + } else if (Tcl_GetUniChar(objPtr, offset-1) == (Tcl_UniChar)'\n') { + eflags = 0; + } else { + eflags = TCL_REG_NOTBOL; + } + + match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, + numMatchesSaved, eflags); + if (match < 0) { + return TCL_ERROR; + } + + if (match == 0) { + /* + * We want to set the value of the intepreter result only when + * this is the first time through the loop. + */ + + if (all <= 1) { + /* + * If inlining, the interpreter's object result remains an + * empty list, otherwise set it to an integer object w/ value + * 0. + */ + + if (!doinline) { + Tcl_SetObjResult(interp, Tcl_NewIntObj(0)); + } + return TCL_OK; + } + break; + } + + /* + * If additional variable names have been specified, return index + * information in those variables. + */ + + Tcl_RegExpGetInfo(regExpr, &info); + if (doinline) { + /* + * It's the number of substitutions, plus one for the matchVar at + * index 0 + */ + + objc = info.nsubs + 1; + if (all <= 1) { + resultPtr = Tcl_NewObj(); + } + } + for (i = 0; i < objc; i++) { + Tcl_Obj *newPtr; + + if (indices) { + int start, end; + Tcl_Obj *objs[2]; + + /* + * Only adjust the match area if there was a match for that + * area. (Scriptics Bug 4391/SF Bug #219232) + */ + + if (i <= info.nsubs && info.matches[i].start >= 0) { + start = offset + info.matches[i].start; + end = offset + info.matches[i].end; + + /* + * Adjust index so it refers to the last character in the + * match instead of the first character after the match. + */ + + if (end >= offset) { + end--; + } + } else { + start = -1; + end = -1; + } + + objs[0] = Tcl_NewLongObj(start); + objs[1] = Tcl_NewLongObj(end); + + newPtr = Tcl_NewListObj(2, objs); + } else { + if (i <= info.nsubs) { + newPtr = Tcl_GetRange(objPtr, + offset + info.matches[i].start, + offset + info.matches[i].end - 1); + } else { + newPtr = Tcl_NewObj(); + } + } + if (doinline) { + if (Tcl_ListObjAppendElement(interp, resultPtr, newPtr) + != TCL_OK) { + Tcl_DecrRefCount(newPtr); + Tcl_DecrRefCount(resultPtr); + return TCL_ERROR; + } + } else { + Tcl_Obj *valuePtr; + valuePtr = Tcl_ObjSetVar2(interp, objv[i], NULL, newPtr, 0); + if (valuePtr == NULL) { + Tcl_AppendResult(interp, "couldn't set variable \"", + TclGetString(objv[i]), "\"", NULL); + return TCL_ERROR; + } + } + } + + if (all == 0) { + break; + } + + /* + * Adjust the offset to the character just after the last one in the + * matchVar and increment all to count how many times we are making a + * match. We always increment the offset by at least one to prevent + * endless looping (as in the case: regexp -all {a*} a). Otherwise, + * when we match the NULL string at the end of the input string, we + * will loop indefinately (because the length of the match is 0, so + * offset never changes). + */ + + matchLength = info.matches[0].end - info.matches[0].start; + offset += info.matches[0].end; + + /* + * A match of length zero could happen for {^} {$} or {.*} and in + * these cases we always want to bump the index up one. + */ + + if (matchLength == 0) { + offset++; + } + all++; + if (offset >= stringLength) { + break; + } + } + + /* + * Set the interpreter's object result to an integer object with value 1 + * if -all wasn't specified, otherwise it's all-1 (the number of times + * through the while - 1). + */ + + if (doinline) { + Tcl_SetObjResult(interp, resultPtr); + } else { + Tcl_SetObjResult(interp, Tcl_NewIntObj(all ? all-1 : 1)); + } + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * + * TclRegexpPCRE -- + * + * This procedure processes a PCRE "regexp". + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + +int +TclRegexpPCRE( + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *CONST objv[], /* Argument objects. */ + Tcl_RegExp regExpr, + int all, + int indices, + int doinline, + int offset) +{ +#ifdef HAVE_PCRE + int i, match, eflags, stringLength, matchelems, *matches; + Tcl_Obj *objPtr, *resultPtr = NULL; + const char *matchstr; + pcre *re; + pcre_extra *study; + TclRegexp *regexpPtr = (TclRegexp *) regExpr; + + objPtr = objv[1]; + if (objPtr->typePtr == &tclByteArrayType) { + matchstr = Tcl_GetByteArrayFromObj(objPtr, &stringLength); + } else { + matchstr = Tcl_GetStringFromObj(objPtr, &stringLength); + } + + eflags = PCRE_NO_UTF8_CHECK; + if (offset > 0) { + /* + * Translate offset into correct placement for utf-8 chars. + * Add flag if using offset (string is part of a larger string), so + * that "^" won't match. + */ + + if (objPtr->typePtr != &tclByteArrayType) { + /* XXX: probably needs length restriction */ + offset = Tcl_UtfAtIndex(matchstr, offset) - matchstr; + } + eflags |= PCRE_NOTBOL; + } + + objc -= 2; + objv += 2; + + /* + * The following loop is to handle multiple matches within the same source + * string; each iteration handles one match. If "-all" hasn't been + * specified then the loop body only gets executed once. We terminate the + * loop when the starting offset is past the end of the string. + */ + + re = regexpPtr->pcre; + study = regexpPtr->study; + matches = (int *) regexpPtr->matches; + matchelems = (int) (regexpPtr->re.re_nsub + 1) * 3; + while (1) { + match = pcre_exec(re, study, matchstr, stringLength, + offset, eflags, matches, matchelems); + + if (match < -1) { + char buf[32 + TCL_INTEGER_SPACE]; + sprintf(buf, "pcre_exec returned error code %d", match); + Tcl_AppendResult(interp, buf, NULL); + return TCL_ERROR; + } + + if (match == 0) { + Tcl_AppendResult(interp, + "pcre_exec had insufficient capture space", NULL); + return TCL_ERROR; + } + + if (match == PCRE_ERROR_NOMATCH) { + /* + * We want to set the value of the intepreter result only when + * this is the first time through the loop. + */ + + if (all <= 1) { + /* + * If inlining, the interpreter's object result remains an + * empty list, otherwise set it to an integer object w/ value + * 0. + */ + + if (!doinline) { + Tcl_SetObjResult(interp, Tcl_NewIntObj(0)); + } + return TCL_OK; + } + break; + } + + /* + * If additional variable names have been specified, return index + * information in those variables. + */ + + if (doinline) { + /* + * It's the number of substitutions, plus one for the matchVar at + * index 0 + */ + + objc = match; + if (all <= 1) { + resultPtr = Tcl_NewObj(); + } + } + for (i = 0; i < objc; i++) { + Tcl_Obj *newPtr; + int start, end; + + if (i < match) { + start = matches[i*2]; + end = matches[i*2 + 1]; + } else { + start = -1; + end = -1; + } + if (indices) { + Tcl_Obj *objs[2]; + + objs[0] = Tcl_NewLongObj(start); + objs[1] = Tcl_NewLongObj((end < 0) ? end : end - 1); + + newPtr = Tcl_NewListObj(2, objs); + } else { + if (i < match) { + newPtr = Tcl_NewStringObj(matchstr + start, end - start); + } else { + newPtr = Tcl_NewObj(); + } + } + if (doinline) { + if (Tcl_ListObjAppendElement(interp, resultPtr, newPtr) + != TCL_OK) { + Tcl_DecrRefCount(newPtr); + Tcl_DecrRefCount(resultPtr); + return TCL_ERROR; + } + } else { + Tcl_Obj *valuePtr; + valuePtr = Tcl_ObjSetVar2(interp, objv[i], NULL, newPtr, 0); + if (valuePtr == NULL) { + Tcl_AppendResult(interp, "couldn't set variable \"", + TclGetString(objv[i]), "\"", NULL); + return TCL_ERROR; + } + } + } + + if (all == 0) { + break; + } + + /* + * Adjust the offset to the character just after the last one in the + * matchVar and increment all to count how many times we are making a + * match. We always increment the offset by at least one to prevent + * endless looping (as in the case: regexp -all {a*} a). Otherwise, + * when we match the NULL string at the end of the input string, we + * will loop indefinately (because the length of the match is 0, so + * offset never changes). + * matches[1] is the match end point of the full RE match. + */ + + if (matches[0] == matches[1]) { + offset++; + } else { + offset = matches[1]; + } + all++; + eflags |= PCRE_NOTBOL; + if (offset >= stringLength) { + break; + } + } + + /* + * Set the interpreter's object result to an integer object with value 1 + * if -all wasn't specified, otherwise it's all-1 (the number of times + * through the while - 1). + */ + + if (doinline) { + Tcl_SetObjResult(interp, resultPtr); + } else { + Tcl_SetObjResult(interp, Tcl_NewIntObj(all ? all-1 : 1)); + } + return TCL_OK; +#else /* !HAVE_PCRE */ + Tcl_AppendResult(interp, "PCRE not available", NULL); + return TCL_ERROR; +#endif +} + /* * Local Variables: * mode: c diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h index 8650776587dc..1b38c5e01a8e 100644 --- a/generic/tclRegexp.h +++ b/generic/tclRegexp.h @@ -16,6 +16,10 @@ #include "regex.h" +#ifdef HAVE_PCRE +#include +#endif + /* * The TclRegexp structure encapsulates a compiled regex_t, the flags that * were used to compile it, and an array of pointers that are used to indicate @@ -28,6 +32,10 @@ typedef struct TclRegexp { int flags; /* Regexp compile flags. */ regex_t re; /* Compiled re, includes number of * subexpressions. */ +#ifdef HAVE_PCRE + pcre *pcre; /* PCRE compile re */ + pcre_extra *study; /* study of PCRE */ +#endif CONST char *string; /* Last string passed to Tcl_RegExpExec. */ Tcl_Obj *objPtr; /* Last object passed to Tcl_RegExpExecObj. */ Tcl_Obj *globObjPtr; /* Glob pattern rep of RE or NULL if none. */ diff --git a/tests/interp.test b/tests/interp.test index 510ab4a5192e..904dc6e8f307 100644 --- a/tests/interp.test +++ b/tests/interp.test @@ -29,7 +29,7 @@ test interp-1.1 {options for interp command} { } {1 {wrong # args: should be "interp cmd ?arg ...?"}} test interp-1.2 {options for interp command} { list [catch {interp frobox} msg] $msg -} {1 {bad option "frobox": must be alias, aliases, bgerror, create, debug, delete, eval, exists, expose, hide, hidden, issafe, invokehidden, limit, marktrusted, recursionlimit, slaves, share, target, or transfer}} +} {1 {bad option "frobox": must be alias, aliases, bgerror, create, debug, delete, eval, exists, expose, hide, hidden, issafe, invokehidden, limit, marktrusted, recursionlimit, regexp, slaves, share, target, or transfer}} test interp-1.3 {options for interp command} { interp delete } "" @@ -47,13 +47,13 @@ test interp-1.6 {options for interp command} { } {1 {wrong # args: should be "interp slaves ?path?"}} test interp-1.7 {options for interp command} { list [catch {interp hello} msg] $msg -} {1 {bad option "hello": must be alias, aliases, bgerror, create, debug, delete, eval, exists, expose, hide, hidden, issafe, invokehidden, limit, marktrusted, recursionlimit, slaves, share, target, or transfer}} +} {1 {bad option "hello": must be alias, aliases, bgerror, create, debug, delete, eval, exists, expose, hide, hidden, issafe, invokehidden, limit, marktrusted, recursionlimit, regexp, slaves, share, target, or transfer}} test interp-1.8 {options for interp command} { list [catch {interp -froboz} msg] $msg -} {1 {bad option "-froboz": must be alias, aliases, bgerror, create, debug, delete, eval, exists, expose, hide, hidden, issafe, invokehidden, limit, marktrusted, recursionlimit, slaves, share, target, or transfer}} +} {1 {bad option "-froboz": must be alias, aliases, bgerror, create, debug, delete, eval, exists, expose, hide, hidden, issafe, invokehidden, limit, marktrusted, recursionlimit, regexp, slaves, share, target, or transfer}} test interp-1.9 {options for interp command} { list [catch {interp -froboz -safe} msg] $msg -} {1 {bad option "-froboz": must be alias, aliases, bgerror, create, debug, delete, eval, exists, expose, hide, hidden, issafe, invokehidden, limit, marktrusted, recursionlimit, slaves, share, target, or transfer}} +} {1 {bad option "-froboz": must be alias, aliases, bgerror, create, debug, delete, eval, exists, expose, hide, hidden, issafe, invokehidden, limit, marktrusted, recursionlimit, regexp, slaves, share, target, or transfer}} test interp-1.10 {options for interp command} { list [catch {interp target} msg] $msg } {1 {wrong # args: should be "interp target path alias"}} @@ -3508,6 +3508,32 @@ test interp-37.1 {safe interps and min() and max(): Bug 2895741} -setup { interp delete a } -result {26 26} +test interp-37.11 {interp regexp} { + list [catch {interp regexp} msg] $msg +} {1 {wrong # args: should be "interp regexp path ?type?"}} +test interp-37.12 {interp regexp} { + list [catch {interp regexp {} invalid} msg] $msg +} {1 {bad type "invalid": must be classic or pcre}} +test interp-37.13 {interp regexp} { + list [catch {interp regexp {} classic bogus} msg] $msg +} {1 {wrong # args: should be "interp regexp path ?type?"}} +test interp-37.14 {interp regexp} -setup { + unset -nocomplain ::env(TCL_REGEXP_PCRE) + interp create slave +} -body { + slave eval {interp regexp {}} +} -cleanup { + interp delete slave +} -result {classic} +test interp-37.15 {interp regexp} -setup { + unset -nocomplain ::env(TCL_REGEXP_PCRE) + interp create slave +} -body { + slave eval {interp regexp {} pcre} +} -cleanup { + interp delete slave +} -result {pcre} + test interp-38.1 {interp debug one-way switch} -setup { catch {interp delete a} interp create a diff --git a/tests/reg.test b/tests/reg.test index 6cd2eb343401..54283eb81258 100644 --- a/tests/reg.test +++ b/tests/reg.test @@ -18,6 +18,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { ::tcltest::testConstraint testregexp [llength [info commands testregexp]] ::tcltest::testConstraint localeRegexp 0 +::tcltest::testConstraint classicre [string equal [interp regexp {}] classic] # This file uses some custom procedures, defined below, for regexp regression # testing. The name of the procedure indicates the general nature of the @@ -177,7 +178,7 @@ namespace eval RETest { # Share the generation of the list of test constraints so it is # done the same on all routes. proc TestConstraints {flags} { - set constraints [list testregexp] + set constraints [list testregexp classicre] variable regBug if {$regBug} { @@ -327,7 +328,8 @@ namespace eval RETest { } } namespace import RETest::* - + + ######## the tests themselves ######## # support functions and preliminary misc. diff --git a/tests/regexp.test b/tests/regexp.test index 362f42569af0..3bbe7d0a17f7 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -15,7 +15,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest 2 namespace import -force ::tcltest::* } - +interp regexp {} pcre testConstraint exec [llength [info commands exec]] catch {unset foo} @@ -225,13 +225,13 @@ test regexp-6.2 {regexp errors} { } {1 {wrong # args: should be "regexp ?switches? exp string ?matchVar? ?subMatchVar subMatchVar ...?"}} test regexp-6.3 {regexp errors} { list [catch {regexp -gorp a} msg] $msg -} {1 {bad switch "-gorp": must be -all, -about, -indices, -inline, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}} -test regexp-6.4 {regexp errors} { - list [catch {regexp a( b} msg] $msg -} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} -test regexp-6.5 {regexp errors} { +} {1 {bad switch "-gorp": must be -all, -about, -indices, -inline, -expanded, -line, -linestop, -lineanchor, -nocase, -start, -type, or --}} +test regexp-6.4 {regexp errors} -body { list [catch {regexp a( b} msg] $msg -} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +} -match glob -result {1 {couldn't compile*}} +test regexp-6.5 {regexp errors} -body { + list [catch {regexp a) b} msg] $msg +} -match glob -result {1 {couldn't compile*}} test regexp-6.6 {regexp errors} { list [catch {regexp a a f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1} msg] $msg } {0 1} @@ -391,10 +391,10 @@ test regexp-11.4 {regsub errors} { } {1 {wrong # args: should be "regsub ?switches? exp string subSpec ?varName?"}} test regexp-11.5 {regsub errors} { list [catch {regsub -gorp a b c} msg] $msg -} {1 {bad switch "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}} -test regexp-11.6 {regsub errors} { +} {1 {bad switch "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, -type, or --}} +test regexp-11.6 {regsub errors} -body { list [catch {regsub -nocase a( b c d} msg] $msg -} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +} -match glob -result {1 {couldn't compile*}} test regexp-11.7 {regsub errors} { catch {unset f1} set f1 44 diff --git a/tests/regexpComp.test b/tests/regexpComp.test index e927ca22d0e7..b173a1cbac17 100644 --- a/tests/regexpComp.test +++ b/tests/regexpComp.test @@ -325,17 +325,17 @@ test regexpComp-6.3 {regexp errors} { evalInProc { list [catch {regexp -gorp a} msg] $msg } -} {1 {bad switch "-gorp": must be -all, -about, -indices, -inline, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}} -test regexpComp-6.4 {regexp errors} { +} {1 {bad switch "-gorp": must be -all, -about, -indices, -inline, -expanded, -line, -linestop, -lineanchor, -nocase, -start, -type, or --}} +test regexpComp-6.4 {regexp errors} -body { evalInProc { list [catch {regexp a( b} msg] $msg } -} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} -test regexpComp-6.5 {regexp errors} { +} -match glob -result {1 {couldn't compile*}} +test regexpComp-6.5 {regexp errors} -body { evalInProc { - list [catch {regexp a( b} msg] $msg + list [catch {regexp a) b} msg] $msg } -} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +} -match glob -result {1 {couldn't compile*}} test regexpComp-6.6 {regexp errors} { evalInProc { list [catch {regexp a a f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1} msg] $msg @@ -581,12 +581,12 @@ test regexpComp-11.5 {regsub errors} { evalInProc { list [catch {regsub -gorp a b c} msg] $msg } -} {1 {bad switch "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}} -test regexpComp-11.6 {regsub errors} { +} {1 {bad switch "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, -type, or --}} +test regexpComp-11.6 {regsub errors} -body { evalInProc { list [catch {regsub -nocase a( b c d} msg] $msg } -} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +} -match glob -result {1 {couldn't compile*}} test regexpComp-11.7 {regsub errors} { evalInProc { catch {unset f1} @@ -959,12 +959,12 @@ test regexpComp-24.8 {regexp command compiling tests} { regexp -- $re dogfod } } 0 -test regexpComp-24.9 {regexp command compiling tests} { +test regexpComp-24.9 {regexp command compiling tests} -body { evalInProc { set re "(" list [catch {regexp -- $re dogfod} msg] $msg } -} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +} -match glob -result {1 {couldn't compile*}} test regexpComp-24.10 {regexp command compiling tests} { # Bug 1902436 - last * escaped evalInProc { diff --git a/unix/Makefile.in b/unix/Makefile.in index 50f4bb3166c7..4acb3fa9e7c9 100644 --- a/unix/Makefile.in +++ b/unix/Makefile.in @@ -256,14 +256,14 @@ GDB = gdb #-------------------------------------------------------------------------- STUB_CC_SWITCHES = ${CFLAGS} ${CFLAGS_WARNING} ${SHLIB_CFLAGS} \ --I"${BUILD_DIR}" -I${UNIX_DIR} -I${GENERIC_DIR} -I${TOMMATH_DIR} \ +-I"${BUILD_DIR}" -I${UNIX_DIR} -I${GENERIC_DIR} -I${TOMMATH_DIR} @PCRE_INCLUDE@ \ ${AC_FLAGS} ${PROTO_FLAGS} ${ENV_FLAGS} ${EXTRA_CFLAGS} @EXTRA_CC_SWITCHES@ CC_SWITCHES = $(STUB_CC_SWITCHES) ${NO_DEPRECATED_FLAGS} APP_CC_SWITCHES = $(CC_SWITCHES) @EXTRA_APP_CC_SWITCHES@ -LIBS = @TCL_LIBS@ +LIBS = @TCL_LIBS@ @PCRE_LIBS@ DEPEND_SWITCHES = ${CFLAGS} -I${UNIX_DIR} -I${GENERIC_DIR} \ ${AC_FLAGS} ${PROTO_FLAGS} ${EXTRA_CFLAGS} @EXTRA_CC_SWITCHES@ diff --git a/unix/configure b/unix/configure index 7ff9f7257222..3cef4930681d 100755 --- a/unix/configure +++ b/unix/configure @@ -308,7 +308,7 @@ ac_includes_default="\ # include #endif" -ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS MAN_FLAGS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT CPP EGREP TCL_THREADS RANLIB ac_ct_RANLIB AR ac_ct_AR TCL_LIBS DL_LIBS DL_OBJS PLAT_OBJS PLAT_SRCS LDAIX_SRC CFLAGS_DEBUG CFLAGS_OPTIMIZE CFLAGS_WARNING LDFLAGS_DEBUG LDFLAGS_OPTIMIZE CC_SEARCH_FLAGS LD_SEARCH_FLAGS STLIB_LD SHLIB_LD TCL_SHLIB_LD_EXTRAS TK_SHLIB_LD_EXTRAS SHLIB_LD_LIBS SHLIB_CFLAGS SHLIB_SUFFIX MAKE_LIB MAKE_STUB_LIB INSTALL_LIB DLL_INSTALL_DIR INSTALL_STUB_LIB CFLAGS_DEFAULT LDFLAGS_DEFAULT LIBOBJS DTRACE TCL_VERSION TCL_MAJOR_VERSION TCL_MINOR_VERSION TCL_PATCH_LEVEL TCL_YEAR TCL_LIB_FILE TCL_LIB_FLAG TCL_LIB_SPEC TCL_STUB_LIB_FILE TCL_STUB_LIB_FLAG TCL_STUB_LIB_SPEC TCL_STUB_LIB_PATH TCL_INCLUDE_SPEC TCL_BUILD_STUB_LIB_SPEC TCL_BUILD_STUB_LIB_PATH TCL_SRC_DIR CFG_TCL_SHARED_LIB_SUFFIX CFG_TCL_UNSHARED_LIB_SUFFIX TCL_SHARED_BUILD LD_LIBRARY_PATH_VAR TCL_BUILD_LIB_SPEC TCL_LIB_VERSIONS_OK TCL_SHARED_LIB_SUFFIX TCL_UNSHARED_LIB_SUFFIX TCL_HAS_LONGLONG INSTALL_TZDATA DTRACE_SRC DTRACE_HDR DTRACE_OBJ MAKEFILE_SHELL BUILD_DLTEST TCL_PACKAGE_PATH TCL_MODULE_PATH TCL_LIBRARY PRIVATE_INCLUDE_DIR HTML_DIR EXTRA_CC_SWITCHES EXTRA_APP_CC_SWITCHES EXTRA_INSTALL EXTRA_INSTALL_BINARIES EXTRA_BUILD_HTML EXTRA_TCLSH_LIBS DLTEST_LD DLTEST_SUFFIX' +ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS MAN_FLAGS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT CPP EGREP TCL_THREADS RANLIB ac_ct_RANLIB AR ac_ct_AR TCL_LIBS DL_LIBS DL_OBJS PLAT_OBJS PLAT_SRCS LDAIX_SRC CFLAGS_DEBUG CFLAGS_OPTIMIZE CFLAGS_WARNING LDFLAGS_DEBUG LDFLAGS_OPTIMIZE CC_SEARCH_FLAGS LD_SEARCH_FLAGS STLIB_LD SHLIB_LD TCL_SHLIB_LD_EXTRAS TK_SHLIB_LD_EXTRAS SHLIB_LD_LIBS SHLIB_CFLAGS SHLIB_SUFFIX MAKE_LIB MAKE_STUB_LIB INSTALL_LIB DLL_INSTALL_DIR INSTALL_STUB_LIB CFLAGS_DEFAULT PCRE_INCLUDE PCRE_LIBS LDFLAGS_DEFAULT LIBOBJS DTRACE TCL_VERSION TCL_MAJOR_VERSION TCL_MINOR_VERSION TCL_PATCH_LEVEL TCL_YEAR TCL_LIB_FILE TCL_LIB_FLAG TCL_LIB_SPEC TCL_STUB_LIB_FILE TCL_STUB_LIB_FLAG TCL_STUB_LIB_SPEC TCL_STUB_LIB_PATH TCL_INCLUDE_SPEC TCL_BUILD_STUB_LIB_SPEC TCL_BUILD_STUB_LIB_PATH TCL_SRC_DIR CFG_TCL_SHARED_LIB_SUFFIX CFG_TCL_UNSHARED_LIB_SUFFIX TCL_SHARED_BUILD LD_LIBRARY_PATH_VAR TCL_BUILD_LIB_SPEC TCL_LIB_VERSIONS_OK TCL_SHARED_LIB_SUFFIX TCL_UNSHARED_LIB_SUFFIX TCL_HAS_LONGLONG INSTALL_TZDATA DTRACE_SRC DTRACE_HDR DTRACE_OBJ MAKEFILE_SHELL BUILD_DLTEST TCL_PACKAGE_PATH TCL_MODULE_PATH TCL_LIBRARY PRIVATE_INCLUDE_DIR HTML_DIR EXTRA_CC_SWITCHES EXTRA_APP_CC_SWITCHES EXTRA_INSTALL EXTRA_INSTALL_BINARIES EXTRA_BUILD_HTML EXTRA_TCLSH_LIBS DLTEST_LD DLTEST_SUFFIX' ac_subst_files='' # Initialize some variables set by options. @@ -857,6 +857,7 @@ Optional Features: --enable-load allow dynamic loading and "load" command (default: on) --enable-symbols build with debugging symbols (default: off) + --enable-pcre whether to enable pcre (default: off) --enable-langinfo use nl_langinfo if possible to determine encoding at startup, otherwise use old heuristic (default: on) --enable-dll-unloading enable the 'unload' command (default: on) @@ -869,6 +870,7 @@ Optional Packages: --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) --with-encoding encoding for configuration values (default: iso8859-1) + --with-pcre directory containing pcre headers and libraries --with-tzdata install timezone data (default: autodetect) Some influential environment variables: @@ -13778,6 +13780,127 @@ _ACEOF fi +#------------------------------------------------------------------------------ +# Check if we want to use pcre +#------------------------------------------------------------------------------ + + + +# Check whether --with-pcre or --without-pcre was given. +if test "${with_pcre+set}" = set; then + withval="$with_pcre" + with_pcre=${withval} +fi; + echo "$as_me:$LINENO: checking for PCRE configuration" >&5 +echo $ECHO_N "checking for PCRE configuration... $ECHO_C" >&6 + + if test "${ac_cv_c_pcre+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + + # First check to see if --with-pcre was specified. + if test x"${with_pcre}" != x ; then + if test -f "${with_pcre}/include/pcre.h" -a \ + \( -f "${with_pcre}/lib/libpcre.so" -o \ + -f "${with_pcre}/lib/libpcre.a" \); then + ac_cv_c_pcre=`(cd ${with_pcre}; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}/include" + PCRE_LIBS="-L${ac_cv_c_pcre}/lib -lpcre" + else + { { echo "$as_me:$LINENO: error: ${with_pcre} directory doesn't contain pcre header and/or library" >&5 +echo "$as_me: error: ${with_pcre} directory doesn't contain pcre header and/or library" >&2;} + { (exit 1); exit 1; }; } + fi + fi + + if test x"${ac_cv_c_pcre}" = x ; then + # Try pcre-config if it exists + ac_cv_c_pcre=`pcre-config --prefix 2>/dev/null` + if test "$?" -ne 0; then + PCRE_INCLUDE=`pcre-config --cflags 2>/dev/null` + PCRE_LIBS=`pcre-config --libs 2>/dev/null` + fi + fi + + # check in a few common install locations + if test x"${ac_cv_c_pcre}" = x ; then + for i in \ + `ls -d ${exec_prefix} 2>/dev/null` \ + `ls -d ${prefix} 2>/dev/null` \ + `ls -d /usr/local 2>/dev/null` \ + `ls -d /usr/contrib 2>/dev/null` \ + `ls -d /usr 2>/dev/null` \ + ; do + if test -f "${i}/include/pcre.h" -a \ + \( -f "${i}/lib/libpcre.so" -o \ + -f "${i}/lib/libpcre.a" \); then + ac_cv_c_pcre=`(cd $i; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}/include" + PCRE_LIBS="-L${ac_cv_c_pcre}/lib -lpcre" + break + fi + done + fi + +fi + + + if test x"${ac_cv_c_pcre}" = x ; then + { echo "$as_me:$LINENO: WARNING: Can't find PCRE configuration, PCRE won't be used" >&5 +echo "$as_me: WARNING: Can't find PCRE configuration, PCRE won't be used" >&2;} + else + echo "$as_me:$LINENO: result: found PCRE configuration at ${ac_cv_c_pcre}" >&5 +echo "${ECHO_T}found PCRE configuration at ${ac_cv_c_pcre}" >&6 + +cat >>confdefs.h <<\_ACEOF +#define HAVE_PCRE 1 +_ACEOF + + fi + + + + + + echo "$as_me:$LINENO: checking whether to enable pcre in Tcl" >&5 +echo $ECHO_N "checking whether to enable pcre in Tcl... $ECHO_C" >&6 + # Check whether --enable-pcre or --disable-pcre was given. +if test "${enable_pcre+set}" = set; then + enableval="$enable_pcre" + pcre_ok=$enableval +else + pcre_ok=yes +fi; + + if test "${enable_pcre+set}" = set; then + enableval="$enable_pcre" + pcre_ok=$enableval + else + pcre_ok=yes + fi + + if test x"${ac_cv_c_pcre}" = x ; then + echo "$as_me:$LINENO: result: pcre configuration not found" >&5 +echo "${ECHO_T}pcre configuration not found" >&6 + else + if test "$pcre_ok" = "default" ; then + echo "$as_me:$LINENO: result: pcre default" >&5 +echo "${ECHO_T}pcre default" >&6 + +cat >>confdefs.h <<\_ACEOF +#define USE_DEFAULT_PCRE 1 +_ACEOF + + elif test "$pcre_ok" = "yes" ; then + echo "$as_me:$LINENO: result: pcre enabled" >&5 +echo "${ECHO_T}pcre enabled" >&6 + else + echo "$as_me:$LINENO: result: no pcre" >&5 +echo "${ECHO_T}no pcre" >&6 + fi + fi + + #-------------------------------------------------------------------- # Some systems (e.g., IRIX 4.0.5) lack some fields in struct stat. But # we might be able to use fstatfs instead. Some systems (OpenBSD?) also @@ -19602,6 +19725,8 @@ s,@DLL_INSTALL_DIR@,$DLL_INSTALL_DIR,;t t s,@INSTALL_STUB_LIB@,$INSTALL_STUB_LIB,;t t s,@CFLAGS_DEFAULT@,$CFLAGS_DEFAULT,;t t s,@LDFLAGS_DEFAULT@,$LDFLAGS_DEFAULT,;t t +s,@PCRE_INCLUDE@,$PCRE_INCLUDE,;t t +s,@PCRE_LIBS@,$PCRE_LIBS,;t t s,@LIBOBJS@,$LIBOBJS,;t t s,@DTRACE@,$DTRACE,;t t s,@TCL_VERSION@,$TCL_VERSION,;t t diff --git a/unix/configure.in b/unix/configure.in index e4255b6dfa14..c4cda3894604 100644 --- a/unix/configure.in +++ b/unix/configure.in @@ -241,6 +241,12 @@ fi SC_TIME_HANDLER +#------------------------------------------------------------------------------ +# Check if we want to use pcre +#------------------------------------------------------------------------------ + +SC_ENABLE_PCRE + #-------------------------------------------------------------------- # Some systems (e.g., IRIX 4.0.5) lack some fields in struct stat. But # we might be able to use fstatfs instead. Some systems (OpenBSD?) also diff --git a/unix/tcl.m4 b/unix/tcl.m4 index 6b6d3731c35c..e7a72756a39a 100644 --- a/unix/tcl.m4 +++ b/unix/tcl.m4 @@ -499,6 +499,128 @@ AC_DEFUN([SC_BUILD_TCLSH], [ AC_SUBST(BUILD_TCLSH) ]) +#------------------------------------------------------------------------ +# SC_WITH_PCRE -- +# +# Finds the PCRE header and library files for use with Tcl +# +# Arguments: +# none +# +# Results: +# +# Adds the following arguments to configure: +# --with-pcre=/path/to/pcre +# +# Sets the following vars: +# PCRE_DIR +#------------------------------------------------------------------------ + +AC_DEFUN([SC_WITH_PCRE], [ + AC_ARG_WITH(pcre, + AC_HELP_STRING([--with-pcre], + [directory containing pcre headers and libraries]), + [with_pcre=${withval}]) + AC_MSG_CHECKING([for PCRE configuration]) + + AC_CACHE_VAL(ac_cv_c_pcre,[ + # First check to see if --with-pcre was specified. + if test x"${with_pcre}" != x ; then + if test -f "${with_pcre}/include/pcre.h" -a \ + \( -f "${with_pcre}/lib/libpcre.so" -o \ + -f "${with_pcre}/lib/libpcre.a" \); then + ac_cv_c_pcre=`(cd ${with_pcre}; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}/include" + PCRE_LIBS="-L${ac_cv_c_pcre}/lib -lpcre" + else + AC_MSG_ERROR([${with_pcre} directory doesn't contain pcre header and/or library]) + fi + fi + + if test x"${ac_cv_c_pcre}" = x ; then + # Try pcre-config if it exists + ac_cv_c_pcre=`pcre-config --prefix 2>/dev/null` + if test "$?" -ne 0; then + PCRE_INCLUDE=`pcre-config --cflags 2>/dev/null` + PCRE_LIBS=`pcre-config --libs 2>/dev/null` + fi + fi + + # check in a few common install locations + if test x"${ac_cv_c_pcre}" = x ; then + for i in \ + `ls -d ${exec_prefix} 2>/dev/null` \ + `ls -d ${prefix} 2>/dev/null` \ + `ls -d /usr/local 2>/dev/null` \ + `ls -d /usr/contrib 2>/dev/null` \ + `ls -d /usr 2>/dev/null` \ + ; do + if test -f "${i}/include/pcre.h" -a \ + \( -f "${i}/lib/libpcre.so" -o \ + -f "${i}/lib/libpcre.a" \); then + ac_cv_c_pcre=`(cd $i; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}/include" + PCRE_LIBS="-L${ac_cv_c_pcre}/lib -lpcre" + break + fi + done + fi + ]) + + if test x"${ac_cv_c_pcre}" = x ; then + AC_MSG_WARN([Can't find PCRE configuration, PCRE won't be used]) + else + AC_MSG_RESULT([found PCRE configuration at ${ac_cv_c_pcre}]) + fi + AC_SUBST([PCRE_INCLUDE]) + AC_SUBST([PCRE_LIBS]) +]) + +#------------------------------------------------------------------------ +# SC_ENABLE_PCRE -- +# +# Allows the use of PCRE in Tcl as default +# +# Arguments: +# none +# +# Results: +# Adds the following arguments to configure: +# --enable-pcre=yes|no|pcre +# +#------------------------------------------------------------------------ + +AC_DEFUN([SC_ENABLE_PCRE], [ + AC_REQUIRE([SC_WITH_PCRE]) + AC_MSG_CHECKING([whether to enable pcre in Tcl]) + AC_ARG_ENABLE(pcre, + AC_HELP_STRING([--enable-pcre], + [whether to enable pcre (default: off)]), + [pcre_ok=$enableval], [pcre_ok=yes]) + + if test "${enable_pcre+set}" = set; then + enableval="$enable_pcre" + pcre_ok=$enableval + else + pcre_ok=yes + fi + + if test x"${ac_cv_c_pcre}" = x ; then + AC_MSG_RESULT([pcre configuration not found]) + else + if test "$pcre_ok" = "default" ; then + AC_MSG_RESULT([pcre default]) + AC_DEFINE(USE_DEFAULT_PCRE, 1, [Use PCRE as default RE?]) + AC_DEFINE(HAVE_PCRE, 1, [Do we enable PCRE interfaces?]) + elif test "$pcre_ok" = "yes" ; then + AC_MSG_RESULT([pcre enabled]) + AC_DEFINE(HAVE_PCRE, 1, [Do we enable PCRE interfaces?]) + else + AC_MSG_RESULT([no pcre]) + fi + fi +]) + #------------------------------------------------------------------------ # SC_ENABLE_SHARED -- # From ebe1515483d79966e4949c0ad57021922a46a7ca Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 14 Nov 2017 11:44:44 +0100 Subject: [PATCH 02/27] fixed compiling without HAVE_PCRE (no matches variable in TSD in such case) --- generic/tclRegexp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 17be0f89d7dd..18f4c1b09f7e 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -1232,9 +1232,11 @@ FinalizeRegexp( ckfree(tsdPtr->patterns[i]); tsdPtr->patterns[i] = NULL; } +#ifdef HAVE_PCRE if (tsdPtr->matches != NULL) { ckfree((char *) tsdPtr->matches); } +#endif /* * We may find ourselves reinitialized if another finalization routine * invokes regexps. From ec10c97534dbe31b44b6e1cf8284563a80e3fe01 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 14 Nov 2017 16:05:23 +0100 Subject: [PATCH 03/27] [win] Introduces nmake command line parameter ADDLINKOPTS, to specify additionally linker options (e. g. to compile with PCRE), for example: set PCREDIR="..\..\lib\pcre" nmake -nologo -f makefile.vc release OPTS=threads,thrdalloc OPTIMIZATIONS="-DHAVE_PCRE -I%PCREDIR% -Ox -Ot -Oi -Gs" ADDLINKOPTS="%PCREDIR%\pcre.lib" --- win/rules.vc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/win/rules.vc b/win/rules.vc index 2edaa49ec3ff..9e53e222a634 100644 --- a/win/rules.vc +++ b/win/rules.vc @@ -447,6 +447,9 @@ This compiler does not support profile guided optimization. !endif !endif +# Via command line specified linker options (e. g. to compile with PCRE): +LINKERFLAGS = $(LINKERFLAGS) $(ADDLINKOPTS) + #---------------------------------------------------------- # Set our defines now armed with our options. #---------------------------------------------------------- From bdd7096e20db09e2fcd0d3340d1e740e30e942ff Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 14 Nov 2017 15:37:38 +0100 Subject: [PATCH 04/27] recognize explicit type of regexp engine used (avoid usage of default interp engine if specified as parameter), test cases extended in order to cover this situation; bug fixing (test-cases repaired) in case of no HAVE_PCRE; several optimizations and code review (e. g. option -type compiled now if token is a simple word); --- generic/tcl.h | 4 +- generic/tclCmdMZ.c | 2 + generic/tclCompCmds.c | 37 +++++++++++++----- generic/tclExecute.c | 8 ---- generic/tclInterp.c | 2 + generic/tclRegexp.c | 7 +++- tests/regexp.test | 2 +- tests/regexp2.test | 87 +++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 127 insertions(+), 22 deletions(-) create mode 100644 tests/regexp2.test diff --git a/generic/tcl.h b/generic/tcl.h index ecba3f851c6f..6ce14052542c 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -559,7 +559,9 @@ typedef void (Tcl_ThreadCreateProc) _ANSI_ARGS_((ClientData clientData)); #define TCL_REG_NEWLINE 000300 /* Newlines are line terminators. */ #define TCL_REG_CANMATCH 001000 /* Report details on partial/limited * matches. */ -#define TCL_REG_PCRE 0x08000000 /* Make sure it doesn't conflict with +#define TCL_REG_EXPLTYPE 0x10000000 /* Explicit type (avoid usage of + * default interp engine, mean it specified as parameter) */ +#define TCL_REG_PCRE 0x20000000 /* Make sure it doesn't conflict with * existing TCL_REG_* or PCRE_* bits */ /* diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 66eaf87a220b..2d09ca74971c 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -179,6 +179,8 @@ Tcl_RegexpObjCmd( if (++i >= objc) { goto endOfForLoop; } + /* explicit specified */ + cflags |= TCL_REG_EXPLTYPE; if (Tcl_GetIndexFromObj(interp, objv[i], re_type_opts, "type", 0, &re_type) != TCL_OK) { goto optionError; diff --git a/generic/tclCompCmds.c b/generic/tclCompCmds.c index 3b234b0fe84b..76ec9f374bff 100644 --- a/generic/tclCompCmds.c +++ b/generic/tclCompCmds.c @@ -2979,24 +2979,22 @@ TclCompileRegexpCmd( { Tcl_Token *varTokenPtr; /* Pointer to the Tcl_Token representing the * parse of the RE or string. */ - int i, len, nocase, exact, sawLast, simple; + int i, len, exact, sawLast = 0, simple = 0, + cflags = TCL_REG_ADVANCED; char *str; DefineLineInformation; /* TIP #280 */ /* * We are only interested in compiling simple regexp cases. Currently * supported compile cases are: - * regexp ?-nocase? ?--? staticString $var - * regexp ?-nocase? ?--? {^staticString$} $var + * regexp ?-nocase? ?-type ...? ?--? staticString $var + * regexp ?-nocase? ?-type ...? ?--? {^staticString$} $var */ if (parsePtr->numWords < 3) { return TCL_ERROR; } - simple = 0; - nocase = 0; - sawLast = 0; varTokenPtr = parsePtr->tokenPtr; /* @@ -3020,8 +3018,27 @@ TclCompileRegexpCmd( sawLast++; i++; break; - } else if ((len > 1) && (strncmp(str,"-nocase",(unsigned)len) == 0)) { - nocase = 1; + } else if ((len-- > 1) && *str++ == '-') { + if (strncmp(str, "nocase", (unsigned)len) == 0) { + cflags |= TCL_REG_NOCASE; + } else if (strncmp(str, "type", (unsigned)len) == 0) { + i++; + varTokenPtr = TokenAfter(varTokenPtr); + if (varTokenPtr->type != TCL_TOKEN_SIMPLE_WORD) { + return TCL_ERROR; /* runtime */ + } + str = (char *) varTokenPtr[1].start; + len = varTokenPtr[1].size; + if (strncmp(str, "classic", (unsigned)len) == 0) { + cflags = (cflags & ~TCL_REG_PCRE) | TCL_REG_EXPLTYPE; + } else if (strncmp(str, "pcre", (unsigned)len) == 0) { + cflags |= TCL_REG_PCRE | TCL_REG_EXPLTYPE; + } else { + return TCL_ERROR; /* runtime */ + } + } else { + return TCL_ERROR; /* runtime */ + } } else { /* * Not an option we recognize. @@ -3094,9 +3111,10 @@ TclCompileRegexpCmd( CompileWord(envPtr, varTokenPtr, interp, parsePtr->numWords-1); if (simple) { - if (exact && !nocase) { + if (exact && !(cflags & TCL_REG_NOCASE)) { TclEmitOpcode(INST_STR_EQ, envPtr); } else { + int nocase = (cflags & TCL_REG_NOCASE) ? 1 : 0; TclEmitInstInt1(INST_STR_MATCH, nocase, envPtr); } } else { @@ -3105,7 +3123,6 @@ TclCompileRegexpCmd( * that handles all the flags we want to pass. * Don't use TCL_REG_NOSUB as we may have backrefs. */ - int cflags = TCL_REG_ADVANCED | (nocase ? TCL_REG_NOCASE : 0); TclEmitInstInt1(INST_REGEXP, cflags, envPtr); } diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 8eebea51271f..bfb4356dc028 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -4491,14 +4491,6 @@ TclExecuteByteCode( valuePtr = OBJ_AT_TOS; /* String */ value2Ptr = OBJ_UNDER_TOS; /* Pattern */ - /* - * cflags won't use PCRE flag indicator during compilation - * XXX may use TCL_REG_ADVANCED to indicate -type classic for - * XXX compilation, but currently -type isn't compiled - */ - if (((Interp *)interp)->flags & INTERP_PCRE) { - cflags |= TCL_REG_PCRE; - } regExpr = Tcl_GetRegExpFromObj(interp, value2Ptr, cflags); if (regExpr == NULL) { match = -1; diff --git a/generic/tclInterp.c b/generic/tclInterp.c index d50b6643e953..2d9be6511e45 100644 --- a/generic/tclInterp.c +++ b/generic/tclInterp.c @@ -976,6 +976,8 @@ Tcl_InterpObjCmd( } else { slaveInterp->flags &= ~(INTERP_PCRE); } + Tcl_SetObjResult(interp, objv[3]); + return TCL_OK; } if (slaveInterp->flags & INTERP_PCRE) { Tcl_SetObjResult(interp, Tcl_NewStringObj("pcre", -1)); diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 18f4c1b09f7e..a4e7d603ec82 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -671,10 +671,13 @@ Tcl_GetRegExpFromObj( regexpPtr = (TclRegexp *) objPtr->internalRep.twoPtrValue.ptr1; - /* XXX Need to have case where -type classic isn't ignored in regexp/sub */ - if ((interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE)) { + /* if type was not explicit specified */ + if ( !(flags & TCL_REG_EXPLTYPE) + && (interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE) + ) { flags |= TCL_REG_PCRE; } + if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { pattern = TclGetStringFromObj(objPtr, &length); diff --git a/tests/regexp.test b/tests/regexp.test index 3bbe7d0a17f7..93cf32ba33f8 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -15,7 +15,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest 2 namespace import -force ::tcltest::* } -interp regexp {} pcre + testConstraint exec [llength [info commands exec]] catch {unset foo} diff --git a/tests/regexp2.test b/tests/regexp2.test new file mode 100644 index 000000000000..7ff4bba22cce --- /dev/null +++ b/tests/regexp2.test @@ -0,0 +1,87 @@ +# Commands covered: regexp, regsub +# +# This file contains a collection of tests for one or more of the Tcl +# built-in commands. Sourcing this file into Tcl runs the tests and +# generates output for errors. No output means no errors were found. +# +# Copyright (c) 1991-1993 The Regents of the University of California. +# Copyright (c) 1998 Sun Microsystems, Inc. +# Copyright (c) 1998-1999 by Scriptics Corporation. +# +# See the file "license.terms" for information on usage and redistribution +# of this file, and for a DISCLAIMER OF ALL WARRANTIES. + +if {[lsearch [namespace children] ::tcltest] == -1} { + package require tcltest 2 + namespace import -force ::tcltest::* +} + +# prepare: +namespace inscope ::tcltest {rename cleanupTests _cleanupTests; proc cleanupTests {args} {}} +rename test __test +proc test {args} {} + +catch { + + # test all available regexp engines (switch default to another one): + set org_reeng [interp regexp {}] + + foreach reeng {classic pcre} { + + proc _test_reeng_available {} { + variable reeng + interp regexp {} $reeng + # note we should not use simple regexp here (to avoid compile it via TclReToGlob to the glob expr): + if {[catch { regexp -- {^(_)(?!_)$} {_} } errMsg]} { + puts "ignore test of engine \"$reeng\": $errMsg" + return 0 + } + return 1 + } + + testConstraint reeng_$reeng [_test_reeng_available] + + # wrapper for tests from regexp.test: + proc test {args} { + variable reeng + uplevel [list __test [lindex $args 0]-$reeng {*}[lrange $args 1 end]] + } + +# --------------------------------------------------------------------------- + +# special engine-related tests: + +test regexp-0.1 {test of interim "switch" of engine} -body { + # this test will fail if current engine is not classic (no lookbehind syntax), and if it will be not + # switched via parameter -type classic (regardless PCRE available or not) + set re {(? Date: Wed, 15 Nov 2017 00:03:38 +0100 Subject: [PATCH 05/27] **interim commit** bug fixing: note ready --- generic/tcl.h | 5 + generic/tclCmdMZ.c | 434 ++++++++++++++++++++++++++++++------------ generic/tclCompCmds.c | 4 +- generic/tclExecute.c | 1 + generic/tclRegexp.c | 15 +- 5 files changed, 330 insertions(+), 129 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 6ce14052542c..23125100bf0d 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -564,6 +564,11 @@ typedef void (Tcl_ThreadCreateProc) _ANSI_ARGS_((ClientData clientData)); #define TCL_REG_PCRE 0x20000000 /* Make sure it doesn't conflict with * existing TCL_REG_* or PCRE_* bits */ +/* Following two macros used to supply TCL_REG_PCRE and TCL_REG_EXPLTYPE +/* to INST_REGEXP over one byte op (instead of TCL_REG_ADVANCED, that is always set) */ +#define TCL_REG_COMPILE_SHIFT(v) ((v>>28)&000003) +#define TCL_REG_COMPILE_UNSHIFT(v) (((v&000003)<<28)|TCL_REG_ADVANCED) + /* * Flags values passed to Tcl_RegExpExecObj. */ diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 2d09ca74971c..1bd284ecd7f8 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -110,11 +110,7 @@ Tcl_RegexpObjCmd( indices = 0; about = 0; -#ifdef USE_DEFAULT_PCRE - re_type = RETYPE_PCRE; -#else - re_type = RETYPE_CLASSIC; -#endif + re_type = 0; cflags = TCL_REG_ADVANCED; offset = 0; all = 0; @@ -185,6 +181,11 @@ Tcl_RegexpObjCmd( 0, &re_type) != TCL_OK) { goto optionError; } + if ((enum re_type_opts) re_type == RETYPE_PCRE) { + cflags |= TCL_REG_PCRE; + } else { + cflags &= ~TCL_REG_PCRE; + } break; case REGEXP_LAST: i++; @@ -216,6 +217,13 @@ Tcl_RegexpObjCmd( return TCL_ERROR; } + /* own re-type from interp, if type was not explicit specified */ + if ( !(cflags & TCL_REG_EXPLTYPE) + && (interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE) + ) { + cflags |= TCL_REG_PCRE; + } + /* * Get the length of the string that we are matching against so we can do * the termination test for -all matches. Do this before getting the @@ -225,7 +233,7 @@ Tcl_RegexpObjCmd( if (startIndex) { int stringLength; - if ((enum re_type_opts) re_type == RETYPE_CLASSIC) { + if (!(cflags & TCL_REG_PCRE)) { stringLength = Tcl_GetCharLength(objv[1]); } else { if (objv[1]->typePtr == &tclByteArrayType) { @@ -242,20 +250,12 @@ Tcl_RegexpObjCmd( } } - /* - * Handle the odd about case separately, otherwise pass of to appropriate - * RE engine. - */ - - if ((enum re_type_opts) re_type == RETYPE_PCRE) { - cflags |= TCL_REG_PCRE; - } regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); if (regExpr == NULL) { return TCL_ERROR; } - if ((enum re_type_opts) re_type == RETYPE_CLASSIC) { + if (!(cflags & TCL_REG_PCRE)) { if (about) { if (TclRegAbout(interp, regExpr) < 0) { return TCL_ERROR; @@ -324,12 +324,7 @@ Tcl_RegsubObjCmd( RETYPE_CLASSIC, RETYPE_PCRE, }; -#ifdef USE_DEFAULT_PCRE - re_type = RETYPE_PCRE; -#else - re_type = RETYPE_CLASSIC; -#endif - cflags = TCL_REG_ADVANCED; + re_type = 0; cflags = TCL_REG_ADVANCED; all = 0; offset = 0; resultPtr = NULL; @@ -384,10 +379,16 @@ Tcl_RegsubObjCmd( if (++idx >= objc) { goto endOfForLoop; } + cflags |= TCL_REG_EXPLTYPE; if (Tcl_GetIndexFromObj(interp, objv[idx], re_type_opts, "type", 0, &re_type) != TCL_OK) { goto optionError; } + if ((enum re_type_opts) re_type == RETYPE_PCRE) { + cflags |= TCL_REG_PCRE; + } else { + cflags &= ~TCL_REG_PCRE; + } break; case REGSUB_LAST: idx++; @@ -409,10 +410,17 @@ Tcl_RegsubObjCmd( objc -= idx; objv += idx; + /* own re-type from interp, if type was not explicit specified */ + if ( !(cflags & TCL_REG_EXPLTYPE) + && (interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE) + ) { + cflags |= TCL_REG_PCRE; + } + if (startIndex) { int stringLength; - if ((enum re_type_opts) re_type == RETYPE_CLASSIC) { + if (!(cflags & TCL_REG_PCRE)) { stringLength = Tcl_GetCharLength(objv[1]); } else { if (objv[1]->typePtr == &tclByteArrayType) { @@ -429,7 +437,7 @@ Tcl_RegsubObjCmd( } } - if (all && (offset == 0) + if (all && (offset == 0) && !(cflags & TCL_REG_PCRE) && (strpbrk(TclGetString(objv[2]), "&\\") == NULL) && (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) { /* @@ -500,9 +508,6 @@ Tcl_RegsubObjCmd( goto regsubDone; } - if ((enum re_type_opts) re_type == RETYPE_PCRE) { - cflags |= TCL_REG_PCRE; - } regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); if (regExpr == NULL) { return TCL_ERROR; @@ -519,152 +524,327 @@ Tcl_RegsubObjCmd( } else { objPtr = objv[1]; } - wstring = Tcl_GetUnicodeFromObj(objPtr, &wlen); if (objv[2] == objv[0]) { subPtr = Tcl_DuplicateObj(objv[2]); } else { subPtr = objv[2]; } - wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen); result = TCL_OK; - /* - * The following loop is to handle multiple matches within the same source - * string; each iteration handles one match and its corresponding - * substitution. If "-all" hasn't been specified then the loop body only - * gets executed once. We must use 'offset <= wlen' in particular for the - * case where the regexp pattern can match the empty string - this is - * useful when doing, say, 'regsub -- ^ $str ...' when $str might be - * empty. + /* + * PCRE works utf8, thus differentiate between utf8 and unicode processing. */ + if (!(cflags & TCL_REG_PCRE)) { - numMatches = 0; - for ( ; offset <= wlen; ) { + /* Classic as unicode */ + + wstring = Tcl_GetUnicodeFromObj(objPtr, &wlen); + wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen); /* - * The flags argument is set if string is part of a larger string, so - * that "^" won't match. + * The following loop is to handle multiple matches within the same source + * string; each iteration handles one match and its corresponding + * substitution. If "-all" hasn't been specified then the loop body only + * gets executed once. We must use 'offset <= wlen' in particular for the + * case where the regexp pattern can match the empty string - this is + * useful when doing, say, 'regsub -- ^ $str ...' when $str might be + * empty. */ - match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, - 10 /* matches */, ((offset > 0 && - (wstring[offset-1] != (Tcl_UniChar)'\n')) - ? TCL_REG_NOTBOL : 0)); + numMatches = 0; + for ( ; offset <= wlen; ) { - if (match < 0) { - result = TCL_ERROR; - goto done; - } - if (match == 0) { - break; - } - if (numMatches == 0) { - resultPtr = Tcl_NewUnicodeObj(wstring, 0); - Tcl_IncrRefCount(resultPtr); - if (offset > 0) { - /* - * Copy the initial portion of the string in if an offset was - * specified. - */ + /* + * The flags argument is set if string is part of a larger string, so + * that "^" won't match. + */ + + match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, + 10 /* matches */, ((offset > 0 && + (wstring[offset-1] != (Tcl_UniChar)'\n')) + ? TCL_REG_NOTBOL : 0)); - Tcl_AppendUnicodeToObj(resultPtr, wstring, offset); + /*if (reflags & TCL_REG_PCRE) {}*/ + //printf("********** %p (%s) match %d = %d, %d\n", objPtr->typePtr, objPtr->typePtr ? objPtr->typePtr->name : "", numMatches, match, offset); + if (match < 0) { + result = TCL_ERROR; + goto done; } - } - numMatches++; + if (match == 0) { + break; + } + if (numMatches == 0) { + resultPtr = Tcl_NewUnicodeObj(wstring, 0); + Tcl_IncrRefCount(resultPtr); + if (offset > 0) { + /* + * Copy the initial portion of the string in if an offset was + * specified. + */ - /* - * Copy the portion of the source string before the match to the - * result variable. - */ + Tcl_AppendUnicodeToObj(resultPtr, wstring, offset); + } + } + numMatches++; - Tcl_RegExpGetInfo(regExpr, &info); - start = info.matches[0].start; - end = info.matches[0].end; - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start); + /* + * Copy the portion of the source string before the match to the + * result variable. + */ - /* - * Append the subSpec argument to the variable, making appropriate - * substitutions. This code is a bit hairy because of the backslash - * conventions and because the code saves up ranges of characters in - * subSpec to reduce the number of calls to Tcl_SetVar. - */ + Tcl_RegExpGetInfo(regExpr, &info); + start = info.matches[0].start; + end = info.matches[0].end; + //printf("********** append %d, %d\n", start, end); + Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start); - wsrc = wfirstChar = wsubspec; - wend = wsubspec + wsublen; - for (ch = *wsrc; wsrc != wend; wsrc++, ch = *wsrc) { - if (ch == '&') { - idx = 0; - } else if (ch == '\\') { - ch = wsrc[1]; - if ((ch >= '0') && (ch <= '9')) { - idx = ch - '0'; - } else if ((ch == '\\') || (ch == '&')) { - *wsrc = ch; - Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, - wsrc - wfirstChar + 1); - *wsrc = '\\'; - wfirstChar = wsrc + 2; - wsrc++; - continue; + /* + * Append the subSpec argument to the variable, making appropriate + * substitutions. This code is a bit hairy because of the backslash + * conventions and because the code saves up ranges of characters in + * subSpec to reduce the number of calls to Tcl_SetVar. + */ + + wsrc = wfirstChar = wsubspec; + wend = wsubspec + wsublen; + for (ch = *wsrc; wsrc != wend; wsrc++, ch = *wsrc) { + if (ch == '&') { + idx = 0; + } else if (ch == '\\') { + ch = wsrc[1]; + if ((ch >= '0') && (ch <= '9')) { + idx = ch - '0'; + } else if ((ch == '\\') || (ch == '&')) { + *wsrc = ch; + Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, + wsrc - wfirstChar + 1); + *wsrc = '\\'; + wfirstChar = wsrc + 2; + wsrc++; + continue; + } else { + continue; + } } else { continue; } - } else { - continue; + + if (wfirstChar != wsrc) { + Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, + wsrc - wfirstChar); + } + + if (idx <= info.nsubs) { + subStart = info.matches[idx].start; + subEnd = info.matches[idx].end; + if ((subStart >= 0) && (subEnd >= 0)) { + Tcl_AppendUnicodeToObj(resultPtr, + wstring + offset + subStart, subEnd - subStart); + } + } + + if (*wsrc == '\\') { + wsrc++; + } + wfirstChar = wsrc + 1; } if (wfirstChar != wsrc) { - Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, - wsrc - wfirstChar); + Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, wsrc - wfirstChar); } - if (idx <= info.nsubs) { - subStart = info.matches[idx].start; - subEnd = info.matches[idx].end; - if ((subStart >= 0) && (subEnd >= 0)) { - Tcl_AppendUnicodeToObj(resultPtr, - wstring + offset + subStart, subEnd - subStart); + if (end == 0) { + /* + * Always consume at least one character of the input string in + * order to prevent infinite loops. + */ + + if (offset < wlen) { + Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1); } - } + offset++; + } else { + offset += end; + if (start == end) { + /* + * We matched an empty string, which means we must go forward + * one more step so we don't match again at the same spot. + */ - if (*wsrc == '\\') { - wsrc++; + if (offset < wlen) { + Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1); + } + offset++; + } + } + if (!all) { + break; } - wfirstChar = wsrc + 1; } - if (wfirstChar != wsrc) { - Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, wsrc - wfirstChar); + } else { + + /* PCRE as utf8 */ + + char ch, *wsrc, *wfirstChar, *cstring, *wsubspec, *wend; + + if (objPtr->typePtr == &tclByteArrayType) { + cstring = Tcl_GetByteArrayFromObj(objPtr, &wlen); + } else { + /* XXX validate offset by char length */ + cstring = Tcl_GetStringFromObj(objPtr, &wlen); + } + if (subPtr->typePtr == &tclByteArrayType) { + wsubspec = Tcl_GetByteArrayFromObj(subPtr, &wsublen); + } else { + /* XXX validate offset by char length */ + wsubspec = Tcl_GetStringFromObj(subPtr, &wsublen); } - if (end == 0) { + /* + * The following loop is to handle multiple matches within the same source + * string; each iteration handles one match and its corresponding + * substitution. If "-all" hasn't been specified then the loop body only + * gets executed once. We must use 'offset <= wlen' in particular for the + * case where the regexp pattern can match the empty string - this is + * useful when doing, say, 'regsub -- ^ $str ...' when $str might be + * empty. + */ + + numMatches = 0; + for ( ; offset <= wlen; ) { + /* - * Always consume at least one character of the input string in - * order to prevent infinite loops. + * The flags argument is set if string is part of a larger string, so + * that "^" won't match. */ - if (offset < wlen) { - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1); + match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, + 10 /* matches */, ((offset > 0 && + (cstring[offset-1] != '\n')) + ? TCL_REG_NOTBOL : 0)); + + /*if (reflags & TCL_REG_PCRE) {}*/ + //printf("********** %p (%s) match %d = %d, %d\n", objPtr->typePtr, objPtr->typePtr ? objPtr->typePtr->name : "", numMatches, match, offset); + if (match < 0) { + result = TCL_ERROR; + goto done; } - offset++; - } else { - offset += end; - if (start == end) { + if (match == 0) { + break; + } + if (numMatches == 0) { + resultPtr = Tcl_NewStringObj(cstring, 0); + Tcl_IncrRefCount(resultPtr); + if (offset > 0) { + /* + * Copy the initial portion of the string in if an offset was + * specified. + */ + + Tcl_AppendToObj(resultPtr, cstring, offset); + } + } + numMatches++; + + /* + * Copy the portion of the source string before the match to the + * result variable. + */ + + Tcl_RegExpGetInfo(regExpr, &info); + start = info.matches[0].start; + end = info.matches[0].end; + //printf("********** append %d, %d\n", start, end); + Tcl_AppendToObj(resultPtr, cstring + offset, start); + + /* + * Append the subSpec argument to the variable, making appropriate + * substitutions. This code is a bit hairy because of the backslash + * conventions and because the code saves up ranges of characters in + * subSpec to reduce the number of calls to Tcl_SetVar. + */ + + wsrc = wfirstChar = wsubspec; + wend = wsubspec + wsublen; + for (ch = *wsrc; wsrc != wend; wsrc++, ch = *wsrc) { + if (ch == '&') { + idx = 0; + } else if (ch == '\\') { + ch = wsrc[1]; + if ((ch >= '0') && (ch <= '9')) { + idx = ch - '0'; + } else if ((ch == '\\') || (ch == '&')) { + *wsrc = ch; + Tcl_AppendToObj(resultPtr, wfirstChar, + wsrc - wfirstChar + 1); + *wsrc = '\\'; + wfirstChar = wsrc + 2; + wsrc++; + continue; + } else { + continue; + } + } else { + continue; + } + + if (wfirstChar != wsrc) { + Tcl_AppendToObj(resultPtr, wfirstChar, + wsrc - wfirstChar); + } + + if (idx <= info.nsubs) { + subStart = info.matches[idx].start; + subEnd = info.matches[idx].end; + if ((subStart >= 0) && (subEnd >= 0)) { + Tcl_AppendToObj(resultPtr, + cstring + offset + subStart, subEnd - subStart); + } + } + + if (*wsrc == '\\') { + wsrc++; + } + wfirstChar = wsrc + 1; + } + + if (wfirstChar != wsrc) { + Tcl_AppendToObj(resultPtr, wfirstChar, wsrc - wfirstChar); + } + + if (end == 0) { /* - * We matched an empty string, which means we must go forward - * one more step so we don't match again at the same spot. + * Always consume at least one character of the input string in + * order to prevent infinite loops. */ if (offset < wlen) { - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1); + Tcl_AppendToObj(resultPtr, cstring + offset, 1); } offset++; + } else { + offset += end; + if (start == end) { + /* + * We matched an empty string, which means we must go forward + * one more step so we don't match again at the same spot. + */ + + if (offset < wlen) { + Tcl_AppendToObj(resultPtr, cstring + offset, 1); + } + offset++; + } + } + if (!all) { + break; } } - if (!all) { - break; - } + + wstring = (Tcl_UniChar*)cstring; + } /* @@ -682,7 +862,11 @@ Tcl_RegsubObjCmd( resultPtr = objv[1]; Tcl_IncrRefCount(resultPtr); } else if (offset < wlen) { - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, wlen - offset); + if (!(cflags & TCL_REG_PCRE)) { + Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, wlen - offset); + } else { + Tcl_AppendToObj(resultPtr, (char *)wstring + offset, wlen - offset); + } } if (objc == 4) { if (Tcl_ObjSetVar2(interp, objv[3], NULL, resultPtr, 0) == NULL) { diff --git a/generic/tclCompCmds.c b/generic/tclCompCmds.c index 76ec9f374bff..8ebe4fda83c9 100644 --- a/generic/tclCompCmds.c +++ b/generic/tclCompCmds.c @@ -2980,7 +2980,7 @@ TclCompileRegexpCmd( Tcl_Token *varTokenPtr; /* Pointer to the Tcl_Token representing the * parse of the RE or string. */ int i, len, exact, sawLast = 0, simple = 0, - cflags = TCL_REG_ADVANCED; + cflags = 0; char *str; DefineLineInformation; /* TIP #280 */ @@ -3121,8 +3121,10 @@ TclCompileRegexpCmd( /* * Pass correct RE compile flags. We use only Int1 (8-bit), but * that handles all the flags we want to pass. + * Note that TCL_REG_PCRE/TCL_REG_EXPLTYPE will be mapped to TCL_REG_ADVANCED. * Don't use TCL_REG_NOSUB as we may have backrefs. */ + cflags |= TCL_REG_COMPILE_SHIFT(cflags); /* int to byte */ TclEmitInstInt1(INST_REGEXP, cflags, envPtr); } diff --git a/generic/tclExecute.c b/generic/tclExecute.c index bfb4356dc028..31cd1c4cddd3 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -4491,6 +4491,7 @@ TclExecuteByteCode( valuePtr = OBJ_AT_TOS; /* String */ value2Ptr = OBJ_UNDER_TOS; /* Pattern */ + cflags = TCL_REG_COMPILE_UNSHIFT(cflags); /* byte to int */ regExpr = Tcl_GetRegExpFromObj(interp, value2Ptr, cflags); if (regExpr == NULL) { match = -1; diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index a4e7d603ec82..da179f190dae 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -671,13 +671,16 @@ Tcl_GetRegExpFromObj( regexpPtr = (TclRegexp *) objPtr->internalRep.twoPtrValue.ptr1; - /* if type was not explicit specified */ + /* own re-type from interp, if type was not explicit specified */ if ( !(flags & TCL_REG_EXPLTYPE) && (interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE) ) { flags |= TCL_REG_PCRE; } + /* explicit flag has no meaning further - remove it in order to compare */ + flags &= ~TCL_REG_EXPLTYPE; + if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { pattern = TclGetStringFromObj(objPtr, &length); @@ -1020,13 +1023,16 @@ CompileRegexp( */ /* XXX Should enable PCRE_UTF8 selectively on non-ByteArray Tcl_Obj */ - pcrecflags = PCRE_NO_UTF8_CHECK | PCRE_DOLLAR_ENDONLY; + pcrecflags = PCRE_UTF8 | PCRE_UCP | PCRE_NO_UTF8_CHECK | + PCRE_DOLLAR_ENDONLY; + /* for (i = 0, p = cstring; i < length; i++) { if (UCHAR(*p++) > 0x80) { - pcrecflags |= PCRE_UTF8; + pcrecflags |= PCRE_UTF8 | PCRE_UCP; break; } } + */ if (flags & TCL_REG_NOCASE) { pcrecflags |= PCRE_CASELESS; } @@ -1036,6 +1042,9 @@ CompileRegexp( if (flags & (TCL_REG_NEWLINE|TCL_REG_NLSTOP|TCL_REG_NLANCH)) { pcrecflags |= PCRE_MULTILINE; } + if (flags & ~TCL_REG_NLSTOP /*&& flags & ~TCL_REG_NEWLINE*/) { + pcrecflags |= PCRE_DOTALL; + } if (cstring[length] != 0) { cstring = (char *) ckalloc(length + 1); From 11a7093d3f26fe6218311f3c24974e06faad1e2d Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 15 Nov 2017 15:29:58 +0100 Subject: [PATCH 06/27] bugs fixed, code review, more backwards compatibility is constituted now. --- generic/tclCmdMZ.c | 28 +++++++--------- generic/tclRegexp.c | 80 ++++++++++++++++++++++++++++++++++----------- generic/tclRegexp.h | 2 ++ tests/regexp.test | 30 ++++++++++++----- 4 files changed, 96 insertions(+), 44 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 1bd284ecd7f8..44761cb92f5e 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -176,15 +176,14 @@ Tcl_RegexpObjCmd( goto endOfForLoop; } /* explicit specified */ - cflags |= TCL_REG_EXPLTYPE; if (Tcl_GetIndexFromObj(interp, objv[i], re_type_opts, "type", 0, &re_type) != TCL_OK) { goto optionError; } if ((enum re_type_opts) re_type == RETYPE_PCRE) { - cflags |= TCL_REG_PCRE; + cflags |= TCL_REG_PCRE | TCL_REG_EXPLTYPE; } else { - cflags &= ~TCL_REG_PCRE; + cflags = (cflags & ~TCL_REG_PCRE) | TCL_REG_EXPLTYPE; } break; case REGEXP_LAST: @@ -217,11 +216,10 @@ Tcl_RegexpObjCmd( return TCL_ERROR; } - /* own re-type from interp, if type was not explicit specified */ - if ( !(cflags & TCL_REG_EXPLTYPE) - && (interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE) - ) { - cflags |= TCL_REG_PCRE; + /* if type is not explicit specified */ + if (!(cflags & TCL_REG_EXPLTYPE)) { + /* own re-type from interp, disable PCRE if needed */ + cflags = TclAdjustRegExpFlags(interp, objv[0], cflags); } /* @@ -379,15 +377,14 @@ Tcl_RegsubObjCmd( if (++idx >= objc) { goto endOfForLoop; } - cflags |= TCL_REG_EXPLTYPE; if (Tcl_GetIndexFromObj(interp, objv[idx], re_type_opts, "type", 0, &re_type) != TCL_OK) { goto optionError; } if ((enum re_type_opts) re_type == RETYPE_PCRE) { - cflags |= TCL_REG_PCRE; + cflags |= TCL_REG_PCRE | TCL_REG_EXPLTYPE; } else { - cflags &= ~TCL_REG_PCRE; + cflags = (cflags & ~TCL_REG_PCRE) | TCL_REG_EXPLTYPE; } break; case REGSUB_LAST: @@ -410,11 +407,10 @@ Tcl_RegsubObjCmd( objc -= idx; objv += idx; - /* own re-type from interp, if type was not explicit specified */ - if ( !(cflags & TCL_REG_EXPLTYPE) - && (interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE) - ) { - cflags |= TCL_REG_PCRE; + /* if type is not explicit specified */ + if (!(cflags & TCL_REG_EXPLTYPE)) { + /* own re-type from interp, disable PCRE if needed */ + cflags = TclAdjustRegExpFlags(interp, objv[0], cflags); } if (startIndex) { diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index da179f190dae..2939aea7fc78 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -630,6 +630,33 @@ Tcl_RegExpGetInfo( } } + +int +TclAdjustRegExpFlags( + Tcl_Interp *interp, /* To access the interp regexp default. */ + Tcl_Obj *objPtr, /* Object whose string rep contains regular + * expression pattern. */ + int flags /* Regular expression compilation flags. */ +) { + /* if type is not explicit specified */ + if (!(flags & TCL_REG_EXPLTYPE)) { + /* own re-type from interp */ + if ((interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE)) { + flags |= TCL_REG_PCRE; + } + /* if does not work in PCRE - switch to classic (backwards compatibility) */ + if ((flags & TCL_REG_PCRE)) { + const char *regStr = TclGetString(objPtr); + if (*regStr == '*' && (objPtr->length >= 4) + && (memcmp("***=", regStr, 4) == 0) + ) { + flags = (flags & ~TCL_REG_PCRE) | TCL_REG_EXPLTYPE; + } + } + } + return flags; +} + /* *---------------------------------------------------------------------- * @@ -671,11 +698,10 @@ Tcl_GetRegExpFromObj( regexpPtr = (TclRegexp *) objPtr->internalRep.twoPtrValue.ptr1; - /* own re-type from interp, if type was not explicit specified */ - if ( !(flags & TCL_REG_EXPLTYPE) - && (interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE) - ) { - flags |= TCL_REG_PCRE; + /* if type is not explicit specified */ + if (!(flags & TCL_REG_EXPLTYPE)) { + /* own re-type from interp, disable PCRE if needed */ + flags = TclAdjustRegExpFlags(interp, objPtr, flags); } /* explicit flag has no meaning further - remove it in order to compare */ @@ -1039,10 +1065,10 @@ CompileRegexp( if (flags & TCL_REG_EXPANDED) { pcrecflags |= PCRE_EXTENDED; } - if (flags & (TCL_REG_NEWLINE|TCL_REG_NLSTOP|TCL_REG_NLANCH)) { + if (flags & TCL_REG_NLANCH) { pcrecflags |= PCRE_MULTILINE; } - if (flags & ~TCL_REG_NLSTOP /*&& flags & ~TCL_REG_NEWLINE*/) { + if (!(flags & TCL_REG_NLSTOP)) { pcrecflags |= PCRE_DOTALL; } @@ -1515,6 +1541,7 @@ TclRegexpPCRE( { #ifdef HAVE_PCRE int i, match, eflags, stringLength, matchelems, *matches; + int offsetDiff = 0; Tcl_Obj *objPtr, *resultPtr = NULL; const char *matchstr; pcre *re; @@ -1541,6 +1568,14 @@ TclRegexpPCRE( offset = Tcl_UtfAtIndex(matchstr, offset) - matchstr; } eflags |= PCRE_NOTBOL; + + /* avoid bad offset error (PCRE_ERROR_BADOFFSET) */ + if (offset >= stringLength) { + /* safe offset to correct indices if empty matched */ + offsetDiff = offset; + /* offset (and string) to NTS char */ + offset = stringLength++; + } } objc -= 2; @@ -1562,10 +1597,15 @@ TclRegexpPCRE( offset, eflags, matches, matchelems); if (match < -1) { - char buf[32 + TCL_INTEGER_SPACE]; - sprintf(buf, "pcre_exec returned error code %d", match); - Tcl_AppendResult(interp, buf, NULL); - return TCL_ERROR; + /* offset is out of range (bad utf, wrong length etc) */ + if (match == PCRE_ERROR_BADOFFSET) { + match = PCRE_ERROR_NOMATCH; + } else { + char buf[32 + TCL_INTEGER_SPACE]; + sprintf(buf, "pcre_exec returned error code %d", match); + Tcl_AppendResult(interp, buf, NULL); + return TCL_ERROR; + } } if (match == 0) { @@ -1604,9 +1644,10 @@ TclRegexpPCRE( /* * It's the number of substitutions, plus one for the matchVar at * index 0 + * Note we can get fewer matches as specified (thus just use [-1, -1] indices) */ - objc = match; + objc = regexpPtr->re.re_nsub + 1; if (all <= 1) { resultPtr = Tcl_NewObj(); } @@ -1616,8 +1657,13 @@ TclRegexpPCRE( int start, end; if (i < match) { - start = matches[i*2]; - end = matches[i*2 + 1]; + if (!offsetDiff) { + start = matches[i*2]; + end = matches[i*2 + 1]; + } else { + /* if out of range we've always empty match [offs, offs-1] */ + end = start = offsetDiff; + } } else { start = -1; end = -1; @@ -1669,11 +1715,7 @@ TclRegexpPCRE( * matches[1] is the match end point of the full RE match. */ - if (matches[0] == matches[1]) { - offset++; - } else { - offset = matches[1]; - } + offset = (matches[1] > matches[0]) ? matches[1] : matches[0] + 1; all++; eflags |= PCRE_NOTBOL; if (offset >= stringLength) { diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h index 1b38c5e01a8e..0d90e653918c 100644 --- a/generic/tclRegexp.h +++ b/generic/tclRegexp.h @@ -49,6 +49,8 @@ typedef struct TclRegexp { * compiled regexp. */ } TclRegexp; +int TclAdjustRegExpFlags(Tcl_Interp *, Tcl_Obj *, int flags); + #endif /* _TCLREGEXP */ /* diff --git a/tests/regexp.test b/tests/regexp.test index 93cf32ba33f8..2f8f985bbdc2 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -17,6 +17,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { } testConstraint exec [llength [info commands exec]] +testConstraint not_pcre [expr {[interp regexp {}] ne "pcre"}] catch {unset foo} test regexp-1.1 {basic regexp operation} { @@ -517,10 +518,15 @@ test regexp-16.3 {regsub -start} { catch {unset x} list [regsub -all -start 3 {z} hello {/&} x] $x } {0 hello} -test regexp-16.4 {regsub -start, \A behavior} { - set out {} - lappend out [regsub -start 0 -all {\A(\w)} {abcde} {/\1} x] $x - lappend out [regsub -start 2 -all {\A(\w)} {abcde} {/\1} x] $x +test regexp-16.4 {regsub -start, \A (resp. \G) behavior} { + if {[interp regexp {}] eq "classic"} { + list [regsub -start 0 -all {\A(\w)} {abcde} {/\1} x] $x \ + [regsub -start 2 -all {\A(\w)} {abcde} {/\1} x] $x + } else { + # PCRE differentiate \A from \G assertions (\A is true only at begin, thus by first match): + list [regsub -start 0 -all {\G(\w)} {abcde} {/\1} x] $x \ + [regsub -start 2 -all {\G(\w)} {abcde} {/\1} x] $x + } } {5 /a/b/c/d/e 3 ab/c/d/e} test regexp-16.5 {regsub -start, double option} { list [regsub -start 2 -start 0 a abc c x] $x @@ -621,9 +627,11 @@ test regexp-20.1 {regsub shared object shimmering} { regsub $a $c $b d list $d [string length $d] [string bytelength $d] } {abcdefghijklmnopqurstuvwxyz0123456789 37 37} -test regexp-20.2 {regsub shared object shimmering with -about} { +test regexp-20.2 {regsub shared object shimmering with -about} -constraints { + not_pcre +} -body { eval regexp -about abc -} {0 {}} +} -result {0 {}} test regexp-21.1 {regsub works with empty string} { regsub -- ^ {} foo @@ -716,7 +724,7 @@ test regexp-22.4 {Bug 3606139} -setup { [a 668]([a 55])[a 668]([a 55])[a 668]([a 55])[a 511]] {}] a } -cleanup { rename a {} -} -returnCodes 1 -result {couldn't compile regular expression pattern: nfa has too many states} +} -returnCodes error -match glob -result {couldn't compile *} test regexp-22.5 {Bug 3610026} -setup { set e {} set cp 99 @@ -727,7 +735,7 @@ test regexp-22.5 {Bug 3610026} -setup { regexp -about $e } -cleanup { unset -nocomplain e cp -} -returnCodes error -match glob -result {*too many colors*} +} -returnCodes error -match glob -result {couldn't compile *} test regexp-22.6 {Bug 6585b21ca8} { expr {[regexp {(\w).*?\1} Programmer m] ? $m : ""} } rogr @@ -960,7 +968,11 @@ test regexp-26.8 {Tcl bug 2826551: diff regexp with -line option} { } "{@1\n2\n+3\n} {@4\n-5\n+6\n7\n} {@8\n9\n}" test regexp-26.9 {Tcl bug 2826551: diff regexp with embedded -line option} { set data "@1\n2\n+3\n@4\n-5\n+6\n7\n@8\n9\n" - regexp -all -inline {(?n)^@.*\n(?:[^@].*\n?)*} $data + if {[interp regexp {}] eq "classic"} { + regexp -all -inline {(?n)^@.*\n(?:[^@].*\n?)*} $data + } else { + regexp -all -inline {(?m-s)^@.*\n(?:[^@].*\n?)*} $data + } } "{@1\n2\n+3\n} {@4\n-5\n+6\n7\n} {@8\n9\n}" test regexp-26.10 {regexp with -line option} { regexp -all -inline -line -- {a*} "a\n" From d751b42a4ed34fd374a9424b1f7d5b5b338fa135 Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 15 Nov 2017 22:00:04 +0100 Subject: [PATCH 07/27] **interim commit** almost ready --- generic/tclCmdMZ.c | 47 +++++++++++++----- generic/tclRegexp.c | 115 +++++++++++++++++++++++++------------------- generic/tclRegexp.h | 1 + tests/regexp.test | 1 + win/makefile.vc | 6 +-- 5 files changed, 105 insertions(+), 65 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 44761cb92f5e..bf0f49dd5c68 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -298,7 +298,7 @@ Tcl_RegsubObjCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - int idx, result, cflags, all, wlen, wsublen, numMatches, offset; + int idx, result, cflags, all, wlen, wsublen, numMatches, offset, rest = 0; int start, end, subStart, subEnd, match, re_type; Tcl_RegExp regExpr; Tcl_RegExpInfo info; @@ -433,7 +433,7 @@ Tcl_RegsubObjCmd( } } - if (all && (offset == 0) && !(cflags & TCL_REG_PCRE) + if (0 && all && (offset == 0) && !(cflags & TCL_REG_PCRE) && (strpbrk(TclGetString(objv[2]), "&\\") == NULL) && (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) { /* @@ -556,10 +556,12 @@ Tcl_RegsubObjCmd( * that "^" won't match. */ + //printf("**c* exec %d) off: %d, flg: %X\n", numMatches, offset, ((offset > 0 &&(wstring[offset-1] != '\n')) ? TCL_REG_NOTBOL : 0)); match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, 10 /* matches */, ((offset > 0 && (wstring[offset-1] != (Tcl_UniChar)'\n')) ? TCL_REG_NOTBOL : 0)); + //printf("**c* matc %d) ==== %d\n", numMatches, match); /*if (reflags & TCL_REG_PCRE) {}*/ //printf("********** %p (%s) match %d = %d, %d\n", objPtr->typePtr, objPtr->typePtr ? objPtr->typePtr->name : "", numMatches, match, offset); @@ -592,7 +594,7 @@ Tcl_RegsubObjCmd( Tcl_RegExpGetInfo(regExpr, &info); start = info.matches[0].start; end = info.matches[0].end; - //printf("********** append %d, %d\n", start, end); + //printf("********** append [%d, %d] by s/e [%d, %d]\n", offset, start, start, end); Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start); /* @@ -678,6 +680,7 @@ Tcl_RegsubObjCmd( break; } } + rest = offset; } else { @@ -709,37 +712,47 @@ Tcl_RegsubObjCmd( */ numMatches = 0; - for ( ; offset <= wlen; ) { + for ( rest = 0; offset <= wlen; ) { /* * The flags argument is set if string is part of a larger string, so * that "^" won't match. */ + //printf("**p* exec %d) rest: %d, off: %d, flg: %X\n", numMatches, rest, offset, ((offset > 0 && (cstring[offset-1] != '\n')) ? TCL_REG_NOTBOL : 0)); match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, 10 /* matches */, ((offset > 0 && (cstring[offset-1] != '\n')) ? TCL_REG_NOTBOL : 0)); + //printf("**p* matc %d) ==== %d\n", numMatches, match); - /*if (reflags & TCL_REG_PCRE) {}*/ //printf("********** %p (%s) match %d = %d, %d\n", objPtr->typePtr, objPtr->typePtr ? objPtr->typePtr->name : "", numMatches, match, offset); if (match < 0) { result = TCL_ERROR; goto done; } if (match == 0) { - break; + /* + * In order to process last line correctly by multiline processing e. g. `(?m)^` + * try to find match after string. + */ + if (!all) { + break; + } + all = 0; /* stop search at next interation */ + offset = wlen; /* repeat search after end */ + continue; } if (numMatches == 0) { resultPtr = Tcl_NewStringObj(cstring, 0); Tcl_IncrRefCount(resultPtr); - if (offset > 0) { + if (rest > 0) { /* * Copy the initial portion of the string in if an offset was * specified. */ - Tcl_AppendToObj(resultPtr, cstring, offset); + Tcl_AppendToObj(resultPtr, cstring, rest); } } numMatches++; @@ -748,12 +761,19 @@ Tcl_RegsubObjCmd( * Copy the portion of the source string before the match to the * result variable. */ + if (rest < offset) { + //printf("********** append [%d, %d] by r/o [%d, %d]\n", rest, offset - rest, rest, offset); + Tcl_AppendToObj(resultPtr, cstring + rest, offset - rest); + rest = offset; + } Tcl_RegExpGetInfo(regExpr, &info); start = info.matches[0].start; end = info.matches[0].end; - //printf("********** append %d, %d\n", start, end); - Tcl_AppendToObj(resultPtr, cstring + offset, start); + if (start) { + //printf("********** append [%d, %d] by s/e [%d, %d]\n", offset, start, start, end); + Tcl_AppendToObj(resultPtr, cstring + offset, start); + } /* * Append the subSpec argument to the variable, making appropriate @@ -834,6 +854,7 @@ Tcl_RegsubObjCmd( offset++; } } + rest = offset; if (!all) { break; } @@ -857,11 +878,11 @@ Tcl_RegsubObjCmd( resultPtr = objv[1]; Tcl_IncrRefCount(resultPtr); - } else if (offset < wlen) { + } else if (rest < wlen) { if (!(cflags & TCL_REG_PCRE)) { - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, wlen - offset); + Tcl_AppendUnicodeToObj(resultPtr, wstring + rest, wlen - rest); } else { - Tcl_AppendToObj(resultPtr, (char *)wstring + offset, wlen - offset); + Tcl_AppendToObj(resultPtr, (char *)wstring + rest, wlen - rest); } } if (objc == 4) { diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 2939aea7fc78..7157fe59b62b 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -75,12 +75,6 @@ typedef struct ThreadSpecificData { struct TclRegexp *regexps[NUM_REGEXPS]; /* Compiled forms of above strings. Also * malloc-ed, or NULL if not in use yet. */ -#ifdef HAVE_PCRE - Tcl_RegExpIndices *matches; /* To support PCRE in Tcl_RegExpGetInfo, we - * need a classic info matches area to store - * data in. */ - int matchelems; /* length of matches */ -#endif } ThreadSpecificData; static Tcl_ThreadDataKey dataKey; @@ -472,7 +466,7 @@ Tcl_RegExpExecObj( if (reflags & TCL_REG_PCRE) { #ifdef HAVE_PCRE const char *matchstr; - int match, pcreeflags, nm = (regexpPtr->re.re_nsub + 1) * 3; + int match, eflags, nm = (regexpPtr->re.re_nsub + 1) * 3; if (textObj->typePtr == &tclByteArrayType) { matchstr = Tcl_GetByteArrayFromObj(textObj, &length); @@ -484,19 +478,28 @@ Tcl_RegExpExecObj( offset = length; } - pcreeflags = 0; + regexpPtr->details.rm_extend.rm_so = offset; + + eflags = PCRE_NO_UTF8_CHECK; if (flags & TCL_REG_NOTBOL) { - pcreeflags |= PCRE_NOTBOL; + eflags |= PCRE_NOTBOL; + } if (offset >= length) { + /* + * PCRE has currently a bug by multiline with offset after "\n": + * ^ - meant assert start of string (or line, in multiline mode), + * but it will be not found by offset after "\n" regardless multiline. + * Thus just let do a small adustment (shift begin of string to offset). + * Not we'll do it always in order to regard enable multiline by exec using `(?m)`. + */ + matchstr = ""; + offset = 0; + length = 0; } + //printf("**** pcre_exec: %d(%d)..%d, flg:%X\n", offset, regexpPtr->details.rm_extend.rm_so, length, eflags); match = pcre_exec(regexpPtr->pcre, regexpPtr->study, - matchstr, length, offset, pcreeflags, - (int *) regexpPtr->matches, nm); - - /* - * Store last offset to support Tcl_RegExpGetInfo translation. - */ - regexpPtr->details.rm_extend.rm_so = offset; + matchstr, length, offset, eflags, + (int *) regexpPtr->offsets, nm); /* * Check for errors. @@ -518,6 +521,18 @@ Tcl_RegExpExecObj( } return -1; } + + /* + * Adjust match indices relative offset where matching began. + */ + if (offset) { + int i, *offsets = (int *) regexpPtr->offsets; + for (i = 0; i <= match*2; i++) { + //printf("**** correct: %d) ++ %d\n", i, offset); + offsets[i] -= offset; + } + } + return 1; #else if (interp != NULL) { @@ -602,25 +617,15 @@ Tcl_RegExpGetInfo( infoPtr->nsubs = regexpPtr->re.re_nsub; if (regexpPtr->flags & TCL_REG_PCRE) { #ifdef HAVE_PCRE - ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); - int i, last, *matches = (int *) regexpPtr->matches; - - /* - * This works both to initialize and extend matches as necessary - */ - if (tsdPtr->matchelems <= infoPtr->nsubs) { - tsdPtr->matchelems = infoPtr->nsubs + 1; - tsdPtr->matches = (Tcl_RegExpIndices *) - ckrealloc((char *) tsdPtr->matches, - sizeof(Tcl_RegExpIndices) * tsdPtr->matchelems); - } - last = regexpPtr->details.rm_extend.rm_so; /* last offset */ - for (i = 0; i <= infoPtr->nsubs; i++) { - tsdPtr->matches[i].start = matches[i*2] - last; - tsdPtr->matches[i].end = matches[i*2+1] - last; + if ((int *)regexpPtr->matches != regexpPtr->offsets) { + int i, *offsets = (int *) regexpPtr->offsets; + for (i = 0; i <= infoPtr->nsubs; i++) { + regexpPtr->matches[i].rm_so = offsets[i*2]; + regexpPtr->matches[i].rm_eo = offsets[i*2+1]; + } } - infoPtr->matches = tsdPtr->matches; - infoPtr->extendStart = 0; /* XXX support? */ + infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; + infoPtr->extendStart = -1; /* XXX support? */ #else Tcl_Panic("Cannot get info for PCRE match"); #endif @@ -1107,8 +1112,15 @@ CompileRegexp( rc = pcre_fullinfo(pcre, NULL, PCRE_INFO_CAPTURECOUNT, &nsubs); if (rc == 0) { regexpPtr->re.re_nsub = nsubs; - regexpPtr->matches = (regmatch_t *) + regexpPtr->offsets = (int *) ckalloc(sizeof(int) * (nsubs+1)*3); + /* we can use matches = offsets if size of two int's is equal regmatch_t structure */ + if (sizeof(*regexpPtr->offsets)*2 != sizeof(*regexpPtr->matches)) { + regexpPtr->matches = (regmatch_t *) ckalloc( + sizeof(regmatch_t) * (nsubs+1)); + } else { + regexpPtr->matches = (regmatch_t *)regexpPtr->offsets; + } } #else Tcl_AppendResult(interp, @@ -1235,6 +1247,11 @@ FreeRegexp( if (regexpPtr->matches) { ckfree((char *) regexpPtr->matches); } +#ifdef HAVE_PCRE + if (regexpPtr->offsets && regexpPtr->offsets != (int *)regexpPtr->matches) { + ckfree((char *) regexpPtr->offsets); + } +#endif ckfree((char *) regexpPtr); } @@ -1270,11 +1287,6 @@ FinalizeRegexp( ckfree(tsdPtr->patterns[i]); tsdPtr->patterns[i] = NULL; } -#ifdef HAVE_PCRE - if (tsdPtr->matches != NULL) { - ckfree((char *) tsdPtr->matches); - } -#endif /* * We may find ourselves reinitialized if another finalization routine * invokes regexps. @@ -1540,7 +1552,7 @@ TclRegexpPCRE( int offset) { #ifdef HAVE_PCRE - int i, match, eflags, stringLength, matchelems, *matches; + int i, match, eflags, stringLength, matchelems, *offsets; int offsetDiff = 0; Tcl_Obj *objPtr, *resultPtr = NULL; const char *matchstr; @@ -1562,15 +1574,20 @@ TclRegexpPCRE( * Add flag if using offset (string is part of a larger string), so * that "^" won't match. */ + int bol = 0; if (objPtr->typePtr != &tclByteArrayType) { /* XXX: probably needs length restriction */ offset = Tcl_UtfAtIndex(matchstr, offset) - matchstr; + bol = *(Tcl_UtfAtIndex(matchstr, offset-1)) == '\n'; + } else { + bol = matchstr[offset-1] == '\n'; } - eflags |= PCRE_NOTBOL; + if (!bol) + eflags |= PCRE_NOTBOL; /* avoid bad offset error (PCRE_ERROR_BADOFFSET) */ - if (offset >= stringLength) { + if (offset > stringLength) { /* safe offset to correct indices if empty matched */ offsetDiff = offset; /* offset (and string) to NTS char */ @@ -1590,11 +1607,11 @@ TclRegexpPCRE( re = regexpPtr->pcre; study = regexpPtr->study; - matches = (int *) regexpPtr->matches; + offsets = (int *) regexpPtr->offsets; matchelems = (int) (regexpPtr->re.re_nsub + 1) * 3; while (1) { match = pcre_exec(re, study, matchstr, stringLength, - offset, eflags, matches, matchelems); + offset, eflags, offsets, matchelems); if (match < -1) { /* offset is out of range (bad utf, wrong length etc) */ @@ -1658,8 +1675,8 @@ TclRegexpPCRE( if (i < match) { if (!offsetDiff) { - start = matches[i*2]; - end = matches[i*2 + 1]; + start = offsets[i*2]; + end = offsets[i*2 + 1]; } else { /* if out of range we've always empty match [offs, offs-1] */ end = start = offsetDiff; @@ -1712,10 +1729,10 @@ TclRegexpPCRE( * when we match the NULL string at the end of the input string, we * will loop indefinately (because the length of the match is 0, so * offset never changes). - * matches[1] is the match end point of the full RE match. + * offsets[1] is the match end point of the full RE match. */ - offset = (matches[1] > matches[0]) ? matches[1] : matches[0] + 1; + offset = (offsets[1] > offsets[0]) ? offsets[1] : offsets[0] + 1; all++; eflags |= PCRE_NOTBOL; if (offset >= stringLength) { diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h index 0d90e653918c..aab4ade14989 100644 --- a/generic/tclRegexp.h +++ b/generic/tclRegexp.h @@ -35,6 +35,7 @@ typedef struct TclRegexp { #ifdef HAVE_PCRE pcre *pcre; /* PCRE compile re */ pcre_extra *study; /* study of PCRE */ + int *offsets; /* Array of offsets (indices to handle within PCRE) */ #endif CONST char *string; /* Last string passed to Tcl_RegExpExec. */ Tcl_Obj *objPtr; /* Last object passed to Tcl_RegExpExecObj. */ diff --git a/tests/regexp.test b/tests/regexp.test index 2f8f985bbdc2..98e218011abf 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -18,6 +18,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { testConstraint exec [llength [info commands exec]] testConstraint not_pcre [expr {[interp regexp {}] ne "pcre"}] +testConstraint knownBug [expr {[testConstraint knownBug] || [interp regexp {}] eq "pcre"}] catch {unset foo} test regexp-1.1 {basic regexp operation} { diff --git a/win/makefile.vc b/win/makefile.vc index aedb7a60354f..11289bfb4b42 100644 --- a/win/makefile.vc +++ b/win/makefile.vc @@ -425,13 +425,13 @@ cdebug = -O2 $(OPTIMIZATIONS) cdebug = !endif !if $(SYMBOLS) -cdebug = $(cdebug) -Zi +cdebug = $(cdebug) -Zi $(OPTIMIZATIONS) !endif !else if "$(MACHINE)" == "IA64" || "$(MACHINE)" == "AMD64" ### Warnings are too many, can't support warnings into errors. -cdebug = -Zi -Od $(DEBUGFLAGS) +cdebug = -Zi -Od $(DEBUGFLAGS) $(OPTIMIZATIONS) !else -cdebug = -Zi -WX $(DEBUGFLAGS) +cdebug = -Zi $(DEBUGFLAGS) $(OPTIMIZATIONS) !endif ### Declarations common to all compiler options From b102ffb008ab60d06677413b7e813d660198a84f Mon Sep 17 00:00:00 2001 From: sebres Date: Thu, 16 Nov 2017 20:30:08 +0100 Subject: [PATCH 08/27] Bugs fixed, code review, backwards compatibility etc. Several known bugs of classic regexp-engine fixed in PCRE, thus the constraints deactivated now for this test-cases (in -pcre mode); Test cases extended for several PCRE features. PCRE almost ready implemented now (todo: binary recognition resp. option "-binary" or "-bytearray", normalize code of Tcl_RegsubObjCmd because too large ATM). --- generic/tcl.h | 28 ++++---- generic/tclCmdMZ.c | 12 +--- generic/tclCompCmds.c | 2 +- generic/tclRegexp.c | 156 +++++++++++++++++++++++++----------------- tests/regexp.test | 137 +++++++++++++++++++++++++++++-------- tests/regexp2.test | 8 ++- 6 files changed, 222 insertions(+), 121 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 23125100bf0d..11bdd1cbdefb 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -545,19 +545,19 @@ typedef void (Tcl_ThreadCreateProc) _ANSI_ARGS_((ClientData clientData)); * Flag values passed to Tcl_GetRegExpFromObj. */ -#define TCL_REG_BASIC 000000 /* BREs (convenience). */ -#define TCL_REG_EXTENDED 000001 /* EREs. */ -#define TCL_REG_ADVF 000002 /* Advanced features in EREs. */ -#define TCL_REG_ADVANCED 000003 /* AREs (which are also EREs). */ -#define TCL_REG_QUOTE 000004 /* No special characters, none. */ -#define TCL_REG_NOCASE 000010 /* Ignore case. */ -#define TCL_REG_NOSUB 000020 /* Don't care about subexpressions. */ -#define TCL_REG_EXPANDED 000040 /* Expanded format, white space & +#define TCL_REG_BASIC 0x00000000 /* BREs (convenience). */ +#define TCL_REG_EXTENDED 0x00000001 /* EREs. */ +#define TCL_REG_ADVF 0x00000002 /* Advanced features in EREs. */ +#define TCL_REG_ADVANCED 0x00000003 /* AREs (which are also EREs). */ +#define TCL_REG_QUOTE 0x00000004 /* No special characters, none. */ +#define TCL_REG_NOCASE 0x00000008 /* Ignore case. */ +#define TCL_REG_NOSUB 0x00000010 /* Don't care about subexpressions. */ +#define TCL_REG_EXPANDED 0x00000020 /* Expanded format, white space & * comments. */ -#define TCL_REG_NLSTOP 000100 /* \n doesn't match . or [^ ] */ -#define TCL_REG_NLANCH 000200 /* ^ matches after \n, $ before. */ -#define TCL_REG_NEWLINE 000300 /* Newlines are line terminators. */ -#define TCL_REG_CANMATCH 001000 /* Report details on partial/limited +#define TCL_REG_NLSTOP 0x00000040 /* \n doesn't match . or [^ ] */ +#define TCL_REG_NLANCH 0x00000080 /* ^ matches after \n, $ before. */ +#define TCL_REG_NEWLINE 0x000000C0 /* Newlines are line terminators. */ +#define TCL_REG_CANMATCH 0x00000200 /* Report details on partial/limited * matches. */ #define TCL_REG_EXPLTYPE 0x10000000 /* Explicit type (avoid usage of * default interp engine, mean it specified as parameter) */ @@ -566,8 +566,8 @@ typedef void (Tcl_ThreadCreateProc) _ANSI_ARGS_((ClientData clientData)); /* Following two macros used to supply TCL_REG_PCRE and TCL_REG_EXPLTYPE /* to INST_REGEXP over one byte op (instead of TCL_REG_ADVANCED, that is always set) */ -#define TCL_REG_COMPILE_SHIFT(v) ((v>>28)&000003) -#define TCL_REG_COMPILE_UNSHIFT(v) (((v&000003)<<28)|TCL_REG_ADVANCED) +#define TCL_REG_COMPILE_SHIFT(v) ((v&~0x30000000)|(v>>28)&0x3) +#define TCL_REG_COMPILE_UNSHIFT(v) ((v&~0x03)|((v&0x03)<<28)|TCL_REG_ADVANCED) /* * Flags values passed to Tcl_RegExpExecObj. diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index bf0f49dd5c68..7290c92e9742 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -433,7 +433,7 @@ Tcl_RegsubObjCmd( } } - if (0 && all && (offset == 0) && !(cflags & TCL_REG_PCRE) + if (all && (offset == 0) && !(cflags & TCL_REG_PCRE) && (strpbrk(TclGetString(objv[2]), "&\\") == NULL) && (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) { /* @@ -556,15 +556,11 @@ Tcl_RegsubObjCmd( * that "^" won't match. */ - //printf("**c* exec %d) off: %d, flg: %X\n", numMatches, offset, ((offset > 0 &&(wstring[offset-1] != '\n')) ? TCL_REG_NOTBOL : 0)); match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, 10 /* matches */, ((offset > 0 && (wstring[offset-1] != (Tcl_UniChar)'\n')) ? TCL_REG_NOTBOL : 0)); - //printf("**c* matc %d) ==== %d\n", numMatches, match); - /*if (reflags & TCL_REG_PCRE) {}*/ - //printf("********** %p (%s) match %d = %d, %d\n", objPtr->typePtr, objPtr->typePtr ? objPtr->typePtr->name : "", numMatches, match, offset); if (match < 0) { result = TCL_ERROR; goto done; @@ -594,7 +590,6 @@ Tcl_RegsubObjCmd( Tcl_RegExpGetInfo(regExpr, &info); start = info.matches[0].start; end = info.matches[0].end; - //printf("********** append [%d, %d] by s/e [%d, %d]\n", offset, start, start, end); Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start); /* @@ -719,14 +714,11 @@ Tcl_RegsubObjCmd( * that "^" won't match. */ - //printf("**p* exec %d) rest: %d, off: %d, flg: %X\n", numMatches, rest, offset, ((offset > 0 && (cstring[offset-1] != '\n')) ? TCL_REG_NOTBOL : 0)); match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, 10 /* matches */, ((offset > 0 && (cstring[offset-1] != '\n')) ? TCL_REG_NOTBOL : 0)); - //printf("**p* matc %d) ==== %d\n", numMatches, match); - //printf("********** %p (%s) match %d = %d, %d\n", objPtr->typePtr, objPtr->typePtr ? objPtr->typePtr->name : "", numMatches, match, offset); if (match < 0) { result = TCL_ERROR; goto done; @@ -762,7 +754,6 @@ Tcl_RegsubObjCmd( * result variable. */ if (rest < offset) { - //printf("********** append [%d, %d] by r/o [%d, %d]\n", rest, offset - rest, rest, offset); Tcl_AppendToObj(resultPtr, cstring + rest, offset - rest); rest = offset; } @@ -771,7 +762,6 @@ Tcl_RegsubObjCmd( start = info.matches[0].start; end = info.matches[0].end; if (start) { - //printf("********** append [%d, %d] by s/e [%d, %d]\n", offset, start, start, end); Tcl_AppendToObj(resultPtr, cstring + offset, start); } diff --git a/generic/tclCompCmds.c b/generic/tclCompCmds.c index 8ebe4fda83c9..4514e8f9aa38 100644 --- a/generic/tclCompCmds.c +++ b/generic/tclCompCmds.c @@ -3124,7 +3124,7 @@ TclCompileRegexpCmd( * Note that TCL_REG_PCRE/TCL_REG_EXPLTYPE will be mapped to TCL_REG_ADVANCED. * Don't use TCL_REG_NOSUB as we may have backrefs. */ - cflags |= TCL_REG_COMPILE_SHIFT(cflags); /* int to byte */ + cflags = TCL_REG_COMPILE_SHIFT(cflags); /* int to byte */ TclEmitInstInt1(INST_REGEXP, cflags, envPtr); } diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 7157fe59b62b..49f073bd1b98 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -496,7 +496,6 @@ Tcl_RegExpExecObj( length = 0; } - //printf("**** pcre_exec: %d(%d)..%d, flg:%X\n", offset, regexpPtr->details.rm_extend.rm_so, length, eflags); match = pcre_exec(regexpPtr->pcre, regexpPtr->study, matchstr, length, offset, eflags, (int *) regexpPtr->offsets, nm); @@ -528,7 +527,6 @@ Tcl_RegExpExecObj( if (offset) { int i, *offsets = (int *) regexpPtr->offsets; for (i = 0; i <= match*2; i++) { - //printf("**** correct: %d) ++ %d\n", i, offset); offsets[i] -= offset; } } @@ -1552,8 +1550,8 @@ TclRegexpPCRE( int offset) { #ifdef HAVE_PCRE - int i, match, eflags, stringLength, matchelems, *offsets; - int offsetDiff = 0; + int i, match, eflags, pcrecflags = 0, stringLength, matchelems, *offsets, + offsetDiff, numMatches = 0; Tcl_Obj *objPtr, *resultPtr = NULL; const char *matchstr; pcre *re; @@ -1561,38 +1559,17 @@ TclRegexpPCRE( TclRegexp *regexpPtr = (TclRegexp *) regExpr; objPtr = objv[1]; + /* + * Get match string and translate offset into correct placement for utf-8 chars. + */ if (objPtr->typePtr == &tclByteArrayType) { matchstr = Tcl_GetByteArrayFromObj(objPtr, &stringLength); - } else { - matchstr = Tcl_GetStringFromObj(objPtr, &stringLength); - } - - eflags = PCRE_NO_UTF8_CHECK; - if (offset > 0) { - /* - * Translate offset into correct placement for utf-8 chars. - * Add flag if using offset (string is part of a larger string), so - * that "^" won't match. - */ - int bol = 0; - - if (objPtr->typePtr != &tclByteArrayType) { + if (offset && offset < stringLength) { /* XXX: probably needs length restriction */ offset = Tcl_UtfAtIndex(matchstr, offset) - matchstr; - bol = *(Tcl_UtfAtIndex(matchstr, offset-1)) == '\n'; - } else { - bol = matchstr[offset-1] == '\n'; - } - if (!bol) - eflags |= PCRE_NOTBOL; - - /* avoid bad offset error (PCRE_ERROR_BADOFFSET) */ - if (offset > stringLength) { - /* safe offset to correct indices if empty matched */ - offsetDiff = offset; - /* offset (and string) to NTS char */ - offset = stringLength++; } + } else { + matchstr = Tcl_GetStringFromObj(objPtr, &stringLength); } objc -= 2; @@ -1609,7 +1586,53 @@ TclRegexpPCRE( study = regexpPtr->study; offsets = (int *) regexpPtr->offsets; matchelems = (int) (regexpPtr->re.re_nsub + 1) * 3; + eflags = PCRE_NO_UTF8_CHECK; + if (all) { + pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &pcrecflags); + } while (1) { + + offsetDiff = 0; + if (offset > 0) { + + /* + * PCRE has currently a "bug" by multiline with offset after "\n": + * ^ - meant assert start of string (or line, in multiline mode), + * but it will be not found by offset after "\n" regardless multiline mode. + * Thus just let do a small adustment (hacking with shift of offset or length to NTS). + * Note we should do it always in order to regard enable multiline by exec using `(?m)`. + * If offset > stringLength, it avoids bad offset error (PCRE_ERROR_BADOFFSET). + */ + if (offset >= stringLength) { + int bol; + /* avoid match {^$} without multiline, if we are out of range */ + if (!numMatches && offset > stringLength) { + eflags |= PCRE_NOTBOL; + } + /* safe offset to correct indices if empty match found */ + offsetDiff = offset; + offset = stringLength; /* offset after last char */ + if (all && numMatches && offset) { + if (objPtr->typePtr != &tclByteArrayType) { + bol = *(Tcl_UtfAtIndex(matchstr, offset-1)) == '\n'; + } else { + bol = matchstr[offset-1] == '\n'; + } + /* fast fallback if we are not begin of new-line (cannot match anyway) */ + if (!bol) { + break; + } else { + /* hacking PCRE to accept this "extra" new-line (after newline empty match). */ + matchstr = ""; + offset = 0; + stringLength = 0; + eflags |= PCRE_ANCHORED; + } + } + all = 0; /* don't repeat */ + } + } + match = pcre_exec(re, study, matchstr, stringLength, offset, eflags, offsets, matchelems); @@ -1632,24 +1655,29 @@ TclRegexpPCRE( } if (match == PCRE_ERROR_NOMATCH) { - /* - * We want to set the value of the intepreter result only when - * this is the first time through the loop. - */ - - if (all <= 1) { - /* - * If inlining, the interpreter's object result remains an - * empty list, otherwise set it to an integer object w/ value - * 0. - */ - - if (!doinline) { - Tcl_SetObjResult(interp, Tcl_NewIntObj(0)); + /* + * In order to process last line correctly by multiline processing e. g. `(?m)^` + * try to find match after string (in case of findall and already matched something). + * Option pcrecflags & PCRE_ANCHORED is not set in multiline mode (resp. `(?m)`), + * in this case no match means - we will find nothing at all, so don't repeat. + */ + if (!all || !numMatches || !stringLength || (pcrecflags & PCRE_ANCHORED)) { + break; + } + /* If we tried unshifted search - repeat from next offset */ + if (eflags & PCRE_NOTEMPTY_ATSTART) { + eflags &= ~(PCRE_NOTEMPTY_ATSTART|PCRE_ANCHORED); + offset++; + if (objPtr->typePtr != &tclByteArrayType) { + offset = Tcl_UtfAtIndex(matchstr, offset) - matchstr; } - return TCL_OK; + continue; } - break; + /* offset to end of string */ + offset = stringLength; + + /* repeat once search at end */ + continue; } /* @@ -1665,7 +1693,7 @@ TclRegexpPCRE( */ objc = regexpPtr->re.re_nsub + 1; - if (all <= 1) { + if (!resultPtr) { resultPtr = Tcl_NewObj(); } } @@ -1717,26 +1745,24 @@ TclRegexpPCRE( } } - if (all == 0) { + numMatches++; + if (!all) { break; } /* * Adjust the offset to the character just after the last one in the - * matchVar and increment all to count how many times we are making a - * match. We always increment the offset by at least one to prevent - * endless looping (as in the case: regexp -all {a*} a). Otherwise, - * when we match the NULL string at the end of the input string, we - * will loop indefinately (because the length of the match is 0, so - * offset never changes). - * offsets[1] is the match end point of the full RE match. + * matchVar. + * In order to correct find all empty matches (X..X-1), we'll use + * PCRE_NOTEMPTY_ATSTART|PCRE_ANCHORED pair for start next try from the + * same offset (if not found again, break the cycle above). */ - offset = (offsets[1] > offsets[0]) ? offsets[1] : offsets[0] + 1; - all++; - eflags |= PCRE_NOTBOL; - if (offset >= stringLength) { - break; + if (offsets[1] > offsets[0]) { + offset = offsets[1]; + } else { + offset = offsets[0]; + eflags |= (PCRE_NOTEMPTY_ATSTART|PCRE_ANCHORED); } } @@ -1747,9 +1773,13 @@ TclRegexpPCRE( */ if (doinline) { - Tcl_SetObjResult(interp, resultPtr); + if (resultPtr) { + Tcl_SetObjResult(interp, resultPtr); + } else { + Tcl_ResetResult(interp); + } } else { - Tcl_SetObjResult(interp, Tcl_NewIntObj(all ? all-1 : 1)); + Tcl_SetObjResult(interp, Tcl_NewIntObj(numMatches)); } return TCL_OK; #else /* !HAVE_PCRE */ diff --git a/tests/regexp.test b/tests/regexp.test index 98e218011abf..49f59f28d6f6 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -17,7 +17,9 @@ if {[lsearch [namespace children] ::tcltest] == -1} { } testConstraint exec [llength [info commands exec]] -testConstraint not_pcre [expr {[interp regexp {}] ne "pcre"}] +testConstraint pcre [expr {[interp regexp {}] eq "pcre"}] +testConstraint classic [expr {[interp regexp {}] ne "pcre"}] +# all known bugs are fixed in PCRE engine, thus disable constraint if PCRE used: testConstraint knownBug [expr {[testConstraint knownBug] || [interp regexp {}] eq "pcre"}] catch {unset foo} @@ -519,16 +521,34 @@ test regexp-16.3 {regsub -start} { catch {unset x} list [regsub -all -start 3 {z} hello {/&} x] $x } {0 hello} -test regexp-16.4 {regsub -start, \A (resp. \G) behavior} { - if {[interp regexp {}] eq "classic"} { - list [regsub -start 0 -all {\A(\w)} {abcde} {/\1} x] $x \ - [regsub -start 2 -all {\A(\w)} {abcde} {/\1} x] $x - } else { - # PCRE differentiate \A from \G assertions (\A is true only at begin, thus by first match): - list [regsub -start 0 -all {\G(\w)} {abcde} {/\1} x] $x \ - [regsub -start 2 -all {\G(\w)} {abcde} {/\1} x] $x - } +test regexp-16.4.1 {regsub -start, \A behavior} {classic} { + list [regsub -start 0 -all {\A(\w)} {abcde} {/\1} x] $x \ + [regsub -start 2 -all {\A(\w)} {abcde} {/\1} x] $x } {5 /a/b/c/d/e 3 ab/c/d/e} +test regexp-16.4.2 {regsub -start, \G behavior} {pcre} { + # PCRE differentiate \A from \G assertions (\A is true only at begin, thus by first match): + list [regsub -start 0 -all {\G(\w)} {abcde} {/\1} x] $x \ + [regsub -start 2 -all {\G(\w)} {abcde} {/\1} x] $x +} {5 /a/b/c/d/e 3 ab/c/d/e} +test regexp-16.4.3 {regsub -start, \A behavior} {pcre} { + # PCRE differentiate \A from \G assertions (\A is true only at begin, thus by first match): + list [regsub -start 0 -all {\A(\w)} {abcde} {/\1} x] $x \ + [regsub -start 2 -all {\A(\w)} {abcde} {/\1} x] $x +} {1 /abcde 0 abcde} +test regexp-16.4.4 {regsub once -start, \A behavior} {classic} { + list [regsub -start 0 {\A(\w)} {abcde} {/\1} x] $x \ + [regsub -start 2 {\A(\w)} {abcde} {/\1} x] $x +} {1 /abcde 1 ab/cde} +test regexp-16.4.5 {regsub once -start, \G behavior} {pcre} { + # PCRE differentiate \A from \G assertions (\A is true only at begin, thus by first match): + list [regsub -start 0 {\G(\w)} {abcde} {/\1} x] $x \ + [regsub -start 2 {\G(\w)} {abcde} {/\1} x] $x +} {1 /abcde 1 ab/cde} +test regexp-16.4.6 {regsub once -start, \A behavior} {pcre} { + # PCRE differentiate \A from \G assertions (\A is true only at begin, thus by first match): + list [regsub -start 0 {\A(\w)} {abcde} {/\1} x] $x \ + [regsub -start 2 {\A(\w)} {abcde} {/\1} x] $x +} {1 /abcde 0 abcde} test regexp-16.5 {regsub -start, double option} { list [regsub -start 2 -start 0 a abc c x] $x } {1 cbc} @@ -628,8 +648,9 @@ test regexp-20.1 {regsub shared object shimmering} { regsub $a $c $b d list $d [string length $d] [string bytelength $d] } {abcdefghijklmnopqurstuvwxyz0123456789 37 37} +# todo remove classic constraint if get impllemented: test regexp-20.2 {regsub shared object shimmering with -about} -constraints { - not_pcre + classic } -body { eval regexp -about abc } -result {0 {}} @@ -749,20 +770,34 @@ test regexp-23.1 {regexp -all and -line} { [regexp -all -inline -indices -line -- {^$} $string] \ [regexp -all -inline -indices -line -- {$} $string] } {{{0 -1}} {{0 -1}} {{0 -1}}} -test regexp-23.2 {regexp -all and -line} { +test regexp-23.2 {regexp -all and -line} {knownBug} { set string "\n" - list \ + # first not multiline (whole string), thus no match for ^$ + # second ^, $ matches newlines, thus always before/after '\n' + list [list \ + [regexp -all -inline -indices -- {^} $string] \ + [regexp -all -inline -indices -- {^$} $string] \ + [regexp -all -inline -indices -- {$} $string] + ] [list \ [regexp -all -inline -indices -line -- {^} $string] \ [regexp -all -inline -indices -line -- {^$} $string] \ [regexp -all -inline -indices -line -- {$} $string] -} {{{0 -1}} {{0 -1}} {{0 -1}}} -test regexp-23.3 {regexp -all and -line} { + ] +} [list {{{0 -1}} {} {{1 0}}} [lrepeat 3 {{0 -1} {1 0}}]] +test regexp-23.3 {regexp -all and -line} {knownBug} { set string "\n\n" - list \ + # first not multiline (whole string), thus no match for ^$ + # second ^, $ matches newlines, thus always match each '\n' + list [list \ + [regexp -all -inline -indices -- {^} $string] \ + [regexp -all -inline -indices -- {^$} $string] \ + [regexp -all -inline -indices -- {$} $string] + ] [list \ [regexp -all -inline -indices -line -- {^} $string] \ [regexp -all -inline -indices -line -- {^$} $string] \ [regexp -all -inline -indices -line -- {$} $string] -} {{{0 -1} {1 0}} {{0 -1} {1 0}} {{0 -1} {1 0}}} + ] +} [list {{{0 -1}} {} {{2 1}}} [lrepeat 3 {{0 -1} {1 0} {2 1}}]] test regexp-23.4 {regexp -all and -line} { set string "a" list \ @@ -975,18 +1010,62 @@ test regexp-26.9 {Tcl bug 2826551: diff regexp with embedded -line option} { regexp -all -inline {(?m-s)^@.*\n(?:[^@].*\n?)*} $data } } "{@1\n2\n+3\n} {@4\n-5\n+6\n7\n} {@8\n9\n}" -test regexp-26.10 {regexp with -line option} { - regexp -all -inline -line -- {a*} "a\n" -} {a {}} -test regexp-26.11 {regexp without -line option} { - regexp -all -inline -- {a*} "a\n" -} {a {}} -test regexp-26.12 {regexp with -line option} { - regexp -all -inline -line -- {a*} "b\n" -} {{} {}} -test regexp-26.13 {regexp without -line option} { - regexp -all -inline -- {a*} "b\n" -} {{} {}} + +test regexp-26.10 {not anchored regexp with -line option} {knownBug} { + # 3 matches should be found (a, empty for '\n' and empty hereafter), + # similar to `regsub -all -line {a*} a\n *` which replaces 3 matches "**\n*": + list \ + [regexp -all -inline -line -- {a*} "a\n"] \ + [regexp -all -inline -line -indices -- {a*} "a\n"] \ + [regexp -all -inline -- {(?m)a*} "a\n"] \ + [regexp -all -inline -indices -- {(?m)a*} "a\n"] +} [lrepeat 2 {a {} {}} {{0 0} {1 0} {2 1}}] +test regexp-26.11.1 {not anchored regexp without -line option} {knownBug} { + # 3 matches should be found (a, empty for '\n' and empty hereafter), + # similar to `regsub -all -line {a*} a\n *` which replaces 3 matches "**\n*". + # Note that it's not anchored - therefore it works regardless -line option. + # See the difference in anchored tests - 26.14 - 26.17. + list \ + [regexp -all -inline -- {a*} "a\n"] \ + [regexp -all -inline -indices -- {a*} "a\n"] +} {{a {} {}} {{0 0} {1 0} {2 1}}} +test regexp-26.12 {not anchored regexp with -line option} {knownBug} { + list \ + [regexp -all -inline -line -- {a*} "b\n"] \ + [regexp -all -inline -line -indices -- {a*} "b\n"] +} {{{} {} {}} {{0 -1} {1 0} {2 1}}} +test regexp-26.13 {not anchored regexp without -line option} {knownBug} { + list \ + [regexp -all -inline -- {a*} "b\n"] \ + [regexp -all -inline -indices -- {a*} "b\n"] +} {{{} {} {}} {{0 -1} {1 0} {2 1}}} + +test regexp-26.14 {anchored regexp with -line option} { + list \ + [regexp -all -inline -line -- {^a*} "a\n"] \ + [regexp -all -inline -line -indices -- {^a*} "a\n"] +} {{a {}} {{0 0} {2 1}}} +test regexp-26.15.1 {anchored regexp without -line option} { + list \ + [regexp -all -inline -- {^a*} "a\n"] \ + [regexp -all -inline -indices -- {^a*} "a\n"] +} {a {{0 0}}} +test regexp-26.15.2 {anchored regexp disabled miltiline} {pcre} { + # inverse multiline (option -line disabled), pcre only variant: + list \ + [regexp -all -inline -line -- {(?-m)^a*} "a\n"] \ + [regexp -all -inline -line -indices -- {(?-m)^a*} "a\n"] +} {a {{0 0}}} +test regexp-26.16 {anchored regexp with -line option} { + list \ + [regexp -all -inline -line -- {^a*} "b\n"] \ + [regexp -all -inline -line -indices -- {^a*} "b\n"] +} {{{} {}} {{0 -1} {2 1}}} +test regexp-26.17 {anchored regexp without -line option} { + list \ + [regexp -all -inline -- {^a*} "b\n"] \ + [regexp -all -inline -indices -- {^a*} "b\n"] +} {{{}} {{0 -1}}} # cleanup ::tcltest::cleanupTests diff --git a/tests/regexp2.test b/tests/regexp2.test index 7ff4bba22cce..d30c3e560da0 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -21,7 +21,7 @@ namespace inscope ::tcltest {rename cleanupTests _cleanupTests; proc cleanupTest rename test __test proc test {args} {} -catch { +set err [catch { # test all available regexp engines (switch default to another one): set org_reeng [interp regexp {}] @@ -75,7 +75,7 @@ test regexp-0.1 {test of interim "switch" of engine} -body { }; #end of engine cycle -} errMsg opt +} errMsg opt] # restore interp regexp {} $org_reeng @@ -84,4 +84,6 @@ namespace inscope ::tcltest {rename cleanupTests {}; rename _cleanupTests cleanu # cleanup ::tcltest::cleanupTests -return {*}$opt -level 1 $errMsg +if {$err} { + return {*}$opt -level 1 $errMsg +} From 0c61084816d8d4b344b2eaa0de6515bdaff4cd0e Mon Sep 17 00:00:00 2001 From: sebres Date: Thu, 16 Nov 2017 20:47:13 +0100 Subject: [PATCH 09/27] reactivate faster replacement of simple words (not really regexp) for pcre also; --- generic/tclCmdMZ.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 7290c92e9742..9ec3391468cc 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -433,7 +433,7 @@ Tcl_RegsubObjCmd( } } - if (all && (offset == 0) && !(cflags & TCL_REG_PCRE) + if (all && (offset == 0) && (strpbrk(TclGetString(objv[2]), "&\\") == NULL) && (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) { /* @@ -501,6 +501,7 @@ Tcl_RegsubObjCmd( } objPtr = NULL; subPtr = NULL; + cflags &= ~TCL_REG_PCRE; goto regsubDone; } From 496d5c3868864dfd897092358213ddd242c39958 Mon Sep 17 00:00:00 2001 From: sebres Date: Thu, 16 Nov 2017 21:58:19 +0100 Subject: [PATCH 10/27] 1st shot trying to implement DFA mode of PCRE --- generic/tcl.h | 10 ++++++---- generic/tclCmdMZ.c | 36 ++++++++++++++++++++++++------------ generic/tclInt.h | 6 ++++-- generic/tclInterp.c | 25 ++++++++++++++++++------- generic/tclRegexp.c | 29 ++++++++++++++++++++++++++--- tests/regexp.test | 6 +++--- tests/regexp2.test | 2 +- 7 files changed, 82 insertions(+), 32 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 11bdd1cbdefb..084df805a948 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -563,11 +563,13 @@ typedef void (Tcl_ThreadCreateProc) _ANSI_ARGS_((ClientData clientData)); * default interp engine, mean it specified as parameter) */ #define TCL_REG_PCRE 0x20000000 /* Make sure it doesn't conflict with * existing TCL_REG_* or PCRE_* bits */ +#define TCL_REG_PCDFA 0x40000000 /* DFA variant of PCRE engine */ -/* Following two macros used to supply TCL_REG_PCRE and TCL_REG_EXPLTYPE -/* to INST_REGEXP over one byte op (instead of TCL_REG_ADVANCED, that is always set) */ -#define TCL_REG_COMPILE_SHIFT(v) ((v&~0x30000000)|(v>>28)&0x3) -#define TCL_REG_COMPILE_UNSHIFT(v) ((v&~0x03)|((v&0x03)<<28)|TCL_REG_ADVANCED) +/* Following two macros used to supply TCL_REG_PCRE, TCL_REG_PCDFA and TCL_REG_EXPLTYPE +/* to INST_REGEXP over one byte op (instead of first 3 bits, that currently never compiled + * e. g. TCL_REG_ADVANCED, that is always set in compiled variant) */ +#define TCL_REG_COMPILE_SHIFT(v) ((v&~0x70000000)|(v>>28)&0x07) +#define TCL_REG_COMPILE_UNSHIFT(v) ((v&~0x07)|((v&0x07)<<28)|TCL_REG_ADVANCED) /* * Flags values passed to Tcl_RegExpExecObj. diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 9ec3391468cc..0aea40d64d19 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -102,10 +102,10 @@ Tcl_RegexpObjCmd( REGEXP_NOCASE,REGEXP_START, REGEXP_TYPE, REGEXP_LAST }; static CONST char *re_type_opts[] = { - "classic", "pcre", NULL + "classic", "dfa", "pcre", NULL }; enum re_type_opts { - RETYPE_CLASSIC, RETYPE_PCRE, + RETYPE_CLASSIC, RETYPE_DFA, RETYPE_PCRE }; indices = 0; @@ -180,10 +180,16 @@ Tcl_RegexpObjCmd( 0, &re_type) != TCL_OK) { goto optionError; } - if ((enum re_type_opts) re_type == RETYPE_PCRE) { - cflags |= TCL_REG_PCRE | TCL_REG_EXPLTYPE; - } else { - cflags = (cflags & ~TCL_REG_PCRE) | TCL_REG_EXPLTYPE; + switch ((enum re_type_opts) re_type) { + case RETYPE_PCRE: + cflags = (cflags & ~TCL_REG_PCDFA)|TCL_REG_PCRE | TCL_REG_EXPLTYPE; + break; + case RETYPE_DFA: + cflags |= TCL_REG_PCRE|TCL_REG_PCDFA | TCL_REG_EXPLTYPE; + break; + default: + cflags = (cflags & ~(TCL_REG_PCRE|TCL_REG_PCDFA)) | TCL_REG_EXPLTYPE; + break; } break; case REGEXP_LAST: @@ -316,10 +322,10 @@ Tcl_RegsubObjCmd( REGSUB_TYPE, REGSUB_LAST }; static CONST char *re_type_opts[] = { - "classic", "pcre", NULL + "classic", "dfa", "pcre", NULL }; enum re_type_opts { - RETYPE_CLASSIC, RETYPE_PCRE, + RETYPE_CLASSIC, RETYPE_DFA, RETYPE_PCRE }; re_type = 0; cflags = TCL_REG_ADVANCED; @@ -381,10 +387,16 @@ Tcl_RegsubObjCmd( 0, &re_type) != TCL_OK) { goto optionError; } - if ((enum re_type_opts) re_type == RETYPE_PCRE) { - cflags |= TCL_REG_PCRE | TCL_REG_EXPLTYPE; - } else { - cflags = (cflags & ~TCL_REG_PCRE) | TCL_REG_EXPLTYPE; + switch ((enum re_type_opts) re_type) { + case RETYPE_PCRE: + cflags = (cflags & ~TCL_REG_PCDFA)|TCL_REG_PCRE | TCL_REG_EXPLTYPE; + break; + case RETYPE_DFA: + cflags |= TCL_REG_PCRE|TCL_REG_PCDFA | TCL_REG_EXPLTYPE; + break; + default: + cflags = (cflags & ~(TCL_REG_PCRE|TCL_REG_PCDFA)) | TCL_REG_EXPLTYPE; + break; } break; case REGSUB_LAST: diff --git a/generic/tclInt.h b/generic/tclInt.h index f848c0ed1ff5..73eba4271505 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2011,7 +2011,8 @@ typedef struct Interp { * of the wrong-num-args string in Tcl_WrongNumArgs. * Makes it append instead of replacing and uses * different intermediate text. - * INTERP_PCRE Non-zero means use PCRE engine by default for REs + * INTERP_PCRE Non-zero means use PCRE engine by default for REs. + * INTERP_DFA Non-zero means use dfa mode of PCRE engine by default. * * WARNING: For the sake of some extensions that have made use of former * internal values, do not re-use the flag values 2 (formerly ERR_IN_PROGRESS) @@ -2024,10 +2025,11 @@ typedef struct Interp { #define DONT_COMPILE_CMDS_INLINE 0x20 #define RAND_SEED_INITIALIZED 0x40 #define SAFE_INTERP 0x80 -#define INTERP_PCRE 0x100 #define INTERP_TRACE_IN_PROGRESS 0x200 #define INTERP_ALTERNATE_WRONG_ARGS 0x400 #define ERR_LEGACY_COPY 0x800 +#define INTERP_PCRE 0x1000 +#define INTERP_DFA 0x2000 /* * Maximum number of levels of nesting permitted in Tcl commands (used to diff --git a/generic/tclInterp.c b/generic/tclInterp.c index 2d9be6511e45..44c079ae0eec 100644 --- a/generic/tclInterp.c +++ b/generic/tclInterp.c @@ -953,10 +953,10 @@ Tcl_InterpObjCmd( int re_type; Interp *slaveInterp; static CONST char *re_type_opts[] = { - "classic", "pcre", NULL + "classic", "dfa", "pcre", NULL }; enum re_type_opts { - RETYPE_CLASSIC, RETYPE_PCRE, + RETYPE_CLASSIC, RETYPE_DFA, RETYPE_PCRE }; if (objc != 3 && objc != 4) { Tcl_WrongNumArgs(interp, 2, objv, "path ?type?"); @@ -971,16 +971,27 @@ Tcl_InterpObjCmd( 0, &re_type) != TCL_OK) { return TCL_ERROR; } - if ((enum re_type_opts) re_type == RETYPE_PCRE) { - slaveInterp->flags |= INTERP_PCRE; - } else { - slaveInterp->flags &= ~(INTERP_PCRE); + switch ((enum re_type_opts) re_type) { + case RETYPE_PCRE: + slaveInterp->flags = + (slaveInterp->flags & ~INTERP_DFA) | INTERP_PCRE; + break; + case RETYPE_DFA: + slaveInterp->flags |= INTERP_PCRE|INTERP_DFA; + break; + default: + slaveInterp->flags &= ~(INTERP_PCRE|INTERP_DFA); + break; } Tcl_SetObjResult(interp, objv[3]); return TCL_OK; } if (slaveInterp->flags & INTERP_PCRE) { - Tcl_SetObjResult(interp, Tcl_NewStringObj("pcre", -1)); + if (!(slaveInterp->flags & INTERP_DFA)) { + Tcl_SetObjResult(interp, Tcl_NewStringObj("pcre", -1)); + } else { + Tcl_SetObjResult(interp, Tcl_NewStringObj("dfa", -1)); + } } else { Tcl_SetObjResult(interp, Tcl_NewStringObj("classic", -1)); } diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 49f073bd1b98..39eae7e40b70 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -496,9 +496,17 @@ Tcl_RegExpExecObj( length = 0; } - match = pcre_exec(regexpPtr->pcre, regexpPtr->study, + if (!(regexpPtr->flags & TCL_REG_PCDFA)) { + match = pcre_exec(regexpPtr->pcre, regexpPtr->study, matchstr, length, offset, eflags, (int *) regexpPtr->offsets, nm); + } else { + //TODO: + int wrkspace[60]; + match = pcre_dfa_exec(regexpPtr->pcre, regexpPtr->study, + matchstr, length, offset, eflags, + (int *) regexpPtr->offsets, nm, wrkspace, 60); + } /* * Check for errors. @@ -643,9 +651,17 @@ TclAdjustRegExpFlags( ) { /* if type is not explicit specified */ if (!(flags & TCL_REG_EXPLTYPE)) { + int msk; /* own re-type from interp */ - if ((interp != NULL) && (((Interp *)interp)->flags & INTERP_PCRE)) { + if ( (interp != NULL) + && (msk = (((Interp *)interp)->flags & (INTERP_PCRE|INTERP_DFA))) + ) { flags |= TCL_REG_PCRE; + if (msk & INTERP_DFA) { + flags |= TCL_REG_PCDFA; + } else { + flags &= ~TCL_REG_PCDFA; + } } /* if does not work in PCRE - switch to classic (backwards compatibility) */ if ((flags & TCL_REG_PCRE)) { @@ -1633,8 +1649,15 @@ TclRegexpPCRE( } } - match = pcre_exec(re, study, matchstr, stringLength, + if (!(regexpPtr->flags & TCL_REG_PCDFA)) { + match = pcre_exec(re, study, matchstr, stringLength, offset, eflags, offsets, matchelems); + } else { + //TODO: + int wrkspace[60]; + match = pcre_dfa_exec(re, study, matchstr, stringLength, + offset, eflags, offsets, matchelems, wrkspace, 60); + } if (match < -1) { /* offset is out of range (bad utf, wrong length etc) */ diff --git a/tests/regexp.test b/tests/regexp.test index 49f59f28d6f6..045dcbf0eda8 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -17,10 +17,10 @@ if {[lsearch [namespace children] ::tcltest] == -1} { } testConstraint exec [llength [info commands exec]] -testConstraint pcre [expr {[interp regexp {}] eq "pcre"}] -testConstraint classic [expr {[interp regexp {}] ne "pcre"}] +testConstraint pcre [expr {[interp regexp {}] ne "classic"}] +testConstraint classic [expr {[interp regexp {}] eq "classic"}] # all known bugs are fixed in PCRE engine, thus disable constraint if PCRE used: -testConstraint knownBug [expr {[testConstraint knownBug] || [interp regexp {}] eq "pcre"}] +testConstraint knownBug [expr {[testConstraint knownBug] || [interp regexp {}] ne "classic"}] catch {unset foo} test regexp-1.1 {basic regexp operation} { diff --git a/tests/regexp2.test b/tests/regexp2.test index d30c3e560da0..c34712bae831 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -26,7 +26,7 @@ set err [catch { # test all available regexp engines (switch default to another one): set org_reeng [interp regexp {}] - foreach reeng {classic pcre} { + foreach reeng {classic pcre dfa} { proc _test_reeng_available {} { variable reeng From f8909122e117dee87290808ac6b0c785ee5ef70a Mon Sep 17 00:00:00 2001 From: sebres Date: Fri, 17 Nov 2017 13:53:34 +0100 Subject: [PATCH 11/27] DFA type usable, more improvements, like common storage for matches (now in classic also) and offsets (PCRE vectors), etc; Test cases extended; --- generic/tclRegexp.c | 241 +++++++++++++++++++++++++++++++++----------- generic/tclRegexp.h | 4 +- tests/regexp.test | 2 +- tests/regexp2.test | 58 ++++++++++- 4 files changed, 238 insertions(+), 67 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 39eae7e40b70..253e348d15ac 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -75,14 +75,25 @@ typedef struct ThreadSpecificData { struct TclRegexp *regexps[NUM_REGEXPS]; /* Compiled forms of above strings. Also * malloc-ed, or NULL if not in use yet. */ +#ifdef HAVE_PCRE + void *offsStorage; /* TSD global storeage for offsets/matches */ + size_t offsStorSize; + void *matchStorage; + size_t matchStorSize; +#endif } ThreadSpecificData; static Tcl_ThreadDataKey dataKey; +/* Static storage just points to NULL used as initial pointer for regexp storages */ +static void *emptyMatchStorage = NULL; + + /* * Declarations for functions used only in this file. */ +static void AllocCaptStorage(TclRegexp *regexpPtr); static TclRegexp * CompileRegexp(Tcl_Interp *interp, const char *pattern, int length, int flags); static void DupRegexpInternalRep(Tcl_Obj *srcPtr, @@ -95,6 +106,16 @@ static int RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re, int nmatches, int flags); static int SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); +#ifdef HAVE_PCRE +typedef struct { + int rm_so; /* start of substring */ + int rm_eo; /* end of substring */ +} regoffs_t; + +#define VectorCoountPCRE(regexpPtr) \ + ((int)(regexpPtr->re.re_nsub+1)*3) + +#endif /* * The regular expression Tcl object type. This serves as a cache of the * compiled form of the regular expression. @@ -241,11 +262,12 @@ Tcl_RegExpRange( * in (sub-)range here. */ { TclRegexp *regexpPtr = (TclRegexp *) re; + regmatch_t *matches = *regexpPtr->matchStorage; const char *string; if ((size_t) index > regexpPtr->re.re_nsub) { *startPtr = *endPtr = NULL; - } else if (regexpPtr->matches[index].rm_so < 0) { + } else if (matches[index].rm_so < 0) { *startPtr = *endPtr = NULL; } else { if (regexpPtr->objPtr) { @@ -253,8 +275,8 @@ Tcl_RegExpRange( } else { string = regexpPtr->string; } - *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so); - *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo); + *startPtr = Tcl_UtfAtIndex(string, matches[index].rm_so); + *endPtr = Tcl_UtfAtIndex(string, matches[index].rm_eo); } } @@ -294,14 +316,20 @@ RegExpExecUniChar( int status; TclRegexp *regexpPtr = (TclRegexp *) re; size_t last = regexpPtr->re.re_nsub + 1; + regmatch_t *matches; size_t nm = last; if (nmatches >= 0 && (size_t) nmatches < nm) { nm = (size_t) nmatches; } + if (!(matches = *regexpPtr->matchStorage)) { + AllocCaptStorage(regexpPtr); + matches = *regexpPtr->matchStorage; + } + status = TclReExec(®expPtr->re, wString, (size_t) numChars, - ®expPtr->details, nm, regexpPtr->matches, flags); + ®expPtr->details, nm, matches, flags); /* * Check for errors. @@ -354,6 +382,7 @@ TclRegExpRangeUniChar( * in (sub-)range here. */ { TclRegexp *regexpPtr = (TclRegexp *) re; + regmatch_t *matches = *regexpPtr->matchStorage; if ((regexpPtr->flags®_EXPECT) && index == -1) { *startPtr = regexpPtr->details.rm_extend.rm_so; @@ -362,8 +391,8 @@ TclRegExpRangeUniChar( *startPtr = -1; *endPtr = -1; } else { - *startPtr = regexpPtr->matches[index].rm_so; - *endPtr = regexpPtr->matches[index].rm_eo; + *startPtr = matches[index].rm_so; + *endPtr = matches[index].rm_eo; } } @@ -466,8 +495,12 @@ Tcl_RegExpExecObj( if (reflags & TCL_REG_PCRE) { #ifdef HAVE_PCRE const char *matchstr; - int match, eflags, nm = (regexpPtr->re.re_nsub + 1) * 3; + int match, eflags, *offsets, nm = VectorCoountPCRE(regexpPtr); + if (!(offsets = *regexpPtr->offsStorage)) { + AllocCaptStorage(regexpPtr); + offsets = *regexpPtr->offsStorage; + } if (textObj->typePtr == &tclByteArrayType) { matchstr = Tcl_GetByteArrayFromObj(textObj, &length); } else { @@ -498,14 +531,21 @@ Tcl_RegExpExecObj( if (!(regexpPtr->flags & TCL_REG_PCDFA)) { match = pcre_exec(regexpPtr->pcre, regexpPtr->study, - matchstr, length, offset, eflags, - (int *) regexpPtr->offsets, nm); + matchstr, length, offset, eflags, offsets, nm); } else { //TODO: int wrkspace[60]; - match = pcre_dfa_exec(regexpPtr->pcre, regexpPtr->study, - matchstr, length, offset, eflags, - (int *) regexpPtr->offsets, nm, wrkspace, 60); + do { + match = pcre_dfa_exec(regexpPtr->pcre, regexpPtr->study, + matchstr, length, offset, eflags, offsets, nm, + wrkspace, 60); + if (match) break; + /* insufficient capture space - enlarge vectors buffer */ + regexpPtr->re.re_nsub = (regexpPtr->re.re_nsub+1)*2; + AllocCaptStorage(regexpPtr); + offsets = *regexpPtr->offsStorage; + nm = VectorCoountPCRE(regexpPtr); + } while(1); } /* @@ -533,7 +573,7 @@ Tcl_RegExpExecObj( * Adjust match indices relative offset where matching began. */ if (offset) { - int i, *offsets = (int *) regexpPtr->offsets; + int i; for (i = 0; i <= match*2; i++) { offsets[i] -= offset; } @@ -619,24 +659,26 @@ Tcl_RegExpGetInfo( Tcl_RegExpInfo *infoPtr) /* Match information is stored here. */ { TclRegexp *regexpPtr = (TclRegexp *) regexp; + regmatch_t *matches = *regexpPtr->matchStorage; infoPtr->nsubs = regexpPtr->re.re_nsub; if (regexpPtr->flags & TCL_REG_PCRE) { #ifdef HAVE_PCRE - if ((int *)regexpPtr->matches != regexpPtr->offsets) { - int i, *offsets = (int *) regexpPtr->offsets; + int *offsets = *regexpPtr->offsStorage; + if ((int *)matches != offsets) { + int i; for (i = 0; i <= infoPtr->nsubs; i++) { - regexpPtr->matches[i].rm_so = offsets[i*2]; - regexpPtr->matches[i].rm_eo = offsets[i*2+1]; + matches[i].rm_so = offsets[i*2]; + matches[i].rm_eo = offsets[i*2+1]; } } - infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; + infoPtr->matches = (Tcl_RegExpIndices *)matches; infoPtr->extendStart = -1; /* XXX support? */ #else Tcl_Panic("Cannot get info for PCRE match"); #endif } else { - infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; + infoPtr->matches = (Tcl_RegExpIndices *)matches; infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so; } } @@ -723,7 +765,9 @@ Tcl_GetRegExpFromObj( flags = TclAdjustRegExpFlags(interp, objPtr, flags); } - /* explicit flag has no meaning further - remove it in order to compare */ + /* + * Explicit flag has no meaning further - remove it in order to compare. + */ flags &= ~TCL_REG_EXPLTYPE; if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { @@ -965,6 +1009,71 @@ SetRegexpFromAny( return TCL_OK; } +/* + * AllocCaptStorage -- + */ +static void +AllocCaptStorage(TclRegexp *regexpPtr) +{ + ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); + int veccnt, nsubs = regexpPtr->re.re_nsub; + +#ifdef HAVE_PCRE + /* + * We use special handling to allocate storages for PCRE offsets/matches, + * because on some systems size we can use the same storage for both, + * if sizes of regoffs_t and regmatch_t are equal. + */ + veccnt = VectorCoountPCRE(regexpPtr); + if (!tsdPtr->offsStorage || tsdPtr->offsStorSize < sizeof(int) * veccnt) { + tsdPtr->offsStorSize = sizeof(int) * veccnt; + /* if initial call (first call) */ + if (!tsdPtr->offsStorage) { + tsdPtr->offsStorage = ckalloc(tsdPtr->offsStorSize); + /* we can use matches = offsets if size of two int's is equal regmatch_t structure */ + if (sizeof(regoffs_t) != sizeof(regmatch_t)) { + tsdPtr->matchStorSize = sizeof(regmatch_t) * (nsubs+1); + tsdPtr->matchStorage = ckalloc(tsdPtr->matchStorSize); + } else { + tsdPtr->matchStorSize = tsdPtr->offsStorSize; + tsdPtr->matchStorage = tsdPtr->offsStorage; + } + } else { + /* enlarge storages */ + tsdPtr->offsStorage = ckrealloc( + (char*)tsdPtr->offsStorage, sizeof(int) * veccnt); + /* we can use matches = offsets if size of two int's is equal regmatch_t structure */ + if (sizeof(regoffs_t) != sizeof(regmatch_t)) { + tsdPtr->matchStorSize = sizeof(regmatch_t) * (nsubs+1); + tsdPtr->matchStorage = ckrealloc( + (char*)tsdPtr->matchStorage, tsdPtr->matchStorSize); + } else { + tsdPtr->matchStorSize = tsdPtr->offsStorSize; + tsdPtr->matchStorage = tsdPtr->offsStorage; + } + } + } + + /* set current references in regexp for fast access without TSD lookup */ + regexpPtr->offsStorage = &(int *)tsdPtr->offsStorage; +#else + if (!tsdPtr->matchStorage || tsdPtr->matchStorSize < sizeof(regmatch_t) * (nsubs+1)) { + tsdPtr->matchStorSize = sizeof(regmatch_t) * (nsubs+1); + /* if initial call (first call) */ + if (!tsdPtr->matchStorage) { + tsdPtr->matchStorage = ckalloc(tsdPtr->matchStorSize); + } else { + /* enlarge storage */ + tsdPtr->matchStorage = ckrealloc( + (char*)tsdPtr->matchStorage, tsdPtr->matchStorSize); + } + } +#endif + + /* set current references in regexp for fast access without TSD lookup */ + regexpPtr->matchStorage = &(regmatch_t *)tsdPtr->matchStorage; +} + /* *--------------------------------------------------------------------------- * @@ -999,7 +1108,7 @@ CompileRegexp( int numChars, status, i, exact; Tcl_DString stringBuf; ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); - + if (!tsdPtr->initialized) { tsdPtr->initialized = 1; Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL); @@ -1090,6 +1199,13 @@ CompileRegexp( if (!(flags & TCL_REG_NLSTOP)) { pcrecflags |= PCRE_DOTALL; } + /* + * Note that DFA currently does not support captured groups (substrings) at all, + * but it returns all matched alternatives instead of. Disable capturing anyway. + */ + if ((flags & TCL_REG_PCDFA)) { + pcrecflags |= PCRE_NO_AUTO_CAPTURE; + } if (cstring[length] != 0) { cstring = (char *) ckalloc(length + 1); @@ -1118,24 +1234,19 @@ CompileRegexp( return NULL; } - /* - * Allocate enough space for all of the subexpressions, plus one extra - * for the entire pattern. - */ - - rc = pcre_fullinfo(pcre, NULL, PCRE_INFO_CAPTURECOUNT, &nsubs); - if (rc == 0) { - regexpPtr->re.re_nsub = nsubs; - regexpPtr->offsets = (int *) - ckalloc(sizeof(int) * (nsubs+1)*3); - /* we can use matches = offsets if size of two int's is equal regmatch_t structure */ - if (sizeof(*regexpPtr->offsets)*2 != sizeof(*regexpPtr->matches)) { - regexpPtr->matches = (regmatch_t *) ckalloc( - sizeof(regmatch_t) * (nsubs+1)); - } else { - regexpPtr->matches = (regmatch_t *)regexpPtr->offsets; - } + nsubs = 0; + if (!(flags & TCL_REG_PCDFA)) { + rc = pcre_fullinfo(pcre, NULL, PCRE_INFO_CAPTURECOUNT, &nsubs); + if (rc != 0) { + /* todo - error handling */ + nsubs = 0; + } } + regexpPtr->re.re_nsub = nsubs; + + /* Don't allocate capture storages, it occurs on demand by the first usage */ + regexpPtr->offsStorage = &(int *)emptyMatchStorage; + regexpPtr->matchStorage = &(regmatch_t *)emptyMatchStorage; #else Tcl_AppendResult(interp, "couldn't compile pcre pattern: pcre unavailabe", NULL); @@ -1171,13 +1282,8 @@ CompileRegexp( return NULL; } - /* - * Allocate enough space for all of the subexpressions, plus one extra - * for the entire pattern. - */ - - regexpPtr->matches = (regmatch_t *) ckalloc( - sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); + /* Don't allocate capture storages, it occurs on demand by the first usage */ + regexpPtr->matchStorage = &(regmatch_t *)emptyMatchStorage; } /* @@ -1258,14 +1364,6 @@ FreeRegexp( if (regexpPtr->globObjPtr) { TclDecrRefCount(regexpPtr->globObjPtr); } - if (regexpPtr->matches) { - ckfree((char *) regexpPtr->matches); - } -#ifdef HAVE_PCRE - if (regexpPtr->offsets && regexpPtr->offsets != (int *)regexpPtr->matches) { - ckfree((char *) regexpPtr->offsets); - } -#endif ckfree((char *) regexpPtr); } @@ -1301,6 +1399,18 @@ FinalizeRegexp( ckfree(tsdPtr->patterns[i]); tsdPtr->patterns[i] = NULL; } +#ifdef HAVE_PCRE + if (tsdPtr->offsStorage != NULL) { + ckfree((char *) tsdPtr->offsStorage); + tsdPtr->offsStorSize = 0; + } + if ( tsdPtr->matchStorage != NULL + && tsdPtr->matchStorage != tsdPtr->offsStorage + ) { + ckfree((char *) tsdPtr->matchStorage); + tsdPtr->matchStorSize = 0; + } +#endif /* * We may find ourselves reinitialized if another finalization routine * invokes regexps. @@ -1598,10 +1708,13 @@ TclRegexpPCRE( * loop when the starting offset is past the end of the string. */ + if (!(offsets = *regexpPtr->offsStorage)) { + AllocCaptStorage(regexpPtr); + offsets = *regexpPtr->offsStorage; + } re = regexpPtr->pcre; study = regexpPtr->study; - offsets = (int *) regexpPtr->offsets; - matchelems = (int) (regexpPtr->re.re_nsub + 1) * 3; + matchelems = VectorCoountPCRE(regexpPtr); eflags = PCRE_NO_UTF8_CHECK; if (all) { pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &pcrecflags); @@ -1655,8 +1768,16 @@ TclRegexpPCRE( } else { //TODO: int wrkspace[60]; - match = pcre_dfa_exec(re, study, matchstr, stringLength, - offset, eflags, offsets, matchelems, wrkspace, 60); + do { + match = pcre_dfa_exec(re, study, matchstr, stringLength, + offset, eflags, offsets, matchelems, wrkspace, 60); + if (match) break; + /* insufficient capture space - enlarge vectors buffer */ + regexpPtr->re.re_nsub = (regexpPtr->re.re_nsub+1)*2; + AllocCaptStorage(regexpPtr); + offsets = *regexpPtr->offsStorage; + matchelems = VectorCoountPCRE(regexpPtr); + } while(1); } if (match < -1) { @@ -1712,10 +1833,12 @@ TclRegexpPCRE( /* * It's the number of substitutions, plus one for the matchVar at * index 0 - * Note we can get fewer matches as specified (thus just use [-1, -1] indices) + * Note we can get fewer matches as specified (thus just use [-1, -1] indices). + * In case of DFA we've count of matched alternatives here (sorted by longest match). */ - objc = regexpPtr->re.re_nsub + 1; + objc = (!(regexpPtr->flags & TCL_REG_PCDFA)) ? + regexpPtr->re.re_nsub + 1 : match; if (!resultPtr) { resultPtr = Tcl_NewObj(); } diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h index aab4ade14989..7cfc5a75efd5 100644 --- a/generic/tclRegexp.h +++ b/generic/tclRegexp.h @@ -35,12 +35,12 @@ typedef struct TclRegexp { #ifdef HAVE_PCRE pcre *pcre; /* PCRE compile re */ pcre_extra *study; /* study of PCRE */ - int *offsets; /* Array of offsets (indices to handle within PCRE) */ + int **offsStorage; /* Storage for array of offsets (indices to handle within PCRE) */ #endif CONST char *string; /* Last string passed to Tcl_RegExpExec. */ Tcl_Obj *objPtr; /* Last object passed to Tcl_RegExpExecObj. */ Tcl_Obj *globObjPtr; /* Glob pattern rep of RE or NULL if none. */ - regmatch_t *matches; /* Array of indices into the Tcl_UniChar + regmatch_t **matchStorage; /* Storage for array of indices into the Tcl_UniChar * representation of the last string matched * with this regexp to indicate the location * of subexpressions. */ diff --git a/tests/regexp.test b/tests/regexp.test index 045dcbf0eda8..e7c630c28779 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -17,7 +17,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { } testConstraint exec [llength [info commands exec]] -testConstraint pcre [expr {[interp regexp {}] ne "classic"}] +testConstraint pcre [expr {[interp regexp {}] eq "pcre"}] testConstraint classic [expr {[interp regexp {}] eq "classic"}] # all known bugs are fixed in PCRE engine, thus disable constraint if PCRE used: testConstraint knownBug [expr {[testConstraint knownBug] || [interp regexp {}] ne "classic"}] diff --git a/tests/regexp2.test b/tests/regexp2.test index c34712bae831..2091d85c19d8 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -41,7 +41,7 @@ set err [catch { testConstraint reeng_$reeng [_test_reeng_available] - # wrapper for tests from regexp.test: + # wrapper for tests from regexp.test (add suffix with re-engine type): proc test {args} { variable reeng uplevel [list __test [lindex $args 0]-$reeng {*}[lrange $args 1 end]] @@ -49,9 +49,15 @@ set err [catch { # --------------------------------------------------------------------------- +# constraint for longest-match instead first-match supported for some features (e.g. alternative patterns): +testConstraint longest-match [expr {[regexp -inline {a|ab|abc} abc] eq "abc"}] +testConstraint pcre [expr {[interp regexp {}] eq "pcre"}] +testConstraint dfa [expr {[interp regexp {}] eq "dfa"}] +testConstraint classic [expr {[interp regexp {}] eq "classic"}] + # special engine-related tests: -test regexp-0.1 {test of interim "switch" of engine} -body { +test regexp2-0.1 {test of interim "switch" of engine} -body { # this test will fail if current engine is not classic (no lookbehind syntax), and if it will be not # switched via parameter -type classic (regardless PCRE available or not) set re {(? Date: Fri, 17 Nov 2017 21:47:44 +0100 Subject: [PATCH 12/27] DFA workspace vector (with reallocation) implemented with shared TSD storage; Common handling for fast access of reStorage rewritten (one pointer instead of multiple references); Bugs fixed. --- generic/tclRegexp.c | 174 +++++++++++++++++++++++++------------------- generic/tclRegexp.h | 20 +++-- 2 files changed, 114 insertions(+), 80 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 253e348d15ac..2f410ab04129 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -75,20 +75,11 @@ typedef struct ThreadSpecificData { struct TclRegexp *regexps[NUM_REGEXPS]; /* Compiled forms of above strings. Also * malloc-ed, or NULL if not in use yet. */ -#ifdef HAVE_PCRE - void *offsStorage; /* TSD global storeage for offsets/matches */ - size_t offsStorSize; - void *matchStorage; - size_t matchStorSize; -#endif + TclRegexpStorage reStorage; /* TSD global (shared) storage for offsets/matches/workspace */ } ThreadSpecificData; static Tcl_ThreadDataKey dataKey; -/* Static storage just points to NULL used as initial pointer for regexp storages */ -static void *emptyMatchStorage = NULL; - - /* * Declarations for functions used only in this file. */ @@ -112,6 +103,8 @@ typedef struct { int rm_eo; /* end of substring */ } regoffs_t; +static void EnlargeWrkSpaceStorage(TclRegexp *regexpPtr); + #define VectorCoountPCRE(regexpPtr) \ ((int)(regexpPtr->re.re_nsub+1)*3) @@ -262,7 +255,7 @@ Tcl_RegExpRange( * in (sub-)range here. */ { TclRegexp *regexpPtr = (TclRegexp *) re; - regmatch_t *matches = *regexpPtr->matchStorage; + regmatch_t *matches = regexpPtr->reStorage->matches; const char *string; if ((size_t) index > regexpPtr->re.re_nsub) { @@ -315,21 +308,18 @@ RegExpExecUniChar( { int status; TclRegexp *regexpPtr = (TclRegexp *) re; + TclRegexpStorage *reStorage = regexpPtr->reStorage; size_t last = regexpPtr->re.re_nsub + 1; - regmatch_t *matches; size_t nm = last; if (nmatches >= 0 && (size_t) nmatches < nm) { nm = (size_t) nmatches; } - if (!(matches = *regexpPtr->matchStorage)) { - AllocCaptStorage(regexpPtr); - matches = *regexpPtr->matchStorage; - } + AllocCaptStorage(regexpPtr); status = TclReExec(®expPtr->re, wString, (size_t) numChars, - ®expPtr->details, nm, matches, flags); + ®expPtr->details, nm, reStorage->matches, flags); /* * Check for errors. @@ -382,7 +372,7 @@ TclRegExpRangeUniChar( * in (sub-)range here. */ { TclRegexp *regexpPtr = (TclRegexp *) re; - regmatch_t *matches = *regexpPtr->matchStorage; + regmatch_t *matches = regexpPtr->reStorage->matches; if ((regexpPtr->flags®_EXPECT) && index == -1) { *startPtr = regexpPtr->details.rm_extend.rm_so; @@ -462,6 +452,7 @@ Tcl_RegExpExecObj( int flags) /* Regular expression execution flags. */ { TclRegexp *regexpPtr = (TclRegexp *) re; + TclRegexpStorage *reStorage = regexpPtr->reStorage; int length; int reflags = regexpPtr->flags; #define TCL_REG_GLOBOK_FLAGS (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE) @@ -497,10 +488,9 @@ Tcl_RegExpExecObj( const char *matchstr; int match, eflags, *offsets, nm = VectorCoountPCRE(regexpPtr); - if (!(offsets = *regexpPtr->offsStorage)) { - AllocCaptStorage(regexpPtr); - offsets = *regexpPtr->offsStorage; - } + AllocCaptStorage(regexpPtr); + offsets = reStorage->offsets; + if (textObj->typePtr == &tclByteArrayType) { matchstr = Tcl_GetByteArrayFromObj(textObj, &length); } else { @@ -534,16 +524,18 @@ Tcl_RegExpExecObj( matchstr, length, offset, eflags, offsets, nm); } else { //TODO: - int wrkspace[60]; do { match = pcre_dfa_exec(regexpPtr->pcre, regexpPtr->study, matchstr, length, offset, eflags, offsets, nm, - wrkspace, 60); + reStorage->wrkSpace, reStorage->wrkSpCnt); + if (match == PCRE_ERROR_DFA_WSSIZE) { + EnlargeWrkSpaceStorage(regexpPtr); + continue; + } if (match) break; /* insufficient capture space - enlarge vectors buffer */ regexpPtr->re.re_nsub = (regexpPtr->re.re_nsub+1)*2; AllocCaptStorage(regexpPtr); - offsets = *regexpPtr->offsStorage; nm = VectorCoountPCRE(regexpPtr); } while(1); } @@ -659,12 +651,12 @@ Tcl_RegExpGetInfo( Tcl_RegExpInfo *infoPtr) /* Match information is stored here. */ { TclRegexp *regexpPtr = (TclRegexp *) regexp; - regmatch_t *matches = *regexpPtr->matchStorage; + regmatch_t *matches = regexpPtr->reStorage->matches; infoPtr->nsubs = regexpPtr->re.re_nsub; if (regexpPtr->flags & TCL_REG_PCRE) { #ifdef HAVE_PCRE - int *offsets = *regexpPtr->offsStorage; + int *offsets = regexpPtr->reStorage->offsets; if ((int *)matches != offsets) { int i; for (i = 0; i <= infoPtr->nsubs; i++) { @@ -1015,8 +1007,8 @@ SetRegexpFromAny( static void AllocCaptStorage(TclRegexp *regexpPtr) { - ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); int veccnt, nsubs = regexpPtr->re.re_nsub; + TclRegexpStorage *reStorage = regexpPtr->reStorage; #ifdef HAVE_PCRE /* @@ -1025,54 +1017,76 @@ AllocCaptStorage(TclRegexp *regexpPtr) * if sizes of regoffs_t and regmatch_t are equal. */ veccnt = VectorCoountPCRE(regexpPtr); - if (!tsdPtr->offsStorage || tsdPtr->offsStorSize < sizeof(int) * veccnt) { - tsdPtr->offsStorSize = sizeof(int) * veccnt; + if (!reStorage->offsets || reStorage->offsSize < sizeof(int) * veccnt) { + reStorage->offsSize = sizeof(int) * veccnt; /* if initial call (first call) */ - if (!tsdPtr->offsStorage) { - tsdPtr->offsStorage = ckalloc(tsdPtr->offsStorSize); + if (!reStorage->offsets) { + reStorage->offsets = (int*)ckalloc(reStorage->offsSize); /* we can use matches = offsets if size of two int's is equal regmatch_t structure */ if (sizeof(regoffs_t) != sizeof(regmatch_t)) { - tsdPtr->matchStorSize = sizeof(regmatch_t) * (nsubs+1); - tsdPtr->matchStorage = ckalloc(tsdPtr->matchStorSize); + reStorage->matchSize = sizeof(regmatch_t) * (nsubs+1); + reStorage->matches = (regmatch_t*)ckalloc(reStorage->matchSize); } else { - tsdPtr->matchStorSize = tsdPtr->offsStorSize; - tsdPtr->matchStorage = tsdPtr->offsStorage; + reStorage->matchSize = reStorage->offsSize; + reStorage->matches = (regmatch_t*)reStorage->offsets; } } else { /* enlarge storages */ - tsdPtr->offsStorage = ckrealloc( - (char*)tsdPtr->offsStorage, sizeof(int) * veccnt); + reStorage->offsets = (int*)ckrealloc( + (char*)reStorage->offsets, sizeof(int) * veccnt); /* we can use matches = offsets if size of two int's is equal regmatch_t structure */ if (sizeof(regoffs_t) != sizeof(regmatch_t)) { - tsdPtr->matchStorSize = sizeof(regmatch_t) * (nsubs+1); - tsdPtr->matchStorage = ckrealloc( - (char*)tsdPtr->matchStorage, tsdPtr->matchStorSize); + reStorage->matchSize = sizeof(regmatch_t) * (nsubs+1); + reStorage->matches = (regmatch_t*)ckrealloc( + (char*)reStorage->matches, reStorage->matchSize); } else { - tsdPtr->matchStorSize = tsdPtr->offsStorSize; - tsdPtr->matchStorage = tsdPtr->offsStorage; + reStorage->matchSize = reStorage->offsSize; + reStorage->matches = (regmatch_t*)reStorage->offsets; } } } - /* set current references in regexp for fast access without TSD lookup */ - regexpPtr->offsStorage = &(int *)tsdPtr->offsStorage; + /* if DFA and still no workspace allocated - initial call */ + if ((regexpPtr->flags & TCL_REG_PCDFA) && !reStorage->wrkSpace) { + reStorage->wrkSpCnt = 60; + reStorage->wrkSpSize = sizeof(int) * reStorage->wrkSpCnt; + reStorage->wrkSpace = (int *)ckalloc(reStorage->wrkSpSize); + } + #else - if (!tsdPtr->matchStorage || tsdPtr->matchStorSize < sizeof(regmatch_t) * (nsubs+1)) { - tsdPtr->matchStorSize = sizeof(regmatch_t) * (nsubs+1); + + if (!reStorage->matches || reStorage->matchSize < sizeof(regmatch_t) * (nsubs+1)) { + reStorage->matchSize = sizeof(regmatch_t) * (nsubs+1); /* if initial call (first call) */ - if (!tsdPtr->matchStorage) { - tsdPtr->matchStorage = ckalloc(tsdPtr->matchStorSize); + if (!reStorage->matches) { + reStorage->matches = (regmatch_t*)ckalloc(reStorage->matchSize); } else { /* enlarge storage */ - tsdPtr->matchStorage = ckrealloc( - (char*)tsdPtr->matchStorage, tsdPtr->matchStorSize); + reStorage->matches = (regmatch_t*)ckrealloc( + (char*)reStorage->matches, reStorage->matchSize); } } #endif +} - /* set current references in regexp for fast access without TSD lookup */ - regexpPtr->matchStorage = &(regmatch_t *)tsdPtr->matchStorage; +#ifdef HAVE_PCRE +static void +EnlargeWrkSpaceStorage(TclRegexp *regexpPtr) { + TclRegexpStorage *reStorage = regexpPtr->reStorage; + size_t newSize; + + /* double size, just to avoid too many reallocations */ + reStorage->wrkSpCnt *= 2; + newSize = sizeof(int) * reStorage->wrkSpCnt; + + if (reStorage->wrkSpSize < newSize) { + reStorage->wrkSpSize = newSize; + /* enlarge storage */ + reStorage->wrkSpace = (int*)ckrealloc( + (char*)reStorage->wrkSpace, newSize); + } } +#endif /* *--------------------------------------------------------------------------- @@ -1244,9 +1258,6 @@ CompileRegexp( } regexpPtr->re.re_nsub = nsubs; - /* Don't allocate capture storages, it occurs on demand by the first usage */ - regexpPtr->offsStorage = &(int *)emptyMatchStorage; - regexpPtr->matchStorage = &(regmatch_t *)emptyMatchStorage; #else Tcl_AppendResult(interp, "couldn't compile pcre pattern: pcre unavailabe", NULL); @@ -1281,11 +1292,14 @@ CompileRegexp( } return NULL; } - - /* Don't allocate capture storages, it occurs on demand by the first usage */ - regexpPtr->matchStorage = &(regmatch_t *)emptyMatchStorage; } + /* + * Don't allocate capture storages, it occurs on demand by the first usage, + * just set current reference in regexp for fast storage access without TSD lookup. + */ + regexpPtr->reStorage = &tsdPtr->reStorage; + /* * Convert RE to a glob pattern equivalent, if any, and cache it. If this * is not possible, then globObjPtr will be NULL. This is used by @@ -1390,6 +1404,7 @@ FinalizeRegexp( int i; TclRegexp *regexpPtr; ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); + TclRegexpStorage *reStorage = &tsdPtr->reStorage; for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) { regexpPtr = tsdPtr->regexps[i]; @@ -1400,17 +1415,24 @@ FinalizeRegexp( tsdPtr->patterns[i] = NULL; } #ifdef HAVE_PCRE - if (tsdPtr->offsStorage != NULL) { - ckfree((char *) tsdPtr->offsStorage); - tsdPtr->offsStorSize = 0; + if (reStorage->offsets != NULL) { + /* preserve dual release of same block */ + if ((int*)reStorage->matches == reStorage->offsets) { + reStorage->matches = NULL; + } + ckfree((char *) reStorage->offsets); + reStorage->offsSize = 0; } - if ( tsdPtr->matchStorage != NULL - && tsdPtr->matchStorage != tsdPtr->offsStorage - ) { - ckfree((char *) tsdPtr->matchStorage); - tsdPtr->matchStorSize = 0; + if (reStorage->wrkSpace != NULL) { + ckfree((char *) reStorage->wrkSpace); + reStorage->wrkSpSize = 0; + reStorage->wrkSpCnt = 0; } #endif + if (reStorage->matches != NULL) { + ckfree((char *) reStorage->matches); + reStorage->matchSize = 0; + } /* * We may find ourselves reinitialized if another finalization routine * invokes regexps. @@ -1683,6 +1705,7 @@ TclRegexpPCRE( pcre *re; pcre_extra *study; TclRegexp *regexpPtr = (TclRegexp *) regExpr; + TclRegexpStorage *reStorage = regexpPtr->reStorage; objPtr = objv[1]; /* @@ -1698,6 +1721,9 @@ TclRegexpPCRE( matchstr = Tcl_GetStringFromObj(objPtr, &stringLength); } + AllocCaptStorage(regexpPtr); + offsets = reStorage->offsets; + objc -= 2; objv += 2; @@ -1708,10 +1734,6 @@ TclRegexpPCRE( * loop when the starting offset is past the end of the string. */ - if (!(offsets = *regexpPtr->offsStorage)) { - AllocCaptStorage(regexpPtr); - offsets = *regexpPtr->offsStorage; - } re = regexpPtr->pcre; study = regexpPtr->study; matchelems = VectorCoountPCRE(regexpPtr); @@ -1766,16 +1788,18 @@ TclRegexpPCRE( match = pcre_exec(re, study, matchstr, stringLength, offset, eflags, offsets, matchelems); } else { - //TODO: - int wrkspace[60]; do { match = pcre_dfa_exec(re, study, matchstr, stringLength, - offset, eflags, offsets, matchelems, wrkspace, 60); + offset, eflags, offsets, matchelems, + reStorage->wrkSpace, reStorage->wrkSpCnt); + if (match == PCRE_ERROR_DFA_WSSIZE) { + EnlargeWrkSpaceStorage(regexpPtr); + continue; + } if (match) break; /* insufficient capture space - enlarge vectors buffer */ regexpPtr->re.re_nsub = (regexpPtr->re.re_nsub+1)*2; AllocCaptStorage(regexpPtr); - offsets = *regexpPtr->offsStorage; matchelems = VectorCoountPCRE(regexpPtr); } while(1); } diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h index 7cfc5a75efd5..39ac15ec4ae6 100644 --- a/generic/tclRegexp.h +++ b/generic/tclRegexp.h @@ -28,6 +28,20 @@ * in order to return pointers into the original string. */ +typedef struct { +#ifdef HAVE_PCRE + int *offsets; /* Storage for array of offsets (indices to handle within PCRE) */ + size_t offsSize; + int *wrkSpace; /* Workspace storage vector (used by parsing via DFA). */ + int wrkSpCnt; /* Current length of shared workspace storage vector */ + size_t wrkSpSize; +#endif + regmatch_t *matches; /* Storage for array of indices into the Tcl_UniChar */ + size_t matchSize; /* representation of the last string matched + * with this regexp to indicate the location + * of subexpressions. */ +} TclRegexpStorage; + typedef struct TclRegexp { int flags; /* Regexp compile flags. */ regex_t re; /* Compiled re, includes number of @@ -35,15 +49,11 @@ typedef struct TclRegexp { #ifdef HAVE_PCRE pcre *pcre; /* PCRE compile re */ pcre_extra *study; /* study of PCRE */ - int **offsStorage; /* Storage for array of offsets (indices to handle within PCRE) */ #endif CONST char *string; /* Last string passed to Tcl_RegExpExec. */ Tcl_Obj *objPtr; /* Last object passed to Tcl_RegExpExecObj. */ Tcl_Obj *globObjPtr; /* Glob pattern rep of RE or NULL if none. */ - regmatch_t **matchStorage; /* Storage for array of indices into the Tcl_UniChar - * representation of the last string matched - * with this regexp to indicate the location - * of subexpressions. */ + TclRegexpStorage *reStorage;/* Shared storage for array of indices, matches, workspace etc. */ rm_detail_t details; /* Detailed information on match (currently * used only for REG_EXPECT). */ int refCount; /* Count of number of references to this From bfc8830b6b8c72325c2feacd7ea680ed545877d7 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 29 Jun 2021 14:53:27 +0200 Subject: [PATCH 13/27] add pcre support for windows build (automake e. g. msys/mingw), also allows static linkage of pcre --- win/Makefile.in | 6 +- win/configure | 174 ++++++++++++++++++++++++++++++++++++++++++++++- win/configure.in | 11 +++ win/tcl.m4 | 137 +++++++++++++++++++++++++++++++++++++ 4 files changed, 324 insertions(+), 4 deletions(-) diff --git a/win/Makefile.in b/win/Makefile.in index d9b52fa939d0..bd5041b6a296 100644 --- a/win/Makefile.in +++ b/win/Makefile.in @@ -183,7 +183,7 @@ EXEEXT = @EXEEXT@ OBJEXT = @OBJEXT@ STLIB_LD = @STLIB_LD@ SHLIB_LD = @SHLIB_LD@ -SHLIB_LD_LIBS = @SHLIB_LD_LIBS@ $(LIBS) +SHLIB_LD_LIBS = @SHLIB_LD_LIBS@ $(LIBS) @PCRE_LIBS@ SHLIB_CFLAGS = @SHLIB_CFLAGS@ SHLIB_SUFFIX = @SHLIB_SUFFIX@ LIBS = @LIBS@ @@ -196,7 +196,7 @@ COPY = cp CC_SWITCHES = ${CFLAGS} ${CFLAGS_WARNING} ${TCL_SHLIB_CFLAGS} \ -I"${GENERIC_DIR_NATIVE}" -DTCL_TOMMATH -DMP_PREC=4 -I"${TOMMATH_DIR_NATIVE}" \ --I"${WIN_DIR_NATIVE}" ${AC_FLAGS} \ +-I"${WIN_DIR_NATIVE}" @PCRE_INCLUDE@ ${AC_FLAGS} \ ${COMPILE_DEBUG_FLAGS} ${NO_DEPRECATED_FLAGS} CC_OBJNAME = @CC_OBJNAME@ @@ -204,7 +204,7 @@ CC_EXENAME = @CC_EXENAME@ STUB_CC_SWITCHES = ${CFLAGS} ${CFLAGS_WARNING} ${SHLIB_CFLAGS} \ -I"${GENERIC_DIR_NATIVE}" -DTCL_TOMMATH -DMP_PREC=4 -I"${TOMMATH_DIR_NATIVE}" \ --I"${WIN_DIR_NATIVE}" ${AC_FLAGS} \ +-I"${WIN_DIR_NATIVE}" @PCRE_INCLUDE@ ${AC_FLAGS} \ ${COMPILE_DEBUG_FLAGS} TCLTEST_OBJS = \ diff --git a/win/configure b/win/configure index 3fe89bfcb924..8ca27ca64d03 100755 --- a/win/configure +++ b/win/configure @@ -309,7 +309,7 @@ ac_includes_default="\ # include #endif" -ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT CPP EGREP AR ac_ct_AR RANLIB ac_ct_RANLIB RC ac_ct_RC SET_MAKE TCL_THREADS CYGPATH CELIB_DIR DL_LIBS CFLAGS_DEBUG CFLAGS_OPTIMIZE CFLAGS_WARNING CFLAGS_DEFAULT LDFLAGS_DEFAULT VC_MANIFEST_EMBED_DLL VC_MANIFEST_EMBED_EXE TCL_WIN_VERSION MACHINE TCL_VERSION TCL_MAJOR_VERSION TCL_MINOR_VERSION TCL_PATCH_LEVEL TCL_LIB_FILE TCL_LIB_FLAG TCL_LIB_SPEC TCL_STUB_LIB_FILE TCL_STUB_LIB_FLAG TCL_STUB_LIB_SPEC TCL_STUB_LIB_PATH TCL_INCLUDE_SPEC TCL_BUILD_STUB_LIB_SPEC TCL_BUILD_STUB_LIB_PATH TCL_DLL_FILE TCL_SRC_DIR TCL_BIN_DIR TCL_DBGX CFG_TCL_SHARED_LIB_SUFFIX CFG_TCL_UNSHARED_LIB_SUFFIX CFG_TCL_EXPORT_FILE_SUFFIX EXTRA_CFLAGS DEPARG CC_OBJNAME CC_EXENAME LDFLAGS_DEBUG LDFLAGS_OPTIMIZE LDFLAGS_CONSOLE LDFLAGS_WINDOW STLIB_LD SHLIB_LD SHLIB_LD_LIBS SHLIB_CFLAGS SHLIB_SUFFIX TCL_SHARED_BUILD LIBS_GUI DLLSUFFIX LIBPREFIX LIBSUFFIX EXESUFFIX LIBRARIES MAKE_LIB MAKE_STUB_LIB POST_MAKE_LIB MAKE_DLL MAKE_EXE TCL_BUILD_LIB_SPEC TCL_LD_SEARCH_FLAGS TCL_NEEDS_EXP_FILE TCL_BUILD_EXP_FILE TCL_EXP_FILE TCL_LIB_VERSIONS_OK TCL_PACKAGE_PATH TCL_DDE_VERSION TCL_DDE_MAJOR_VERSION TCL_DDE_MINOR_VERSION TCL_REG_VERSION TCL_REG_MAJOR_VERSION TCL_REG_MINOR_VERSION RC_OUT RC_TYPE RC_INCLUDE RC_DEFINE RC_DEFINES RES LIBOBJS LTLIBOBJS' +ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT CPP EGREP AR ac_ct_AR RANLIB ac_ct_RANLIB RC ac_ct_RC SET_MAKE TCL_THREADS CYGPATH CELIB_DIR DL_LIBS CFLAGS_DEBUG CFLAGS_OPTIMIZE CFLAGS_WARNING PCRE_INCLUDE PCRE_LIBS CFLAGS_DEFAULT LDFLAGS_DEFAULT VC_MANIFEST_EMBED_DLL VC_MANIFEST_EMBED_EXE TCL_WIN_VERSION MACHINE TCL_VERSION TCL_MAJOR_VERSION TCL_MINOR_VERSION TCL_PATCH_LEVEL TCL_LIB_FILE TCL_LIB_FLAG TCL_LIB_SPEC TCL_STUB_LIB_FILE TCL_STUB_LIB_FLAG TCL_STUB_LIB_SPEC TCL_STUB_LIB_PATH TCL_INCLUDE_SPEC TCL_BUILD_STUB_LIB_SPEC TCL_BUILD_STUB_LIB_PATH TCL_DLL_FILE TCL_SRC_DIR TCL_BIN_DIR TCL_DBGX CFG_TCL_SHARED_LIB_SUFFIX CFG_TCL_UNSHARED_LIB_SUFFIX CFG_TCL_EXPORT_FILE_SUFFIX EXTRA_CFLAGS DEPARG CC_OBJNAME CC_EXENAME LDFLAGS_DEBUG LDFLAGS_OPTIMIZE LDFLAGS_CONSOLE LDFLAGS_WINDOW STLIB_LD SHLIB_LD SHLIB_LD_LIBS SHLIB_CFLAGS SHLIB_SUFFIX TCL_SHARED_BUILD LIBS_GUI DLLSUFFIX LIBPREFIX LIBSUFFIX EXESUFFIX LIBRARIES MAKE_LIB MAKE_STUB_LIB POST_MAKE_LIB MAKE_DLL MAKE_EXE TCL_BUILD_LIB_SPEC TCL_LD_SEARCH_FLAGS TCL_NEEDS_EXP_FILE TCL_BUILD_EXP_FILE TCL_EXP_FILE TCL_LIB_VERSIONS_OK TCL_PACKAGE_PATH TCL_DDE_VERSION TCL_DDE_MAJOR_VERSION TCL_DDE_MINOR_VERSION TCL_REG_VERSION TCL_REG_MAJOR_VERSION TCL_REG_MINOR_VERSION RC_OUT RC_TYPE RC_INCLUDE RC_DEFINE RC_DEFINES RES LIBOBJS LTLIBOBJS' ac_subst_files='' # Initialize some variables set by options. @@ -844,6 +844,7 @@ Optional Features: --enable-shared build and link with shared libraries (default: on) --enable-64bit enable 64bit support (where applicable) --enable-wince enable Win/CE support (where applicable) + --enable-pcre whether to enable pcre (default: off) --enable-symbols build with debugging symbols (default: off) --enable-embedded-manifest embed manifest if possible (default: yes) @@ -853,6 +854,7 @@ Optional Packages: --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) --with-encoding encoding for configuration values --with-celib=DIR use Windows/CE support library from DIR + --with-pcre directory containing pcre headers and libraries Some influential environment variables: CC C compiler command @@ -4271,6 +4273,174 @@ _ACEOF +#------------------------------------------------------------------------------ +# Check if we want to use pcre +#------------------------------------------------------------------------------ + + + +# Check whether --with-pcre or --without-pcre was given. +if test "${with_pcre+set}" = set; then + withval="$with_pcre" + with_pcre=${withval} +fi; + echo "$as_me:$LINENO: checking for PCRE configuration" >&5 +echo $ECHO_N "checking for PCRE configuration... $ECHO_C" >&6 + + if test "${ac_cv_c_pcre+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + + # First check to see if --with-pcre was specified. + if test x"${with_pcre}" != x ; then + if test -f "${with_pcre}/pcre.h" -a \ + \( -f "${with_pcre}/.libs/libpcre.so" -o \ + -f "${with_pcre}/.libs/libpcre.a" \); then + ac_cv_c_pcre=`(cd ${with_pcre}; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}" + PCRE_LIBS="-L${ac_cv_c_pcre}/.libs -lpcre" + elif test -f "${with_pcre}/include/pcre.h" -a \ + \( -f "${with_pcre}/lib/libpcre.so" -o \ + -f "${with_pcre}/lib/libpcre.a" \); then + ac_cv_c_pcre=`(cd ${with_pcre}; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}/include" + PCRE_LIBS="-L${ac_cv_c_pcre}/lib -lpcre" + else + { { echo "$as_me:$LINENO: error: ${with_pcre} directory doesn't contain pcre header and/or library" >&5 +echo "$as_me: error: ${with_pcre} directory doesn't contain pcre header and/or library" >&2;} + { (exit 1); exit 1; }; } + fi + fi + + if test x"${ac_cv_c_pcre}" = x ; then + # Try pcre-config if it exists + ac_cv_c_pcre=`pcre-config --prefix 2>/dev/null` + if test "$?" -ne 0; then + PCRE_INCLUDE=`pcre-config --cflags 2>/dev/null` + PCRE_LIBS=`pcre-config --libs 2>/dev/null` + fi + fi + + # check in a few common install locations + if test x"${ac_cv_c_pcre}" = x ; then + for i in \ + `ls -d ${exec_prefix} 2>/dev/null` \ + `ls -d ${prefix} 2>/dev/null` \ + `ls -d /usr/local 2>/dev/null` \ + `ls -d /usr/contrib 2>/dev/null` \ + `ls -d /usr 2>/dev/null` \ + ; do + if test -f "${i}/include/pcre.h" -a \ + \( -f "${i}/lib/libpcre.so" -o \ + -f "${i}/lib/libpcre.a" \); then + ac_cv_c_pcre=`(cd $i; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}/include" + PCRE_LIBS="-L${ac_cv_c_pcre}/lib -lpcre" + break + fi + done + fi + +fi + + + if test x"${ac_cv_c_pcre}" = x ; then + { echo "$as_me:$LINENO: WARNING: Can't find PCRE configuration, PCRE won't be used" >&5 +echo "$as_me: WARNING: Can't find PCRE configuration, PCRE won't be used" >&2;} + else + echo "$as_me:$LINENO: result: found PCRE configuration at ${ac_cv_c_pcre}" >&5 +echo "${ECHO_T}found PCRE configuration at ${ac_cv_c_pcre}" >&6 + fi + + + + + + echo "$as_me:$LINENO: checking whether to enable pcre in Tcl" >&5 +echo $ECHO_N "checking whether to enable pcre in Tcl... $ECHO_C" >&6 + # Check whether --enable-pcre or --disable-pcre was given. +if test "${enable_pcre+set}" = set; then + enableval="$enable_pcre" + pcre_ok=$enableval +else + pcre_ok=no +fi; + + if test "${enable_pcre+set}" = set; then + enableval="$enable_pcre" + pcre_ok=$enableval + else + pcre_ok=yes + fi + + if test x"${ac_cv_c_pcre}" = x ; then + echo "$as_me:$LINENO: result: pcre configuration not found" >&5 +echo "${ECHO_T}pcre configuration not found" >&6 + else + if test "$pcre_ok" = "static" ; then + echo "$as_me:$LINENO: result: pcre enabled (static)" >&5 +echo "${ECHO_T}pcre enabled (static)" >&6 + +cat >>confdefs.h <<\_ACEOF +#define PCRE_STATIC 1 +_ACEOF + + +cat >>confdefs.h <<\_ACEOF +#define HAVE_PCRE 1 +_ACEOF + + elif test "$pcre_ok" = "static,default" ; then + echo "$as_me:$LINENO: result: pcre enabled (static,default)" >&5 +echo "${ECHO_T}pcre enabled (static,default)" >&6 + +cat >>confdefs.h <<\_ACEOF +#define PCRE_STATIC 1 +_ACEOF + + +cat >>confdefs.h <<\_ACEOF +#define USE_DEFAULT_PCRE 1 +_ACEOF + + +cat >>confdefs.h <<\_ACEOF +#define HAVE_PCRE 1 +_ACEOF + + elif test "$pcre_ok" = "default" ; then + echo "$as_me:$LINENO: result: pcre enabled (default)" >&5 +echo "${ECHO_T}pcre enabled (default)" >&6 + +cat >>confdefs.h <<\_ACEOF +#define USE_DEFAULT_PCRE 1 +_ACEOF + + +cat >>confdefs.h <<\_ACEOF +#define HAVE_PCRE 1 +_ACEOF + + elif test "$pcre_ok" = "yes" ; then + echo "$as_me:$LINENO: result: pcre enabled" >&5 +echo "${ECHO_T}pcre enabled" >&6 + +cat >>confdefs.h <<\_ACEOF +#define HAVE_PCRE 1 +_ACEOF + + else + echo "$as_me:$LINENO: result: no pcre" >&5 +echo "${ECHO_T}no pcre" >&6 + fi + fi + + +#-------------------------------------------------------------------- +# Check for various typedefs and provide substitutes if +# they don't exist. +#-------------------------------------------------------------------- + echo "$as_me:$LINENO: checking for intptr_t" >&5 echo $ECHO_N "checking for intptr_t... $ECHO_C" >&6 if test "${ac_cv_type_intptr_t+set}" = set; then @@ -5595,6 +5765,8 @@ s,@DL_LIBS@,$DL_LIBS,;t t s,@CFLAGS_DEBUG@,$CFLAGS_DEBUG,;t t s,@CFLAGS_OPTIMIZE@,$CFLAGS_OPTIMIZE,;t t s,@CFLAGS_WARNING@,$CFLAGS_WARNING,;t t +s,@PCRE_INCLUDE@,$PCRE_INCLUDE,;t t +s,@PCRE_LIBS@,$PCRE_LIBS,;t t s,@CFLAGS_DEFAULT@,$CFLAGS_DEFAULT,;t t s,@LDFLAGS_DEFAULT@,$LDFLAGS_DEFAULT,;t t s,@VC_MANIFEST_EMBED_DLL@,$VC_MANIFEST_EMBED_DLL,;t t diff --git a/win/configure.in b/win/configure.in index 8931a38efc56..b2bd4b8bf165 100644 --- a/win/configure.in +++ b/win/configure.in @@ -98,6 +98,17 @@ SC_ENABLE_SHARED SC_CONFIG_CFLAGS +#------------------------------------------------------------------------------ +# Check if we want to use pcre +#------------------------------------------------------------------------------ + +SC_ENABLE_PCRE + +#-------------------------------------------------------------------- +# Check for various typedefs and provide substitutes if +# they don't exist. +#-------------------------------------------------------------------- + AC_CHECK_TYPE([intptr_t], [ AC_DEFINE([HAVE_INTPTR_T], 1, [Do we have the intptr_t type?])], [ AC_CACHE_CHECK([for pointer-size signed integer type], tcl_cv_intptr_t, [ diff --git a/win/tcl.m4 b/win/tcl.m4 index 006778cb0c17..170b7296a3fe 100644 --- a/win/tcl.m4 +++ b/win/tcl.m4 @@ -338,6 +338,143 @@ AC_DEFUN([SC_LOAD_TKCONFIG], [ AC_SUBST(TK_LIB_FILE) ]) +#------------------------------------------------------------------------ +# SC_WITH_PCRE -- +# +# Finds the PCRE header and library files for use with Tcl +# +# Arguments: +# none +# +# Results: +# +# Adds the following arguments to configure: +# --with-pcre=/path/to/pcre +# +# Sets the following vars: +# PCRE_DIR +#------------------------------------------------------------------------ + +AC_DEFUN([SC_WITH_PCRE], [ + AC_ARG_WITH(pcre, + AC_HELP_STRING([--with-pcre], + [directory containing pcre headers and libraries]), + [with_pcre=${withval}]) + AC_MSG_CHECKING([for PCRE configuration]) + + AC_CACHE_VAL(ac_cv_c_pcre,[ + # First check to see if --with-pcre was specified. + if test x"${with_pcre}" != x ; then + if test -f "${with_pcre}/pcre.h" -a \ + \( -f "${with_pcre}/.libs/libpcre.so" -o \ + -f "${with_pcre}/.libs/libpcre.a" \); then + ac_cv_c_pcre=`(cd ${with_pcre}; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}" + PCRE_LIBS="-L${ac_cv_c_pcre}/.libs -lpcre" + elif test -f "${with_pcre}/include/pcre.h" -a \ + \( -f "${with_pcre}/lib/libpcre.so" -o \ + -f "${with_pcre}/lib/libpcre.a" \); then + ac_cv_c_pcre=`(cd ${with_pcre}; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}/include" + PCRE_LIBS="-L${ac_cv_c_pcre}/lib -lpcre" + else + AC_MSG_ERROR([${with_pcre} directory doesn't contain pcre header and/or library]) + fi + fi + + if test x"${ac_cv_c_pcre}" = x ; then + # Try pcre-config if it exists + ac_cv_c_pcre=`pcre-config --prefix 2>/dev/null` + if test "$?" -ne 0; then + PCRE_INCLUDE=`pcre-config --cflags 2>/dev/null` + PCRE_LIBS=`pcre-config --libs 2>/dev/null` + fi + fi + + # check in a few common install locations + if test x"${ac_cv_c_pcre}" = x ; then + for i in \ + `ls -d ${exec_prefix} 2>/dev/null` \ + `ls -d ${prefix} 2>/dev/null` \ + `ls -d /usr/local 2>/dev/null` \ + `ls -d /usr/contrib 2>/dev/null` \ + `ls -d /usr 2>/dev/null` \ + ; do + if test -f "${i}/include/pcre.h" -a \ + \( -f "${i}/lib/libpcre.so" -o \ + -f "${i}/lib/libpcre.a" \); then + ac_cv_c_pcre=`(cd $i; pwd)` + PCRE_INCLUDE="-I${ac_cv_c_pcre}/include" + PCRE_LIBS="-L${ac_cv_c_pcre}/lib -lpcre" + break + fi + done + fi + ]) + + if test x"${ac_cv_c_pcre}" = x ; then + AC_MSG_WARN([Can't find PCRE configuration, PCRE won't be used]) + else + AC_MSG_RESULT([found PCRE configuration at ${ac_cv_c_pcre}]) + fi + AC_SUBST([PCRE_INCLUDE]) + AC_SUBST([PCRE_LIBS]) +]) + +#------------------------------------------------------------------------ +# SC_ENABLE_PCRE -- +# +# Allows the use of PCRE in Tcl as default +# +# Arguments: +# none +# +# Results: +# Adds the following arguments to configure: +# --enable-pcre=yes|static|default|static,default|no|pcre +# +#------------------------------------------------------------------------ + +AC_DEFUN([SC_ENABLE_PCRE], [ + AC_REQUIRE([SC_WITH_PCRE]) + AC_MSG_CHECKING([whether to enable pcre in Tcl]) + AC_ARG_ENABLE(pcre, + AC_HELP_STRING([--enable-pcre], + [whether to enable pcre (default: off)]), + [pcre_ok=$enableval], [pcre_ok=no]) + + if test "${enable_pcre+set}" = set; then + enableval="$enable_pcre" + pcre_ok=$enableval + else + pcre_ok=yes + fi + + if test x"${ac_cv_c_pcre}" = x ; then + AC_MSG_RESULT([pcre configuration not found]) + else + if test "$pcre_ok" = "static" ; then + AC_MSG_RESULT([pcre enabled (static)]) + AC_DEFINE(PCRE_STATIC, 1, [Link PCRE statically?]) + AC_DEFINE(HAVE_PCRE, 1, [Do we enable PCRE interfaces?]) + elif test "$pcre_ok" = "static,default" ; then + AC_MSG_RESULT([pcre enabled (static,default)]) + AC_DEFINE(PCRE_STATIC, 1, [Link PCRE statically?]) + AC_DEFINE(USE_DEFAULT_PCRE, 1, [Use PCRE as default RE?]) + AC_DEFINE(HAVE_PCRE, 1, [Do we enable PCRE interfaces?]) + elif test "$pcre_ok" = "default" ; then + AC_MSG_RESULT([pcre enabled (default)]) + AC_DEFINE(USE_DEFAULT_PCRE, 1, [Use PCRE as default RE?]) + AC_DEFINE(HAVE_PCRE, 1, [Do we enable PCRE interfaces?]) + elif test "$pcre_ok" = "yes" ; then + AC_MSG_RESULT([pcre enabled]) + AC_DEFINE(HAVE_PCRE, 1, [Do we enable PCRE interfaces?]) + else + AC_MSG_RESULT([no pcre]) + fi + fi +]) + #------------------------------------------------------------------------ # SC_ENABLE_SHARED -- # From acb4fcc8ab4f3cc917a795381d986e384313188d Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 29 Jun 2021 15:31:47 +0200 Subject: [PATCH 14/27] resolve several warnings --- generic/tcl.h | 4 ++-- generic/tclRegexp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 084df805a948..9e44bb6e0bbf 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -566,9 +566,9 @@ typedef void (Tcl_ThreadCreateProc) _ANSI_ARGS_((ClientData clientData)); #define TCL_REG_PCDFA 0x40000000 /* DFA variant of PCRE engine */ /* Following two macros used to supply TCL_REG_PCRE, TCL_REG_PCDFA and TCL_REG_EXPLTYPE -/* to INST_REGEXP over one byte op (instead of first 3 bits, that currently never compiled + * to INST_REGEXP over one byte op (instead of first 3 bits, that currently never compiled * e. g. TCL_REG_ADVANCED, that is always set in compiled variant) */ -#define TCL_REG_COMPILE_SHIFT(v) ((v&~0x70000000)|(v>>28)&0x07) +#define TCL_REG_COMPILE_SHIFT(v) ((v&~0x70000000)|((v>>28)&0x07)) #define TCL_REG_COMPILE_UNSHIFT(v) ((v&~0x07)|((v&0x07)<<28)|TCL_REG_ADVANCED) /* diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 2f410ab04129..b9a40411dd4c 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -492,7 +492,7 @@ Tcl_RegExpExecObj( offsets = reStorage->offsets; if (textObj->typePtr == &tclByteArrayType) { - matchstr = Tcl_GetByteArrayFromObj(textObj, &length); + matchstr = (const char*)Tcl_GetByteArrayFromObj(textObj, &length); } else { matchstr = Tcl_GetStringFromObj(textObj, &length); } @@ -1712,7 +1712,7 @@ TclRegexpPCRE( * Get match string and translate offset into correct placement for utf-8 chars. */ if (objPtr->typePtr == &tclByteArrayType) { - matchstr = Tcl_GetByteArrayFromObj(objPtr, &stringLength); + matchstr = (const char *)Tcl_GetByteArrayFromObj(objPtr, &stringLength); if (offset && offset < stringLength) { /* XXX: probably needs length restriction */ offset = Tcl_UtfAtIndex(matchstr, offset) - matchstr; From b72b8447b39a3e818260354c58e73c303840961f Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 29 Jun 2021 16:48:17 +0200 Subject: [PATCH 15/27] tests/regexp.test: increase test coverage: added multi-byte utf capability tests; illustrates wrong handling of indices using pcre engine - retrieves bytes offsets instead of char offsets (tests 'regexp-3.8m*-pcre' and 'regexp-17.3m*-pcre' fail) --- tests/regexp.test | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/regexp.test b/tests/regexp.test index e7c630c28779..66cd7be5e5ab 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -160,6 +160,22 @@ test regexp-3.7 {getting substrings back from regexp} { set foo 1; set f2 1; set f3 1; set f4 1 list [regexp -indices (a)(b)?(c) xacy foo f2 f3 f4] $foo $f2 $f3 $f4 } {1 {1 2} {1 1} {-1 -1} {2 2}} +test regexp-3.8m {getting substrings back from regexp (multi-byte utf capability)} { + set foo 1; set f2 1; set f3 1; set f4 1 + list [regexp -indices {(\w)(b)?([[:upper:]])} a\xe4\xc4\xe4\xdc\xfcz foo f2 f3 f4] $foo $f2 $f3 $f4 +} {1 {1 2} {1 1} {-1 -1} {2 2}} +test regexp-3.8m2 {getting substrings back from regexp (multi-byte utf capability)} { + set foo 1; set f2 1; set f3 1; set f4 1 + list [regexp -start 2 -indices {(\w)(b)?([[:upper:]])} a\xe4\xc4\xe4\xdc\xfcz foo f2 f3 f4] $foo $f2 $f3 $f4 +} {1 {3 4} {3 3} {-1 -1} {4 4}} +test regexp-3.8m3 {getting substrings back from regexp (multi-byte utf capability, start/indices)} { + set foo 1; set f2 1; set f3 1; set f4 1 + list [regexp -start 3 -indices {([[:upper:]])(b)?(\w)} a\xe4\xc4\xe4\xdc\xfcz foo f2 f3 f4] $foo $f2 $f3 $f4 +} {1 {4 5} {4 4} {-1 -1} {5 5}} +test regexp-3.8m4 {getting substrings back from regexp (multi-byte utf capability, start)} { + set foo 1; set f2 1; set f3 1; set f4 1 + list [regexp -start 3 {([[:upper:]])(b)?(\w)} a\xe4\xc4\xe4\xdc\xfcz foo f2 f3 f4] $foo $f2 $f3 $f4 +} [list 1 \xdc\xfc \xdc {} \xfc] test regexp-4.1 {-nocase option to regexp} { regexp -nocase foo abcFOo @@ -309,6 +325,15 @@ test regexp-7.17 {regsub utf compliance} { regsub a\u4e4eb xyza\u4e4ebijka\u4e4ebpqr 555 bar list [string compare $foo $bar] [regexp 4 $bar] } {0 0} +test regexp-7.17m {regsub multi-byte utf capability} { + regsub {(..)(\w)(.)(\w)(..)} a\xe4\xc4\xe4\xdc\xfcz {\1-\2-\3-\4-\5} +} "a\xe4-\xc4-\xe4-\xdc-\xfcz" +test regexp-7.18m {regsub multi-byte utf capability} { + regsub -all {\w{2}(?=\w)} a\xe4\xc4\xe4\xdc\xfcz {\0-} +} "a\xe4-\xc4\xe4-\xdc\xfc-z" +test regexp-7.18m2 {regsub multi-byte utf capability} { + regsub -all -start 2 {\w{2}(?=\w)} a\xe4\xc4\xe4\xdc\xfcz {\0-} +} "a\xe4\xc4\xe4-\xdc\xfc-z" test regexp-8.1 {case conversion in regsub} { list [regsub -nocase a(a+) xaAAaAAay & foo] $foo @@ -571,6 +596,18 @@ test regexp-17.2 {regexp -inline} { test regexp-17.3 {regexp -inline -indices} { regexp -inline -indices (b) ababa } {{1 1} {1 1}} +test regexp-17.3m {regexp -indices (multi-byte utf capability)} { + regexp -inline -indices {..(\w).(\w)..} a\xe4\xc4\xe4\xdc\xfcz +} {{0 6} {2 2} {4 4}} +test regexp-17.3m2 {regexp -indices -start (multi-byte utf capability)} { + regexp -start 2 -inline -indices {(\w)([[:upper:]])} a\xe4\xc4\xe4\xdc\xfcz +} {{3 4} {3 3} {4 4}} +test regexp-17.3m3 {regexp -indices -all (multi-byte utf capability)} { + regexp -all -inline -indices {(\w)([[:upper:]])} a\xe4\xc4\xe4\xdc\xfcz +} {{1 2} {1 1} {2 2} {3 4} {3 3} {4 4}} +test regexp-17.3m4 {regexp -indices -all -start (multi-byte utf capability)} { + regexp -all -start 2 -inline -indices {(\w)([[:upper:]])} a\xe4\xc4\xe4\xdc\xfc\xc4\xe4z +} {{3 4} {3 3} {4 4} {5 6} {5 5} {6 6}} test regexp-17.4 {regexp -inline} { regexp -inline {\w(\d+)\w} " hello 23 there456def " } {e456d 456} From 61189c7c878f4aa819923e3b5cc11861db4a5b44 Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 30 Jun 2021 16:52:15 +0200 Subject: [PATCH 16/27] fixes prce behavior on multi-byte utf-8 sequences (indices, start offset, etc); more tests --- generic/tclRegexp.c | 135 ++++++++++++++++++++++++++++++++++++-------- tests/regexp.test | 11 +++- 2 files changed, 120 insertions(+), 26 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index b9a40411dd4c..57d238abe5f0 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -1190,7 +1190,8 @@ CompileRegexp( * Convert from Tcl classic to PCRE cflags */ - /* XXX Should enable PCRE_UTF8 selectively on non-ByteArray Tcl_Obj */ + /* XXX Should enable PCRE_UTF8 selectively on non-ByteArray Tcl_Obj + * TODO: parse of bytearray doesn't expect PCRE_UTF8 here */ pcrecflags = PCRE_UTF8 | PCRE_UCP | PCRE_NO_UTF8_CHECK | PCRE_DOLLAR_ENDONLY; /* @@ -1699,26 +1700,49 @@ TclRegexpPCRE( { #ifdef HAVE_PCRE int i, match, eflags, pcrecflags = 0, stringLength, matchelems, *offsets, - offsetDiff, numMatches = 0; + offsetDiff, offsetC = offset, numMatches = 0, utfstr; Tcl_Obj *objPtr, *resultPtr = NULL; const char *matchstr; pcre *re; pcre_extra *study; TclRegexp *regexpPtr = (TclRegexp *) regExpr; TclRegexpStorage *reStorage = regexpPtr->reStorage; + struct { + int coffs; /* last known offset in bytes */ + int boffs; /* last known offset in chars */ + } mb[2] = {{0, 0}, {0, 0}}; objPtr = objv[1]; + utfstr = (objPtr->typePtr != &tclByteArrayType); /* * Get match string and translate offset into correct placement for utf-8 chars. */ - if (objPtr->typePtr == &tclByteArrayType) { - matchstr = (const char *)Tcl_GetByteArrayFromObj(objPtr, &stringLength); - if (offset && offset < stringLength) { - /* XXX: probably needs length restriction */ - offset = Tcl_UtfAtIndex(matchstr, offset) - matchstr; + if (utfstr) { + matchstr = Tcl_GetStringFromObj(objPtr, &stringLength); + /* OFFS_CHAR2BYTE: convert offset in chars to offset in bytes */ + if (offset > 0) { + Tcl_UniChar ch; + const char *src = matchstr, *srcend = matchstr + stringLength; + + + mb[0].coffs = offset; + /* Tcl_UtfAtIndex considering string length */ + while (offset-- > 0 && src < srcend) { + src += TclUtfToUniChar(src, &ch); + } + mb[1].coffs = mb[0].coffs -= offset+1; + mb[1].boffs = mb[0].boffs = src - matchstr; + if (offset <= 0) { + offset = mb[0].boffs; + } else { + offset = stringLength+1; /* outside of string (and > 0 for empty string) */ + } } } else { - matchstr = Tcl_GetStringFromObj(objPtr, &stringLength); + matchstr = (const char *)Tcl_GetByteArrayFromObj(objPtr, &stringLength); + if (offset > 0) { + mb[1].coffs = mb[0].coffs = mb[1].boffs = mb[0].boffs = offset; + } } AllocCaptStorage(regexpPtr); @@ -1755,20 +1779,23 @@ TclRegexpPCRE( * If offset > stringLength, it avoids bad offset error (PCRE_ERROR_BADOFFSET). */ if (offset >= stringLength) { - int bol; - /* avoid match {^$} without multiline, if we are out of range */ + int bol; + /* avoid match {^$} without multiline, if we are out of range */ if (!numMatches && offset > stringLength) { eflags |= PCRE_NOTBOL; } /* safe offset to correct indices if empty match found */ - offsetDiff = offset; + offsetDiff = offsetC; offset = stringLength; /* offset after last char */ if (all && numMatches && offset) { - if (objPtr->typePtr != &tclByteArrayType) { - bol = *(Tcl_UtfAtIndex(matchstr, offset-1)) == '\n'; + /* + if (utfstr) { + bol = *(Tcl_UtfPrev(matchstr + offset, matchstr)) == '\n'; } else { bol = matchstr[offset-1] == '\n'; } + */ + bol = matchstr[offset-1] == '\n'; /* fast fallback if we are not begin of new-line (cannot match anyway) */ if (!bol) { break; @@ -1835,14 +1862,21 @@ TclRegexpPCRE( /* If we tried unshifted search - repeat from next offset */ if (eflags & PCRE_NOTEMPTY_ATSTART) { eflags &= ~(PCRE_NOTEMPTY_ATSTART|PCRE_ANCHORED); - offset++; - if (objPtr->typePtr != &tclByteArrayType) { - offset = Tcl_UtfAtIndex(matchstr, offset) - matchstr; + offsetC++; + if (utfstr && offset < stringLength) { + offset = Tcl_UtfNext(matchstr + offset) - matchstr; + } else { + offset++; } continue; } /* offset to end of string */ offset = stringLength; + if (utfstr && indices) { + offsetC = Tcl_NumUtfChars(matchstr, stringLength); + } else { + offsetC = offset; + } /* repeat once search at end */ continue; @@ -1864,35 +1898,86 @@ TclRegexpPCRE( objc = (!(regexpPtr->flags & TCL_REG_PCDFA)) ? regexpPtr->re.re_nsub + 1 : match; if (!resultPtr) { - resultPtr = Tcl_NewObj(); + /* empty list with reserved elements by current matched count */ + resultPtr = Tcl_NewListObj(objc, NULL); } } + for (i = 0; i < objc; i++) { Tcl_Obj *newPtr; int start, end; if (i < match) { - if (!offsetDiff) { - start = offsets[i*2]; - end = offsets[i*2 + 1]; - } else { + start = offsets[i*2]; + end = offsets[i*2 + 1]; + /* OFFS_BYTE2CHAR: convert offset in bytes to offset in chars */ + if (indices) { + if (!offsetDiff) { + if (start >= 0) { + int bstart = start, bend = end; + const char *src, *srcend; + Tcl_UniChar ch; + + if (bstart >= mb[1].boffs) { + bstart -= mb[1].boffs; + bend -= mb[1].boffs; + src = matchstr + mb[1].boffs; + start = mb[1].coffs; + } else if (bstart >= mb[0].boffs) { + bstart -= mb[0].boffs; + bend -= mb[0].boffs; + src = matchstr + mb[0].boffs; + start = mb[0].coffs; + } else { + /* todo: check this obscure case is possible at all, + * e. g. by unshifted search */ + src = matchstr; + start = 0; + } + srcend = src + bstart; + while (src < srcend) { + start++; + src += TclUtfToUniChar(src, &ch); + } + end = start; + if (bend > bstart) { + bend -= bstart; + srcend = src + bend; + while (src < srcend) { + end++; + src += TclUtfToUniChar(src, &ch); + } + } + if (i == 0) { + mb[0].boffs = offsets[0]; + mb[0].coffs = start; + mb[1].boffs = offsets[1]; + mb[1].coffs = end; + } + } + } else { /* if out of range we've always empty match [offs, offs-1] */ end = start = offsetDiff; + } } } else { start = -1; - end = -1; + end = 0; } if (indices) { Tcl_Obj *objs[2]; objs[0] = Tcl_NewLongObj(start); - objs[1] = Tcl_NewLongObj((end < 0) ? end : end - 1); + objs[1] = Tcl_NewLongObj(end >= 0 ? end-1 : end); newPtr = Tcl_NewListObj(2, objs); } else { if (i < match) { - newPtr = Tcl_NewStringObj(matchstr + start, end - start); + if (utfstr) { + newPtr = Tcl_NewStringObj(matchstr + start, end - start); + } else { + newPtr = Tcl_NewByteArrayObj((const unsigned char *)(matchstr + start), end - start); + } } else { newPtr = Tcl_NewObj(); } @@ -1930,8 +2015,10 @@ TclRegexpPCRE( if (offsets[1] > offsets[0]) { offset = offsets[1]; + offsetC = mb[1].coffs; /* only used by indices as offsetDiff */ } else { offset = offsets[0]; + offsetC = mb[0].coffs; /* only used by indices as offsetDiff */ eflags |= (PCRE_NOTEMPTY_ATSTART|PCRE_ANCHORED); } } diff --git a/tests/regexp.test b/tests/regexp.test index 66cd7be5e5ab..666981805061 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -517,8 +517,10 @@ test regexp-15.5 {regexp -start, over end of string} { list [regexp -start [string length 1abc2de3] {\d} 1abc2de3 x] [info exists x] } {0 0} test regexp-15.6 {regexp -start, loss of ^$ behavior} { - list [regexp -start 2 {^$} {}] -} {0} + set f1 - + list [regexp -start 2 {^$} {}] [regexp -indices -start 2 {^$} {} f1] $f1 \ + [regexp -start 2 {^.*$} {}] [regexp -indices -start 2 {^.*$} {} f1] $f1 +} {0 0 - 0 0 -} test regexp-15.7 {regexp -start, double option} { regexp -start 2 -start 0 a abc } 1 @@ -1082,6 +1084,11 @@ test regexp-26.14 {anchored regexp with -line option} { [regexp -all -inline -line -- {^a*} "a\n"] \ [regexp -all -inline -line -indices -- {^a*} "a\n"] } {{a {}} {{0 0} {2 1}}} +test regexp-26.14m {anchored regexp with -line option (multi-byte utf capability)} { + list \ + [regexp -all -inline -line -- {^\w*} "\xe4\n"] \ + [regexp -all -inline -line -indices -- {^\w*} "\xe4\n"] +} [list [list \xe4 {}] {{0 0} {2 1}}] test regexp-26.15.1 {anchored regexp without -line option} { list \ [regexp -all -inline -- {^a*} "a\n"] \ From 28988837c701bfbe51c1ed4a224105142ab09b8e Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 30 Jun 2021 17:39:20 +0200 Subject: [PATCH 17/27] match byte-array as string (safe against shimmer, compatible to classic nfa regexp, probably possible if RE gets recompiled on demand without PCRE_UTF8, but there are '\w' which could then confuse some byte sequences with chars, so to avoid regression let parse it as utf-8 now); added regression test cases regexp-27.* --- generic/tclRegexp.c | 16 +++++++++++++++- tests/regexp.test | 27 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 57d238abe5f0..9f0b50e8f261 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -1700,7 +1700,7 @@ TclRegexpPCRE( { #ifdef HAVE_PCRE int i, match, eflags, pcrecflags = 0, stringLength, matchelems, *offsets, - offsetDiff, offsetC = offset, numMatches = 0, utfstr; + offsetDiff, offsetC = offset, numMatches = 0 /*, utfstr*/; Tcl_Obj *objPtr, *resultPtr = NULL; const char *matchstr; pcre *re; @@ -1713,7 +1713,18 @@ TclRegexpPCRE( } mb[2] = {{0, 0}, {0, 0}}; objPtr = objv[1]; + /* + * Match byte-array as string (safe against shimmer, probably possible if RE + * gets recompiled on demand without PCRE_UTF8, but there are '\w' or `[[:alpha:]]` + * which could then confuse some byte sequences with chars, so to avoid regression + * let parse it as utf-8). + * TODO: implement -binary option to scan byte-array as byte-array + */ +#define utfstr 1 +#if 0 + /* not implemented for byte-array */ utfstr = (objPtr->typePtr != &tclByteArrayType); +#endif /* * Get match string and translate offset into correct placement for utf-8 chars. */ @@ -2039,6 +2050,9 @@ TclRegexpPCRE( Tcl_SetObjResult(interp, Tcl_NewIntObj(numMatches)); } return TCL_OK; + +#undef utfstr + #else /* !HAVE_PCRE */ Tcl_AppendResult(interp, "PCRE not available", NULL); return TCL_ERROR; diff --git a/tests/regexp.test b/tests/regexp.test index 666981805061..587490cf86b8 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -1111,6 +1111,33 @@ test regexp-26.17 {anchored regexp without -line option} { [regexp -all -inline -indices -- {^a*} "b\n"] } {{{}} {{0 -1}}} +test regexp-27.1 {search in byte-array} { + list [regexp -start 2 -inline .{2} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] \ + [regexp -start 2 -inline -indices .{2} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] \ + [regexp -all -start 2 -inline {\xd1\x82} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] \ + [regexp -all -start 2 -inline -indices {\xd1\x82} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] \ + [regexp -all -start 0 -inline {\xd1\x82} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] \ + [regexp -all -start 0 -inline -indices {\xd1\x82} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] +} [list [string range [encoding convertto utf-8 \u0442\u0435\u0441\u0442] 2 3] \ + {{2 3}} \ + [string range [encoding convertto utf-8 \u0442\u0435\u0441\u0442] 6 7] \ + {{6 7}} \ + [lrepeat 2 [string range [encoding convertto utf-8 \u0442\u0435\u0441\u0442] 6 7]] \ + {{0 1} {6 7}} \ +] +test regexp-27.2 {search in byte-array} { + list [regexp -all -inline {\w} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] \ + [regexp -all -inline -indices {\w} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] \ + [regexp -all -start 3 -inline {\w} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] \ + [regexp -all -start 3 -inline -indices {\w} [encoding convertto utf-8 \u0442\u0435\u0441\u0442]] +} [list [list \xd1 \xd0 \xb5 \xd1 \xd1] \ + {{0 0} {2 2} {3 3} {4 4} {6 6}} \ + [list \xb5 \xd1 \xd1] \ + {{3 3} {4 4} {6 6}} \ +] + + + # cleanup ::tcltest::cleanupTests return From 7ef01ba313178eee1d458f3ec56731aa24023a1c Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 30 Jun 2021 20:35:07 +0200 Subject: [PATCH 18/27] code review; new flags for Tcl_RegExpExecObj and TclRegexp*, in TclRegexp* parameter flags replaced all, inline, indices; additional tests for regsub over multi-byte string (check correct initial offsets) --- generic/tcl.h | 11 ++-- generic/tclCmdMZ.c | 43 ++++++++------- generic/tclInt.h | 4 +- generic/tclRegexp.c | 128 +++++++++++++++++++++++--------------------- tests/regexp.test | 3 ++ 5 files changed, 100 insertions(+), 89 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 266c6d856343..18775e390bb2 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -572,11 +572,16 @@ typedef void (Tcl_ThreadCreateProc) _ANSI_ARGS_((ClientData clientData)); #define TCL_REG_COMPILE_UNSHIFT(v) ((v&~0x07)|((v&0x07)<<28)|TCL_REG_ADVANCED) /* - * Flags values passed to Tcl_RegExpExecObj. + * Flags values passed to Tcl_RegExpExecObj and TclRegexp*. */ -#define TCL_REG_NOTBOL 0001 /* Beginning of string does not match ^. */ -#define TCL_REG_NOTEOL 0002 /* End of string does not match $. */ +#define TCL_REG_NOTBOL 0x00000001 /* Beginning of string does not match ^. */ +#define TCL_REG_NOTEOL 0x00000002 /* End of string does not match $. */ +#define TCL_REG_RETALL 0x00000010 /* Return all occurences (repeat as long as matches). */ +#define TCL_REG_RETIDX 0x00000020 /* Return indices of matches (instead of strings). */ +#define TCL_REG_DOINLINE 0x00000040 /* Return matches as a list (instead of placing in variables). */ +#define TCL_REG_BYTEOFFS 0x01000000 /* Consider offsets in bytes instead of in chars (PCRE only) */ + /* * Structures filled in by Tcl_RegExpInfo. Note that all offset values are diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 417d66934d09..6a4852c510b5 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -87,7 +87,7 @@ Tcl_RegexpObjCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - int i, indices, about, offset, all, doinline; + int i, about, offset, flags = 0; int cflags, re_type; Tcl_Obj *startIndex = NULL; Tcl_RegExp regExpr; @@ -108,13 +108,10 @@ Tcl_RegexpObjCmd( RETYPE_CLASSIC, RETYPE_DFA, RETYPE_PCRE }; - indices = 0; about = 0; re_type = 0; cflags = TCL_REG_ADVANCED; offset = 0; - all = 0; - doinline = 0; for (i = 1; i < objc; i++) { char *name; @@ -130,13 +127,13 @@ Tcl_RegexpObjCmd( } switch ((enum options) index) { case REGEXP_ALL: - all = 1; + flags |= TCL_REG_RETALL; break; case REGEXP_INDICES: - indices = 1; + flags |= TCL_REG_RETIDX; break; case REGEXP_INLINE: - doinline = 1; + flags |= TCL_REG_DOINLINE; break; case REGEXP_NOCASE: cflags |= TCL_REG_NOCASE; @@ -212,7 +209,7 @@ Tcl_RegexpObjCmd( * no-no. */ - if (doinline && ((objc - 2) != 0)) { + if ((flags & TCL_REG_DOINLINE) && ((objc - 2) != 0)) { Tcl_AppendResult(interp, "regexp match variables not allowed" " when using -inline", NULL); optionError: @@ -268,7 +265,7 @@ Tcl_RegexpObjCmd( } return TclRegexpClassic(interp, objc, objv, regExpr, - all, indices, doinline, offset); + flags, offset); } else { if (about) { /* XXX: implement PCRE about */ @@ -276,7 +273,7 @@ Tcl_RegexpObjCmd( } return TclRegexpPCRE(interp, objc, objv, regExpr, - all, indices, doinline, offset); + flags, offset); } } @@ -696,18 +693,20 @@ Tcl_RegsubObjCmd( char ch, *wsrc, *wfirstChar, *cstring, *wsubspec, *wend; - if (objPtr->typePtr == &tclByteArrayType) { - cstring = Tcl_GetByteArrayFromObj(objPtr, &wlen); - } else { - /* XXX validate offset by char length */ - cstring = Tcl_GetStringFromObj(objPtr, &wlen); - } - if (subPtr->typePtr == &tclByteArrayType) { - wsubspec = Tcl_GetByteArrayFromObj(subPtr, &wsublen); - } else { - /* XXX validate offset by char length */ - wsubspec = Tcl_GetStringFromObj(subPtr, &wsublen); + cstring = Tcl_GetStringFromObj(objPtr, &wlen); + /* OFFS_CHAR2BYTE: convert offset in chars to offset in bytes, + * further offsets are byte offset (see TCL_REG_BYTEOFFS). */ + if (offset > 0) { + Tcl_UniChar ch; + const char *src = cstring, *srcend = cstring + wlen; + + /* Tcl_UtfAtIndex considering string length */ + while (offset-- > 0 && src < srcend) { + src += TclUtfToUniChar(src, &ch); + } + offset = src - cstring; } + wsubspec = Tcl_GetStringFromObj(subPtr, &wsublen); /* * The following loop is to handle multiple matches within the same source @@ -730,7 +729,7 @@ Tcl_RegsubObjCmd( match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset, 10 /* matches */, ((offset > 0 && (cstring[offset-1] != '\n')) - ? TCL_REG_NOTBOL : 0)); + ? TCL_REG_NOTBOL : 0) | TCL_REG_BYTEOFFS); if (match < 0) { result = TCL_ERROR; diff --git a/generic/tclInt.h b/generic/tclInt.h index c2fa1b7d35de..074d0f9a1154 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3369,10 +3369,10 @@ MODULE_SCOPE int TclObjCallVarTraces(Interp *iPtr, Var *arrayPtr, MODULE_SCOPE int TclRegexpClassic(Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[], Tcl_RegExp regExpr, - int all, int indices, int doinline, int offset); + int flags, int offset); MODULE_SCOPE int TclRegexpPCRE(Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[], Tcl_RegExp regExpr, - int all, int indices, int doinline, int offset); + int flags, int offset); /* * So tclObj.c and tclDictObj.c can share these implementations. diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 9f0b50e8f261..b14a209e0aeb 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -491,15 +491,36 @@ Tcl_RegExpExecObj( AllocCaptStorage(regexpPtr); offsets = reStorage->offsets; - if (textObj->typePtr == &tclByteArrayType) { - matchstr = (const char*)Tcl_GetByteArrayFromObj(textObj, &length); - } else { +#define utfstr 1 +#if 0 + /* not implemented for byte-array */ + utfstr = (textObj->typePtr != &tclByteArrayType); +#endif + if (utfstr) { matchstr = Tcl_GetStringFromObj(textObj, &length); + /* OFFS_CHAR2BYTE: convert offset in chars to offset in bytes */ + if (!(flags & TCL_REG_BYTEOFFS) && offset > 0) { + Tcl_UniChar ch; + const char *src = matchstr, *srcend = matchstr + length; + + /* Tcl_UtfAtIndex considering string length */ + while (offset-- > 0 && src < srcend) { + src += TclUtfToUniChar(src, &ch); + } + if (offset <= 0) { + offset = src - matchstr; + } else { + offset = length+1; /* outside of string (and > 0 for empty string) */ + } + } + } else { + matchstr = (const char*)Tcl_GetByteArrayFromObj(textObj, &length); } if (offset > length) { offset = length; } +#undef utfstr regexpPtr->details.rm_extend.rm_so = offset; @@ -570,6 +591,14 @@ Tcl_RegExpExecObj( offsets[i] -= offset; } } + /* TODO: OFFS_BYTE2CHAR not yet implemented for Tcl_RegExpExecObj* / + / * OFFS_BYTE2CHAR: convert offset in bytes to offset in chars * / + if (!(flags & TCL_REG_BYTEOFFS)) { + ... + see for example OFFS_BYTE2CHAR block in TclRegexpPCRE + ... + } + */ return 1; #else @@ -1463,12 +1492,10 @@ TclRegexpClassic( int objc, /* Number of arguments. */ Tcl_Obj *CONST objv[], /* Argument objects. */ Tcl_RegExp regExpr, - int all, - int indices, - int doinline, + int flags, int offset) { - int i, match, numMatchesSaved; + int i, match, numMatches = 0, numMatchesSaved; int eflags, stringLength, matchLength; Tcl_Obj *objPtr, *resultPtr = NULL; Tcl_RegExpInfo info; @@ -1480,7 +1507,7 @@ TclRegexpClassic( objc -= 2; objv += 2; - if (doinline) { + if (flags & TCL_REG_DOINLINE) { /* * Save all the subexpressions, as we will return them as a list */ @@ -1493,7 +1520,7 @@ TclRegexpClassic( * where to move the offset. */ - numMatchesSaved = (objc == 0) ? all : objc; + numMatchesSaved = (objc == 0) ? (flags & TCL_REG_RETALL) : objc; } /* @@ -1529,23 +1556,6 @@ TclRegexpClassic( } if (match == 0) { - /* - * We want to set the value of the intepreter result only when - * this is the first time through the loop. - */ - - if (all <= 1) { - /* - * If inlining, the interpreter's object result remains an - * empty list, otherwise set it to an integer object w/ value - * 0. - */ - - if (!doinline) { - Tcl_SetObjResult(interp, Tcl_NewIntObj(0)); - } - return TCL_OK; - } break; } @@ -1555,21 +1565,21 @@ TclRegexpClassic( */ Tcl_RegExpGetInfo(regExpr, &info); - if (doinline) { + if (flags & TCL_REG_DOINLINE) { /* * It's the number of substitutions, plus one for the matchVar at * index 0 */ objc = info.nsubs + 1; - if (all <= 1) { + if (!resultPtr) { resultPtr = Tcl_NewObj(); } } for (i = 0; i < objc; i++) { Tcl_Obj *newPtr; - if (indices) { + if (flags & TCL_REG_RETIDX) { int start, end; Tcl_Obj *objs[2]; @@ -1608,7 +1618,7 @@ TclRegexpClassic( newPtr = Tcl_NewObj(); } } - if (doinline) { + if (flags & TCL_REG_DOINLINE) { if (Tcl_ListObjAppendElement(interp, resultPtr, newPtr) != TCL_OK) { Tcl_DecrRefCount(newPtr); @@ -1626,13 +1636,14 @@ TclRegexpClassic( } } - if (all == 0) { + numMatches++; + if (!(flags & TCL_REG_RETALL)) { break; } /* * Adjust the offset to the character just after the last one in the - * matchVar and increment all to count how many times we are making a + * matchVar and increment numMatches to count how many times we have * match. We always increment the offset by at least one to prevent * endless looping (as in the case: regexp -all {a*} a). Otherwise, * when we match the NULL string at the end of the input string, we @@ -1651,22 +1662,21 @@ TclRegexpClassic( if (matchLength == 0) { offset++; } - all++; if (offset >= stringLength) { break; } } /* - * Set the interpreter's object result to an integer object with value 1 - * if -all wasn't specified, otherwise it's all-1 (the number of times - * through the while - 1). + * Set the interpreter's object result to an integer object with numMatches + * (the number of times through the while - 1) if -inline wasn't specified, + * otherwise it's a list with matches. */ - if (doinline) { - Tcl_SetObjResult(interp, resultPtr); + if (flags & TCL_REG_DOINLINE) { + Tcl_SetObjResult(interp, resultPtr ? resultPtr : Tcl_NewObj()); } else { - Tcl_SetObjResult(interp, Tcl_NewIntObj(all ? all-1 : 1)); + Tcl_SetObjResult(interp, Tcl_NewIntObj(numMatches)); } return TCL_OK; } @@ -1693,9 +1703,7 @@ TclRegexpPCRE( int objc, /* Number of arguments. */ Tcl_Obj *CONST objv[], /* Argument objects. */ Tcl_RegExp regExpr, - int all, - int indices, - int doinline, + int flags, int offset) { #ifdef HAVE_PCRE @@ -1731,7 +1739,7 @@ TclRegexpPCRE( if (utfstr) { matchstr = Tcl_GetStringFromObj(objPtr, &stringLength); /* OFFS_CHAR2BYTE: convert offset in chars to offset in bytes */ - if (offset > 0) { + if (!(flags & TCL_REG_BYTEOFFS) && offset > 0) { Tcl_UniChar ch; const char *src = matchstr, *srcend = matchstr + stringLength; @@ -1773,7 +1781,7 @@ TclRegexpPCRE( study = regexpPtr->study; matchelems = VectorCoountPCRE(regexpPtr); eflags = PCRE_NO_UTF8_CHECK; - if (all) { + if ((flags & TCL_REG_RETALL)) { pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &pcrecflags); } while (1) { @@ -1798,7 +1806,7 @@ TclRegexpPCRE( /* safe offset to correct indices if empty match found */ offsetDiff = offsetC; offset = stringLength; /* offset after last char */ - if (all && numMatches && offset) { + if ((flags & TCL_REG_RETALL) && numMatches && offset) { /* if (utfstr) { bol = *(Tcl_UtfPrev(matchstr + offset, matchstr)) == '\n'; @@ -1818,7 +1826,7 @@ TclRegexpPCRE( eflags |= PCRE_ANCHORED; } } - all = 0; /* don't repeat */ + flags &= ~TCL_REG_RETALL; /* don't repeat */ } } @@ -1867,7 +1875,7 @@ TclRegexpPCRE( * Option pcrecflags & PCRE_ANCHORED is not set in multiline mode (resp. `(?m)`), * in this case no match means - we will find nothing at all, so don't repeat. */ - if (!all || !numMatches || !stringLength || (pcrecflags & PCRE_ANCHORED)) { + if (!(flags & TCL_REG_RETALL) || !numMatches || !stringLength || (pcrecflags & PCRE_ANCHORED)) { break; } /* If we tried unshifted search - repeat from next offset */ @@ -1883,7 +1891,7 @@ TclRegexpPCRE( } /* offset to end of string */ offset = stringLength; - if (utfstr && indices) { + if (utfstr && (flags & TCL_REG_RETIDX)) { offsetC = Tcl_NumUtfChars(matchstr, stringLength); } else { offsetC = offset; @@ -1898,7 +1906,7 @@ TclRegexpPCRE( * information in those variables. */ - if (doinline) { + if (flags & TCL_REG_DOINLINE) { /* * It's the number of substitutions, plus one for the matchVar at * index 0 @@ -1922,7 +1930,7 @@ TclRegexpPCRE( start = offsets[i*2]; end = offsets[i*2 + 1]; /* OFFS_BYTE2CHAR: convert offset in bytes to offset in chars */ - if (indices) { + if (!(flags & TCL_REG_BYTEOFFS) && (flags & TCL_REG_RETIDX)) { if (!offsetDiff) { if (start >= 0) { int bstart = start, bend = end; @@ -1975,7 +1983,7 @@ TclRegexpPCRE( start = -1; end = 0; } - if (indices) { + if (flags & TCL_REG_RETIDX) { Tcl_Obj *objs[2]; objs[0] = Tcl_NewLongObj(start); @@ -1993,7 +2001,7 @@ TclRegexpPCRE( newPtr = Tcl_NewObj(); } } - if (doinline) { + if (flags & TCL_REG_DOINLINE) { if (Tcl_ListObjAppendElement(interp, resultPtr, newPtr) != TCL_OK) { Tcl_DecrRefCount(newPtr); @@ -2012,7 +2020,7 @@ TclRegexpPCRE( } numMatches++; - if (!all) { + if (!(flags & TCL_REG_RETALL)) { break; } @@ -2035,17 +2043,13 @@ TclRegexpPCRE( } /* - * Set the interpreter's object result to an integer object with value 1 - * if -all wasn't specified, otherwise it's all-1 (the number of times - * through the while - 1). + * Set the interpreter's object result to an integer object with numMatches + * (the number of times through the while - 1) if -inline wasn't specified, + * otherwise it's a list with matches. */ - if (doinline) { - if (resultPtr) { - Tcl_SetObjResult(interp, resultPtr); - } else { - Tcl_ResetResult(interp); - } + if (flags & TCL_REG_DOINLINE) { + Tcl_SetObjResult(interp, resultPtr ? resultPtr : Tcl_NewObj()); } else { Tcl_SetObjResult(interp, Tcl_NewIntObj(numMatches)); } diff --git a/tests/regexp.test b/tests/regexp.test index 587490cf86b8..1b77f9795f21 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -334,6 +334,9 @@ test regexp-7.18m {regsub multi-byte utf capability} { test regexp-7.18m2 {regsub multi-byte utf capability} { regsub -all -start 2 {\w{2}(?=\w)} a\xe4\xc4\xe4\xdc\xfcz {\0-} } "a\xe4\xc4\xe4-\xdc\xfc-z" +test regexp-7.18m3 {regsub multi-byte utf capability} { + regsub -all -start 2 {\w{2}} \u043f\u0440\u0438\u0432\u0435\u0442 {=\0-} +} "\u043f\u0440=\u0438\u0432-=\u0435\u0442-" test regexp-8.1 {case conversion in regsub} { list [regsub -nocase a(a+) xaAAaAAay & foo] $foo From ed93f279c6b761fa6a69c1c3b4b708484233fb4e Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 30 Jun 2021 18:44:23 +0200 Subject: [PATCH 19/27] fixes regression (support of \uXXXX escape sequences), compile pcre with PCRE_JAVASCRIPT_COMPAT (unfortunately it introduces another ugly restriction - lone closing square bracket in a pattern causes a compile-time error, but it can be escaped like "\]", so let use it unless \uXXXX can be compiled in tcl directly, e. g. by some pre-processor) --- generic/tclRegexp.c | 7 ++++--- tests/regexp.test | 7 ++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index b14a209e0aeb..72035dcbf618 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -1220,9 +1220,10 @@ CompileRegexp( */ /* XXX Should enable PCRE_UTF8 selectively on non-ByteArray Tcl_Obj - * TODO: parse of bytearray doesn't expect PCRE_UTF8 here */ + * TODO: parse of bytearray doesn't expect PCRE_UTF8 here + * PCRE_JAVASCRIPT_COMPAT - for \uXXXX char sequence support */ pcrecflags = PCRE_UTF8 | PCRE_UCP | PCRE_NO_UTF8_CHECK | - PCRE_DOLLAR_ENDONLY; + PCRE_DOLLAR_ENDONLY | PCRE_JAVASCRIPT_COMPAT; /* for (i = 0, p = cstring; i < length; i++) { if (UCHAR(*p++) > 0x80) { @@ -1752,7 +1753,7 @@ TclRegexpPCRE( mb[1].coffs = mb[0].coffs -= offset+1; mb[1].boffs = mb[0].boffs = src - matchstr; if (offset <= 0) { - offset = mb[0].boffs; + offset = mb[0].boffs; } else { offset = stringLength+1; /* outside of string (and > 0 for empty string) */ } diff --git a/tests/regexp.test b/tests/regexp.test index 1b77f9795f21..9d0ae76b8a2e 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -1138,7 +1138,12 @@ test regexp-27.2 {search in byte-array} { [list \xb5 \xd1 \xd1] \ {{3 3} {4 4} {6 6}} \ ] - +test regexp-27.3 {regression test, support of \uXXXX escape sequences} { + list [regexp -all -inline {\u0442} \u0442\u0435\u0441\u0442] \ + [regexp -all -indices -inline {\u0442} \u0442\u0435\u0441\u0442] \ + [regexp -inline {\u0442..\u0442} \u0442\u0435\u0441\u0442] \ + [regexp -indices -inline {\u0442..\u0442} \u0442\u0435\u0441\u0442] +} [list [list \u0442 \u0442] {{0 0} {3 3}} \u0442\u0435\u0441\u0442 {{0 3}}] # cleanup From 51d743e826b49ec2cd418bd96069a11ef79c11c9 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 6 Jul 2021 16:12:00 +0200 Subject: [PATCH 20/27] optimizes byte 2 char offset (rewritten as function, enlarged mapping array of known offsets with middle indices) --- generic/tclRegexp.c | 130 ++++++++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 52 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 72035dcbf618..8d851fb2407a 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -1682,6 +1682,61 @@ TclRegexpClassic( return TCL_OK; } +/* + * Helper to map byte offset to char offset, using array of known offsets + * mb2c, which gets stored typically: + * first start [0], last known middle start [1] & end [2], first end [3] + */ +int +BOffs2COffs( + int *mb2c, /* Mapping array of offsets ([0..3] - byte offsets, + * [4..7] - char offsets). */ + const char *src, /* String source in utf-8. */ + int offs) /* Byte offset to be mapped to char offset; caller must + * check it is positive. */ +{ + int i, o = offs, coffs = 0; + + /* Scan for known offsets starting from largest one */ + for (i = 3; i >= 0 && mb2c[i]; i--) { + if (o >= mb2c[i]) { + o -= mb2c[i]; + src += mb2c[i]; + coffs = mb2c[i+4]; + break; + } + } + /* Scan for chars in source starting from found known offset or 0 */ + if (o) { + const char *srcend; + Tcl_UniChar ch = 0; + + srcend = src + o; + do { + coffs++; + src += TclUtfToUniChar(src, &ch); + } while (src < srcend); + /* + * New offset known that is larger that previously known one, store it + * and shift indices (1st and 2nd only), because 0th and 3rd are reserved + * for first start and end indices (helps by search of nested indices). + */ + if (i > 1) { + if (i > 2 && mb2c[i-1]) { + mb2c[i-2] = mb2c[i-1]; + mb2c[i+4-2] = mb2c[i+4-1]; + } + if (mb2c[i]) { + mb2c[i-1] = mb2c[i]; + mb2c[i+4-1] = mb2c[i+4]; + } + mb2c[i] = offs; + mb2c[i+4] = coffs; + } + } + return coffs; +} + /* *---------------------------------------------------------------------- * @@ -1716,10 +1771,7 @@ TclRegexpPCRE( pcre_extra *study; TclRegexp *regexpPtr = (TclRegexp *) regExpr; TclRegexpStorage *reStorage = regexpPtr->reStorage; - struct { - int coffs; /* last known offset in bytes */ - int boffs; /* last known offset in chars */ - } mb[2] = {{0, 0}, {0, 0}}; + int mb2c[4*2] = {/* byte offs */ 0, 0, 0, 0, /* char offs */ 0, 0, 0, 0}; objPtr = objv[1]; /* @@ -1745,15 +1797,15 @@ TclRegexpPCRE( const char *src = matchstr, *srcend = matchstr + stringLength; - mb[0].coffs = offset; + mb2c[0+4] = offset; /* Tcl_UtfAtIndex considering string length */ while (offset-- > 0 && src < srcend) { src += TclUtfToUniChar(src, &ch); } - mb[1].coffs = mb[0].coffs -= offset+1; - mb[1].boffs = mb[0].boffs = src - matchstr; + mb2c[3+4] = (mb2c[0+4] -= offset+1); /* known char offset */ + mb2c[3] = mb2c[0] = src - matchstr; /* for this byte offset */ if (offset <= 0) { - offset = mb[0].boffs; + offset = mb2c[0]; } else { offset = stringLength+1; /* outside of string (and > 0 for empty string) */ } @@ -1761,7 +1813,8 @@ TclRegexpPCRE( } else { matchstr = (const char *)Tcl_GetByteArrayFromObj(objPtr, &stringLength); if (offset > 0) { - mb[1].coffs = mb[0].coffs = mb[1].boffs = mb[0].boffs = offset; + mb2c[3+4] = mb2c[0+4] = + mb2c[3] = mb2c[0] = offset; } } @@ -1933,47 +1986,20 @@ TclRegexpPCRE( /* OFFS_BYTE2CHAR: convert offset in bytes to offset in chars */ if (!(flags & TCL_REG_BYTEOFFS) && (flags & TCL_REG_RETIDX)) { if (!offsetDiff) { - if (start >= 0) { - int bstart = start, bend = end; - const char *src, *srcend; - Tcl_UniChar ch; - - if (bstart >= mb[1].boffs) { - bstart -= mb[1].boffs; - bend -= mb[1].boffs; - src = matchstr + mb[1].boffs; - start = mb[1].coffs; - } else if (bstart >= mb[0].boffs) { - bstart -= mb[0].boffs; - bend -= mb[0].boffs; - src = matchstr + mb[0].boffs; - start = mb[0].coffs; - } else { - /* todo: check this obscure case is possible at all, - * e. g. by unshifted search */ - src = matchstr; - start = 0; - } - srcend = src + bstart; - while (src < srcend) { - start++; - src += TclUtfToUniChar(src, &ch); - } - end = start; - if (bend > bstart) { - bend -= bstart; - srcend = src + bend; - while (src < srcend) { - end++; - src += TclUtfToUniChar(src, &ch); - } - } - if (i == 0) { - mb[0].boffs = offsets[0]; - mb[0].coffs = start; - mb[1].boffs = offsets[1]; - mb[1].coffs = end; - } + if (start > 0) { + start = BOffs2COffs(mb2c, matchstr, start); + } + if (end > 0) { + end = BOffs2COffs(mb2c, matchstr, end); + } + /* first is whole match, so store its indices as 0th and 3rd */ + if (i == 0) { + /* mostly smallest */ + mb2c[0] = offsets[0]; + mb2c[0+4] = start; + /* mostly largest */ + mb2c[3] = offsets[1]; + mb2c[3+4] = end; } } else { /* if out of range we've always empty match [offs, offs-1] */ @@ -2035,10 +2061,10 @@ TclRegexpPCRE( if (offsets[1] > offsets[0]) { offset = offsets[1]; - offsetC = mb[1].coffs; /* only used by indices as offsetDiff */ + offsetC = mb2c[3+4]; /* only used by indices as offsetDiff */ } else { offset = offsets[0]; - offsetC = mb[0].coffs; /* only used by indices as offsetDiff */ + offsetC = mb2c[0+4]; /* only used by indices as offsetDiff */ eflags |= (PCRE_NOTEMPTY_ATSTART|PCRE_ANCHORED); } } From be70193dc3a646943698e3ee1f4752cdab932186 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 12 Oct 2021 15:05:38 +0200 Subject: [PATCH 21/27] try to improve pcre regexp (re)allocate storage (offsets/matches) at compile time --- generic/tclRegexp.c | 40 ++++++++++++++++++++++------------------ generic/tclRegexp.h | 1 + 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 8d851fb2407a..cb0aba5cc14a 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -316,8 +316,6 @@ RegExpExecUniChar( nm = (size_t) nmatches; } - AllocCaptStorage(regexpPtr); - status = TclReExec(®expPtr->re, wString, (size_t) numChars, ®expPtr->details, nm, reStorage->matches, flags); @@ -486,9 +484,9 @@ Tcl_RegExpExecObj( if (reflags & TCL_REG_PCRE) { #ifdef HAVE_PCRE const char *matchstr; - int match, eflags, *offsets, nm = VectorCoountPCRE(regexpPtr); + int match, eflags, *offsets, nm; - AllocCaptStorage(regexpPtr); + nm = reStorage->offsCnt; offsets = reStorage->offsets; #define utfstr 1 @@ -557,7 +555,8 @@ Tcl_RegExpExecObj( /* insufficient capture space - enlarge vectors buffer */ regexpPtr->re.re_nsub = (regexpPtr->re.re_nsub+1)*2; AllocCaptStorage(regexpPtr); - nm = VectorCoountPCRE(regexpPtr); + nm = reStorage->offsCnt; + offsets = reStorage->offsets; } while(1); } @@ -1046,7 +1045,8 @@ AllocCaptStorage(TclRegexp *regexpPtr) * if sizes of regoffs_t and regmatch_t are equal. */ veccnt = VectorCoountPCRE(regexpPtr); - if (!reStorage->offsets || reStorage->offsSize < sizeof(int) * veccnt) { + if (reStorage->offsCnt < veccnt) { + reStorage->offsCnt = veccnt; reStorage->offsSize = sizeof(int) * veccnt; /* if initial call (first call) */ if (!reStorage->offsets) { @@ -1374,6 +1374,9 @@ CompileRegexp( tsdPtr->patLengths[0] = length; tsdPtr->regexps[0] = regexpPtr; + /* Ensure we have offsets/matches of expected size once at compile time */ + AllocCaptStorage(regexpPtr); + return regexpPtr; } @@ -1763,7 +1766,7 @@ TclRegexpPCRE( int offset) { #ifdef HAVE_PCRE - int i, match, eflags, pcrecflags = 0, stringLength, matchelems, *offsets, + int i, match, eflags, pcrecflags = 0, stringLength, *offsets, matchelems, offsetDiff, offsetC = offset, numMatches = 0 /*, utfstr*/; Tcl_Obj *objPtr, *resultPtr = NULL; const char *matchstr; @@ -1818,26 +1821,26 @@ TclRegexpPCRE( } } - AllocCaptStorage(regexpPtr); - offsets = reStorage->offsets; - objc -= 2; objv += 2; - /* - * The following loop is to handle multiple matches within the same source - * string; each iteration handles one match. If "-all" hasn't been - * specified then the loop body only gets executed once. We terminate the - * loop when the starting offset is past the end of the string. - */ + matchelems = reStorage->offsCnt; + offsets = reStorage->offsets; re = regexpPtr->pcre; study = regexpPtr->study; - matchelems = VectorCoountPCRE(regexpPtr); eflags = PCRE_NO_UTF8_CHECK; if ((flags & TCL_REG_RETALL)) { pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &pcrecflags); } + + /* + * The following loop is to handle multiple matches within the same source + * string; each iteration handles one match. If "-all" hasn't been + * specified then the loop body only gets executed once. We terminate the + * loop when the starting offset is past the end of the string. + */ + while (1) { offsetDiff = 0; @@ -1900,7 +1903,8 @@ TclRegexpPCRE( /* insufficient capture space - enlarge vectors buffer */ regexpPtr->re.re_nsub = (regexpPtr->re.re_nsub+1)*2; AllocCaptStorage(regexpPtr); - matchelems = VectorCoountPCRE(regexpPtr); + matchelems = reStorage->offsCnt; + offsets = reStorage->offsets; } while(1); } diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h index 39ac15ec4ae6..b43038b98a06 100644 --- a/generic/tclRegexp.h +++ b/generic/tclRegexp.h @@ -31,6 +31,7 @@ typedef struct { #ifdef HAVE_PCRE int *offsets; /* Storage for array of offsets (indices to handle within PCRE) */ + int offsCnt; size_t offsSize; int *wrkSpace; /* Workspace storage vector (used by parsing via DFA). */ int wrkSpCnt; /* Current length of shared workspace storage vector */ From be2dc67365c4af0e8079a04ebf285dbb7067e976 Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 13 Oct 2021 15:03:14 +0200 Subject: [PATCH 22/27] don't capture groups if not needed (no variables or -inline arguments specified) --- generic/tclRegexp.c | 25 +++++++++++++++++++------ tests/regexp.test | 8 +++++++- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index cb0aba5cc14a..f7e85b156e72 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -1824,8 +1824,18 @@ TclRegexpPCRE( objc -= 2; objv += 2; - matchelems = reStorage->offsCnt; offsets = reStorage->offsets; + matchelems = 0; + /* need captured groups or -inline */ + if (objc || (flags & TCL_REG_DOINLINE)) { + matchelems = reStorage->offsCnt; + } else { + /* -all expects at least the first index in order to scroll over string */ + if (flags & TCL_REG_RETALL) { + matchelems = 3; + } + eflags |= PCRE_NO_AUTO_CAPTURE; + } re = regexpPtr->pcre; study = regexpPtr->study; @@ -1899,12 +1909,12 @@ TclRegexpPCRE( EnlargeWrkSpaceStorage(regexpPtr); continue; } - if (match) break; + if (match || !(objc || (flags & TCL_REG_DOINLINE))) break; /* insufficient capture space - enlarge vectors buffer */ regexpPtr->re.re_nsub = (regexpPtr->re.re_nsub+1)*2; AllocCaptStorage(regexpPtr); - matchelems = reStorage->offsCnt; offsets = reStorage->offsets; + matchelems = reStorage->offsCnt; } while(1); } @@ -1920,10 +1930,13 @@ TclRegexpPCRE( } } - if (match == 0) { - Tcl_AppendResult(interp, + if (!match) { + if ((objc || (flags & TCL_REG_DOINLINE))) { + Tcl_AppendResult(interp, "pcre_exec had insufficient capture space", NULL); - return TCL_ERROR; + return TCL_ERROR; + } + match = 1; } if (match == PCRE_ERROR_NOMATCH) { diff --git a/tests/regexp.test b/tests/regexp.test index f1db8f41bbc0..f8bbe4dfbd44 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -636,7 +636,13 @@ test regexp-17.6 {regexp -inline no matches} { test regexp-17.7 {regexp -inline, no matchvars allowed} { list [catch {regexp -inline b abc match} msg] $msg } {1 {regexp match variables not allowed when using -inline}} - +test regexp-17.8 {regexp no captures, match only (multiple groups in RE)} { + list \ + [regexp {^(a)(b)} " ab ab "] \ + [regexp -all {^(a)(b)} " ab ab "] \ + [regexp {(a)(b)} " ab ab "] \ + [regexp -all {(a)(b)} " ab ab "] +} {0 0 1 2} test regexp-18.1 {regexp -all} { regexp -all b bbbbb } {5} From e516db66ae365ac953a820efc7bd1bb84758b652 Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 13 Oct 2021 15:31:06 +0200 Subject: [PATCH 23/27] regexp2.test - allow to include certain tests from regexp.test (filtered if captured groups present or -line boundary matching used) --- tests/regexp2.test | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/tests/regexp2.test b/tests/regexp2.test index 2091d85c19d8..5f2a153741d4 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -42,9 +42,28 @@ set err [catch { testConstraint reeng_$reeng [_test_reeng_available] # wrapper for tests from regexp.test (add suffix with re-engine type): - proc test {args} { + proc test {name description args} { variable reeng - uplevel [list __test [lindex $args 0]-$reeng {*}[lrange $args 1 end]] + if {$reeng eq "dfa" && [string match "regexp-*" $name]} { + # filter capturing group tests for dfa, + # because no group-captures at all supported (thus incompatible) + if {[string match -* [lindex $args 0]]} { + set body [lindex $args [lsearch -exact $args "-body"]+1] + } else { + if {[llength $args] == 2} { + set body [lindex $args 0] + } else { + set body [lindex $args 1] + } + } + # bypass if contains capturing groups or -line boundary matching (dfa seems not work with PCRE_MULTILINE): + if {[regexp -type c {\((?!\?:).*\)} $body] + || [string match "* -line*" $body] + } { + return + } + } + uplevel [list __test $name-$reeng "\[$reeng\]: $description" {*}$args] } # --------------------------------------------------------------------------- @@ -85,10 +104,7 @@ test regexp2-1.1.1 {alternative patterns: longest match} {classic} { test regexp2-1.2 {alternative patterns: longest match (end anchored)} { regexp -inline {(?:a|ab|abc)$} -abc } {abc} -test regexp2-1.2.1 {alternative patterns: longest match (word boundary)} {pcre} { - regexp -inline {(?:a|ab|abc)\b} -abc- -} {abc} -test regexp2-1.2.2 {alternative patterns: longest match (word boundary)} {dfa} { +test regexp2-1.2.1 {alternative patterns: longest match (word boundary)} {pcre|dfa} { regexp -inline {(?:a|ab|abc)\b} -abc- } {abc} test regexp2-1.2.2 {alternative patterns: longest match (word boundary)} {classic} { @@ -97,14 +113,11 @@ test regexp2-1.2.2 {alternative patterns: longest match (word boundary)} {classi test regexp2-1.3 {alternative patterns: longest match (start anchored)} {longest-match} { regexp {^(?:a|ab|abc)} abc-; set v } {abc} -test regexp2-1.3.1 {alternative patterns: longest match (start anchored + boundary)} {pcre} { +test regexp2-1.3.1 {alternative patterns: longest match (start anchored + boundary)} {pcre|dfa} { regexp -inline {^(?:a|bc|ab|abc)\b} abc- } {abc} -test regexp2-1.3.2 {alternative patterns: longest match only (start anchored + boundary)} {dfa} { - regexp -inline {^(?:a|bc|ab|abc)\b} abc- -} {abc} -test regexp2-1.3.3 {alternative patterns: longest match (start anchored)} {classic} { - regexp -inline {^(?:a|bc|ab|abc)} abc- +test regexp2-1.3.2 {alternative patterns: longest match (start anchored) + boundary} {classic} { + regexp -inline {^(?:a|bc|ab|abc)\M} abc- } {abc} # --------------------------------------------------------------------------- @@ -115,11 +128,8 @@ test regexp2-1.3.3 {alternative patterns: longest match (start anchored)} {class if {![testConstraint reeng_$reeng]} { continue } - # evaluate the tests from regexp.test, - # don't test "dfa" here, because no group-captures at all supported (thus incompatible) - if {$reeng ne "dfa"} { - source -encoding utf-8 [file join [file dirname [info script]] regexp.test] - } + # evaluate the tests from regexp.test + source -encoding utf-8 [file join [file dirname [info script]] regexp.test] }; #end of engine cycle From 9daff73b775495f501b9865ee0359c82d4e4a6c7 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 12 Oct 2021 21:24:37 +0200 Subject: [PATCH 24/27] =?UTF-8?q?use=20JIT-compile=20if=20possible;=20stra?= =?UTF-8?q?nge=20is=20that=20some=20REs=20become=20faster=20where=20anothe?= =?UTF-8?q?r=20significantly=20slower:=20=20=20%=20regexp=20-type=20p=20-i?= =?UTF-8?q?nline=20{^([\w]+)://([^/\s=3F#]+)([^\s=3F#]*)(=3F:\=3F([^\s#]*)?= =?UTF-8?q?=3F(=3F:#([^\s]*))=3F)=3F$}=20"http://usr@example.com/uri=3Farg?= =?UTF-8?q?s#id"=20=20=20-=201.308844=20=C2=B5s/#=20764033=20#=20764033=20?= =?UTF-8?q?#/sec=201000.000=20net-ms=20=20=20+=200.918862=20=C2=B5s/#=2010?= =?UTF-8?q?88303=20#=201088303=20#/sec=201000.000=20net-ms=20=20=20%=20pro?= =?UTF-8?q?c=20test=20{s}=20{=20timerate=20{=20regexp=20-type=20p=20{\w+}?= =?UTF-8?q?=20$s=20}=20};=20test=20"=20=20[string=20repeat=20abc=2010000]?= =?UTF-8?q?=20=20"=20=20=20-=20101.726=20=C2=B5s/#=209831=20#=209830.3=20#?= =?UTF-8?q?/sec=201000.070=20net-ms=20=20=20+=20168.865=20=C2=B5s/#=205922?= =?UTF-8?q?=20#=205921.9=20#/sec=201000.017=20net-ms=20=20=20%=20proc=20te?= =?UTF-8?q?st=20{s}=20{=20timerate=20{=20regexp=20-type=20p=20{\w+}=20$s?= =?UTF-8?q?=20}=20};=20test=20"=20=20[string=20repeat=20abc=20100]=20=20"?= =?UTF-8?q?=20=20=20-=201.166015=20=C2=B5s/#=20857622=20#=20857622=20#/sec?= =?UTF-8?q?=201000.000=20net-ms=20=20=20+=201.811757=20=C2=B5s/#=20551951?= =?UTF-8?q?=20#=20551950=20#/sec=201000.001=20net-ms=20so=20may=20be=20it?= =?UTF-8?q?=20must=20be=20optional=20(-jit=20option)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- generic/tclRegexp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index f7e85b156e72..c591b21034e4 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -1270,7 +1270,7 @@ CompileRegexp( return NULL; } - regexpPtr->study = pcre_study(pcre, 0, &errstr); + regexpPtr->study = pcre_study(pcre, PCRE_STUDY_JIT_COMPILE, &errstr); if (errstr != NULL) { pcre_free(pcre); ckfree((char *)regexpPtr); @@ -1404,7 +1404,7 @@ FreeRegexp( if (regexpPtr->flags & TCL_REG_PCRE) { pcre_free(regexpPtr->pcre); if (regexpPtr->study) { - pcre_free(regexpPtr->study); + pcre_free_study(regexpPtr->study); } } else #endif From 6ee172735f45e8287642e36982975f7b5362b89a Mon Sep 17 00:00:00 2001 From: sebres Date: Mon, 18 Oct 2021 10:59:28 +0200 Subject: [PATCH 25/27] small fixes (if compiled without pcre) and code review --- generic/tclRegexp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index c591b21034e4..5e0087bb37cc 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -450,7 +450,9 @@ Tcl_RegExpExecObj( int flags) /* Regular expression execution flags. */ { TclRegexp *regexpPtr = (TclRegexp *) re; +#ifdef HAVE_PCRE TclRegexpStorage *reStorage = regexpPtr->reStorage; +#endif int length; int reflags = regexpPtr->flags; #define TCL_REG_GLOBOK_FLAGS (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE) @@ -682,6 +684,7 @@ Tcl_RegExpGetInfo( regmatch_t *matches = regexpPtr->reStorage->matches; infoPtr->nsubs = regexpPtr->re.re_nsub; + infoPtr->matches = (Tcl_RegExpIndices *)matches; if (regexpPtr->flags & TCL_REG_PCRE) { #ifdef HAVE_PCRE int *offsets = regexpPtr->reStorage->offsets; @@ -692,13 +695,11 @@ Tcl_RegExpGetInfo( matches[i].rm_eo = offsets[i*2+1]; } } - infoPtr->matches = (Tcl_RegExpIndices *)matches; infoPtr->extendStart = -1; /* XXX support? */ #else Tcl_Panic("Cannot get info for PCRE match"); #endif } else { - infoPtr->matches = (Tcl_RegExpIndices *)matches; infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so; } } @@ -1035,10 +1036,11 @@ SetRegexpFromAny( static void AllocCaptStorage(TclRegexp *regexpPtr) { - int veccnt, nsubs = regexpPtr->re.re_nsub; + int nsubs = regexpPtr->re.re_nsub; TclRegexpStorage *reStorage = regexpPtr->reStorage; #ifdef HAVE_PCRE + int veccnt; /* * We use special handling to allocate storages for PCRE offsets/matches, * because on some systems size we can use the same storage for both, @@ -1095,6 +1097,7 @@ AllocCaptStorage(TclRegexp *regexpPtr) (char*)reStorage->matches, reStorage->matchSize); } } + #endif } From 659cde34ed9ea66f84137b655c2b99462d0fe104 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 31 May 2022 16:27:27 +0200 Subject: [PATCH 26/27] tests/interp.test: fixed test "bad type" for interp regexp --- tests/interp.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/interp.test b/tests/interp.test index 6d48e63bbc9a..3723dd6340e4 100644 --- a/tests/interp.test +++ b/tests/interp.test @@ -3511,9 +3511,9 @@ test interp-37.1 {safe interps and min() and max(): Bug 2895741} -setup { test interp-37.11 {interp regexp} { list [catch {interp regexp} msg] $msg } {1 {wrong # args: should be "interp regexp path ?type?"}} -test interp-37.12 {interp regexp} { +test interp-37.12 {interp regexp} -body { list [catch {interp regexp {} invalid} msg] $msg -} {1 {bad type "invalid": must be classic or pcre}} +} -match glob -result {1 {bad type "invalid": *}} test interp-37.13 {interp regexp} { list [catch {interp regexp {} classic bogus} msg] $msg } {1 {wrong # args: should be "interp regexp path ?type?"}} From ac04fd7c208d0d4169d47fc2bbc382d9fc79bfb4 Mon Sep 17 00:00:00 2001 From: sebres Date: Fri, 13 Sep 2024 22:15:25 +0200 Subject: [PATCH 27/27] regexp: amend to pcre-support: forgotten shift flags in emitting of INST_REGEXP by compiling of switch command --- generic/tclCompCmds.c | 14 ++++++++++---- tests/regexp2.test | 2 ++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/generic/tclCompCmds.c b/generic/tclCompCmds.c index 4514e8f9aa38..1d040455b0b5 100644 --- a/generic/tclCompCmds.c +++ b/generic/tclCompCmds.c @@ -3121,7 +3121,8 @@ TclCompileRegexpCmd( /* * Pass correct RE compile flags. We use only Int1 (8-bit), but * that handles all the flags we want to pass. - * Note that TCL_REG_PCRE/TCL_REG_EXPLTYPE will be mapped to TCL_REG_ADVANCED. + * Note that TCL_REG_PCRE/TCL_REG_EXPLTYPE will be mapped to TCL_REG_ADVANCED, + * because INST_REGEXP always uses TCL_REG_ADVANCED flag. * Don't use TCL_REG_NOSUB as we may have backrefs. */ cflags = TCL_REG_COMPILE_SHIFT(cflags); /* int to byte */ @@ -4319,9 +4320,14 @@ TclCompileSwitchCmd( * or capture vars. */ - int cflags = TCL_REG_ADVANCED - | (noCase ? TCL_REG_NOCASE : 0); - + int cflags = (noCase ? TCL_REG_NOCASE : 0); + /* + * Pass correct RE compile flags. We use only Int1 (8-bit), but + * that handles all the flags we want to pass. + * Note that TCL_REG_PCRE/TCL_REG_EXPLTYPE will be mapped to TCL_REG_ADVANCED, + * because INST_REGEXP always uses TCL_REG_ADVANCED flag. + */ + cflags = TCL_REG_COMPILE_SHIFT(cflags); /* int to byte */ TclEmitInstInt1(INST_REGEXP, cflags, envPtr); } break; diff --git a/tests/regexp2.test b/tests/regexp2.test index 5f2a153741d4..740a94efa37e 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -29,11 +29,13 @@ set err [catch { foreach reeng {classic pcre dfa} { proc _test_reeng_available {} { + variable org_reeng variable reeng interp regexp {} $reeng # note we should not use simple regexp here (to avoid compile it via TclReToGlob to the glob expr): if {[catch { regexp -- {^(_)(?!_)$} {_} } errMsg]} { puts "ignore test of engine \"$reeng\": $errMsg" + interp regexp {} $org_reeng return 0 } return 1