diff --git a/ecl/hql/hqlfold.cpp b/ecl/hql/hqlfold.cpp index 7e1dcf9eda0..47935a91692 100644 --- a/ecl/hql/hqlfold.cpp +++ b/ecl/hql/hqlfold.cpp @@ -2796,7 +2796,7 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption StringBuffer pattern, search; v0->getUTF8Value(pattern); v1->getUTF8Value(search); - ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom)); + ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern.length(), pattern, !expr->hasAttribute(noCaseAtom)); compiled->getMatchSet(isAllResult, resultBytes, matchResults.refdata(), search.lengthUtf8(), search.str()); rtlDestroyCompiledU8StrRegExpr(compiled); } diff --git a/rtl/eclrtl/eclregex.cpp b/rtl/eclrtl/eclregex.cpp index c86a4a3e79c..0fd1f54f4a0 100644 --- a/rtl/eclrtl/eclregex.cpp +++ b/rtl/eclrtl/eclregex.cpp @@ -28,8 +28,11 @@ #include "platform.h" #include "eclrtl.hpp" #include "eclrtl_imp.hpp" +#include "jhash.hpp" #include "jlib.hpp" +#include + //--------------------------------------------------------------------------- // PCRE2 8-bit context module variables, used for STRING and UTF-8 support @@ -61,10 +64,11 @@ static void pcre2Free(void * block, void * /*userData*/) /// @param errCode PCRE2 error code /// @param msgPrefix Prefix for error message; can be an empty string; /// include a trailing space if a non-empty message is passed -/// @param regex OPTIONAL; regex pattern that was in play when error occurred +/// @param regex Regex pattern that was in play when error occurred; pass +/// an empty string if no regex was used /// @param errOffset OPTIONAL; offset in regex pattern where error occurred; -/// ignored if regex is null or empty -static void failWithPCRE2Error(int errCode, const char * msgPrefix, const char * regex = nullptr, int errOffset = -1) +/// ignored if regex is empty +static void failWithPCRE2Error(int errCode, const std::string & msgPrefix, const std::string & regex, int errOffset = -1) { const int errBuffSize = 120; char errBuff[errBuffSize]; @@ -80,7 +84,7 @@ static void failWithPCRE2Error(int errCode, const char * msgPrefix, const char * msg += std::to_string(errCode); msg += " (no error message available)"; } - if (regex && regex[0]) + if (!regex.empty()) { msg += " (regex: '"; msg += regex; @@ -98,42 +102,69 @@ static void failWithPCRE2Error(int errCode, const char * msgPrefix, const char * /// @brief Convert a PCRE2 error code to a string and throw an exception /// @param errCode PCRE2 error code /// @param msgPrefix Prefix for error message; can be an empty string; -/// include a trailing space if a non-empty message is passed -/// @param regex OPTIONAL; Unicode regex pattern that was in play when error occurred -/// @param errOffset OPTIONAL; offset in regex pattern where error occurred; +/// include a trailing space if a non-empty regex is passed +/// @param regex Unicode regex pattern that was in play when error occurred +/// @param regexLength The length of regex, in code points +/// @param errOffset Offset in regex pattern where error occurred; /// ignored if regex is null or empty -static void failWithUPCRE2Error(int errCode, const char * msgPrefix, const UChar * regex = nullptr, int errOffset = -1) +static void failWithUPCRE2Error(int errCode, const std::string & msgPrefix, const UChar * regex, int regexLength, int errOffset) { std::string regexPattern; - if (regex) + if (regex && regex[0]) { char * regexStr = nullptr; unsigned regexStrLen; - rtlUnicodeToEscapedStrX(regexStrLen, regexStr, rtlUnicodeStrlen(regex), regex); + rtlUnicodeToEscapedStrX(regexStrLen, regexStr, regexLength, regex); regexPattern = std::string(regexStr, regexStrLen); rtlFree(regexStr); } - failWithPCRE2Error(errCode, msgPrefix, regexPattern.c_str(), errOffset); + failWithPCRE2Error(errCode, msgPrefix, regexPattern, errOffset); } +/// @brief Convert a PCRE2 error code to a string and throw an exception +/// @param errCode PCRE2 error code +/// @param msg Error message; can be an empty string +/// @param regex OPTIONAL; Unicode regex pattern that was in play when error occurred +/// @param regexLength OPTIONAL; the length of regex, in code points +/// @param errOffset OPTIONAL; offset in regex pattern where error occurred; +/// ignored if regex is null or empty +static void failWithUPCRE2Error(int errCode, const std::string & msg) +{ + failWithUPCRE2Error(errCode, msg, nullptr, 0, -1); +} + +//--------------------------------------------------------------------------- +// Compiled regex pattern cache used for all data types +//--------------------------------------------------------------------------- + +class CompiledRegexPattern +{ + public: + CompiledRegexPattern() = default; + virtual ~CompiledRegexPattern() = default; +}; + +static CLRUCache compiledStrRegExprCache(500); // Cache size of 500 +static CriticalSection compiledStrRegExprLock; + //--------------------------------------------------------------------------- class CStrRegExprFindInstance : implements IStrRegExprFindInstance { private: bool matched = false; - pcre2_code_8 * compiledRegex = nullptr; // do not free; this will be owned by caller + std::shared_ptr compiledRegex = nullptr; pcre2_match_data_8 * matchData = nullptr; const char * subject = nullptr; // points to current subject of regex; do not free char * sample = nullptr; //only required if findstr/findvstr will be called public: - CStrRegExprFindInstance(pcre2_code_8 * _compiledRegex, const char * _subject, size32_t _from, size32_t _len, bool _keep) + CStrRegExprFindInstance(std::shared_ptr _compiledRegex, const char * _subject, size32_t _from, size32_t _len, bool _keep) : compiledRegex(_compiledRegex) { // See if UTF-8 is enabled on this compiled regex uint32_t option_bits; - pcre2_pattern_info_8(compiledRegex, PCRE2_INFO_ALLOPTIONS, &option_bits); + pcre2_pattern_info_8(compiledRegex.get(), PCRE2_INFO_ALLOPTIONS, &option_bits); bool utf8Enabled = (option_bits & PCRE2_UTF) != 0; // Make sure the offset and length is in code points (bytes), not characters size32_t subjectOffset = (utf8Enabled ? rtlUtf8Size(_from, _subject) : _from); @@ -152,16 +183,16 @@ class CStrRegExprFindInstance : implements IStrRegExprFindInstance } matched = false; - matchData = pcre2_match_data_create_from_pattern_8(compiledRegex, pcre2GeneralContext8); + matchData = pcre2_match_data_create_from_pattern_8(compiledRegex.get(), pcre2GeneralContext8); - int numMatches = pcre2_match_8(compiledRegex, (PCRE2_SPTR8)subject, subjectSize, 0, 0, matchData, pcre2MatchContext8); + int numMatches = pcre2_match_8(compiledRegex.get(), (PCRE2_SPTR8)subject, subjectSize, 0, 0, matchData, pcre2MatchContext8); matched = numMatches > 0; if (numMatches < 0 && numMatches != PCRE2_ERROR_NOMATCH) { // Treat everything else as an error - failWithPCRE2Error(numMatches, "Error in regex search: "); + failWithPCRE2Error(numMatches, "Error in regex search: ", ""); } } @@ -216,28 +247,13 @@ class CStrRegExprFindInstance : implements IStrRegExprFindInstance //--------------------------------------------------------------------------- -class CCompiledStrRegExpr : implements ICompiledStrRegExpr +class CCompiledStrRegExpr : public CompiledRegexPattern, implements ICompiledStrRegExpr { private: - pcre2_code_8 * compiledRegex = nullptr; + std::shared_ptr compiledRegex = nullptr; bool isUTF8Enabled = false; public: - CCompiledStrRegExpr(const char * _regex, bool _isCaseSensitive, bool _enableUTF8) - : isUTF8Enabled(_enableUTF8) - { - int errNum = 0; - PCRE2_SIZE errOffset; - uint32_t options = ((_isCaseSensitive ? 0 : PCRE2_CASELESS) | (_enableUTF8 ? PCRE2_UTF : 0)); - - compiledRegex = pcre2_compile_8((PCRE2_SPTR8)_regex, PCRE2_ZERO_TERMINATED, options, &errNum, &errOffset, pcre2CompileContext8); - - if (compiledRegex == nullptr) - { - failWithPCRE2Error(errNum, "Error in regex pattern: ", _regex, errOffset); - } - } - CCompiledStrRegExpr(int _regexLength, const char * _regex, bool _isCaseSensitive, bool _enableUTF8) : isUTF8Enabled(_enableUTF8) { @@ -246,26 +262,27 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr uint32_t options = ((_isCaseSensitive ? 0 : PCRE2_CASELESS) | (_enableUTF8 ? PCRE2_UTF : 0)); size32_t regexSize = (isUTF8Enabled ? rtlUtf8Size(_regexLength, _regex) : _regexLength); - compiledRegex = pcre2_compile_8((PCRE2_SPTR8)_regex, regexSize, options, &errNum, &errOffset, pcre2CompileContext8); + pcre2_code_8 * newCompiledRegex = pcre2_compile_8((PCRE2_SPTR8)_regex, regexSize, options, &errNum, &errOffset, pcre2CompileContext8); - if (compiledRegex == nullptr) + if (newCompiledRegex == nullptr) { - failWithPCRE2Error(errNum, "Error in regex pattern: ", _regex, errOffset); + failWithPCRE2Error(errNum, "Error in regex pattern: ", std::string(_regex, _regexLength), errOffset); } - } - ~CCompiledStrRegExpr() //CAVEAT non-virtual destructor ! - { - pcre2_code_free_8(compiledRegex); + compiledRegex = std::shared_ptr(newCompiledRegex, pcre2_code_free_8); } + CCompiledStrRegExpr(CCompiledStrRegExpr & other) = default; // Note non-const argument + + virtual ~CCompiledStrRegExpr() = default; + //ICompiledStrRegExpr void replace(size32_t & outlen, char * & out, size32_t slen, char const * str, size32_t rlen, char const * replace) const { PCRE2_SIZE pcreLen = 0; outlen = 0; - pcre2_match_data_8 * matchData = pcre2_match_data_create_from_pattern_8(compiledRegex, pcre2GeneralContext8); + pcre2_match_data_8 * matchData = pcre2_match_data_create_from_pattern_8(compiledRegex.get(), pcre2GeneralContext8); // This method is often called through an ECL interface and the provided lengths // (slen and rlen) are in characters, not bytes; we need to convert these to a @@ -279,20 +296,20 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr // Note that pcreLen will include space for a terminating null character; // we have to allocate memory for that byte to avoid a buffer overrun, // but we won't count that terminating byte - int replaceResult = pcre2_substitute_8(compiledRegex, (PCRE2_SPTR8)str, sourceSize, 0, replaceOptions|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, matchData, pcre2MatchContext8, (PCRE2_SPTR8)replace, replaceSize, nullptr, &pcreLen); + int replaceResult = pcre2_substitute_8(compiledRegex.get(), (PCRE2_SPTR8)str, sourceSize, 0, replaceOptions|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, matchData, pcre2MatchContext8, (PCRE2_SPTR8)replace, replaceSize, nullptr, &pcreLen); if (replaceResult < 0 && replaceResult != PCRE2_ERROR_NOMEMORY) { // PCRE2_ERROR_NOMEMORY is a normal result when we're just asking for the size of the output pcre2_match_data_free_8(matchData); - failWithPCRE2Error(replaceResult, "Error in regex replace: "); + failWithPCRE2Error(replaceResult, "Error in regex replace: ", ""); } if (pcreLen > 0) { out = (char *)rtlMalloc(pcreLen); - replaceResult = pcre2_substitute_8(compiledRegex, (PCRE2_SPTR8)str, sourceSize, 0, replaceOptions, matchData, pcre2MatchContext8, (PCRE2_SPTR8)replace, replaceSize, (PCRE2_UCHAR8 *)out, &pcreLen); + replaceResult = pcre2_substitute_8(compiledRegex.get(), (PCRE2_SPTR8)str, sourceSize, 0, replaceOptions, matchData, pcre2MatchContext8, (PCRE2_SPTR8)replace, replaceSize, (PCRE2_UCHAR8 *)out, &pcreLen); // Note that, weirdly, pcreLen will now contain the number of code points // in the result *excluding* the null terminator, so pcreLen will @@ -301,7 +318,7 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr if (replaceResult < 0) { pcre2_match_data_free_8(matchData); - failWithPCRE2Error(replaceResult, "Error in regex replace: "); + failWithPCRE2Error(replaceResult, "Error in regex replace: ", ""); } } @@ -323,7 +340,7 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr PCRE2_SIZE offset = 0; uint32_t matchOptions = 0; PCRE2_SIZE subjectSize = (isUTF8Enabled ? rtlUtf8Size(_subjectLen, _subject) : _subjectLen); - pcre2_match_data_8 * matchData = pcre2_match_data_create_from_pattern_8(compiledRegex, pcre2GeneralContext8); + pcre2_match_data_8 * matchData = pcre2_match_data_create_from_pattern_8(compiledRegex.get(), pcre2GeneralContext8); // Capture groups are ignored when gathering match results into a set, // so we will focus on only the first match (the entire matched string); @@ -332,7 +349,7 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr while (offset < subjectSize) { - int numMatches = pcre2_match_8(compiledRegex, (PCRE2_SPTR8)_subject, subjectSize, offset, matchOptions, matchData, pcre2MatchContext8); + int numMatches = pcre2_match_8(compiledRegex.get(), (PCRE2_SPTR8)_subject, subjectSize, offset, matchOptions, matchData, pcre2MatchContext8); if (numMatches < 0) { @@ -345,7 +362,7 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr { // Treat everything else as an error pcre2_match_data_free_8(matchData); - failWithPCRE2Error(numMatches, "Error in regex getMatchSet: "); + failWithPCRE2Error(numMatches, "Error in regex getMatchSet: ", ""); } } else if (numMatches > 0) @@ -389,16 +406,44 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr // STRING implementation //--------------------------------------------------------------------------- +CCompiledStrRegExpr* fetchOrCreateCompiledStrRegExpr(int _regexLength, const char * _regex, bool _isCaseSensitive) +{ + CCompiledStrRegExpr * compiledObjPtr = nullptr; + uint32_t options = (_isCaseSensitive ? 0 : PCRE2_CASELESS); + hash64_t regexHash = HASH64_INIT; + + // Create a hash of the regex pattern plus the options + regexHash = rtlHash64Data(_regexLength, _regex, regexHash); + regexHash = rtlHash64Data(sizeof(options), &options, regexHash); + + // Check the cache + { + CriticalBlock lock(compiledStrRegExprLock); + compiledObjPtr = dynamic_cast(compiledStrRegExprCache.get(regexHash)); + if (compiledObjPtr) + { + // Return a new copy of the cached object + return new CCompiledStrRegExpr(*compiledObjPtr); + } + + // Create a new object and cache a copy of it + compiledObjPtr = new CCompiledStrRegExpr(_regexLength, _regex, _isCaseSensitive, false); + compiledStrRegExprCache.set(regexHash, new CCompiledStrRegExpr(*compiledObjPtr)); + } + + return compiledObjPtr; +} + +//--------------------------------------------------------------------------- + ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(const char * regExpr, bool isCaseSensitive) { - CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExpr, isCaseSensitive, false); - return expr; + return fetchOrCreateCompiledStrRegExpr(strlen(regExpr), regExpr, isCaseSensitive); } ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(int regExprLength, const char * regExpr, bool isCaseSensitive) { - CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExprLength, regExpr, isCaseSensitive, false); - return expr; + return fetchOrCreateCompiledStrRegExpr(regExprLength, regExpr, isCaseSensitive); } ECLRTL_API void rtlDestroyCompiledStrRegExpr(ICompiledStrRegExpr * compiledExpr) @@ -417,16 +462,44 @@ ECLRTL_API void rtlDestroyStrRegExprFindInstance(IStrRegExprFindInstance * findI // UTF8 implementation //--------------------------------------------------------------------------- +CCompiledStrRegExpr* fetchOrCreateCompiledU8StrRegExpr(int _regexLength, const char * _regex, bool _isCaseSensitive) +{ + CCompiledStrRegExpr * compiledObjPtr = nullptr; + uint32_t options = PCRE2_UTF | (_isCaseSensitive ? 0 : PCRE2_CASELESS); + hash64_t regexHash = HASH64_INIT; + + // Create a hash of the regex pattern plus the options + regexHash = rtlHash64Data(rtlUtf8Size(_regexLength, _regex), _regex, regexHash); + regexHash = rtlHash64Data(sizeof(options), &options, regexHash); + + // Check the cache + { + CriticalBlock lock(compiledStrRegExprLock); + compiledObjPtr = dynamic_cast(compiledStrRegExprCache.get(regexHash)); + if (compiledObjPtr) + { + // Return a new copy of the cached object + return new CCompiledStrRegExpr(*compiledObjPtr); + } + + // Create a new object and cache a copy of it + compiledObjPtr = new CCompiledStrRegExpr(_regexLength, _regex, _isCaseSensitive, true); + compiledStrRegExprCache.set(regexHash, new CCompiledStrRegExpr(*compiledObjPtr)); + } + + return compiledObjPtr; +} + +//--------------------------------------------------------------------------- + ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledU8StrRegExpr(const char * regExpr, bool isCaseSensitive) { - CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExpr, isCaseSensitive, true); - return expr; + return fetchOrCreateCompiledU8StrRegExpr(rtlUtf8Length(regExpr), regExpr, isCaseSensitive); } ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledU8StrRegExpr(int regExprLength, const char * regExpr, bool isCaseSensitive) { - CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExprLength, regExpr, isCaseSensitive, true); - return expr; + return fetchOrCreateCompiledU8StrRegExpr(regExprLength, regExpr, isCaseSensitive); } ECLRTL_API void rtlDestroyCompiledU8StrRegExpr(ICompiledStrRegExpr * compiledExpr) @@ -451,18 +524,18 @@ class CUStrRegExprFindInstance : implements IUStrRegExprFindInstance { private: bool matched = false; - pcre2_code_16 * compiledRegex = nullptr; // do not free; this will be owned by caller + std::shared_ptr compiledRegex = nullptr; pcre2_match_data_16 * matchData = nullptr; const UChar * subject = nullptr; // points to current subject of regex; do not free public: - CUStrRegExprFindInstance(pcre2_code_16 * _compiledRegex, const UChar * _subject, size32_t _from, size32_t _len) + CUStrRegExprFindInstance(std::shared_ptr _compiledRegex, const UChar * _subject, size32_t _from, size32_t _len) : compiledRegex(_compiledRegex) { subject = _subject + _from; matched = false; - matchData = pcre2_match_data_create_from_pattern_16(compiledRegex, pcre2GeneralContext16); - int numMatches = pcre2_match_16(compiledRegex, (PCRE2_SPTR16)subject, _len, 0, 0, matchData, pcre2MatchContext16); + matchData = pcre2_match_data_create_from_pattern_16(compiledRegex.get(), pcre2GeneralContext16); + int numMatches = pcre2_match_16(compiledRegex.get(), (PCRE2_SPTR16)subject, _len, 0, 0, matchData, pcre2MatchContext16); matched = numMatches > 0; @@ -523,50 +596,37 @@ class CUStrRegExprFindInstance : implements IUStrRegExprFindInstance //--------------------------------------------------------------------------- -class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr +class CCompiledUStrRegExpr : public CompiledRegexPattern, implements ICompiledUStrRegExpr { private: - pcre2_code_16 * compiledRegex = nullptr; + std::shared_ptr compiledRegex = nullptr; public: - CCompiledUStrRegExpr(const UChar * _regex, bool _isCaseSensitive = false) - { - int errNum = 0; - PCRE2_SIZE errOffset; - uint32_t options = (PCRE2_UCP | (_isCaseSensitive ? 0 : PCRE2_CASELESS)); - - compiledRegex = pcre2_compile_16((PCRE2_SPTR16)_regex, PCRE2_ZERO_TERMINATED, options, &errNum, &errOffset, pcre2CompileContext16); - - if (compiledRegex == nullptr) - { - failWithUPCRE2Error(errNum, "Error in regex pattern: ", _regex, errOffset); - } - } - CCompiledUStrRegExpr(int _regexLength, const UChar * _regex, bool _isCaseSensitive = false) { int errNum = 0; PCRE2_SIZE errOffset; uint32_t options = (PCRE2_UCP | (_isCaseSensitive ? 0 : PCRE2_CASELESS)); - compiledRegex = pcre2_compile_16((PCRE2_SPTR16)_regex, _regexLength, options, &errNum, &errOffset, pcre2CompileContext16); + pcre2_code_16 * newCompiledRegex = pcre2_compile_16((PCRE2_SPTR16)_regex, _regexLength, options, &errNum, &errOffset, pcre2CompileContext16); - if (compiledRegex == nullptr) + if (newCompiledRegex == nullptr) { - failWithUPCRE2Error(errNum, "Error in regex pattern: ", _regex, errOffset); + failWithUPCRE2Error(errNum, "Error in regex pattern: ", _regex, _regexLength, errOffset); } - } - ~CCompiledUStrRegExpr() - { - pcre2_code_free_16(compiledRegex); + compiledRegex = std::shared_ptr(newCompiledRegex, pcre2_code_free_16); } + CCompiledUStrRegExpr(CCompiledUStrRegExpr& other) = default; + + virtual ~CCompiledUStrRegExpr() = default; + void replace(size32_t & outlen, UChar * & out, size32_t slen, const UChar * str, size32_t rlen, UChar const * replace) const { PCRE2_SIZE pcreLen = 0; outlen = 0; - pcre2_match_data_16 * matchData = pcre2_match_data_create_from_pattern_16(compiledRegex, pcre2GeneralContext16); + pcre2_match_data_16 * matchData = pcre2_match_data_create_from_pattern_16(compiledRegex.get(), pcre2GeneralContext16); uint32_t replaceOptions = PCRE2_SUBSTITUTE_GLOBAL|PCRE2_SUBSTITUTE_EXTENDED; @@ -574,7 +634,7 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr // Note that pcreLen will include space for a terminating null character; // we have to allocate memory for that byte to avoid a buffer overrun, // but we won't count that terminating byte - int replaceResult = pcre2_substitute_16(compiledRegex, (PCRE2_SPTR16)str, slen, 0, replaceOptions|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, matchData, pcre2MatchContext16, (PCRE2_SPTR16)replace, rlen, nullptr, &pcreLen); + int replaceResult = pcre2_substitute_16(compiledRegex.get(), (PCRE2_SPTR16)str, slen, 0, replaceOptions|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, matchData, pcre2MatchContext16, (PCRE2_SPTR16)replace, rlen, nullptr, &pcreLen); if (replaceResult < 0 && replaceResult != PCRE2_ERROR_NOMEMORY) { @@ -587,7 +647,7 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr { out = (UChar *)rtlMalloc(pcreLen * sizeof(UChar)); - replaceResult = pcre2_substitute_16(compiledRegex, (PCRE2_SPTR16)str, slen, 0, replaceOptions, matchData, pcre2MatchContext16, (PCRE2_SPTR16)replace, rlen, (PCRE2_UCHAR16 *)out, &pcreLen); + replaceResult = pcre2_substitute_16(compiledRegex.get(), (PCRE2_SPTR16)str, slen, 0, replaceOptions, matchData, pcre2MatchContext16, (PCRE2_SPTR16)replace, rlen, (PCRE2_UCHAR16 *)out, &pcreLen); // Note that, weirdly, pcreLen will now contain the number of code points // in the result *excluding* the null terminator, so pcreLen will @@ -617,7 +677,7 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr size32_t outBytes = 0; PCRE2_SIZE offset = 0; uint32_t matchOptions = 0; - pcre2_match_data_16 * matchData = pcre2_match_data_create_from_pattern_16(compiledRegex, pcre2GeneralContext16); + pcre2_match_data_16 * matchData = pcre2_match_data_create_from_pattern_16(compiledRegex.get(), pcre2GeneralContext16); // Capture groups are ignored when gathering match results into a set, // so we will focus on only the first match (the entire matched string); @@ -626,7 +686,7 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr while (offset < _subjectLen) { - int numMatches = pcre2_match_16(compiledRegex, (PCRE2_SPTR16)_subject, _subjectLen, offset, matchOptions, matchData, pcre2MatchContext16); + int numMatches = pcre2_match_16(compiledRegex.get(), (PCRE2_SPTR16)_subject, _subjectLen, offset, matchOptions, matchData, pcre2MatchContext16); if (numMatches < 0) { @@ -682,16 +742,44 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr // UNICODE implementation //--------------------------------------------------------------------------- +CCompiledUStrRegExpr* fetchOrCreateCompiledUStrRegExpr(int _regexLength, const UChar * _regex, bool _isCaseSensitive) +{ + CCompiledUStrRegExpr * compiledObjPtr = nullptr; + uint32_t options = PCRE2_UCP | (_isCaseSensitive ? 0 : PCRE2_CASELESS); + hash64_t regexHash = HASH64_INIT; + + // Create a hash of the regex pattern plus the options + regexHash = rtlHash64Data(_regexLength * sizeof(UChar), _regex, regexHash); + regexHash = rtlHash64Data(sizeof(options), &options, regexHash); + + // Check the cache + { + CriticalBlock lock(compiledStrRegExprLock); + compiledObjPtr = dynamic_cast(compiledStrRegExprCache.get(regexHash)); + if (compiledObjPtr) + { + // Return a new copy of the cached object + return new CCompiledUStrRegExpr(*compiledObjPtr); + } + + // Create a new object and cache a copy of it + compiledObjPtr = new CCompiledUStrRegExpr(_regexLength, _regex, _isCaseSensitive); + compiledStrRegExprCache.set(regexHash, new CCompiledUStrRegExpr(*compiledObjPtr)); + } + + return compiledObjPtr; +} + +//--------------------------------------------------------------------------- + ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(const UChar * regExpr, bool isCaseSensitive) { - CCompiledUStrRegExpr * expr = new CCompiledUStrRegExpr(regExpr, isCaseSensitive); - return expr; + return fetchOrCreateCompiledUStrRegExpr(rtlUnicodeStrlen(regExpr), regExpr, isCaseSensitive); } ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(int regExprLength, const UChar * regExpr, bool isCaseSensitive) { - CCompiledUStrRegExpr * expr = new CCompiledUStrRegExpr(regExprLength, regExpr, isCaseSensitive); - return expr; + return fetchOrCreateCompiledUStrRegExpr(regExprLength, regExpr, isCaseSensitive); } ECLRTL_API void rtlDestroyCompiledUStrRegExpr(ICompiledUStrRegExpr * compiledExpr) diff --git a/rtl/eclrtl/eclrtl.cpp b/rtl/eclrtl/eclrtl.cpp index 1b8a9f1d313..156cad7066e 100644 --- a/rtl/eclrtl/eclrtl.cpp +++ b/rtl/eclrtl/eclrtl.cpp @@ -4960,6 +4960,19 @@ unsigned rtlUtf8Size(unsigned len, const void * _data) return offset; } +unsigned rtlUtf8Length(const void * _data) +{ + const byte * data = (const byte *)_data; + size32_t length = 0; + unsigned offset = 0; + while (data[offset]) + { + offset += readUtf8Size(data+offset); + length++; + } + return length; +} + unsigned rtlUtf8Length(unsigned size, const void * _data) { const byte * data = (const byte *)_data; diff --git a/rtl/eclrtl/eclrtl.hpp b/rtl/eclrtl/eclrtl.hpp index 53d4a09919b..6cb401b69b0 100644 --- a/rtl/eclrtl/eclrtl.hpp +++ b/rtl/eclrtl/eclrtl.hpp @@ -667,6 +667,7 @@ ECLRTL_API void rtlStrToVUnicode(unsigned outlen, UChar * out, unsigned inlen, c ECLRTL_API unsigned rtlUtf8Size(const void * data); ECLRTL_API unsigned rtlUtf8Size(unsigned len, const void * data); +ECLRTL_API unsigned rtlUtf8Length(const void * data); ECLRTL_API unsigned rtlUtf8Length(unsigned size, const void * data); ECLRTL_API unsigned rtlUtf8Char(const void * data); ECLRTL_API void rtlUtf8ToData(size32_t outlen, void * out, size32_t inlen, const char *in); diff --git a/system/jlib/jhash.hpp b/system/jlib/jhash.hpp index 3653abd4c21..28c33a18bfb 100644 --- a/system/jlib/jhash.hpp +++ b/system/jlib/jhash.hpp @@ -21,6 +21,7 @@ #define JHASH_HPP #include +#include #include #include @@ -687,4 +688,104 @@ class CTimeLimitedCache } }; +/** + * CLRUCache + * + * Least-Recently-Used cache class, specialized for key and + * value pointer types (the value is a pointer or a data type + * where a nullptr could represent a missing value). + * + * The get() method returns a found object by value. This + * is intentional and very useful for maintaining refcounts. + * + * There is a minimum size for the cache, defined by + * LRU_MIN_CACHE_SIZE. Attempts to create a smaller cache + * will be silently changed to the minimum size. If no + * initial size is provided to the constructor, the cache + * size will be set to LRU_MIN_CACHE_SIZE. + * + * Methods here are not thread-safe. Callers should block + * concurrent access for non-const methods (which are most + * of them). + */ + +#define LRU_MIN_CACHE_SIZE 10 + +template +class CLRUCache +{ + private: + std::list recentList; + std::unordered_map::iterator>> lookupMap; + size32_t maxCacheSize; + + void _downsize() + { + while (lookupMap.size() > maxCacheSize) + { + lookupMap.erase(recentList.back()); + recentList.pop_back(); + } + } + + public: + CLRUCache() : maxCacheSize(LRU_MIN_CACHE_SIZE) {} + CLRUCache(size32_t _maxCacheSize) : maxCacheSize(_maxCacheSize < LRU_MIN_CACHE_SIZE ? LRU_MIN_CACHE_SIZE : _maxCacheSize) {} + CLRUCache(const CLRUCache& other) = delete; + ~CLRUCache() = default; + + size32_t getCacheSize() const + { + return lookupMap.size(); + } + + size32_t setMaxCacheSize(size32_t _maxCacheSize) + { + maxCacheSize = _maxCacheSize < LRU_MIN_CACHE_SIZE ? LRU_MIN_CACHE_SIZE : _maxCacheSize; + _downsize(); + return maxCacheSize; + } + + PTRTYPE get(const KEYTYPE& key) + { + auto foundIter = lookupMap.find(key); + if (foundIter == lookupMap.end()) + return nullptr; + + recentList.erase(foundIter->second.second); + recentList.push_front(key); + lookupMap[key] = {foundIter->second.first, recentList.begin()}; + return foundIter->second.first; + } + + void set(const KEYTYPE& key, const PTRTYPE& value) + { + auto foundIter = lookupMap.find(key); + + if (foundIter == lookupMap.end()) + { + recentList.push_front(key); + lookupMap[key] = {value, recentList.begin()}; + _downsize(); + } + else + { + recentList.erase(foundIter->second.second); + recentList.push_front(key); + lookupMap[key] = {value, recentList.begin()}; + } + } + + bool remove(const KEYTYPE& key) + { + auto foundIter = lookupMap.find(key); + if (foundIter == lookupMap.end()) + return false; + + recentList.erase(foundIter->second.second); + lookupMap.erase(foundIter); + return true; + } +}; + #endif