diff --git a/ecl/hql/hqlfold.cpp b/ecl/hql/hqlfold.cpp index 7e1dcf9eda0..4a0226cbcfd 100644 --- a/ecl/hql/hqlfold.cpp +++ b/ecl/hql/hqlfold.cpp @@ -2796,7 +2796,7 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption StringBuffer pattern, search; v0->getUTF8Value(pattern); v1->getUTF8Value(search); - ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom)); + ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern.lengthUtf8(), pattern, !expr->hasAttribute(noCaseAtom)); compiled->getMatchSet(isAllResult, resultBytes, matchResults.refdata(), search.lengthUtf8(), search.str()); rtlDestroyCompiledU8StrRegExpr(compiled); } diff --git a/rtl/eclrtl/eclregex.cpp b/rtl/eclrtl/eclregex.cpp index c86a4a3e79c..275aa19cd4d 100644 --- a/rtl/eclrtl/eclregex.cpp +++ b/rtl/eclrtl/eclregex.cpp @@ -28,7 +28,12 @@ #include "platform.h" #include "eclrtl.hpp" #include "eclrtl_imp.hpp" +#include "jhash.hpp" #include "jlib.hpp" +#include "jmisc.hpp" +#include "jprop.hpp" + +#include //--------------------------------------------------------------------------- @@ -57,14 +62,16 @@ static void pcre2Free(void * block, void * /*userData*/) rtlFree(block); } -/// @brief Convert a PCRE2 error code to a string and throw an exception -/// @param errCode PCRE2 error code -/// @param msgPrefix Prefix for error message; can be an empty string; -/// include a trailing space if a non-empty message is passed -/// @param regex OPTIONAL; regex pattern that was in play when error occurred -/// @param errOffset OPTIONAL; offset in regex pattern where error occurred; -/// ignored if regex is null or empty -static void failWithPCRE2Error(int errCode, const char * msgPrefix, const char * regex = nullptr, int errOffset = -1) +/** + * @brief Handles failure reporting with a regex and throws an exception with the given error code and message. 
+ * + * @param errCode The error code indicating the type of error that occurred. + * @param msgPrefix The prefix to be added to the error message; can be an empty string; include a trailing space if a non-empty prefix is passed. + * @param regex The regular expression pattern; may be an empty string. + * @note The pattern length is taken from regex itself; this overload has no separate length parameter. + * @param errOffset The offset into regex at which the error occurred. + */ +static void failWithPCRE2Error(int errCode, const std::string & msgPrefix, const std::string & regex, int errOffset) { const int errBuffSize = 120; char errBuff[errBuffSize]; @@ -80,7 +87,7 @@ static void failWithPCRE2Error(int errCode, const char * msgPrefix, const char * msg += std::to_string(errCode); msg += " (no error message available)"; } - if (regex && regex[0]) + if (!regex.empty()) { msg += " (regex: '"; msg += regex; @@ -95,25 +102,143 @@ static void failWithPCRE2Error(int errCode, const char * msgPrefix, const char * rtlFail(0, msg.c_str()); } -/// @brief Convert a PCRE2 error code to a string and throw an exception -/// @param errCode PCRE2 error code -/// @param msgPrefix Prefix for error message; can be an empty string; -/// include a trailing space if a non-empty message is passed -/// @param regex OPTIONAL; Unicode regex pattern that was in play when error occurred -/// @param errOffset OPTIONAL; offset in regex pattern where error occurred; -/// ignored if regex is null or empty -static void failWithUPCRE2Error(int errCode, const char * msgPrefix, const UChar * regex = nullptr, int errOffset = -1) +/** + * @brief Handles the failure of a regular expression operation and throws an exception with the given error code and message. + * + * @param errCode The error code associated with the failure. + * @param msg The error message describing the failure. 
+ */ +static void failWithPCRE2Error(int errCode, const std::string & msg) +{ + failWithPCRE2Error(errCode, msg, "", -1); +} + +/** + * @brief Handles failure reporting with Unicode regex and throws an exception with the given error code and message. + * + * @param errCode The error code indicating the type of error that occurred. + * @param msgPrefix The prefix to be added to the error message; can be an empty string; include a trailing space if a non-empty prefix is passed. + * @param regex The regular expression pattern in UChar format. + * @param regexLength The length (in UChar code units) of the regular expression pattern. + * @param errOffset The offset into regex at which the error occurred. + */ +static void failWithPCRE2Error(int errCode, const std::string & msgPrefix, const UChar * regex, int regexLength, int errOffset) { std::string regexPattern; - if (regex) + if (regex && regex[0]) { char * regexStr = nullptr; unsigned regexStrLen; - rtlUnicodeToEscapedStrX(regexStrLen, regexStr, rtlUnicodeStrlen(regex), regex); + rtlUnicodeToEscapedStrX(regexStrLen, regexStr, regexLength, regex); regexPattern = std::string(regexStr, regexStrLen); rtlFree(regexStr); } - failWithPCRE2Error(errCode, msgPrefix, regexPattern.c_str(), errOffset); + failWithPCRE2Error(errCode, msgPrefix, regexPattern, errOffset); +} + +//--------------------------------------------------------------------------- + +/** + * @brief Cache record pairing a compiled regular expression with the pattern bytes and options it was compiled from; used for caching. 
+ */ +class RegexCacheEntry +{ +private: + uint32_t savedOptions = 0; // set when the object is cached + std::string savedPattern; // used as a blob store; set when the object is cached + std::shared_ptr compiledRegex8 = nullptr; + std::shared_ptr compiledRegex16 = nullptr; + +public: + RegexCacheEntry() = delete; + + RegexCacheEntry(size32_t _patternSize, const char * _pattern, uint32_t _options, std::shared_ptr _compiledRegex8) + : savedOptions(_options), savedPattern(_pattern, _patternSize), compiledRegex8(_compiledRegex8) + {} + + RegexCacheEntry(size32_t _patternSize, const char * _pattern, uint32_t _options, std::shared_ptr _compiledRegex16) + : savedOptions(_options), savedPattern(_pattern, _patternSize), compiledRegex16(_compiledRegex16) + {} + + RegexCacheEntry(const RegexCacheEntry & other) = delete; + + static hash64_t hashValue(size32_t patternSize, const char * pattern, uint32_t options) + { + hash64_t hash = HASH64_INIT; + hash = rtlHash64Data(patternSize, pattern, hash); + hash = rtlHash64Data(sizeof(options), &options, hash); + return hash; + } + + bool hasSamePattern(size32_t patternSize, const char * pattern, uint32_t options) const + { + if ((patternSize == 0) || (patternSize != savedPattern.size())) + return false; + if (options != savedOptions) + return false; + return (memcmp(pattern, savedPattern.data(), patternSize) == 0); + } + + std::shared_ptr getCompiledRegex8() const { return compiledRegex8; } + std::shared_ptr getCompiledRegex16() const { return compiledRegex16; } +}; + +//--------------------------------------------------------------------------- + +#define DEFAULT_CACHE_MAX_SIZE 500 +static CLRUCache> compiledStrRegExprCache(DEFAULT_CACHE_MAX_SIZE); +static CriticalSection compiledStrRegExprLock; +static bool compiledCacheEnabled = true; + +/** + * @brief Provide an optional override to the maximum cache size for regex patterns. 
+ * + * Searches within the containerized "expert" section or the bare-metal "Software/Globals" + * section for an optional "regex" subsection with a "cacheSize" attribute. + * By default, the maximum cache size is set to 500 patterns. Override with 0 to disable caching; other values below the cache's minimum (LRU_MIN_CACHE_SIZE) are raised to that minimum. + */ +static void initMaxCacheSize() +{ +#ifdef _CONTAINERIZED + Owned expert; +#else + Owned envtree; + IPropertyTree * expert = nullptr; +#endif + + try + { +#ifdef _CONTAINERIZED + expert.setown(getGlobalConfigSP()->getPropTree("expert")); +#else + envtree.setown(getHPCCEnvironment()); + if (envtree) + expert = envtree->queryPropTree("Software/Globals"); +#endif + } + catch (IException *e) + { + e->Release(); + } + catch (...) + { + } + + size32_t cacheMaxSize = DEFAULT_CACHE_MAX_SIZE; + + if (expert) + { + IPropertyTree *regexProps = expert->queryPropTree("regex"); + if (regexProps) + { + cacheMaxSize = regexProps->getPropInt("@cacheSize", cacheMaxSize); + } + } + + if (cacheMaxSize > 0) + compiledStrRegExprCache.setMaxCacheSize(cacheMaxSize); + else + compiledCacheEnabled = false; } //--------------------------------------------------------------------------- @@ -122,18 +247,18 @@ class CStrRegExprFindInstance : implements IStrRegExprFindInstance { private: bool matched = false; - pcre2_code_8 * compiledRegex = nullptr; // do not free; this will be owned by caller + std::shared_ptr compiledRegex = nullptr; pcre2_match_data_8 * matchData = nullptr; const char * subject = nullptr; // points to current subject of regex; do not free char * sample = nullptr; //only required if findstr/findvstr will be called public: - CStrRegExprFindInstance(pcre2_code_8 * _compiledRegex, const char * _subject, size32_t _from, size32_t _len, bool _keep) + CStrRegExprFindInstance(std::shared_ptr _compiledRegex, const char * _subject, size32_t _from, size32_t _len, bool _keep) : compiledRegex(_compiledRegex) { // See if UTF-8 is enabled on this compiled regex uint32_t option_bits; - pcre2_pattern_info_8(compiledRegex, 
PCRE2_INFO_ALLOPTIONS, &option_bits); + pcre2_pattern_info_8(compiledRegex.get(), PCRE2_INFO_ALLOPTIONS, &option_bits); bool utf8Enabled = (option_bits & PCRE2_UTF) != 0; // Make sure the offset and length is in code points (bytes), not characters size32_t subjectOffset = (utf8Enabled ? rtlUtf8Size(_from, _subject) : _from); @@ -152,9 +277,9 @@ class CStrRegExprFindInstance : implements IStrRegExprFindInstance } matched = false; - matchData = pcre2_match_data_create_from_pattern_8(compiledRegex, pcre2GeneralContext8); + matchData = pcre2_match_data_create_from_pattern_8(compiledRegex.get(), pcre2GeneralContext8); - int numMatches = pcre2_match_8(compiledRegex, (PCRE2_SPTR8)subject, subjectSize, 0, 0, matchData, pcre2MatchContext8); + int numMatches = pcre2_match_8(compiledRegex.get(), (PCRE2_SPTR8)subject, subjectSize, 0, 0, matchData, pcre2MatchContext8); matched = numMatches > 0; @@ -219,25 +344,10 @@ class CStrRegExprFindInstance : implements IStrRegExprFindInstance class CCompiledStrRegExpr : implements ICompiledStrRegExpr { private: - pcre2_code_8 * compiledRegex = nullptr; + std::shared_ptr compiledRegex = nullptr; bool isUTF8Enabled = false; public: - CCompiledStrRegExpr(const char * _regex, bool _isCaseSensitive, bool _enableUTF8) - : isUTF8Enabled(_enableUTF8) - { - int errNum = 0; - PCRE2_SIZE errOffset; - uint32_t options = ((_isCaseSensitive ? 0 : PCRE2_CASELESS) | (_enableUTF8 ? PCRE2_UTF : 0)); - - compiledRegex = pcre2_compile_8((PCRE2_SPTR8)_regex, PCRE2_ZERO_TERMINATED, options, &errNum, &errOffset, pcre2CompileContext8); - - if (compiledRegex == nullptr) - { - failWithPCRE2Error(errNum, "Error in regex pattern: ", _regex, errOffset); - } - } - CCompiledStrRegExpr(int _regexLength, const char * _regex, bool _isCaseSensitive, bool _enableUTF8) : isUTF8Enabled(_enableUTF8) { @@ -246,26 +356,29 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr uint32_t options = ((_isCaseSensitive ? 0 : PCRE2_CASELESS) | (_enableUTF8 ? 
PCRE2_UTF : 0)); size32_t regexSize = (isUTF8Enabled ? rtlUtf8Size(_regexLength, _regex) : _regexLength); - compiledRegex = pcre2_compile_8((PCRE2_SPTR8)_regex, regexSize, options, &errNum, &errOffset, pcre2CompileContext8); + pcre2_code_8 * newCompiledRegex = pcre2_compile_8((PCRE2_SPTR8)_regex, regexSize, options, &errNum, &errOffset, pcre2CompileContext8); - if (compiledRegex == nullptr) + if (newCompiledRegex == nullptr) { - failWithPCRE2Error(errNum, "Error in regex pattern: ", _regex, errOffset); + failWithPCRE2Error(errNum, "Error in regex pattern: ", std::string(_regex, _regexLength), errOffset); } - } - ~CCompiledStrRegExpr() //CAVEAT non-virtual destructor ! - { - pcre2_code_free_8(compiledRegex); + compiledRegex = std::shared_ptr(newCompiledRegex, pcre2_code_free_8); } + CCompiledStrRegExpr(const RegexCacheEntry& cacheEntry, bool _enableUTF8) + : compiledRegex(cacheEntry.getCompiledRegex8()), isUTF8Enabled(_enableUTF8) + {} + + std::shared_ptr getCompiledRegex() const { return compiledRegex; } + //ICompiledStrRegExpr void replace(size32_t & outlen, char * & out, size32_t slen, char const * str, size32_t rlen, char const * replace) const { PCRE2_SIZE pcreLen = 0; outlen = 0; - pcre2_match_data_8 * matchData = pcre2_match_data_create_from_pattern_8(compiledRegex, pcre2GeneralContext8); + pcre2_match_data_8 * matchData = pcre2_match_data_create_from_pattern_8(compiledRegex.get(), pcre2GeneralContext8); // This method is often called through an ECL interface and the provided lengths // (slen and rlen) are in characters, not bytes; we need to convert these to a @@ -279,7 +392,7 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr // Note that pcreLen will include space for a terminating null character; // we have to allocate memory for that byte to avoid a buffer overrun, // but we won't count that terminating byte - int replaceResult = pcre2_substitute_8(compiledRegex, (PCRE2_SPTR8)str, sourceSize, 0, replaceOptions|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, 
matchData, pcre2MatchContext8, (PCRE2_SPTR8)replace, replaceSize, nullptr, &pcreLen); + int replaceResult = pcre2_substitute_8(compiledRegex.get(), (PCRE2_SPTR8)str, sourceSize, 0, replaceOptions|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, matchData, pcre2MatchContext8, (PCRE2_SPTR8)replace, replaceSize, nullptr, &pcreLen); if (replaceResult < 0 && replaceResult != PCRE2_ERROR_NOMEMORY) { @@ -292,7 +405,7 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr { out = (char *)rtlMalloc(pcreLen); - replaceResult = pcre2_substitute_8(compiledRegex, (PCRE2_SPTR8)str, sourceSize, 0, replaceOptions, matchData, pcre2MatchContext8, (PCRE2_SPTR8)replace, replaceSize, (PCRE2_UCHAR8 *)out, &pcreLen); + replaceResult = pcre2_substitute_8(compiledRegex.get(), (PCRE2_SPTR8)str, sourceSize, 0, replaceOptions, matchData, pcre2MatchContext8, (PCRE2_SPTR8)replace, replaceSize, (PCRE2_UCHAR8 *)out, &pcreLen); // Note that, weirdly, pcreLen will now contain the number of code points // in the result *excluding* the null terminator, so pcreLen will @@ -323,7 +436,7 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr PCRE2_SIZE offset = 0; uint32_t matchOptions = 0; PCRE2_SIZE subjectSize = (isUTF8Enabled ? 
rtlUtf8Size(_subjectLen, _subject) : _subjectLen); - pcre2_match_data_8 * matchData = pcre2_match_data_create_from_pattern_8(compiledRegex, pcre2GeneralContext8); + pcre2_match_data_8 * matchData = pcre2_match_data_create_from_pattern_8(compiledRegex.get(), pcre2GeneralContext8); // Capture groups are ignored when gathering match results into a set, // so we will focus on only the first match (the entire matched string); @@ -332,7 +445,7 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr while (offset < subjectSize) { - int numMatches = pcre2_match_8(compiledRegex, (PCRE2_SPTR8)_subject, subjectSize, offset, matchOptions, matchData, pcre2MatchContext8); + int numMatches = pcre2_match_8(compiledRegex.get(), (PCRE2_SPTR8)_subject, subjectSize, offset, matchOptions, matchData, pcre2MatchContext8); if (numMatches < 0) { @@ -389,16 +502,62 @@ class CCompiledStrRegExpr : implements ICompiledStrRegExpr // STRING implementation //--------------------------------------------------------------------------- +/** + * @brief Fetches or creates a compiled string regular expression object. + * + * This function fetches a compiled string regular expression object from the cache if it exists, + * or creates a new one if it doesn't. The regular expression object is created based on the provided + * regex pattern, length, and case sensitivity flag. The created object is then cached for future use. + * + * @param _regexLength The length of the regex pattern. + * @param _regex The regex pattern. + * @param _isCaseSensitive Flag indicating whether the regex pattern is case sensitive or not. + * @return A pointer to a copy of the fetched or created CCompiledStrRegExpr object. The returned object + * * must eventually be deleted. + */ +CCompiledStrRegExpr* fetchOrCreateCompiledStrRegExpr(int _regexLength, const char * _regex, bool _isCaseSensitive) +{ + if (compiledCacheEnabled) + { + CCompiledStrRegExpr * compiledObjPtr = nullptr; + uint32_t options = (_isCaseSensitive ? 
0 : PCRE2_CASELESS); + hash64_t regexHash = RegexCacheEntry::hashValue(_regexLength, _regex, options); + + // Check the cache + { + CriticalBlock lock(compiledStrRegExprLock); + RegexCacheEntry * cacheEntry = compiledStrRegExprCache.get(regexHash).get(); + + if (cacheEntry && cacheEntry->hasSamePattern(_regexLength, _regex, options)) + { + // Return a new compiled pattern object based on the cached information + return new CCompiledStrRegExpr(*cacheEntry, false); + } + + // Create a new compiled pattern object + compiledObjPtr = new CCompiledStrRegExpr(_regexLength, _regex, _isCaseSensitive, false); + // Create a cache entry for the new object + compiledStrRegExprCache.set(regexHash, std::make_shared(_regexLength, _regex, options, compiledObjPtr->getCompiledRegex())); + } + + return compiledObjPtr; + } + else + { + return new CCompiledStrRegExpr(_regexLength, _regex, _isCaseSensitive, false); + } +} + +//--------------------------------------------------------------------------- + ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(const char * regExpr, bool isCaseSensitive) { - CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExpr, isCaseSensitive, false); - return expr; + return fetchOrCreateCompiledStrRegExpr(strlen(regExpr), regExpr, isCaseSensitive); } ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(int regExprLength, const char * regExpr, bool isCaseSensitive) { - CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExprLength, regExpr, isCaseSensitive, false); - return expr; + return fetchOrCreateCompiledStrRegExpr(regExprLength, regExpr, isCaseSensitive); } ECLRTL_API void rtlDestroyCompiledStrRegExpr(ICompiledStrRegExpr * compiledExpr) @@ -417,16 +576,63 @@ ECLRTL_API void rtlDestroyStrRegExprFindInstance(IStrRegExprFindInstance * findI // UTF8 implementation //--------------------------------------------------------------------------- +/** + * @brief Fetches or creates a compiled UTF-8 regular expression object. 
+ * + * This function fetches a compiled UTF-8 regular expression object from the cache if it exists, + * or creates a new one if it doesn't. The regular expression object is created based on the provided + * regex pattern, length, and case sensitivity flag. The created object is then cached for future use. + * + * @param _regexLength The length of the regex pattern, in code points. + * @param _regex The regex pattern. + * @param _isCaseSensitive Flag indicating whether the regex pattern is case sensitive or not. + * @return A pointer to a copy of the fetched or created CCompiledStrRegExpr object. The returned object + * * must eventually be deleted. + */ +CCompiledStrRegExpr* fetchOrCreateCompiledU8StrRegExpr(int _regexLength, const char * _regex, bool _isCaseSensitive) +{ + if (compiledCacheEnabled) + { + CCompiledStrRegExpr * compiledObjPtr = nullptr; + unsigned int regexSize = rtlUtf8Size(_regexLength, _regex); + uint32_t options = PCRE2_UTF | (_isCaseSensitive ? 0 : PCRE2_CASELESS); + hash64_t regexHash = RegexCacheEntry::hashValue(regexSize, _regex, options); + + // Check the cache + { + CriticalBlock lock(compiledStrRegExprLock); + RegexCacheEntry * cacheEntry = compiledStrRegExprCache.get(regexHash).get(); + + if (cacheEntry && cacheEntry->hasSamePattern(regexSize, _regex, options)) + { + // Return a new compiled pattern object based on the cached information + return new CCompiledStrRegExpr(*cacheEntry, true); + } + + // Create a new compiled pattern object + compiledObjPtr = new CCompiledStrRegExpr(_regexLength, _regex, _isCaseSensitive, true); + // Create a cache entry for the new object + compiledStrRegExprCache.set(regexHash, std::make_shared(regexSize, _regex, options, compiledObjPtr->getCompiledRegex())); + } + + return compiledObjPtr; + } + else + { + return new CCompiledStrRegExpr(_regexLength, _regex, _isCaseSensitive, true); + } +} + +//--------------------------------------------------------------------------- + ECLRTL_API ICompiledStrRegExpr * 
rtlCreateCompiledU8StrRegExpr(const char * regExpr, bool isCaseSensitive) { - CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExpr, isCaseSensitive, true); - return expr; + return fetchOrCreateCompiledU8StrRegExpr(rtlUtf8Length(regExpr), regExpr, isCaseSensitive); } ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledU8StrRegExpr(int regExprLength, const char * regExpr, bool isCaseSensitive) { - CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExprLength, regExpr, isCaseSensitive, true); - return expr; + return fetchOrCreateCompiledU8StrRegExpr(regExprLength, regExpr, isCaseSensitive); } ECLRTL_API void rtlDestroyCompiledU8StrRegExpr(ICompiledStrRegExpr * compiledExpr) @@ -451,25 +657,25 @@ class CUStrRegExprFindInstance : implements IUStrRegExprFindInstance { private: bool matched = false; - pcre2_code_16 * compiledRegex = nullptr; // do not free; this will be owned by caller + std::shared_ptr compiledRegex = nullptr; pcre2_match_data_16 * matchData = nullptr; const UChar * subject = nullptr; // points to current subject of regex; do not free public: - CUStrRegExprFindInstance(pcre2_code_16 * _compiledRegex, const UChar * _subject, size32_t _from, size32_t _len) + CUStrRegExprFindInstance(std::shared_ptr _compiledRegex, const UChar * _subject, size32_t _from, size32_t _len) : compiledRegex(_compiledRegex) { subject = _subject + _from; matched = false; - matchData = pcre2_match_data_create_from_pattern_16(compiledRegex, pcre2GeneralContext16); - int numMatches = pcre2_match_16(compiledRegex, (PCRE2_SPTR16)subject, _len, 0, 0, matchData, pcre2MatchContext16); + matchData = pcre2_match_data_create_from_pattern_16(compiledRegex.get(), pcre2GeneralContext16); + int numMatches = pcre2_match_16(compiledRegex.get(), (PCRE2_SPTR16)subject, _len, 0, 0, matchData, pcre2MatchContext16); matched = numMatches > 0; if (numMatches < 0 && numMatches != PCRE2_ERROR_NOMATCH) { // Treat everything else as an error - failWithUPCRE2Error(numMatches, "Error in regex search: 
"); + failWithPCRE2Error(numMatches, "Error in regex search: "); } } @@ -526,47 +732,36 @@ class CUStrRegExprFindInstance : implements IUStrRegExprFindInstance class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr { private: - pcre2_code_16 * compiledRegex = nullptr; + std::shared_ptr compiledRegex = nullptr; public: - CCompiledUStrRegExpr(const UChar * _regex, bool _isCaseSensitive = false) - { - int errNum = 0; - PCRE2_SIZE errOffset; - uint32_t options = (PCRE2_UCP | (_isCaseSensitive ? 0 : PCRE2_CASELESS)); - - compiledRegex = pcre2_compile_16((PCRE2_SPTR16)_regex, PCRE2_ZERO_TERMINATED, options, &errNum, &errOffset, pcre2CompileContext16); - - if (compiledRegex == nullptr) - { - failWithUPCRE2Error(errNum, "Error in regex pattern: ", _regex, errOffset); - } - } - CCompiledUStrRegExpr(int _regexLength, const UChar * _regex, bool _isCaseSensitive = false) { int errNum = 0; PCRE2_SIZE errOffset; uint32_t options = (PCRE2_UCP | (_isCaseSensitive ? 0 : PCRE2_CASELESS)); - compiledRegex = pcre2_compile_16((PCRE2_SPTR16)_regex, _regexLength, options, &errNum, &errOffset, pcre2CompileContext16); + pcre2_code_16 * newCompiledRegex = pcre2_compile_16((PCRE2_SPTR16)_regex, _regexLength, options, &errNum, &errOffset, pcre2CompileContext16); - if (compiledRegex == nullptr) + if (newCompiledRegex == nullptr) { - failWithUPCRE2Error(errNum, "Error in regex pattern: ", _regex, errOffset); + failWithPCRE2Error(errNum, "Error in regex pattern: ", _regex, _regexLength, errOffset); } - } - ~CCompiledUStrRegExpr() - { - pcre2_code_free_16(compiledRegex); + compiledRegex = std::shared_ptr(newCompiledRegex, pcre2_code_free_16); } + CCompiledUStrRegExpr(const RegexCacheEntry& cacheEntry) + : compiledRegex(cacheEntry.getCompiledRegex16()) + {} + + std::shared_ptr getCompiledRegex() const { return compiledRegex; } + void replace(size32_t & outlen, UChar * & out, size32_t slen, const UChar * str, size32_t rlen, UChar const * replace) const { PCRE2_SIZE pcreLen = 0; outlen = 0; - 
pcre2_match_data_16 * matchData = pcre2_match_data_create_from_pattern_16(compiledRegex, pcre2GeneralContext16); + pcre2_match_data_16 * matchData = pcre2_match_data_create_from_pattern_16(compiledRegex.get(), pcre2GeneralContext16); uint32_t replaceOptions = PCRE2_SUBSTITUTE_GLOBAL|PCRE2_SUBSTITUTE_EXTENDED; @@ -574,20 +769,20 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr // Note that pcreLen will include space for a terminating null character; // we have to allocate memory for that byte to avoid a buffer overrun, // but we won't count that terminating byte - int replaceResult = pcre2_substitute_16(compiledRegex, (PCRE2_SPTR16)str, slen, 0, replaceOptions|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, matchData, pcre2MatchContext16, (PCRE2_SPTR16)replace, rlen, nullptr, &pcreLen); + int replaceResult = pcre2_substitute_16(compiledRegex.get(), (PCRE2_SPTR16)str, slen, 0, replaceOptions|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, matchData, pcre2MatchContext16, (PCRE2_SPTR16)replace, rlen, nullptr, &pcreLen); if (replaceResult < 0 && replaceResult != PCRE2_ERROR_NOMEMORY) { // PCRE2_ERROR_NOMEMORY is a normal result when we're just asking for the size of the output pcre2_match_data_free_16(matchData); - failWithUPCRE2Error(replaceResult, "Error in regex replace: "); + failWithPCRE2Error(replaceResult, "Error in regex replace: "); } if (pcreLen > 0) { out = (UChar *)rtlMalloc(pcreLen * sizeof(UChar)); - replaceResult = pcre2_substitute_16(compiledRegex, (PCRE2_SPTR16)str, slen, 0, replaceOptions, matchData, pcre2MatchContext16, (PCRE2_SPTR16)replace, rlen, (PCRE2_UCHAR16 *)out, &pcreLen); + replaceResult = pcre2_substitute_16(compiledRegex.get(), (PCRE2_SPTR16)str, slen, 0, replaceOptions, matchData, pcre2MatchContext16, (PCRE2_SPTR16)replace, rlen, (PCRE2_UCHAR16 *)out, &pcreLen); // Note that, weirdly, pcreLen will now contain the number of code points // in the result *excluding* the null terminator, so pcreLen will @@ -596,7 +791,7 @@ class CCompiledUStrRegExpr : 
implements ICompiledUStrRegExpr if (replaceResult < 0) { pcre2_match_data_free_16(matchData); - failWithUPCRE2Error(replaceResult, "Error in regex replace: "); + failWithPCRE2Error(replaceResult, "Error in regex replace: "); } } @@ -617,7 +812,7 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr size32_t outBytes = 0; PCRE2_SIZE offset = 0; uint32_t matchOptions = 0; - pcre2_match_data_16 * matchData = pcre2_match_data_create_from_pattern_16(compiledRegex, pcre2GeneralContext16); + pcre2_match_data_16 * matchData = pcre2_match_data_create_from_pattern_16(compiledRegex.get(), pcre2GeneralContext16); // Capture groups are ignored when gathering match results into a set, // so we will focus on only the first match (the entire matched string); @@ -626,7 +821,7 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr while (offset < _subjectLen) { - int numMatches = pcre2_match_16(compiledRegex, (PCRE2_SPTR16)_subject, _subjectLen, offset, matchOptions, matchData, pcre2MatchContext16); + int numMatches = pcre2_match_16(compiledRegex.get(), (PCRE2_SPTR16)_subject, _subjectLen, offset, matchOptions, matchData, pcre2MatchContext16); if (numMatches < 0) { @@ -639,7 +834,7 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr { // Treat everything else as an error pcre2_match_data_free_16(matchData); - failWithUPCRE2Error(numMatches, "Error in regex getMatchSet: "); + failWithPCRE2Error(numMatches, "Error in regex getMatchSet: "); } } else if (numMatches > 0) @@ -682,16 +877,63 @@ class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr // UNICODE implementation //--------------------------------------------------------------------------- +/** + * @brief Fetches or creates a compiled Unicode regular expression object. + * + * This function fetches a compiled Unicode regular expression object from the cache if it exists, + * or creates a new one if it doesn't. 
The regular expression object is created based on the provided + * regex pattern, length, and case sensitivity flag. The created object is then cached for future use. + * + * @param _regexLength The length of the regex pattern, in code points. + * @param _regex The regex pattern. + * @param _isCaseSensitive Flag indicating whether the regex pattern is case sensitive or not. + * @return A pointer to a copy of the fetched or created CCompiledUStrRegExpr object. The returned object + * * must eventually be deleted. + */ +CCompiledUStrRegExpr* fetchOrCreateCompiledUStrRegExpr(int _regexLength, const UChar * _regex, bool _isCaseSensitive) +{ + if (compiledCacheEnabled) + { + CCompiledUStrRegExpr * compiledObjPtr = nullptr; + unsigned int regexSize = _regexLength * sizeof(UChar); + uint32_t options = PCRE2_UCP | (_isCaseSensitive ? 0 : PCRE2_CASELESS); + hash64_t regexHash = RegexCacheEntry::hashValue(regexSize, reinterpret_cast(_regex), options); + + // Check the cache + { + CriticalBlock lock(compiledStrRegExprLock); + RegexCacheEntry * cacheEntry = compiledStrRegExprCache.get(regexHash).get(); + + if (cacheEntry && cacheEntry->hasSamePattern(regexSize, reinterpret_cast(_regex), options)) + { + // Return a new copy of the cached object + return new CCompiledUStrRegExpr(*cacheEntry); + } + + // Create a new compiled pattern object + compiledObjPtr = new CCompiledUStrRegExpr(_regexLength, _regex, _isCaseSensitive); + // Create a cache entry for the new object + compiledStrRegExprCache.set(regexHash, std::make_shared(regexSize, reinterpret_cast(_regex), options, compiledObjPtr->getCompiledRegex())); + } + + return compiledObjPtr; + } + else + { + return new CCompiledUStrRegExpr(_regexLength, _regex, _isCaseSensitive); + } +} + +//--------------------------------------------------------------------------- + ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(const UChar * regExpr, bool isCaseSensitive) { - CCompiledUStrRegExpr * expr = new 
CCompiledUStrRegExpr(regExpr, isCaseSensitive); - return expr; + return fetchOrCreateCompiledUStrRegExpr(rtlUnicodeStrlen(regExpr), regExpr, isCaseSensitive); } ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(int regExprLength, const UChar * regExpr, bool isCaseSensitive) { - CCompiledUStrRegExpr * expr = new CCompiledUStrRegExpr(regExprLength, regExpr, isCaseSensitive); - return expr; + return fetchOrCreateCompiledUStrRegExpr(regExprLength, regExpr, isCaseSensitive); } ECLRTL_API void rtlDestroyCompiledUStrRegExpr(ICompiledUStrRegExpr * compiledExpr) @@ -737,6 +979,7 @@ MODULE_INIT(INIT_PRIORITY_ECLRTL_ECLRTL) pcre2CompileContext16 = pcre2_compile_context_create_16(pcre2GeneralContext16); pcre2MatchContext16 = pcre2_match_context_create_16(pcre2GeneralContext16); #endif // _USE_ICU + initMaxCacheSize(); return true; } diff --git a/rtl/eclrtl/eclrtl.cpp b/rtl/eclrtl/eclrtl.cpp index 1b8a9f1d313..156cad7066e 100644 --- a/rtl/eclrtl/eclrtl.cpp +++ b/rtl/eclrtl/eclrtl.cpp @@ -4960,6 +4960,19 @@ unsigned rtlUtf8Size(unsigned len, const void * _data) return offset; } +unsigned rtlUtf8Length(const void * _data) +{ + const byte * data = (const byte *)_data; + size32_t length = 0; + unsigned offset = 0; + while (data[offset]) + { + offset += readUtf8Size(data+offset); + length++; + } + return length; +} + unsigned rtlUtf8Length(unsigned size, const void * _data) { const byte * data = (const byte *)_data; diff --git a/rtl/eclrtl/eclrtl.hpp b/rtl/eclrtl/eclrtl.hpp index 53d4a09919b..6cb401b69b0 100644 --- a/rtl/eclrtl/eclrtl.hpp +++ b/rtl/eclrtl/eclrtl.hpp @@ -667,6 +667,7 @@ ECLRTL_API void rtlStrToVUnicode(unsigned outlen, UChar * out, unsigned inlen, c ECLRTL_API unsigned rtlUtf8Size(const void * data); ECLRTL_API unsigned rtlUtf8Size(unsigned len, const void * data); +ECLRTL_API unsigned rtlUtf8Length(const void * data); ECLRTL_API unsigned rtlUtf8Length(unsigned size, const void * data); ECLRTL_API unsigned rtlUtf8Char(const void * data); ECLRTL_API void 
rtlUtf8ToData(size32_t outlen, void * out, size32_t inlen, const char *in); diff --git a/system/jlib/jhash.hpp b/system/jlib/jhash.hpp index 3653abd4c21..91b154d052c 100644 --- a/system/jlib/jhash.hpp +++ b/system/jlib/jhash.hpp @@ -21,6 +21,7 @@ #define JHASH_HPP #include +#include #include #include @@ -687,4 +688,103 @@ class CTimeLimitedCache } }; +/** + * CLRUCache + * + * Least-Recently-Used cache class, specialized for key and + * value pointer types (the value is a pointer or a data type + * where a nullptr could represent a missing value). + * + * The get() method returns a found object by value. This + * is intentional and very useful for maintaining refcounts. + * + * There is a minimum size for the cache, defined by + * LRU_MIN_CACHE_SIZE. Attempts to create a smaller cache + * will be silently changed to the minimum size. If no + * initial size is provided to the constructor, the cache + * size will be set to LRU_MIN_CACHE_SIZE. + * + * Methods here are not thread-safe. Callers should block + * concurrent access for non-const methods (which are most + * of them). + */ + +#define LRU_MIN_CACHE_SIZE 10 + +template +class CLRUCache +{ + private: + std::list recentList; + std::unordered_map::iterator>> lookupMap; + size32_t maxCacheSize; + + void _downsize() + { + while (lookupMap.size() > maxCacheSize) + { + lookupMap.erase(recentList.back()); + recentList.pop_back(); + } + } + + public: + CLRUCache() : maxCacheSize(LRU_MIN_CACHE_SIZE) {} + CLRUCache(size32_t _maxCacheSize) : maxCacheSize(_maxCacheSize < LRU_MIN_CACHE_SIZE ? LRU_MIN_CACHE_SIZE : _maxCacheSize) {} + CLRUCache(const CLRUCache& other) = delete; + ~CLRUCache() = default; + + size32_t getCacheSize() const + { + return lookupMap.size(); + } + + size32_t setMaxCacheSize(size32_t _maxCacheSize) + { + maxCacheSize = _maxCacheSize < LRU_MIN_CACHE_SIZE ? 
LRU_MIN_CACHE_SIZE : _maxCacheSize; + _downsize(); + return maxCacheSize; + } + + PTRTYPE get(const KEYTYPE& key) + { + auto foundIter = lookupMap.find(key); + if (foundIter == lookupMap.end()) + return nullptr; + + recentList.splice(recentList.begin(), recentList, foundIter->second.second); + foundIter->second.second = recentList.begin(); + return foundIter->second.first; + } + + void set(const KEYTYPE& key, const PTRTYPE& value) + { + auto foundIter = lookupMap.find(key); + + if (foundIter == lookupMap.end()) + { + recentList.push_front(key); + lookupMap[key] = {value, recentList.begin()}; + _downsize(); + } + else + { + recentList.splice(recentList.begin(), recentList, foundIter->second.second); + foundIter->second.first = value; + foundIter->second.second = recentList.begin(); + } + } + + bool remove(const KEYTYPE& key) + { + auto foundIter = lookupMap.find(key); + if (foundIter == lookupMap.end()) + return false; + + recentList.erase(foundIter->second.second); + lookupMap.erase(foundIter); + return true; + } +}; + #endif diff --git a/testing/regress/ecl/key/regex_cache_string.xml b/testing/regress/ecl/key/regex_cache_string.xml new file mode 100644 index 00000000000..1ef7377e136 --- /dev/null +++ b/testing/regress/ecl/key/regex_cache_string.xml @@ -0,0 +1,3 @@ + + PASSED + diff --git a/testing/regress/ecl/key/regex_cache_unicode.xml b/testing/regress/ecl/key/regex_cache_unicode.xml new file mode 100644 index 00000000000..1ef7377e136 --- /dev/null +++ b/testing/regress/ecl/key/regex_cache_unicode.xml @@ -0,0 +1,3 @@ + + PASSED + diff --git a/testing/regress/ecl/key/regex_cache_utf8.xml b/testing/regress/ecl/key/regex_cache_utf8.xml new file mode 100644 index 00000000000..1ef7377e136 --- /dev/null +++ b/testing/regress/ecl/key/regex_cache_utf8.xml @@ -0,0 +1,3 @@ + + PASSED + diff --git a/testing/regress/ecl/regex_cache_string.ecl b/testing/regress/ecl/regex_cache_string.ecl new file mode 100644 index 00000000000..0222bb83389 --- /dev/null +++ 
b/testing/regress/ecl/regex_cache_string.ecl @@ -0,0 +1,70 @@ +/*############################################################################## + + HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +############################################################################## */ + +#OPTION('globalFold', FALSE); + +IMPORT Std; + +regexDS := DATASET + ( + 100000, + TRANSFORM + ( + {STRING a}, + SELF.a := (STRING)RANDOM() + ), + DISTRIBUTED + ); + +res := PROJECT + ( + NOFOLD(regexDS), + TRANSFORM + ( + { + RECORDOF(LEFT), + STRING via_regex, + STRING via_find, + BOOLEAN is_matching + }, + SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, 'x'), + SELF.via_find := Std.Str.SubstituteIncluded(LEFT.a, LEFT.a[1], 'x'), + SELF.is_matching := SELF.via_regex = SELF.via_find, + SELF := LEFT + ), + PARALLEL(10) + ); + +numTests := COUNT(regexDS); +testsPassed := res(is_matching); +numTestsPassed := COUNT(testsPassed); +testsFailed := res(~is_matching); +numTestsFailed := COUNT(testsFailed); + +MIN_PASS_PERCENTAGE := 0.95; + +passedPercentage := numTestsPassed / numTests; +isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE; +resultStr := IF(isSuccess, 'PASSED', 'FAILED'); +fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2)); + +// Output for unit test parsing +OUTPUT(resultStr, NAMED('result')); + +// Uncomment the following to see details +// OUTPUT(numTests, NAMED('num_tests')); +// OUTPUT(numTestsPassed, 
NAMED('num_passed')); +// OUTPUT(numTestsFailed, NAMED('num_failed')); +// OUTPUT(fullResultStr, NAMED('result_desc')); +// OUTPUT(testsFailed, NAMED('failed_tests'), ALL); diff --git a/testing/regress/ecl/regex_cache_unicode.ecl b/testing/regress/ecl/regex_cache_unicode.ecl new file mode 100644 index 00000000000..c0fa569ec79 --- /dev/null +++ b/testing/regress/ecl/regex_cache_unicode.ecl @@ -0,0 +1,70 @@ +/*############################################################################## + + HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+############################################################################## */ + +#OPTION('globalFold', FALSE); + +IMPORT Std; + +regexDS := DATASET + ( + 100000, + TRANSFORM + ( + {UNICODE a}, + SELF.a := (UNICODE)RANDOM() + ), + DISTRIBUTED + ); + +res := PROJECT + ( + NOFOLD(regexDS), + TRANSFORM + ( + { + RECORDOF(LEFT), + UNICODE via_regex, + UNICODE via_find, + BOOLEAN is_matching + }, + SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, u'x'), + SELF.via_find := (UNICODE)Std.Uni.SubstituteIncluded(LEFT.a, LEFT.a[1], u'x'), + SELF.is_matching := SELF.via_regex = SELF.via_find, + SELF := LEFT + ), + PARALLEL(10) + ); + +numTests := COUNT(regexDS); +testsPassed := res(is_matching); +numTestsPassed := COUNT(testsPassed); +testsFailed := res(~is_matching); +numTestsFailed := COUNT(testsFailed); + +MIN_PASS_PERCENTAGE := 0.95; + +passedPercentage := numTestsPassed / numTests; +isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE; +resultStr := IF(isSuccess, 'PASSED', 'FAILED'); +fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2)); + +// Output for unit test parsing +OUTPUT(resultStr, NAMED('result')); + +// Uncomment the following to see details +// OUTPUT(numTests, NAMED('num_tests')); +// OUTPUT(numTestsPassed, NAMED('num_passed')); +// OUTPUT(numTestsFailed, NAMED('num_failed')); +// OUTPUT(fullResultStr, NAMED('result_desc')); +// OUTPUT(testsFailed, NAMED('failed_tests'), ALL); diff --git a/testing/regress/ecl/regex_cache_utf8.ecl b/testing/regress/ecl/regex_cache_utf8.ecl new file mode 100644 index 00000000000..e3cf3d4398d --- /dev/null +++ b/testing/regress/ecl/regex_cache_utf8.ecl @@ -0,0 +1,70 @@ +/*############################################################################## + + HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +############################################################################## */ + +#OPTION('globalFold', FALSE); + +IMPORT Std; + +regexDS := DATASET + ( + 100000, + TRANSFORM + ( + {UTF8 a}, + SELF.a := (UTF8)RANDOM() + ), + DISTRIBUTED + ); + +res := PROJECT + ( + NOFOLD(regexDS), + TRANSFORM + ( + { + RECORDOF(LEFT), + UTF8 via_regex, + UTF8 via_find, + BOOLEAN is_matching + }, + SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, u8'x'), + SELF.via_find := (UTF8)Std.Uni.SubstituteIncluded(LEFT.a, LEFT.a[1], u8'x'), + SELF.is_matching := SELF.via_regex = SELF.via_find, + SELF := LEFT + ), + PARALLEL(10) + ); + +numTests := COUNT(regexDS); +testsPassed := res(is_matching); +numTestsPassed := COUNT(testsPassed); +testsFailed := res(~is_matching); +numTestsFailed := COUNT(testsFailed); + +MIN_PASS_PERCENTAGE := 0.95; + +passedPercentage := numTestsPassed / numTests; +isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE; +resultStr := IF(isSuccess, 'PASSED', 'FAILED'); +fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2)); + +// Output for unit test parsing +OUTPUT(resultStr, NAMED('result')); + +// Uncomment the following to see details +// OUTPUT(numTests, NAMED('num_tests')); +// OUTPUT(numTestsPassed, NAMED('num_passed')); +// OUTPUT(numTestsFailed, NAMED('num_failed')); +// OUTPUT(fullResultStr, NAMED('result_desc')); +// OUTPUT(testsFailed, NAMED('failed_tests'), ALL);