Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HPCC-31921 Add caching of regex compiled search patterns #18748

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ecl/hql/hqlfold.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2796,7 +2796,7 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
StringBuffer pattern, search;
v0->getUTF8Value(pattern);
v1->getUTF8Value(search);
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern.lengthUtf8(), pattern, !expr->hasAttribute(noCaseAtom));
compiled->getMatchSet(isAllResult, resultBytes, matchResults.refdata(), search.lengthUtf8(), search.str());
rtlDestroyCompiledU8StrRegExpr(compiled);
}
Expand Down
443 changes: 343 additions & 100 deletions rtl/eclrtl/eclregex.cpp

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions rtl/eclrtl/eclrtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4960,6 +4960,19 @@ unsigned rtlUtf8Size(unsigned len, const void * _data)
return offset;
}

unsigned rtlUtf8Length(const void * _data)
{
const byte * data = (const byte *)_data;
size32_t length = 0;
unsigned offset = 0;
while (data[offset])
{
offset += readUtf8Size(data+offset);
length++;
}
return length;
}

unsigned rtlUtf8Length(unsigned size, const void * _data)
{
const byte * data = (const byte *)_data;
Expand Down
1 change: 1 addition & 0 deletions rtl/eclrtl/eclrtl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,7 @@ ECLRTL_API void rtlStrToVUnicode(unsigned outlen, UChar * out, unsigned inlen, c

ECLRTL_API unsigned rtlUtf8Size(const void * data);
ECLRTL_API unsigned rtlUtf8Size(unsigned len, const void * data);
// Number of UTF-8 characters in a zero-terminated buffer (scans until the first zero byte).
ECLRTL_API unsigned rtlUtf8Length(const void * data);
ECLRTL_API unsigned rtlUtf8Length(unsigned size, const void * data);
ECLRTL_API unsigned rtlUtf8Char(const void * data);
ECLRTL_API void rtlUtf8ToData(size32_t outlen, void * out, size32_t inlen, const char *in);
Expand Down
100 changes: 100 additions & 0 deletions system/jlib/jhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#define JHASH_HPP

#include <functional>
#include <list>
#include <unordered_map>
#include <utility>

Expand Down Expand Up @@ -687,4 +688,103 @@ class CTimeLimitedCache
}
};

/**
 * CLRUCache
 *
 * Least-Recently-Used cache class, specialized for key and
 * value pointer types (the value is a pointer or a data type
 * where a nullptr could represent a missing value).
 *
 * The get() method returns a found object by value.  This
 * is intentional and very useful for maintaining refcounts.
 *
 * Ownership: the cache stores PTRTYPE by value and never calls
 * delete on anything.  Evicting, replacing or removing an entry
 * only destroys the stored PTRTYPE object, so the target of a raw
 * pointer is left untouched.  Use an owning or reference-counted
 * PTRTYPE if cached objects should be released on eviction.
 *
 * There is a minimum size for the cache, defined by
 * LRU_MIN_CACHE_SIZE.  Attempts to create a smaller cache
 * will be silently changed to the minimum size.  If no
 * initial size is provided to the constructor, the cache
 * size will be set to LRU_MIN_CACHE_SIZE.
 *
 * Copy construction and copy assignment are both disabled:
 * the lookup map holds iterators into this object's own
 * recency list, so a member-wise copy would leave the copy
 * pointing at list nodes owned by the source object.
 *
 * Methods here are not thread-safe.  Callers should block
 * concurrent access for non-const methods (which are most
 * of them).
 */

#define LRU_MIN_CACHE_SIZE 10

template <class KEYTYPE, class PTRTYPE>
class CLRUCache
{
private:
    using RecentList = std::list<KEYTYPE>;
    using CacheEntry = std::pair<PTRTYPE, typename RecentList::iterator>;

    RecentList recentList;                              // keys ordered most-recently-used first
    std::unordered_map<KEYTYPE, CacheEntry> lookupMap;  // key -> (value, position of key in recentList)
    size32_t maxCacheSize;

    // Apply the documented lower bound to a requested capacity.
    static size32_t _clampSize(size32_t requested)
    {
        return requested < LRU_MIN_CACHE_SIZE ? LRU_MIN_CACHE_SIZE : requested;
    }

    // Evict least-recently-used entries until the cache fits within maxCacheSize.
    void _downsize()
    {
        while (lookupMap.size() > maxCacheSize)
        {
            // The back of recentList is the least recently used key.
            lookupMap.erase(recentList.back());
            recentList.pop_back();
        }
    }

public:
    CLRUCache() : maxCacheSize(LRU_MIN_CACHE_SIZE) {}
    CLRUCache(size32_t _maxCacheSize) : maxCacheSize(_clampSize(_maxCacheSize)) {}
    CLRUCache(const CLRUCache& other) = delete;
    // Without this the compiler still generates copy assignment, which would
    // copy iterators that refer to the other object's list.
    CLRUCache& operator=(const CLRUCache& other) = delete;
    ~CLRUCache() = default;

    // Number of entries currently held.
    size32_t getCacheSize() const
    {
        return static_cast<size32_t>(lookupMap.size());
    }

    // Change the capacity (clamped to LRU_MIN_CACHE_SIZE), evicting
    // least-recently-used entries if needed.  Returns the capacity in effect.
    size32_t setMaxCacheSize(size32_t _maxCacheSize)
    {
        maxCacheSize = _clampSize(_maxCacheSize);
        _downsize();
        return maxCacheSize;
    }

    // Look up key; on a hit the entry becomes the most recently used and its
    // value is returned by value.  Returns nullptr when the key is absent.
    PTRTYPE get(const KEYTYPE& key)
    {
        auto foundIter = lookupMap.find(key);
        if (foundIter == lookupMap.end())
            return nullptr;

        // Splicing within the same list keeps the stored iterator valid, so
        // the map entry does not need to be refreshed.
        recentList.splice(recentList.begin(), recentList, foundIter->second.second);
        return foundIter->second.first;
    }

    // Insert or replace the value for key and mark it most recently used.
    // Inserting a new key may evict the least recently used entry.
    void set(const KEYTYPE& key, const PTRTYPE& value)
    {
        auto foundIter = lookupMap.find(key);

        if (foundIter == lookupMap.end())
        {
            recentList.push_front(key);
            lookupMap.emplace(key, CacheEntry(value, recentList.begin()));
            _downsize();
        }
        else
        {
            recentList.splice(recentList.begin(), recentList, foundIter->second.second);
            foundIter->second.first = value;
        }
    }

    // Drop the entry for key.  Returns false when the key was not cached.
    bool remove(const KEYTYPE& key)
    {
        auto foundIter = lookupMap.find(key);
        if (foundIter == lookupMap.end())
            return false;

        recentList.erase(foundIter->second.second);
        lookupMap.erase(foundIter);
        return true;
    }
};

#endif
3 changes: 3 additions & 0 deletions testing/regress/ecl/key/regex_cache_string.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<Dataset name='result'>
<Row><result>PASSED</result></Row>
</Dataset>
3 changes: 3 additions & 0 deletions testing/regress/ecl/key/regex_cache_unicode.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<Dataset name='result'>
<Row><result>PASSED</result></Row>
</Dataset>
3 changes: 3 additions & 0 deletions testing/regress/ecl/key/regex_cache_utf8.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<Dataset name='result'>
<Row><result>PASSED</result></Row>
</Dataset>
70 changes: 70 additions & 0 deletions testing/regress/ecl/regex_cache_string.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*##############################################################################

HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

#OPTION('globalFold', FALSE);

IMPORT Std;

// Test input: 100000 generated rows, each holding one RANDOM() value
// converted to STRING in field a.
regexDS := DATASET
(
100000,
TRANSFORM
(
{STRING a},
SELF.a := (STRING)RANDOM()
),
DISTRIBUTED
);

// For every input row, replace occurrences of the row's first character with
// 'x' in two independent ways and record whether both results agree.
// The regex pattern (LEFT.a[1]) comes from the data rather than being a
// constant, so it varies from row to row -- presumably this is what
// exercises the cache of compiled patterns; confirm against the runtime.
res := PROJECT
(
// NOTE(review): NOFOLD presumably keeps the compiler from evaluating the
// expression at compile time -- confirm.
NOFOLD(regexDS),
TRANSFORM
(
{
RECORDOF(LEFT),
STRING via_regex,
STRING via_find,
BOOLEAN is_matching
},
// Regex path: pattern is the first character of a, applied to a itself.
SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, 'x'),
// Reference path: same substitution done with the string library.
SELF.via_find := Std.Str.SubstituteIncluded(LEFT.a, LEFT.a[1], 'x'),
SELF.is_matching := SELF.via_regex = SELF.via_find,
SELF := LEFT
),
PARALLEL(10)
);

// Tally rows where the two approaches agreed / disagreed.
// numTestsFailed and testsFailed are only consumed by the optional
// diagnostic outputs at the end of the script.
numTests := COUNT(regexDS);
testsPassed := res(is_matching);
numTestsPassed := COUNT(testsPassed);
testsFailed := res(~is_matching);
numTestsFailed := COUNT(testsFailed);

// Fraction of rows that must agree for the test to report PASSED.
// NOTE(review): the reason for accepting less than 100% is not stated here.
MIN_PASS_PERCENTAGE := 0.95;

passedPercentage := numTestsPassed / numTests;
isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE;
resultStr := IF(isSuccess, 'PASSED', 'FAILED');
// Human-readable variant with the pass percentage; used only by the
// optional diagnostic outputs.
fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2));

// Output for unit test parsing
OUTPUT(resultStr, NAMED('result'));

// Uncomment the following to see details
// OUTPUT(numTests, NAMED('num_tests'));
// OUTPUT(numTestsPassed, NAMED('num_passed'));
// OUTPUT(numTestsFailed, NAMED('num_failed'));
// OUTPUT(fullResultStr, NAMED('result_desc'));
// OUTPUT(testsFailed, NAMED('failed_tests'), ALL);
70 changes: 70 additions & 0 deletions testing/regress/ecl/regex_cache_unicode.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*##############################################################################

HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

#OPTION('globalFold', FALSE);

IMPORT Std;

// Test input: 100000 generated rows, each holding one RANDOM() value
// converted to UNICODE in field a.
regexDS := DATASET
(
100000,
TRANSFORM
(
{UNICODE a},
SELF.a := (UNICODE)RANDOM()
),
DISTRIBUTED
);

// For every input row, replace occurrences of the row's first character with
// u'x' in two independent ways and record whether both results agree.
// The regex pattern (LEFT.a[1]) comes from the data rather than being a
// constant, so it varies from row to row -- presumably this is what
// exercises the cache of compiled patterns; confirm against the runtime.
res := PROJECT
(
// NOTE(review): NOFOLD presumably keeps the compiler from evaluating the
// expression at compile time -- confirm.
NOFOLD(regexDS),
TRANSFORM
(
{
RECORDOF(LEFT),
UNICODE via_regex,
UNICODE via_find,
BOOLEAN is_matching
},
// Regex path: pattern is the first character of a, applied to a itself.
SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, u'x'),
// Reference path: same substitution done with the Unicode library, with
// the result cast to UNICODE for comparison.
SELF.via_find := (UNICODE)Std.Uni.SubstituteIncluded(LEFT.a, LEFT.a[1], u'x'),
SELF.is_matching := SELF.via_regex = SELF.via_find,
SELF := LEFT
),
PARALLEL(10)
);

// Tally rows where the two approaches agreed / disagreed.
// numTestsFailed and testsFailed are only consumed by the optional
// diagnostic outputs at the end of the script.
numTests := COUNT(regexDS);
testsPassed := res(is_matching);
numTestsPassed := COUNT(testsPassed);
testsFailed := res(~is_matching);
numTestsFailed := COUNT(testsFailed);

// Fraction of rows that must agree for the test to report PASSED.
// NOTE(review): the reason for accepting less than 100% is not stated here.
MIN_PASS_PERCENTAGE := 0.95;

passedPercentage := numTestsPassed / numTests;
isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE;
resultStr := IF(isSuccess, 'PASSED', 'FAILED');
// Human-readable variant with the pass percentage; used only by the
// optional diagnostic outputs.
fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2));

// Output for unit test parsing
OUTPUT(resultStr, NAMED('result'));

// Uncomment the following to see details
// OUTPUT(numTests, NAMED('num_tests'));
// OUTPUT(numTestsPassed, NAMED('num_passed'));
// OUTPUT(numTestsFailed, NAMED('num_failed'));
// OUTPUT(fullResultStr, NAMED('result_desc'));
// OUTPUT(testsFailed, NAMED('failed_tests'), ALL);
70 changes: 70 additions & 0 deletions testing/regress/ecl/regex_cache_utf8.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*##############################################################################

HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

#OPTION('globalFold', FALSE);

IMPORT Std;

// Test input: 100000 generated rows, each holding one RANDOM() value
// converted to UTF8 in field a.
regexDS := DATASET
(
100000,
TRANSFORM
(
{UTF8 a},
SELF.a := (UTF8)RANDOM()
),
DISTRIBUTED
);

// For every input row, replace occurrences of the row's first character with
// u8'x' in two independent ways and record whether both results agree.
// The regex pattern (LEFT.a[1]) comes from the data rather than being a
// constant, so it varies from row to row -- presumably this is what
// exercises the cache of compiled patterns; confirm against the runtime.
res := PROJECT
(
// NOTE(review): NOFOLD presumably keeps the compiler from evaluating the
// expression at compile time -- confirm.
NOFOLD(regexDS),
TRANSFORM
(
{
RECORDOF(LEFT),
UTF8 via_regex,
UTF8 via_find,
BOOLEAN is_matching
},
// Regex path: pattern is the first character of a, applied to a itself.
SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, u8'x'),
// Reference path: same substitution done with the Unicode library, with
// the result cast to UTF8 for comparison.
SELF.via_find := (UTF8)Std.Uni.SubstituteIncluded(LEFT.a, LEFT.a[1], u8'x'),
SELF.is_matching := SELF.via_regex = SELF.via_find,
SELF := LEFT
),
PARALLEL(10)
);

// Tally rows where the two approaches agreed / disagreed.
// numTestsFailed and testsFailed are only consumed by the optional
// diagnostic outputs at the end of the script.
numTests := COUNT(regexDS);
testsPassed := res(is_matching);
numTestsPassed := COUNT(testsPassed);
testsFailed := res(~is_matching);
numTestsFailed := COUNT(testsFailed);

// Fraction of rows that must agree for the test to report PASSED.
// NOTE(review): the reason for accepting less than 100% is not stated here.
MIN_PASS_PERCENTAGE := 0.95;

passedPercentage := numTestsPassed / numTests;
isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE;
resultStr := IF(isSuccess, 'PASSED', 'FAILED');
// Human-readable variant with the pass percentage; used only by the
// optional diagnostic outputs.
fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2));

// Output for unit test parsing
OUTPUT(resultStr, NAMED('result'));

// Uncomment the following to see details
// OUTPUT(numTests, NAMED('num_tests'));
// OUTPUT(numTestsPassed, NAMED('num_passed'));
// OUTPUT(numTestsFailed, NAMED('num_failed'));
// OUTPUT(fullResultStr, NAMED('result_desc'));
// OUTPUT(testsFailed, NAMED('failed_tests'), ALL);
Loading