Skip to content

Commit

Permalink
Merge pull request #18748 from dcamper/hpcc-31921-regex-caching-master
Browse files Browse the repository at this point in the history
HPCC-31921 Add caching of regex compiled search patterns

Reviewed-by: Gavin Halliday <[email protected]>
Merged-by: Gavin Halliday <[email protected]>
  • Loading branch information
ghalliday authored Jun 20, 2024
2 parents b4d06e3 + 7997a0a commit 683bc67
Show file tree
Hide file tree
Showing 11 changed files with 677 additions and 101 deletions.
2 changes: 1 addition & 1 deletion ecl/hql/hqlfold.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2796,7 +2796,7 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
StringBuffer pattern, search;
v0->getUTF8Value(pattern);
v1->getUTF8Value(search);
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern.lengthUtf8(), pattern, !expr->hasAttribute(noCaseAtom));
compiled->getMatchSet(isAllResult, resultBytes, matchResults.refdata(), search.lengthUtf8(), search.str());
rtlDestroyCompiledU8StrRegExpr(compiled);
}
Expand Down
443 changes: 343 additions & 100 deletions rtl/eclrtl/eclregex.cpp

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions rtl/eclrtl/eclrtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4960,6 +4960,19 @@ unsigned rtlUtf8Size(unsigned len, const void * _data)
return offset;
}

unsigned rtlUtf8Length(const void * _data)
{
const byte * data = (const byte *)_data;
size32_t length = 0;
unsigned offset = 0;
while (data[offset])
{
offset += readUtf8Size(data+offset);
length++;
}
return length;
}

unsigned rtlUtf8Length(unsigned size, const void * _data)
{
const byte * data = (const byte *)_data;
Expand Down
1 change: 1 addition & 0 deletions rtl/eclrtl/eclrtl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,7 @@ ECLRTL_API void rtlStrToVUnicode(unsigned outlen, UChar * out, unsigned inlen, c

ECLRTL_API unsigned rtlUtf8Size(const void * data);
ECLRTL_API unsigned rtlUtf8Size(unsigned len, const void * data);
// Returns the number of UTF-8 characters (not bytes) in a NUL-terminated buffer;
// the buffer must be zero-terminated because no size is supplied.
ECLRTL_API unsigned rtlUtf8Length(const void * data);
ECLRTL_API unsigned rtlUtf8Length(unsigned size, const void * data);
ECLRTL_API unsigned rtlUtf8Char(const void * data);
ECLRTL_API void rtlUtf8ToData(size32_t outlen, void * out, size32_t inlen, const char *in);
Expand Down
100 changes: 100 additions & 0 deletions system/jlib/jhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#define JHASH_HPP

#include <functional>
#include <list>
#include <unordered_map>
#include <utility>

Expand Down Expand Up @@ -687,4 +688,103 @@ class CTimeLimitedCache
}
};

/**
* CLRUCache
*
* Least-Recently-Used cache class, specialized for key and
* value pointer types (the value is a pointer or a data type
* where a nullptr could represent a missing value).
*
* The get() method returns a found object by value. This
* is intentional and very useful for maintaining refcounts.
*
* There is a minimum size for the cache, defined by
* LRU_MIN_CACHE_SIZE. Attempts to create a smaller cache
* will be silently changed to the minimum size. If no
* initial size is provided to the constructor, the cache
* size will be set to LRU_MIN_CACHE_SIZE.
*
* Methods here are not thread-safe. Callers should block
* concurrent access for non-const methods (which are most
* of them).
*/

#define LRU_MIN_CACHE_SIZE 10

/**
 * Implementation notes:
 *  - recentList holds the keys ordered from most recently used (front) to
 *    least recently used (back).
 *  - lookupMap maps each key to its value plus an iterator to the key's node
 *    in recentList, so a hit can be promoted to the front in O(1).
 *  - Because lookupMap stores iterators into this object's own recentList,
 *    a member-wise copy would leave the copy pointing into the source
 *    object's list.  The class is therefore neither copy-constructible nor
 *    copy-assignable.
 */
template <class KEYTYPE, class PTRTYPE>
class CLRUCache
{
private:
    std::list<KEYTYPE> recentList;
    std::unordered_map<KEYTYPE, std::pair<PTRTYPE, typename std::list<KEYTYPE>::iterator>> lookupMap;
    size32_t maxCacheSize;

    // Evict least-recently-used entries until the cache fits within maxCacheSize.
    void _downsize()
    {
        while (lookupMap.size() > maxCacheSize)
        {
            lookupMap.erase(recentList.back());
            recentList.pop_back();
        }
    }

public:
    // Creates a cache holding at most LRU_MIN_CACHE_SIZE entries.
    CLRUCache() : maxCacheSize(LRU_MIN_CACHE_SIZE) {}

    // Creates a cache holding at most _maxCacheSize entries; values smaller than
    // LRU_MIN_CACHE_SIZE are silently raised to LRU_MIN_CACHE_SIZE.
    CLRUCache(size32_t _maxCacheSize) : maxCacheSize(_maxCacheSize < LRU_MIN_CACHE_SIZE ? LRU_MIN_CACHE_SIZE : _maxCacheSize) {}

    CLRUCache(const CLRUCache& other) = delete;
    // Copy assignment must be deleted as well: the implicitly generated operator
    // would copy lookupMap's iterators, which would then refer to nodes owned by
    // the other cache's recentList (undefined behaviour on the next access).
    CLRUCache& operator=(const CLRUCache& other) = delete;
    ~CLRUCache() = default;

    // Returns the number of entries currently held (not the capacity).
    size32_t getCacheSize() const
    {
        return static_cast<size32_t>(lookupMap.size());
    }

    // Changes the capacity (clamped to at least LRU_MIN_CACHE_SIZE), evicting
    // least-recently-used entries if the cache is now too large.
    // Returns the capacity actually applied.
    size32_t setMaxCacheSize(size32_t _maxCacheSize)
    {
        maxCacheSize = _maxCacheSize < LRU_MIN_CACHE_SIZE ? LRU_MIN_CACHE_SIZE : _maxCacheSize;
        _downsize();
        return maxCacheSize;
    }

    // Looks up key.  On a hit the entry becomes the most recently used and its
    // value is returned by value; on a miss nullptr is returned.
    PTRTYPE get(const KEYTYPE& key)
    {
        auto foundIter = lookupMap.find(key);
        if (foundIter == lookupMap.end())
            return nullptr;

        // Splicing within the same list relinks the node without invalidating
        // the stored iterator, so the map entry needs no update.
        recentList.splice(recentList.begin(), recentList, foundIter->second.second);
        return foundIter->second.first;
    }

    // Inserts or replaces the value for key and marks it most recently used.
    // Inserting a new key may evict the least-recently-used entry.
    void set(const KEYTYPE& key, const PTRTYPE& value)
    {
        auto foundIter = lookupMap.find(key);

        if (foundIter == lookupMap.end())
        {
            recentList.push_front(key);
            // emplace avoids the second hash lookup and the default-constructed
            // temporary that operator[] would create.
            lookupMap.emplace(key, std::make_pair(value, recentList.begin()));
            _downsize();
        }
        else
        {
            recentList.splice(recentList.begin(), recentList, foundIter->second.second);
            foundIter->second.first = value;
        }
    }

    // Removes key from the cache.  Returns true if an entry was removed,
    // false if the key was not present.
    bool remove(const KEYTYPE& key)
    {
        auto foundIter = lookupMap.find(key);
        if (foundIter == lookupMap.end())
            return false;

        recentList.erase(foundIter->second.second);
        lookupMap.erase(foundIter);
        return true;
    }
};

#endif
3 changes: 3 additions & 0 deletions testing/regress/ecl/key/regex_cache_string.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<Dataset name='result'>
<Row><result>PASSED</result></Row>
</Dataset>
3 changes: 3 additions & 0 deletions testing/regress/ecl/key/regex_cache_unicode.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<Dataset name='result'>
<Row><result>PASSED</result></Row>
</Dataset>
3 changes: 3 additions & 0 deletions testing/regress/ecl/key/regex_cache_utf8.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<Dataset name='result'>
<Row><result>PASSED</result></Row>
</Dataset>
70 changes: 70 additions & 0 deletions testing/regress/ecl/regex_cache_string.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*##############################################################################
HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

// Regression test for REGEXREPLACE on STRING data where the search pattern
// changes from row to row.  Each regex result is cross-checked against a plain
// character substitution, and a single PASSED/FAILED value is emitted.

// NOTE(review): presumably disables global constant folding so the regex calls
// are evaluated at run time rather than at compile time -- confirm.
#OPTION('globalFold', FALSE);

IMPORT Std;

// 100,000 rows, each holding the string form of a RANDOM() value in field 'a'.
regexDS := DATASET
(
100000,
TRANSFORM
(
{STRING a},
SELF.a := (STRING)RANDOM()
),
DISTRIBUTED
);

// For every row, replace the first character of 'a' wherever it appears in 'a'
// with 'x', once via REGEXREPLACE (pattern = that first character) and once via
// Std.Str.SubstituteIncluded, then record whether the two results are equal.
// NOTE(review): PARALLEL(10) presumably makes the transform run concurrently so
// the regex calls overlap -- confirm against the PROJECT documentation.
res := PROJECT
(
NOFOLD(regexDS),
TRANSFORM
(
{
RECORDOF(LEFT),
STRING via_regex,
STRING via_find,
BOOLEAN is_matching
},
SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, 'x'),
SELF.via_find := Std.Str.SubstituteIncluded(LEFT.a, LEFT.a[1], 'x'),
SELF.is_matching := SELF.via_regex = SELF.via_find,
SELF := LEFT
),
PARALLEL(10)
);

// Tally of rows where the two approaches agreed / disagreed.
numTests := COUNT(regexDS);
testsPassed := res(is_matching);
numTestsPassed := COUNT(testsPassed);
testsFailed := res(~is_matching);
numTestsFailed := COUNT(testsFailed);

// Fraction of rows that must agree for the test to pass.
// NOTE(review): the reason for tolerating up to 5% mismatches is not visible here.
MIN_PASS_PERCENTAGE := 0.95;

passedPercentage := numTestsPassed / numTests;
isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE;
resultStr := IF(isSuccess, 'PASSED', 'FAILED');
// Human-readable summary; only referenced by the commented-out diagnostics below.
fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2));

// Output for unit test parsing
OUTPUT(resultStr, NAMED('result'));

// Uncomment the following to see details
// OUTPUT(numTests, NAMED('num_tests'));
// OUTPUT(numTestsPassed, NAMED('num_passed'));
// OUTPUT(numTestsFailed, NAMED('num_failed'));
// OUTPUT(fullResultStr, NAMED('result_desc'));
// OUTPUT(testsFailed, NAMED('failed_tests'), ALL);
70 changes: 70 additions & 0 deletions testing/regress/ecl/regex_cache_unicode.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*##############################################################################
HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

// Regression test for REGEXREPLACE on UNICODE data where the search pattern
// changes from row to row.  Each regex result is cross-checked against a plain
// character substitution, and a single PASSED/FAILED value is emitted.

// NOTE(review): presumably disables global constant folding so the regex calls
// are evaluated at run time rather than at compile time -- confirm.
#OPTION('globalFold', FALSE);

IMPORT Std;

// 100,000 rows, each holding the UNICODE form of a RANDOM() value in field 'a'.
regexDS := DATASET
(
100000,
TRANSFORM
(
{UNICODE a},
SELF.a := (UNICODE)RANDOM()
),
DISTRIBUTED
);

// For every row, replace the first character of 'a' wherever it appears in 'a'
// with u'x', once via REGEXREPLACE (pattern = that first character) and once via
// Std.Uni.SubstituteIncluded, then record whether the two results are equal.
// NOTE(review): PARALLEL(10) presumably makes the transform run concurrently so
// the regex calls overlap -- confirm against the PROJECT documentation.
res := PROJECT
(
NOFOLD(regexDS),
TRANSFORM
(
{
RECORDOF(LEFT),
UNICODE via_regex,
UNICODE via_find,
BOOLEAN is_matching
},
SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, u'x'),
SELF.via_find := (UNICODE)Std.Uni.SubstituteIncluded(LEFT.a, LEFT.a[1], u'x'),
SELF.is_matching := SELF.via_regex = SELF.via_find,
SELF := LEFT
),
PARALLEL(10)
);

// Tally of rows where the two approaches agreed / disagreed.
numTests := COUNT(regexDS);
testsPassed := res(is_matching);
numTestsPassed := COUNT(testsPassed);
testsFailed := res(~is_matching);
numTestsFailed := COUNT(testsFailed);

// Fraction of rows that must agree for the test to pass.
// NOTE(review): the reason for tolerating up to 5% mismatches is not visible here.
MIN_PASS_PERCENTAGE := 0.95;

passedPercentage := numTestsPassed / numTests;
isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE;
resultStr := IF(isSuccess, 'PASSED', 'FAILED');
// Human-readable summary; only referenced by the commented-out diagnostics below.
fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2));

// Output for unit test parsing
OUTPUT(resultStr, NAMED('result'));

// Uncomment the following to see details
// OUTPUT(numTests, NAMED('num_tests'));
// OUTPUT(numTestsPassed, NAMED('num_passed'));
// OUTPUT(numTestsFailed, NAMED('num_failed'));
// OUTPUT(fullResultStr, NAMED('result_desc'));
// OUTPUT(testsFailed, NAMED('failed_tests'), ALL);
70 changes: 70 additions & 0 deletions testing/regress/ecl/regex_cache_utf8.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*##############################################################################
HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

// Regression test for REGEXREPLACE on UTF8 data where the search pattern
// changes from row to row.  Each regex result is cross-checked against a plain
// character substitution, and a single PASSED/FAILED value is emitted.

// NOTE(review): presumably disables global constant folding so the regex calls
// are evaluated at run time rather than at compile time -- confirm.
#OPTION('globalFold', FALSE);

IMPORT Std;

// 100,000 rows, each holding the UTF8 form of a RANDOM() value in field 'a'.
regexDS := DATASET
(
100000,
TRANSFORM
(
{UTF8 a},
SELF.a := (UTF8)RANDOM()
),
DISTRIBUTED
);

// For every row, replace the first character of 'a' wherever it appears in 'a'
// with u8'x', once via REGEXREPLACE (pattern = that first character) and once via
// Std.Uni.SubstituteIncluded, then record whether the two results are equal.
// NOTE(review): PARALLEL(10) presumably makes the transform run concurrently so
// the regex calls overlap -- confirm against the PROJECT documentation.
res := PROJECT
(
NOFOLD(regexDS),
TRANSFORM
(
{
RECORDOF(LEFT),
UTF8 via_regex,
UTF8 via_find,
BOOLEAN is_matching
},
SELF.via_regex := REGEXREPLACE(LEFT.a[1], LEFT.a, u8'x'),
SELF.via_find := (UTF8)Std.Uni.SubstituteIncluded(LEFT.a, LEFT.a[1], u8'x'),
SELF.is_matching := SELF.via_regex = SELF.via_find,
SELF := LEFT
),
PARALLEL(10)
);

// Tally of rows where the two approaches agreed / disagreed.
numTests := COUNT(regexDS);
testsPassed := res(is_matching);
numTestsPassed := COUNT(testsPassed);
testsFailed := res(~is_matching);
numTestsFailed := COUNT(testsFailed);

// Fraction of rows that must agree for the test to pass.
// NOTE(review): the reason for tolerating up to 5% mismatches is not visible here.
MIN_PASS_PERCENTAGE := 0.95;

passedPercentage := numTestsPassed / numTests;
isSuccess := passedPercentage >= MIN_PASS_PERCENTAGE;
resultStr := IF(isSuccess, 'PASSED', 'FAILED');
// Human-readable summary; only referenced by the commented-out diagnostics below.
fullResultStr := resultStr + ': ' + (STRING)(ROUND(passedPercentage * 100, 2));

// Output for unit test parsing
OUTPUT(resultStr, NAMED('result'));

// Uncomment the following to see details
// OUTPUT(numTests, NAMED('num_tests'));
// OUTPUT(numTestsPassed, NAMED('num_passed'));
// OUTPUT(numTestsFailed, NAMED('num_failed'));
// OUTPUT(fullResultStr, NAMED('result_desc'));
// OUTPUT(testsFailed, NAMED('failed_tests'), ALL);

0 comments on commit 683bc67

Please sign in to comment.