From 93cc5bc94a3e9b8ca771d541b8dbe871374a43c1 Mon Sep 17 00:00:00 2001 From: Jake Smith Date: Fri, 13 Dec 2024 10:26:47 +0000 Subject: [PATCH] HPCC-33126 Add FNV-1a hashing function and use in jptree Resolves some pathologically slow cases of hash collisions in jptree maps. Signed-off-by: Jake Smith --- dali/daliadmin/daadmin.cpp | 5 +- dali/daliadmin/daadmin.hpp | 2 +- dali/daliadmin/daliadmin.cpp | 4 +- system/jlib/jhash.cpp | 145 +++++++++++++++++++++++++++++--- system/jlib/jhash.hpp | 26 ++++-- system/jlib/jlzw.cpp | 2 +- system/jlib/jptree.cpp | 2 +- system/jlib/jptree.ipp | 6 +- system/jlib/jsuperhash.hpp | 13 ++- testing/unittests/jlibtests.cpp | 122 ++++++++++++++++++++++++++- 10 files changed, 292 insertions(+), 35 deletions(-) diff --git a/dali/daliadmin/daadmin.cpp b/dali/daliadmin/daadmin.cpp index cee095e3321..837fe3882a4 100644 --- a/dali/daliadmin/daadmin.cpp +++ b/dali/daliadmin/daadmin.cpp @@ -2472,7 +2472,7 @@ void xmlSize(const char *filename, double pc) } } -void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML) +void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML, bool freePTree) { OwnedIFile iFile = createIFile(filename); OwnedIFileIO iFileIO = iFile->open(IFOread); @@ -2544,7 +2544,8 @@ void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool PROGLOG("Save took: %.2f", (float)timer.elapsedMs()/1000); } - ::LINK(iMaker->queryRoot()); // intentionally leak (avoid time clearing up) + if (!freePTree) + ::LINK(iMaker->queryRoot()); // intentionally leak (avoid time clearing up) } void translateToXpath(const char *logicalfile, DfsXmlBranchKind tailType) diff --git a/dali/daliadmin/daadmin.hpp b/dali/daliadmin/daadmin.hpp index 9230e245d41..b6f27473e45 100644 --- a/dali/daliadmin/daadmin.hpp +++ b/dali/daliadmin/daadmin.hpp @@ -28,7 +28,7 @@ namespace daadmin extern DALIADMIN_API void setDaliConnectTimeoutMs(unsigned timeoutMs); extern DALIADMIN_API void xmlSize(const char *filename, double pc); -extern DALIADMIN_API void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML); +extern DALIADMIN_API void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML, bool freePTree); extern DALIADMIN_API void translateToXpath(const char *logicalfile, DfsXmlBranchKind tailType = DXB_File); extern DALIADMIN_API void exportToFile(const char *path, const char *filename, bool safe = false); diff --git a/dali/daliadmin/daliadmin.cpp b/dali/daliadmin/daliadmin.cpp index c402fa6f62c..f3612d39606 100644 --- a/dali/daliadmin/daliadmin.cpp +++ b/dali/daliadmin/daliadmin.cpp @@ -233,13 +233,15 @@ int main(int argc, const char* argv[]) { bool useLowMemPTree = false; bool saveFormatedTree = false; + bool freePTree = false; bool parseOnly = getComponentConfigSP()->getPropBool("@parseonly"); if (!parseOnly) { useLowMemPTree = getComponentConfigSP()->getPropBool("@lowmem"); saveFormatedTree = getComponentConfigSP()->getPropBool("@savexml"); + freePTree = getComponentConfigSP()->getPropBool("@free"); } - loadXMLTest(params.item(1), parseOnly, useLowMemPTree, saveFormatedTree); + loadXMLTest(params.item(1), parseOnly, useLowMemPTree, saveFormatedTree, freePTree); } else { diff --git a/system/jlib/jhash.cpp b/system/jlib/jhash.cpp index da42a193ef2..5ea87918a9d 100644 --- a/system/jlib/jhash.cpp +++ b/system/jlib/jhash.cpp @@ -64,12 +64,14 @@ MODULE_EXIT() // do not want these in your leak checking call releaseAtoms() } -#define HASHONE(hash, c) { hash *= 0x01000193; hash ^= c; } // Fowler/Noll/Vo Hash... seems to work pretty well, and fast +#define HASHONE(hash, c) { hash *= fnvPrime32; hash ^= c; } // Fowler/Noll/Vo Hash... seems to work pretty well, and fast +#define HASHONE_FNV1a(hash, c) { hash ^= c; hash *= fnvPrime32; } //Following is potentially quicker, but not tested //#define HASHONE(hash, c) { hash += (hash<<1) + (hash<<4) + (hash<<7) + (hash<<8) + (hash<<24); hash ^= c; } -unsigned hashc( const unsigned char *k, unsigned length, unsigned initval) +// deprecatedHash* functions (non template versions) kept for unittest only +unsigned deprecatedHashc(const unsigned char *k, unsigned length, unsigned initval) { unsigned hash = initval; unsigned char c; @@ -94,7 +96,7 @@ unsigned hashc( const unsigned char *k, unsigned length, unsigned initval) return hash; } -unsigned hashcz( const unsigned char *k, unsigned initval) +unsigned deprecatedHashcz(const unsigned char *k, unsigned initval) { unsigned hash = initval; for (;;) @@ -107,8 +109,87 @@ unsigned hashcz( const unsigned char *k, unsigned initval) return hash; } -template -inline unsigned doHashValue( T value, unsigned initval) +// FNV Hash Operations +struct FNV1 +{ + static inline void apply(unsigned &hash, unsigned char c) + { + hash *= fnvPrime32; + hash ^= c; + } +}; + +struct FNV1a +{ + static inline void apply(unsigned &hash, unsigned char c) + { + hash ^= c; + hash *= fnvPrime32; + } +}; + +template +unsigned policyHashc(const unsigned char *k, unsigned length, unsigned initval) +{ + unsigned hash = initval; + unsigned char c; + while (length >= 8) + { + c = (*k++); HashPolicy::apply(hash, c); + c = (*k++); HashPolicy::apply(hash, c); + c = (*k++); HashPolicy::apply(hash, c); + c = (*k++); HashPolicy::apply(hash, c); + length-=4; + } + switch (length) + { + case 7: c = (*k++); HashPolicy::apply(hash, c); // fallthrough + case 6: c = (*k++); HashPolicy::apply(hash, c); // fallthrough + case 5: c = (*k++); HashPolicy::apply(hash, c); // fallthrough + case 4: c = (*k++); HashPolicy::apply(hash, c); // fallthrough + case 3: c = (*k++); HashPolicy::apply(hash, c); // fallthrough + case 2: c = (*k++); HashPolicy::apply(hash, c); // fallthrough + case 1: c = (*k++); HashPolicy::apply(hash, c); + } + return hash; +} + +template +unsigned policyHashcz(const unsigned char *k, unsigned initval) +{ + unsigned hash = initval; + for (;;) + { + unsigned char c = (*k++); + if (c == 0) + break; + HashPolicy::apply(hash, c); + } + return hash; +} + +unsigned hashc(const unsigned char *k, unsigned length, unsigned initval) +{ + return policyHashc(k, length, initval); +} + +unsigned hashcz(const unsigned char *k, unsigned initval) +{ + return policyHashcz(k, initval); +} + +unsigned hashc_fnv1a(const unsigned char *k, unsigned length, unsigned initval) +{ + return policyHashc(k, length, initval); +} + +unsigned hashcz_fnv1a(const unsigned char *k, unsigned initval) +{ + return policyHashcz(k, initval); +} + +template +inline unsigned doHashValue(T value, unsigned initval) { //The values returned from this function are only consistent with those from hashn() if running on little endian architecture unsigned hash = initval; @@ -117,30 +198,47 @@ inline unsigned doHashValue( T value, unsigned initval) { c = (byte)value; value >>= 8; - HASHONE(hash, c); + HashPolicy::apply(hash, c); } return hash; } -unsigned hashvalue( unsigned value, unsigned initval) +unsigned hashvalue(unsigned value, unsigned initval) { - return doHashValue(value, initval); + return doHashValue(value, initval); } unsigned hashvalue( unsigned __int64 value, unsigned initval) { - return doHashValue(value, initval); + return doHashValue(value, initval); } unsigned hashvalue( const void * value, unsigned initval) { - return doHashValue((memsize_t)value, initval); + return doHashValue((memsize_t)value, initval); +} + +unsigned hashvalue_fnv1a(unsigned value, unsigned initval) +{ + return doHashValue(value, initval); +} + +unsigned hashvalue_fnv1a(unsigned __int64 value, unsigned initval) +{ + return doHashValue(value, initval); +} + +unsigned hashvalue_fnv1a(const void * value, unsigned initval) +{ + return doHashValue((memsize_t)value, initval); } + #define GETWORDNC(k,n) ((GETBYTE0(n)+GETBYTE1(n)+GETBYTE2(n)+GETBYTE3(n))&0xdfdfdfdf) -unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval) +template +unsigned policyHashnc(const unsigned char *k, unsigned length, unsigned initval) { unsigned hash = initval; unsigned char c; @@ -165,8 +263,8 @@ unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval) return hash; } - -unsigned hashncz( const unsigned char *k, unsigned initval) +template +unsigned policyHashncz(const unsigned char *k, unsigned initval) { unsigned hash = initval; for (;;) @@ -180,6 +278,26 @@ unsigned hashncz( const unsigned char *k, unsigned initval) return hash; } +unsigned hashnc(const unsigned char *k, unsigned length, unsigned initval) +{ + return policyHashnc(k, length, initval); +} + +unsigned hashncz(const unsigned char *k, unsigned initval) +{ + return policyHashncz(k, initval); +} + +unsigned hashnc_fnv1a(const unsigned char *k, unsigned length, unsigned initval) +{ + return policyHashnc(k, length, initval); +} + +unsigned hashncz_fnv1a(const unsigned char *k, unsigned initval) +{ + return policyHashncz(k, initval); +} + MappingKey::MappingKey(const void * inKey, int keysize) { int ksm = keysize; @@ -201,6 +319,7 @@ MappingKey::MappingKey(const void * inKey, int keysize) key = temp; } + //-- Mapping --------------------------------------------------- unsigned MappingBase::getHash() const { return hash; } diff --git a/system/jlib/jhash.hpp b/system/jlib/jhash.hpp index c8790fe2e4c..48e06f1e34e 100644 --- a/system/jlib/jhash.hpp +++ b/system/jlib/jhash.hpp @@ -425,13 +425,25 @@ extern jlib_decl IIdAtom * createIdAtom(const char *value); extern jlib_decl IIdAtom * createIdAtom(const char *value, size32_t len); extern jlib_decl void releaseAtoms(); -extern jlib_decl unsigned hashc( const unsigned char *k, unsigned length, unsigned initval); -extern jlib_decl unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval); -extern jlib_decl unsigned hashcz( const unsigned char *k, unsigned initval); -extern jlib_decl unsigned hashncz( const unsigned char *k, unsigned initval); -extern jlib_decl unsigned hashvalue( unsigned value, unsigned initval); -extern jlib_decl unsigned hashvalue( unsigned __int64 value, unsigned initval); -extern jlib_decl unsigned hashvalue( const void * value, unsigned initval); + +extern jlib_decl unsigned deprecatedHashc(const unsigned char *k, unsigned length, unsigned initval); +extern jlib_decl unsigned deprecatedHashcz(const unsigned char *k, unsigned initval); + +extern jlib_decl unsigned hashc(const unsigned char *k, unsigned length, unsigned initval); +extern jlib_decl unsigned hashnc(const unsigned char *k, unsigned length, unsigned initval); +extern jlib_decl unsigned hashcz(const unsigned char *k, unsigned initval); +extern jlib_decl unsigned hashncz(const unsigned char *k, unsigned initval); +extern jlib_decl unsigned hashvalue(unsigned value, unsigned initval); +extern jlib_decl unsigned hashvalue(unsigned __int64 value, unsigned initval); +extern jlib_decl unsigned hashvalue(const void * value, unsigned initval); + +extern jlib_decl unsigned hashc_fnv1a(const unsigned char *k, unsigned length, unsigned initval); +extern jlib_decl unsigned hashnc_fnv1a(const unsigned char *k, unsigned length, unsigned initval); +extern jlib_decl unsigned hashcz_fnv1a(const unsigned char *k, unsigned initval); +extern jlib_decl unsigned hashncz_fnv1a(const unsigned char *k, unsigned initval); +extern jlib_decl unsigned hashvalue_fnv1a(unsigned value, unsigned initval); +extern jlib_decl unsigned hashvalue_fnv1a(unsigned __int64 value, unsigned initval); +extern jlib_decl unsigned hashvalue_fnv1a(const void * value, unsigned initval); //================================================ // Minimal Hash table template - slightly less overhead that HashTable/SuperHashTable diff --git a/system/jlib/jlzw.cpp b/system/jlib/jlzw.cpp index 3c18d04dc93..e66dda15d49 100644 --- a/system/jlib/jlzw.cpp +++ b/system/jlib/jlzw.cpp @@ -310,7 +310,7 @@ void CLZWCompressor::open(void *buf,size32_t max) -#define HASHC(code,ch) (((0x01000193*(unsigned)code)^(unsigned char)ch)%LZW_HASH_TABLE_SIZE) +#define HASHC(code,ch) (((fnvPrime32*(unsigned)code)^(unsigned char)ch)%LZW_HASH_TABLE_SIZE) #define BE_MEMCPY4(dst,src) { if (supportbigendian) _WINCPYREV4(dst,src); else memcpy(dst,src,4); } diff --git a/system/jlib/jptree.cpp b/system/jlib/jptree.cpp index 5b4151b2461..49de17a2900 100644 --- a/system/jlib/jptree.cpp +++ b/system/jlib/jptree.cpp @@ -3847,7 +3847,7 @@ unsigned CAtomPTree::queryHash() const { const char *_name = name.get(); size32_t nl = strlen(_name); - return isnocase() ? hashnc((const byte *) _name, nl, fnvInitialHash32): hashc((const byte *) _name, nl, fnvInitialHash32); + return isnocase() ? hashnc_fnv1a((const byte *) _name, nl, fnvInitialHash32): hashc_fnv1a((const byte *) _name, nl, fnvInitialHash32); } } diff --git a/system/jlib/jptree.ipp b/system/jlib/jptree.ipp index fd5cb63fd89..4c5c0d194a7 100644 --- a/system/jlib/jptree.ipp +++ b/system/jlib/jptree.ipp @@ -58,7 +58,7 @@ protected: virtual unsigned getHashFromElement(const void *e) const override; virtual unsigned getHashFromFindParam(const void *fp) const override { - return hashcz((const unsigned char *)fp, fnvInitialHash32); + return hashcz_fnv1a((const unsigned char *)fp, fnvInitialHash32); } virtual bool matchesFindParam(const void *e, const void *fp, unsigned fphash) const override { @@ -108,7 +108,7 @@ public: // SuperHashTable definitions virtual unsigned getHashFromFindParam(const void *fp) const override { - return hashncz((const unsigned char *)fp, fnvInitialHash32); + return hashncz_fnv1a((const unsigned char *)fp, fnvInitialHash32); } virtual bool matchesFindParam(const void *e, const void *fp, unsigned fphash) const override { @@ -844,7 +844,7 @@ public: const char *myname = queryName(); assert(myname); size32_t nl = strlen(myname); - return isnocase() ? hashnc((const byte *)myname, nl, fnvInitialHash32): hashc((const byte *)myname, nl, fnvInitialHash32); + return isnocase() ? hashnc_fnv1a((const byte *)myname, nl, fnvInitialHash32): hashc_fnv1a((const byte *)myname, nl, fnvInitialHash32); } virtual void setName(const char *_name) override; virtual void setAttribute(const char *attr, const char *val, bool encoded) override; diff --git a/system/jlib/jsuperhash.hpp b/system/jlib/jsuperhash.hpp index 2c82a8fb5e8..51f4d62078c 100644 --- a/system/jlib/jsuperhash.hpp +++ b/system/jlib/jsuperhash.hpp @@ -29,11 +29,16 @@ #include "jmutex.hpp" constexpr unsigned fnvInitialHash32 = 0x811C9DC5; +constexpr unsigned fnvPrime32 = 0x01000193; extern jlib_decl unsigned hashc( const unsigned char *k, unsigned length, unsigned initval); extern jlib_decl unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval); extern jlib_decl unsigned hashcz( const unsigned char *k, unsigned initval); extern jlib_decl unsigned hashncz( const unsigned char *k, unsigned initval); +extern jlib_decl unsigned hashc_fnv1a(const unsigned char *k, unsigned length, unsigned initval); +extern jlib_decl unsigned hashnc_fnv1a(const unsigned char *k, unsigned length, unsigned initval); +extern jlib_decl unsigned hashcz_fnv1a(const unsigned char *k, unsigned initval); +extern jlib_decl unsigned hashncz_fnv1a(const unsigned char *k, unsigned initval); class jlib_decl SuperHashTable : public CInterface { @@ -516,9 +521,9 @@ class jlib_decl AtomRefTable : public SuperHashTableOfkeyPtr()), key, l+1); if (nocase) - hke->hashValue = hashnc((const unsigned char *)key, l, fnvInitialHash32); + hke->hashValue = hashnc_fnv1a((const unsigned char *)key, l, fnvInitialHash32); else - hke->hashValue = hashc((const unsigned char *)key, l, fnvInitialHash32); + hke->hashValue = hashc_fnv1a((const unsigned char *)key, l, fnvInitialHash32); hke->linkCount = 0; return hke; } @@ -611,9 +616,9 @@ class jlib_decl AtomRefTable : public SuperHashTableOf -#include #include +#include +#include +#include +#include + #include "jsem.hpp" #include "jfile.hpp" #include "jdebug.hpp" @@ -3055,5 +3058,120 @@ class JLibUnicodeTest : public CppUnit::TestFixture CPPUNIT_TEST_SUITE_REGISTRATION( JLibUnicodeTest ); CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( JLibUnicodeTest, "JLibUnicodeTest" ); +class HashFuncTests : public CppUnit::TestFixture +{ +public: + virtual void setUp() override + { + generateFixedRandomNullTermiantedStrings(); + } + + CPPUNIT_TEST_SUITE(HashFuncTests); + CPPUNIT_TEST(fnvTests); + CPPUNIT_TEST_SUITE_END(); + +protected: + static constexpr unsigned maxCalls = 10'000'000; + static constexpr unsigned minLen = 5, maxLen = 100; + static constexpr unsigned lenRange = maxLen - minLen + 1; + static constexpr unsigned randomSeed = 42; + static constexpr size_t testBufferSize = 1'000'000; + CCycleTimer timer; + std::vector buffer; + + unsigned getOffsetLenHash(unsigned offset, unsigned hash) + { + hash ^= (offset * 0x27D4EB2D); // use MurMurHash3 multiplier to introduce more randomness + hash *= fnvPrime32; + return hash; + } + void generateFixedRandomNullTermiantedStrings() + { + buffer.resize(testBufferSize); + std::mt19937 rng(randomSeed); + std::uniform_int_distribution dist(1, 255); + + unsigned offset = 0; + unsigned lenHash = fnvInitialHash32; + while (offset < testBufferSize) + { + // create str lengths between min and max based on offset, + // so that we can predictably read them back + lenHash = getOffsetLenHash(offset, lenHash); + unsigned len = (lenHash % lenRange) + minLen; + + if (offset + len + 1 >= testBufferSize) + break; + + for (unsigned i=0; i + void testHashc() + { + unsigned hashResult = fnvInitialHash32; + + unsigned offset = 0; + for (unsigned i=0; i buffer.size()) + offset = 0; + + hashResult ^= HASHCFUNC(&buffer[offset], len, hashResult); + offset += len; + } + CPPUNIT_ASSERT(hashResult != 0); + } + template + void testHashcz() + { + unsigned hashResult = fnvInitialHash32; + + unsigned lenHash = fnvInitialHash32; + unsigned offset = 0; + for (unsigned i=0; i buffer.size()) + { + offset = 0; + lenHash = getOffsetLenHash(offset, fnvInitialHash32); + len = (lenHash % lenRange) + minLen; + } + dbgassertex(len == strlen((const char *)&buffer[offset])); + hashResult ^= HASHCZFUNC(&buffer[offset], hashResult); + offset += len + 1; + } + CPPUNIT_ASSERT(hashResult != 0); + } + void measure(const char *funcName, const std::function &func) + { + timer.reset(); + func(); + unsigned elapsed = timer.elapsedMs(); + double throughput = static_cast(maxCalls) / elapsed * 1000; + PROGLOG("%s: %u calls took %u ms (%.2f hashes/sec)", funcName, maxCalls, elapsed, throughput); + } + void fnvTests() + { + measure("deprecatedHashc (fnv1)", [this]() { testHashc(); }); + measure("deprecatedHashcz (fnv1)", [this]() { testHashcz(); }); + measure("hashc (fnv1)", [this]() { testHashc(); }); + measure("hashcz (fnv1)", [this]() { testHashcz(); }); + measure("hashc_fnv1a", [this]() { testHashc(); }); + measure("hashcz_fnv1a", [this]() { testHashcz(); }); + } +}; + +CPPUNIT_TEST_SUITE_REGISTRATION( HashFuncTests ); +CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( HashFuncTests, "HashFuncTests" ); #endif // _USE_CPPUNIT