Skip to content

Commit

Permalink
HPCC-33126 Add FNV-1a hashing function and use in jptree
Browse files Browse the repository at this point in the history
Resolves some pathologically slow cases of hash collisions in
jptree maps.

Signed-off-by: Jake Smith <[email protected]>
  • Loading branch information
jakesmith committed Dec 13, 2024
1 parent d7baff7 commit 93cc5bc
Show file tree
Hide file tree
Showing 10 changed files with 292 additions and 35 deletions.
5 changes: 3 additions & 2 deletions dali/daliadmin/daadmin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2472,7 +2472,7 @@ void xmlSize(const char *filename, double pc)
}
}

void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML)
void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML, bool freePTree)
{
OwnedIFile iFile = createIFile(filename);
OwnedIFileIO iFileIO = iFile->open(IFOread);
Expand Down Expand Up @@ -2544,7 +2544,8 @@ void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool
PROGLOG("Save took: %.2f", (float)timer.elapsedMs()/1000);
}

::LINK(iMaker->queryRoot()); // intentionally leak (avoid time clearing up)
if (!freePTree)
::LINK(iMaker->queryRoot()); // intentionally leak (avoid time clearing up)
}

void translateToXpath(const char *logicalfile, DfsXmlBranchKind tailType)
Expand Down
2 changes: 1 addition & 1 deletion dali/daliadmin/daadmin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace daadmin

extern DALIADMIN_API void setDaliConnectTimeoutMs(unsigned timeoutMs);
extern DALIADMIN_API void xmlSize(const char *filename, double pc);
extern DALIADMIN_API void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML);
extern DALIADMIN_API void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML, bool freePTree);
extern DALIADMIN_API void translateToXpath(const char *logicalfile, DfsXmlBranchKind tailType = DXB_File);

extern DALIADMIN_API void exportToFile(const char *path, const char *filename, bool safe = false);
Expand Down
4 changes: 3 additions & 1 deletion dali/daliadmin/daliadmin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,13 +233,15 @@ int main(int argc, const char* argv[])
{
bool useLowMemPTree = false;
bool saveFormatedTree = false;
bool freePTree = false;
bool parseOnly = getComponentConfigSP()->getPropBool("@parseonly");
if (!parseOnly)
{
useLowMemPTree = getComponentConfigSP()->getPropBool("@lowmem");
saveFormatedTree = getComponentConfigSP()->getPropBool("@savexml");
freePTree = getComponentConfigSP()->getPropBool("@free");
}
loadXMLTest(params.item(1), parseOnly, useLowMemPTree, saveFormatedTree);
loadXMLTest(params.item(1), parseOnly, useLowMemPTree, saveFormatedTree, freePTree);
}
else
{
Expand Down
145 changes: 132 additions & 13 deletions system/jlib/jhash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,14 @@ MODULE_EXIT()
// do not want these in your leak checking call releaseAtoms()
}

#define HASHONE(hash, c) { hash *= 0x01000193; hash ^= c; } // Fowler/Noll/Vo Hash... seems to work pretty well, and fast
#define HASHONE(hash, c) { hash *= fnvPrime32; hash ^= c; } // Fowler/Noll/Vo Hash... seems to work pretty well, and fast
#define HASHONE_FNV1a(hash, c) { hash ^= c; hash *= fnvPrime32; }

//Following is potentially quicker, but not tested
//#define HASHONE(hash, c) { hash += (hash<<1) + (hash<<4) + (hash<<7) + (hash<<8) + (hash<<24); hash ^= c; }

unsigned hashc( const unsigned char *k, unsigned length, unsigned initval)
// deprecatedHash* functions (non template versions) kept for unittest only
unsigned deprecatedHashc(const unsigned char *k, unsigned length, unsigned initval)
{
unsigned hash = initval;
unsigned char c;
Expand All @@ -94,7 +96,7 @@ unsigned hashc( const unsigned char *k, unsigned length, unsigned initval)
return hash;
}

unsigned hashcz( const unsigned char *k, unsigned initval)
unsigned deprecatedHashcz(const unsigned char *k, unsigned initval)
{
unsigned hash = initval;
for (;;)
Expand All @@ -107,8 +109,87 @@ unsigned hashcz( const unsigned char *k, unsigned initval)
return hash;
}

template <typename T>
inline unsigned doHashValue( T value, unsigned initval)
// FNV Hash Operations
struct FNV1
{
static inline void apply(unsigned &hash, unsigned char c)
{
hash *= fnvPrime32;
hash ^= c;
}
};

struct FNV1a
{
static inline void apply(unsigned &hash, unsigned char c)
{
hash ^= c;
hash *= fnvPrime32;
}
};

template <typename HashPolicy>
unsigned policyHashc(const unsigned char *k, unsigned length, unsigned initval)
{
unsigned hash = initval;
unsigned char c;
while (length >= 8)
{
c = (*k++); HashPolicy::apply(hash, c);
c = (*k++); HashPolicy::apply(hash, c);
c = (*k++); HashPolicy::apply(hash, c);
c = (*k++); HashPolicy::apply(hash, c);
length-=4;
}
switch (length)
{
case 7: c = (*k++); HashPolicy::apply(hash, c); // fallthrough
case 6: c = (*k++); HashPolicy::apply(hash, c); // fallthrough
case 5: c = (*k++); HashPolicy::apply(hash, c); // fallthrough
case 4: c = (*k++); HashPolicy::apply(hash, c); // fallthrough
case 3: c = (*k++); HashPolicy::apply(hash, c); // fallthrough
case 2: c = (*k++); HashPolicy::apply(hash, c); // fallthrough
case 1: c = (*k++); HashPolicy::apply(hash, c);
}
return hash;
}

template <typename HashPolicy>
unsigned policyHashcz(const unsigned char *k, unsigned initval)
{
unsigned hash = initval;
for (;;)
{
unsigned char c = (*k++);
if (c == 0)
break;
HashPolicy::apply(hash, c);
}
return hash;
}

unsigned hashc(const unsigned char *k, unsigned length, unsigned initval)
{
return policyHashc<FNV1>(k, length, initval);
}

unsigned hashcz(const unsigned char *k, unsigned initval)
{
return policyHashcz<FNV1>(k, initval);
}

unsigned hashc_fnv1a(const unsigned char *k, unsigned length, unsigned initval)
{
return policyHashc<FNV1a>(k, length, initval);
}

unsigned hashcz_fnv1a(const unsigned char *k, unsigned initval)
{
return policyHashcz<FNV1a>(k, initval);
}

template <typename HashPolicy, typename T>
inline unsigned doHashValue(T value, unsigned initval)
{
//The values returned from this function are only consistent with those from hashn() if running on little endian architecture
unsigned hash = initval;
Expand All @@ -117,30 +198,47 @@ inline unsigned doHashValue( T value, unsigned initval)
{
c = (byte)value;
value >>= 8;
HASHONE(hash, c);
HashPolicy::apply(hash, c);
}
return hash;
}

unsigned hashvalue( unsigned value, unsigned initval)
unsigned hashvalue(unsigned value, unsigned initval)
{
return doHashValue(value, initval);
return doHashValue<FNV1>(value, initval);
}

unsigned hashvalue( unsigned __int64 value, unsigned initval)
{
return doHashValue(value, initval);
return doHashValue<FNV1>(value, initval);
}

unsigned hashvalue( const void * value, unsigned initval)
{
return doHashValue((memsize_t)value, initval);
return doHashValue<FNV1>((memsize_t)value, initval);
}

unsigned hashvalue_fnv1a(unsigned value, unsigned initval)
{
return doHashValue<FNV1a>(value, initval);
}

unsigned hashvalue_fnv1a(unsigned __int64 value, unsigned initval)
{
return doHashValue<FNV1a>(value, initval);
}

unsigned hashvalue_fnv1a(const void * value, unsigned initval)
{
return doHashValue<FNV1a>((memsize_t)value, initval);
}


#define GETWORDNC(k,n) ((GETBYTE0(n)+GETBYTE1(n)+GETBYTE2(n)+GETBYTE3(n))&0xdfdfdfdf)


unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval)
template <typename HashPolicy>
unsigned policyHashnc(const unsigned char *k, unsigned length, unsigned initval)
{
unsigned hash = initval;
unsigned char c;
Expand All @@ -165,8 +263,8 @@ unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval)
return hash;
}


unsigned hashncz( const unsigned char *k, unsigned initval)
template <typename HashPolicy>
unsigned policyHashncz(const unsigned char *k, unsigned initval)
{
unsigned hash = initval;
for (;;)
Expand All @@ -180,6 +278,26 @@ unsigned hashncz( const unsigned char *k, unsigned initval)
return hash;
}

unsigned hashnc(const unsigned char *k, unsigned length, unsigned initval)
{
return policyHashnc<FNV1>(k, length, initval);
}

unsigned hashncz(const unsigned char *k, unsigned initval)
{
return policyHashncz<FNV1>(k, initval);
}

unsigned hashnc_fnv1a(const unsigned char *k, unsigned length, unsigned initval)
{
return policyHashnc<FNV1a>(k, length, initval);
}

unsigned hashncz_fnv1a(const unsigned char *k, unsigned initval)
{
return policyHashncz<FNV1a>(k, initval);
}

MappingKey::MappingKey(const void * inKey, int keysize)
{
int ksm = keysize;
Expand All @@ -201,6 +319,7 @@ MappingKey::MappingKey(const void * inKey, int keysize)
key = temp;
}


//-- Mapping ---------------------------------------------------

unsigned MappingBase::getHash() const { return hash; }
Expand Down
26 changes: 19 additions & 7 deletions system/jlib/jhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -425,13 +425,25 @@ extern jlib_decl IIdAtom * createIdAtom(const char *value);
extern jlib_decl IIdAtom * createIdAtom(const char *value, size32_t len);

extern jlib_decl void releaseAtoms();
extern jlib_decl unsigned hashc( const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashcz( const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashncz( const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashvalue( unsigned value, unsigned initval);
extern jlib_decl unsigned hashvalue( unsigned __int64 value, unsigned initval);
extern jlib_decl unsigned hashvalue( const void * value, unsigned initval);

extern jlib_decl unsigned deprecatedHashc(const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned deprecatedHashcz(const unsigned char *k, unsigned initval);

extern jlib_decl unsigned hashc(const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashnc(const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashcz(const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashncz(const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashvalue(unsigned value, unsigned initval);
extern jlib_decl unsigned hashvalue(unsigned __int64 value, unsigned initval);
extern jlib_decl unsigned hashvalue(const void * value, unsigned initval);

extern jlib_decl unsigned hashc_fnv1a(const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashnc_fnv1a(const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashcz_fnv1a(const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashncz_fnv1a(const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashvalue_fnv1a(unsigned value, unsigned initval);
extern jlib_decl unsigned hashvalue_fnv1a(unsigned __int64 value, unsigned initval);
extern jlib_decl unsigned hashvalue_fnv1a(const void * value, unsigned initval);

//================================================
// Minimal Hash table template - slightly less overhead that HashTable/SuperHashTable
Expand Down
2 changes: 1 addition & 1 deletion system/jlib/jlzw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ void CLZWCompressor::open(void *buf,size32_t max)



#define HASHC(code,ch) (((0x01000193*(unsigned)code)^(unsigned char)ch)%LZW_HASH_TABLE_SIZE)
#define HASHC(code,ch) (((fnvPrime32*(unsigned)code)^(unsigned char)ch)%LZW_HASH_TABLE_SIZE)

#define BE_MEMCPY4(dst,src) { if (supportbigendian) _WINCPYREV4(dst,src); else memcpy(dst,src,4); }

Expand Down
2 changes: 1 addition & 1 deletion system/jlib/jptree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3847,7 +3847,7 @@ unsigned CAtomPTree::queryHash() const
{
const char *_name = name.get();
size32_t nl = strlen(_name);
return isnocase() ? hashnc((const byte *) _name, nl, fnvInitialHash32): hashc((const byte *) _name, nl, fnvInitialHash32);
return isnocase() ? hashnc_fnv1a((const byte *) _name, nl, fnvInitialHash32): hashc_fnv1a((const byte *) _name, nl, fnvInitialHash32);
}
}

Expand Down
6 changes: 3 additions & 3 deletions system/jlib/jptree.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ protected:
virtual unsigned getHashFromElement(const void *e) const override;
virtual unsigned getHashFromFindParam(const void *fp) const override
{
return hashcz((const unsigned char *)fp, fnvInitialHash32);
return hashcz_fnv1a((const unsigned char *)fp, fnvInitialHash32);
}
virtual bool matchesFindParam(const void *e, const void *fp, unsigned fphash) const override
{
Expand Down Expand Up @@ -108,7 +108,7 @@ public:
// SuperHashTable definitions
virtual unsigned getHashFromFindParam(const void *fp) const override
{
return hashncz((const unsigned char *)fp, fnvInitialHash32);
return hashncz_fnv1a((const unsigned char *)fp, fnvInitialHash32);
}
virtual bool matchesFindParam(const void *e, const void *fp, unsigned fphash) const override
{
Expand Down Expand Up @@ -844,7 +844,7 @@ public:
const char *myname = queryName();
assert(myname);
size32_t nl = strlen(myname);
return isnocase() ? hashnc((const byte *)myname, nl, fnvInitialHash32): hashc((const byte *)myname, nl, fnvInitialHash32);
return isnocase() ? hashnc_fnv1a((const byte *)myname, nl, fnvInitialHash32): hashc_fnv1a((const byte *)myname, nl, fnvInitialHash32);
}
virtual void setName(const char *_name) override;
virtual void setAttribute(const char *attr, const char *val, bool encoded) override;
Expand Down
13 changes: 9 additions & 4 deletions system/jlib/jsuperhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,16 @@
#include "jmutex.hpp"

constexpr unsigned fnvInitialHash32 = 0x811C9DC5;
constexpr unsigned fnvPrime32 = 0x01000193;

extern jlib_decl unsigned hashc( const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashcz( const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashncz( const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashc_fnv1a(const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashnc_fnv1a(const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashcz_fnv1a(const unsigned char *k, unsigned initval);
extern jlib_decl unsigned hashncz_fnv1a(const unsigned char *k, unsigned initval);

class jlib_decl SuperHashTable : public CInterface
{
Expand Down Expand Up @@ -516,9 +521,9 @@ class jlib_decl AtomRefTable : public SuperHashTableOf<HashKeyElement, constchar
HashKeyElement *hke = (HashKeyElement *) checked_malloc(sizeof(HashKeyElement)+l+1,-605);
memcpy((void *) (hke->keyPtr()), key, l+1);
if (nocase)
hke->hashValue = hashnc((const unsigned char *)key, l, fnvInitialHash32);
hke->hashValue = hashnc_fnv1a((const unsigned char *)key, l, fnvInitialHash32);
else
hke->hashValue = hashc((const unsigned char *)key, l, fnvInitialHash32);
hke->hashValue = hashc_fnv1a((const unsigned char *)key, l, fnvInitialHash32);
hke->linkCount = 0;
return hke;
}
Expand Down Expand Up @@ -611,9 +616,9 @@ class jlib_decl AtomRefTable : public SuperHashTableOf<HashKeyElement, constchar
virtual unsigned getHashFromFindParam(const void *fp) const
{
if (nocase)
return hashncz((const unsigned char *)fp, fnvInitialHash32);
return hashncz_fnv1a((const unsigned char *)fp, fnvInitialHash32);
else
return hashcz((const unsigned char *)fp, fnvInitialHash32);
return hashcz_fnv1a((const unsigned char *)fp, fnvInitialHash32);
}

virtual const void *getFindParam(const void *e) const
Expand Down
Loading

0 comments on commit 93cc5bc

Please sign in to comment.