From bb1a9e1871ff8836b6d68fc4c12bd78902b185b5 Mon Sep 17 00:00:00 2001 From: Jake Smith Date: Mon, 9 Dec 2024 13:06:58 +0000 Subject: [PATCH] HPCC-33022 Fix pathological dali load time JPTree (used by Dali SDS) could perform horrendously slowly when dealing with a large list of child nodes sharing a common prefix. This was due to how the hashing function was used (no seed number provided). With many entries having the same prefix, it caused a high number of hash collisions, resulting in excessive scanning when adding new members. This particularly affected the /GeneratedDlls section of Dali. After fixing the issue, the time to load this section dropped from over 4700 seconds to 17 seconds. Also add a 'loadxml' utility to daliadmin, that loads and times loading an xml file, and traces the amount of memory it consumes. Signed-off-by: Jake Smith --- dali/daliadmin/daadmin.cpp | 75 ++++++++++++++++++++++++++++++++++++ dali/daliadmin/daadmin.hpp | 1 + dali/daliadmin/daliadmin.cpp | 15 ++++++++ system/jlib/jhash.cpp | 2 +- system/jlib/jptree.cpp | 2 +- system/jlib/jptree.ipp | 6 +-- system/jlib/jstats.cpp | 2 +- system/jlib/jsuperhash.cpp | 22 +++++------ system/jlib/jsuperhash.hpp | 18 +++++---- 9 files changed, 118 insertions(+), 25 deletions(-) diff --git a/dali/daliadmin/daadmin.cpp b/dali/daliadmin/daadmin.cpp index 91f11f4a1d6..cee095e3321 100644 --- a/dali/daliadmin/daadmin.cpp +++ b/dali/daliadmin/daadmin.cpp @@ -2472,6 +2472,81 @@ void xmlSize(const char *filename, double pc) } } +void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML) +{ + OwnedIFile iFile = createIFile(filename); + OwnedIFileIO iFileIO = iFile->open(IFOread); + if (!iFileIO) + { + WARNLOG("File '%s' not found", filename); + return; + } + + class CDummyPTreeMaker : public CSimpleInterfaceOf + { + StringBuffer xpath; + unsigned level = 0; + public: + virtual IPropertyTree *queryRoot() override { return nullptr; } + virtual IPropertyTree *queryCurrentNode() override { return nullptr; } + virtual void reset() override { } + virtual IPropertyTree *create(const char *tag) override { return nullptr; } + // IPTreeNotifyEvent impl. + virtual void beginNode(const char *tag, bool sequence, offset_t startOffset) override { } + virtual void newAttribute(const char *name, const char *value) override { } + virtual void beginNodeContent(const char *tag) override { } + virtual void endNode(const char *tag, unsigned length, const void *value, bool binary, offset_t endOffset) override { } + }; + + byte flags=ipt_none; + PTreeReaderOptions readFlags=ptr_ignoreWhiteSpace; + Owned iMaker; + if (!parseOnly) + { + PROGLOG("Creating property tree from file: %s", filename); + byte flags = ipt_none; + if (useLowMemPTree) + { + PROGLOG("Using low memory property trees"); + flags = ipt_lowmem; + } + iMaker.setown(createPTreeMaker(flags)); + } + else + { + PROGLOG("Reading property tree from file (without creating it): %s", filename); + iMaker.setown(new CDummyPTreeMaker()); + } + + offset_t fSize = iFileIO->size(); + OwnedIFileIOStream stream = createIOStream(iFileIO); + OwnedIFileIOStream progressedIFileIOStream = createProgressIFileIOStream(stream, fSize, "Load progress", 1); + Owned reader = createXMLStreamReader(*progressedIFileIOStream, *iMaker, readFlags); + + ProcessInfo memInfo(ReadMemoryInfo); + __uint64 rss = memInfo.getActiveResidentMemory(); + CCycleTimer timer; + reader->load(); + memInfo.update(ReadMemoryInfo); + __uint64 rssUsed = memInfo.getActiveResidentMemory() - rss; + reader.clear(); + progressedIFileIOStream.clear(); + PROGLOG("Load took: %.2f - RSS consumed: %.2f MB", (float)timer.elapsedMs()/1000, (float)rssUsed/0x100000); + + if (!parseOnly && saveFormattedXML) + { + assertex(iMaker->queryRoot()); + StringBuffer outFilename(filename); + outFilename.append(".out.xml"); + PROGLOG("Saving to %s", outFilename.str()); + timer.reset(); + saveXML(outFilename, iMaker->queryRoot(), 2); + PROGLOG("Save took: %.2f", (float)timer.elapsedMs()/1000); + } + + ::LINK(iMaker->queryRoot()); // intentionally leak (avoid time clearing up) +} + void translateToXpath(const char *logicalfile, DfsXmlBranchKind tailType) { CDfsLogicalFileName lfn; diff --git a/dali/daliadmin/daadmin.hpp b/dali/daliadmin/daadmin.hpp index 7a86e86ab30..9230e245d41 100644 --- a/dali/daliadmin/daadmin.hpp +++ b/dali/daliadmin/daadmin.hpp @@ -28,6 +28,7 @@ namespace daadmin extern DALIADMIN_API void setDaliConnectTimeoutMs(unsigned timeoutMs); extern DALIADMIN_API void xmlSize(const char *filename, double pc); +extern DALIADMIN_API void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML); extern DALIADMIN_API void translateToXpath(const char *logicalfile, DfsXmlBranchKind tailType = DXB_File); extern DALIADMIN_API void exportToFile(const char *path, const char *filename, bool safe = false); diff --git a/dali/daliadmin/daliadmin.cpp b/dali/daliadmin/daliadmin.cpp index d70ceb728ed..c402fa6f62c 100644 --- a/dali/daliadmin/daliadmin.cpp +++ b/dali/daliadmin/daliadmin.cpp @@ -97,6 +97,9 @@ void usage(const char *exe) printf(" mpping -- time MP connect\n"); printf(" daliping [ ] -- time dali server connect\n"); printf(" getxref -- get all XREF information\n"); + printf(" loadxml [--lowmem[= ] [ files ] -- get all locked files/xpaths\n"); printf(" unlock <[path|file]> -- unlocks either matching xpath(s) or matching logical file(s), can contain wildcards\n"); printf(" validatestore [fix=]\n" @@ -226,6 +229,18 @@ int main(int argc, const char* argv[]) } else if (strieq(cmd, "remotetest")) remoteTest(params.item(1), false); + else if (strieq(cmd, "loadxml")) + { + bool useLowMemPTree = false; + bool saveFormatedTree = false; + bool parseOnly = getComponentConfigSP()->getPropBool("@parseonly"); + if (!parseOnly) + { + useLowMemPTree = getComponentConfigSP()->getPropBool("@lowmem"); + saveFormatedTree = getComponentConfigSP()->getPropBool("@savexml"); + } + loadXMLTest(params.item(1), parseOnly, useLowMemPTree, saveFormatedTree); + } else { UERRLOG("Unknown command %s",cmd); diff --git a/system/jlib/jhash.cpp b/system/jlib/jhash.cpp index 28bce3d24df..da42a193ef2 100644 --- a/system/jlib/jhash.cpp +++ b/system/jlib/jhash.cpp @@ -305,7 +305,7 @@ bool HashTable::keyeq(const void *key1, const void *key2, int ksize) const unsigned HashTable::hash(const void *key, int ksize) const { - unsigned h = 0x811C9DC5; + unsigned h = fnvInitialHash32; unsigned char *bp = (unsigned char *) key; if (ksize<=0) { diff --git a/system/jlib/jptree.cpp b/system/jlib/jptree.cpp index 1b02941f990..5b4151b2461 100644 --- a/system/jlib/jptree.cpp +++ b/system/jlib/jptree.cpp @@ -3847,7 +3847,7 @@ unsigned CAtomPTree::queryHash() const { const char *_name = name.get(); size32_t nl = strlen(_name); - return isnocase() ? hashnc((const byte *) _name, nl, 0): hashc((const byte *) _name, nl, 0); + return isnocase() ? hashnc((const byte *) _name, nl, fnvInitialHash32): hashc((const byte *) _name, nl, fnvInitialHash32); } } diff --git a/system/jlib/jptree.ipp b/system/jlib/jptree.ipp index f4037783958..fd5cb63fd89 100644 --- a/system/jlib/jptree.ipp +++ b/system/jlib/jptree.ipp @@ -58,7 +58,7 @@ protected: virtual unsigned getHashFromElement(const void *e) const override; virtual unsigned getHashFromFindParam(const void *fp) const override { - return hashcz((const unsigned char *)fp, 0); + return hashcz((const unsigned char *)fp, fnvInitialHash32); } virtual bool matchesFindParam(const void *e, const void *fp, unsigned fphash) const override { @@ -108,7 +108,7 @@ public: // SuperHashTable definitions virtual unsigned getHashFromFindParam(const void *fp) const override { - return hashncz((const unsigned char *)fp, 0); + return hashncz((const unsigned char *)fp, fnvInitialHash32); } virtual bool matchesFindParam(const void *e, const void *fp, unsigned fphash) const override { @@ -844,7 +844,7 @@ public: const char *myname = queryName(); assert(myname); size32_t nl = strlen(myname); - return isnocase() ? hashnc((const byte *)myname, nl, 0): hashc((const byte *)myname, nl, 0); + return isnocase() ? hashnc((const byte *)myname, nl, fnvInitialHash32): hashc((const byte *)myname, nl, fnvInitialHash32); } virtual void setName(const char *_name) override; virtual void setAttribute(const char *attr, const char *val, bool encoded) override; diff --git a/system/jlib/jstats.cpp b/system/jlib/jstats.cpp index 79d2944334d..283d148ee00 100644 --- a/system/jlib/jstats.cpp +++ b/system/jlib/jstats.cpp @@ -1290,7 +1290,7 @@ void StatisticsMapping::createMappings() } const unsigned * kinds = indexToKind.getArray(); - hashcode = hashc((const byte *)kinds, indexToKind.ordinality() * sizeof(unsigned), 0x811C9DC5); + hashcode = hashc((const byte *)kinds, indexToKind.ordinality() * sizeof(unsigned), fnvInitialHash32); //All StatisticsMapping objects are assumed to be static, and never destroyed. const StatisticsMapping * existing = allStatsMappings[hashcode]; diff --git a/system/jlib/jsuperhash.cpp b/system/jlib/jsuperhash.cpp index a1e2b22d229..3e26af7b888 100644 --- a/system/jlib/jsuperhash.cpp +++ b/system/jlib/jsuperhash.cpp @@ -33,8 +33,8 @@ //#define MY_TRACE_HASH #ifdef MY_TRACE_HASH -int my_search_tot = 0; -int my_search_num = 0; +static unsigned my_search_tot = 0; +static unsigned my_search_num = 0; #endif //-- SuperHashTable --------------------------------------------------- @@ -104,13 +104,13 @@ void SuperHashTable::dumpStats() const { #ifdef TRACE_HASH if (tablecount && search_tot && search_num) - printf("Hash table %d entries, %d size, average search length %d(%d/%d) max %d\n", tablecount, tablesize, - (int) (search_tot/search_num), search_tot, search_num, search_max); + printf("Hash table %u entries, %u size, average search length %" I64F "u(%" I64F "u/%u) max %u\n", tablecount, tablesize, + search_tot/search_num, search_tot, search_num, search_max); #endif } #ifdef TRACE_HASH -void SuperHashTable::note_searchlen(int len) const +void SuperHashTable::note_searchlen(unsigned len) const { search_tot += len; search_num++; @@ -151,8 +151,8 @@ unsigned SuperHashTable::doFind(unsigned findHash, const void * findParam) const } #ifdef MY_TRACE_HASH my_search_num++; - if(my_search_num != 0) - printf("Hash table average search length %d\n", (int) (my_search_tot/my_search_num)); + if (my_search_num != 0) + printf("Hash table average search length %u\n", my_search_tot/my_search_num); #endif #ifdef TRACE_HASH note_searchlen(searchlen); @@ -193,8 +193,8 @@ unsigned SuperHashTable::doFindElement(unsigned v, const void * findET) const } #ifdef MY_TRACE_HASH my_search_num++; - if(my_search_num != 0) - printf("Hash table average search length %d\n", (int) (my_search_tot/my_search_num)); + if (my_search_num != 0) + printf("Hash table average search length %u\n", my_search_tot/my_search_num); #endif #ifdef TRACE_HASH note_searchlen(searchlen); @@ -233,8 +233,8 @@ unsigned SuperHashTable::doFindNew(unsigned v) const } #ifdef MY_TRACE_HASH my_search_num++; - if(my_search_num != 0) - printf("Hash table average search length %d\n", (int) (my_search_tot/my_search_num)); + if (my_search_num != 0) + printf("Hash table average search length %u\n", my_search_tot/my_search_num); #endif #ifdef TRACE_HASH note_searchlen(searchlen); diff --git a/system/jlib/jsuperhash.hpp b/system/jlib/jsuperhash.hpp index 9936e86112f..2c82a8fb5e8 100644 --- a/system/jlib/jsuperhash.hpp +++ b/system/jlib/jsuperhash.hpp @@ -28,6 +28,8 @@ #include "jstring.hpp" #include "jmutex.hpp" +constexpr unsigned fnvInitialHash32 = 0x811C9DC5; + extern jlib_decl unsigned hashc( const unsigned char *k, unsigned length, unsigned initval); extern jlib_decl unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval); extern jlib_decl unsigned hashcz( const unsigned char *k, unsigned initval); @@ -88,7 +90,7 @@ class jlib_decl SuperHashTable : public CInterface void doKill(void); void expand(); void expand(unsigned newsize); - void note_searchlen(int) const; + void note_searchlen(unsigned) const; virtual void onAdd(void *et) = 0; virtual void onRemove(void *et) = 0; @@ -105,9 +107,9 @@ class jlib_decl SuperHashTable : public CInterface unsigned tablesize; unsigned tablecount; #ifdef TRACE_HASH - mutable int search_tot; - mutable int search_num; - mutable int search_max; + mutable unsigned __int64 search_tot; + mutable unsigned search_num; + mutable unsigned search_max; #endif }; @@ -514,9 +516,9 @@ class jlib_decl AtomRefTable : public SuperHashTableOfkeyPtr()), key, l+1); if (nocase) - hke->hashValue = hashnc((const unsigned char *)key, l, 0); + hke->hashValue = hashnc((const unsigned char *)key, l, fnvInitialHash32); else - hke->hashValue = hashc((const unsigned char *)key, l, 0); + hke->hashValue = hashc((const unsigned char *)key, l, fnvInitialHash32); hke->linkCount = 0; return hke; } @@ -609,9 +611,9 @@ class jlib_decl AtomRefTable : public SuperHashTableOf