From 23aeede1c1ac733bc7c5c357c64ca77912e17578 Mon Sep 17 00:00:00 2001 From: Richard Chapman Date: Wed, 23 Aug 2023 12:42:38 +0100 Subject: [PATCH 1/2] HPCC-30156 Instrument CriticalSection Signed-off-by: Richard Chapman --- CMakeLists.txt | 2 + cmake_modules/commonSetup.cmake | 3 +- cmake_modules/options.cmake | 2 +- common/thorhelper/roxiehelper.cpp | 2 +- common/workunit/wujobq.cpp | 4 +- ecl/hql/hqlexpr.ipp | 2 +- plugins/Rembed/Rembed.cpp | 2 +- roxie/roxiemem/roxiemem.cpp | 4 +- system/jlib/jdebug.hpp | 73 +------- system/jlib/jexcept.cpp | 4 +- system/jlib/jmutex.cpp | 127 ++++++++++++++ system/jlib/jmutex.hpp | 270 ++++++++++++++++++++++++------ system/jlib/jtiming.hpp | 97 +++++++++++ testing/unittests/jlibtests.cpp | 8 +- testing/unittests/unittests.cpp | 46 +++-- 15 files changed, 490 insertions(+), 156 deletions(-) create mode 100644 system/jlib/jtiming.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e226b94631f..32dd19d6eb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,8 @@ unset ( ENV{CC} ) unset ( ENV{CXX} ) set(TOP_LEVEL_PROJECT ON) +unset(PROFILING) +set(PROFILING ON) if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) set(TOP_LEVEL_PROJECT OFF) endif(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) diff --git a/cmake_modules/commonSetup.cmake b/cmake_modules/commonSetup.cmake index 1ccf2ee5607..20744cb5b28 100644 --- a/cmake_modules/commonSetup.cmake +++ b/cmake_modules/commonSetup.cmake @@ -257,8 +257,9 @@ IF ("${COMMONSETUP_DONE}" STREQUAL "") endif() endif() endif () - if (PROFILING AND (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX)) + if (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) add_definitions (-fno-omit-frame-pointer) + add_definitions (-D_PROFILING) endif () if (WIN32) diff --git a/cmake_modules/options.cmake b/cmake_modules/options.cmake index a744818799d..21b117f1aa5 100644 --- a/cmake_modules/options.cmake +++ b/cmake_modules/options.cmake @@ -68,7 +68,7 @@ option(SKIP_ECLWATCH "Skip building ECL Watch" OFF) option(USE_ADDRESS_SANITIZER "Use address sanitizer to spot leaks" OFF) option(INSTALL_VCPKG_CATALOG "Install vcpkg-catalog.txt" ON) option(PORTALURL "Set url to hpccsystems portal download page") -option(PROFILING "Set to true if planning to profile so stacks are informative" OFF) +option(PROFILING "Set to true if planning to profile so stacks are informative" ON) if ( NOT PORTALURL ) set( PORTALURL "http://hpccsystems.com/download" ) diff --git a/common/thorhelper/roxiehelper.cpp b/common/thorhelper/roxiehelper.cpp index c4ced835b4a..9fdd5507e2b 100644 --- a/common/thorhelper/roxiehelper.cpp +++ b/common/thorhelper/roxiehelper.cpp @@ -1795,7 +1795,7 @@ bool CSafeSocket::sendHeartBeat(const IContextLogger &logctx) class HttpResponseHandler { private: - CriticalBlock c; // should not be anyone writing but better to be safe + ICriticalBlock c; // should not be anyone writing but better to be safe StringBuffer header; StringBuffer content; ISocket *sock = nullptr; diff --git a/common/workunit/wujobq.cpp b/common/workunit/wujobq.cpp index 0a08bd9e77a..6ee7b9007dc 100644 --- a/common/workunit/wujobq.cpp +++ b/common/workunit/wujobq.cpp @@ -972,13 +972,13 @@ class CJobQueue: public CJobQueueBase, implements IJobQueue } } - class Cconnlockblock: public CriticalBlock + class Cconnlockblock: public ICriticalBlock { CJobQueue *parent; bool rollback; public: Cconnlockblock(CJobQueue *_parent,bool exclusive) - : CriticalBlock(_parent->crit) + : ICriticalBlock(_parent->crit) { parent = _parent; parent->connlock(exclusive); diff --git a/ecl/hql/hqlexpr.ipp b/ecl/hql/hqlexpr.ipp index 842b8b10cc1..110e4f0bbfc 100644 --- a/ecl/hql/hqlexpr.ipp +++ b/ecl/hql/hqlexpr.ipp @@ -65,7 +65,7 @@ public: }; #ifdef HQLEXPR_MULTI_THREADED -typedef CriticalBlock HqlCriticalBlock; +typedef ICriticalBlock HqlCriticalBlock; #else typedef NullCriticalBlock HqlCriticalBlock; #endif diff --git a/plugins/Rembed/Rembed.cpp b/plugins/Rembed/Rembed.cpp index d83f41ae42f..8ddf225d1f9 100644 --- a/plugins/Rembed/Rembed.cpp +++ b/plugins/Rembed/Rembed.cpp @@ -1437,7 +1437,7 @@ class REmbedFunctionContext: public CInterfaceOf RInside &R; RInside::Proxy result; StringAttr func; - CriticalBlock block; + ICriticalBlock block; Owned env; }; diff --git a/roxie/roxiemem/roxiemem.cpp b/roxie/roxiemem/roxiemem.cpp index 6488bd77f16..db238f540d1 100644 --- a/roxie/roxiemem/roxiemem.cpp +++ b/roxie/roxiemem/roxiemem.cpp @@ -3465,7 +3465,7 @@ class HeapCompactState } } public: - CriticalBlock lock; + ICriticalBlock lock; unsigned numPagesEmptied; CHeap * heap; Heaplet * next; // which heaplet to try to compact into next. Not so good if > 1 heaps in use. @@ -3516,7 +3516,7 @@ class NewHeapCompactState bool processHeap(unsigned low, unsigned max); protected: - CriticalBlock lock; + ICriticalBlock lock; PointerArrayOf heaplets; }; diff --git a/system/jlib/jdebug.hpp b/system/jlib/jdebug.hpp index 6c89b35b0ab..4834c15478c 100644 --- a/system/jlib/jdebug.hpp +++ b/system/jlib/jdebug.hpp @@ -23,78 +23,7 @@ #include "jiface.hpp" #include "jstats.h" #include - -#define TIMING - -__int64 jlib_decl cycle_to_nanosec(cycle_t cycles); -__int64 jlib_decl cycle_to_microsec(cycle_t cycles); -__int64 jlib_decl cycle_to_millisec(cycle_t cycles); -cycle_t jlib_decl nanosec_to_cycle(__int64 cycles); -cycle_t jlib_decl millisec_to_cycle(unsigned ms); -double jlib_decl getCycleToNanoScale(); -void jlib_decl display_time(const char * title, cycle_t diff); - -// X86 / X86_64 -#if defined(_ARCH_X86_64_) || defined(_ARCH_X86_) - -#define HAS_GOOD_CYCLE_COUNTER - -#if defined(_WIN32) && defined (_ARCH_X86_) -#pragma warning(push) -#pragma warning(disable:4035) -inline cycle_t getTSC() { __asm { __asm _emit 0x0f __asm _emit 0x31 } } -#pragma warning(pop) -#elif !defined(_WIN32) -inline cycle_t getTSC() { return __builtin_ia32_rdtsc(); } -#else -#include -inline cycle_t getTSC() { return __rdtsc(); } -#endif // WIN32 - -#elif defined(_ARCH_PPC) - -#define HAS_GOOD_CYCLE_COUNTER - -static inline cycle_t getTSC() -{ - int64_t result; -#ifdef _ARCH_PPC64 - /* - This reads timebase in one 64bit go. Does *not* include a workaround for the cell (see - http://ozlabs.org/pipermail/linuxppc-dev/2006-October/027052.html) - */ - __asm__ volatile( - "mftb %0" - : "=r" (result)); -#else - /* - Read the high 32bits of the timer, then the lower, and repeat if high order has changed in the meantime. See - http://ozlabs.org/pipermail/linuxppc-dev/1999-October/003889.html - */ - unsigned long dummy; - __asm__ volatile( - "mfspr %1,269\n\t" /* mftbu */ - "mfspr %L0,268\n\t" /* mftb */ - "mfspr %0,269\n\t" /* mftbu */ - "cmpw %0,%1\n\t" /* check if the high order word has chanegd */ - "bne $-16" - : "=r" (result), "=r" (dummy)); -#endif - return result; -} - -#else -// ARMFIX: cycle-count is not always available in user mode -// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0338g/Bihbeabc.html -// http://neocontra.blogspot.co.uk/2013/05/user-mode-performance-counters-for.html -inline cycle_t getTSC() { return 0; } -#endif // X86 - -#if defined(INLINE_GET_CYCLES_NOW) && defined(HAS_GOOD_CYCLE_COUNTER) -inline cycle_t get_cycles_now() { return getTSC(); } -#else -cycle_t jlib_decl get_cycles_now(); // equivalent to getTSC when available -#endif +#include struct HardwareInfo { diff --git a/system/jlib/jexcept.cpp b/system/jlib/jexcept.cpp index a716380ec1b..ec1fa29788b 100644 --- a/system/jlib/jexcept.cpp +++ b/system/jlib/jexcept.cpp @@ -37,7 +37,7 @@ #include #include #include -#ifdef __linux__ +#if defined(__linux__) || defined(__APPLE__) #include // comment out if not present #endif #ifdef __APPLE__ @@ -1702,7 +1702,7 @@ void printStackReport(__int64 startIP) #ifdef _WIN32 unsigned onstack=1234; doPrintStackReport(0, 0,(unsigned)&onstack); -#elif defined(__linux__) +#elif defined(__linux__) || defined(__APPLE__) DBGLOG("Backtrace:"); void *btarray[100]; unsigned btn = backtrace (btarray, 100); diff --git a/system/jlib/jmutex.cpp b/system/jlib/jmutex.cpp index 8d1fe0b369a..1db10f76217 100644 --- a/system/jlib/jmutex.cpp +++ b/system/jlib/jmutex.cpp @@ -118,8 +118,135 @@ int Mutex::unlockAll() return ret; } +static CriticalBlockInstrumentation *critBlocks[10000]; +static RelaxedAtomic nextCritBlock; +int compareCI (const void * a, const void * b) +{ + return - (*(CriticalBlockInstrumentation **)a)->compare(*(CriticalBlockInstrumentation **)b); +} + +MODULE_EXIT() +{ + unsigned numCritBlocks = nextCritBlock; + qsort(critBlocks, numCritBlocks, sizeof(CriticalBlockInstrumentation*), compareCI); + for (unsigned i = 0; i < numCritBlocks; i++) + { + critBlocks[i]->describe(i < 10); + } +} + +thread_local CriticalBlockInstrumentation *__cbinst = nullptr; + +std::size_t CriticalBlockInstrumentation::StackHash::operator()(const Stack& k) const +{ + return hashc((const byte *) k.stack, sizeof(k.stack), 0); +} +std::string humanTime(unsigned long cycles) +{ + if (!cycles) + return "0 ns"; + unsigned long v = cycle_to_nanosec(cycles); + if (v < 1000) + return std::to_string(v) + " ns"; + else if (v < 1000000) + return std::to_string(v/1000) + " us"; + else + return std::to_string(v/1000000) + " ms"; +} + +CriticalBlockInstrumentation::CriticalBlockInstrumentation(const char *_file, const char *_func, unsigned _line) +: file(strdup(_file)), func(strdup(_func)), line(_line), waitCycles(0), holdCycles(0), uncontended(0), contended(0) +{ + unsigned idx = nextCritBlock++; + if (idx < 10000) + critBlocks[idx] = this; +} + +void CriticalBlockInstrumentation::addStat(cycle_t wait, cycle_t hold, bool wasContended) +{ +#ifdef HAS_BACKTRACE + void *lbtBuff[numStackEntries+3]; + unsigned nFrames = backtrace(lbtBuff, numStackEntries+3); + NonReentrantSpinBlock b(lock); + if (nFrames == numStackEntries+3) + { + Stack *a = (Stack *) (lbtBuff+3); + stacks[*a]++; + } +#else + NonReentrantSpinBlock b(lock); +#endif + if (wasContended) + contended++; + else + uncontended++; + waitCycles += wait; + holdCycles += hold; +} + +CriticalBlockInstrumentation::~CriticalBlockInstrumentation() +{ +} + +void CriticalBlockInstrumentation::describe(bool includeStack) const +{ + DBGLOG("CritBlock %s (%s:%u): wait %s hold %s, entered %u times, contended %u times", func, strrchr(file, PATHSEPCHAR)+1, line, humanTime(waitCycles).c_str(), humanTime(holdCycles).c_str(), contended+uncontended, contended); + if (includeStack) + { + std::vector> sorted_stacks(stacks.begin(), stacks.end()); + std::sort(sorted_stacks.begin(), sorted_stacks.end(), [](auto &left, auto &right) { return left.second > right.second; }); + unsigned printed = 0; + for (auto &stack: sorted_stacks) + { + if (printed == 3) + break; + DBGLOG("Stack appeared %u times:", stack.second); + char** strs = backtrace_symbols(stack.first.stack, 4); + for (unsigned i = 0; i < 4; ++i) + DBGLOG("%s", strs[i]); + free(strs); + printed++; + } + } +} + +int CriticalBlockInstrumentation::compare(const CriticalBlockInstrumentation *other) const +{ + if (waitCycles < other->waitCycles) + return -1; + if (waitCycles > other->waitCycles) + return 1; + if (holdCycles < other->holdCycles) + return -1; + if (holdCycles > other->holdCycles) + return 1; + if (contended < other->contended) + return -1; + if (contended > other->contended) + return 1; + if (uncontended < other->uncontended) + return -1; + if (uncontended > other->uncontended) + return 1; + return 0; +} + +InstrumentedCriticalBlock::InstrumentedCriticalBlock(InstrumentedCriticalSection &c) : crit(c) +{ + inst = __cbinst; + start = get_cycles_now(); + c.enter(); + in = get_cycles_now(); +} + +InstrumentedCriticalBlock::~InstrumentedCriticalBlock() +{ + bool wasContended = crit.leave(); + cycle_t out = get_cycles_now(); + inst->addStat(in-start, out-in, wasContended); +} inline bool read_data(int fd, void *buf, size_t nbytes) { diff --git a/system/jlib/jmutex.hpp b/system/jlib/jmutex.hpp index 836f26ded1b..a5015e29e23 100644 --- a/system/jlib/jmutex.hpp +++ b/system/jlib/jmutex.hpp @@ -25,6 +25,13 @@ #include #include "jiface.hpp" #include "jsem.hpp" +#include "jtiming.hpp" +#include + +#if defined (__linux__) || defined(__FreeBSD__) || defined(__APPLE__) +#include // comment out if not present +#define HAS_BACKTRACE +#endif extern jlib_decl void ThreadYield(); extern jlib_decl void spinUntilReady(std::atomic_uint &value); @@ -34,6 +41,11 @@ extern jlib_decl void spinUntilReady(std::atomic_uint &value); //#define SPINLOCK_USE_MUTEX // for testing #define SPINLOCK_RR_CHECK // checks for realtime threads #define _ASSERT_LOCK_SUPPORT +//#define USE_INSTRUMENTED_CRITSECS +#endif + +#if defined(_PROFILING) +#define USE_INSTRUMENTED_CRITSECS #endif #ifdef SPINLOCK_USE_MUTEX @@ -195,7 +207,7 @@ TryEnterCriticalSection( ); }; -class jlib_decl CriticalSection +class jlib_decl UninstrumentedCriticalSection { // lightweight mutex within a single process private: @@ -255,12 +267,14 @@ class jlib_decl CriticalSection bool wouldBlock() { if (TryEnterCriticalSection(&flags)) { leave(); return false; } return true; } // debug only #endif }; + +typedef UninstrumentedCriticalSection InstrumentedCriticalSection; // Not worth implementing for Windows #else /** * Mutex locking wrapper. Use enter/leave to lock/unlock. */ -class CriticalSection +class UninstrumentedCriticalSection { private: MutexId mutex; @@ -268,9 +282,9 @@ class CriticalSection ThreadId owner; unsigned depth; #endif - CriticalSection (const CriticalSection &); + UninstrumentedCriticalSection (const UninstrumentedCriticalSection &) = delete; public: - inline CriticalSection() + inline UninstrumentedCriticalSection() { pthread_mutexattr_t attr; pthread_mutexattr_init(&attr); @@ -287,7 +301,7 @@ class CriticalSection #endif } - inline ~CriticalSection() + inline ~UninstrumentedCriticalSection() { #ifdef _ASSERT_LOCK_SUPPORT assert(owner==0 && depth==0); @@ -327,70 +341,91 @@ class CriticalSection #endif } }; -#endif -/** - * Critical section delimiter, using scope to define lifetime of - * the lock on a critical section (parameter). - * Blocks on construction, unblocks on destruction. - */ -class CriticalBlock +class InstrumentedCriticalSection { - CriticalSection &crit; -public: - inline CriticalBlock(CriticalSection &c) : crit(c) { crit.enter(); } - inline ~CriticalBlock() { crit.leave(); } -}; - -/** - * Critical section delimiter, using scope to define lifetime of - * the lock on a critical section (parameter). - * Unblocks on construction, blocks on destruction. - */ -class CriticalUnblock -{ - CriticalSection &crit; -public: - inline CriticalUnblock(CriticalSection &c) : crit(c) { crit.leave(); } - inline ~CriticalUnblock() { crit.enter(); } -}; - -class CLeavableCriticalBlock -{ - CriticalSection &crit; - bool locked = false; +private: + MutexId mutex; +#ifdef _ASSERT_LOCK_SUPPORT + ThreadId owner = 0; +#endif + unsigned depth = 0; + bool wasContended = false; + InstrumentedCriticalSection (const InstrumentedCriticalSection &) = delete; public: - inline CLeavableCriticalBlock(CriticalSection &_crit) : crit(_crit) + inline InstrumentedCriticalSection() { - enter(); - } - inline CLeavableCriticalBlock(CriticalSection &_crit, bool lock) : crit(_crit) - { - if (lock) - enter(); + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); +#ifdef _DEBUG + verifyex(pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE)==0); // verify supports attr +#else + pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); +#endif + pthread_mutex_init(&mutex, &attr); + pthread_mutexattr_destroy(&attr); } - inline ~CLeavableCriticalBlock() + + inline ~InstrumentedCriticalSection() { - if (locked) - crit.leave(); +#ifdef _ASSERT_LOCK_SUPPORT + assert(owner==0); +#endif + assert(depth==0); + pthread_mutex_destroy(&mutex); } + inline void enter() { - if (locked) - return; - locked = true; - crit.enter(); +#ifdef _ASSERT_LOCK_SUPPORT + ThreadId me = GetCurrentThreadId(); +#endif + bool isContended = depth != 0; + pthread_mutex_lock(&mutex); + depth++; + wasContended = isContended; +#ifdef _ASSERT_LOCK_SUPPORT + owner = me; +#endif } - inline void leave() + + inline bool leave() { - if (locked) + bool isContended = false; + if (depth==1) { - locked = false; - crit.leave(); +#ifdef _ASSERT_LOCK_SUPPORT + owner = 0; +#endif + isContended = wasContended; } + depth--; + pthread_mutex_unlock(&mutex); + return isContended; + } + inline void assertLocked() + { +#ifdef _ASSERT_LOCK_SUPPORT + assertex(owner == GetCurrentThreadId()); +#endif } }; +#endif + +/** + * Critical section delimiter, using scope to define lifetime of + * the lock on a critical section (parameter). + * Blocks on construction, unblocks on destruction. + */ + +template class CriticalBlockOf +{ + T &crit; +public: + inline CriticalBlockOf(T &c) : crit(c) { crit.enter(); } + inline ~CriticalBlockOf() { crit.leave(); } +}; #ifdef SPINLOCK_USE_MUTEX // for testing @@ -541,6 +576,135 @@ class jlib_decl NonReentrantSpinLock #endif +typedef CriticalBlockOf UninstrumentedCriticalBlock; + +class CriticalBlockInstrumentation +{ +#ifdef HAS_BACKTRACE + static constexpr unsigned numStackEntries = 8; + class Stack + { + public: + void *stack[numStackEntries]; + bool operator==(const Stack &other) const + { + return memcmp(stack, other.stack, sizeof(stack)) == 0; + } + }; + + struct StackHash + { + std::size_t operator()(const Stack& k) const; + }; +#endif + +public: + CriticalBlockInstrumentation(const char *file, const char *func, unsigned line); + ~CriticalBlockInstrumentation(); + void addStat(cycle_t wait, cycle_t hold, bool wasContended); + void describe(bool withStack) const; + int compare(const CriticalBlockInstrumentation *other) const; +private: + NonReentrantSpinLock lock; + const char *file; + const char *func; + unsigned line; + cycle_t waitCycles; + cycle_t holdCycles; + unsigned uncontended; + unsigned contended; +#ifdef HAS_BACKTRACE + std::unordered_map stacks; +#endif +}; + +class InstrumentedCriticalBlock +{ + InstrumentedCriticalSection &crit; +public: + InstrumentedCriticalBlock(InstrumentedCriticalSection &c); + ~InstrumentedCriticalBlock(); +private: + CriticalBlockInstrumentation *inst; + cycle_t start = 0; + cycle_t in = 0; +}; + +/** + * Critical section delimiter, using scope to define lifetime of + * the lock on a critical section (parameter). + * Unblocks on construction, blocks on destruction. + */ +template class CriticalUnblockOf +{ + T &crit; +public: + inline CriticalUnblockOf(T &c) : crit(c) { crit.leave(); } + inline ~CriticalUnblockOf() { crit.enter(); } +}; + +typedef CriticalUnblockOf UninstrumentedCriticalUnblock; +typedef CriticalUnblockOf InstrumentedCriticalUnblock; + +template class CLeavableCriticalBlockOf +{ + T &crit; + bool locked = false; +public: + inline CLeavableCriticalBlockOf(T &_crit) : crit(_crit) + { + enter(); + } + inline CLeavableCriticalBlockOf(T &_crit, bool lock) : crit(_crit) + { + if (lock) + enter(); + } + inline ~CLeavableCriticalBlockOf() + { + if (locked) + crit.leave(); + } + inline void enter() + { + if (locked) + return; + locked = true; + crit.enter(); + } + inline void leave() + { + if (locked) + { + locked = false; + crit.leave(); + } + } +}; + +typedef CLeavableCriticalBlockOf UninstrumentedLeavableCriticalBlock; +typedef CLeavableCriticalBlockOf InstrumentedLeavableCriticalBlock; + +#ifdef USE_INSTRUMENTED_CRITSECS +#define __glue(a,b) a ## b +#define glue(a,b) __glue(a,b) + +extern thread_local CriticalBlockInstrumentation* __cbinst; + +typedef InstrumentedCriticalSection CriticalSection; +typedef InstrumentedCriticalBlock ICriticalBlock; +#define CriticalBlock static CriticalBlockInstrumentation *glue(instrumenter,__LINE__) = new CriticalBlockInstrumentation(__FILE__, __func__, __LINE__); __cbinst = glue(instrumenter,__LINE__); ICriticalBlock +typedef InstrumentedCriticalUnblock CriticalUnblock; +typedef InstrumentedLeavableCriticalBlock CLeavableCriticalBlock; +#else +typedef UninstrumentedCriticalSection CriticalSection; +typedef UninstrumentedCriticalBlock ICriticalBlock; +typedef ICriticalBlock CriticalBlock; +typedef UninstrumentedCriticalUnblock CriticalUnblock; +typedef UninstrumentedLeavableCriticalBlock CLeavableCriticalBlock; +#endif + + class NonReentrantSpinBlock { NonReentrantSpinLock &lock; diff --git a/system/jlib/jtiming.hpp b/system/jlib/jtiming.hpp new file mode 100644 index 00000000000..1c2216480de --- /dev/null +++ b/system/jlib/jtiming.hpp @@ -0,0 +1,97 @@ +/*############################################################################## + + HPCC SYSTEMS software Copyright (C) 2023 HPCC Systems®. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +############################################################################## */ + + + +#ifndef JTIMING_HPP +#define JTIMING_HPP + +#include "jiface.hpp" + +#define TIMING + +__int64 jlib_decl cycle_to_nanosec(cycle_t cycles); +__int64 jlib_decl cycle_to_microsec(cycle_t cycles); +__int64 jlib_decl cycle_to_millisec(cycle_t cycles); +cycle_t jlib_decl nanosec_to_cycle(__int64 cycles); +cycle_t jlib_decl millisec_to_cycle(unsigned ms); +double jlib_decl getCycleToNanoScale(); +void jlib_decl display_time(const char * title, cycle_t diff); + +// X86 / X86_64 +#if defined(_ARCH_X86_64_) || defined(_ARCH_X86_) + +#define HAS_GOOD_CYCLE_COUNTER + +#if defined(_WIN32) && defined (_ARCH_X86_) +#pragma warning(push) +#pragma warning(disable:4035) +inline cycle_t getTSC() { __asm { __asm _emit 0x0f __asm _emit 0x31 } } +#pragma warning(pop) +#elif !defined(_WIN32) +inline cycle_t getTSC() { return __builtin_ia32_rdtsc(); } +#else +#include +inline cycle_t getTSC() { return __rdtsc(); } +#endif // WIN32 + +#elif defined(_ARCH_PPC) + +#define HAS_GOOD_CYCLE_COUNTER + +static inline cycle_t getTSC() +{ + int64_t result; +#ifdef _ARCH_PPC64 + /* + This reads timebase in one 64bit go. Does *not* include a workaround for the cell (see + http://ozlabs.org/pipermail/linuxppc-dev/2006-October/027052.html) + */ + __asm__ volatile( + "mftb %0" + : "=r" (result)); +#else + /* + Read the high 32bits of the timer, then the lower, and repeat if high order has changed in the meantime. See + http://ozlabs.org/pipermail/linuxppc-dev/1999-October/003889.html + */ + unsigned long dummy; + __asm__ volatile( + "mfspr %1,269\n\t" /* mftbu */ + "mfspr %L0,268\n\t" /* mftb */ + "mfspr %0,269\n\t" /* mftbu */ + "cmpw %0,%1\n\t" /* check if the high order word has chanegd */ + "bne $-16" + : "=r" (result), "=r" (dummy)); +#endif + return result; +} + +#else +// ARMFIX: cycle-count is not always available in user mode +// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0338g/Bihbeabc.html +// http://neocontra.blogspot.co.uk/2013/05/user-mode-performance-counters-for.html +inline cycle_t getTSC() { return 0; } +#endif // X86 + +#if defined(INLINE_GET_CYCLES_NOW) && defined(HAS_GOOD_CYCLE_COUNTER) +inline cycle_t get_cycles_now() { return getTSC(); } +#else +cycle_t jlib_decl get_cycles_now(); // equivalent to getTSC when available +#endif + +#endif \ No newline at end of file diff --git a/testing/unittests/jlibtests.cpp b/testing/unittests/jlibtests.cpp index 9cc9c59e821..89febf87fa8 100644 --- a/testing/unittests/jlibtests.cpp +++ b/testing/unittests/jlibtests.cpp @@ -2463,10 +2463,10 @@ class AtomicTimingTest : public CppUnit::TestFixture const unsigned numCores = std::max(getAffinityCpus(), 16U); void runAllTests() { - DO_TEST(CriticalSection, CriticalBlock, unsigned __int64, 1, 1); - DO_TEST(CriticalSection, CriticalBlock, unsigned __int64, 2, 1); - DO_TEST(CriticalSection, CriticalBlock, unsigned __int64, 5, 1); - DO_TEST(CriticalSection, CriticalBlock, unsigned __int64, 1, 2); + DO_TEST(CriticalSection, ICriticalBlock, unsigned __int64, 1, 1); + DO_TEST(CriticalSection, ICriticalBlock, unsigned __int64, 2, 1); + DO_TEST(CriticalSection, ICriticalBlock, unsigned __int64, 5, 1); + DO_TEST(CriticalSection, ICriticalBlock, unsigned __int64, 1, 2); DO_TEST(SpinLock, SpinBlock, unsigned __int64, 1, 1); DO_TEST(SpinLock, SpinBlock, unsigned __int64, 2, 1); DO_TEST(SpinLock, SpinBlock, unsigned __int64, 5, 1); diff --git a/testing/unittests/unittests.cpp b/testing/unittests/unittests.cpp index a4c7e7e8ca6..f39c4bf4c5b 100644 --- a/testing/unittests/unittests.cpp +++ b/testing/unittests/unittests.cpp @@ -928,6 +928,7 @@ CPPUNIT_TEST_SUITE_REGISTRATION( PipeRunTest ); CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( PipeRunTest, "PipeRunTest" ); #endif + class RelaxedAtomicTimingTest : public CppUnit::TestFixture { CPPUNIT_TEST_SUITE( RelaxedAtomicTimingTest ); @@ -936,11 +937,12 @@ class RelaxedAtomicTimingTest : public CppUnit::TestFixture void testRun() { - testRunner(0); - testRunner(1); +// testRunner(0); +// testRunner(1); testRunner(2); testRunner(3); - testRunner(4); +// testRunner(4); +// testRunner(5); } void testRunner(unsigned mode) @@ -948,7 +950,8 @@ class RelaxedAtomicTimingTest : public CppUnit::TestFixture CCycleTimer timer; unsigned count = 100000000; RelaxedAtomic ra[201]; - CriticalSection lock[201]; + UninstrumentedCriticalSection lock[201]; + InstrumentedCriticalSection ilock[201]; for (int a = 0; a < 201; a++) ra[a] = 0; @@ -956,7 +959,8 @@ class RelaxedAtomicTimingTest : public CppUnit::TestFixture class T : public CThreaded { public: - T(unsigned _count, RelaxedAtomic &_ra, CriticalSection &_lock, unsigned _mode) : CThreaded(""), count(_count), ra(_ra), lock(_lock), mode(_mode) + T(unsigned _count, RelaxedAtomic &_ra, UninstrumentedCriticalSection &_lock, InstrumentedCriticalSection &_ilock, unsigned _mode) + : CThreaded(""), count(_count), ra(_ra), lock(_lock), ilock(_ilock), mode(_mode) {} virtual int run() override { @@ -964,10 +968,10 @@ class RelaxedAtomicTimingTest : public CppUnit::TestFixture { case 0: test0(); break; case 1: test1(); break; - // Disabled next two for now as slow and not especially interesting - // case 2: test2(); break; - // case 3: test3(); break; - case 4: test4(); break; + case 2: test2(); break; + case 3: test3(); break; + case 4: test4(); break; + case 5: test5(); break; } return 0; } @@ -988,20 +992,29 @@ class RelaxedAtomicTimingTest : public CppUnit::TestFixture int &a = (int &) ra; while (count--) { - CriticalBlock b(lock); + UninstrumentedCriticalBlock b(lock); a++; } } void test3() + { + int &a = (int &) ra; + while (count--) + { + InstrumentedCriticalBlock b(ilock); + a++; + } + } + void test4() { RelaxedAtomic &a = ra; while (count--) { - CriticalBlock b(lock); + UninstrumentedCriticalBlock b(lock); a.fastAdd(1); } } - void test4() + void test5() { int &a = (int &) ra; while (count--) @@ -1015,10 +1028,11 @@ class RelaxedAtomicTimingTest : public CppUnit::TestFixture unsigned mode; unsigned count; RelaxedAtomic &ra; - CriticalSection &lock; - } t1a(count, ra[0], lock[0], mode), t2a(count, ra[0], lock[0], mode), t3a(count, ra[0], lock[0], mode), - t1b(count, ra[0], lock[0], mode), t2b(count, ra[1], lock[1], mode), t3b(count, ra[2], lock[2], mode), - t1c(count, ra[0], lock[0], mode), t2c(count, ra[100], lock[100], mode), t3c(count, ra[200], lock[200], mode);; + UninstrumentedCriticalSection &lock; + InstrumentedCriticalSection &ilock; + } t1a(count, ra[0], lock[0], ilock[0], mode), t2a(count, ra[0], lock[0], ilock[0], mode), t3a(count, ra[0], lock[0], ilock[0], mode), + t1b(count, ra[0], lock[1], ilock[1], mode), t2b(count, ra[1], lock[2], ilock[2], mode), t3b(count, ra[2], lock[3], ilock[3], mode), + t1c(count, ra[0], lock[4], ilock[4], mode), t2c(count, ra[100], lock[100], ilock[100], mode), t3c(count, ra[200], lock[200], ilock[200], mode);; DBGLOG("Testing RelaxedAtomics (test mode %u)", mode); t1a.start(); t2a.start(); From 44d8b8ed6a63cfcdfa467f269ca874ce3a4e20b2 Mon Sep 17 00:00:00 2001 From: Richard Chapman Date: Wed, 15 May 2024 15:45:12 +0100 Subject: [PATCH 2/2] HPCC-30156 Instrument CriticalSection Rename ICriticalBlock Clean up some unwanted changes Add some comments Signed-off-by: Richard Chapman --- CMakeLists.txt | 2 -- cmake_modules/commonSetup.cmake | 2 +- cmake_modules/options.cmake | 2 +- common/thorhelper/roxiehelper.cpp | 2 +- common/workunit/wujobq.cpp | 4 ++-- ecl/hql/hqlexpr.ipp | 2 +- plugins/Rembed/Rembed.cpp | 2 +- roxie/roxiemem/roxiemem.cpp | 4 ++-- system/jlib/jmutex.cpp | 2 +- system/jlib/jmutex.hpp | 19 ++++++++++++++----- testing/unittests/jlibtests.cpp | 8 ++++---- 11 files changed, 28 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32dd19d6eb8..e226b94631f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,8 +85,6 @@ unset ( ENV{CC} ) unset ( ENV{CXX} ) set(TOP_LEVEL_PROJECT ON) -unset(PROFILING) -set(PROFILING ON) if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) set(TOP_LEVEL_PROJECT OFF) endif(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) diff --git a/cmake_modules/commonSetup.cmake b/cmake_modules/commonSetup.cmake index 20744cb5b28..ee3cdca1626 100644 --- a/cmake_modules/commonSetup.cmake +++ b/cmake_modules/commonSetup.cmake @@ -257,7 +257,7 @@ IF ("${COMMONSETUP_DONE}" STREQUAL "") endif() endif() endif () - if (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) + if (PROFILING AND (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX)) add_definitions (-fno-omit-frame-pointer) add_definitions (-D_PROFILING) endif () diff --git a/cmake_modules/options.cmake b/cmake_modules/options.cmake index 21b117f1aa5..a744818799d 100644 --- a/cmake_modules/options.cmake +++ b/cmake_modules/options.cmake @@ -68,7 +68,7 @@ option(SKIP_ECLWATCH "Skip building ECL Watch" OFF) option(USE_ADDRESS_SANITIZER "Use address sanitizer to spot leaks" OFF) option(INSTALL_VCPKG_CATALOG "Install vcpkg-catalog.txt" ON) option(PORTALURL "Set url to hpccsystems portal download page") -option(PROFILING "Set to true if planning to profile so stacks are informative" ON) +option(PROFILING "Set to true if planning to profile so stacks are informative" OFF) if ( NOT PORTALURL ) set( PORTALURL "http://hpccsystems.com/download" ) diff --git a/common/thorhelper/roxiehelper.cpp b/common/thorhelper/roxiehelper.cpp index 9fdd5507e2b..d05a79c14c1 100644 --- a/common/thorhelper/roxiehelper.cpp +++ b/common/thorhelper/roxiehelper.cpp @@ -1795,7 +1795,7 @@ bool CSafeSocket::sendHeartBeat(const IContextLogger &logctx) class HttpResponseHandler { private: - ICriticalBlock c; // should not be anyone writing but better to be safe + CCriticalBlock c; // should not be anyone writing but better to be safe StringBuffer header; StringBuffer content; ISocket *sock = nullptr; diff --git a/common/workunit/wujobq.cpp b/common/workunit/wujobq.cpp index 6ee7b9007dc..a9272b4bb70 100644 --- a/common/workunit/wujobq.cpp +++ b/common/workunit/wujobq.cpp @@ -972,13 +972,13 @@ class CJobQueue: public CJobQueueBase, implements IJobQueue } } - class Cconnlockblock: public ICriticalBlock + class Cconnlockblock: public CCriticalBlock { CJobQueue *parent; bool rollback; public: Cconnlockblock(CJobQueue *_parent,bool exclusive) - : ICriticalBlock(_parent->crit) + : CCriticalBlock(_parent->crit) { parent = _parent; parent->connlock(exclusive); diff --git a/ecl/hql/hqlexpr.ipp b/ecl/hql/hqlexpr.ipp index 110e4f0bbfc..b7fd9720fdc 100644 --- a/ecl/hql/hqlexpr.ipp +++ b/ecl/hql/hqlexpr.ipp @@ -65,7 +65,7 @@ public: }; #ifdef HQLEXPR_MULTI_THREADED -typedef ICriticalBlock HqlCriticalBlock; +typedef CCriticalBlock HqlCriticalBlock; #else typedef NullCriticalBlock HqlCriticalBlock; #endif diff --git a/plugins/Rembed/Rembed.cpp b/plugins/Rembed/Rembed.cpp index 8ddf225d1f9..4151e2464c5 100644 --- a/plugins/Rembed/Rembed.cpp +++ b/plugins/Rembed/Rembed.cpp @@ -1437,7 +1437,7 @@ class REmbedFunctionContext: public CInterfaceOf RInside &R; RInside::Proxy result; StringAttr func; - ICriticalBlock block; + CCriticalBlock block; Owned env; }; diff --git a/roxie/roxiemem/roxiemem.cpp b/roxie/roxiemem/roxiemem.cpp index db238f540d1..c77390c7a0e 100644 --- a/roxie/roxiemem/roxiemem.cpp +++ b/roxie/roxiemem/roxiemem.cpp @@ -3465,7 +3465,7 @@ class HeapCompactState } } public: - ICriticalBlock lock; + CCriticalBlock lock; unsigned numPagesEmptied; CHeap * heap; Heaplet * next; // which heaplet to try to compact into next. Not so good if > 1 heaps in use. @@ -3516,7 +3516,7 @@ class NewHeapCompactState bool processHeap(unsigned low, unsigned max); protected: - ICriticalBlock lock; + CCriticalBlock lock; PointerArrayOf heaplets; }; diff --git a/system/jlib/jmutex.cpp b/system/jlib/jmutex.cpp index 1db10f76217..e28570dbe7c 100644 --- a/system/jlib/jmutex.cpp +++ b/system/jlib/jmutex.cpp @@ -136,7 +136,7 @@ MODULE_EXIT() } } -thread_local CriticalBlockInstrumentation *__cbinst = nullptr; +thread_local CriticalBlockInstrumentation *__cbinst = nullptr; // Used as an extra parameter to CriticalBlock macro, so we don't have to change every call std::size_t CriticalBlockInstrumentation::StackHash::operator()(const Stack& k) const { diff --git a/system/jlib/jmutex.hpp b/system/jlib/jmutex.hpp index a5015e29e23..08e5e9693e9 100644 --- a/system/jlib/jmutex.hpp +++ b/system/jlib/jmutex.hpp @@ -45,7 +45,8 @@ extern jlib_decl void spinUntilReady(std::atomic_uint &value); #endif #if defined(_PROFILING) -#define USE_INSTRUMENTED_CRITSECS +// Instrumented critsecs work best if PROFILING Cmake option is in use +//#define USE_INSTRUMENTED_CRITSECS #endif #ifdef SPINLOCK_USE_MUTEX @@ -685,6 +686,14 @@ template class CLeavableCriticalBlockOf typedef CLeavableCriticalBlockOf UninstrumentedLeavableCriticalBlock; typedef CLeavableCriticalBlockOf InstrumentedLeavableCriticalBlock; +// The CriticalBlock instrumentation works by replacing all references to the normal CriticalBlock constructor with a macro +// that instantiates a new (local static) CriticalBlockInstrumentation object, and passes it in to the constructor via a thread-local variable. +// It's done this way to avoid any overhead or code changes in the uninstrumented case, at the expense of some complexity and size overhead in the +// instrumented case. +// +// Because CriticalBlock may be a macro in the instrumented case, any place that a parameter or member variable of type CriticalBlock is needed +// must ust CCriticalBlock rather than CriticalBlock to avoid the macro expansion. + #ifdef USE_INSTRUMENTED_CRITSECS #define __glue(a,b) a ## b #define glue(a,b) __glue(a,b) @@ -692,14 +701,14 @@ typedef CLeavableCriticalBlockOf InstrumentedLeavab extern thread_local CriticalBlockInstrumentation* __cbinst; typedef InstrumentedCriticalSection CriticalSection; -typedef InstrumentedCriticalBlock ICriticalBlock; -#define CriticalBlock static CriticalBlockInstrumentation *glue(instrumenter,__LINE__) = new CriticalBlockInstrumentation(__FILE__, __func__, __LINE__); __cbinst = glue(instrumenter,__LINE__); ICriticalBlock +typedef InstrumentedCriticalBlock CCriticalBlock; +#define CriticalBlock static CriticalBlockInstrumentation *glue(instrumenter,__LINE__) = new CriticalBlockInstrumentation(__FILE__, __func__, __LINE__); __cbinst = glue(instrumenter,__LINE__); CCriticalBlock typedef InstrumentedCriticalUnblock CriticalUnblock; typedef InstrumentedLeavableCriticalBlock CLeavableCriticalBlock; #else typedef UninstrumentedCriticalSection CriticalSection; -typedef UninstrumentedCriticalBlock ICriticalBlock; -typedef ICriticalBlock CriticalBlock; +typedef UninstrumentedCriticalBlock CCriticalBlock; +typedef CCriticalBlock CriticalBlock; // Use in place of CriticalBlock when declaring members/parameters, to avoid macro expansion typedef UninstrumentedCriticalUnblock CriticalUnblock; typedef UninstrumentedLeavableCriticalBlock CLeavableCriticalBlock; #endif diff --git a/testing/unittests/jlibtests.cpp b/testing/unittests/jlibtests.cpp index 89febf87fa8..828f14bcc90 100644 --- a/testing/unittests/jlibtests.cpp +++ b/testing/unittests/jlibtests.cpp @@ -2463,10 +2463,10 @@ class AtomicTimingTest : public CppUnit::TestFixture const unsigned numCores = std::max(getAffinityCpus(), 16U); void runAllTests() { - DO_TEST(CriticalSection, ICriticalBlock, unsigned __int64, 1, 1); - DO_TEST(CriticalSection, ICriticalBlock, unsigned __int64, 2, 1); - DO_TEST(CriticalSection, ICriticalBlock, unsigned __int64, 5, 1); - DO_TEST(CriticalSection, ICriticalBlock, unsigned __int64, 1, 2); + DO_TEST(CriticalSection, CCriticalBlock, unsigned __int64, 1, 1); + DO_TEST(CriticalSection, CCriticalBlock, unsigned __int64, 2, 1); + DO_TEST(CriticalSection, CCriticalBlock, unsigned __int64, 5, 1); + DO_TEST(CriticalSection, CCriticalBlock, unsigned __int64, 1, 2); DO_TEST(SpinLock, SpinBlock, unsigned __int64, 1, 1); DO_TEST(SpinLock, SpinBlock, unsigned __int64, 2, 1); DO_TEST(SpinLock, SpinBlock, unsigned __int64, 5, 1);