From 69085a937a4fa448963807ee0e00de449ec8e480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Mon, 6 Nov 2023 18:04:29 +0000 Subject: [PATCH] Add basic instrumentation support for GPU tracing libraries Add a simple instrumentation API built around the wallcycle regions with implementations using NVIDIA NVTX, AMD ROCTX and Intel ITT libraries. Tracing support can be enabled at build-time and allows the wallcycle regions to show up in tracing tools which greatly aids performance analysis. Implements #4446 Closes #4446 --- docs/release-notes/2024/major/features.rst | 11 + src/config.h.cmakein | 9 + src/gromacs/timing/CMakeLists.txt | 55 +++++ .../include/gromacs/timing/instrumentation.h | 225 ++++++++++++++++++ .../timing/include/gromacs/timing/wallcycle.h | 145 +++++++++++ src/gromacs/timing/wallcycle.cpp | 154 +++--------- src/gromacs/utility/binaryinformation.cpp | 7 + 7 files changed, 480 insertions(+), 126 deletions(-) create mode 100644 src/gromacs/timing/include/gromacs/timing/instrumentation.h diff --git a/docs/release-notes/2024/major/features.rst b/docs/release-notes/2024/major/features.rst index df9b4228282..38a40062db5 100644 --- a/docs/release-notes/2024/major/features.rst +++ b/docs/release-notes/2024/major/features.rst @@ -17,3 +17,14 @@ It is set to 2 by default for increased stability. If the TPR was generated with an earlier |Gromacs| version, the old default value of 3 will be used. + + +Added support for instrumentation based on wallcycle regions using NVTX/ROCTX/ITT +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +Basic support has been added for GPU tracing libraries so wallcycle main and sub-regions +will show up in tracing timelines which can help with performance analysis. +The tracing instrumentation support can be enabled with one of the following CMake variables: +``GMX_USE_NVTX``, ``GMX_USE_ROCTX``, ``GMX_USE_ITT``. 
+ +:issue:`4446` diff --git a/src/config.h.cmakein b/src/config.h.cmakein index 3625c6e559e..11a2186ed2c 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -167,6 +167,15 @@ /* Add support for tracing using Extrae */ #cmakedefine01 HAVE_EXTRAE +/* Enable NVIDIA NVTX instrumentation */ +#cmakedefine01 GMX_USE_NVTX + +/* Enable AMD ROCTX instrumentation */ +#cmakedefine01 GMX_USE_ROCTX + +/* Enable Intel ITT instrumentation */ +#cmakedefine01 GMX_USE_ITT + /* Use MPI (with mpicc) for parallelization */ #cmakedefine01 GMX_LIB_MPI diff --git a/src/gromacs/timing/CMakeLists.txt b/src/gromacs/timing/CMakeLists.txt index 84bc6afb175..213842382c3 100644 --- a/src/gromacs/timing/CMakeLists.txt +++ b/src/gromacs/timing/CMakeLists.txt @@ -55,6 +55,61 @@ target_link_libraries(timing INTERFACE legacy_api ) +# Tracing support for NVTX / ROCTX / ITT (with some basic include/lib detection support, +# sufficient for most use-cases and this is a dev-feature anyway). +if(GMX_USE_NVTX) + find_path (NVTX_INCLUDE_DIR + NAMES nvToolsExt.h + HINTS ENV CUDA_HOME "${CUDA_TOOLKIT_ROOT_DIR}" + PATH_SUFFIXES include include/nvtx3 + REQUIRED + ) + find_library (NVTX_LIBRARY + NAMES libnvToolsExt.so + HINTS ENV CUDA_HOME "${CUDA_TOOLKIT_ROOT_DIR}" + PATH_SUFFIXES lib64 lib + REQUIRED + ) + target_include_directories(timing INTERFACE "${NVTX_INCLUDE_DIR}") + target_link_libraries(timing INTERFACE "${NVTX_LIBRARY}") + # As of CUDA 11.8, there are a lot of old-style casts in nvToolsExt.h + gmx_target_interface_warning_suppression(timing "-Wno-old-style-cast" HAS_WARNING_NO_OLD_STYLE_CAST) +endif() + +if(GMX_USE_ROCTX) + find_path (ROCTX_INCLUDE_DIR + NAMES roctracer/roctx.h + HINTS ENV ROCM_PATH ENV ROCM_HOME "${HIPSYCL_SYCLCC_ROCM_PATH}" + PATH_SUFFIXES include + REQUIRED + ) + find_library (ROCTX_LIBRARY + NAMES libroctx64.so + HINTS ENV ROCM_PATH ENV ROCM_HOME "${HIPSYCL_SYCLCC_ROCM_PATH}" + PATH_SUFFIXES roctracer/lib64 roctracer/lib lib64 lib + REQUIRED + ) + 
target_include_directories(timing INTERFACE "${ROCTX_INCLUDE_DIR}") + target_link_libraries(timing INTERFACE "${ROCTX_LIBRARY}") +endif() + +if(GMX_USE_ITT) + find_path (ITTNOTIFY_INCLUDE_DIR + NAMES ittnotify.h + HINTS ENV VTUNE_PROFILER_DIR + PATH_SUFFIXES include + REQUIRED + ) + find_library (ITTNOTIFY_LIBRARY + NAMES libittnotify.a # We need the static library + HINTS ENV VTUNE_PROFILER_DIR + PATH_SUFFIXES lib64 lib + REQUIRED + ) + target_include_directories(timing SYSTEM INTERFACE "${ITTNOTIFY_INCLUDE_DIR}") # SYSTEM must precede the INTERFACE scope keyword + target_link_libraries(timing INTERFACE "${ITTNOTIFY_LIBRARY}") +endif() + # TODO: when timing is an OBJECT target #target_link_libraries(timing PUBLIC legacy_api) #target_link_libraries(timing PRIVATE common) diff --git a/src/gromacs/timing/include/gromacs/timing/instrumentation.h b/src/gromacs/timing/include/gromacs/timing/instrumentation.h new file mode 100644 index 00000000000..8fb1fd6d984 --- /dev/null +++ b/src/gromacs/timing/include/gromacs/timing/instrumentation.h @@ -0,0 +1,225 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright 2023- The GROMACS Authors + * and the project initiators Erik Lindahl, Berk Hess and David van der Spoel. + * Consult the AUTHORS/COPYING files and https://www.gromacs.org for details. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * https://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at https://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out https://www.gromacs.org. + */ + +/*! \internal \file + * + * \brief + * Define basic tracing API for manual instrumentation. + * + * This header implements a simple set of tracing range start/stop functions + * which can be used for manual instrumentation of application code. + * Since current use is only through the wallcycle module, we define two + * sets of start/stop functions corresponding to the main and sub-counters + * in the wallcycle module. + * + * The current implementation supports the following tracing APIs: + * - NVIDIA NVTX + * - AMD ROCTX + * - Intel ITT + * + * \author Szilárd Páll + * + */ + +#include "gromacs/utility/basedefinitions.h" + + +// +// Forward declarations of the tracing functions with the inlineable definitions for each tracing API below. +// + +/*! \brief Start a main tracing region. + * + * Note that the \p rangeId argument is currently only used with NVTX for aiding + * in coloring of trace regions. + * + * \param[in] rangeName String containing the name of the traced range. + * \param[in] rangeId Numeric ID of the range. + */ +static void traceRangeStart(const char* rangeName, int rangeId); + + +/*! 
\brief Start a tracing sub-region. + * + * Note that the \p rangeId argument is currently only used with NVTX for aiding + * in coloring of trace regions. + * + * \param[in] rangeName String containing the name of the traced range. + * \param[in] rangeId Numeric ID of the range. + */ +static void traceSubRangeStart(const char* rangeName, int rangeId); + +/*! \brief End a main tracing region. + * + * Note that this should always be paired with a traceRangeStart(). + */ +static void traceRangeEnd(); + +/*! \brief End a tracing sub-region. + * + * Note that this should always be paired with a traceSubRangeStart(). + */ +static void traceSubRangeEnd(); + +#if (GMX_USE_NVTX + GMX_USE_ROCTX + GMX_USE_ITT) > 1 +# error "Cannot have multiple instrumentation flavors enabled at the same time" +#endif + + +#if GMX_USE_NVTX + +# include "nvToolsExt.h" + +//! List of colors for main ranges +static constexpr uint32_t c_rangeColors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, + 0xff00ffff, 0xffff0000, 0xffffffff }; +//! Number of colors for main ranges +static constexpr int c_numRangeColors = sizeof(c_rangeColors) / sizeof(uint32_t); + +//! List of colors for sub-ranges +static constexpr uint32_t c_subRangeColors[] = { 0x9900ff00, 0x990000ff, 0x99ffff00, 0x99ff00ff, + 0x9900ffff, 0x99ff0000, 0x99ffffff }; +//! 
Number of colors for sub-ranges +static constexpr int c_numSubRangeColors = sizeof(c_subRangeColors) / sizeof(uint32_t); + +static void traceRangeStart(const char* rangeName, int rangeId) +{ + int colorId = rangeId % c_numRangeColors; + nvtxEventAttributes_t eventAttrib = { 0 }; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = c_rangeColors[colorId]; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = rangeName; + nvtxRangePushEx(&eventAttrib); +} + +static void traceSubRangeStart(const char* rangeName, int rangeId) +{ + int colorId = rangeId % c_numSubRangeColors; + nvtxEventAttributes_t eventAttrib = { 0 }; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = c_subRangeColors[colorId]; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = rangeName; + nvtxRangePushEx(&eventAttrib); +} + +static void traceRangeEnd() +{ + nvtxRangePop(); +} + +static void traceSubRangeEnd() +{ + nvtxRangePop(); +} + +#elif GMX_USE_ROCTX + +# include "roctracer/roctx.h" + +static void traceRangeStart(const char* rangeName, int /*rangeId*/) +{ + roctxRangePush(rangeName); +} + +static void traceSubRangeStart(const char* rangeName, int /*rangeId*/) +{ + roctxRangePush(rangeName); +} + +static void traceRangeEnd() +{ + roctxRangePop(); +} + +static void traceSubRangeEnd() +{ + roctxRangePop(); +} + +#elif GMX_USE_ITT + +# ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wold-style-cast" +# pragma clang diagnostic ignored "-Wnewline-eof" +# endif +# include +# ifdef __clang__ +# pragma clang diagnostic pop +# endif + +// Defined in wallcycle.cpp, initialized in wallcycle_init +extern const __itt_domain* g_ittDomain; +extern __itt_string_handle* g_ittCounterHandles[]; +extern 
__itt_string_handle* g_ittSubCounterHandles[]; + +static void traceRangeStart(const char* /*rangeName*/, int rangeId) +{ + __itt_task_begin(g_ittDomain, __itt_null, __itt_null, g_ittCounterHandles[rangeId]); +} + +static void traceSubRangeStart(const char* /*rangeName*/, int rangeId) +{ + __itt_task_begin(g_ittDomain, __itt_null, __itt_null, g_ittSubCounterHandles[rangeId]); +} + +static void traceRangeEnd() +{ + __itt_task_end(g_ittDomain); +} + +static void traceSubRangeEnd() +{ + __itt_task_end(g_ittDomain); +} + +#else + +gmx_unused static void traceRangeStart(gmx_unused const char* rangeName, gmx_unused int rangeId) {} +gmx_unused static void traceSubRangeStart(gmx_unused const char* rangeName, gmx_unused int rangeId) +{ +} + +gmx_unused static void traceRangeEnd() {} +gmx_unused static void traceSubRangeEnd() {} + + +#endif diff --git a/src/gromacs/timing/include/gromacs/timing/wallcycle.h b/src/gromacs/timing/include/gromacs/timing/wallcycle.h index 6d8917440ee..95821a85ea0 100644 --- a/src/gromacs/timing/include/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/include/gromacs/timing/wallcycle.h @@ -44,6 +44,7 @@ #include #include "gromacs/timing/cyclecounter.h" +#include "gromacs/timing/instrumentation.h" #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/enumerationhelpers.h" @@ -161,6 +162,136 @@ enum class WallCycleSubCounter : int Count }; +template +static constexpr bool checkStringsLengths(const Container& strings) +{ + // NOLINTNEXTLINE(readability-use-anyofallof) // std::all_of is constexpr only since C++20 + for (const char* str : strings) + { + if (std::char_traits::length(str) > maxLength) + { + return false; + } + } + return true; +} + +/* Each name should not exceed 22 printing characters + (ie. 
terminating null can be twentieth) */ +static const char* enumValuetoString(WallCycleCounter enumValue) +{ + constexpr gmx::EnumerationArray wallCycleCounterNames = { + "Run", + "Step", + "PP during PME", + "Domain decomp.", + "DD comm. load", + "DD comm. bounds", + "Vsite constr.", + "Send X to PME", + "Neighbor search", + "Launch PP GPU ops.", + "Comm. coord.", + "Force", + "Wait + Comm. F", + "PME mesh", + "PME GPU mesh", + "PME redist. X/F", + "PME spread", + "PME gather", + "PME 3D-FFT", + "PME 3D-FFT Comm.", + "PME solve LJ", + "PME solve Elec", + "Wait PME GPU D2H", + "PME 3D-FFT", + "PME solve", + "Wait PME GPU gather", + "Reduce GPU PME F", + "Launch PME GPU ops.", + "Wait PME Recv. PP X", + "Wait PME GPU spread", + "Wait GPU FFT to PME", + "PME Halo exch comm", + "PME wait for PP", + "Wait + Recv. PME F", + "Wait Bonded GPU", + "Wait GPU NB nonloc.", + "Wait GPU NB local", + "Wait GPU state copy", + "NB X/F buffer ops.", + "Vsite spread", + "COM pull force", + "AWH", + "Write traj.", + "Update", + "Constraints", + "Comm. energies", + "Enforced rotation", + "Add rot. forces", + "Position swapping", + "IMD", + "MD Graph", + "Test" + }; + static_assert(checkStringsLengths<22>(wallCycleCounterNames)); + return wallCycleCounterNames[enumValue]; +} + +// Clang complains about this function not used in builds without subcounters +// clang-format off +CLANG_DIAGNOSTIC_IGNORE(-Wunneeded-internal-declaration) +// clang-format on +static const char* enumValuetoString(WallCycleSubCounter enumValue) +{ + constexpr gmx::EnumerationArray wallCycleSubCounterNames = { + "DD redist.", + "DD NS grid + sort", + "DD setup comm.", + "DD make top.", + "DD make constr.", + "DD top. 
other", + "DD GPU ops.", + "NS grid local", + "NS grid non-local", + "NS search local", + "NS search non-local", + "Bonded F", + "Bonded-FEP F", + "Restraints F", + "Listed buffer ops.", + "NB pruning", + "NB F kernel", + "NB F clear", + "NB FEP", + "NB FEP reduction", + "Launch GPU NB tasks", + "Launch GPU Bonded", + "Launch state copy", + "Ewald F correction", + "NB X buffer ops.", + "NB F buffer ops.", + "Clear force buffer", + "Launch GPU NB X ops.", + "Launch GPU NB F ops.", + "Launch GPU Comm. X", + "Launch GPU Comm. F", + "Launch GPU update", + "Launch PME GPU FFT", + "Graph wait pre-capture", + "Graph capture", + "Graph instantiate/upd.", + "Graph wait pre-launch", + "Graph launch", + "Constraints Comm.", // constraints communication time, note that this counter will contain load imbalance + "Test subcounter" + }; + static_assert(checkStringsLengths<22>(wallCycleSubCounterNames)); + return wallCycleSubCounterNames[enumValue]; +} +CLANG_DIAGNOSTIC_RESET + + //! Number of all main counters. static constexpr int sc_numWallCycleCounters = static_cast(WallCycleCounter::Count); //! Number of all subcyclecounters. @@ -269,6 +400,11 @@ inline void wallcycle_all_stop(gmx_wallcycle* wc, WallCycleCounter ewc, gmx_cycl //! Starts the cycle counter (and increases the call count) inline void wallcycle_start(gmx_wallcycle* wc, WallCycleCounter ewc) { + if (ewc >= WallCycleCounter::Step) + { + traceRangeStart(enumValuetoString(ewc), static_cast(ewc)); + } + if (wc == nullptr) { return; @@ -310,6 +446,11 @@ inline void wallcycle_start_nocount(gmx_wallcycle* wc, WallCycleCounter ewc) //! 
Stop the cycle count for ewc , returns the last cycle count inline double wallcycle_stop(gmx_wallcycle* wc, WallCycleCounter ewc) { + if (ewc >= WallCycleCounter::Step) + { + traceRangeEnd(); + } + gmx_cycles_t cycle, last; if (wc == nullptr) @@ -390,6 +531,8 @@ inline void wallcycle_sub_start(gmx_wallcycle* wc, WallCycleSubCounter ewcs) { if constexpr (sc_useCycleSubcounters) { + traceSubRangeStart(enumValuetoString(ewcs), static_cast(ewcs)); + if (wc != nullptr) { wc->wcsc[ewcs].start = gmx_cycles_read(); @@ -415,6 +558,8 @@ inline void wallcycle_sub_stop(gmx_wallcycle* wc, WallCycleSubCounter ewcs) { if constexpr (sc_useCycleSubcounters) { + traceSubRangeEnd(); + if (wc != nullptr) { wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start; diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index cc9ff2be2e4..9b12ca723cc 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -65,134 +65,18 @@ static constexpr bool sc_onlyMainDebugPrints = true; //! True if cycle counter nesting depth debugging prints are enabled static constexpr bool sc_debugPrintDepth = false; -template -static constexpr bool checkStringsLengths(const Container& strings) -{ - // NOLINTNEXTLINE(readability-use-anyofallof) // std::all_of is constexpr only since C++20 - for (const char* str : strings) - { - if (std::char_traits::length(str) > maxLength) - { - return false; - } - } - return true; -} -/* Each name should not exceed 22 printing characters - (ie. terminating null can be twentieth) */ -static const char* enumValuetoString(WallCycleCounter enumValue) -{ - constexpr gmx::EnumerationArray wallCycleCounterNames = { - "Run", - "Step", - "PP during PME", - "Domain decomp.", - "DD comm. load", - "DD comm. bounds", - "Vsite constr.", - "Send X to PME", - "Neighbor search", - "Launch PP GPU ops.", - "Comm. coord.", - "Force", - "Wait + Comm. F", - "PME mesh", - "PME GPU mesh", - "PME redist. 
X/F", - "PME spread", - "PME gather", - "PME 3D-FFT", - "PME 3D-FFT Comm.", - "PME solve LJ", - "PME solve Elec", - "Wait PME GPU D2H", - "PME 3D-FFT", - "PME solve", - "Wait PME GPU gather", - "Reduce GPU PME F", - "Launch PME GPU ops.", - "Wait PME Recv. PP X", - "Wait PME GPU spread", - "Wait GPU FFT to PME", - "PME Halo exch comm", - "PME wait for PP", - "Wait + Recv. PME F", - "Wait Bonded GPU", - "Wait GPU NB nonloc.", - "Wait GPU NB local", - "Wait GPU state copy", - "NB X/F buffer ops.", - "Vsite spread", - "COM pull force", - "AWH", - "Write traj.", - "Update", - "Constraints", - "Comm. energies", - "Enforced rotation", - "Add rot. forces", - "Position swapping", - "IMD", - "MD Graph", - "Test" - }; - static_assert(checkStringsLengths<22>(wallCycleCounterNames)); - return wallCycleCounterNames[enumValue]; -} +#if GMX_USE_ITT +# ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" +# endif +//! Helpers for Intel ITT tracing instrumentation: the "GMX" domain and the per-(sub)counter string handles filled in by wallcycle_init +const __itt_domain* g_ittDomain = __itt_domain_create("GMX"); +__itt_string_handle* g_ittCounterHandles[static_cast(WallCycleCounter::Count)]; +__itt_string_handle* g_ittSubCounterHandles[static_cast(WallCycleSubCounter::Count)]; +#endif -// Clang complains about this function not used in builds without subcounters -// clang-format off -CLANG_DIAGNOSTIC_IGNORE(-Wunneeded-internal-declaration) -// clang-format on -static const char* enumValuetoString(WallCycleSubCounter enumValue) -{ - constexpr gmx::EnumerationArray wallCycleSubCounterNames = { - "DD redist.", - "DD NS grid + sort", - "DD setup comm.", - "DD make top.", - "DD make constr.", - "DD top. 
other", - "DD GPU ops.", - "NS grid local", - "NS grid non-local", - "NS search local", - "NS search non-local", - "Bonded F", - "Bonded-FEP F", - "Restraints F", - "Listed buffer ops.", - "NB pruning", - "NB F kernel", - "NB F clear", - "NB FEP", - "NB FEP reduction", - "Launch GPU NB tasks", - "Launch GPU Bonded", - "Launch state copy", - "Ewald F correction", - "NB X buffer ops.", - "NB F buffer ops.", - "Clear force buffer", - "Launch GPU NB X ops.", - "Launch GPU NB F ops.", - "Launch GPU Comm. X", - "Launch GPU Comm. F", - "Launch GPU update", - "Launch PME GPU FFT", - "Graph wait pre-capture", - "Graph capture", - "Graph instantiate/upd.", - "Graph wait pre-launch", - "Graph launch", - "Constraints Comm.", // constraints communication time, note that this counter will contain load imbalance - "Test subcounter" - }; - static_assert(checkStringsLengths<22>(wallCycleSubCounterNames)); - return wallCycleSubCounterNames[enumValue]; -} -CLANG_DIAGNOSTIC_RESET /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */ static const char* enumValuetoString(PmeStage enumValue) @@ -251,9 +135,27 @@ std::unique_ptr wallcycle_init(FILE* fplog, int resetstep, const wc->isMainRank = (cr == nullptr) || MAIN(cr); } +#if GMX_USE_ITT + for (auto wcc : gmx::EnumerationWrapper{}) + { + g_ittCounterHandles[static_cast(wcc)] = __itt_string_handle_create(enumValuetoString(wcc)); + } + for (auto wcsc : gmx::EnumerationWrapper{}) + { + g_ittSubCounterHandles[static_cast(wcsc)] = + __itt_string_handle_create(enumValuetoString(wcsc)); + } +#endif + return wc; } +#if GMX_USE_ITT +# ifdef __clang__ +# pragma clang diagnostic pop +# endif +#endif + void gmx_wallcycle::checkStart(WallCycleCounter ewc) { // NOLINTNEXTLINE(readability-misleading-indentation) diff --git a/src/gromacs/utility/binaryinformation.cpp b/src/gromacs/utility/binaryinformation.cpp index 8ad079c3241..aef98ccf666 100644 --- a/src/gromacs/utility/binaryinformation.cpp +++ 
b/src/gromacs/utility/binaryinformation.cpp @@ -409,6 +409,13 @@ void gmx_print_version_info(gmx::TextWriter* writer) writer->writeLine("Tracing support: disabled"); #endif +#if GMX_USE_NVTX + writer->writeLine("Instrumention API: NVTX"); +#elif GMX_USE_ROCTX + writer->writeLine("Instrumention API: ROCTX"); +#elif GMX_USE_ITT + writer->writeLine("Instrumention API: ITT"); +#endif /* TODO: The below strings can be quite long, so it would be nice to wrap * them. Can wait for later, as the main branch has ready code to do all