NVIDIA · ghost · Mar 6, 2023 · Mar 7, 2023 · Mar 8, 2023 · Mar 8, 2023
diff --git a/c/include/nvtx3/nvToolsExtMem.h b/c/include/nvtx3/nvToolsExtMem.h
diff --git a/c/include/nvtx3/nvToolsExtMemCudaRt.h b/c/include/nvtx3/nvToolsExtMemCudaRt.h
@@ -0,0 +1,152 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+#ifndef NVTOOLSEXTV3_MEM_CUDART_V1
+#define NVTOOLSEXTV3_MEM_CUDART_V1
+
+#include "nvToolsExtMem.h"
+
+#include "cuda.h"
+#include "cuda_runtime.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/** \brief The memory is from a CUDA runtime array.
+ * 
+ * Relevant functions: cudaMallocArray, cudaMalloc3DArray
+ * Also applies to a cudaArray_t obtained from other types such as cudaMipmappedArray_t
+ * 
+ * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
+ * 
+ * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo()
+ * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCudaArrayRangeDesc_t
+ */
+#define NVTX_MEM_TYPE_CUDA_ARRAY 0x11 
+
+/** \brief Structure describing a range of memory inside a CUDA runtime array object (cudaArray_t).
+ */
+typedef struct nvtxMemCudaArrayRangeDesc_v1 
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+    uint32_t reserved0; /* Reserved. NOTE(review): presumably must be set to zero -- confirm */
+    cudaArray_t  src; /* CUDA runtime array object that contains the described memory */
+    size_t offset[3]; /* Start of the range, one entry per dimension. NOTE(review): dimension order and units are not stated here -- confirm */
+    size_t extent[3]; /* Size of the range, one entry per dimension. NOTE(review): dimension order and units are not stated here -- confirm */
+} nvtxMemCudaArrayRangeDesc_v1;
+typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t;
+
+
+/** \brief The memory is from a CUDA driver API array.
+ * 
+ * Relevant functions: cuArrayCreate, cuArray3DCreate
+ * Also applies to a CUarray obtained from other types such as CUmipmappedArray
+ * 
+ * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
+ * 
+ * nvtxMemHeapRegister receives a heapDesc of type CUarray because the description can be retrieved by tools through cuArrayGetDescriptor() / cuArray3DGetDescriptor() (NOTE(review): this line previously named cudaArray_t / cudaArrayGetInfo(), apparently copied from the runtime-array type -- confirm)
+ * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCuArrayRangeDesc_t
+ */
+#define NVTX_MEM_TYPE_CU_ARRAY 0x12
+
+/** \brief Structure describing a range of memory inside a CUDA driver API array object (CUarray).
+ */
+typedef struct nvtxMemCuArrayRangeDesc_v1 
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+    uint32_t reserved0; /* Reserved. NOTE(review): presumably must be set to zero -- confirm */
+    CUarray  src; /* CUDA driver API array object that contains the described memory */
+    size_t offset[3]; /* Start of the range, one entry per dimension. NOTE(review): dimension order and units are not stated here -- confirm */
+    size_t extent[3]; /* Size of the range, one entry per dimension. NOTE(review): dimension order and units are not stated here -- confirm */
+} nvtxMemCuArrayRangeDesc_v1;
+typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t;
+
+/* Reserving 0x2-0xF for more common types */
+
+#define NVTX_MEM_CUDA_PEER_ALL_DEVICES (-1) /* Wildcard accepted as devicePeer by nvtxMemCudaSetPeerAccess; parenthesized so the expansion is always a single operand */
+
+/** \brief Get the process-wide permissions object for CUDA work.
+ * NOTE(review): "process-wide" scope is inferred from the function name -- confirm.
+ * 
+ * This object allows developers to adjust the permissions applied to work executed
+ * on the GPU.  It may be inherited or overridden by a permissions object bound
+ * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
+ * 
+ * Example: change the peer-to-peer access permissions between devices in their
+ * entirety, or punch through special holes.
+ * 
+ * By default, all memory that would naturally be accessible to a CUDA kernel is
+ * accessible, until modified by nvtxMemCudaSetPeerAccess or by changing regions.
+ * 
+ * This object should also represent the CUDA driver API level context.
+*/
+NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetProcessWidePermissions(
+                                                                nvtxDomainHandle_t domain);
+
+/** \brief Get the permissions object that represents a CUDA runtime device
+ * or CUDA driver context
+ * 
+ * This object allows developers to adjust the permissions applied to work executed
+ * on the GPU.  It may be inherited or overridden by a permissions object bound
+ * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
+ * 
+ * Example: change the peer-to-peer access permissions between devices in their
+ * entirety, or punch through special holes.
+ * 
+ * By default, all memory that would naturally be accessible to a CUDA kernel is
+ * accessible, until modified by nvtxMemCudaSetPeerAccess or by changing regions.
+ * 
+ * This object should also represent the CUDA driver API level context.
+*/
+NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetDeviceWidePermissions(
+                                                                nvtxDomainHandle_t domain,    
+                                                                int device); /* NOTE(review): presumably a device number such as from cudaGetDevice() -- confirm */
+
+
+/** \brief Change the default behavior for all memory mapped in from a particular device.
+ * 
+ * While typically all memory defaults to readable and writable, users may desire to limit
+ * access to reduced default permissions, such as read-only, on a per-device basis.
+ * 
+ * Regions can be used to further override smaller windows of memory.
+ * 
+ * devicePeer can be NVTX_MEM_CUDA_PEER_ALL_DEVICES
+ * 
+*/
+NVTX_DECLSPEC void NVTX_API  nvtxMemCudaSetPeerAccess( 
+    nvtxDomainHandle_t domain,
+    nvtxMemPermissionsHandle_t permissions, 
+    int devicePeer, /* device number such as from cudaGetDevice() or NVTX_MEM_CUDA_PEER_ALL_DEVICES */
+    uint32_t flags ); /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */
+
+
+/** @} */ /*END defgroup*/
+
+#ifdef __GNUC__
+#pragma GCC visibility push(internal)
+#endif
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_EXT_IMPL_MEM_CUDART_GUARD /* Ensure other headers cannot be included directly */
+#include "nvtxExtDetail/nvtxExtImplMemCudaRt1.h"
+#undef NVTX_EXT_IMPL_MEM_CUDART_GUARD
+#endif /*NVTX_NO_IMPL*/
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXTV3_MEM_CUDART_V1 */
diff --git a/c/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h b/c/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h
@@ -0,0 +1,139 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#ifndef NVTX_EXT_IMPL_H
+#define NVTX_EXT_IMPL_H
+/* ---- Include required platform headers ---- */
+
+#if defined(_WIN32) 
+
+#include <Windows.h>
+
+#else
+#include <unistd.h>
+
+#if defined(__ANDROID__)
+#include <android/api-level.h> 
+#endif
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#include <sched.h>
+#endif
+
+#include <limits.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <string.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#endif
+
+/* ---- Define macros used in this file ---- */
+
+#ifdef NVTX_DEBUG_PRINT /* Debug logging on: NVTX_ERR/NVTX_INFO take printf-style arguments and expand to one call expression with no trailing ';' */
+#ifdef __ANDROID__ /* Android: messages go to the Android log under the "NVTOOLSEXT" tag */
+#include <android/log.h>
+#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__)
+#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__)
+#else /* Other platforms: messages go to stderr; the first argument must be a string literal because it is concatenated with the prefix */
+#include <stdio.h>
+#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
+#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
+#endif
+#else /* !defined(NVTX_DEBUG_PRINT): both macros expand to nothing */
+#define NVTX_ERR(...)
+#define NVTX_INFO(...)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* #ifdef __GNUC__ */
+/* #pragma GCC visibility push(hidden) */
+/* #endif */
+
+#define NVTX_EXTENSION_FRESH 0 /* NOTE(review): these four values look like extension load states; the code that consumes them is not in this file -- confirm */
+#define NVTX_EXTENSION_DISABLED 1
+#define NVTX_EXTENSION_STARTING 2
+#define NVTX_EXTENSION_LOADED 3
+
+typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); /* Callback type: takes a 32-bit export function id and returns an intptr_t */
+
+typedef struct nvtxExtModuleSegment_t /* Describes one segment of slots; referenced from nvtxExtModuleInfo_t::segments */
+{
+    size_t segmentId; /* Identifier of this segment */
+    size_t slotCount; /* NOTE(review): presumably the number of entries reachable through `slots` -- confirm */
+    intptr_t* slots; /* Pointer to the segment's intptr_t slot storage */
+
+} nvtxExtModuleSegment_t;
+
+typedef struct nvtxExtModuleInfo_t /* Module description; a pointer to it is the argument of NvtxExtInitializeInjectionFunc_t */
+{
+    uint16_t nvtxVer; /* NVTX version value */
+    uint16_t structSize; /* NOTE(review): presumably the size of this structure in bytes -- confirm */
+    uint16_t moduleId; /* Identifier of the module */
+    uint16_t compatId; /* Compatibility identifier of the module */
+    size_t segmentsCount; /* NOTE(review): presumably the number of entries in `segments` -- confirm */
+    nvtxExtModuleSegment_t* segments; /* Pointer to the module's segment descriptors */
+    NvtxExtGetExportFunction_t getExportFunction; /* Callback of type NvtxExtGetExportFunction_t */
+} nvtxExtModuleInfo_t;
+
+typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); /* Injection initialization function type: receives the module info and returns an int. NOTE(review): meaning of the return value is not visible here */
+
+/* nvtxExtGlobals1_t is the global storage of slots for function pointers and function tables.
+* Slot ranges are pre-assigned to extensions.
+* Other, potentially larger, globals will be created once there is insufficient room for a new extension.
+*/
+#define NVTX3EXT_GLOBALS1_SLOT_GROUP_ID 1 /* incremented with each new ext global we introduce */
+#define NVTX3EXT_GLOBALS1_SLOT_COUNT 256
+typedef struct nvtxExtGlobals1_t
+{
+    NvtxExtInitializeInjectionFunc_t injectionFnPtr; /* Injection initialization function pointer */
+    size_t slotGroupId; /* Slot group id of this global (see NVTX3EXT_GLOBALS1_SLOT_GROUP_ID) */
+    size_t slotCount; /* Number of slots advertised for this global (see NVTX3EXT_GLOBALS1_SLOT_COUNT) */
+    intptr_t slots[NVTX3EXT_GLOBALS1_SLOT_COUNT]; /* Sized by the macro so the array length and the slot count constant cannot diverge */
+
+} nvtxExtGlobals1_t;
+
+NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) =
+{
+    (NvtxExtInitializeInjectionFunc_t)0, /* injectionFnPtr: null until an injection function is set */
+    NVTX3EXT_GLOBALS1_SLOT_GROUP_ID,     /* slotGroupId: named constant instead of a bare 1 */
+    NVTX3EXT_GLOBALS1_SLOT_COUNT,        /* slotCount */
+    {0}                                  /* slots: all entries start at zero */
+};
+
+
+
+#define NVTX_EXT_INIT_GUARD
+#include "nvtxExtInit.h"
+#undef NVTX_EXT_INIT_GUARD
+
+/* #ifdef __GNUC__ */
+/* #pragma GCC visibility pop */
+/* #endif */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* NVTX_EXT_IMPL_H */