From 35d0c7310ea06d8d1528b5b7de68950e29361ace Mon Sep 17 00:00:00 2001 From: Julia Jiang Date: Thu, 3 Nov 2022 23:02:08 -0400 Subject: [PATCH] SWDEV-354898 - HIP documents patch for 5.4 release. Change-Id: I52a20f69775ad06672321fdaa114dbee815a9838 --- docs/markdown/hip_debugging.md | 2 +- docs/markdown/hip_kernel_language.md | 16 +- docs/markdown/hip_programming_guide.md | 3 - include/hip/hip_runtime_api.h | 237 ++++++++++++------------- 4 files changed, 130 insertions(+), 128 deletions(-) diff --git a/docs/markdown/hip_debugging.md b/docs/markdown/hip_debugging.md index b26915c50e..c6e857a90f 100644 --- a/docs/markdown/hip_debugging.md +++ b/docs/markdown/hip_debugging.md @@ -262,7 +262,7 @@ The following is the summary of the most useful environment variables in HIP. | AMD_SERIALIZE_COPY
Serialize copies. | 0 | 1: Wait for completion before enqueue.
2: Wait for completion after enqueue.
3: Both. | | HIP_HOST_COHERENT
Coherent memory in hipHostMalloc. | 0 | 0: memory is not coherent between host and GPU.
1: memory is coherent with host. | | AMD_DIRECT_DISPATCH
Enable direct kernel dispatch. | 1 | 0: Disable.
1: Enable. | - +| GPU_MAX_HW_QUEUES
The maximum number of hardware queues allocated per device. | 4 | The variable controls how many independent hardware queues HIP runtime can create per process, per device. If application allocates more HIP streams than this number, then HIP runtime will reuse the same hardware queues for the new streams in round robin manner. Please note, this maximum number does not apply to either hardware queues that are created for CU masked HIP streams, or cooperative queue for HIP Cooperative Groups (there is only one single queue per device). | ## General Debugging Tips - 'gdb --args' can be used to conveniently pass the executable and arguments to gdb. diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md index c4cc2301a6..2b4f7e9cc8 100644 --- a/docs/markdown/hip_kernel_language.md +++ b/docs/markdown/hip_kernel_language.md @@ -455,9 +455,9 @@ Following is the list of supported integer intrinsics. Note that intrinsics are | unsigned int __popcll ( unsigned long long int x )
Count the number of bits that are set to 1 in a 64 bit integer. | | int __mul24 ( int x, int y )
Multiply two 24bit integers. | | unsigned int __umul24 ( unsigned int x, unsigned int y )
Multiply two 24bit unsigned integers. | -[1] +[1] The HIP-Clang implementation of __ffs() and __ffsll() contains code to add a constant +1 to produce the ffs result format. -For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform, +For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform, HIP-Clang provides __lastbit_u32_u32(unsigned int input) and __lastbit_u32_u64(unsigned long long int input). The index returned by __lastbit_ instructions starts at -1, while for ffs the index starts at 0. @@ -496,6 +496,18 @@ long long int clock64() ``` Returns the value of counter that is incremented every clock cycle on device. Difference in values returned provides the cycles used. +``` +long long int wall_clock64() +``` +Returns wall clock count at a constant frequency on the device, which can be queried via HIP API with hipDeviceAttributeWallClockRate attribute of the device in HIP application code, for example, +``` +int wallClkRate = 0; //in kilohertz +HIPCHECK(hipDeviceGetAttribute(&wallClkRate, hipDeviceAttributeWallClockRate, deviceId)); +``` +Where hipDeviceAttributeWallClockRate is a device attribute. +Note that, wall clock frequency is a per-device attribute. + + ## Atomic Functions Atomic functions execute as read-modify-write operations residing in global or shared memory. No other device or thread can observe or modify the memory location during an atomic operation. If multiple instructions from different devices or threads target the same memory location, the instructions are serialized in an undefined order. diff --git a/docs/markdown/hip_programming_guide.md b/docs/markdown/hip_programming_guide.md index 1fc966e898..507a72c502 100644 --- a/docs/markdown/hip_programming_guide.md +++ b/docs/markdown/hip_programming_guide.md @@ -102,9 +102,6 @@ A stronger system-level fence can be specified when the event is created with hi - hipEventReleaseToSystem : Perform a system-scope release operation when the event is recorded.  This will make both Coherent and Non-Coherent host memory visible to other agents in the system, but may involve heavyweight operations such as cache flushing.  Coherent memory will typically use lighter-weight in-kernel synchronization mechanisms such as an atomic operation and thus does not need to use hipEventReleaseToSystem. - hipEventDisableTiming: Events created with this flag would not record profiling data and provide best performance if used for synchronization. -Note, for HIP Events used in kernel dispatch using hipExtLaunchKernelGGL/hipExtLaunchKernel, events passed in the API are not explicitly recorded and should only be used to get elapsed time for that specific launch. -In case events are used across multiple dispatches, for example, start and stop events from different hipExtLaunchKernelGGL/hipExtLaunchKernel calls, they will be treated as invalid unrecorded events, HIP will throw error "hipErrorInvalidHandle" from hipEventElapsedTime. - ### Summary and Recommendations: - Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points. This memory allows in-kernel synchronization commands such as threadfence_system to work transparently. diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 1d8a6cc731..1967f4734e 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -1208,6 +1208,102 @@ typedef enum hipGraphInstantiateFlags { 1, ///< Automatically free memory allocated in a graph before relaunching. } hipGraphInstantiateFlags; +/** + * Memory allocation properties + */ +typedef struct hipMemAllocationProp { + hipMemAllocationType type; ///< Memory allocation type + hipMemAllocationHandleType requestedHandleType; ///< Requested handle type + hipMemLocation location; ///< Memory location + void* win32HandleMetaData; ///< Metadata for Win32 handles + struct { + unsigned char compressionType; ///< Compression type + unsigned char gpuDirectRDMACapable; ///< RDMA capable + unsigned short usage; ///< Usage + } allocFlags; +} hipMemAllocationProp; + +/** + * Generic handle for memory allocation + */ +typedef struct ihipMemGenericAllocationHandle* hipMemGenericAllocationHandle_t; + +/** + * @brief Flags for granularity + * @enum + * @ingroup Enumerations + */ +typedef enum hipMemAllocationGranularity_flags { + hipMemAllocationGranularityMinimum = 0x0, ///< Minimum granularity + hipMemAllocationGranularityRecommended = 0x1 ///< Recommended granularity for performance +} hipMemAllocationGranularity_flags; + +/** + * @brief Memory handle type + * @enum + * @ingroup Enumerations + */ +typedef enum hipMemHandleType { + hipMemHandleTypeGeneric = 0x0 ///< Generic handle type +} hipMemHandleType; + +/** + * @brief Memory operation types + * @enum + * @ingroup Enumerations + */ +typedef enum hipMemOperationType { + hipMemOperationTypeMap = 0x1, ///< Map operation + hipMemOperationTypeUnmap = 0x2 ///< Unmap operation +} hipMemOperationType; + +/** + * @brief Subresource types for sparse arrays + * @enum + * @ingroup Enumerations + */ +typedef enum hipArraySparseSubresourceType { + hipArraySparseSubresourceTypeSparseLevel = 0x0, ///< Sparse level + hipArraySparseSubresourceTypeMiptail = 0x1 ///< Miptail +} hipArraySparseSubresourceType; + +/** + * Map info for arrays + */ +typedef struct hipArrayMapInfo { + hipResourceType resourceType; ///< Resource type + union { + hipMipmappedArray mipmap; + hipArray_t array; + } resource; + hipArraySparseSubresourceType subresourceType; ///< Sparse subresource type + union { + struct { + unsigned int level; ///< For mipmapped arrays must be a valid mipmap level. For arrays must be zero + unsigned int layer; ///< For layered arrays must be a valid layer index. Otherwise, must be zero + unsigned int offsetX; ///< X offset in elements + unsigned int offsetY; ///< Y offset in elements + unsigned int offsetZ; ///< Z offset in elements + unsigned int extentWidth; ///< Width in elements + unsigned int extentHeight; ///< Height in elements + unsigned int extentDepth; ///< Depth in elements + } sparseLevel; + struct { + unsigned int layer; ///< For layered arrays must be a valid layer index. Otherwise, must be zero + unsigned long long offset; ///< Offset within mip tail + unsigned long long size; ///< Extent in bytes + } miptail; + } subresource; + hipMemOperationType memOperationType; ///< Memory operation type + hipMemHandleType memHandleType; ///< Memory handle type + union { + hipMemGenericAllocationHandle_t memHandle; + } memHandle; + unsigned long long offset; ///< Offset within the memory + unsigned int deviceBitMask; ///< Device ordinal bit mask + unsigned int flags; ///< flags for future use, must be zero now. + unsigned int reserved[2]; ///< Reserved for future use, must be zero now. +} hipArrayMapInfo; // Doxygen end group GlobalDefs /** @} */ //------------------------------------------------------------------------------------------------- @@ -1246,7 +1342,7 @@ hipError_t hipInit(unsigned int flags); * * @param [out] driverVersion * - * @returns #hipSuccess, #hipErrorInavlidValue + * @returns #hipSuccess, #hipErrorInvalidValue * * @warning The HIP feature set does not correspond to an exact CUDA SDK driver revision. * This function always set *driverVersion to 4 as an approximation though HIP supports @@ -1262,7 +1358,7 @@ hipError_t hipDriverGetVersion(int* driverVersion); * * @param [out] runtimeVersion * - * @returns #hipSuccess, #hipErrorInavlidValue + * @returns #hipSuccess, #hipErrorInvalidValue * * @warning The version definition of HIP runtime is different from CUDA. * On AMD platform, the function returns HIP runtime version, @@ -1277,7 +1373,7 @@ hipError_t hipRuntimeGetVersion(int* runtimeVersion); * @param [out] device * @param [in] ordinal * - * @returns #hipSuccess, #hipErrorInavlidDevice + * @returns #hipSuccess, #hipErrorInvalidDevice */ hipError_t hipDeviceGet(hipDevice_t* device, int ordinal); @@ -1287,7 +1383,7 @@ hipError_t hipDeviceGet(hipDevice_t* device, int ordinal); * @param [out] minor * @param [in] device * - * @returns #hipSuccess, #hipErrorInavlidDevice + * @returns #hipSuccess, #hipErrorInvalidDevice */ hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device); /** @@ -1296,7 +1392,7 @@ hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device * @param [in] len * @param [in] device * - * @returns #hipSuccess, #hipErrorInavlidDevice + * @returns #hipSuccess, #hipErrorInvalidDevice */ hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device); /** @@ -1308,7 +1404,7 @@ hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device); * it is still open to changes and may have outstanding issues. * * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue, #hipErrorNotInitialized, - * #hipErrorDeInitialized + * #hipErrorDeinitialized */ hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device); /** @@ -1318,7 +1414,7 @@ hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device); * @param [in] srcDevice * @param [in] dstDevice * - * @returns #hipSuccess, #hipErrorInavlidDevice + * @returns #hipSuccess, #hipErrorInvalidDevice */ hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, int srcDevice, int dstDevice); @@ -1328,7 +1424,7 @@ hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, * @param [in] len * @param [in] device * - * @returns #hipSuccess, #hipErrorInavlidDevice + * @returns #hipSuccess, #hipErrorInvalidDevice */ hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device); /** @@ -1336,7 +1432,7 @@ hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device); * @param [out] device handle * @param [in] PCI Bus ID * - * @returns #hipSuccess, #hipErrorInavlidDevice, #hipErrorInvalidValue + * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue */ hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId); /** @@ -1344,7 +1440,7 @@ hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId); * @param [out] bytes * @param [in] device * - * @returns #hipSuccess, #hipErrorInavlidDevice + * @returns #hipSuccess, #hipErrorInvalidDevice */ hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device); // doxygen end initialization @@ -1700,8 +1796,9 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr); * hipErrorInvalidHandle, * hipErrorTooManyPeers * - * @note No guarantees are made about the address returned in @p *devPtr. - * In particular, multiple processes may not receive the same address for the same @p handle. + * @note During multiple processes, using the same memory handle opened by the current context, + * there is no guarantee that the same device poiter will be returned in @p *devPtr. + * This is diffrent from CUDA. * */ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags); @@ -2373,13 +2470,6 @@ hipError_t hipEventSynchronize(hipEvent_t event); * recorded on one or both events (that is, hipEventQuery() would return #hipErrorNotReady on at * least one of the events), then #hipErrorNotReady is returned. * - * Note, for HIP Events used in kernel dispatch using hipExtLaunchKernelGGL/hipExtLaunchKernel, - * events passed in hipExtLaunchKernelGGL/hipExtLaunchKernel are not explicitly recorded and should - * only be used to get elapsed time for that specific launch. In case events are used across - * multiple dispatches, for example, start and stop events from different hipExtLaunchKernelGGL/ - * hipExtLaunchKernel calls, they will be treated as invalid unrecorded events, HIP will throw - * error "hipErrorInvalidHandle" from hipEventElapsedTime. - * * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord, * hipEventSynchronize */ @@ -3326,7 +3416,7 @@ hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, * @param[in] src Data being copy from * @param[in] sizeBytes Data size in bytes * - * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, * #hipErrorInvalidValue * * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -3344,7 +3434,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes); * @param[in] src Data being copy from * @param[in] sizeBytes Data size in bytes * - * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, * #hipErrorInvalidValue * * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -3362,7 +3452,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes); * @param[in] src Data being copy from * @param[in] sizeBytes Data size in bytes * - * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, * #hipErrorInvalidValue * * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -3380,7 +3470,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte * @param[in] src Data being copy from * @param[in] sizeBytes Data size in bytes * - * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, * #hipErrorInvalidValue * * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -3398,7 +3488,7 @@ hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, h * @param[in] src Data being copy from * @param[in] sizeBytes Data size in bytes * - * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, * #hipErrorInvalidValue * * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -3416,7 +3506,7 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h * @param[in] src Data being copy from * @param[in] sizeBytes Data size in bytes * - * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, * #hipErrorInvalidValue * * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, @@ -6392,103 +6482,6 @@ hipError_t hipGraphReleaseUserObject(hipGraph_t graph, hipUserObject_t object, u */ -/** - * Memory allocation properties - */ -typedef struct hipMemAllocationProp { - hipMemAllocationType type; ///< Memory allocation type - hipMemAllocationHandleType requestedHandleType; ///< Requested handle type - hipMemLocation location; ///< Memory location - void* win32HandleMetaData; ///< Metadata for Win32 handles - struct { - unsigned char compressionType; ///< Compression type - unsigned char gpuDirectRDMACapable; ///< RDMA capable - unsigned short usage; ///< Usage - } allocFlags; -} hipMemAllocationProp; - -/** - * Generic handle for memory allocation - */ -typedef struct ihipMemGenericAllocationHandle* hipMemGenericAllocationHandle_t; - -/** - * @brief Flags for granularity - * @enum - * @ingroup Enumerations - */ -typedef enum hipMemAllocationGranularity_flags { - hipMemAllocationGranularityMinimum = 0x0, ///< Minimum granularity - hipMemAllocationGranularityRecommended = 0x1 ///< Recommended granularity for performance -} hipMemAllocationGranularity_flags; - -/** - * @brief Memory handle type - * @enum - * @ingroup Enumerations - */ -typedef enum hipMemHandleType { - hipMemHandleTypeGeneric = 0x0 ///< Generic handle type -} hipMemHandleType; - -/** - * @brief Memory operation types - * @enum - * @ingroup Enumerations - */ -typedef enum hipMemOperationType { - hipMemOperationTypeMap = 0x1, ///< Map operation - hipMemOperationTypeUnmap = 0x2 ///< Unmap operation -} hipMemOperationType; - -/** - * @brief Subresource types for sparse arrays - * @enum - * @ingroup Enumerations - */ -typedef enum hipArraySparseSubresourceType { - hipArraySparseSubresourceTypeSparseLevel = 0x0, ///< Sparse level - hipArraySparseSubresourceTypeMiptail = 0x1 ///< Miptail -} hipArraySparseSubresourceType; - -/** - * Map info for arrays - */ -typedef struct hipArrayMapInfo { - hipResourceType resourceType; ///< Resource type - union { - hipMipmappedArray mipmap; - hipArray_t array; - } resource; - hipArraySparseSubresourceType subresourceType; ///< Sparse subresource type - union { - struct { - unsigned int level; ///< For mipmapped arrays must be a valid mipmap level. For arrays must be zero - unsigned int layer; ///< For layered arrays must be a valid layer index. Otherwise, must be zero - unsigned int offsetX; ///< X offset in elements - unsigned int offsetY; ///< Y offset in elements - unsigned int offsetZ; ///< Z offset in elements - unsigned int extentWidth; ///< Width in elements - unsigned int extentHeight; ///< Height in elements - unsigned int extentDepth; ///< Depth in elements - } sparseLevel; - struct { - unsigned int layer; ///< For layered arrays must be a valid layer index. Otherwise, must be zero - unsigned long long offset; ///< Offset within mip tail - unsigned long long size; ///< Extent in bytes - } miptail; - } subresource; - hipMemOperationType memOperationType; ///< Memory operation type - hipMemHandleType memHandleType; ///< Memory handle type - union { - hipMemGenericAllocationHandle_t memHandle; - } memHandle; - unsigned long long offset; ///< Offset within the memory - unsigned int deviceBitMask; ///< Device ordinal bit mask - unsigned int flags; ///< flags for future use, must be zero now. - unsigned int reserved[2]; ///< Reserved for future use, must be zero now. -} hipArrayMapInfo; - /** *------------------------------------------------------------------------------------------------- *-------------------------------------------------------------------------------------------------