diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 0e0a252eb6..470629cc14 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -14,5 +14,6 @@ updates:
- "documentation"
- "dependencies"
- "ci:docs-only"
+ target-branch: "docs/develop"
reviewers:
- "samjwu"
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 46cf268483..7e7680e31f 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -17,4 +17,4 @@ on:
jobs:
call-workflow-passing-data:
name: Documentation
- uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@develop
+ uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@local_spellcheck_file
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 02a17b0df0..83379456c9 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,7 +6,7 @@ version: 2
sphinx:
configuration: docs/conf.py
-formats: [htmlzip, pdf, epub]
+formats: []
python:
install:
@@ -18,16 +18,14 @@ build:
python: "3.10"
apt_packages:
- "doxygen"
+ - "gfortran" # For pre-processing fortran sources
- "graphviz" # For dot graphs in doxygen
jobs:
post_checkout:
- - if [ -d ../llvm-project ]; then rmdir ../llvm-project; fi
- - if [ -d ../clr ]; then rmdir ../clr; fi
- - if [ -d ../ROCR-Runtime ]; then rmdir ../ROCR-Runtime; fi
- - git clone --depth=1 --single-branch --branch rocdoc-195 https://github.com/StreamHPC/llvm-project.git ../llvm-project
- - git clone --depth=1 --single-branch --branch develop https://github.com/ROCm/clr.git ../clr
- - git clone --depth=1 --single-branch --branch master https://github.com/ROCm/ROCR-Runtime.git ../ROCR-Runtime
+ - if [ -d ../clr ]; then rm -rf ../clr; fi
+ - if [ -d ../ROCR-Runtime ]; then rm -rf ../ROCR-Runtime; fi
+ - git clone --depth=1 --single-branch --branch docs/develop https://github.com/ROCm/clr.git ../clr
+ - git clone --depth=1 --single-branch --branch master https://github.com/ROCm/ROCR-Runtime.git ../ROCR-Runtime
post_build:
- rm -rf ../clr
- - rm -rf ../llvm-project
- rm -rf ../ROCR-Runtime
diff --git a/.spellcheck_local.yaml b/.spellcheck_local.yaml
new file mode 100644
index 0000000000..64b887d1d5
--- /dev/null
+++ b/.spellcheck_local.yaml
@@ -0,0 +1,4 @@
+matrix:
+- name: Markdown
+ sources:
+ - 'docs/**/*.md'
diff --git a/.wordlist.txt b/.wordlist.txt
index 45af247c0d..6edde54d60 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -3,6 +3,7 @@ ALUs
AmgX
APU
AQL
+AXPY
Asynchrony
backtrace
Bitcode
@@ -12,24 +13,32 @@ builtins
Builtins
CAS
clr
+coroutines
cuBLASLt
cuCtx
cuDNN
+dataflow
deallocate
+decompositions
denormal
+Dereferencing
dll
DirectX
EIGEN
EIGEN's
enqueue
enqueues
+entrypoint
+entrypoints
enum
embeded
extern
fatbinary
+foundationally
frontends
gedit
GPGPU
+GWS
hardcoded
HC
HIP's
@@ -40,12 +49,15 @@ hipother
HIPRTC
hcBLAS
icc
+IILE
+iGPU
inplace
Interoperation
interoperate
Intrinsics
intrinsics
IPC
+IPs
isa
Lapack
latencies
@@ -65,28 +77,40 @@ multithreading
NCCL
NDRange
nonnegative
+NOP
Numa
Nsight
+overindex
+overindexing
oversubscription
+pragmas
preconditioners
prefetched
preprocessor
PTX
PyHIP
queryable
+prefetching
representable
RMW
ROCm's
rocTX
RTC
RTTI
+rvalue
+SAXPY
scalarizing
sceneries
+shaders
SIMT
SPMV
structs
SYCL
syntaxes
+tradeoffs
+templated
typedefs
+UMM
+variadic
WinGDB
-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
\ No newline at end of file
+zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
diff --git a/README.md b/README.md
index f651518d35..e7037c7fdb 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ HIP releases are typically naming convention for each ROCM release to help diffe
* [Installation](docs/install/install.rst)
* [HIP FAQ](docs/how-to/faq.md)
-* [HIP Kernel Language](docs/reference/kernel_language.rst)
+* [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst)
* [HIP Porting Guide](docs/how-to/hip_porting_guide.md)
* [HIP Porting Driver Guide](docs/how-to/hip_porting_driver_api.md)
* [HIP Programming Guide](docs/how-to/programming_manual.md)
@@ -88,7 +88,7 @@ hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost);
The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors,
atomics, and timer functions.
-It also specifies additional defines and keywords for function types, address spaces, and optimization controls (See the [HIP Kernel Language](docs/reference/kernel_language.rst) for a full description).
+It also specifies additional defines and keywords for function types, address spaces, and optimization controls (See the [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst) for a full description).
Here's an example of defining a simple 'vector_square' kernel.
```cpp
diff --git a/docs/conf.py b/docs/conf.py
index 3dec52d636..82bcefee89 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -43,4 +43,12 @@
cpp_id_attributes = ["__global__", "__device__", "__host__", "__forceinline__", "static"]
cpp_paren_attributes = ["__declspec"]
-suppress_warnings = ["etoc.toctree"]
\ No newline at end of file
+suppress_warnings = ["etoc.toctree"]
+
+numfig = False
+
+
+exclude_patterns = [
+ "doxygen/mainpage.md",
+ "understand/glossary.md"
+]
\ No newline at end of file
diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio
new file mode 100644
index 0000000000..4f1ff494f2
--- /dev/null
+++ b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio
@@ -0,0 +1,904 @@
+[draw.io XML content elided]
\ No newline at end of file
diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg
new file mode 100644
index 0000000000..298cd48218
--- /dev/null
+++ b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg
@@ -0,0 +1 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio
similarity index 98%
rename from docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio
rename to docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio
index fb4c19fef9..e4c0c90d2d 100644
--- a/docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio
+++ b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio
@@ [draw.io XML changes elided] @@
diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg
new file mode 100644
index 0000000000..ebe4794576
--- /dev/null
+++ b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg
@@ -0,0 +1 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/conflict_free_reduction.drawio b/docs/data/tutorial/reduction/conflict_free_reduction.drawio
new file mode 100644
index 0000000000..b1f0b51074
--- /dev/null
+++ b/docs/data/tutorial/reduction/conflict_free_reduction.drawio
@@ -0,0 +1,448 @@
+[draw.io XML content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/conflict_free_reduction.svg b/docs/data/tutorial/reduction/conflict_free_reduction.svg
new file mode 100644
index 0000000000..71eb0660ed
--- /dev/null
+++ b/docs/data/tutorial/reduction/conflict_free_reduction.svg
@@ -0,0 +1 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/foldl.drawio b/docs/data/tutorial/reduction/foldl.drawio
new file mode 100644
index 0000000000..1d5228da9e
--- /dev/null
+++ b/docs/data/tutorial/reduction/foldl.drawio
@@ -0,0 +1,142 @@
+[draw.io XML content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/foldl.svg b/docs/data/tutorial/reduction/foldl.svg
new file mode 100644
index 0000000000..7603080193
--- /dev/null
+++ b/docs/data/tutorial/reduction/foldl.svg
@@ -0,0 +1 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/naive_reduction.drawio b/docs/data/tutorial/reduction/naive_reduction.drawio
new file mode 100644
index 0000000000..b186c58aad
--- /dev/null
+++ b/docs/data/tutorial/reduction/naive_reduction.drawio
@@ -0,0 +1,442 @@
+[draw.io XML content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/naive_reduction.svg b/docs/data/tutorial/reduction/naive_reduction.svg
new file mode 100644
index 0000000000..922bfff1e9
--- /dev/null
+++ b/docs/data/tutorial/reduction/naive_reduction.svg
@@ -0,0 +1 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/parallel_foldl.drawio b/docs/data/tutorial/reduction/parallel_foldl.drawio
new file mode 100644
index 0000000000..6b04c73cc2
--- /dev/null
+++ b/docs/data/tutorial/reduction/parallel_foldl.drawio
@@ -0,0 +1,142 @@
+[draw.io XML content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/parallel_foldl.svg b/docs/data/tutorial/reduction/parallel_foldl.svg
new file mode 100644
index 0000000000..d5edb0accb
--- /dev/null
+++ b/docs/data/tutorial/reduction/parallel_foldl.svg
@@ -0,0 +1 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/reduced_divergence_reduction.drawio b/docs/data/tutorial/reduction/reduced_divergence_reduction.drawio
new file mode 100644
index 0000000000..0f1bd277ad
--- /dev/null
+++ b/docs/data/tutorial/reduction/reduced_divergence_reduction.drawio
@@ -0,0 +1,442 @@
+[draw.io XML content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/reduced_divergence_reduction.svg b/docs/data/tutorial/reduction/reduced_divergence_reduction.svg
new file mode 100644
index 0000000000..9661e05115
--- /dev/null
+++ b/docs/data/tutorial/reduction/reduced_divergence_reduction.svg
@@ -0,0 +1 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/warp_reduction.drawio b/docs/data/tutorial/reduction/warp_reduction.drawio
new file mode 100644
index 0000000000..583f90cdd2
--- /dev/null
+++ b/docs/data/tutorial/reduction/warp_reduction.drawio
@@ -0,0 +1,421 @@
+[draw.io XML content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/warp_reduction.svg b/docs/data/tutorial/reduction/warp_reduction.svg
new file mode 100644
index 0000000000..ec8d0a829b
--- /dev/null
+++ b/docs/data/tutorial/reduction/warp_reduction.svg
@@ -0,0 +1,2 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/warp_reduction_with_shared.drawio b/docs/data/tutorial/reduction/warp_reduction_with_shared.drawio
new file mode 100644
index 0000000000..338407f45e
--- /dev/null
+++ b/docs/data/tutorial/reduction/warp_reduction_with_shared.drawio
@@ -0,0 +1,707 @@
+[draw.io XML content elided]
\ No newline at end of file
diff --git a/docs/data/tutorial/reduction/warp_reduction_with_shared.svg b/docs/data/tutorial/reduction/warp_reduction_with_shared.svg
new file mode 100644
index 0000000000..65b6d642b8
--- /dev/null
+++ b/docs/data/tutorial/reduction/warp_reduction_with_shared.svg
@@ -0,0 +1,3 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg b/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg
deleted file mode 100644
index a3f57994fb..0000000000
--- a/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg
+++ /dev/null
@@ -1 +0,0 @@
-[SVG content elided]
\ No newline at end of file
diff --git a/docs/data/unified_memory/um.drawio b/docs/data/unified_memory/um.drawio
new file mode 100644
index 0000000000..fac74f4b60
--- /dev/null
+++ b/docs/data/unified_memory/um.drawio
@@ -0,0 +1,1878 @@
+[draw.io XML content elided]
diff --git a/docs/data/unified_memory/um.svg b/docs/data/unified_memory/um.svg
new file mode 100644
index 0000000000..748949b271
--- /dev/null
+++ b/docs/data/unified_memory/um.svg
@@ -0,0 +1,4 @@
+[SVG content elided]
\ No newline at end of file
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index d4bb54bd5d..b0e7231aa6 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -832,7 +832,8 @@ WARN_LOGFILE =
INPUT = mainpage.md \
../../include/hip \
../../../clr/hipamd/include/hip/amd_detail/amd_hip_gl_interop.h \
- ../../../llvm-project/clang/lib/Headers/__clang_hip_math.h \
+ ../../../clr/hipamd/include/hip/amd_detail/amd_surface_functions.h \
+ ../../../clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h \
../../../ROCR-Runtime/src/inc/hsa_ext_amd.h
# This tag can be used to specify the character encoding of the source files
@@ -2195,8 +2196,18 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-PREDEFINED = __HIP_PLATFORM_AMD__ \
- __dparm(x)=
+PREDEFINED = "__HIP_PLATFORM_AMD__" \
+ "DOXYGEN_SHOULD_INCLUDE_THIS=1" \
+ "DOXYGEN_SHOULD_SKIP_THIS=1" \
+ "__dparm(x)=" \
+ "__cplusplus=201103L" \
+ "__host__=" \
+ "__device__=" \
+ "__hip_img_chk__=" \
+ "__CG_QUALIFIER__=" \
+ "__CG_STATIC_QUALIFIER__=static" \
+ "_CG_STATIC_CONST_DECL_=static constexpr" \
+ "HIP_ENABLE_WARP_SYNC_BUILTINS"
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
diff --git a/docs/how-to/cooperative_groups.rst b/docs/how-to/cooperative_groups.rst
new file mode 100644
index 0000000000..9568ff9f9d
--- /dev/null
+++ b/docs/how-to/cooperative_groups.rst
@@ -0,0 +1,490 @@
+.. meta::
+ :description: This topic describes how to use cooperative groups in HIP
+ :keywords: AMD, ROCm, HIP, cooperative groups
+
+.. _cooperative_groups_how-to:
+
+*******************************************************************************
+Cooperative groups
+*******************************************************************************
+
+The cooperative groups API is an extension to the HIP programming model that provides developers with a flexible, dynamic grouping mechanism for communicating threads. Cooperative groups let you define your own sets of thread groups, which may fit your use cases better than those defined by the hardware. This lets you specify the level of granularity for thread communication, which can lead to more efficient parallel decompositions.
+
+The API is accessible in the ``cooperative_groups`` namespace after the ``hip_cooperative_groups.h`` header is included. The header contains the following elements:
+
+* Static functions to create groups and subgroups.
+* Hardware-accelerated operations over the whole group, like shuffles.
+* Data types of cooperative groups.
+* Member functions to synchronize groups.
+* Member functions to query group properties.
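+
+For example, a minimal setup (a sketch; the ``cg`` namespace alias is a common
+convention rather than an API requirement):
+
+.. code-block:: cpp
+
+   #include <hip/hip_cooperative_groups.h>
+
+   namespace cg = cooperative_groups;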
+
+Cooperative groups thread model
+===============================
+
+The thread hierarchy abstraction of cooperative groups is shown in the :ref:`grid hierarchy <coop_thread_top_hierarchy>` and :ref:`block hierarchy <coop_thread_bottom_hierarchy>` figures.
+
+.. _coop_thread_top_hierarchy:
+
+.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg
+ :alt: Diagram depicting nested rectangles of varying color. The outermost one
+ titled "Grid", inside sets of different sized rectangles layered on
+ one another titled "Block". Each "Block" containing sets of uniform
+ rectangles layered on one another titled "Warp". Each of the "Warp"
+ titled rectangles filled with downward pointing arrows inside.
+
+ Cooperative group thread hierarchy in grids.
+
+The **multi grid** is an abstraction of potentially multiple simultaneous launches of the same kernel over multiple devices (deprecated since 5.0). The **grid** in cooperative groups is a single dispatch of kernels for execution, like the original grid.
+
+.. note::
+
+ The ability to synchronize over a grid or multi grid requires the kernel to be launched using the specific cooperative groups API.
+
+The **block** is the same as the :ref:`inherent_thread_model` block entity.
+
+.. note::
+
+ Explicit warp-level thread handling is absent from the cooperative groups API. To exploit the known hardware SIMD width, where built-in functionality translates to simpler logic, you can use the group partitioning part of the API, such as ``tiled_partition``.
+
+.. _coop_thread_bottom_hierarchy:
+
+.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg
+ :alt: Diagram depicting the new hierarchy level introduced between thread blocks and threads.
+
+ Cooperative group thread hierarchy in blocks.
+
+The cooperative groups API introduces a new level between thread blocks and individual threads. The :ref:`thread-block tile <coop_thread_block_tile>` gives you the opportunity to create tiles within a thread block, while the :ref:`coalesced group <coop_coalesced_groups>` holds the active threads of the parent group. These groups are discussed further in the :ref:`group types <coop_group_types>` section.
+
+For details on the memory model, see the :ref:`memory model description `.
+
+.. _coop_group_types:
+
+Group types
+===========
+
+Group types are based on the levels of synchronization and data sharing among threads.
+
+Thread-block group
+------------------
+
+Represents an intra-block cooperative group type where the participating threads within the group are the same threads that participate in the currently executing ``block``.
+
+.. code-block:: cpp
+
+ class thread_block;
+
+Constructed via:
+
+.. code-block:: cpp
+
+ thread_block g = this_thread_block();
+
+The ``group_index()``, ``thread_index()``, ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``, ``sync()`` and ``group_dim()`` member functions are public members of the ``thread_block`` class. For further details, see the :ref:`thread_block references `.
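+
+As a brief illustration, the group object can replace the built-in coordinate
+variables (a minimal sketch; the kernel name is illustrative):
+
+.. code-block:: cpp
+
+   #include <hip/hip_cooperative_groups.h>
+
+   namespace cg = cooperative_groups;
+
+   __global__ void block_info_kernel()
+   {
+       cg::thread_block g = cg::this_thread_block();
+
+       dim3 block_in_grid   = g.group_index();  // equivalent to blockIdx
+       dim3 thread_in_block = g.thread_index(); // equivalent to threadIdx
+
+       unsigned int rank = g.thread_rank(); // rank within the block
+
+       g.sync(); // synchronize the block, like __syncthreads()
+   }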
+
+Grid group
+------------
+
+Represents an inter-block cooperative group type where the group's participating threads span multiple blocks running the same kernel on the same device. Use the cooperative launch API to enable synchronization across the grid group.
+
+.. code-block:: cpp
+
+ class grid_group;
+
+Constructed via:
+
+.. code-block:: cpp
+
+ grid_group g = this_grid();
+
+The ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()`` and ``sync()`` member functions
+are public members of the ``grid_group`` class. For further details, see the :ref:`grid_group references `.
+
+Multi-grid group
+------------------
+
+Represents an inter-device cooperative group type where the participating threads within the group span multiple devices running the same kernel. Use the cooperative launch API to enable synchronization across the multi-grid group.
+
+.. code-block:: cpp
+
+ class multi_grid_group;
+
+Constructed via:
+
+.. code-block:: cpp
+
+ // Kernel must be launched with the cooperative multi-device API
+ multi_grid_group g = this_multi_grid();
+
+The ``num_grids()``, ``grid_rank()``, ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``
+and ``sync()`` member functions are public members of the ``multi_grid_group`` class. For
+further details, see the :ref:`multi_grid_group references `.
+
+.. _coop_thread_block_tile:
+
+Thread-block tile
+------------------
+
+This constructs a templated class derived from ``thread_group``. The template defines the tile
+size of the new thread group at compile time. This group type also supports sub-wave level intrinsics.
+
+.. code-block:: cpp
+
+ template <unsigned int Size>
+ class thread_block_tile;
+
+Constructed via:
+
+.. code-block:: cpp
+
+ template <unsigned int Size, typename ParentT>
+ _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
+
+
+.. note::
+
+ * Size must be a power of 2 and not larger than warp (wavefront) size.
+ * ``shfl()`` functions support integer or float type.
+
+The ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ``meta_group_rank()``, ``meta_group_size()``, ``shfl()``, ``shfl_down()``, ``shfl_up()``, ``shfl_xor()``, ``ballot()``, ``any()``, ``all()``, ``match_any()`` and ``match_all()`` member functions are public members of the ``thread_block_tile`` class. For further details, see the :ref:`thread_block_tile references `.
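+
+A minimal sketch of a tile-level reduction (the helper name ``tile_sum`` is
+illustrative):
+
+.. code-block:: cpp
+
+   #include <hip/hip_cooperative_groups.h>
+
+   namespace cg = cooperative_groups;
+
+   // Sums `val` across a 16-wide tile; lane 0 of each tile ends up
+   // holding the tile's total.
+   __device__ int tile_sum(int val)
+   {
+       cg::thread_block block = cg::this_thread_block();
+       // Tile size must be a power of two and not larger than the warp size.
+       auto tile = cg::tiled_partition<16>(block);
+
+       for (unsigned int offset = tile.size() / 2; offset > 0; offset /= 2) {
+           val += tile.shfl_down(val, offset);
+       }
+       return val;
+   }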
+
+.. _coop_coalesced_groups:
+
+Coalesced groups
+------------------
+
+Threads in a warp (64 threads on CDNA and 32 threads on RDNA) cannot execute different instructions simultaneously, so conditional branches are executed serially within the warp. When threads encounter a conditional branch, they can diverge: threads that do not meet the condition to execute that branch are disabled. The active threads are referred to as coalesced, and a coalesced group represents an active thread group within a warp.
+
+.. note::
+
+ On NVIDIA GPUs, independent thread scheduling presents the appearance that threads on different branches execute concurrently.
+
+.. warning::
+
+ AMD GPUs do not support independent thread scheduling. CUDA applications that rely on this feature can deadlock when ported to HIP and run on AMD GPUs, if they try to make use of independent thread scheduling.
+
+This group type also supports sub-wave level intrinsics.
+
+.. code-block:: cpp
+
+ class coalesced_group;
+
+Constructed via:
+
+.. code-block:: cpp
+
+ coalesced_group active = coalesced_threads();
+
+.. note::
+
+ ``shfl()`` functions support integer or float type.
+
+The ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ``meta_group_rank()``, ``meta_group_size()``, ``shfl()``, ``shfl_down()``, ``shfl_up()``, ``ballot()``, ``any()``, ``all()``, ``match_any()`` and ``match_all()`` member functions are public members of the ``coalesced_group`` class. For more information, see :ref:`coalesced_group references `.
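+
+A minimal sketch of using a coalesced group inside a divergent branch (the
+helper name ``branch_width`` is illustrative):
+
+.. code-block:: cpp
+
+   #include <hip/hip_cooperative_groups.h>
+
+   namespace cg = cooperative_groups;
+
+   // For the threads that take the branch, returns how many lanes of the
+   // warp are active in it; the other threads return 0.
+   __device__ unsigned int branch_width(int predicate)
+   {
+       if (predicate) {
+           // Only the currently active (coalesced) threads form the group.
+           cg::coalesced_group active = cg::coalesced_threads();
+           return active.size();
+       }
+       return 0;
+   }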
+
+Cooperative groups simple example
+=================================
+
+The following tabs show how the ``reduce_sum`` device function differs between the original block model and cooperative groups.
+
+.. tab-set::
+ .. tab-item:: Original Block
+ :sync: original-block
+
+ .. code-block:: cuda
+
+ __device__ int reduce_sum(int *shared, int val) {
+
+ // Thread ID
+ const unsigned int thread_id = threadIdx.x;
+
+ // Every iteration the number of active threads
+ // halves, until we processed all values
+ for(unsigned int i = blockDim.x / 2; i > 0; i /= 2) {
+ // Store value in shared memory with thread ID
+ shared[thread_id] = val;
+
+ // Synchronize all threads
+ __syncthreads();
+
+ // Active thread sum up
+ if(thread_id < i)
+ val += shared[thread_id + i];
+
+ // Synchronize all threads in the group
+ __syncthreads();
+ }
+
+ // ...
+ }
+
+ .. tab-item:: Cooperative Groups
+ :sync: cooperative-groups
+
+ .. code-block:: cuda
+
+ __device__ int reduce_sum(thread_group g,
+ int *shared,
+ int val) {
+
+ // Thread ID
+ const unsigned int group_thread_id = g.thread_rank();
+
+ // Every iteration the number of active threads
+ // halves, until we processed all values
+ for(unsigned int i = g.size() / 2; i > 0; i /= 2) {
+ // Store value in shared memory with thread ID
+ shared[group_thread_id] = val;
+
+ // Synchronize all threads in the group
+ g.sync();
+
+ // Active thread sum up
+ if(group_thread_id < i)
+ val += shared[group_thread_id + i];
+
+ // Synchronize all threads in the group
+ g.sync();
+ }
+
+ // ...
+ }
+
+The following tabs show how the ``reduce_sum()`` function call and the input data initialization differ from the original block model.
+
+.. tab-set::
+ .. tab-item:: Original Block
+ :sync: original-block
+
+ .. code-block:: cuda
+
+ __global__ void sum_kernel(...) {
+
+ // ...
+
+ // Workspace array in shared memory
+ __shared__ unsigned int workspace[2048];
+
+ // ...
+
+ // Perform reduction
+ output = reduce_sum(workspace, input);
+
+ // ...
+ }
+
+ .. tab-item:: Cooperative Groups
+ :sync: cooperative-groups
+
+ .. code-block:: cuda
+
+ __global__ void sum_kernel(...) {
+
+ // ...
+
+ // Workspace array in shared memory
+ __shared__ unsigned int workspace[2048];
+
+ // ...
+
+ // Initialize the thread_block
+ thread_block thread_block_group = this_thread_block();
+ // Perform reduction
+ output = reduce_sum(thread_block_group, workspace, input);
+
+ // ...
+ }
+
+In the device function, the input group type is ``thread_group``, which is the parent class of all cooperative group types. This lets you write generic functions that work with any type of cooperative group.
+
+.. _coop_synchronization:
+
+Synchronization
+===============
+
+Synchronizing each group type requires using the correct cooperative groups launch API.
+
+**Check the kernel launch capability**
+
+.. tab-set::
+ .. tab-item:: Thread-block
+ :sync: thread-block
+
+ Thread-block groups do not require kernel launch validation.
+
+ .. tab-item:: Grid
+ :sync: grid
+
+ Confirm the cooperative launch capability on the single AMD GPU:
+
+ .. code-block:: cpp
+
+ int device = 0;
+ int supports_coop_launch = 0;
+ // Check support
+ // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
+ HIP_CHECK(hipGetDevice(&device));
+ HIP_CHECK(
+ hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device));
+ if(!supports_coop_launch)
+ {
+ std::cout << "Skipping, device " << device << " does not support cooperative groups"
+ << std::endl;
+ return 0;
+ }
+
+ .. tab-item:: Multi-grid
+ :sync: multi-grid
+
+ Confirm the cooperative launch capability over multiple GPUs:
+
+ .. code-block:: cpp
+
+ // Check support of cooperative groups
+ std::vector<int> deviceIDs;
+ for(int deviceID = 0; deviceID < device_count; deviceID++) {
+ #ifdef __HIP_PLATFORM_AMD__
+ int supports_coop_launch = 0;
+ HIP_CHECK(
+ hipDeviceGetAttribute(
+ &supports_coop_launch,
+ hipDeviceAttributeCooperativeMultiDeviceLaunch,
+ deviceID));
+ if(!supports_coop_launch) {
+ std::cout << "Skipping, device " << deviceID << " does not support cooperative groups"
+ << std::endl;
+ }
+ else
+ #endif
+ {
+ std::cout << deviceID << std::endl;
+ // Collect valid deviceIDs.
+ deviceIDs.push_back(deviceID);
+ }
+ }
+
+**Kernel launch**
+
+.. tab-set::
+ .. tab-item:: Thread-block
+ :sync: thread-block
+
+ You can access the new block representation using the original kernel launch methods.
+
+ .. code-block:: cpp
+
+ // Launching kernel from host.
+ // hipLaunchKernelGGL() returns void; check for launch errors afterwards.
+ hipLaunchKernelGGL(vector_reduce_kernel,
+ dim3(num_blocks),
+ dim3(threads_per_block),
+ 0,
+ hipStreamDefault,
+ d_vector,
+ d_block_reduced,
+ d_partition_reduced);
+ HIP_CHECK(hipGetLastError());
+
+ .. tab-item:: Grid
+ :sync: grid
+
+ Launch the cooperative kernel on a single GPU:
+
+ .. code-block:: cpp
+
+ void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced};
+ // Launching kernel from host; arguments are passed through the params array.
+ HIP_CHECK(hipLaunchCooperativeKernel((void*)vector_reduce_kernel,
+ dim3(num_blocks),
+ dim3(threads_per_block),
+ params,
+ 0,
+ hipStreamDefault));
+
+ .. tab-item:: Multi-grid
+ :sync: multi-grid
+
+ Launch the cooperative kernel over multiple GPUs:
+
+ .. code-block:: cpp
+
+ hipLaunchParams *launchParamsList = (hipLaunchParams*)malloc(sizeof(hipLaunchParams) * deviceIDs.size());
+ for(int deviceID : deviceIDs) {
+
+ // Set device
+ HIP_CHECK(hipSetDevice(deviceID));
+
+ // Create stream
+ hipStream_t stream;
+ HIP_CHECK(hipStreamCreate(&stream));
+
+ // Parameters
+ void* params[] = {&(d_vector[deviceID]), &(d_block_reduced[deviceID]), &(d_partition_reduced[deviceID])};
+
+ // Set launchParams
+ launchParamsList[deviceID].func = (void*)vector_reduce_kernel;
+ launchParamsList[deviceID].gridDim = dim3(1);
+ launchParamsList[deviceID].blockDim = dim3(threads_per_block);
+ launchParamsList[deviceID].sharedMem = 0;
+ launchParamsList[deviceID].stream = stream;
+ launchParamsList[deviceID].args = params;
+ }
+
+ HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList,
+ (int)deviceIDs.size(),
+ hipCooperativeLaunchMultiDeviceNoPreSync));
+
+**Device side synchronization**
+
+.. tab-set::
+ .. tab-item:: Thread-block
+ :sync: thread-block
+
+ The device-side code for thread-block synchronization on a single GPU:
+
+ .. code-block:: cpp
+
+ thread_block g = this_thread_block();
+ g.sync();
+
+ .. tab-item:: Grid
+ :sync: grid
+
+ The device-side code for grid synchronization on a single GPU:
+
+ .. code-block:: cpp
+
+ grid_group grid = this_grid();
+ grid.sync();
+
+ .. tab-item:: Multi-grid
+ :sync: multi-grid
+
+ The device-side code for multi-grid synchronization over multiple GPUs:
+
+ .. code-block:: cpp
+
+ multi_grid_group multi_grid = this_multi_grid();
+ multi_grid.sync();
+
+Unsupported NVIDIA CUDA features
+================================
+
+HIP doesn't support the following NVIDIA CUDA optional headers:
+
+* ``cooperative_groups/memcpy_async.h``
+* ``cooperative_groups/reduce.h``
+* ``cooperative_groups/scan.h``
+
+HIP doesn't support the following CUDA class in the ``cooperative_groups`` namespace:
+
+* ``cluster_group``
+
+HIP doesn't support the following CUDA functions/operators in the ``cooperative_groups`` namespace:
+
+* ``synchronize``
+* ``memcpy_async``
+* ``wait`` and ``wait_prior``
+* ``barrier_arrive`` and ``barrier_wait``
+* ``invoke_one`` and ``invoke_one_broadcast``
+* ``reduce``
+* ``reduce_update_async`` and ``reduce_store_async``
+* Reduce operators ``plus`` , ``less`` , ``greater`` , ``bit_and`` , ``bit_xor`` and ``bit_or``
+* ``inclusive_scan`` and ``exclusive_scan``
diff --git a/docs/how-to/faq.md b/docs/how-to/faq.md
index d87357b617..805e5a5ac0 100644
--- a/docs/how-to/faq.md
+++ b/docs/how-to/faq.md
@@ -38,7 +38,7 @@ See the [API Support Table](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs
* Virtual functions, indirect functions and try/catch (CUDA 4.0)
* `__prof_trigger`
* PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly.
-* Several kernel features are under development. See the {doc}`/reference/kernel_language` for more information.
+* Several kernel features are under development. See the {doc}`/reference/cpp_language_extensions` for more information.
## Is HIP a drop-in replacement for CUDA?
diff --git a/docs/how-to/hip_porting_driver_api.md b/docs/how-to/hip_porting_driver_api.md
index d42353b631..57879264a2 100644
--- a/docs/how-to/hip_porting_driver_api.md
+++ b/docs/how-to/hip_porting_driver_api.md
@@ -1,4 +1,4 @@
-# Porting CUDA Driver API
+# Porting CUDA driver API
## Introduction to the CUDA Driver and Runtime APIs
@@ -43,7 +43,7 @@ The CUDA Runtime API unifies the Context API with the Device API. This simplifie
HIP provides a context API to facilitate easy porting from existing Driver codes.
In HIP, the `Ctx` functions largely provide an alternate syntax for changing the active device.
-Most new applications will prefer to use `hipSetDevice` or the stream APIs , therefore HIP has marked `hipCtx` APIs as **deprecated**. Support for these APIs may not be available in future releases. For more details on deprecated APIs please refer [HIP deprecated APIs](https://github.com/ROCm/HIP/blob/develop/docs/reference/deprecated_api_list.md).
+Most new applications will prefer to use `hipSetDevice` or the stream APIs; therefore, HIP has marked the `hipCtx` APIs as **deprecated**. Support for these APIs may not be available in future releases. For more details on deprecated APIs, see [HIP deprecated APIs](../reference/deprecated_api_list).
## HIP Module and `Ctx` APIs
diff --git a/docs/how-to/hip_porting_guide.md b/docs/how-to/hip_porting_guide.md
index 1a51339b66..1bcbd3ea8e 100644
--- a/docs/how-to/hip_porting_guide.md
+++ b/docs/how-to/hip_porting_guide.md
@@ -1,4 +1,4 @@
-# HIP Porting Guide
+# HIP porting guide
In addition to providing a portable C++ programming environment for GPUs, HIP is designed to ease
the porting of existing CUDA code into the HIP environment. This section describes the available tools
@@ -366,7 +366,7 @@ run hipcc when appropriate.
### ``warpSize``
-Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/kernel_language.html#warp-cross-lane-functions) for information on how to write portable wave-aware code.
+Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/cpp_language_extensions.html#warp-cross-lane-functions) for information on how to write portable wave-aware code.
### Kernel launch with group size > 256
diff --git a/docs/how-to/hip_rtc.md b/docs/how-to/hip_rtc.md
index 344bd7b35e..6a37f8d87e 100644
--- a/docs/how-to/hip_rtc.md
+++ b/docs/how-to/hip_rtc.md
@@ -1,4 +1,4 @@
-# Programming for HIP Runtime Compiler (RTC)
+# Programming for HIP runtime compiler (RTC)
HIP lets you compile kernels at runtime with the `hiprtc*` APIs.
Kernels can be stored as a text string and can be passed to HIPRTC APIs alongside options to guide the compilation.
diff --git a/docs/how-to/performance_guidelines.rst b/docs/how-to/performance_guidelines.rst
index aa8bcb1fce..ced9707356 100644
--- a/docs/how-to/performance_guidelines.rst
+++ b/docs/how-to/performance_guidelines.rst
@@ -3,7 +3,7 @@
:keywords: AMD, ROCm, HIP, CUDA, performance, guidelines
*******************************************************************************
-Performance Guidelines
+Performance guidelines
*******************************************************************************
The AMD HIP Performance Guidelines are a set of best practices designed to help
diff --git a/docs/how-to/programming_manual.md b/docs/how-to/programming_manual.md
index df6a80261c..33ab58de93 100644
--- a/docs/how-to/programming_manual.md
+++ b/docs/how-to/programming_manual.md
@@ -1,4 +1,4 @@
-# HIP Programming Manual
+# HIP programming manual
## Host Memory
@@ -140,7 +140,7 @@ HIP now supports runtime compilation (HIP RTC), the usage of which will provide
HIP RTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes.
-For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/index.html).
+For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](../doxygen/html/index).
For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example how to program HIP application using runtime compilation mechanism, and a detailed [HIP RTC programming guide](./hip_rtc) is also available.
diff --git a/docs/how-to/unified_memory.rst b/docs/how-to/unified_memory.rst
new file mode 100644
index 0000000000..f64189454c
--- /dev/null
+++ b/docs/how-to/unified_memory.rst
@@ -0,0 +1,577 @@
+.. meta::
+ :description: This chapter introduces Unified Memory (UM) and shows
+ how to use it in AMD HIP.
+ :keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory, UM, APU
+
+*******************************************************************************
+Unified memory
+*******************************************************************************
+
+In conventional architectures, CPUs and GPUs have dedicated memory like Random
+Access Memory (RAM) and Video Random Access Memory (VRAM). This architectural
+design, while effective, can be limiting in terms of memory capacity and
+bandwidth, as continuous memory copying is required to allow the processors to
+access the appropriate data. New architectural features like Heterogeneous
+System Architectures (HSA) and Unified Memory (UM) help avoid these limitations
+and promise increased efficiency and innovation.
+
+Unified memory
+==============
+Unified Memory is a single memory address space accessible from any processor
+within a system. This setup simplifies memory management processes and enables
+applications to allocate data that can be read or written by code running on
+either CPUs or GPUs. The Unified memory model is shown in the following figure.
+
+.. figure:: ../data/unified_memory/um.svg
+
+The AMD Accelerated Processing Unit (APU) is a typical example of a unified memory
+architecture. On a single die, a central processing unit (CPU) is combined
+with an integrated graphics processing unit (iGPU), and both have access to a
+high-bandwidth memory (HBM) module named unified memory. The CPU enables
+high-performance, low-latency operations, while the GPU is optimized for high
+throughput (data processed per unit time).
+
+.. _unified memory system requirements:
+
+System requirements
+===================
+Unified memory is supported on Linux by all modern AMD GPUs from the Vega
+series onward. Unified memory management can be achieved with managed memory
+allocation and, for the latest GPUs, with a system allocator.
+
+The table below lists the supported allocators. The allocators are described in
+the next section.
+
+.. list-table:: Supported Unified Memory Allocators
+ :widths: 40, 25, 25, 25
+ :header-rows: 1
+ :align: center
+
+ * - Architecture
+ - ``hipMallocManaged()``
+ - ``__managed__``
+ - ``malloc()``
+ * - MI200, MI300 Series
+ - ✅
+ - ✅
+ - ✅ :sup:`1`
+ * - MI100
+ - ✅
+ - ✅
+ - ❌
+ * - RDNA (Navi) Series
+ - ✅
+ - ✅
+ - ❌
+ * - GCN5 (Vega) Series
+ - ✅
+ - ✅
+ - ❌
+
+✅: **Supported**
+
+❌: **Unsupported**
+
+:sup:`1` Works only with ``XNACK=1``. The first GPU access causes a recoverable
+page fault. For more details, visit
+`GPU memory `_.
+
+.. _unified memory programming models:
+
+Unified memory programming models
+=================================
+
+This section showcases the various unified memory programming models. Model
+availability depends on your architecture. For more information, see :ref:`unified memory
+system requirements` and :ref:`checking unified memory management support`.
+
+- **HIP managed memory allocation API**:
+
+ ``hipMallocManaged()`` is a dynamic memory allocator available on
+ all GPUs with unified memory support. For more details, visit
+ :ref:`unified_memory_reference`.
+
+- **HIP managed variables**:
+
+ The ``__managed__`` declaration specifier, which serves as its counterpart,
+ is supported on all modern AMD cards and can be utilized for static
+ allocation.
+
+- **System allocation API**:
+
+ Starting with the AMD MI300 series, the ``malloc()`` system allocator allows
+ you to reserve unified memory. The system allocator is more versatile and
+ offers an easy transition from C++ code written for the CPU to HIP code, as
+ the same system allocation API is used.
+
+.. _checking unified memory management support:
+
+Checking unified memory management support
+------------------------------------------
+Some device attributes can offer information about which :ref:`unified memory
+programming models` are supported. The attribute value is 1 if the
+functionality is supported, and 0 if it is not supported.
+
+.. list-table:: Device attributes for unified memory management
+ :widths: 40, 60
+ :header-rows: 1
+ :align: center
+
+ * - attribute
+ - description
+ * - ``hipDeviceAttributeManagedMemory``
+ - unified addressing is supported
+ * - ``hipDeviceAttributeConcurrentManagedAccess``
+ - full managed memory support, concurrent access is supported
+ * - ``hipDeviceAttributePageableMemoryAccess``
+ - both managed and system memory allocation API is supported
+
+The following examples show how to use device attributes:
+
+.. code-block:: cpp
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ int main() {
+ int d;
+ hipGetDevice(&d);
+
+ int is_cma = 0;
+ hipDeviceGetAttribute(&is_cma, hipDeviceAttributeConcurrentManagedAccess, d);
+ std::cout << "HIP Managed Memory: "
+ << (is_cma == 1 ? "is" : "NOT")
+ << " supported" << std::endl;
+ return 0;
+ }
+
+Example for unified memory management
+-------------------------------------
+
+The following example shows how to use unified memory management with the
+``hipMallocManaged()`` function, with the ``__managed__`` attribute for static
+allocation, and with standard ``malloc()`` allocation. For comparison, the
+explicit memory management example is presented in the last tab.
+
+.. tab-set::
+
+ .. tab-item:: hipMallocManaged()
+
+ .. code-block:: cpp
+ :emphasize-lines: 12-15
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int *a, *b, *c;
+
+ // Allocate memory for a, b and c that is accessible to both device and host codes.
+ hipMallocManaged(&a, sizeof(*a));
+ hipMallocManaged(&b, sizeof(*b));
+ hipMallocManaged(&c, sizeof(*c));
+
+ // Setup input values.
+ *a = 1;
+ *b = 2;
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+ // Wait for GPU to finish before accessing on host.
+ hipDeviceSynchronize();
+
+ // Prints the result.
+ std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+ // Cleanup allocated memory.
+ hipFree(a);
+ hipFree(b);
+ hipFree(c);
+
+ return 0;
+ }
+
+
+ .. tab-item:: __managed__
+
+ .. code-block:: cpp
+ :emphasize-lines: 9-10
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ // Declare a, b and c as static variables.
+ __managed__ int a, b, c;
+
+ int main() {
+ // Setup input values.
+ a = 1;
+ b = 2;
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c);
+
+ // Wait for GPU to finish before accessing on host.
+ hipDeviceSynchronize();
+
+ // Prints the result.
+ std::cout << a << " + " << b << " = " << c << std::endl;
+
+ return 0;
+ }
+
+
+ .. tab-item:: malloc()
+
+ .. code-block:: cpp
+ :emphasize-lines: 12-15
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ // Addition of two values.
+ __global__ void add(int* a, int* b, int* c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int* a, * b, * c;
+
+ // Allocate memory for a, b, and c.
+ a = (int*)malloc(sizeof(*a));
+ b = (int*)malloc(sizeof(*b));
+ c = (int*)malloc(sizeof(*c));
+
+ // Setup input values.
+ *a = 1;
+ *b = 2;
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+ // Wait for GPU to finish before accessing on host.
+ hipDeviceSynchronize();
+
+ // Prints the result.
+ std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+ // Cleanup allocated memory.
+ free(a);
+ free(b);
+ free(c);
+
+ return 0;
+ }
+
+
+ .. tab-item:: Explicit Memory Management
+
+ .. code-block:: cpp
+ :emphasize-lines: 17-24, 29-30
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int a, b, c;
+ int *d_a, *d_b, *d_c;
+
+ // Setup input values.
+ a = 1;
+ b = 2;
+
+ // Allocate device copies of a, b and c.
+ hipMalloc(&d_a, sizeof(*d_a));
+ hipMalloc(&d_b, sizeof(*d_b));
+ hipMalloc(&d_c, sizeof(*d_c));
+
+ // Copy input values to device.
+ hipMemcpy(d_a, &a, sizeof(*d_a), hipMemcpyHostToDevice);
+ hipMemcpy(d_b, &b, sizeof(*d_b), hipMemcpyHostToDevice);
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, d_a, d_b, d_c);
+
+ // Copy the result back to the host.
+ hipMemcpy(&c, d_c, sizeof(*d_c), hipMemcpyDeviceToHost);
+
+ // Cleanup allocated memory.
+ hipFree(d_a);
+ hipFree(d_b);
+ hipFree(d_c);
+
+ // Prints the result.
+ std::cout << a << " + " << b << " = " << c << std::endl;
+
+ return 0;
+ }
+
+.. _using unified memory management:
+
+Using unified memory management (UMM)
+=====================================
+
+Unified memory management (UMM) is a feature that can simplify the complexities
+of memory management in GPU computing. It is particularly useful in
+heterogeneous computing environments where both the CPU and the GPU make heavy
+use of memory, which would otherwise require large memory transfers. Here are
+some areas where UMM can be beneficial:
+
+- **Simplification of Memory Management**:
+
+ UMM can help to simplify the complexities of memory management. This can make
+ it easier for developers to write code without worrying about memory
+ allocation and deallocation details.
+
+- **Data Migration**:
+
+ UMM allows for efficient data migration between the host (CPU) and the device
+ (GPU). This can be particularly useful for applications that need to move
+ data back and forth between the device and host.
+
+- **Improved Programming Productivity**:
+
+ As a positive side effect, UMM can reduce the lines of code, thereby
+ improving programming productivity.
+
+In HIP, pinned memory allocations are coherent by default. Pinned memory is
+host memory mapped into the address space of all GPUs, meaning that the pointer
+can be used on both host and device. Using pinned memory instead of pageable
+memory on the host can improve bandwidth.
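+
+The following is a minimal sketch of a pinned allocation made with
+``hipHostMalloc()`` using default flags:
+
+.. code-block:: cpp
+
+   #include <hip/hip_runtime.h>
+   #include <iostream>
+
+   int main() {
+       int *p = nullptr;
+
+       // Pinned (page-locked) host allocation: the pointer is usable on both
+       // host and device, and copies from it achieve higher bandwidth than
+       // copies from pageable memory.
+       hipHostMalloc((void**)&p, sizeof(*p), hipHostMallocDefault);
+
+       *p = 42; // written on the host; a kernel could read the same pointer
+       std::cout << *p << std::endl;
+
+       hipHostFree(p);
+       return 0;
+   }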
+
+While UMM can provide numerous benefits, it's important to be aware of the
+potential performance overhead associated with UMM. You must thoroughly test
+and profile your code to ensure it's the most suitable choice for your use
+case.
+
+.. _unified memory runtime hints:
+
+Unified memory HIP runtime hints for better performance
+===========================================================
+
+Unified memory HIP runtime hints can help improve the performance of your code
+if you know your application's memory access patterns and the underlying
+infrastructure. Some hint techniques are presented in this section.
+
+The hint functions can set actions on a selected device, which can be
+identified by ``hipGetDeviceProperties(&prop, device_id)``. There are two
+special ``device_id`` values:
+
+- ``hipCpuDeviceId`` = -1 means that the advised device is the CPU.
+- ``hipInvalidDeviceId`` = -2 means that the device is invalid.
+
+For the best performance, profile your application to optimize the
+utilization of HIP runtime hints.
+
+Data prefetching
+----------------
+
+Data prefetching is a technique used to improve the performance of your
+application by moving data closer to the processing unit before it's actually
+needed.
+
+.. code-block:: cpp
+ :emphasize-lines: 23-26,34-35
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int *a, *b, *c;
+ int deviceId;
+ hipGetDevice(&deviceId); // Get the current device ID
+
+ // Allocate memory for a, b and c that is accessible to both device and host codes.
+ hipMallocManaged(&a, sizeof(*a));
+ hipMallocManaged(&b, sizeof(*b));
+ hipMallocManaged(&c, sizeof(*c));
+
+ // Setup input values.
+ *a = 1;
+ *b = 2;
+
+ // Prefetch the data to the GPU device.
+ hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0);
+ hipMemPrefetchAsync(b, sizeof(*b), deviceId, 0);
+ hipMemPrefetchAsync(c, sizeof(*c), deviceId, 0);
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+ // Wait for GPU to finish before accessing on host.
+ hipDeviceSynchronize();
+
+ // Prefetch the result back to the CPU.
+ hipMemPrefetchAsync(c, sizeof(*c), hipCpuDeviceId, 0);
+
+ // Wait for the prefetch operations to complete.
+ hipDeviceSynchronize();
+
+ // Prints the result.
+ std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+ // Cleanup allocated memory.
+ hipFree(a);
+ hipFree(b);
+ hipFree(c);
+
+ return 0;
+ }
+
+Remember to check the return status of ``hipMemPrefetchAsync()`` to ensure that
+the prefetch operations were enqueued successfully.
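+
+For example, a minimal check, reusing ``a`` and ``deviceId`` from the example
+above, might look like this:
+
+.. code-block:: cpp
+
+ hipError_t err = hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0);
+ if (err != hipSuccess) {
+   std::cerr << "hipMemPrefetchAsync failed: "
+             << hipGetErrorString(err) << std::endl;
+ }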
+
+Memory advice
+-------------
+
+The effectiveness of ``hipMemAdvise()`` comes from its ability to inform the
+runtime system of the developer's intentions regarding memory usage. When the
+runtime system has knowledge of the expected memory access patterns, it can
+make better decisions about data placement and caching, leading to more
+efficient execution of the application. However, the actual impact on
+performance can vary based on the specific use case and the hardware
+architecture.
+
+For the description of ``hipMemAdvise()`` and the detailed list of advice,
+visit the :ref:`unified_memory_reference`.
+
+Here is the updated version of the example above with memory advice.
+
+.. code-block:: cpp
+ :emphasize-lines: 17-26
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+   *c = *a + *b;
+ }
+
+ int main() {
+   int *a, *b, *c;
+
+   // Allocate memory for a, b, and c accessible to both device and host codes.
+   hipMallocManaged(&a, sizeof(*a));
+   hipMallocManaged(&b, sizeof(*b));
+   hipMallocManaged(&c, sizeof(*c));
+
+   // Set memory advice for a, b, and c to be accessed by the CPU.
+   hipMemAdvise(a, sizeof(*a), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
+   hipMemAdvise(b, sizeof(*b), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
+   hipMemAdvise(c, sizeof(*c), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
+
+   // Additionally, set memory advice for a, b, and c to be read mostly from device 0.
+   constexpr int device = 0;
+   hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, device);
+   hipMemAdvise(b, sizeof(*b), hipMemAdviseSetReadMostly, device);
+   hipMemAdvise(c, sizeof(*c), hipMemAdviseSetReadMostly, device);
+
+   // Set up input values.
+   *a = 1;
+   *b = 2;
+
+   // Launch the add() kernel on the GPU.
+   hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+   // Wait for the GPU to finish before accessing the data on the host.
+   hipDeviceSynchronize();
+
+   // Print the result.
+   std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+   // Clean up the allocated memory.
+   hipFree(a);
+   hipFree(b);
+   hipFree(c);
+
+   return 0;
+ }
+
+
+Memory range attributes
+-----------------------
+
+Memory range attributes allow you to query the attributes of a given memory
+range.
+
+``hipMemRangeGetAttribute()`` is added to the example to query the
+``hipMemRangeAttributeReadMostly`` attribute of the memory range pointed to by
+``a``. The result is stored in ``attributeValue`` and then printed.
+
+For more details, visit the
+:ref:`unified_memory_reference`.
+
+.. code-block:: cpp
+ :emphasize-lines: 29-34
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+   *c = *a + *b;
+ }
+
+ int main() {
+   int *a, *b, *c;
+   unsigned int attributeValue;
+   constexpr size_t attributeSize = sizeof(attributeValue);
+
+   // Allocate memory for a, b and c that is accessible to both device and host codes.
+   hipMallocManaged(&a, sizeof(*a));
+   hipMallocManaged(&b, sizeof(*b));
+   hipMallocManaged(&c, sizeof(*c));
+
+   // Set up input values.
+   *a = 1;
+   *b = 2;
+
+   // Launch the add() kernel on the GPU.
+   hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+   // Wait for the GPU to finish before accessing the data on the host.
+   hipDeviceSynchronize();
+
+   // Query an attribute of the memory range.
+   hipMemRangeGetAttribute(&attributeValue,
+                           attributeSize,
+                           hipMemRangeAttributeReadMostly,
+                           a,
+                           sizeof(*a));
+
+   // Print the results.
+   std::cout << *a << " + " << *b << " = " << *c << std::endl;
+   std::cout << "The queried attribute value is: " << attributeValue << std::endl;
+
+   // Clean up the allocated memory.
+   hipFree(a);
+   hipFree(b);
+   hipFree(c);
+
+   return 0;
+ }
+
+Asynchronously attach memory to a stream
+----------------------------------------
+
+The ``hipStreamAttachMemAsync()`` function attaches memory to a stream asynchronously, which can improve concurrency when using multiple streams.
+
+Currently, this function is a no-operation (NOP) on AMD GPUs: it simply returns success after the runtime memory validation passes. The function is necessary on Microsoft Windows, where UMM is currently not supported with AMD GPUs.
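+
+Although the call is currently a NOP on AMD GPUs, the following minimal sketch
+shows how it is typically invoked, so that the code stays portable:
+
+.. code-block:: cpp
+
+ hipStream_t stream;
+ hipStreamCreate(&stream);
+
+ int *data;
+ hipMallocManaged(&data, sizeof(*data));
+
+ // Attach the managed allocation to the stream. hipMemAttachSingle indicates
+ // that the memory is expected to be used only from this stream.
+ hipStreamAttachMemAsync(stream, data, 0, hipMemAttachSingle);
+ hipStreamSynchronize(stream);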
diff --git a/docs/index.md b/docs/index.md
index a12dac2ec5..b19f100c88 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -11,7 +11,7 @@ For HIP supported AMD GPUs on multiple operating systems, see:
The CUDA enabled NVIDIA GPUs are supported by HIP. For more information, see [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
-On the AMD ROCm platform, HIP provides header files and runtime library built on top of HIP-Clang compiler in the repository [Common Language Runtime (CLR)](./understand/amd_clr), which contains source codes for AMD's compute languages runtimes as follows,
+On the AMD ROCm platform, HIP provides the header files and a runtime library built on top of the HIP-Clang compiler in the repository [Common Language Runtimes (CLR)](./understand/amd_clr), which contains the source code for AMD's compute language runtimes, as follows,
On non-AMD platforms, like NVIDIA, HIP provides header files required to support non-AMD specific back-end implementation in the repository ['hipother'](https://github.com/ROCm/hipother), which translates from the HIP runtime APIs to CUDA runtime APIs.
@@ -38,13 +38,15 @@ On non-AMD platforms, like NVIDIA, HIP provides header files required to support
:::{grid-item-card} How to
-* [Programming Manual](./how-to/programming_manual)
-* [HIP Porting Guide](./how-to/hip_porting_guide)
-* [HIP Porting: Driver API Guide](./how-to/hip_porting_driver_api)
+* [Programming manual](./how-to/programming_manual)
+* [HIP porting guide](./how-to/hip_porting_guide)
+* [HIP porting: driver API guide](./how-to/hip_porting_driver_api)
* {doc}`./how-to/hip_rtc`
* {doc}`./how-to/performance_guidelines`
* [Debugging with HIP](./how-to/debugging)
* {doc}`./how-to/logging`
+* [Unified memory](./how-to/unified_memory)
+* [Cooperative Groups](./how-to/cooperative_groups)
* {doc}`./how-to/faq`
:::
@@ -52,17 +54,25 @@ On non-AMD platforms, like NVIDIA, HIP provides header files required to support
:::{grid-item-card} Reference
* {doc}`/doxygen/html/index`
-* [C++ language extensions](./reference/kernel_language)
-* [Comparing Syntax for different APIs](./reference/terms)
-* [HSA Runtime API for ROCm](./reference/virtual_rocr)
+* [C++ language extensions](./reference/cpp_language_extensions)
+* [C++ language support](./reference/cpp_language_support)
+* [HIP math API](./reference/math_api)
+* [Comparing syntax for different APIs](./reference/terms)
+* [HSA runtime API for ROCm](./reference/virtual_rocr)
+* [HIP managed memory allocation API](./reference/unified_memory_reference)
+* [HIP Cooperative Groups API](./reference/cooperative_groups)
* [List of deprecated APIs](./reference/deprecated_api_list)
:::
:::{grid-item-card} Tutorial
+* [HIP basic examples](https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic)
* [HIP examples](https://github.com/ROCm/HIP-Examples)
* [HIP test samples](https://github.com/ROCm/hip-tests/tree/develop/samples)
+* [SAXPY tutorial](./tutorial/saxpy)
+* [Reduction tutorial](./tutorial/reduction)
+* [Cooperative groups tutorial](./tutorial/cooperative_groups_tutorial)
:::
diff --git a/docs/reference/cooperative_groups_reference.rst b/docs/reference/cooperative_groups_reference.rst
new file mode 100644
index 0000000000..52ca19ab6a
--- /dev/null
+++ b/docs/reference/cooperative_groups_reference.rst
@@ -0,0 +1,93 @@
+.. meta::
+ :description: This chapter lists types and device API wrappers related to the Cooperative Group feature. Programmers can directly use these API features in their kernels.
+ :keywords: AMD, ROCm, HIP, cooperative groups
+
+.. _cooperative_groups_reference:
+
+*******************************************************************************
+HIP Cooperative Groups API
+*******************************************************************************
+
+Cooperative kernel launches
+===========================
+
+The following host-side functions are used for cooperative kernel launches.
+
+.. doxygenfunction:: hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream)
+
+.. doxygenfunction:: hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream)
+
+.. doxygenfunction:: hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags)
+
+.. doxygenfunction:: hipModuleLaunchCooperativeKernel
+
+.. doxygenfunction:: hipModuleLaunchCooperativeKernelMultiDevice
+
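+The following is a brief, illustrative sketch of a cooperative launch; the
+kernel body and the launch configuration are only examples:
+
+.. code-block:: cpp
+
+ #include <hip/hip_runtime.h>
+ #include <hip/hip_cooperative_groups.h>
+
+ __global__ void my_kernel(float *data) {
+   cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+   // ... first phase of the computation ...
+   grid.sync(); // synchronize every block of the grid
+   // ... second phase of the computation ...
+ }
+
+ int main() {
+   float *d_data = nullptr; // device allocation elided for brevity
+   // Host side: kernel arguments are passed as an array of pointers.
+   void *args[] = {&d_data};
+   hipLaunchCooperativeKernel(my_kernel, dim3(8), dim3(256), args, 0, 0);
+   hipDeviceSynchronize();
+   return 0;
+ }
+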
+Cooperative groups classes
+==========================
+
+The following cooperative groups classes can be used on the device side.
+
+.. _thread_group_ref:
+
+.. doxygenclass:: cooperative_groups::thread_group
+ :members:
+
+.. _thread_block_ref:
+
+.. doxygenclass:: cooperative_groups::thread_block
+ :members:
+
+.. _grid_group_ref:
+
+.. doxygenclass:: cooperative_groups::grid_group
+ :members:
+
+.. _multi_grid_group_ref:
+
+.. doxygenclass:: cooperative_groups::multi_grid_group
+ :members:
+
+.. _thread_block_tile_ref:
+
+.. doxygenclass:: cooperative_groups::thread_block_tile
+ :members:
+
+.. _coalesced_group_ref:
+
+.. doxygenclass:: cooperative_groups::coalesced_group
+ :members:
+
+Cooperative groups construct functions
+======================================
+
+The following functions are used to construct different group-type instances on the device side.
+
+.. doxygenfunction:: cooperative_groups::this_multi_grid
+
+.. doxygenfunction:: cooperative_groups::this_grid
+
+.. doxygenfunction:: cooperative_groups::this_thread_block
+
+.. doxygenfunction:: cooperative_groups::coalesced_threads
+
+.. doxygenfunction:: cooperative_groups::tiled_partition(const ParentCGTy &g)
+
+.. doxygenfunction:: cooperative_groups::tiled_partition(const thread_group &parent, unsigned int tile_size)
+
+.. doxygenfunction:: cooperative_groups::binary_partition(const coalesced_group& cgrp, bool pred)
+
+.. doxygenfunction:: cooperative_groups::binary_partition(const thread_block_tile& tgrp, bool pred)
+
+Cooperative groups exposed API functions
+========================================
+
+The following functions are the exposed API for different group-type instances on the device side.
+
+.. doxygenfunction:: cooperative_groups::group_size
+
+.. doxygenfunction:: cooperative_groups::thread_rank
+
+.. doxygenfunction:: cooperative_groups::is_valid
+
+.. doxygenfunction:: cooperative_groups::sync
\ No newline at end of file
diff --git a/docs/reference/kernel_language.rst b/docs/reference/cpp_language_extensions.rst
similarity index 58%
rename from docs/reference/kernel_language.rst
rename to docs/reference/cpp_language_extensions.rst
index 10a014fe6e..7c0eb0ccf8 100644
--- a/docs/reference/kernel_language.rst
+++ b/docs/reference/cpp_language_extensions.rst
@@ -5,7 +5,7 @@
:keywords: AMD, ROCm, HIP, CUDA, c++ language extensions, HIP functions
********************************************************************************
-C++ Language Extensions
+C++ language extensions
********************************************************************************
HIP provides a C++ syntax that is suitable for compiling most code that commonly appears in
@@ -51,8 +51,7 @@ Supported ``__global__`` functions are:
* Run on the device
* Called (launched) from the host
-HIP ``__global__`` functions must have a ``void`` return type. The first parameter in a HIP ``__global__``
-function must have the type ``hipLaunchParm``. Refer to :ref:`kernel-launch-example` to see usage.
+HIP ``__global__`` functions must have a ``void`` return type.
HIP doesn't support dynamic-parallelism, which means that you can't call ``__global__`` functions from
the device.
@@ -105,7 +104,7 @@ You can include your kernel arguments after these parameters.
.. code-block:: cpp
// Example hipLaunchKernelGGL pseudocode:
- __global__ MyKernel(hipLaunchParm lp, float *A, float *B, float *C, size_t N)
+ __global__ MyKernel(float *A, float *B, float *C, size_t N)
{
...
}
@@ -155,7 +154,7 @@ Kernel launch example
 MyKernel<<<dim3(N/blockSize), dim3(blockSize), 0, 0>>> (a,b,c,n);
// Alternatively, kernel can be launched by
- // hipLaunchKernelGGL(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N);
+ // hipLaunchKernelGGL(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N);
}
Variable type qualifiers
@@ -293,8 +292,7 @@ dimensions to 1.
Memory fence instructions
====================================================
-HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using ``threadfence_system()`` in
-the HIP-Clang path, you can use the following workaround:
+HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using ``threadfence_system()`` in the HIP-Clang path, you can use the following workaround:
#. Build HIP with the ``HIP_COHERENT_HOST_ALLOC`` environment variable enabled.
#. Modify kernels that use ``__threadfence_system()`` as follows:
@@ -307,1141 +305,66 @@ the HIP-Clang path, you can use the following workaround:
Synchronization functions
====================================================
-The ``__syncthreads()`` built-in function is supported in HIP. The ``__syncthreads_count(int)``,
-``__syncthreads_and(int)``, and ``__syncthreads_or(int)`` functions are under development.
-
-Math functions
-====================================================
-
-HIP-Clang supports a set of math operations that are callable from the device. HIP supports most of the device functions supported by CUDA.
-These are described in the following sections.
-
-Single precision mathematical functions
---------------------------------------------------------------------------------------------
-
-Following is the list of supported single precision mathematical functions.
-
-.. list-table:: Single precision mathematical functions
-
- * - **Function**
- - **Supported on Host**
- - **Supported on Device**
-
- * - | ``float abs(float x)``
- | Returns the absolute value of :math:`x`
- - ✓
- - ✓
-
- * - | ``float acosf(float x)``
- | Returns the arc cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float acoshf(float x)``
- | Returns the nonnegative arc hyperbolic cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float asinf(float x)``
- | Returns the arc sine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float asinhf(float x)``
- | Returns the arc hyperbolic sine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float atanf(float x)``
- | Returns the arc tangent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float atan2f(float x, float y)``
- | Returns the arc tangent of the ratio of :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``float atanhf(float x)``
- | Returns the arc hyperbolic tangent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float cbrtf(float x)``
- | Returns the cube root of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float ceilf(float x)``
- | Returns ceiling of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float copysignf(float x, float y)``
- | Create value with given magnitude, copying sign of second value.
- - ✓
- - ✓
-
- * - | ``float cosf(float x)``
- | Returns the cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float coshf(float x)``
- | Returns the hyperbolic cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float cospif(float x)``
- | Returns the cosine of :math:`\pi \cdot x`.
- - ✓
- - ✓
-
- * - | ``float cyl_bessel_i0f(float x)``
- | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`.
- - ✗
- - ✗
-
- * - | ``float cyl_bessel_i1f(float x)``
- | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`.
- - ✗
- - ✗
-
- * - | ``float erff(float x)``
- | Returns the error function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float erfcf(float x)``
- | Returns the complementary error function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float erfcinvf(float x)``
- | Returns the inverse complementary function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float erfcxf(float x)``
- | Returns the scaled complementary error function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float erfinvf(float x)``
- | Returns the inverse error function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float expf(float x)``
- | Returns :math:`e^x`.
- - ✓
- - ✓
-
- * - | ``float exp10f(float x)``
- | Returns :math:`10^x`.
- - ✓
- - ✓
-
- * - | ``float exp2f( float x)``
- | Returns :math:`2^x`.
- - ✓
- - ✓
-
- * - | ``float expm1f(float x)``
- | Returns :math:`ln(x - 1)`
- - ✓
- - ✓
-
- * - | ``float fabsf(float x)``
- | Returns the absolute value of `x`
- - ✓
- - ✓
-
- * - | ``float fdimf(float x, float y)``
- | Returns the positive difference between :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``float fdividef(float x, float y)``
- | Divide two floating point values.
- - ✓
- - ✓
-
- * - | ``float floorf(float x)``
- | Returns the largest integer less than or equal to :math:`x`.
- - ✓
- - ✓
-
- * - | ``float fmaf(float x, float y, float z)``
- | Returns :math:`x \cdot y + z` as a single operation.
- - ✓
- - ✓
-
- * - | ``float fmaxf(float x, float y)``
- | Determine the maximum numeric value of :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``float fminf(float x, float y)``
- | Determine the minimum numeric value of :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``float fmodf(float x, float y)``
- | Returns the floating-point remainder of :math:`x / y`.
- - ✓
- - ✓
-
- * - | ``float modff(float x, float* iptr)``
- | Break down :math:`x` into fractional and integral parts.
- - ✓
- - ✗
-
- * - | ``float frexpf(float x, int* nptr)``
- | Extract mantissa and exponent of :math:`x`.
- - ✓
- - ✗
-
- * - | ``float hypotf(float x, float y)``
- | Returns the square root of the sum of squares of :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``int ilogbf(float x)``
- | Returns the unbiased integer exponent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``bool isfinite(float x)``
- | Determine whether :math:`x` is finite.
- - ✓
- - ✓
-
- * - | ``bool isinf(float x)``
- | Determine whether :math:`x` is infinite.
- - ✓
- - ✓
-
- * - | ``bool isnan(float x)``
- | Determine whether :math:`x` is a ``NAN``.
- - ✓
- - ✓
-
- * - | ``float j0f(float x)``
- | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`.
- - ✓
- - ✓
-
- * - | ``float j1f(float x)``
- | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`.
- - ✓
- - ✓
-
- * - | ``float jnf(int n, float x)``
- | Returns the value of the Bessel function of the first kind of order n for :math:`x`.
- - ✓
- - ✓
-
- * - | ``float ldexpf(float x, int exp)``
- | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float lgammaf(float x)``
- | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`.
- - ✓
- - ✗
-
- * - | ``long int lrintf(float x)``
- | Round :math:`x` to nearest integer value.
- - ✓
- - ✓
-
- * - | ``long long int llrintf(float x)``
- | Round :math:`x` to nearest integer value.
- - ✓
- - ✓
-
- * - | ``long int lroundf(float x)``
- | Round to nearest integer value.
- - ✓
- - ✓
-
- * - | ``long long int llroundf(float x)``
- | Round to nearest integer value.
- - ✓
- - ✓
-
- * - | ``float log10f(float x)``
- | Returns the base 10 logarithm of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float log1pf(float x)``
- | Returns the natural logarithm of :math:`x + 1`.
- - ✓
- - ✓
-
- * - | ``float log2f(float x)``
- | Returns the base 2 logarithm of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float logf(float x)``
- | Returns the natural logarithm of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float logbf(float x)``
- | Returns the floating point representation of the exponent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float nanf(const char* tagp)``
- | Returns "Not a Number" value.
- - ✗
- - ✓
-
- * - | ``float nearbyintf(float x)``
- | Round :math:`x` to the nearest integer.
- - ✓
- - ✓
-
- * - | ``float nextafterf(float x, float y)``
- | Returns next representable single-precision floating-point value after argument.
- - ✓
- - ✗
-
- * - | ``float norm3df(float x, float y, float z)``
- | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`.
- - ✓
- - ✓
-
- * - | ``float norm4df(float x, float y, float z, float w)``
- | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`.
- - ✓
- - ✓
-
- * - | ``float normcdff(float y)``
- | Returns the standard normal cumulative distribution function.
- - ✓
- - ✓
-
- * - | ``float normcdfinvf(float y)``
- | Returns the inverse of the standard normal cumulative distribution function.
- - ✓
- - ✓
-
- * - | ``float normf(int dim, const float *a)``
- | Returns the square root of the sum of squares of any number of coordinates.
- - ✓
- - ✓
-
- * - | ``float powf(float x, float y)``
- | Returns :math:`x^y`.
- - ✓
- - ✓
-
- * - | ``float powif(float base, int iexp)``
- | Returns the value of first argument to the power of second argument.
- - ✓
- - ✓
-
- * - | ``float remainderf(float x, float y)``
- | Returns single-precision floating-point remainder.
- - ✓
- - ✓
-
- * - | ``float remquof(float x, float y, int* quo)``
- | Returns single-precision floating-point remainder and part of quotient.
- - ✓
- - ✓
-
- * - | ``float roundf(float x)``
- | Round to nearest integer value in floating-point.
- - ✓
- - ✓
-
- * - | ``float rcbrtf(float x)``
- | Returns the reciprocal cube root function.
- - ✓
- - ✓
-
- * - | ``float rhypotf(float x, float y)``
- | Returns one over the square root of the sum of squares of two arguments.
- - ✓
- - ✓
-
- * - | ``float rintf(float x)``
- | Round input to nearest integer value in floating-point.
- - ✓
- - ✓
-
- * - | ``float rnorm3df(float x, float y, float z)``
- | Returns one over the square root of the sum of squares of three coordinates of the argument.
- - ✓
- - ✓
-
- * - | ``float rnorm4df(float x, float y, float z, float w)``
- | Returns one over the square root of the sum of squares of four coordinates of the argument.
- - ✓
- - ✓
-
- * - | ``float rnormf(int dim, const float *a)``
- | Returns the reciprocal of square root of the sum of squares of any number of coordinates.
- - ✓
- - ✓
-
- * - | ``float scalblnf(float x, long int n)``
- | Scale :math:`x` by :math:`2^n`.
- - ✓
- - ✓
-
- * - | ``float scalbnf(float x, int n)``
- | Scale :math:`x` by :math:`2^n`.
- - ✓
- - ✓
-
- * - | ``bool signbit(float x)``
- | Return the sign bit of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float sinf(float x)``
- | Returns the sine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float sinhf(float x)``
- | Returns the hyperbolic sine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float sinpif(float x)``
- | Returns the hyperbolic sine of :math:`\pi \cdot x`.
- - ✓
- - ✓
-
- * - | ``void sincosf(float x, float *sptr, float *cptr)``
- | Returns the sine and cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``void sincospif(float x, float *sptr, float *cptr)``
- | Returns the sine and cosine of :math:`\pi \cdot x`.
- - ✓
- - ✓
-
- * - | ``float sqrtf(float x)``
- | Returns the square root of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float rsqrtf(float x)``
- | Returns the reciprocal of the square root of :math:`x`.
- - ✗
- - ✓
-
- * - | ``float tanf(float x)``
- | Returns the tangent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float tanhf(float x)``
- | Returns the hyperbolic tangent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float tgammaf(float x)``
- | Returns the gamma function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``float truncf(float x)``
- | Truncate :math:`x` to the integral part.
- - ✓
- - ✓
-
- * - | ``float y0f(float x)``
- | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`.
- - ✓
- - ✓
-
- * - | ``float y1f(float x)``
- | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`.
- - ✓
- - ✓
-
- * - | ``float ynf(int n, float x)``
- | Returns the value of the Bessel function of the second kind of order n for :math:`x`.
- - ✓
- - ✓
-
-Double precision mathematical functions
---------------------------------------------------------------------------------------------
-
-Following is the list of supported double precision mathematical functions.
-
-.. list-table:: Double precision mathematical functions
-
- * - **Function**
- - **Supported on Host**
- - **Supported on Device**
-
- * - | ``double abs(double x)``
- | Returns the absolute value of :math:`x`
- - ✓
- - ✓
-
- * - | ``double acos(double x)``
- | Returns the arc cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double acosh(double x)``
- | Returns the nonnegative arc hyperbolic cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double asin(double x)``
- | Returns the arc sine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double asinh(double x)``
- | Returns the arc hyperbolic sine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double atan(double x)``
- | Returns the arc tangent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double atan2(double x, double y)``
- | Returns the arc tangent of the ratio of :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``double atanh(double x)``
- | Returns the arc hyperbolic tangent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double cbrt(double x)``
- | Returns the cube root of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double ceil(double x)``
- | Returns ceiling of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double copysign(double x, double y)``
- | Create value with given magnitude, copying sign of second value.
- - ✓
- - ✓
-
- * - | ``double cos(double x)``
- | Returns the cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double cosh(double x)``
- | Returns the hyperbolic cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double cospi(double x)``
- | Returns the cosine of :math:`\pi \cdot x`.
- - ✓
- - ✓
-
- * - | ``double cyl_bessel_i0(double x)``
- | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`.
- - ✗
- - ✗
-
- * - | ``double cyl_bessel_i1(double x)``
- | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`.
- - ✗
- - ✗
-
- * - | ``double erf(double x)``
- | Returns the error function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double erfc(double x)``
- | Returns the complementary error function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double erfcinv(double x)``
- | Returns the inverse complementary function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double erfcx(double x)``
- | Returns the scaled complementary error function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double erfinv(double x)``
- | Returns the inverse error function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double exp(double x)``
- | Returns :math:`e^x`.
- - ✓
- - ✓
-
- * - | ``double exp10(double x)``
- | Returns :math:`10^x`.
- - ✓
- - ✓
-
- * - | ``double exp2( double x)``
- | Returns :math:`2^x`.
- - ✓
- - ✓
-
- * - | ``double expm1(double x)``
- | Returns :math:`ln(x - 1)`
- - ✓
- - ✓
-
- * - | ``double fabs(double x)``
- | Returns the absolute value of `x`
- - ✓
- - ✓
-
- * - | ``double fdim(double x, double y)``
- | Returns the positive difference between :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``double floor(double x)``
- | Returns the largest integer less than or equal to :math:`x`.
- - ✓
- - ✓
-
- * - | ``double fma(double x, double y, double z)``
- | Returns :math:`x \cdot y + z` as a single operation.
- - ✓
- - ✓
-
- * - | ``double fmax(double x, double y)``
- | Determine the maximum numeric value of :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``double fmin(double x, double y)``
- | Determine the minimum numeric value of :math:`x` and :math:`y`.
- - ✓
- - ✓
- * - | ``double fmod(double x, double y)``
- | Returns the floating-point remainder of :math:`x / y`.
- - ✓
- - ✓
-
- * - | ``double modf(double x, double* iptr)``
- | Break down :math:`x` into fractional and integral parts.
- - ✓
- - ✗
-
- * - | ``double frexp(double x, int* nptr)``
- | Extract mantissa and exponent of :math:`x`.
- - ✓
- - ✗
-
- * - | ``double hypot(double x, double y)``
- | Returns the square root of the sum of squares of :math:`x` and :math:`y`.
- - ✓
- - ✓
-
- * - | ``int ilogb(double x)``
- | Returns the unbiased integer exponent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``bool isfinite(double x)``
- | Determine whether :math:`x` is finite.
- - ✓
- - ✓
-
- * - | ``bool isin(double x)``
- | Determine whether :math:`x` is infinite.
- - ✓
- - ✓
-
- * - | ``bool isnan(double x)``
- | Determine whether :math:`x` is a ``NAN``.
- - ✓
- - ✓
-
- * - | ``double j0(double x)``
- | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`.
- - ✓
- - ✓
-
- * - | ``double j1(double x)``
- | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`.
- - ✓
- - ✓
-
- * - | ``double jn(int n, double x)``
- | Returns the value of the Bessel function of the first kind of order n for :math:`x`.
- - ✓
- - ✓
-
- * - | ``double ldexp(double x, int exp)``
- | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double lgamma(double x)``
- | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`.
- - ✓
- - ✗
-
- * - | ``long int lrint(double x)``
- | Round :math:`x` to nearest integer value.
- - ✓
- - ✓
-
- * - | ``long long int llrint(double x)``
- | Round :math:`x` to nearest integer value.
- - ✓
- - ✓
-
- * - | ``long int lround(double x)``
- | Round to nearest integer value.
- - ✓
- - ✓
-
- * - | ``long long int llround(double x)``
- | Round to nearest integer value.
- - ✓
- - ✓
-
- * - | ``double log10(double x)``
- | Returns the base 10 logarithm of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double log1p(double x)``
- | Returns the natural logarithm of :math:`x + 1`.
- - ✓
- - ✓
-
- * - | ``double log2(double x)``
- | Returns the base 2 logarithm of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double log(double x)``
- | Returns the natural logarithm of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double logb(double x)``
- | Returns the floating point representation of the exponent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double nan(const char* tagp)``
- | Returns "Not a Number" value.
- - ✗
- - ✓
-
- * - | ``double nearbyint(double x)``
- | Round :math:`x` to the nearest integer.
- - ✓
- - ✓
-
- * - | ``double nextafter(double x, double y)``
- | Returns next representable double-precision floating-point value after argument.
- - ✓
- - ✓
-
- * - | ``double norm3d(double x, double y, double z)``
- | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`.
- - ✓
- - ✓
-
- * - | ``double norm4d(double x, double y, double z, double w)``
- | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`.
- - ✓
- - ✓
-
- * - | ``double normcdf(double y)``
- | Returns the standard normal cumulative distribution function.
- - ✓
- - ✓
-
- * - | ``double normcdfinv(double y)``
- | Returns the inverse of the standard normal cumulative distribution function.
- - ✓
- - ✓
-
- * - | ``double norm(int dim, const double *a)``
- | Returns the square root of the sum of squares of any number of coordinates.
- - ✓
- - ✓
-
- * - | ``double pow(double x, double y)``
- | Returns :math:`x^y`.
- - ✓
- - ✓
-
- * - | ``double powi(double base, int iexp)``
- | Returns the value of first argument to the power of second argument.
- - ✓
- - ✓
-
- * - | ``double remainder(double x, double y)``
- | Returns double-precision floating-point remainder.
- - ✓
- - ✓
-
- * - | ``double remquo(double x, double y, int* quo)``
- | Returns double-precision floating-point remainder and part of quotient.
- - ✓
- - ✗
-
- * - | ``double round(double x)``
- | Round to nearest integer value in floating-point.
- - ✓
- - ✓
-
- * - | ``double rcbrt(double x)``
- | Returns the reciprocal cube root function.
- - ✓
- - ✓
-
- * - | ``double rhypot(double x, double y)``
- | Returns one over the square root of the sum of squares of two arguments.
- - ✓
- - ✓
-
- * - | ``double rint(double x)``
- | Round input to nearest integer value in floating-point.
- - ✓
- - ✓
-
- * - | ``double rnorm3d(double x, double y, double z)``
- | Returns one over the square root of the sum of squares of three coordinates of the argument.
- - ✓
- - ✓
-
- * - | ``double rnorm4d(double x, double y, double z, double w)``
- | Returns one over the square root of the sum of squares of four coordinates of the argument.
- - ✓
- - ✓
-
- * - | ``double rnorm(int dim, const double *a)``
- | Returns the reciprocal of square root of the sum of squares of any number of coordinates.
- - ✓
- - ✓
-
- * - | ``double scalbln(double x, long int n)``
- | Scale :math:`x` by :math:`2^n`.
- - ✓
- - ✓
-
- * - | ``double scalbn(double x, int n)``
- | Scale :math:`x` by :math:`2^n`.
- - ✓
- - ✓
+Synchronization functions cause all threads in the group to wait at the synchronization point until all shared and global memory accesses made by these threads before the synchronization point are complete. This guarantees the visibility of the accessed data for all threads in the group.
- * - | ``bool signbit(double x)``
- | Return the sign bit of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double sin(double x)``
- | Returns the sine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double sinh(double x)``
- | Returns the hyperbolic sine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double sinpi(double x)``
- | Returns the hyperbolic sine of :math:`\pi \cdot x`.
- - ✓
- - ✓
-
- * - | ``void sincos(double x, double *sptr, double *cptr)``
- | Returns the sine and cosine of :math:`x`.
- - ✓
- - ✓
-
- * - | ``void sincospi(double x, double *sptr, double *cptr)``
- | Returns the sine and cosine of :math:`\pi \cdot x`.
- - ✓
- - ✓
-
- * - | ``double sqrt(double x)``
- | Returns the square root of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double rsqrt(double x)``
- | Returns the reciprocal of the square root of :math:`x`.
- - ✗
- - ✓
-
- * - | ``double tan(double x)``
- | Returns the tangent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double tanh(double x)``
- | Returns the hyperbolic tangent of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double tgamma(double x)``
- | Returns the gamma function of :math:`x`.
- - ✓
- - ✓
-
- * - | ``double trunc(double x)``
- | Truncate :math:`x` to the integral part.
- - ✓
- - ✓
-
- * - | ``double y0(double x)``
- | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`.
- - ✓
- - ✓
-
- * - | ``double y1(double x)``
- | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`.
- - ✓
- - ✓
-
- * - | ``double yn(int n, double x)``
- | Returns the value of the Bessel function of the second kind of order n for :math:`x`.
- - ✓
- - ✓
-
-Integer intrinsics
---------------------------------------------------------------------------------------------
-
-Following is the list of supported integer intrinsics. Note that intrinsics are supported on device only.
-
-.. list-table:: Integer intrinsics mathematical functions
-
- * - **Function**
-
- * - | ``unsigned int __brev(unsigned int x)``
- | Reverse the bit order of a 32 bit unsigned integer.
-
- * - | ``unsigned long long int __brevll(unsigned long long int x)``
- | Reverse the bit order of a 64 bit unsigned integer.
-
- * - | ``unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int z)``
- | Return selected bytes from two 32-bit unsigned integers.
-
- * - | ``unsigned int __clz(int x)``
- | Return the number of consecutive high-order zero bits in 32 bit integer.
-
- * - | ``unsigned int __clzll(long long int x)``
- | Return the number of consecutive high-order zero bits in 64 bit integer.
-
- * - | ``unsigned int __ffs(int x)``
- | Find the position of least significant bit set to 1 in a 32 bit integer.
-
- * - | ``unsigned int __ffsll(long long int x)``
- | Find the position of least significant bit set to 1 in a 64 bit signed integer.
-
- * - | ``unsigned int __fns32(unsigned long long mask, unsigned int base, int offset)``
- | Find the position of the n-th set to 1 bit in a 32-bit integer.
-
- * - | ``unsigned int __fns64(unsigned long long int mask, unsigned int base, int offset)``
- | Find the position of the n-th set to 1 bit in a 64-bit integer.
-
- * - | ``unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)``
- | Concatenate :math:`hi` and :math:`lo`, shift left by shift & 31 bits, return the most significant 32 bits.
-
- * - | ``unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)``
- | Concatenate :math:`hi` and :math:`lo`, shift left by min(shift, 32) bits, return the most significant 32 bits.
-
- * - | ``unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)``
- | Concatenate :math:`hi` and :math:`lo`, shift right by shift & 31 bits, return the least significant 32 bits.
-
- * - | ``unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)``
- | Concatenate :math:`hi` and :math:`lo`, shift right by min(shift, 32) bits, return the least significant 32 bits.
-
- * - | ``unsigned int __hadd(int x, int y)``
- | Compute average of signed input arguments, avoiding overflow in the intermediate sum.
-
- * - | ``unsigned int __rhadd(int x, int y)``
- | Compute rounded average of signed input arguments, avoiding overflow in the intermediate sum.
-
- * - | ``unsigned int __uhadd(int x, int y)``
- | Compute average of unsigned input arguments, avoiding overflow in the intermediate sum.
-
- * - | ``unsigned int __urhadd (unsigned int x, unsigned int y)``
- | Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate sum.
-
- * - | ``int __sad(int x, int y, int z)``
- | Returns :math:`|x - y| + z`, the sum of absolute difference.
-
- * - | ``unsigned int __usad(unsigned int x, unsigned int y, unsigned int z)``
- | Returns :math:`|x - y| + z`, the sum of absolute difference.
-
- * - | ``unsigned int __popc(unsigned int x)``
- | Count the number of bits that are set to 1 in a 32 bit integer.
-
- * - | ``unsigned int __popcll(unsigned long long int x)``
- | Count the number of bits that are set to 1 in a 64 bit integer.
-
- * - | ``int __mul24(int x, int y)``
- | Multiply two 24bit integers.
-
- * - | ``unsigned int __umul24(unsigned int x, unsigned int y)``
- | Multiply two 24bit unsigned integers.
-
- * - | ``int __mulhi(int x, int y)``
- | Returns the most significant 32 bits of the product of the two 32-bit integers.
-
- * - | ``unsigned int __umulhi(unsigned int x, unsigned int y)``
- | Returns the most significant 32 bits of the product of the two 32-bit unsigned integers.
-
- * - | ``long long int __mul64hi(long long int x, long long int y)``
- | Returns the most significant 64 bits of the product of the two 64-bit integers.
-
- * - | ``unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y)``
- | Returns the most significant 64 bits of the product of the two 64 unsigned bit integers.
-
-The HIP-Clang implementation of ``__ffs()`` and ``__ffsll()`` contains code to add a constant +1 to produce the ``ffs`` result format.
-For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform,
-HIP-Clang provides ``__lastbit_u32_u32(unsigned int input)`` and ``__lastbit_u32_u64(unsigned long long int input)``.
-The index returned by ``__lastbit_`` instructions starts at -1, while for ``ffs`` the index starts at 0.
-
-Floating-point Intrinsics
---------------------------------------------------------------------------------------------
-
-Following is the list of supported floating-point intrinsics. Note that intrinsics are supported on device only.
-
-.. note::
-
- Only the nearest even rounding mode supported on AMD GPUs by defaults. The ``_rz``, ``_ru`` and
- ``_rd`` suffixed intrinsic functions are existing in HIP AMD backend, if the
- ``OCML_BASIC_ROUNDED_OPERATIONS`` macro is defined.
-
-.. list-table:: Single precision intrinsics mathematical functions
-
- * - **Function**
-
- * - | ``float __cosf(float x)``
- | Returns the fast approximate cosine of :math:`x`.
-
- * - | ``float __exp10f(float x)``
- | Returns the fast approximate for 10 :sup:`x`.
-
- * - | ``float __expf(float x)``
- | Returns the fast approximate for e :sup:`x`.
-
- * - | ``float __fadd_rn(float x, float y)``
- | Add two floating-point values in round-to-nearest-even mode.
-
- * - | ``float __fdiv_rn(float x, float y)``
- | Divide two floating point values in round-to-nearest-even mode.
-
- * - | ``float __fmaf_rn(float x, float y, float z)``
- | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode.
-
- * - | ``float __fmul_rn(float x, float y)``
- | Multiply two floating-point values in round-to-nearest-even mode.
-
- * - | ``float __frcp_rn(float x, float y)``
- | Returns ``1 / x`` in round-to-nearest-even mode.
-
- * - | ``float __frsqrt_rn(float x)``
- | Returns ``1 / √x`` in round-to-nearest-even mode.
-
- * - | ``float __fsqrt_rn(float x)``
- | Returns ``√x`` in round-to-nearest-even mode.
-
- * - | ``float __fsub_rn(float x, float y)``
- | Subtract two floating-point values in round-to-nearest-even mode.
+The ``__syncthreads()`` built-in function is supported in HIP. The ``__syncthreads_count(int)``,
+``__syncthreads_and(int)``, and ``__syncthreads_or(int)`` functions are under development.
- * - | ``float __log10f(float x)``
- | Returns the fast approximate for base 10 logarithm of :math:`x`.
+The Cooperative Groups API offers options to synchronize a developer-defined set of thread groups. For further information, check the :ref:`Cooperative Groups API <cooperative_groups_reference>` or the :ref:`Cooperative Groups how to <cooperative_groups_how-to>`.
- * - | ``float __log2f(float x)``
- | Returns the fast approximate for base 2 logarithm of :math:`x`.
+Math functions
+====================================================
- * - | ``float __logf(float x)``
- | Returns the fast approximate for natural logarithm of :math:`x`.
+HIP-Clang supports a set of math operations that are callable from the device. HIP supports most of the device functions supported by CUDA. These are described on the :ref:`Math API page <math_api_reference>`.
- * - | ``float __powf(float x, float y)``
- | Returns the fast approximate of x :sup:`y`.
+Texture functions
+===============================================
- * - | ``float __saturatef(float x)``
- | Clamp :math:`x` to [+0.0, 1.0].
+The supported texture functions are listed in the ``texture_fetch_functions.h`` and
+``texture_indirect_functions.h`` header files in the
+`HIP-AMD backend repository <https://github.com/ROCm/clr>`_.
- * - | ``float __sincosf(float x, float* sinptr, float* cosptr)``
- | Returns the fast approximate of sine and cosine of :math:`x`.
+Texture functions are not supported on some devices. To determine whether texture
+functions are supported on your device, use the ``__HIP_NO_IMAGE_SUPPORT == 1``
+macro. In host runtime code, you can query the ``hipDeviceAttributeImageSupport``
+attribute to check whether texture functions are supported.
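+
+For instance, a minimal host-side query might look like the following sketch,
+assuming ``deviceId`` holds a valid device ordinal:
+
+.. code-block:: cpp
+
+ int imageSupport = 0;
+ hipDeviceGetAttribute(&imageSupport, hipDeviceAttributeImageSupport, deviceId);
+ if (imageSupport == 0) {
+   // Texture functions are not available on this device.
+ }
+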
- * - | ``float __sinf(float x)``
- | Returns the fast approximate sine of :math:`x`.
+Surface functions
+===============================================
- * - | ``float __tanf(float x)``
- | Returns the fast approximate tangent of :math:`x`.
+The following surface functions are supported in HIP:
-.. list-table:: Double precision intrinsics mathematical functions
+.. doxygengroup:: Surface
+ :content-only:
- * - **Function**
+.. doxygenfunction:: surf1Dread
- * - | ``double __dadd_rn(double x, double y)``
- | Add two floating-point values in round-to-nearest-even mode.
+.. doxygenfunction:: surf1Dwrite
- * - | ``double __ddiv_rn(double x, double y)``
- | Divide two floating-point values in round-to-nearest-even mode.
+.. doxygenfunction:: surf2Dread
- * - | ``double __dmul_rn(double x, double y)``
- | Multiply two floating-point values in round-to-nearest-even mode.
+.. doxygenfunction:: surf2Dwrite
- * - | ``double __drcp_rn(double x, double y)``
- | Returns ``1 / x`` in round-to-nearest-even mode.
+.. doxygenfunction:: surf3Dread
- * - | ``double __dsqrt_rn(double x)``
- | Returns ``√x`` in round-to-nearest-even mode.
+.. doxygenfunction:: surf3Dwrite
- * - | ``double __dsub_rn(double x, double y)``
- | Subtract two floating-point values in round-to-nearest-even mode.
+.. doxygenfunction:: surf1DLayeredread
- * - | ``double __fma_rn(double x, double y, double z)``
- | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode.
+.. doxygenfunction:: surf1DLayeredwrite
+.. doxygenfunction:: surf2DLayeredread
-Texture functions
-===============================================
+.. doxygenfunction:: surf2DLayeredwrite
-The supported texture functions are listed in ``texture_fetch_functions.h`` and
-``texture_indirect_functions.h`` header files in the
-`HIP-AMD backend repository `_.
+.. doxygenfunction:: surfCubemapread
-Texture functions are not supported on some devices. To determine if texture functions are supported
-on your device, use ``Macro __HIP_NO_IMAGE_SUPPORT == 1``. You can query the attribute
-``hipDeviceAttributeImageSupport`` to check if texture functions are supported in the host runtime
-code.
+.. doxygenfunction:: surfCubemapwrite
-Surface functions
-===============================================
+.. doxygenfunction:: surfCubemapLayeredread
-Surface functions are not supported.
+.. doxygenfunction:: surfCubemapLayeredwrite
Timer functions
===============================================
@@ -1474,6 +397,8 @@ To read a high-resolution timer from the device, HIP provides the following buil
Where ``hipDeviceAttributeWallClockRate`` is a device attribute. Note that wall clock frequency is a
per-device attribute.
+ Note that ``clock()`` and ``clock64()`` do not work properly on AMD RDNA3 (GFX11) graphics processors.
+
Atomic functions
===============================================
@@ -2051,6 +976,8 @@ HIP supports the following kernel language cooperative groups types and function
- ✓
- ✓
+For further information, check the :ref:`Cooperative Groups API <cooperative_groups_reference>` or the :ref:`Cooperative Groups how to <cooperative_groups_how-to>`.
+
Warp matrix functions
============================================================
@@ -2073,7 +1000,7 @@ HIP does not support kernel language warp matrix types or functions.
- ✗
- ✓
- * - ``void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout)``
+ * - ``void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout)``
- ✗
- ✓
@@ -2260,7 +1187,7 @@ Unroll with a bounds that is known at compile-time is supported. For example:
.. code-block:: cpp
- #pragma unroll 1 /* tell compiler to never unroll the loop */
+ #pragma unroll 1 /* tell compiler to never unroll the loop */
for (int i=0; i<16; i++) ...
.. code-block:: cpp
diff --git a/docs/reference/cpp_language_support.rst b/docs/reference/cpp_language_support.rst
new file mode 100644
index 0000000000..829cfa9e48
--- /dev/null
+++ b/docs/reference/cpp_language_support.rst
@@ -0,0 +1,169 @@
+.. meta::
+ :description: This chapter describes the C++ language support of the HIP ecosystem in ROCm software.
+ :keywords: AMD, ROCm, HIP, C++
+
+*******************************************************************************
+C++ language support
+*******************************************************************************
+
+The ROCm platform enables the power of combined C++ and HIP (Heterogeneous-computing
+Interface for Portability) code. This code is compiled with a ``clang`` or ``clang++``
+compiler. The official compilers support the HIP platform, or you can use the
+``amdclang`` or ``amdclang++`` compilers included in the ROCm installation, which are
+wrappers for the official versions.
+
+The source code is compiled according to the ``C++03``, ``C++11``, ``C++14``, ``C++17``,
+and ``C++20`` standards, along with HIP-specific extensions, but is subject to
+restrictions. The key restriction is the reduced support of the standard library in
+device code. This is because, by default, a function is considered to run on the host,
+except for ``constexpr`` functions, which can run on both the host and the device.
+
+.. _language_modern_cpp_support:
+
+Modern C++ support
+===============================================================================
+
+C++ is considered a modern programming language as of C++11. This section describes how
+HIP supports these new C++ features.
+
+C++11 support
+-------------------------------------------------------------------------------
+
+The C++11 standard introduced many new features. These features are supported in HIP host
+code, with some notable omissions on the device side. The rule of thumb is that
+``constexpr`` functions work on the device, while most of the rest doesn't. This means
+that some important functionality like ``std::function`` is missing on the device.
+Unfortunately, the standard library wasn't designed with HIP in mind, so the device-side
+support is in a state of "works as-is".
+
+Certain features have restrictions and clarifications. For example, any functions using
+the ``constexpr`` qualifier or the new ``initializer lists``, ``std::move`` or
+``std::forward`` features are implicitly considered to have the ``__host__`` and
+``__device__`` execution space specifier. Also, ``constexpr`` variables that are static
+members or namespace scoped can be used from both host and device, but only for read
+access. Dereferencing a static ``constexpr`` outside its specified execution space causes
+an error.
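+
+For example, under these rules the following function is implicitly callable
+from both host and device code (a minimal sketch):
+
+.. code-block:: cpp
+
+ // Implicitly __host__ __device__, because it is constexpr.
+ constexpr float euler_number() { return 2.71828182f; }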
+
+Lambdas are supported, but there are some extensions and restrictions on their usage. For
+more information, see the `Extended lambdas`_ section below.
+
+C++14 support
+-------------------------------------------------------------------------------
+
+The C++14 language features are supported.
+
+C++17 support
+-------------------------------------------------------------------------------
+
+All C++17 language features are supported.
+
+C++20 support
+-------------------------------------------------------------------------------
+
+All C++20 language features are supported, but extensions and restrictions apply. C++20
+introduced coroutines and modules, which fundamentally changed how programs are written.
+HIP doesn't support these features. However, ``consteval`` functions can be called from
+host and device, even if specified for host use only.
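+
+As a brief sketch (assuming compilation with ``-std=c++20``), a ``consteval``
+function can be used in device code because it is evaluated at compile time:
+
+.. code-block:: cpp
+
+ consteval int square(int x) { return x * x; }
+
+ __global__ void kernel(int *out) {
+   *out = square(4); // evaluated by the compiler, so usable on the device
+ }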
+
+The three-way comparison operator (spaceship operator ``<=>``) works with host and device
+code.
+
+.. _language_restrictions:
+
+Extensions and restrictions
+===============================================================================
+
+In addition to the deviations from the standard, there are some general extensions and
+restrictions to consider.
+
+Global functions
+-------------------------------------------------------------------------------
+
+Functions that serve as an entry point for device execution are called kernels and are
+specified with the ``__global__`` qualifier. To call a kernel function, use the triple
+chevron operator: ``<<< >>>``. Kernel functions must have a ``void`` return type. These
+functions can't:
+
+* have a ``constexpr`` specifier
+* have a parameter of type ``std::initializer_list`` or ``va_list``
+* use an rvalue reference as a parameter
+* use parameters with different sizes in host and device code, such as ``long double`` arguments or structs containing ``long double`` members
+* use struct-type arguments that have a different layout in host and device code
+
+Kernels can have variadic template parameters, but only one parameter pack, which must be
+the last item in the template parameter list.
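+
+A minimal sketch illustrating these rules; the kernel and its arguments are
+hypothetical:
+
+.. code-block:: cpp
+
+ #include <hip/hip_runtime.h>
+
+ // void return type and a single, trailing parameter pack.
+ template <typename... Ts>
+ __global__ void write_sum(int *out, Ts... values) {
+   *out = (values + ...); // C++17 fold expression over the pack
+ }
+
+ int main() {
+   int *d_out;
+   hipMalloc(&d_out, sizeof(*d_out));
+   // Triple chevron launch: grid dim, block dim, shared memory bytes, stream.
+   write_sum<<<dim3(1), dim3(1), 0, 0>>>(d_out, 1, 2, 3);
+   hipDeviceSynchronize();
+   hipFree(d_out);
+   return 0;
+ }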
+
+Device space memory specifiers
+-------------------------------------------------------------------------------
+
+HIP includes device space memory specifiers to indicate whether a variable is allocated
+in host or device memory and how its memory should be allocated. HIP supports the
+``__device__``, ``__shared__``, ``__managed__``, and ``__constant__`` specifiers.
+
+The ``__device__`` and ``__constant__`` specifiers define global variables, which are
+allocated within global memory on the HIP devices. The only difference is that
+``__constant__`` variables can't be changed after allocation. The ``__shared__``
+specifier allocates the variable within shared memory, which is available for all threads
+in a block.
+
+The ``__managed__`` variable specifier creates global variables that are initially
+undefined and unaddressed within the global symbol table. The HIP runtime allocates
+managed memory and defines the symbol when it loads the device binary. A managed variable
+can be accessed in both device and host code.
+
+It's important to know where a variable is stored because it is only available from
+certain locations. Generally, variables allocated in the host memory are not accessible
+from the device code, while variables allocated in the device memory are not directly
+accessible from the host code. Dereferencing a pointer to device memory on the host
+results in a segmentation fault. Accessing device variables in host code should be done
+through kernel execution or HIP functions like ``hipMemcpyToSymbol``.
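+
+The following minimal sketch shows the specifiers together; the variable names
+are illustrative only:
+
+.. code-block:: cpp
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ __constant__ float scale_factor = 2.0f; // device global memory, read-only after load
+ __device__ float result;                // device global memory
+
+ __global__ void apply_scale(float x) {
+   __shared__ float tile[64];            // shared by all threads of a block
+   tile[threadIdx.x] = x * scale_factor;
+   __syncthreads();
+   if (threadIdx.x == 0) result = tile[0];
+ }
+
+ int main() {
+   apply_scale<<<dim3(1), dim3(64), 0, 0>>>(21.0f);
+   float host_result = 0.0f;
+   // Device variables are accessed from host code through HIP runtime calls.
+   hipMemcpyFromSymbol(&host_result, HIP_SYMBOL(result), sizeof(host_result));
+   std::cout << host_result << std::endl; // prints 42
+   return 0;
+ }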
+
+Exception handling
+-------------------------------------------------------------------------------
+
+An important difference between the host and device code is exception handling. In device
+code, this control flow isn't available due to the hardware architecture. The device
+code must use return codes to handle errors.
+
+Kernel parameters
+-------------------------------------------------------------------------------
+
+There are some restrictions on kernel function parameters. They cannot be passed by
+reference, because these functions are called from the host but run on the device. Also,
+a variable number of arguments is not allowed.
+
+Classes
+-------------------------------------------------------------------------------
+
+Classes work on both the host and device side, but there are some constraints.
+``static`` member functions can't be ``__global__``. ``virtual`` member functions
+work, but a ``virtual`` function must not be called from the host if the parent
+object was created on the device, or the other way around, because this behavior
+is undefined. Another minor restriction is that ``__device__`` variables that are
+globally scoped must have trivial constructors.
+
+Polymorphic function wrappers
+-------------------------------------------------------------------------------
+
+HIP doesn't support the polymorphic function wrapper ``std::function``, which was
+introduced in C++11.
+
+Extended lambdas
+-------------------------------------------------------------------------------
+
+HIP supports lambdas, which by default work as expected.
+
+Lambdas implicitly have the ``__host__`` and ``__device__`` attributes. This means
+that they can be executed by both host and device code and work the way you would
+expect. To make a lambda callable only by host or device code, you can add the
+``__host__`` or ``__device__`` attribute. The only restriction is that host
+variables can only be accessed on the device through a copy; accessing them
+through a reference causes undefined behavior.
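+
+A brief sketch of a lambda captured by value and run on the device; the kernel
+wrapper is hypothetical:
+
+.. code-block:: cpp
+
+ #include <hip/hip_runtime.h>
+ #include <cstdio>
+
+ template <typename F>
+ __global__ void run(F f) { f(); }
+
+ int main() {
+   int value = 42; // host variable: must be captured by copy
+   auto body = [=]() { printf("value = %d\n", value); };
+   run<<<dim3(1), dim3(1), 0, 0>>>(body);
+   hipDeviceSynchronize();
+   return 0;
+ }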
+
+Inline namespaces
+-------------------------------------------------------------------------------
+
+Inline namespaces are supported, but with a few exceptions. The following entities can't
+be declared in namespace scope within an inline unnamed namespace:
+
+* ``__managed__``, ``__device__``, ``__shared__`` and ``__constant__`` variables
+* ``__global__`` functions and function templates
+* variables with surface or texture type
diff --git a/docs/reference/math_api.rst b/docs/reference/math_api.rst
new file mode 100644
index 0000000000..fd3a215dd2
--- /dev/null
+++ b/docs/reference/math_api.rst
@@ -0,0 +1,1121 @@
+.. meta::
+ :description: This chapter describes the built-in math functions that are accessible in HIP.
+ :keywords: AMD, ROCm, HIP, CUDA, math functions, HIP math functions
+
+.. _math_api_reference:
+
+********************************************************************************
+HIP math API
+********************************************************************************
+
+HIP-Clang supports a set of math operations that are callable from the device. HIP supports most of the device functions supported by NVIDIA CUDA. These are described in the following sections.
+
+Single precision mathematical functions
+=======================================
+
+
+Following is the list of supported single precision mathematical functions.
+
+.. list-table:: Single precision mathematical functions
+
+ * - **Function**
+ - **Supported on Host**
+ - **Supported on Device**
+
+ * - | ``float abs(float x)``
+ | Returns the absolute value of :math:`x`
+ - ✓
+ - ✓
+
+ * - | ``float acosf(float x)``
+ | Returns the arc cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float acoshf(float x)``
+ | Returns the nonnegative arc hyperbolic cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float asinf(float x)``
+ | Returns the arc sine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float asinhf(float x)``
+ | Returns the arc hyperbolic sine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float atanf(float x)``
+ | Returns the arc tangent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float atan2f(float x, float y)``
+ | Returns the arc tangent of the ratio of :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``float atanhf(float x)``
+ | Returns the arc hyperbolic tangent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float cbrtf(float x)``
+ | Returns the cube root of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float ceilf(float x)``
+ | Returns ceiling of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float copysignf(float x, float y)``
+ | Create value with given magnitude, copying sign of second value.
+ - ✓
+ - ✓
+
+ * - | ``float cosf(float x)``
+ | Returns the cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float coshf(float x)``
+ | Returns the hyperbolic cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float cospif(float x)``
+ | Returns the cosine of :math:`\pi \cdot x`.
+ - ✓
+ - ✓
+
+ * - | ``float cyl_bessel_i0f(float x)``
+ | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`.
+ - ✗
+ - ✗
+
+ * - | ``float cyl_bessel_i1f(float x)``
+ | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`.
+ - ✗
+ - ✗
+
+ * - | ``float erff(float x)``
+ | Returns the error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float erfcf(float x)``
+ | Returns the complementary error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float erfcinvf(float x)``
+ | Returns the inverse complementary error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float erfcxf(float x)``
+ | Returns the scaled complementary error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float erfinvf(float x)``
+ | Returns the inverse error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float expf(float x)``
+ | Returns :math:`e^x`.
+ - ✓
+ - ✓
+
+ * - | ``float exp10f(float x)``
+ | Returns :math:`10^x`.
+ - ✓
+ - ✓
+
+ * - | ``float exp2f( float x)``
+ | Returns :math:`2^x`.
+ - ✓
+ - ✓
+
+ * - | ``float expm1f(float x)``
+ | Returns :math:`e^x - 1`.
+ - ✓
+ - ✓
+
+ * - | ``float fabsf(float x)``
+ | Returns the absolute value of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float fdimf(float x, float y)``
+ | Returns the positive difference between :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``float fdividef(float x, float y)``
+ | Divide two floating point values.
+ - ✓
+ - ✓
+
+ * - | ``float floorf(float x)``
+ | Returns the largest integer less than or equal to :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float fmaf(float x, float y, float z)``
+ | Returns :math:`x \cdot y + z` as a single operation.
+ - ✓
+ - ✓
+
+ * - | ``float fmaxf(float x, float y)``
+ | Determine the maximum numeric value of :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``float fminf(float x, float y)``
+ | Determine the minimum numeric value of :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``float fmodf(float x, float y)``
+ | Returns the floating-point remainder of :math:`x / y`.
+ - ✓
+ - ✓
+
+ * - | ``float modff(float x, float* iptr)``
+ | Break down :math:`x` into fractional and integral parts.
+ - ✓
+ - ✗
+
+ * - | ``float frexpf(float x, int* nptr)``
+ | Extract mantissa and exponent of :math:`x`.
+ - ✓
+ - ✗
+
+ * - | ``float hypotf(float x, float y)``
+ | Returns the square root of the sum of squares of :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``int ilogbf(float x)``
+ | Returns the unbiased integer exponent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``bool isfinite(float x)``
+ | Determine whether :math:`x` is finite.
+ - ✓
+ - ✓
+
+ * - | ``bool isinf(float x)``
+ | Determine whether :math:`x` is infinite.
+ - ✓
+ - ✓
+
+ * - | ``bool isnan(float x)``
+ | Determine whether :math:`x` is a ``NAN``.
+ - ✓
+ - ✓
+
+ * - | ``float j0f(float x)``
+ | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float j1f(float x)``
+ | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float jnf(int n, float x)``
+ | Returns the value of the Bessel function of the first kind of order n for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float ldexpf(float x, int exp)``
+ | Returns the value of :math:`x \cdot 2^{exp}`.
+ - ✓
+ - ✓
+
+ * - | ``float lgammaf(float x)``
+ | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`.
+ - ✓
+ - ✗
+
+ * - | ``long int lrintf(float x)``
+ | Round :math:`x` to nearest integer value.
+ - ✓
+ - ✓
+
+ * - | ``long long int llrintf(float x)``
+ | Round :math:`x` to nearest integer value.
+ - ✓
+ - ✓
+
+ * - | ``long int lroundf(float x)``
+ | Round to nearest integer value.
+ - ✓
+ - ✓
+
+ * - | ``long long int llroundf(float x)``
+ | Round to nearest integer value.
+ - ✓
+ - ✓
+
+ * - | ``float log10f(float x)``
+ | Returns the base 10 logarithm of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float log1pf(float x)``
+ | Returns the natural logarithm of :math:`x + 1`.
+ - ✓
+ - ✓
+
+ * - | ``float log2f(float x)``
+ | Returns the base 2 logarithm of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float logf(float x)``
+ | Returns the natural logarithm of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float logbf(float x)``
+ | Returns the floating point representation of the exponent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float nanf(const char* tagp)``
+ | Returns "Not a Number" value.
+ - ✗
+ - ✓
+
+ * - | ``float nearbyintf(float x)``
+ | Round :math:`x` to the nearest integer.
+ - ✓
+ - ✓
+
+ * - | ``float nextafterf(float x, float y)``
+ | Returns next representable single-precision floating-point value after argument.
+ - ✓
+ - ✗
+
+ * - | ``float norm3df(float x, float y, float z)``
+ | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`.
+ - ✓
+ - ✓
+
+ * - | ``float norm4df(float x, float y, float z, float w)``
+ | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`.
+ - ✓
+ - ✓
+
+ * - | ``float normcdff(float y)``
+ | Returns the standard normal cumulative distribution function.
+ - ✓
+ - ✓
+
+ * - | ``float normcdfinvf(float y)``
+ | Returns the inverse of the standard normal cumulative distribution function.
+ - ✓
+ - ✓
+
+ * - | ``float normf(int dim, const float *a)``
+ | Returns the square root of the sum of squares of any number of coordinates.
+ - ✓
+ - ✓
+
+ * - | ``float powf(float x, float y)``
+ | Returns :math:`x^y`.
+ - ✓
+ - ✓
+
+ * - | ``float powif(float base, int iexp)``
+ | Returns the value of first argument to the power of second argument.
+ - ✓
+ - ✓
+
+ * - | ``float remainderf(float x, float y)``
+ | Returns single-precision floating-point remainder.
+ - ✓
+ - ✓
+
+ * - | ``float remquof(float x, float y, int* quo)``
+ | Returns single-precision floating-point remainder and part of quotient.
+ - ✓
+ - ✓
+
+ * - | ``float roundf(float x)``
+ | Round to nearest integer value in floating-point.
+ - ✓
+ - ✓
+
+ * - | ``float rcbrtf(float x)``
+ | Returns the reciprocal cube root function.
+ - ✓
+ - ✓
+
+ * - | ``float rhypotf(float x, float y)``
+ | Returns one over the square root of the sum of squares of two arguments.
+ - ✓
+ - ✓
+
+ * - | ``float rintf(float x)``
+ | Round input to nearest integer value in floating-point.
+ - ✓
+ - ✓
+
+ * - | ``float rnorm3df(float x, float y, float z)``
+ | Returns one over the square root of the sum of squares of three coordinates of the argument.
+ - ✓
+ - ✓
+
+ * - | ``float rnorm4df(float x, float y, float z, float w)``
+ | Returns one over the square root of the sum of squares of four coordinates of the argument.
+ - ✓
+ - ✓
+
+ * - | ``float rnormf(int dim, const float *a)``
+ | Returns the reciprocal of square root of the sum of squares of any number of coordinates.
+ - ✓
+ - ✓
+
+ * - | ``float scalblnf(float x, long int n)``
+ | Scale :math:`x` by :math:`2^n`.
+ - ✓
+ - ✓
+
+ * - | ``float scalbnf(float x, int n)``
+ | Scale :math:`x` by :math:`2^n`.
+ - ✓
+ - ✓
+
+ * - | ``bool signbit(float x)``
+ | Return the sign bit of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float sinf(float x)``
+ | Returns the sine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float sinhf(float x)``
+ | Returns the hyperbolic sine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float sinpif(float x)``
+ | Returns the sine of :math:`\pi \cdot x`.
+ - ✓
+ - ✓
+
+ * - | ``void sincosf(float x, float *sptr, float *cptr)``
+ | Returns the sine and cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``void sincospif(float x, float *sptr, float *cptr)``
+ | Returns the sine and cosine of :math:`\pi \cdot x`.
+ - ✓
+ - ✓
+
+ * - | ``float sqrtf(float x)``
+ | Returns the square root of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float rsqrtf(float x)``
+ | Returns the reciprocal of the square root of :math:`x`.
+ - ✗
+ - ✓
+
+ * - | ``float tanf(float x)``
+ | Returns the tangent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float tanhf(float x)``
+ | Returns the hyperbolic tangent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float tgammaf(float x)``
+ | Returns the gamma function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float truncf(float x)``
+ | Truncate :math:`x` to the integral part.
+ - ✓
+ - ✓
+
+ * - | ``float y0f(float x)``
+ | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float y1f(float x)``
+ | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``float ynf(int n, float x)``
+ | Returns the value of the Bessel function of the second kind of order n for :math:`x`.
+ - ✓
+ - ✓
+
+Double precision mathematical functions
+=======================================
+
+Following is the list of supported double precision mathematical functions.
+
+.. list-table:: Double precision mathematical functions
+
+ * - **Function**
+ - **Supported on Host**
+ - **Supported on Device**
+
+ * - | ``double abs(double x)``
+ | Returns the absolute value of :math:`x`
+ - ✓
+ - ✓
+
+ * - | ``double acos(double x)``
+ | Returns the arc cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double acosh(double x)``
+ | Returns the nonnegative arc hyperbolic cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double asin(double x)``
+ | Returns the arc sine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double asinh(double x)``
+ | Returns the arc hyperbolic sine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double atan(double x)``
+ | Returns the arc tangent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double atan2(double x, double y)``
+ | Returns the arc tangent of the ratio of :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``double atanh(double x)``
+ | Returns the arc hyperbolic tangent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double cbrt(double x)``
+ | Returns the cube root of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double ceil(double x)``
+ | Returns ceiling of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double copysign(double x, double y)``
+ | Create value with given magnitude, copying sign of second value.
+ - ✓
+ - ✓
+
+ * - | ``double cos(double x)``
+ | Returns the cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double cosh(double x)``
+ | Returns the hyperbolic cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double cospi(double x)``
+ | Returns the cosine of :math:`\pi \cdot x`.
+ - ✓
+ - ✓
+
+ * - | ``double cyl_bessel_i0(double x)``
+ | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`.
+ - ✗
+ - ✗
+
+ * - | ``double cyl_bessel_i1(double x)``
+ | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`.
+ - ✗
+ - ✗
+
+ * - | ``double erf(double x)``
+ | Returns the error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double erfc(double x)``
+ | Returns the complementary error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double erfcinv(double x)``
+ | Returns the inverse complementary error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double erfcx(double x)``
+ | Returns the scaled complementary error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double erfinv(double x)``
+ | Returns the inverse error function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double exp(double x)``
+ | Returns :math:`e^x`.
+ - ✓
+ - ✓
+
+ * - | ``double exp10(double x)``
+ | Returns :math:`10^x`.
+ - ✓
+ - ✓
+
+ * - | ``double exp2( double x)``
+ | Returns :math:`2^x`.
+ - ✓
+ - ✓
+
+ * - | ``double expm1(double x)``
+ | Returns :math:`e^x - 1`.
+ - ✓
+ - ✓
+
+ * - | ``double fabs(double x)``
+ | Returns the absolute value of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double fdim(double x, double y)``
+ | Returns the positive difference between :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``double floor(double x)``
+ | Returns the largest integer less than or equal to :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double fma(double x, double y, double z)``
+ | Returns :math:`x \cdot y + z` as a single operation.
+ - ✓
+ - ✓
+
+ * - | ``double fmax(double x, double y)``
+ | Determine the maximum numeric value of :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``double fmin(double x, double y)``
+ | Determine the minimum numeric value of :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``double fmod(double x, double y)``
+ | Returns the floating-point remainder of :math:`x / y`.
+ - ✓
+ - ✓
+
+ * - | ``double modf(double x, double* iptr)``
+ | Break down :math:`x` into fractional and integral parts.
+ - ✓
+ - ✗
+
+ * - | ``double frexp(double x, int* nptr)``
+ | Extract mantissa and exponent of :math:`x`.
+ - ✓
+ - ✗
+
+ * - | ``double hypot(double x, double y)``
+ | Returns the square root of the sum of squares of :math:`x` and :math:`y`.
+ - ✓
+ - ✓
+
+ * - | ``int ilogb(double x)``
+ | Returns the unbiased integer exponent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``bool isfinite(double x)``
+ | Determine whether :math:`x` is finite.
+ - ✓
+ - ✓
+
+ * - | ``bool isinf(double x)``
+ | Determine whether :math:`x` is infinite.
+ - ✓
+ - ✓
+
+ * - | ``bool isnan(double x)``
+ | Determine whether :math:`x` is a ``NAN``.
+ - ✓
+ - ✓
+
+ * - | ``double j0(double x)``
+ | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double j1(double x)``
+ | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double jn(int n, double x)``
+ | Returns the value of the Bessel function of the first kind of order n for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double ldexp(double x, int exp)``
+ | Returns the value of :math:`x \cdot 2^{exp}`.
+ - ✓
+ - ✓
+
+ * - | ``double lgamma(double x)``
+ | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`.
+ - ✓
+ - ✗
+
+ * - | ``long int lrint(double x)``
+ | Round :math:`x` to nearest integer value.
+ - ✓
+ - ✓
+
+ * - | ``long long int llrint(double x)``
+ | Round :math:`x` to nearest integer value.
+ - ✓
+ - ✓
+
+ * - | ``long int lround(double x)``
+ | Round to nearest integer value.
+ - ✓
+ - ✓
+
+ * - | ``long long int llround(double x)``
+ | Round to nearest integer value.
+ - ✓
+ - ✓
+
+ * - | ``double log10(double x)``
+ | Returns the base 10 logarithm of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double log1p(double x)``
+ | Returns the natural logarithm of :math:`x + 1`.
+ - ✓
+ - ✓
+
+ * - | ``double log2(double x)``
+ | Returns the base 2 logarithm of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double log(double x)``
+ | Returns the natural logarithm of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double logb(double x)``
+ | Returns the floating point representation of the exponent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double nan(const char* tagp)``
+ | Returns "Not a Number" value.
+ - ✗
+ - ✓
+
+ * - | ``double nearbyint(double x)``
+ | Round :math:`x` to the nearest integer.
+ - ✓
+ - ✓
+
+ * - | ``double nextafter(double x, double y)``
+ | Returns next representable double-precision floating-point value after argument.
+ - ✓
+ - ✓
+
+ * - | ``double norm3d(double x, double y, double z)``
+ | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`.
+ - ✓
+ - ✓
+
+ * - | ``double norm4d(double x, double y, double z, double w)``
+ | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`.
+ - ✓
+ - ✓
+
+ * - | ``double normcdf(double y)``
+ | Returns the standard normal cumulative distribution function.
+ - ✓
+ - ✓
+
+ * - | ``double normcdfinv(double y)``
+ | Returns the inverse of the standard normal cumulative distribution function.
+ - ✓
+ - ✓
+
+ * - | ``double norm(int dim, const double *a)``
+ | Returns the square root of the sum of squares of any number of coordinates.
+ - ✓
+ - ✓
+
+ * - | ``double pow(double x, double y)``
+ | Returns :math:`x^y`.
+ - ✓
+ - ✓
+
+ * - | ``double powi(double base, int iexp)``
+ | Returns the value of first argument to the power of second argument.
+ - ✓
+ - ✓
+
+ * - | ``double remainder(double x, double y)``
+ | Returns double-precision floating-point remainder.
+ - ✓
+ - ✓
+
+ * - | ``double remquo(double x, double y, int* quo)``
+ | Returns double-precision floating-point remainder and part of quotient.
+ - ✓
+ - ✗
+
+ * - | ``double round(double x)``
+ | Round to nearest integer value in floating-point.
+ - ✓
+ - ✓
+
+ * - | ``double rcbrt(double x)``
+ | Returns the reciprocal cube root function.
+ - ✓
+ - ✓
+
+ * - | ``double rhypot(double x, double y)``
+ | Returns one over the square root of the sum of squares of two arguments.
+ - ✓
+ - ✓
+
+ * - | ``double rint(double x)``
+ | Round input to nearest integer value in floating-point.
+ - ✓
+ - ✓
+
+ * - | ``double rnorm3d(double x, double y, double z)``
+ | Returns one over the square root of the sum of squares of three coordinates of the argument.
+ - ✓
+ - ✓
+
+ * - | ``double rnorm4d(double x, double y, double z, double w)``
+ | Returns one over the square root of the sum of squares of four coordinates of the argument.
+ - ✓
+ - ✓
+
+ * - | ``double rnorm(int dim, const double *a)``
+ | Returns the reciprocal of square root of the sum of squares of any number of coordinates.
+ - ✓
+ - ✓
+
+ * - | ``double scalbln(double x, long int n)``
+ | Scale :math:`x` by :math:`2^n`.
+ - ✓
+ - ✓
+
+ * - | ``double scalbn(double x, int n)``
+ | Scale :math:`x` by :math:`2^n`.
+ - ✓
+ - ✓
+
+ * - | ``bool signbit(double x)``
+ | Return the sign bit of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double sin(double x)``
+ | Returns the sine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double sinh(double x)``
+ | Returns the hyperbolic sine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double sinpi(double x)``
+ | Returns the sine of :math:`\pi \cdot x`.
+ - ✓
+ - ✓
+
+ * - | ``void sincos(double x, double *sptr, double *cptr)``
+ | Returns the sine and cosine of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``void sincospi(double x, double *sptr, double *cptr)``
+ | Returns the sine and cosine of :math:`\pi \cdot x`.
+ - ✓
+ - ✓
+
+ * - | ``double sqrt(double x)``
+ | Returns the square root of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double rsqrt(double x)``
+ | Returns the reciprocal of the square root of :math:`x`.
+ - ✗
+ - ✓
+
+ * - | ``double tan(double x)``
+ | Returns the tangent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double tanh(double x)``
+ | Returns the hyperbolic tangent of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double tgamma(double x)``
+ | Returns the gamma function of :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double trunc(double x)``
+ | Truncate :math:`x` to the integral part.
+ - ✓
+ - ✓
+
+ * - | ``double y0(double x)``
+ | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double y1(double x)``
+ | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`.
+ - ✓
+ - ✓
+
+ * - | ``double yn(int n, double x)``
+ | Returns the value of the Bessel function of the second kind of order n for :math:`x`.
+ - ✓
+ - ✓
+
+Integer intrinsics
+==================
+
+Following is the list of supported integer intrinsics. Note that intrinsics are supported on device only.
+
+.. list-table:: Integer intrinsics mathematical functions
+
+ * - **Function**
+
+ * - | ``unsigned int __brev(unsigned int x)``
+ | Reverse the bit order of a 32 bit unsigned integer.
+
+ * - | ``unsigned long long int __brevll(unsigned long long int x)``
+ | Reverse the bit order of a 64 bit unsigned integer.
+
+ * - | ``unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int z)``
+ | Return selected bytes from two 32-bit unsigned integers.
+
+ * - | ``unsigned int __clz(int x)``
+ | Return the number of consecutive high-order zero bits in 32 bit integer.
+
+ * - | ``unsigned int __clzll(long long int x)``
+ | Return the number of consecutive high-order zero bits in 64 bit integer.
+
+ * - | ``unsigned int __ffs(int x)``
+ | Find the position of least significant bit set to 1 in a 32 bit integer.
+
+ * - | ``unsigned int __ffsll(long long int x)``
+ | Find the position of least significant bit set to 1 in a 64 bit signed integer.
+
+ * - | ``unsigned int __fns32(unsigned long long mask, unsigned int base, int offset)``
+ | Find the position of the n-th set to 1 bit in a 32-bit integer.
+
+ * - | ``unsigned int __fns64(unsigned long long int mask, unsigned int base, int offset)``
+ | Find the position of the n-th set to 1 bit in a 64-bit integer.
+
+ * - | ``unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)``
+ | Concatenate :math:`hi` and :math:`lo`, shift left by shift & 31 bits, return the most significant 32 bits.
+
+ * - | ``unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)``
+ | Concatenate :math:`hi` and :math:`lo`, shift left by min(shift, 32) bits, return the most significant 32 bits.
+
+ * - | ``unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)``
+ | Concatenate :math:`hi` and :math:`lo`, shift right by shift & 31 bits, return the least significant 32 bits.
+
+ * - | ``unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)``
+ | Concatenate :math:`hi` and :math:`lo`, shift right by min(shift, 32) bits, return the least significant 32 bits.
+
+ * - | ``int __hadd(int x, int y)``
+ | Compute average of signed input arguments, avoiding overflow in the intermediate sum.
+
+ * - | ``int __rhadd(int x, int y)``
+ | Compute rounded average of signed input arguments, avoiding overflow in the intermediate sum.
+
+ * - | ``unsigned int __uhadd(unsigned int x, unsigned int y)``
+ | Compute average of unsigned input arguments, avoiding overflow in the intermediate sum.
+
+ * - | ``unsigned int __urhadd(unsigned int x, unsigned int y)``
+ | Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate sum.
+
+ * - | ``int __sad(int x, int y, int z)``
+ | Returns :math:`|x - y| + z`, the sum of absolute difference.
+
+ * - | ``unsigned int __usad(unsigned int x, unsigned int y, unsigned int z)``
+ | Returns :math:`|x - y| + z`, the sum of absolute difference.
+
+ * - | ``unsigned int __popc(unsigned int x)``
+ | Count the number of bits that are set to 1 in a 32 bit integer.
+
+ * - | ``unsigned int __popcll(unsigned long long int x)``
+ | Count the number of bits that are set to 1 in a 64 bit integer.
+
+ * - | ``int __mul24(int x, int y)``
+ | Multiply two 24-bit integers.
+
+ * - | ``unsigned int __umul24(unsigned int x, unsigned int y)``
+ | Multiply two 24-bit unsigned integers.
+
+ * - | ``int __mulhi(int x, int y)``
+ | Returns the most significant 32 bits of the product of the two 32-bit integers.
+
+ * - | ``unsigned int __umulhi(unsigned int x, unsigned int y)``
+ | Returns the most significant 32 bits of the product of the two 32-bit unsigned integers.
+
+ * - | ``long long int __mul64hi(long long int x, long long int y)``
+ | Returns the most significant 64 bits of the product of the two 64-bit integers.
+
+ * - | ``unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y)``
+ | Returns the most significant 64 bits of the product of the two 64-bit unsigned integers.
+
+The HIP-Clang implementation of ``__ffs()`` and ``__ffsll()`` contains code that adds a constant +1 to produce the ``ffs`` result format.
+For the cases where this overhead is not acceptable and the programmer is willing to specialize for the platform,
+HIP-Clang provides ``__lastbit_u32_u32(unsigned int input)`` and ``__lastbit_u32_u64(unsigned long long int input)``.
+The index returned by the ``__lastbit_`` functions starts at -1, while for ``ffs`` the index starts at 0.
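+
+The following sketch contrasts the two (``first_set_bit`` is an illustrative kernel, and the commented values assume the semantics described above):
+
+.. code-block:: cuda
+
+   __global__ void first_set_bit(unsigned int x, int* out)
+   {
+       // For x = 0b1000, __ffs returns 4 (1-based position of the least
+       // significant set bit), while __lastbit_u32_u32 returns 3 (0-based).
+       // For x = 0, __ffs returns 0 and __lastbit_u32_u32 returns -1.
+       out[0] = __ffs(x);
+       out[1] = __lastbit_u32_u32(x);
+   }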
+
+Floating-point intrinsics
+=========================
+
+Following is the list of supported floating-point intrinsics. Note that intrinsics are supported on device only.
+
+.. note::
+
+ Only the round-to-nearest-even mode is supported by default on AMD GPUs. The ``_rz``, ``_ru`` and
+ ``_rd`` suffixed intrinsic functions exist in the HIP AMD backend if the
+ ``OCML_BASIC_ROUNDED_OPERATIONS`` macro is defined.
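+
+For example, a minimal sketch of using one of the round-to-nearest-even intrinsics in device code:
+
+.. code-block:: cuda
+
+   __global__ void fused_axpy(const float* x, const float* y, float* out,
+                              float a, int n)
+   {
+       const int i = blockIdx.x * blockDim.x + threadIdx.x;
+       if (i < n)
+       {
+           // Fused multiply-add in round-to-nearest-even mode.
+           out[i] = __fmaf_rn(a, x[i], y[i]);
+       }
+   }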
+
+.. list-table:: Single precision intrinsics mathematical functions
+
+ * - **Function**
+
+ * - | ``float __cosf(float x)``
+ | Returns the fast approximate cosine of :math:`x`.
+
+ * - | ``float __exp10f(float x)``
+ | Returns the fast approximate for 10 :sup:`x`.
+
+ * - | ``float __expf(float x)``
+ | Returns the fast approximate for e :sup:`x`.
+
+ * - | ``float __fadd_rn(float x, float y)``
+ | Add two floating-point values in round-to-nearest-even mode.
+
+ * - | ``float __fdiv_rn(float x, float y)``
+ | Divide two floating point values in round-to-nearest-even mode.
+
+ * - | ``float __fmaf_rn(float x, float y, float z)``
+ | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode.
+
+ * - | ``float __fmul_rn(float x, float y)``
+ | Multiply two floating-point values in round-to-nearest-even mode.
+
+ * - | ``float __frcp_rn(float x)``
+ | Returns ``1 / x`` in round-to-nearest-even mode.
+
+ * - | ``float __frsqrt_rn(float x)``
+ | Returns ``1 / √x`` in round-to-nearest-even mode.
+
+ * - | ``float __fsqrt_rn(float x)``
+ | Returns ``√x`` in round-to-nearest-even mode.
+
+ * - | ``float __fsub_rn(float x, float y)``
+ | Subtract two floating-point values in round-to-nearest-even mode.
+
+ * - | ``float __log10f(float x)``
+ | Returns the fast approximate for base 10 logarithm of :math:`x`.
+
+ * - | ``float __log2f(float x)``
+ | Returns the fast approximate for base 2 logarithm of :math:`x`.
+
+ * - | ``float __logf(float x)``
+ | Returns the fast approximate for natural logarithm of :math:`x`.
+
+ * - | ``float __powf(float x, float y)``
+ | Returns the fast approximate of x :sup:`y`.
+
+ * - | ``float __saturatef(float x)``
+ | Clamp :math:`x` to [+0.0, 1.0].
+
+ * - | ``float __sincosf(float x, float* sinptr, float* cosptr)``
+ | Returns the fast approximate of sine and cosine of :math:`x`.
+
+ * - | ``float __sinf(float x)``
+ | Returns the fast approximate sine of :math:`x`.
+
+ * - | ``float __tanf(float x)``
+ | Returns the fast approximate tangent of :math:`x`.
+
+.. list-table:: Double precision intrinsics mathematical functions
+
+ * - **Function**
+
+ * - | ``double __dadd_rn(double x, double y)``
+ | Add two floating-point values in round-to-nearest-even mode.
+
+ * - | ``double __ddiv_rn(double x, double y)``
+ | Divide two floating-point values in round-to-nearest-even mode.
+
+ * - | ``double __dmul_rn(double x, double y)``
+ | Multiply two floating-point values in round-to-nearest-even mode.
+
+ * - | ``double __drcp_rn(double x)``
+ | Returns ``1 / x`` in round-to-nearest-even mode.
+
+ * - | ``double __dsqrt_rn(double x)``
+ | Returns ``√x`` in round-to-nearest-even mode.
+
+ * - | ``double __dsub_rn(double x, double y)``
+ | Subtract two floating-point values in round-to-nearest-even mode.
+
+ * - | ``double __fma_rn(double x, double y, double z)``
+ | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode.
diff --git a/docs/reference/terms.md b/docs/reference/terms.md
index 4d4be12296..ea2b9d96ab 100644
--- a/docs/reference/terms.md
+++ b/docs/reference/terms.md
@@ -1,4 +1,4 @@
-# Table Comparing Syntax for Different Compute APIs
+# Table comparing syntax for different compute APIs
|Term|CUDA|HIP|OpenCL|
|---|---|---|---|
diff --git a/docs/reference/unified_memory_reference.rst b/docs/reference/unified_memory_reference.rst
new file mode 100644
index 0000000000..12922d7664
--- /dev/null
+++ b/docs/reference/unified_memory_reference.rst
@@ -0,0 +1,13 @@
+.. meta::
+ :description: This chapter introduces Unified Memory (UM) and shows how to
+ use it in AMD HIP.
+ :keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory, UM, APU
+
+.. _unified_memory_reference:
+
+*******************************************************************************
+HIP managed memory allocation API
+*******************************************************************************
+
+.. doxygengroup:: MemoryM
+ :content-only:
diff --git a/docs/reference/virtual_rocr.rst b/docs/reference/virtual_rocr.rst
index 8241fa07ef..444882fc7e 100644
--- a/docs/reference/virtual_rocr.rst
+++ b/docs/reference/virtual_rocr.rst
@@ -5,7 +5,7 @@
:keywords: AMD, ROCm, HIP, HSA, ROCR runtime, virtual memory management
*******************************************************************************
-HSA Runtime API for ROCm
+HSA runtime API for ROCm
*******************************************************************************
The following functions are located in the https://github.com/ROCm/ROCR-Runtime repository.
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 0750f76bab..be820ed494 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -29,25 +29,40 @@ subtrees:
- file: how-to/performance_guidelines
- file: how-to/debugging
- file: how-to/logging
+ - file: how-to/cooperative_groups
+ - file: how-to/unified_memory
+ title: Unified memory
- file: how-to/faq
- caption: Reference
entries:
- file: doxygen/html/index
- - file: reference/kernel_language
+ - file: reference/cpp_language_extensions
title: C++ language extensions
+ - file: reference/cpp_language_support
+ title: C++ language support
+ - file: reference/math_api
- file: reference/terms
- title: Comparing Syntax for different APIs
+ title: Comparing syntax for different APIs
+ - file: reference/cooperative_groups_reference
+ title: HIP Cooperative groups API
- file: reference/virtual_rocr
+ - file: reference/unified_memory_reference
+ title: HIP managed memory allocation API
- file: reference/deprecated_api_list
title: List of deprecated APIs
- caption: Tutorials
entries:
+ - url: https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic
+ title: HIP basic examples
- url: https://github.com/ROCm/HIP-Examples
title: HIP examples
- url: https://github.com/ROCm/hip-tests/tree/develop/samples
title: HIP test samples
+ - file: tutorial/saxpy
+ - file: tutorial/reduction
+ - file: tutorial/cooperative_groups_tutorial
- caption: About
entries:
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 8d22b2d9da..c993b607be 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core[api_reference]==1.1.1
+rocm-docs-core[api_reference]==1.4.0
sphinxcontrib.doxylink
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index dbe8cdca79..f1e26e18f9 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -4,11 +4,11 @@
#
# pip-compile requirements.in
#
-accessible-pygments==0.0.4
+accessible-pygments==0.0.5
# via pydata-sphinx-theme
alabaster==0.7.16
# via sphinx
-babel==2.14.0
+babel==2.15.0
# via
# pydata-sphinx-theme
# sphinx
@@ -16,7 +16,7 @@ beautifulsoup4==4.12.3
# via pydata-sphinx-theme
breathe==4.35.0
# via rocm-docs-core
-certifi==2024.2.2
+certifi==2024.6.2
# via requests
cffi==1.16.0
# via
@@ -31,7 +31,7 @@ click==8.1.7
# sphinx-external-toc
click-log==0.4.0
# via doxysphinx
-cryptography==42.0.5
+cryptography==42.0.8
# via pyjwt
deprecated==1.2.14
# via pygithub
@@ -41,7 +41,7 @@ docutils==0.21.2
# myst-parser
# pydata-sphinx-theme
# sphinx
-doxysphinx==3.3.7
+doxysphinx==3.3.8
# via rocm-docs-core
fastjsonschema==2.19.1
# via rocm-docs-core
@@ -53,7 +53,7 @@ idna==3.7
# via requests
imagesize==1.4.1
# via sphinx
-jinja2==3.1.3
+jinja2==3.1.4
# via
# myst-parser
# sphinx
@@ -67,27 +67,29 @@ markdown-it-py==3.0.0
# myst-parser
markupsafe==2.1.5
# via jinja2
-mdit-py-plugins==0.4.0
+mdit-py-plugins==0.4.1
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
-mpire==2.10.1
+mpire==2.10.2
# via doxysphinx
-myst-parser==3.0.0
+myst-parser==3.0.1
# via rocm-docs-core
+numpy==1.26.4
+ # via doxysphinx
packaging==24.0
# via
# pydata-sphinx-theme
# sphinx
pycparser==2.22
# via cffi
-pydata-sphinx-theme==0.15.2
+pydata-sphinx-theme==0.15.3
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.3.0
# via rocm-docs-core
-pygments==2.17.2
+pygments==2.18.0
# via
# accessible-pygments
# mpire
@@ -110,11 +112,11 @@ pyyaml==6.0.1
# myst-parser
# rocm-docs-core
# sphinx-external-toc
-requests==2.31.0
+requests==2.32.3
# via
# pygithub
# sphinx
-rocm-docs-core[api-reference]==1.1.1
+rocm-docs-core[api-reference]==1.4.0
# via -r requirements.in
six==1.16.0
# via python-dateutil
@@ -140,11 +142,11 @@ sphinx-book-theme==1.1.2
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
-sphinx-design==0.5.0
+sphinx-design==0.6.0
# via rocm-docs-core
sphinx-external-toc==1.0.1
# via rocm-docs-core
-sphinx-notfound-page==1.0.0
+sphinx-notfound-page==1.0.2
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.8
# via sphinx
@@ -162,9 +164,9 @@ sphinxcontrib-serializinghtml==1.1.10
# via sphinx
tomli==2.0.1
# via sphinx
-tqdm==4.66.2
+tqdm==4.66.4
# via mpire
-typing-extensions==4.11.0
+typing-extensions==4.12.1
# via
# pydata-sphinx-theme
# pygithub
diff --git a/docs/tutorial/cooperative_groups_tutorial.rst b/docs/tutorial/cooperative_groups_tutorial.rst
new file mode 100644
index 0000000000..270bedae75
--- /dev/null
+++ b/docs/tutorial/cooperative_groups_tutorial.rst
@@ -0,0 +1,240 @@
+.. meta::
+ :description: HIP cooperative groups tutorial
+ :keywords: AMD, ROCm, HIP, cooperative groups, tutorial
+
+*******************************************************************************
+Cooperative groups
+*******************************************************************************
+
+This tutorial demonstrates the basic concepts of cooperative groups in the HIP (Heterogeneous-computing Interface for Portability) programming model and the most essential tooling supporting it. This topic also reviews the commonalities of heterogeneous APIs. Familiarity with the C/C++ compilation model and the language is assumed.
+
+Prerequisites
+=============
+
+To follow this tutorial, you'll need properly installed drivers and a HIP compiler toolchain to compile your code. Because ROCm HIP supports compiling and running on Linux and Microsoft Windows with AMD and NVIDIA GPUs, review the HIP development package installation before starting this tutorial. For more information, see :doc:`/install/install`.
+
+Simple HIP code
+===============
+
+To become familiar with heterogeneous programming, review the :doc:`SAXPY tutorial <saxpy>` and its first HIP code subsection. Compiling is also described in that tutorial.
+
+Tiled partition
+===============
+
+You can use a tiled partition to calculate the sum of ``partition_size``-length sequences and the sum of ``result_size``/``BlockSize``-length sequences. The host-side reference implementation is the following:
+
+.. code-block:: cpp
+
+ // Host-side function to perform the same reductions as executed on the GPU
+ std::vector<unsigned int> ref_reduced(const unsigned int partition_size,
+ std::vector<unsigned int> input)
+ {
+ const unsigned int input_size = input.size();
+ const unsigned int result_size = input_size / partition_size;
+ std::vector<unsigned int> result(result_size);
+
+ for(unsigned int i = 0; i < result_size; i++)
+ {
+ unsigned int partition_result = 0;
+ for(unsigned int j = 0; j < partition_size; j++)
+ {
+ partition_result += input[partition_size * i + j];
+ }
+ result[i] = partition_result;
+ }
+
+ return result;
+ }
+
+Device-side code
+----------------
+
+To calculate the sum of the sets of numbers, the tutorial uses shared memory-based reduction on the device side. Warp-level intrinsics are not covered in this tutorial, unlike in the :doc:`reduction tutorial <reduction>`. The ``x`` input parameter is a pointer to shared memory, which needs to be synchronized after every value change. The ``thread_group`` input parameter can be a ``thread_block_tile`` or a ``thread_block``, because ``thread_group`` is the parent class of these types. ``val`` holds the value the calling thread contributes to the sum. The function returns the final result of the reduction on the thread with rank 0 of the ``thread_group`` and returns 0 on every other thread.
+
+.. code-block:: cuda
+
+ /// \brief Summation of `unsigned int val`'s in `thread_group g` using shared memory `x`
+ __device__ unsigned int reduce_sum(thread_group g, unsigned int* x, unsigned int val)
+ {
+ // Rank of this thread in the group
+ const unsigned int group_thread_id = g.thread_rank();
+
+ // We start with half the group size as active threads
+ // Every iteration the number of active threads halves, until we processed all values
+ for(unsigned int i = g.size() / 2; i > 0; i /= 2)
+ {
+ // Store value for this thread in a shared, temporary array
+ x[group_thread_id] = val;
+
+ // Synchronize all threads in the group
+ g.sync();
+
+ // If our thread is still active, sum with its counterpart in the other half
+ if(group_thread_id < i)
+ {
+ val += x[group_thread_id + i];
+ }
+
+ // Synchronize all threads in the group
+ g.sync();
+ }
+
+ // Only the first thread returns a valid value
+ if(g.thread_rank() == 0)
+ return val;
+ else
+ return 0;
+ }
+
+The ``reduce_sum`` device function is reused to calculate the block and custom
+partition sums of the input numbers. The kernel has three sections:
+
+1. Initialization of the reduction function variables.
+2. The reduction of the thread block, storing the results in global memory.
+3. The reduction of the custom partition, storing the results in global memory.
+
+1. Initialization of the reduction function variables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this code section, the shared memory is declared, ``thread_block_group`` and
+``custom_partition`` are defined, and the input variables are loaded from
+global memory.
+
+.. code-block:: cuda
+
+ // thread_block_group consists of all threads in the block
+ thread_block thread_block_group = this_thread_block();
+
+ // Workspace array in shared memory required for reduction
+ __shared__ unsigned int workspace[2048];
+
+ unsigned int output;
+
+ // Input to reduce
+ const unsigned int input = d_vector[thread_block_group.thread_rank()];
+
+ // ...
+
+ // Every custom_partition group consists of 16 threads
+ thread_block_tile<PartitionSize> custom_partition
+ = tiled_partition<PartitionSize>(thread_block_group);
+
+
+2. The reduction of the thread block
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this code section, the sum is calculated on ``thread_block_group`` level, then the results are stored in global memory.
+
+.. code-block:: cuda
+
+ // Perform reduction
+ output = reduce_sum(thread_block_group, workspace, input);
+
+ // Only the first thread returns a valid value
+ if(thread_block_group.thread_rank() == 0)
+ {
+ d_block_reduced_vector[0] = output;
+ }
+
+3. The reduction of the custom partition
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this code section, the sum is calculated at the custom partition level, then the results are stored in global memory. The custom partition is a smaller part of the thread block, which means the reduction runs over a shorter sequence of input numbers than in the ``thread_block_group`` case.
+
+.. code-block:: cuda
+
+ // Perform reduction
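+ // group_offset is the offset of this partition's region within the shared workspace array (defined in the complete example)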
+ output = reduce_sum(custom_partition, &workspace[group_offset], input);
+
+ // Only the first thread in each partition returns a valid value
+ if(custom_partition.thread_rank() == 0)
+ {
+ const unsigned int partition_id = thread_block_group.thread_rank() / PartitionSize;
+ d_partition_reduced_vector[partition_id] = output;
+ }
+
+Host-side code
+--------------
+
+On the host-side, the following steps are done in the example:
+
+1. Confirm the cooperative group support on AMD GPUs.
+2. Initialize the cooperative group configuration.
+3. Allocate and copy input to global memory.
+4. Launch the cooperative kernel.
+5. Save the results from global memory.
+6. Free the global memory.
+
+Only the first, second, and fourth steps are specific to cooperative groups, which is why only those steps are detailed further.
+
+1. Confirm the cooperative group support on AMD GPUs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Not all AMD GPUs support cooperative groups. You can confirm support with the following code:
+
+.. code-block:: cpp
+
+ #ifdef __HIP_PLATFORM_AMD__
+ int device = 0;
+ int supports_coop_launch = 0;
+ // Check support
+ // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
+ HIP_CHECK(hipGetDevice(&device));
+ HIP_CHECK(
+ hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device));
+ if(!supports_coop_launch)
+ {
+ std::cout << "Skipping, device " << device << " does not support cooperative groups"
+ << std::endl;
+ return 0;
+ }
+ #endif
+
+2. Initialize the cooperative group configuration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In the example, there is only one block in the grid, and ``threads_per_block`` must be divisible by ``partition_size``.
+
+.. code-block:: cpp
+
+ // Number of blocks to launch.
+ constexpr unsigned int num_blocks = 1;
+
+ // Number of threads in each kernel block.
+ constexpr unsigned int threads_per_block = 64;
+
+ // Total element count of the input vector.
+ constexpr unsigned int size = num_blocks * threads_per_block;
+
+ // Total elements count of a tiled_partition.
+ constexpr unsigned int partition_size = 16;
+
+ // Total size (in bytes) of the input vector.
+ constexpr size_t size_bytes = sizeof(unsigned int) * size;
+
+ static_assert(threads_per_block % partition_size == 0,
+ "threads_per_block must be a multiple of partition_size");
+
+4. Launch the kernel
+~~~~~~~~~~~~~~~~~~~~
+
+The kernel launch is done with the ``hipLaunchCooperativeKernel`` function of the cooperative groups API.
+
+.. code-block:: cpp
+
+ void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced};
+ // Launching kernel from host.
+ HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel,
+ dim3(num_blocks),
+ dim3(threads_per_block),
+ params,
+ 0,
+ hipStreamDefault));
+
+ // Check if the kernel launch was successful.
+ HIP_CHECK(hipGetLastError());
+
+Conclusion
+==========
+
+With cooperative groups, you can easily use custom partitions to create custom tiles for custom solutions. You can find the complete code in the `HIP-Basic samples of the rocm-examples repository <https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic>`_.
diff --git a/docs/tutorial/reduction.rst b/docs/tutorial/reduction.rst
new file mode 100644
index 0000000000..9d11ba6c85
--- /dev/null
+++ b/docs/tutorial/reduction.rst
@@ -0,0 +1,722 @@
+.. meta::
+ :description: HIP reduction tutorial
+ :keywords: AMD, ROCm, HIP, reduction, tutorial
+
+*************************************************************
+Reduction
+*************************************************************
+
+Reduction is a common algorithmic operation used in parallel programming to reduce an array of elements into a shorter array of elements or a single value. This document exploits reduction to introduce some key considerations while designing and optimizing GPU algorithms.
+
+This document is a rejuvenation and extension of the invaluable work of Mark Harris on optimizing parallel reduction in CUDA. While this document takes a less naive approach, reviewing some of the original material is valuable to see how much the underlying hardware has changed. This document provides greater insight to demonstrate that progress.
+
+The algorithm
+=============
+
+Reduction has many names depending on the domain: in functional programming it's referred to as `fold <https://en.wikipedia.org/wiki/Fold_(higher-order_function)>`_, while in C++ it's called ``std::accumulate`` and, since C++17, ``std::reduce``. A reduction takes a range of inputs and "reduces" the given range with a binary operation to a singular or scalar output. Canonically, a reduction requires a "zero" element that bootstraps the algorithm and serves as one of the initial operands of the binary operation. The "zero" element is generally called the `identity or neutral element <https://en.wikipedia.org/wiki/Identity_element>`_ in group theory, which implies that it is an operand that doesn't change the result. Typical use cases are calculating a sum, normalizing a dataset, and finding the maximum value in a dataset. The latter use case is discussed further in this tutorial.
+
+.. figure:: ../data/tutorial/reduction/foldl.svg
+ :alt: Diagram demonstrating fold left
+
+There are multiple variations of reduction that allow parallel processing. The approach taken by ``std::reduce`` requires the user-provided binary operator to operate on any combination of identity and input range elements, or even exclusively on any of them. This allows you to insert any number of identities to facilitate parallel processing and then combine the partial results of parallel execution.
+
+.. figure:: ../data/tutorial/reduction/parallel_foldl.svg
+ :alt: Diagram demonstrating parallel fold left
+
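+As a host-side illustration, finding the maximum of a dataset with ``std::reduce`` could look like this minimal sketch:
+
+.. code-block:: C++
+
+   #include <algorithm>
+   #include <limits>
+   #include <numeric>
+   #include <vector>
+
+   int main()
+   {
+       const std::vector<int> data{3, 1, 4, 1, 5, 9, 2, 6};
+
+       // The identity of the max operation is the lowest representable
+       // value: combining it with any input leaves the input unchanged.
+       const int identity = std::numeric_limits<int>::lowest();
+
+       const int maximum =
+           std::reduce(data.begin(), data.end(), identity,
+                       [](const int a, const int b) { return std::max(a, b); });
+       // maximum == 9
+       return maximum == 9 ? 0 : 1;
+   }
+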
+Reduction on GPUs
+=================
+
+Implementing reductions on GPUs requires a basic understanding of the :doc:`/understand/programming_model_reference`. The document explores aspects of low-level optimization best discussed through the :ref:`inherent_thread_model`, and refrains from using cooperative groups.
+
+Synchronizing parallel threads of execution across a GPU is crucial for correctness, as partial results can't be combined before they are produced. Synchronizing all the threads running on a GPU at any given time is possible, but it's a costly and intricate operation. If synchronization is not absolutely necessary, map the parallel algorithm so that multiprocessors and blocks can make independent progress and don't need to sync frequently.
+
+There are ten reduction implementations in `rocm-examples <https://github.com/ROCm/rocm-examples>`_, which are described in the following sections.
+
+Naive shared reduction
+----------------------
+
+The naive algorithm takes a tree-like shape, where the computational domain is purposefully distributed among blocks. In every block, all threads participate in loading data from persistent (from the kernel's perspective) global memory into shared memory and in performing a tree-like reduction in that shared memory. A single thread then writes the partial result to global memory, to a location unique to the block, which allows the blocks to make independent progress. The partial results are combined in subsequent launches of the same kernel until a scalar result is reached.
+
+.. figure:: ../data/tutorial/reduction/naive_reduction.svg
+ :alt: Diagram demonstrating naive reduction
+
+This approach requires temporary storage based on the number of blocks launched, as each block outputs a scalar partial result. Depending on the need to store or destroy the input, a second temporary storage might be needed, which could be large enough to store the results of the second kernel launch. Alternatively, you can reuse the storage of the larger than necessary original input. These implementations differ so slightly that the document only considers the use case where the input could be destroyed.
+
+.. code-block:: C++
+
+ std::size_t factor = block_size; // block_size from hipGetDeviceProperties()
+ auto new_size = [factor](const std::size_t actual)
+ {
+ // Every pass reduces input length by 'factor'. If actual size is not divisible by factor,
+ // an extra output element is produced using some number of zero_elem inputs.
+ return actual / factor + (actual % factor == 0 ? 0 : 1);
+ };
+
+Threads that don't have a unique input are fed ``zero_elem`` instances. The backing storage of the double buffer is allocated as follows:
+
+.. code-block:: C++
+
+ // Initialize host-side storage
+ std::vector<unsigned> input(input_count);
+ std::iota(input.begin(), input.end(), 0);
+
+ // Initialize device-side storage
+ unsigned *front,
+ *back;
+ hipMalloc((void**)&front, sizeof(unsigned) * input_count);
+ hipMalloc((void**)&back, sizeof(unsigned) * new_size(input_count));
+
+ hipMemcpy(front, input.data(), input.size() * sizeof(unsigned), hipMemcpyHostToDevice);
+
+Data is initialized on the host and copied to the device, after which the device-side reduction begins. The swapping of the double buffer on the last iteration is omitted, so the result is in the back buffer irrespective of the input size.
+
+.. code-block:: C++
+
+ for (uint32_t curr = input_count; curr > 1;)
+ {
+ hipLaunchKernelGGL(
+ kernel,
+ dim3(new_size(curr)),
+ dim3(block_size),
+ factor * sizeof(unsigned),
+ hipStreamDefault,
+ front,
+ back,
+ kernel_op,
+ zero_elem,
+ curr);
+
+ curr = new_size(curr);
+ if (curr > 1)
+ std::swap(front, back);
+ }
+
+
+The kernel retains this structure throughout all the variations of reduction, with slight modifications to ``factor`` and the shared memory allocation:
+
+.. code-block:: C++
+
+ template<typename T, typename F>
+ __global__ void kernel(
+ T* front,
+ T* back,
+ F op,
+ T zero_elem,
+ uint32_t front_size)
+ {
+ extern __shared__ T shared[];
+
+ // Overindex-safe read of input
+ auto read_global_safe = [&](const uint32_t i)
+ {
+ return i < front_size ? front[i] : zero_elem;
+ };
+
+ const uint32_t tid = threadIdx.x,
+ bid = blockIdx.x,
+ gid = bid * blockDim.x + tid;
+
+ // Read input from front buffer to shared
+ shared[tid] = read_global_safe(gid);
+ __syncthreads();
+
+ // Shared reduction
+ for (uint32_t i = 1; i < blockDim.x; i *= 2)
+ {
+ if (tid % (2 * i) == 0)
+ shared[tid] = op(shared[tid], shared[tid + i]);
+ __syncthreads();
+ }
+
+ // Write result from shared to back buffer
+ if (tid == 0)
+ back[bid] = shared[0];
+ }
+
+While the ``tid % (2 * i) == 0`` indexing scheme yields correct results, it also leads to high thread divergence. Thread divergence occurs when the threads in a warp diverge, meaning they have to execute different instructions in a given clock cycle. Divergence is most easily caused by ``if-else`` statements, as shown here, but it can also arise from ``for`` loops whose trip count depends on the thread ID. Even though the number of threads actively participating in the reduction decreases, the warps remain active longer than necessary, because at least one lane in each warp hits the ``if`` statement.
+
+Reducing thread divergence
+--------------------------
+
+You can reduce divergence by keeping the dataflow between memory addresses identical but reassigning the thread IDs.
+
+.. figure:: ../data/tutorial/reduction/reduced_divergence_reduction.svg
+ :alt: Diagram demonstrating reduced divergence reduction
+
+.. code-block:: diff
+ :emphasize-lines: 4-7
+
+ // Shared reduction
+ for (uint32_t i = 1; i < blockDim.x; i *= 2)
+ {
+ - if (tid % (2 * i) == 0)
+ - shared[tid] = op(shared[tid], shared[tid + i]);
+ + if (uint32_t j = 2 * i * tid; j < blockDim.x)
+ + shared[j] = op(shared[j], shared[j + i]);
+ __syncthreads();
+ }
+
+This way, inactive threads accumulate uniformly towards the higher thread ID range and can uniformly skip to ``__syncthreads()``. However, this introduces bank conflicts.
+
+Resolving bank conflicts
+------------------------
+
+Both AMD and NVIDIA implement shared memory in hardware by organizing storage into banks of various sizes. This hardware element is known as the Local Data Share (LDS) on AMD hardware; on NVIDIA hardware, it's implemented using the same silicon as the L1 data cache. You can think of shared memory as a striped 2-dimensional range of memory. The number, width, and depth of the shared memory banks depend on the architecture. A bank conflict occurs when different threads in a warp access the same bank during the same operation. In this case, the hardware prevents the attempted concurrent accesses to the same bank by converting them into serial accesses.
+
+- `"AMD Instinct MI200" Instruction Set Architecture, Chapter 11.1 `_
+- `"RDNA 2" Instruction Set Architecture, Chapter 10.1 `_
+
+A notable exception is when the shared read is a uniform broadcast, that is, when the entire warp reads the same address. A better implementation of the naive algorithm forms contiguous ranges of thread activity and memory accesses.
+
+.. code-block:: diff
+ :emphasize-lines: 2-7
+
+ // Shared reduction
+ -for (uint32_t i = 1; i < blockDim.x; i *= 2)
+ -{
+ - if (tid % (2 * i) == 0)
+ +for (uint32_t i = blockDim.x / 2; i != 0; i /= 2)
+ +{
+ + if (tid < i)
+ shared[tid] = op(shared[tid], shared[tid + i]);
+ __syncthreads();
+ }
+
+.. figure:: ../data/tutorial/reduction/conflict_free_reduction.svg
+ :alt: Diagram demonstrating bank conflict free reduction
+
+.. note::
+
+ To avoid bank conflicts, read shared memory in a coalesced manner, which implies that reads/writes of each lane in a warp evaluate to consecutive locations. Analyzing the read/write patterns could help you to understand the cause of bank conflicts. For more details, check `CDNA3 ISA `_ or `RDNA3 ISA `_ data share operations chapter.
+
+Utilize the upper half of the block
+-----------------------------------
+
+The preceding implementation is free of low-level, GPU-specific anti-patterns. However, it still exhibits a common shortcoming: the loop performing the reduction in shared memory starts from ``i = blockDim.x / 2``, and the first predicate ``if (tid < i)`` immediately disables half of the block, which only helps to load the data into shared memory. You can change the kernel, along with the calculation of ``factor`` on the host, as shown here:
+
+.. code-block:: diff
+ :emphasize-lines: 3,4
+
+ const uint32_t tid = threadIdx.x,
+ bid = blockIdx.x,
+ - gid = bid * blockDim.x + tid;
+ + gid = bid * (blockDim.x * 2) + tid;
+
+ // Read input from front buffer to shared
+ -shared[tid] = read_global_safe(gid);
+ +shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+ __syncthreads();
+
+By launching half as many threads and giving all of them meaningful work through an unconditionally performed binary ``op``, you avoid wasting half of the threads.
+
+Even though global memory is read in a coalesced fashion, as preferred by the memory controller, optimal performance is still limited by the instruction throughput.
+
+Omit superfluous synchronization
+--------------------------------
+
+Warps are known to execute in a strict lockstep fashion. Therefore, once the shared reduction reaches a point where only a single warp participates meaningfully, you can cut the loop short and let the rest of the warps terminate. Moreover, you can unroll the remaining iterations without syncing the entire block.
+
+The ``tmp`` namespace used beyond this point in this document holds a handful of template meta-programmed utilities to facilitate writing flexible and optimal code.
+
+:code:`tmp::static_for` is not just constant folding within the optimizer, but a variation of the language :code:`for` loop where the running index is a compile-time constant and is eligible for use in compile-time evaluated contexts.
+
+Consider the following code:
+
+.. code-block:: C++
+
+ constexpr int size = 4;
+ for (int i = 0 ; i < size ; ++i)
+ {
+ printf("%d", i);
+ }
+
+This compiles to the following assembly:
+
+**LLVM**
+
+.. code-block:: asm
+
+ main:
+ push rbx
+ lea rbx, [rip + .L.str]
+ mov rdi, rbx
+ xor esi, esi
+ xor eax, eax
+ call printf@PLT
+ mov rdi, rbx
+ mov esi, 1
+ xor eax, eax
+ call printf@PLT
+ mov rdi, rbx
+ mov esi, 2
+ xor eax, eax
+ call printf@PLT
+ mov rdi, rbx
+ mov esi, 3
+ xor eax, eax
+ call printf@PLT
+ xor eax, eax
+ pop rbx
+ ret
+ .L.str:
+ .asciz "%d"
+
+
+**GCC**
+
+.. code-block:: asm
+
+ .LC0:
+ .string "%d"
+ main:
+ push rbx
+ xor ebx, ebx
+ .L2:
+ mov esi, ebx
+ mov edi, OFFSET FLAT:.LC0
+ xor eax, eax
+ add ebx, 1
+ call printf
+ cmp ebx, 4
+ jne .L2
+ xor eax, eax
+ pop rbx
+ ret
+
+
+**MSVC**
+
+.. code-block:: asm
+
+ main PROC
+ $LN12:
+ push rbx
+ sub rsp, 32
+ xor ebx, ebx
+ npad 8
+ $LL4@main:
+ mov edx, ebx
+ lea rcx, OFFSET FLAT:'string'
+ call printf
+ inc ebx
+ cmp ebx, 4
+ jl SHORT $LL4@main
+ xor eax, eax
+ add rsp, 32
+ pop rbx
+ ret 0
+ main ENDP
+
+
+LLVM unrolls the loop and compiles to a flat series of ``printf`` invocations, while both GCC and MSVC keep the loop intact, as visible from the compare (``cmp``) and the jump (``jne``, ``jl``) instructions. LLVM code generation is identical to manually writing the unrolled loop:
+
+.. code-block:: C++
+
+ printf("%d", 0);
+ printf("%d", 1);
+ printf("%d", 2);
+ printf("%d", 3);
+
+While various non-standard pragmas are available to hint or force the compiler to unroll the loop, we instead use template meta-programming to force-feed the compiler the unrolled loop.
+
+.. code-block:: C++
+
+ constexpr int size = 4;
+
+ // Maybe unrolled loop
+ for (int i = 0 ; i < size ; ++i)
+ {
+ printf("%d", i);
+ }
+
+ // Force unrolled loop
+ using namespace tmp;
+   static_for<0, less_than<size>, increment<1>>([]<int i>()
+ {
+ printf("%d", i);
+ });
+
+The most notable structural difference is that in the language ``for`` loop, the loop variable is named at the beginning, while in the ``static_for`` utility, it's named at the end. An important bonus is that in the loop's body, you can use the running index ``i`` in contexts requiring constant expressions, such as template arguments or ``if constexpr`` conditions.
+
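+For example, because the running index is a compile-time constant, you can branch on it at zero runtime cost. A small sketch, assuming the ``tmp`` utilities shown above and the C++20 templated-lambda form used throughout this document:
+
+.. code-block:: C++
+
+   tmp::static_for<0, tmp::less_than<4>, tmp::increment<1>>([]<int i>()
+   {
+       // `i` is a constant expression here, so `if constexpr` is legal.
+       if constexpr (i % 2 == 0)
+           printf("%d is even\n", i);
+       else
+           printf("%d is odd\n", i);
+   });
+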
+:code:`tmp::static_switch` takes a runtime value and dispatches at run time to a set of tabulated function instantiations, inside which said value is a compile-time constant, eligible for use in compile-time-evaluated contexts.
+
+Consider the following code:
+
+.. code-block:: C++
+
+ int warp_size = device_props.warpSize;
+ switch (warp_size)
+ {
+ case 32:
+ hipLaunchKernelGGL(kernel<32>, ...);
+ break;
+ case 64:
+ hipLaunchKernelGGL(kernel<64>, ...);
+ break;
+ }
+
+In the preceding code, note the code repetition for every possible value of ``warp_size`` that the code is prepared to handle. To avoid this, use ``tmp::static_switch``, as shown:
+
+.. code-block:: C++
+
+   tmp::static_switch<32, 64>(warp_size, [&]<uint32_t WarpSize>()
+   {
+       hipLaunchKernelGGL(kernel<WarpSize>, ...);
+   });
+
+.. code-block:: diff
+ :emphasize-lines: 1,2,9,10,16-24
+
+   -template<typename T, typename F>
+   +template<typename T, typename F, uint32_t WarpSize>
+ __global__ void kernel(
+ ...
+ )
+ {
+ ...
+ // Shared reduction
+ -for (uint32_t i = blockDim.x / 2; i != 0; i /= 2)
+ +for (uint32_t i = blockDim.x / 2; i > WarpSize; i /= 2)
+ {
+ if (tid < i)
+ shared[tid] = op(shared[tid], shared[tid + i]);
+ __syncthreads();
+ }
+ +// Warp reduction
+   +tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>([&]<uint32_t I>()
+ +{
+ + if (tid < I)
+ + shared[tid] = op(shared[tid], shared[tid + I]);
+ +#ifdef __HIP_PLATFORM_NVIDIA__
+ + __syncwarp(0xffffffff >> (WarpSize - I));
+ +#endif
+ +});
+
+Because HIP typically targets hardware with warp sizes of 32 (NVIDIA GPUs and RDNA AMD GPUs) and 64 (CDNA AMD GPUs), portable HIP code must handle both. That's why, instead of assuming a warp size of 32, you make the warp size a template argument of the kernel. This lets you unroll the final loop using ``tmp::static_for`` in a parametric way, while still having the code read much like an ordinary loop.
+
+Promoting the warp size to a compile-time constant also requires you to handle it similarly on the host side. You can sandwich the kernel launch with ``tmp::static_switch``, promoting the snake-case run-time ``warp_size`` variable to the camel-case compile-time constant ``WarpSize``.
+
+.. code-block:: diff
+ :emphasize-lines: 4,5,7,8,18
+
+ // Device-side reduction
+ for (uint32_t curr = input_count; curr > 1;)
+ {
+   + tmp::static_range_switch<32, 64>(warp_size, [&]<uint32_t WarpSize>() noexcept
+ + {
+ hipLaunchKernelGGL(
+   -     kernel,
+   +     kernel<WarpSize>,
+ dim3(new_size(curr)),
+ dim3(block_size),
+ factor * sizeof(unsigned),
+ hipStreamDefault,
+ front,
+ back,
+ kernel_op,
+ zero_elem,
+ curr);
+ + });
+ ...
+ }
+
+.. note::
+
+   Neither RDNA- nor CDNA-based AMD hardware provides guaranteed independent progress to the lanes of a warp. When targeting NVIDIA hardware, lanes of a warp might execute somewhat independently, as long as the programmer assists the compiler using dedicated built-in functions; this feature is called Independent Thread Scheduling. The HIP headers don't expose the necessary warp primitives and their overloads.
+
+   Portable applications can still tap into this feature with carefully ``#ifdef``-ed code, and at this particular optimization level it's a requirement: the code implicitly relies on the lockstep behavior of a ROCm wavefront, but CUDA warps don't share this property, so you must synchronize all the active lanes of a warp to avoid a data race caused by some lanes progressing faster than others in the same warp.
+
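+A minimal sketch of how such a portability shim may look; ``sync_active_lanes`` is an illustrative helper, not part of the HIP API:
+
+.. code-block:: C++
+
+   template<uint32_t WarpSize, uint32_t ActiveLanes>
+   __device__ void sync_active_lanes()
+   {
+   #ifdef __HIP_PLATFORM_NVIDIA__
+       // Only the lower ActiveLanes lanes still participate in the
+       // reduction, so only they are named in the mask.
+       __syncwarp(0xffffffff >> (WarpSize - ActiveLanes));
+   #endif
+       // On AMD hardware the wavefront executes in lockstep, so no
+       // explicit synchronization is needed here.
+   }
+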
+Unroll all loops
+----------------
+
+While the previous step primarily aims to remove unnecessary syncing, it also unrolls the end of the loop. You can force unrolling of the first part of the loop too. This saves a few scalar registers (registers holding values the compiler can prove to be uniform across the lanes of a warp).
+
+.. code-block:: diff
+ :emphasize-lines: 1-4,11,12,17,18,20-23,26
+
+   -template<typename T, typename F, uint32_t WarpSize>
+   -__global__ void kernel(
+   +template<uint32_t BlockSize, typename T, typename F, uint32_t WarpSize>
+   +__global__ __launch_bounds__(BlockSize) void kernel(
+ T* front,
+ T* back,
+ F op,
+ T zero_elem,
+ uint32_t front_size)
+ {
+ - extern __shared__ T shared[];
+ + __shared__ T shared[BlockSize];
+
+ ...
+
+ // Shared reduction
+ - for (uint32_t i = blockDim.x / 2; i > WarpSize; i /= 2)
+   + tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>([&]<uint32_t I>()
+ {
+ - if (tid < i)
+ - shared[tid] = op(shared[tid], shared[tid + i]);
+ + if (tid < I)
+ + shared[tid] = op(shared[tid], shared[tid + I]);
+ __syncthreads();
+ }
+ + );
+
+Introducing yet another template argument for the kernel and moving from ``for`` to ``tmp::static_for`` leads to the following two notable improvements:
+
+- The new ``__launch_bounds__(BlockSize)`` attribute instructs the compiler that the kernel will only be launched with the designated block size, meaning launches with differing block sizes will fail. This lets the optimizer include the ``blockDim.x`` variable in constant folding and gives it better information about register usage.
+- Turning the block size into a compile-time constant allows you to statically allocate the shared memory.
+
+Communicate using warp-collective functions
+-------------------------------------------
+
+Shared memory provides a fast communication path within a block; however, when performing the reduction within the last warp, you can use an even faster means of communication: warp-collective, or cross-lane, functions. Instead of going through the hardware-backed shared memory, you can directly copy between the local memory (registers) of each lane in a warp. This is achieved using the shuffle functions.
+
+See how to use ``__shfl_down()``, one of the most restrictive but also most structured of these communication schemes.
+
+.. code-block:: C++
+
+ // Warp reduction
+ if (tid < WarpSize)
+ {
+ T res = op(shared[tid], shared[tid + WarpSize]);
+       tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>([&]<uint32_t Delta>()
+ {
+ res = op(res, __shfl_down(res, Delta));
+ });
+
+ // Write result from shared to back buffer
+ if (tid == 0)
+ back[bid] = res;
+ }
+
+Using warp-collective functions for communication requires the control flow to be uniform across the lanes of the warp, as the name warp-collective implies. That's why the thread ID is checked outside the loop, while the result is written inside it due to variable scoping.
+
+Prefer warp communication over shared
+-------------------------------------
+
+As mentioned in the previous step, communication through local memory (registers) is faster than through shared memory. Instead of relying on local memory only at the end of the tree-like reduction, a better approach is to turn the tree reduction inside out and perform multiple warp reductions in parallel on all active threads, communicating only their partial results through shared memory.
+
+.. figure:: ../data/tutorial/reduction/warp_reduction.svg
+ :alt: Diagram demonstrating warp reduction
+
+.. figure:: ../data/tutorial/reduction/warp_reduction_with_shared.svg
+ :alt: Diagram demonstrating warp reduction and results store in shared memory
+
+The kernel versions now differ significantly enough that a diff is no longer instructive; the kernel is presented afresh instead.
+
+.. code-block:: C++
+
+   template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+ __global__ __launch_bounds__(BlockSize) void kernel(
+ T* front,
+ T* back,
+ F op,
+ T zero_elem,
+ uint32_t front_size)
+ {
+ // ...
+ }
+
+The kernel signature and the reduction factor are the same as in previous cases; only the implementation differs.
+
+.. code-block:: C++
+
+ static constexpr uint32_t WarpCount = BlockSize / WarpSize;
+
+ __shared__ T shared[WarpCount];
+
+ auto read_global_safe =
+ [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+ auto read_shared_safe =
+ [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
+
+ const uint32_t tid = threadIdx.x,
+ bid = blockIdx.x,
+ gid = bid * (blockDim.x * 2) + tid,
+ wid = tid / WarpSize,
+ lid = tid % WarpSize;
+
+ // Read input from front buffer to local
+ T res = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+
+Because the warps communicate their results through shared memory, shared memory needs as many elements as there are warps within the block. Similar to how you can only launch kernels at block granularity, you can only warp-reduce at ``WarpSize`` granularity, due to the collective nature of the cross-lane builtins. To address this, use ``read_shared_safe`` to pad overindexing reads with ``zero_elem``. Reading from global memory remains unaffected.
+
+.. code-block:: C++
+
+ // Perform warp reductions and communicate results via shared
+ // for (uint32_t ActiveWarps = WarpCount;
+ // ActiveWarps != 0;
+ // ActiveWarps = ActiveWarps != 1 ?
+ // divide_ceil(ActiveWarps, WarpSize) :
+ // ActiveWarps = 0)
+   tmp::static_for<
+       WarpCount,
+       tmp::not_equal<0>,
+       tmp::select<
+           tmp::not_equal<1>,
+           tmp::divide_ceil<WarpSize>,
+           tmp::constant<0>>>([&]<uint32_t ActiveWarps>()
+ {
+ if(wid < ActiveWarps)
+ {
+ // Warp reduction
+           tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>([&]<uint32_t Delta>()
+ {
+ res = op(res, __shfl_down(res, Delta));
+ });
+
+ // Write warp result from local to shared
+ if(lid == 0)
+ shared[wid] = res;
+ }
+ __syncthreads();
+
+ // Read warp result from shared to local
+ res = read_shared_safe(tid);
+ });
+
+ // Write result from local to back buffer
+ if(tid == 0)
+ back[bid] = res;
+
+``ActiveWarps`` iterates from ``WarpCount`` down until it reaches ``0``; every iteration divides it by ``WarpSize``. Because the partial result count may not divide evenly and an extra warp has to handle the remainder, the division uses ``tmp::divide_ceil``, which always rounds toward positive infinity. The ternary ``tmp::select`` is required because such a division never reaches ``0``, so the loop must be terminated explicitly after the last warp concludes.
+
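+For example, with ``BlockSize = 512`` and ``WarpSize = 64``, ``WarpCount`` is ``8``; ``ActiveWarps`` takes the value ``8``, then ``divide_ceil(8, 64) = 1``, after which ``tmp::select`` yields ``0`` and the loop terminates, so two passes of warp reduction are performed.
+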
+In each iteration, if a warp is active, meaning it has at least a single valid input, it carries out one pass of warp reduction and writes its output based on the warp ID. Reading is carried out based on the thread ID, and global output continues to be based on the block ID.
+
+Amortize bookkeeping variable overhead
+--------------------------------------
+
+The previous sections explained how to reduce register usage to improve occupancy, which lets more blocks execute in parallel on the multiprocessors, hiding more global load/store latency. Another approach is to reduce the number of threads in flight while still carrying out the same workload, which reduces the registers wasted on loading and maintaining bookkeeping variables such as indices.
+
+An example of this optimization is performing one binary ``op`` while loading input from global. Even though the operation is said to be carried out "in flight", the two values are loaded into local memory (registers) before ``op`` is called.
+
+A more general form of this optimization is wrapping most kernel logic in loops that carry out the workload of multiple kernel instances but require storing only a single instance of most of the bookkeeping logic. In code, this multiplicity factor is referred to via the ``ItemsPerThread`` compile-time constant, which is supplied by a template argument to allow for loop unrolling.
+
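+Conceptually, before any unrolling or vectorization, the multiplicity loop boils down to the following sketch, where ``gid`` indexes groups of ``ItemsPerThread`` elements (illustrative, not the final kernel code):
+
+.. code-block:: C++
+
+   T res = zero_elem;
+   for (uint32_t j = 0; j < ItemsPerThread; ++j)
+   {
+       const uint32_t idx = gid * ItemsPerThread + j;
+       if (idx < front_size)
+           res = op(res, front[idx]);
+   }
+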
+This kernel variant utilizes another generally applicable utility, ``hip::static_array``, which is a more restrictive wrapper over the builtin array than ``std::array``: it only allows indexing with compile-time constants, through the usual tuple-like ``get<I>(arr)`` interface.
+
+.. note::
+
+   On a GPU, there is no stack; local memory is provisioned from the register file, and this provisioning takes place statically. In other words, the address range of a thread's local memory is determined at compile time. When an array is defined and used in local storage, the compiler can only keep its storage in the register file as long as all accesses to the array are resolvable at compile time. The index doesn't need to be a compile-time constant, as long as the compiler can resolve the addresses through constant folding or some other means. If the compiler fails to do so, the array is backed by global memory instead, which is indicated by a non-zero number of spill registers observable using static analysis tools, and accesses become slower by multiple orders of magnitude. ``hip::static_array``, via its ``hip::get<>`` interface, ensures that no such spills occur.
+
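+For illustration, a minimal sketch of such a wrapper might look like the following; the real utility is more elaborate:
+
+.. code-block:: C++
+
+   template<typename T, uint32_t Size>
+   struct static_array
+   {
+       T data[Size];
+   };
+
+   // Indexing takes the index as a template argument, so every access
+   // is resolvable at compile time and the array can stay in registers.
+   template<uint32_t I, typename T, uint32_t Size>
+   __device__ T& get(static_array<T, Size>& arr)
+   {
+       static_assert(I < Size, "compile-time out-of-bounds access");
+       return arr.data[I];
+   }
+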
+.. code-block:: C++
+
+   template<uint32_t BlockSize, uint32_t WarpSize, uint32_t ItemsPerThread, typename T, typename F>
+ __global__ static __launch_bounds__(BlockSize) void kernel(...)
+
+The kernel now has three compile-time configurable parameters. The only part of the kernel that changes is how data is loaded from global memory and how the binary operation is performed on the loaded values. The step that reads input from the front buffer to local memory is now split into two: :ref:`reading ItemsPerThread <reading-items>` and :ref:`processing ItemsPerThread <processing-items>`.
+
+.. code-block:: C++
+
+ // Read input from front buffer to local
+ T res = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+
+.. _reading-items:
+
+Reading ``ItemsPerThread``
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The change to reading happens inside ``read_global_safe``:
+
+.. code-block:: C++
+
+   auto read_global_safe = [&](const int32_t i) -> hip::static_array<T, ItemsPerThread>
+   {
+       return [&]<int32_t... I>(std::integer_sequence<int32_t, I...>)
+       {
+           if (i + ItemsPerThread < front_size)
+               return hip::static_array<T, ItemsPerThread>{
+                   front[i + I]...
+               };
+           else
+               return hip::static_array<T, ItemsPerThread>{
+                   (i + I < front_size ? front[i + I] : zero_elem)...
+               };
+       }(std::make_integer_sequence<int32_t, ItemsPerThread>());
+ };
+
+Each array element is loaded from consecutive locations. Without the flexibility of a configurable ``ItemsPerThread``, loading four items per thread would be morally equivalent to:
+
+.. code-block:: C++
+
+ T arr[4] = {
+ front[gid + 0],
+ front[gid + 1],
+ front[gid + 2],
+ front[gid + 3]
+   };
+
+This is exactly what happens in the ``front[i + I]...`` fold-expression. However, this form can only be issued if the entire read operates on real input, without padding using ``zero_elem``. If some reads over-index the input, the read turns into:
+
+.. code-block:: C++
+
+ T arr[4] = {
+ i + 0 < front_size ? front[i + 0] : zero_elem,
+ i + 1 < front_size ? front[i + 1] : zero_elem,
+ i + 2 < front_size ? front[i + 2] : zero_elem,
+ i + 3 < front_size ? front[i + 3] : zero_elem
+   };
+
+The unconditional form makes it easier for the compiler to recognize vector loads from global memory. Because performance at large is dominated by how you move data, it's only natural to utilize dedicated instructions that move more data with fewer instructions. This is evident from the huge performance improvement observed when loading two values per thread. See `the compiler explorer `_ to learn how the load for AMD hardware (both RDNA and CDNA) compiles to ``global_load_dwordx4``, where ``x4`` denotes the 4-vector variant of the instruction.
+
+.. note::
+
+   Note that ``read_global_safe``, which used to take a ``uint32_t`` as the index type, now takes a signed integer. When indexing an array with unsigned integers, the compiler has to honor the wraparound behavior the C/C++ standards define for them: some of the vector load's indices might overflow, resulting in a non-contiguous read. Indeed, if you change the previously linked code to use an unsigned integer as the thread ID, the compiler won't emit a vector load. Signed integer overflow, on the other hand, is undefined behavior, so the optimizer may assume it never happens. To convey the absence of overflow to the compiler with unsigned indices, add ``__builtin_assume(gid + 4 > gid)``, or the more portable ``[[assume(gid + 4 > gid)]]`` once ``amdclang++`` supports it.
+
+The ``read_global_safe`` implementation uses an Immediately Invoked Lambda Expression (IILE) because ``ItemsPerThread`` is a single integer value, while the fold-expressions need a compile-time ``iota``-like sequence of integers as a pack to expand on. Such a pack can only be obtained through template argument deduction, here performed on the IILE.
+
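+The following stripped-down host-side example shows the same pattern in isolation (a sketch):
+
+.. code-block:: C++
+
+   #include <cstdio>
+   #include <utility>
+
+   int main()
+   {
+       constexpr int count = 4;
+       // Template argument deduction on the immediately invoked lambda
+       // turns the single value `count` into the pack I... = 0, 1, 2, 3,
+       // which the fold-expression can expand on.
+       []<int... I>(std::integer_sequence<int, I...>)
+       {
+           (std::printf("%d ", I), ...);
+       }(std::make_integer_sequence<int, count>());
+   }
+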
+.. _processing-items:
+
+Processing ``ItemsPerThread``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Once the kernel reads ``ItemsPerThread`` number of inputs to local, it immediately reduces them to a scalar. There is no reason to propagate the input element multiplicity to the warp reduction phase.
+
+.. code-block:: C++
+
+   T res = [&]()
+   {
+       // Read input from front buffer to local
+       hip::static_array<T, ItemsPerThread> arr = read_global_safe(gid);
+
+       // Reduce ItemsPerThread to scalar
+       tmp::static_for<1, tmp::less_than<ItemsPerThread>, tmp::increment<1>>([&]<uint32_t I>()
+       {
+           get<0>(arr) = op(get<0>(arr), get<I>(arr));
+       });
+
+       return get<0>(arr);
+   }();
+
+Two-pass reduction
+------------------
+
+Alter the kernel launch and input fetching such that no more blocks are launched than what a single block of a subsequent kernel launch can conveniently reduce. Each block performs multiple passes of input reading from global memory and combines the partial results before engaging in the endgame tree-like reduction.
+
+With this method, you can save at least one or two kernel launches for large inputs.
+
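+A host-side sketch of the idea follows, reusing the launch parameters from the earlier steps; ``max_blocks``, the grid cap, is illustrative, and the kernel is assumed to loop over the input with a grid-sized stride when the grid is capped:
+
+.. code-block:: C++
+
+   // First pass: cap the grid so a single block can finish the job later.
+   const uint32_t capped_blocks = std::min(max_blocks, new_size(input_count));
+   hipLaunchKernelGGL(kernel<WarpSize>, dim3(capped_blocks), dim3(block_size),
+                      factor * sizeof(unsigned), hipStreamDefault,
+                      front, back, kernel_op, zero_elem, input_count);
+
+   // Second pass: one block reduces the partial results to the final value.
+   hipLaunchKernelGGL(kernel<WarpSize>, dim3(1), dim3(block_size),
+                      factor * sizeof(unsigned), hipStreamDefault,
+                      back, front, kernel_op, zero_elem, capped_blocks);
+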
+Global data share
+-----------------
+
+.. warning::
+
+ This modification can only be executed on AMD hardware.
+
+Perform the first step of the two-pass reduction, but at the end, instead of writing the partial results to global memory and reading them back in a subsequent kernel, write them to the Global Data Share (GDS). The GDS acts as an extra, ``N+1``-th shared memory: like the LDS, it's on-chip memory, but it's accessible to all multiprocessors.
+
+.. note::
+
+   The API doesn't guarantee the order in which blocks are scheduled, even though all current GPUs schedule them in the same monotonically increasing order of block IDs. Implicitly relying on this, the last block of a grid is in the optimal position to observe the side effects of all other blocks (using spinlocks or other methods) without occupying a multiprocessor for longer than necessary.
+
+Without launching a second kernel, you can make the last block collect the results of all other blocks from the GDS, either by implicitly exploiting the scheduling behavior or by relying on another AMD-specific feature called Global Wave Sync (GWS), and merge them in a final tree-like reduction.
+
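+For reference, here is a portable sketch of the "last block" technique using a global atomic counter rather than the AMD-specific GDS and GWS features described here:
+
+.. code-block:: C++
+
+   __device__ unsigned int finished_blocks = 0;
+
+   __global__ void reduce(float* partials, float* out)
+   {
+       // ... block-level reduction; this block's partial result is
+       // written to partials[blockIdx.x] ...
+
+       __shared__ bool is_last;
+       __threadfence(); // make this block's write visible to all blocks
+       if (threadIdx.x == 0)
+           is_last = atomicAdd(&finished_blocks, 1) == gridDim.x - 1;
+       __syncthreads();
+
+       if (is_last)
+       {
+           // Only the final block to arrive gets here; it can now safely
+           // combine the partial results of every other block into *out.
+       }
+   }
+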
+.. note::
+
+   GDS and GWS are reserved runtime features that the HIP API doesn't cover. Invoking these functionalities requires inline AMDGCN assembly. Moreover, the fact that the runtime doesn't virtualize the GDS imposes further restrictions on the concurrent scheduling of other kernels.
+
+Conclusion
+==========
+
+Optimizing code on GPUs, like on any other architecture, requires careful consideration and balancing of resources and the costs of various operations to obtain optimal performance. This document explored optimizing reductions well beyond the territory of diminishing returns, introducing multiple optimization techniques along the way and discussing the opportunities each one presents.
+
+The document focused on reductions in which an entire device participates. Still, the optimal compile-time constants, or even the optimal algorithm, might differ when multiple blocks participate in multiple parallel reductions, or when each thread performs its own reduction. When multiple devices participate in the same reduction, yet other aspects must be considered.
+
+Most solutions, including the ones covered in this document, are given to end users in a turnkey fashion via algorithm primitive libraries. These solutions might not be the fastest in all cases, but they are close to being the gold standard for carrying out such operations as reasonably as possible.
diff --git a/docs/tutorial/saxpy.rst b/docs/tutorial/saxpy.rst
new file mode 100644
index 0000000000..b1f693cafb
--- /dev/null
+++ b/docs/tutorial/saxpy.rst
@@ -0,0 +1,751 @@
+.. meta::
+ :description: The SAXPY tutorial on HIP
+ :keywords: AMD, ROCm, HIP, SAXPY, tutorial
+
+*******************************************************************************
+SAXPY - Hello, HIP
+*******************************************************************************
+
+This tutorial explains the basic concepts of the single-source
+Heterogeneous-computing Interface for Portability (HIP) programming model and
+the essential tooling around it. It also reviews some commonalities of
+heterogeneous APIs in general. This topic assumes basic familiarity with the
+C/C++ compilation model and language.
+
+Prerequisites
+=============
+
+To follow this tutorial, you'll need the drivers installed and a HIP compiler
+toolchain to compile your code. Because HIP for ROCm supports compiling and
+running on Linux and Windows with AMD and NVIDIA GPUs, there are more
+combinations of install instructions than this tutorial can cover. For more
+information about installing HIP development packages, see
+:doc:`/install/install`.
+
+.. _hip-tutorial-saxpy-heterogeneous-programming:
+
+Heterogeneous programming
+=========================
+
+*Heterogeneous programming* and *offloading APIs* are often mentioned together. Heterogeneous programming deals with devices of varying capabilities simultaneously. Offloading focuses on the "remote" and asynchronous aspects of computation. HIP encompasses both. It exposes GPGPU (general-purpose GPU) programming much like ordinary host-side CPU programming and lets you move data across various devices.
+
+When programming in HIP (and other heterogeneous APIs, for that matter), remember that target devices are built for a specific purpose. They are designed with different tradeoffs than traditional CPUs and therefore have very different performance characteristics. Even subtle changes in code might adversely affect execution time.
+
+Your first lines of HIP code
+============================
+
+First, let's do the "Hello, World!" of GPGPU: SAXPY. Single-precision A times X Plus Y (*SAXPY*) is a mathematical acronym for the vector equation :math:`a\cdot x+y=z`, where :math:`a\in\mathbb{R}` is a scalar and :math:`x,y,z\in\mathbb{V}` are vector quantities of some large dimensionality, with the vector space defined over the set of reals. Practically speaking, you can compute this using a single ``for`` loop over three arrays.
+
+.. code-block:: C++
+
+ for (int i = 0 ; i < N ; ++i)
+ z[i] = a * x[i] + y[i];
+
+In linear algebra libraries, such as BLAS (Basic Linear Algebra Subprograms), this operation is defined as AXPY: "A times X Plus Y". The "S" comes from *single-precision*, meaning each array element is a ``float`` (IEEE 754 binary32 representation).
+
+To quickly get started, use the set of `HIP samples from GitHub `_. With Git configured on your machine, open a command line, navigate to your desired working directory, and run:
+
+.. code-block:: shell
+
+ git clone https://github.com/amd/rocm-examples.git
+
+A simple implementation of SAXPY resides in the ``HIP-Basic/saxpy/main.hip`` file of this repository. The HIP code here mostly deals with where data has to be and when, and with how devices transform this data. The first HIP calls allocate device-side memory and copy data from host-side memory to the device side in a C runtime-like fashion.
+
+.. code-block:: C++
+
+ // Allocate and copy vectors to device memory.
+ float* d_x{};
+ float* d_y{};
+ HIP_CHECK(hipMalloc(&d_x, size_bytes));
+ HIP_CHECK(hipMalloc(&d_y, size_bytes));
+ HIP_CHECK(hipMemcpy(d_x, x.data(), size_bytes, hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemcpy(d_y, y.data(), size_bytes, hipMemcpyHostToDevice));
+
+``HIP_CHECK`` is a custom macro borrowed from the examples' utilities. It checks the error code returned by HIP API functions and reports any error to the console. It isn't essential to the API, but checking error codes is good practice: a HIP call can fail when you pass incorrect values to it or when the runtime is out of resources.
+
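+A minimal sketch of what such a macro can look like follows; the version shipped with the samples is more elaborate:
+
+.. code-block:: C++
+
+   #include <hip/hip_runtime.h>
+   #include <cstdio>
+   #include <cstdlib>
+
+   #define HIP_CHECK(expr)                                           \
+       do                                                            \
+       {                                                             \
+           const hipError_t status = (expr);                         \
+           if (status != hipSuccess)                                 \
+           {                                                         \
+               std::fprintf(stderr, "HIP error \"%s\" at %s:%d\n",   \
+                            hipGetErrorString(status),               \
+                            __FILE__, __LINE__);                     \
+               std::exit(EXIT_FAILURE);                              \
+           }                                                         \
+       } while (0)
+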
+You may wonder how the code selects the device to allocate on and copy to. Commands are issued to the HIP runtime per host thread, and every thread has a current device that its commands target. The default device is ``0``, which is equivalent to calling ``hipSetDevice(0)``.
+
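+If you want to target a different device, set it before issuing any commands; ``1`` is an illustrative device ordinal:
+
+.. code-block:: C++
+
+   int device_count = 0;
+   HIP_CHECK(hipGetDeviceCount(&device_count));
+   if (device_count > 1)
+       HIP_CHECK(hipSetDevice(1)); // subsequent commands on this thread target device 1
+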
+Launch the calculation on the device after the input data has been prepared.
+
+.. code-block:: C++
+
+ __global__ void saxpy_kernel(const float a, const float* d_x, float* d_y, const unsigned int size)
+ {
+ // ...
+ }
+
+ int main()
+ {
+ // ...
+
+ // Launch the kernel on the default stream.
+       saxpy_kernel<<<dim3(grid_size), dim3(block_size), 0, hipStreamDefault>>>(a, d_x, d_y, size);
+ }
+
+Analyze the signature of the offloaded function:
+
+- ``__global__`` instructs the compiler to generate code for this function as an
+ entrypoint to a device program, such that it can be launched from the host.
+- The function does not return anything, because there is no trivial way to
+  construct a return channel of a parallel invocation. Device-side entrypoints
+  may not return a value; their results should be communicated using output
+  parameters.
+- Device-side functions are typically called compute kernels, or just kernels
+  for short. This distinguishes them from graphics shaders, or just shaders
+  for short.
+- Arguments are taken by value and all arguments shall be
+  `TriviallyCopyable `_,
+  meaning they should be ``memcpy``-friendly. (Imagine if they had custom copy
+  constructors. Where would that logic execute? On the host? On the device?)
+  Pointer arguments are pointers to device memory, typically backed by
+  VRAM.
+- We said that we'll be computing :math:`a\cdot x+y=z`; however, we only pass
+  two pointers to the function. We'll be canonically reusing one of the inputs
+  as the output.
+
+This function is launched from the host using a language extension often called
+the triple chevron syntax. Inside the angle brackets, provide the following.
+
+- The number of :ref:`blocks <inherent_thread_hierarchy_block>` to launch (our :ref:`grid <inherent_thread_hierarchy_grid>` size)
+- The number of threads in a :ref:`block <inherent_thread_hierarchy_block>` (our :ref:`block <inherent_thread_hierarchy_block>` size)
+- The amount of shared memory to allocate by the host
+- The device stream to enqueue the operation on
+
+The :ref:`block <inherent_thread_hierarchy_block>` size and shared memory become important later in :doc:`reduction`. For
+now, a hardcoded ``256`` is a safe default for simple kernels such as this.
+Following the triple chevron is ordinary function argument passing.
+
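+For instance, to cover ``size`` elements with blocks of 256 threads each, the block count is typically obtained with a ceiling division, as in this sketch (the variable names are illustrative):
+
+.. code-block:: C++
+
+   constexpr unsigned int block_size = 256;
+   const unsigned int grid_size = (size + block_size - 1) / block_size;
+
+   saxpy_kernel<<<dim3(grid_size), dim3(block_size), 0, hipStreamDefault>>>(
+       a, d_x, d_y, size);
+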
+Look at how the kernel is implemented.
+
+.. code-block:: C++
+
+ __global__ void saxpy_kernel(const float a, const float* d_x, float* d_y, const unsigned int size)
+ {
+ // Compute the current thread's index in the grid.
+ const unsigned int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // The grid can be larger than the number of items in the vectors. Avoid out-of-bounds addressing.
+ if(global_idx < size)
+ {
+ d_y[global_idx] = a * d_x[global_idx] + d_y[global_idx];
+ }
+ }
+
+- The unique linear index identifying the thread is computed from the :ref:`block <inherent_thread_hierarchy_block>` ID
+  the thread is a member of, the :ref:`block <inherent_thread_hierarchy_block>`'s size and the ID of the thread within
+  the :ref:`block <inherent_thread_hierarchy_block>`.
+- A check is made to avoid overindexing the input.
+- The useful part of the computation is carried out.
+
+Retrieving the result from the device works much like the input data copy, only in the opposite direction: the results are copied from device to host.
+
+.. code-block:: C++
+
+ HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost));
+
+Compiling on the command line
+=============================
+
+.. _setting_up_the_command-line:
+
+Setting up the command line
+---------------------------
+
+Strictly speaking, there's no such thing as "setting up the command line for
+compilation" on Linux. To make invocations more terse, examples for Linux and
+Windows follow.
+
+.. tab-set::
+ .. tab-item:: Linux and AMD
+ :sync: linux-amd
+
+      While distro maintainers might package ROCm so that it installs to
+      system-default locations, AMD's installation packages don't. The user needs
+      to add their location to the PATH.
+
+ .. code-block:: bash
+
+ export PATH=/opt/rocm/bin:${PATH}
+
+ You should be able to call the compiler on the command line now:
+
+ .. code-block:: bash
+
+ amdclang++ --version
+
+ .. note::
+
+         Docker images distributed by AMD, such as
+         `rocm-terminal `_, already
+         have ``/opt/rocm/bin`` on the PATH for convenience. This subtly affects
+         the CMake package detection logic of ROCm libraries.
+
+ .. tab-item:: Linux and NVIDIA
+ :sync: linux-nvidia
+
+ Both distro maintainers and NVIDIA package CUDA so that ``nvcc`` and related
+ tools are available on the command line by default. You can call the
+ compiler on the command line with:
+
+ .. code-block:: bash
+
+ nvcc --version
+
+ .. tab-item:: Windows and AMD
+ :sync: windows-amd
+
+ Windows compilers and command line tooling have traditionally relied on
+      extra environment variables and PATH entries to function correctly.
+ Visual Studio refers to command lines with this setup as "Developer
+ Command Prompt" or "Developer PowerShell" for ``cmd.exe`` and PowerShell
+ respectively.
+
+ The HIP SDK on Windows doesn't include a complete toolchain. You will also
+ need:
+
+ - The Microsoft Windows SDK. It provides the import libs to crucial system
+ libraries that all executables must link to and some auxiliary compiler
+ tooling.
+ - A Standard Template Library (STL). Installed as part of the Microsoft
+ Visual C++ compiler (MSVC) or with Visual Studio.
+
+ If you don't have a version of Visual Studio 2022 installed, for a
+ minimal command line experience, install the
+ `Build Tools for Visual Studio 2022 `_
+      with the Desktop Development workload. Under Individual Components, select:
+
+ - A version of the Windows SDK
+ - "MSVC v143 - VS 2022 C++ x64/x86 build tools (Latest)"
+ - "C++ CMake tools for Windows" (optional)
+
+ .. note::
+
+ The "C++ CMake tools for Windows" individual component is a convenience which
+ puts both ``cmake.exe`` and ``ninja.exe`` onto the PATH inside developer
+ command prompts. You can install these manually, but then you must manage
+ them manually.
+
+      Visual Studio 2017 and later are detectable as COM object instances via WMI.
+      To set up a command line from any shell for the latest Visual Studio's
+      default Visual C++ toolset, issue:
+
+ .. code-block:: powershell
+
+ $InstallationPath = Get-CimInstance MSFT_VSInstance | Sort-Object -Property Version -Descending | Select-Object -First 1 -ExpandProperty InstallLocation
+ Import-Module $InstallationPath\Common7\Tools\Microsoft.VisualStudio.DevShell.dll
+ Enter-VsDevShell -InstallPath $InstallationPath -SkipAutomaticLocation -Arch amd64 -HostArch amd64 -DevCmdArguments '-no_logo'
+ $env:PATH = "${env:HIP_PATH}bin;${env:PATH}"
+
+ You should be able to call the compiler on the command line now:
+
+ .. code-block:: powershell
+
+ clang++ --version
+
+ .. tab-item:: Windows and NVIDIA
+ :sync: windows-nvidia
+
+ Windows compilers and command line tooling have traditionally relied on
+      extra environment variables and PATH entries to function correctly.
+ Visual Studio refers to command lines with this setup as "Developer
+ Command Prompt" or "Developer PowerShell" for ``cmd.exe`` and PowerShell
+ respectively.
+
+ The HIP and CUDA SDKs on Windows don't include complete toolchains. You will
+ also need:
+
+ - The Microsoft Windows SDK. It provides the import libs to crucial system
+ libraries that all executables must link to and some auxiliary compiler
+ tooling.
+ - A Standard Template Library (STL). Installed as part of the Microsoft
+ Visual C++ compiler (MSVC) or with Visual Studio.
+
+ If you don't have a version of Visual Studio 2022 installed, for a
+ minimal command line experience, install the
+ `Build Tools for Visual Studio 2022 `_
+      with the Desktop Development workload. Under Individual Components, select:
+
+ - A version of the Windows SDK
+ - "MSVC v143 - VS 2022 C++ x64/x86 build tools (Latest)"
+ - "C++ CMake tools for Windows" (optional)
+
+ .. note::
+
+ The "C++ CMake tools for Windows" individual component is a convenience which
+ puts both ``cmake.exe`` and ``ninja.exe`` onto the PATH inside developer
+ command prompts. You can install these manually, but then you must manage
+ them manually.
+
+      Visual Studio 2017 and later are detectable as COM object instances via WMI.
+      To set up a command line from any shell for the latest Visual Studio's
+      default Visual C++ toolset, issue:
+
+ .. code-block:: powershell
+
+ $InstallationPath = Get-CimInstance MSFT_VSInstance | Sort-Object -Property Version -Descending | Select-Object -First 1 -ExpandProperty InstallLocation
+ Import-Module $InstallationPath\Common7\Tools\Microsoft.VisualStudio.DevShell.dll
+ Enter-VsDevShell -InstallPath $InstallationPath -SkipAutomaticLocation -Arch amd64 -HostArch amd64 -DevCmdArguments '-no_logo'
+
+ You should be able to call the compiler on the command line now:
+
+ .. code-block:: powershell
+
+ nvcc --version
+
+Invoking the compiler manually
+------------------------------
+
+To compile and link a single-file application, use the following commands:
+
+.. tab-set::
+ .. tab-item:: Linux and AMD
+ :sync: linux-amd
+
+ .. code-block:: bash
+
+ amdclang++ ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -lamdhip64 -L /opt/rocm/lib -O2
+
+ .. tab-item:: Linux and NVIDIA
+ :sync: linux-nvidia
+
+ .. code-block:: bash
+
+ nvcc ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -I /opt/rocm/include -O2 -x cu
+
+ .. tab-item:: Windows and AMD
+ :sync: windows-amd
+
+ .. code-block:: powershell
+
+ clang++ .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I .\Common -lamdhip64 -L ${env:HIP_PATH}lib -O2
+
+ .. tab-item:: Windows and NVIDIA
+ :sync: windows-nvidia
+
+ .. code-block:: powershell
+
+ nvcc .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I ${env:HIP_PATH}include -I .\Common -O2 -x cu
+
+Depending on your computer, the resulting binary might or might not run. If not,
+it typically complains about "Invalid device function". That error
+(corresponding to the ``hipErrorInvalidDeviceFunction`` entry of ``hipError_t``)
+means that the runtime could not find a device program binary of the
+appropriate flavor embedded into the executable.
+
+So far, the discussion has covered how data makes it from the host to the
+device and back, and it has shown the device code as source, while the HIP
+runtime complained about not finding the correct binary to dispatch for
+execution. How can you find out what device binary flavors are embedded into
+the executable?
+
+.. tab-set::
+
+ .. tab-item:: Linux and AMD
+ :sync: linux-amd
+
+      The utilities included with ROCm help significantly with inspecting binary
+      artifacts on disk. Add the ROCmCC installation folder to your PATH if you
+      want to use these utilities (they expect to be on the PATH).
+
+ You can list embedded program binaries using ``roc-obj-ls``.
+
+ .. code-block:: bash
+
+ roc-obj-ls ./saxpy
+
+ It should return something like:
+
+ .. code-block:: shell
+
+ 1 host-x86_64-unknown-linux file://./saxpy#offset=12288&size=0
+ 1 hipv4-amdgcn-amd-amdhsa--gfx803 file://./saxpy#offset=12288&size=9760
+
+      The compiler embedded a version 4 code object (more on `code
+      object versions `_)
+      and used the LLVM target triple `amdgcn-amd-amdhsa--gfx803` (more on `target triples
+      `_). You can
+      extract that program object in disassembled form for human consumption
+      via ``roc-obj``.
+
+ .. code-block:: bash
+
+ roc-obj -t gfx803 -d ./saxpy
+
+      This creates two files on disk; the one with the ``.s`` extension is of most
+      interest. Opening this file, or dumping it to the console using ``cat``,
+      lets you find the disassembled binary of the SAXPY compute kernel, something
+      similar to:
+
+ .. code-block::
+
+ Disassembly of section .text:
+
+ <_Z12saxpy_kernelfPKfPfj>:
+ s_load_dword s0, s[4:5], 0x2c // 000000001000: C0020002 0000002C
+ s_load_dword s1, s[4:5], 0x18 // 000000001008: C0020042 00000018
+ s_waitcnt lgkmcnt(0) // 000000001010: BF8C007F
+ s_and_b32 s0, s0, 0xffff // 000000001014: 8600FF00 0000FFFF
+ s_mul_i32 s6, s6, s0 // 00000000101C: 92060006
+ v_add_u32_e32 v0, vcc, s6, v0 // 000000001020: 32000006
+ v_cmp_gt_u32_e32 vcc, s1, v0 // 000000001024: 7D980001
+ s_and_saveexec_b64 s[0:1], vcc // 000000001028: BE80206A
+ s_cbranch_execz 22 // 00000000102C: BF880016 <_Z12saxpy_kernelfPKfPfj+0x88>
+ s_load_dwordx4 s[0:3], s[4:5], 0x8 // 000000001030: C00A0002 00000008
+ v_mov_b32_e32 v1, 0 // 000000001038: 7E020280
+ v_lshlrev_b64 v[0:1], 2, v[0:1] // 00000000103C: D28F0000 00020082
+ s_waitcnt lgkmcnt(0) // 000000001044: BF8C007F
+ v_mov_b32_e32 v3, s1 // 000000001048: 7E060201
+ v_add_u32_e32 v2, vcc, s0, v0 // 00000000104C: 32040000
+ v_addc_u32_e32 v3, vcc, v3, v1, vcc // 000000001050: 38060303
+ flat_load_dword v2, v[2:3] // 000000001054: DC500000 02000002
+ v_mov_b32_e32 v3, s3 // 00000000105C: 7E060203
+ v_add_u32_e32 v0, vcc, s2, v0 // 000000001060: 32000002
+ v_addc_u32_e32 v1, vcc, v3, v1, vcc // 000000001064: 38020303
+ flat_load_dword v3, v[0:1] // 000000001068: DC500000 03000000
+ s_load_dword s0, s[4:5], 0x0 // 000000001070: C0020002 00000000
+ s_waitcnt vmcnt(0) lgkmcnt(0) // 000000001078: BF8C0070
+ v_mac_f32_e32 v3, s0, v2 // 00000000107C: 2C060400
+ flat_store_dword v[0:1], v3 // 000000001080: DC700000 00000300
+ s_endpgm // 000000001088: BF810000
+
+      Alternatively, call the compiler with ``--save-temps`` to dump all device
+      binaries to disk in separate files.
+
+ .. code-block:: bash
+
+ amdclang++ ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -lamdhip64 -L /opt/rocm/lib -O2 --save-temps
+
+ List all the temporaries created while compiling ``main.hip`` with:
+
+ .. code-block:: bash
+
+ ls main-hip-amdgcn-amd-amdhsa-*
+ main-hip-amdgcn-amd-amdhsa-gfx803.bc
+ main-hip-amdgcn-amd-amdhsa-gfx803.cui
+ main-hip-amdgcn-amd-amdhsa-gfx803.o
+ main-hip-amdgcn-amd-amdhsa-gfx803.out
+ main-hip-amdgcn-amd-amdhsa-gfx803.out.resolution.txt
+ main-hip-amdgcn-amd-amdhsa-gfx803.s
+
+ Files with the ``.s`` extension hold the disassembled contents of the binary.
+ The filename notes the graphics IPs used by the compiler. The contents of
+ this file are similar to what ``roc-obj`` printed to the console.
+
+ .. tab-item:: Linux and NVIDIA
+ :sync: linux-nvidia
+
+      Unlike HIP on AMD, compiling with the NVIDIA support of HIP produces a valid
+      CUDA executable as far as the binary is concerned. Therefore, it incorporates
+      PTX ISA (Parallel Thread eXecution Instruction Set Architecture) instead of
+      AMDGPU binary. As a result, tooling shipped with the CUDA SDK can be used to
+      inspect which device ISA got compiled into a specific executable. The most
+      useful tool currently is ``cuobjdump``.
+
+ .. code-block:: bash
+
+ cuobjdump --list-ptx ./saxpy
+
+      This prints something like:
+
+ .. code-block::
+
+ PTX file 1: saxpy.1.sm_52.ptx
+
+      From this, you can see that the saxpy kernel is stored as ``sm_52``, meaning
+      a compute capability 5.2 ISA got embedded into the executable, so devices
+      sporting compute capability 5.2 or newer will be able to run this code.
+
+ .. tab-item:: Windows and AMD
+ :sync: windows-amd
+
+      The HIP SDK for Windows doesn't yet ship the ``roc-*`` set of utilities to work
+      with binary artifacts. To find out what binary formats are embedded into an
+      executable, you can use the ``dumpbin`` tool from the Windows SDK to obtain the
+      raw data of the ``.hip_fat`` section of the executable. (This binary payload is
+      what the ``roc-*`` set of utilities parses on Linux.) Skipping over the
+      reported header, the raw data rendered as ASCII has roughly 3 lines per entry.
+      Depending on how many binaries are embedded, you may need to alter the number
+      of rendered lines. Consider an invocation such as:
+
+ .. code-block:: powershell
+
+ dumpbin.exe /nologo /section:.hip_fat /rawdata:8 .\saxpy.exe | select -Skip 20 -First 12
+
+ The output may look like:
+
+ .. code-block::
+
+ 000000014004C000: 5F474E414C435F5F 5F44414F4C46464F __CLANG_OFFLOAD_
+ 000000014004C010: 5F5F454C444E5542 0000000000000002 BUNDLE__........
+ 000000014004C020: 0000000000001000 0000000000000000 ................
+ 000000014004C030: 0000000000000019 3638782D74736F68 ........host-x86
+ 000000014004C040: 6E6B6E752D34365F 756E696C2D6E776F _64-unknown-linu
+ 000000014004C050: 0000000000100078 00000000000D9800 x...............
+ 000000014004C060: 0000000000001F00 612D347670696800 .........hipv4-a
+ 000000014004C070: 6D612D6E6367646D 617368646D612D64 mdgcn-amd-amdhsa
+ 000000014004C080: 3630397866672D2D 0000000000000000 --gfx906........
+ 000000014004C090: 0000000000000000 0000000000000000 ................
+ 000000014004C0A0: 0000000000000000 0000000000000000 ................
+ 000000014004C0B0: 0000000000000000 0000000000000000 ................
+
+      You can see that the compiler embedded a version 4 code object (more on code
+      `object versions `_) and
+      used the LLVM target triple `amdgcn-amd-amdhsa--gfx906` (more on `target triples
+      `_). Don't be
+      alarmed by Linux showing up as a binary format: AMDGPU binaries uploaded to
+      the GPU for execution are proper Linux ELF binaries in their format.
+
+      Alternatively, call the compiler with ``--save-temps`` to dump all device
+      binaries to disk in separate files.
+
+ .. code-block:: powershell
+
+ clang++ .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I .\Common -lamdhip64 -L ${env:HIP_PATH}lib -O2 --save-temps
+
+      Now list all the temporaries created while compiling ``main.hip`` via:
+
+ .. code-block:: powershell
+
+ Get-ChildItem -Filter main-hip-* | select -Property Name
+
+ Name
+ ----
+ main-hip-amdgcn-amd-amdhsa-gfx906.bc
+ main-hip-amdgcn-amd-amdhsa-gfx906.hipi
+ main-hip-amdgcn-amd-amdhsa-gfx906.o
+ main-hip-amdgcn-amd-amdhsa-gfx906.out
+ main-hip-amdgcn-amd-amdhsa-gfx906.out.resolution.txt
+ main-hip-amdgcn-amd-amdhsa-gfx906.s
+
+      Files with the ``.s`` extension hold the disassembled contents of the binary, and
+      the filename directly informs you of the graphics IPs used by the compiler.
+
+ .. code-block:: powershell
+
+ Get-ChildItem main-hip-*.s | Get-Content
+ .text
+ .amdgcn_target "amdgcn-amd-amdhsa--gfx906"
+ .protected _Z12saxpy_kernelfPKfPfj ; -- Begin function _Z12saxpy_kernelfPKfPfj
+ .globl _Z12saxpy_kernelfPKfPfj
+ .p2align 8
+ .type _Z12saxpy_kernelfPKfPfj,@function
+ _Z12saxpy_kernelfPKfPfj: ; @_Z12saxpy_kernelfPKfPfj
+ ; %bb.0:
+ s_load_dword s0, s[4:5], 0x4
+ s_load_dword s1, s[6:7], 0x18
+ s_waitcnt lgkmcnt(0)
+ s_and_b32 s0, s0, 0xffff
+ s_mul_i32 s8, s8, s0
+ v_add_u32_e32 v0, s8, v0
+ v_cmp_gt_u32_e32 vcc, s1, v0
+ s_and_saveexec_b64 s[0:1], vcc
+ s_cbranch_execz .LBB0_2
+ ; %bb.1:
+ s_load_dwordx4 s[0:3], s[6:7], 0x8
+ v_mov_b32_e32 v1, 0
+ v_lshlrev_b64 v[0:1], 2, v[0:1]
+ s_waitcnt lgkmcnt(0)
+ v_mov_b32_e32 v3, s1
+ v_add_co_u32_e32 v2, vcc, s0, v0
+ v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
+ global_load_dword v2, v[2:3], off
+ v_mov_b32_e32 v3, s3
+ v_add_co_u32_e32 v0, vcc, s2, v0
+ v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+ global_load_dword v3, v[0:1], off
+ s_load_dword s0, s[6:7], 0x0
+ s_waitcnt vmcnt(0) lgkmcnt(0)
+ v_fmac_f32_e32 v3, s0, v2
+ global_store_dword v[0:1], v3, off
+ .LBB0_2:
+ s_endpgm
+ ...
+
+ .. tab-item:: Windows and NVIDIA
+ :sync: windows-nvidia
+
+ Unlike HIP on AMD, when compiling using the NVIDIA support for HIP, the resulting
+ binary will be a valid CUDA executable. Therefore, it'll incorporate PTX ISA
+ (Parallel Thread eXecution Instruction Set Architecture) instead of AMDGPU
+ binary. As a result, tooling included with the CUDA SDK can be used to
+      inspect which device ISA was compiled into a specific executable. The most
+      helpful tool currently is ``cuobjdump``.
+
+ .. code-block:: bash
+
+ cuobjdump.exe --list-ptx .\saxpy.exe
+
+      This prints something like:
+
+ .. code-block::
+
+ PTX file 1: saxpy.1.sm_52.ptx
+
+ This example shows that the SAXPY kernel is stored as ``sm_52``. It also shows
+ that a compute capability 5.2 ISA was embedded into the executable, so devices
+ that support compute capability 5.2 or newer will be able to run this code.
+
+Now that you've found which binaries are embedded into the executable, find out
+which format your available devices use.
+
+.. tab-set::
+ .. tab-item:: Linux and AMD
+ :sync: linux-amd
+
+      On Linux, a utility called ``rocminfo`` lets you list all the properties of the
+      devices available on the system, including which version of graphics IP
+      (``gfxXYZ``) they employ. You can filter the output to only these lines:
+
+ .. code-block:: bash
+
+ /opt/rocm/bin/rocminfo | grep gfx
+ Name: gfx906
+ Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
+
+      Now that you know which graphics IPs your devices use, recompile the program
+      with the appropriate parameters.
+
+ .. code-block:: bash
+
+ amdclang++ ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -lamdhip64 -L /opt/rocm/lib -O2 --offload-arch=gfx906:sramecc+:xnack-
+
+ Now the sample will run.
+
+ .. code-block::
+
+ ./saxpy
+ Calculating y[i] = a * x[i] + y[i] over 1000000 elements.
+ First 10 elements of the results: [ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 ]
+
+ .. tab-item:: Linux and NVIDIA
+ :sync: linux-nvidia
+
+      On Linux, when using HIP with the NVIDIA back-end, the ``deviceQuery`` CUDA
+      SDK sample helps list all the properties of the devices available on the
+      system, including which compute capability a device sports. A
+      ``<major>.<minor>`` compute capability is passed to ``nvcc`` on the
+      command line as ``sm_<major><minor>``; for example, ``8.6`` is ``sm_86``.
+
+      Because it isn't distributed in binary form, compile the matching
+      example from the ROCm examples.
+
+ .. code-block:: bash
+
+ nvcc ./HIP-Basic/device_query/main.cpp -o device_query -I ./Common -I /opt/rocm/include -O2
+
+ Filter the output to have only the lines of interest, for example:
+
+ .. code-block:: bash
+
+ ./device_query | grep "major.minor"
+ major.minor: 8.6
+ major.minor: 7.0
+
+ .. note::
+
+         In addition to the ``nvcc`` executable, there is another tool called
+         ``__nvcc_device_query`` which prints the SM architecture numbers to
+         standard out as a comma-separated list. The utility's name suggests it's
+         not a user-facing executable but is used by ``nvcc`` to determine what
+         devices are present in the system at hand.
+
+      Now that you know which graphics IPs your devices use, recompile the program
+      with the appropriate parameters.
+
+ .. code-block:: bash
+
+ nvcc ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -I /opt/rocm/include -O2 -x cu -arch=sm_70,sm_86
+
+ .. note::
+
+ If you want to portably target the development machine which is compiling, you
+ may specify ``-arch=native`` instead.
+
+ Now the sample will run.
+
+ .. code-block::
+
+ ./saxpy
+ Calculating y[i] = a * x[i] + y[i] over 1000000 elements.
+ First 10 elements of the results: [ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 ]
+
+ .. tab-item:: Windows and AMD
+ :sync: windows-amd
+
+      On Windows, a utility called ``hipInfo.exe`` lets you list all the properties
+      of the devices available on the system, including which version of graphics IP
+      (``gfxXYZ``) they employ. Filter the output to only these lines:
+
+ .. code-block:: powershell
+
+ & ${env:HIP_PATH}bin\hipInfo.exe | Select-String gfx
+
+ gcnArchName: gfx1032
+ gcnArchName: gfx1035
+
+      Now that you know which graphics IPs your devices use, recompile the program
+      with the appropriate parameters.
+
+ .. code-block:: powershell
+
+ clang++ .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I .\Common -lamdhip64 -L ${env:HIP_PATH}lib -O2 --offload-arch=gfx1032 --offload-arch=gfx1035
+
+ Now the sample will run.
+
+ .. code-block::
+
+ .\saxpy.exe
+ Calculating y[i] = a * x[i] + y[i] over 1000000 elements.
+ First 10 elements of the results: [ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 ]
+
+ .. tab-item:: Windows and NVIDIA
+ :sync: windows-nvidia
+
+      On Windows, when using HIP with the NVIDIA back-end, the ``deviceQuery`` CUDA
+      SDK sample helps list all the properties of the devices available on the
+      system, including which compute capability a device sports. A
+      ``<major>.<minor>`` compute capability is passed to ``nvcc`` on the
+      command line as ``sm_<major><minor>``; for example, ``8.6`` is ``sm_86``.
+
+      Because it isn't distributed in binary form, compile the matching
+      example from the ROCm examples.
+
+ .. code-block:: powershell
+
+ nvcc .\HIP-Basic\device_query\main.cpp -o device_query.exe -I .\Common -I ${env:HIP_PATH}include -O2
+
+ Filter the output to have only the lines of interest, for example:
+
+ .. code-block:: powershell
+
+ .\device_query.exe | Select-String "major.minor"
+
+ major.minor: 8.6
+ major.minor: 7.0
+
+ .. note::
+
+         Next to the ``nvcc`` executable there is another tool called
+         ``__nvcc_device_query.exe`` which simply prints the SM architecture
+         numbers to standard out as a comma-separated list. The naming of this
+         utility suggests it's not a user-facing executable but is used by
+         ``nvcc`` to determine what devices are present in the system at hand.
+
+      Now that you know which graphics IPs your devices use, recompile the program
+      with the appropriate parameters.
+
+ .. code-block:: powershell
+
+ nvcc .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I ${env:HIP_PATH}include -I .\Common -O2 -x cu -arch=sm_70,sm_86
+
+ .. note::
+
+ If you want to portably target the development machine which is compiling, you
+ may specify ``-arch=native`` instead.
+
+ Now the sample will run.
+
+ .. code-block::
+
+ .\saxpy.exe
+ Calculating y[i] = a * x[i] + y[i] over 1000000 elements.
+ First 10 elements of the results: [ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 ]
diff --git a/docs/understand/amd_clr.rst b/docs/understand/amd_clr.rst
index 24980468e7..3a643cb051 100644
--- a/docs/understand/amd_clr.rst
+++ b/docs/understand/amd_clr.rst
@@ -5,16 +5,16 @@
.. _AMD_Compute_Language_Runtimes:
*******************************************************************************
-AMD Common Language Runtimes (CLR)
+AMD common language runtimes (CLR)
*******************************************************************************
CLR contains source codes for AMD's compute languages runtimes: ``HIP`` and ``OpenCL™``.
CLR is the part of HIP runtime which is supported on the AMD ROCm platform, it provides a header and runtime library built on top of HIP-Clang compiler.
For developers and users, CLR implements HIP runtime APIs including streams, events, and memory APIs, which is a object library that is linked with the application.
-The source codes for all headers and the library implementation are available on GitHub in the `clr repository `_.
+The source codes for all headers and the library implementation are available on GitHub in the `CLR repository `_.
-Project Organisation
+Project organization
====================
CLR includes the following source code,
diff --git a/docs/understand/hardware_implementation.rst b/docs/understand/hardware_implementation.rst
index 8ee3e0e08c..9cf97b444a 100644
--- a/docs/understand/hardware_implementation.rst
+++ b/docs/understand/hardware_implementation.rst
@@ -5,13 +5,13 @@
.. _hardware_implementation:
*******************************************************************************
-Hardware Implementation
+Hardware implementation
*******************************************************************************
This chapter describes the typical hardware implementation of GPUs supported by
HIP, and how the :ref:`inherent_thread_model` maps to the hardware.
-Compute Units
+Compute units
=============
The basic building block of a GPU is a compute unit (CU), also known
@@ -79,7 +79,7 @@ instructions of the other branch have to be executed in the same way. The best
performance can therefore be achieved when thread divergence is kept to a warp
level, i.e. when all threads in a warp take the same execution path.
-Vector Cache
+Vector cache
------------
The usage of cache on a GPU differs from that on a CPU, as there is less cache
@@ -88,7 +88,7 @@ warps in order to reduce the amount of accesses to device memory, and make that
memory available for other warps that currently reside on the compute unit, that
also need to load those values.
-Local Data Share
+Local data share
----------------
The local data share is memory that is accessible to all threads within a block.
@@ -103,7 +103,7 @@ The scalar unit performs instructions that are uniform within a warp. It
thereby improves efficiency and reduces the pressure on the vector ALUs and the
vector register file.
-CDNA Architecture
+CDNA architecture
=================
The general structure of CUs stays mostly as it is in GCN
@@ -122,7 +122,7 @@ multiply-accumulate operations for
Block Diagram of a CDNA3 Compute Unit.
-RDNA Architecture
+RDNA architecture
=================
RDNA makes a fundamental change to CU design, by changing the
@@ -145,7 +145,7 @@ an L0 cache.
Block Diagram of an RDNA3 work group processor.
-Shader Engines
+Shader engines
==============
For hardware implementation's sake, multiple CUs are grouped
diff --git a/docs/understand/programming_model.rst b/docs/understand/programming_model.rst
index 88ba476a89..53299bd6e4 100644
--- a/docs/understand/programming_model.rst
+++ b/docs/understand/programming_model.rst
@@ -5,7 +5,7 @@
:keywords: AMD, ROCm, HIP, CUDA, API design
*******************************************************************************
-Understanding the HIP programming model
+HIP programming model
*******************************************************************************
The HIP programming model makes it easy to map data-parallel C/C++ algorithms to
@@ -14,7 +14,7 @@ such as GPUs. A basic understanding of the underlying device architecture helps
make efficient use of HIP and general purpose graphics processing unit (GPGPU)
programming in general.
-RDNA & CDNA Architecture Summary
+RDNA & CDNA architecture summary
================================
Most GPU architectures, like RDNA and CDNA, have a hierarchical structure.
@@ -68,7 +68,7 @@ memory subsystem resources.
.. _programming_model_simt:
-Single Instruction Multiple Threads
+Single instruction multiple threads
===================================
The single instruction, multiple threads (SIMT) programming model behind the
@@ -117,7 +117,7 @@ usually isn't exploited from the width of the built-in vector types, but via the
thread id constants ``threadIdx.x``, ``blockIdx.x``, etc. For more details,
refer to :ref:`inherent_thread_model`.
-Heterogeneous Programming
+Heterogeneous programming
=========================
The HIP programming model assumes two execution contexts. One is referred to as
diff --git a/docs/understand/programming_model_reference.rst b/docs/understand/programming_model_reference.rst
index 7c42569543..5c8d9c8a28 100644
--- a/docs/understand/programming_model_reference.rst
+++ b/docs/understand/programming_model_reference.rst
@@ -13,7 +13,7 @@ onto various architectures, primarily GPUs. While the model may be expressed
in most imperative languages, (for example Python via PyHIP) this document will focus on
the original C/C++ API of HIP.
-Threading Model
+Threading model
===============
The SIMT nature of HIP is captured by the ability to execute user-provided
@@ -26,7 +26,7 @@ The set of integers identifying a thread relate to the hierarchy in which thread
.. _inherent_thread_model:
-Inherent Thread Model
+Inherent thread model
---------------------
The thread hierarchy inherent to how AMD GPUs operate is depicted in
@@ -56,6 +56,8 @@ Warp
signified by the set of communication primitives at their disposal, as
discussed in :ref:`warp-cross-lane`.
+.. _inherent_thread_hierarchy_block:
+
Block
The middle grouping is called a block or thread block. The defining feature
of a block is that all threads in a block will share an instance of memory
@@ -67,6 +69,8 @@ Block
within a block, assume the "fast index" being dimension ``x``, followed by
the ``y`` and ``z`` dimensions.
+.. _inherent_thread_hierarchy_grid:
+
Grid
The outermost grouping is called a grid. A grid manifests as a single
dispatch of kernels for execution. The unique ID of each block within a grid
@@ -83,47 +87,13 @@ of. It relaxes some restrictions of the :ref:`inherent_thread_model`
imposed by the strict 1:1 mapping of architectural details to the programming
model.
-The rich set of APIs introduced by Cooperative Groups allow the programmer to
-define their own set of thread groups which may fit their user-cases better than
-those defined by the hardware. The set of implicit groups by kernel launch
-parameters are still available.
-
-The thread hierarchy abstraction of Cooperative Groups manifest as depicted in
-:numref:`coop_thread_hierarchy`.
-
-.. _coop_thread_hierarchy:
-
-.. figure:: ../data/understand/programming_model_reference/thread_hierarchy_coop.svg
- :alt: Diagram depicting nested rectangles of varying color. The outermost one
- titled "Grid", inside sets of different sized rectangles layered on
- one another titled "Block". Each "Block" containing sets of uniform
- rectangles layered on one another titled "Warp". Each of the "Warp"
- titled rectangles filled with downward pointing arrows inside.
-
- Cooperative group thread hierarchy.
-
-Multi Grid
- An abstraction of potentially multiple simultaneous launches of
- the same kernel over multiple devices. Grids inside a multi device kernel
- launch need not be of uniform size, thus allowing taking into account
- different device capabilities and preferences.
-
- .. deprecated:: 5.0
-
-Grid
- Same as the :ref:`inherent_thread_model` Grid entity. The ability to
- synchronize over a grid requires the kernel to be launched using the
- Cooperative Groups API.
-
-Block
- Same as the :ref:`inherent_thread_model` Block entity.
+The Cooperative Groups API lets you define your own thread groups, which may fit your use case better than those defined by the default thread model.
.. note::
- Explicit warp-level thread handling is absent from the Cooperative Groups API.
- In order to exploit the known hardware SIMD width on which built-in
- functionality translates to simpler logic, one may use the group partitioning
- part of the API, such as ``tiled_partition``.
+   The default thread groups defined by kernel launch parameters are still
+   available. See :ref:`inherent_thread_model` for more information.
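A minimal sketch of such a user-defined group, assuming warp-sized tiles of 32 threads (``hip/hip_cooperative_groups.h`` provides the API; the reduction itself is illustrative):

```cpp
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>

namespace cg = cooperative_groups;

// Partition the implicit block group into tiles of 32 threads and reduce
// within each tile using the tile's shuffle primitives.
__global__ void tileSum(const int* in, int* out)
{
    cg::thread_block block = cg::this_thread_block();
    auto tile = cg::tiled_partition<32>(block);

    int v = in[block.group_index().x * block.size() + block.thread_rank()];
    for (unsigned int offset = tile.size() / 2; offset > 0; offset /= 2) {
        v += tile.shfl_down(v, offset);   // exchange values inside the tile
    }
    if (tile.thread_rank() == 0) {
        atomicAdd(out, v);                // one partial sum per tile
    }
}
```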
Memory Model
============
diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index dd17b062da..41f2cdaaec 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -1061,8 +1061,8 @@ typedef struct dim3 {
*/
typedef struct hipLaunchParams_t {
void* func; ///< Device function symbol
- dim3 gridDim; ///< Grid dimentions
- dim3 blockDim; ///< Block dimentions
+ dim3 gridDim; ///< Grid dimensions
+ dim3 blockDim; ///< Block dimensions
void **args; ///< Arguments
size_t sharedMem; ///< Shared memory
hipStream_t stream; ///< Stream identifier
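A sketch of how this struct is typically filled for a multi-device cooperative launch; `myKernel`, `devPtr`, `n`, and the launch geometry are placeholders:

```cpp
#include <hip/hip_runtime.h>

__global__ void myKernel(float* data, int n);   // placeholder kernel

// Populate one hipLaunchParams entry (one per participating device) and
// hand it to hipLaunchCooperativeKernelMultiDevice.
void launchOnOneDevice(float* devPtr, int n, hipStream_t stream)
{
    void* args[] = { &devPtr, &n };

    hipLaunchParams lp{};
    lp.func      = reinterpret_cast<void*>(myKernel);
    lp.gridDim   = dim3(64);
    lp.blockDim  = dim3(256);
    lp.args      = args;
    lp.sharedMem = 0;
    lp.stream    = stream;   // cooperative launches need an explicit stream

    hipLaunchCooperativeKernelMultiDevice(&lp, 1 /* numDevices */, 0 /* flags */);
}
```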
@@ -1751,8 +1751,8 @@ hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
* @param [out] uuid UUID for the device
* @param [in] device device ordinal
*
- * @warning This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
* @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue, #hipErrorNotInitialized,
* #hipErrorDeinitialized
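A short usage sketch, assuming device ordinal 0 and the 16-byte `bytes` field of `hipUUID`:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

// Print the UUID of device 0; hipDeviceGet turns the ordinal into a handle.
void printDeviceUuid()
{
    hipDevice_t dev;
    hipUUID uuid;
    if (hipDeviceGet(&dev, 0) == hipSuccess &&
        hipDeviceGetUuid(&uuid, dev) == hipSuccess) {
        for (int i = 0; i < 16; ++i) {
            printf("%02x", static_cast<unsigned char>(uuid.bytes[i]));
        }
        printf("\n");
    }
}
```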
@@ -1918,8 +1918,8 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceI
* @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
* hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDeviceGetDefaultMemPool(hipMemPool_t* mem_pool, int device);
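A hedged sketch of a common use of the default pool: raising its release threshold so freed memory is cached for reuse rather than returned to the OS immediately (device 0 assumed):

```cpp
#include <hip/hip_runtime.h>
#include <cstdint>

// Keep freed stream-ordered allocations inside the default pool of device 0.
void keepPoolMemory()
{
    hipMemPool_t pool;
    if (hipDeviceGetDefaultMemPool(&pool, 0) != hipSuccess) {
        return;
    }
    uint64_t threshold = UINT64_MAX;   // never proactively release
    hipMemPoolSetAttribute(pool, hipMemPoolAttrReleaseThreshold, &threshold);
}
```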
/**
@@ -1940,8 +1940,8 @@ hipError_t hipDeviceGetDefaultMemPool(hipMemPool_t* mem_pool, int device);
* @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
* hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDeviceSetMemPool(int device, hipMemPool_t mem_pool);
/**
@@ -1960,8 +1960,8 @@ hipError_t hipDeviceSetMemPool(int device, hipMemPool_t mem_pool);
* @see hipDeviceGetDefaultMemPool, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
* hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDeviceGetMemPool(hipMemPool_t* mem_pool, int device);
/**
@@ -2653,8 +2653,8 @@ hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback
* @note Support for hipStreamWaitValue32 can be queried using 'hipDeviceGetAttribute()' and
* 'hipDeviceAttributeCanUseStreamWaitValue' flag.
*
- * @warning This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
* @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue64, hipStreamWriteValue64,
* hipStreamWriteValue32, hipDeviceGetAttribute
@@ -2687,8 +2687,8 @@ hipError_t hipStreamWaitValue32(hipStream_t stream, void* ptr, uint32_t value, u
* @note Support for hipStreamWaitValue64 can be queried using 'hipDeviceGetAttribute()' and
* 'hipDeviceAttributeCanUseStreamWaitValue' flag.
*
- * @warning This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
* @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue32, hipStreamWriteValue64,
* hipStreamWriteValue32, hipDeviceGetAttribute
@@ -2708,8 +2708,8 @@ hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, uint64_t value, u
 * Enqueues a write command to the stream; the write operation is performed after all earlier commands
 * on this stream have completed execution.
*
- * @warning This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
* @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32,
* hipStreamWaitValue64
@@ -2728,8 +2728,8 @@ hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value,
 * Enqueues a write command to the stream; the write operation is performed after all earlier commands
 * on this stream have completed execution.
*
- * @warning This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
* @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32,
* hipStreamWaitValue64
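A hedged sketch of the wait side, assuming `flag` points at device-visible memory (for example, host memory registered with hipHostRegister) and that the comparison uses the `hipStreamWaitValueGte` flag:

```cpp
#include <hip/hip_runtime.h>
#include <cstdint>

// Make later commands on `stream` wait until *flag >= 1, after checking
// that the device supports stream value operations at all.
void waitForFlag(hipStream_t stream, uint32_t* flag)
{
    int supported = 0;
    hipDeviceGetAttribute(&supported, hipDeviceAttributeCanUseStreamWaitValue, 0);
    if (!supported) {
        return;
    }
    // The last argument is the mask applied to the value before comparing.
    hipStreamWaitValue32(stream, flag, 1u, hipStreamWaitValueGte, 0xFFFFFFFFu);
}
```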
@@ -2807,7 +2807,7 @@ hipError_t hipEventCreate(hipEvent_t* event);
 * If this function is called on an event that is currently being recorded, results are undefined:
 * the outstanding recording may save state into the event, and the order is not guaranteed.
*
- * @note: If this function is not called before use hipEventQuery() or hipEventSynchronize(),
+ * @note If this function is not called before using hipEventQuery() or hipEventSynchronize(),
* #hipSuccess is returned, meaning no pending event in the stream.
*
* @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize,
@@ -2895,7 +2895,7 @@ hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop);
* commands in the appropriate stream (specified to hipEventRecord()) have completed. If any execution
* has not completed, then #hipErrorNotReady is returned.
*
- * @note: This API returns #hipSuccess, if hipEventRecord() is not called before this API.
+ * @note This API returns #hipSuccess if hipEventRecord() is not called before this API.
*
* @see hipEventCreate, hipEventCreateWithFlags, hipEventRecord, hipEventDestroy,
* hipEventSynchronize, hipEventElapsedTime
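The usual timing pattern built from these event APIs, as a self-contained sketch (the work between the two records is elided):

```cpp
#include <hip/hip_runtime.h>

// Record events around work on a stream and return the elapsed milliseconds.
float timeStreamWork(hipStream_t stream)
{
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    hipEventRecord(start, stream);
    // ... enqueue kernels or copies on `stream` here ...
    hipEventRecord(stop, stream);

    hipEventSynchronize(stop);        // wait for the recorded work to finish
    float ms = 0.0f;
    hipEventElapsedTime(&ms, start, stop);

    hipEventDestroy(start);
    hipEventDestroy(stop);
    return ms;
}
```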
@@ -2928,8 +2928,8 @@ hipError_t hipEventQuery(hipEvent_t event);
*
* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
*
- * @warning This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipPointerSetAttribute(const void* value, hipPointer_attribute attribute,
@@ -2965,8 +2965,8 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void
*
* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
*
- * @warning This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
* @see hipPointerGetAttributes
*/
@@ -2983,8 +2983,8 @@ hipError_t hipPointerGetAttribute(void* data, hipPointer_attribute attribute,
*
* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
*
- * @warning This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
* @see hipPointerGetAttribute
*/
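A small usage sketch; only the `device` field of `hipPointerAttribute_t` is relied on here, since other fields vary across ROCm releases:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

// Report which device an allocation belongs to.
void describePointer(const void* ptr)
{
    hipPointerAttribute_t attr;
    if (hipPointerGetAttributes(&attr, ptr) == hipSuccess) {
        printf("allocation belongs to device %d\n", attr.device);
    }
}
```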
@@ -3190,7 +3190,8 @@ hipError_t hipMemAllocHost(void** ptr, size_t size);
* @param[out] ptr Pointer to the allocated host pinned memory
* @param[in] size Requested memory size in bytes
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
- * @param[in] flags Type of host memory allocation
+ * @param[in] flags Type of host memory allocation. See the description of flags in
+ * hipSetDeviceFlags.
*
 * If flags is not specified, the default pinned memory allocation on the host is used.
*
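A hedged sketch of allocating mapped pinned memory and fetching its device-side alias (the buffer size is illustrative):

```cpp
#include <hip/hip_runtime.h>

// Pinned host allocation that is also mapped into the device address space.
void pinnedMappedExample()
{
    float* host = nullptr;
    if (hipHostMalloc(reinterpret_cast<void**>(&host),
                      1024 * sizeof(float), hipHostMallocMapped) != hipSuccess) {
        return;
    }
    float* dev = nullptr;
    hipHostGetDevicePointer(reinterpret_cast<void**>(&dev), host, 0);
    // Use `dev` in kernels and `host` on the CPU; the two values may differ.
    hipHostFree(host);
}
```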
@@ -3244,7 +3245,7 @@ hipError_t hipMallocManaged(void** dev_ptr,
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPrefetchAsync(const void* dev_ptr,
size_t count,
@@ -3267,7 +3268,7 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr,
* be aligned to CPU page size, the same way as corresponding CUDA API behaves in CUDA version 8.0
* and afterwards.
*
- * @note This API is implemented on Linux and is under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemAdvise(const void* dev_ptr,
size_t count,
@@ -3285,7 +3286,7 @@ hipError_t hipMemAdvise(const void* dev_ptr,
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemRangeGetAttribute(void* data,
size_t data_size,
@@ -3306,7 +3307,7 @@ hipError_t hipMemRangeGetAttribute(void* data,
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemRangeGetAttributes(void** data,
size_t* data_sizes,
@@ -3326,7 +3327,7 @@ hipError_t hipMemRangeGetAttributes(void** data,
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipStreamAttachMemAsync(hipStream_t stream,
void* dev_ptr,
@@ -3387,10 +3388,10 @@ hipError_t hipStreamAttachMemAsync(hipStream_t stream,
* @see hipMallocFromPoolAsync, hipFreeAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
* hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream);
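A stream-ordered allocation sketch pairing this call with hipFreeAsync; the scratch buffer's lifetime is bracketed entirely by commands on the stream:

```cpp
#include <hip/hip_runtime.h>

// Allocate, use, and free scratch memory without any device-wide sync.
void scratchOnStream(hipStream_t stream, size_t bytes)
{
    void* scratch = nullptr;
    if (hipMallocAsync(&scratch, bytes, stream) != hipSuccess) {
        return;
    }
    // ... launch kernels on `stream` that use `scratch` ...
    hipFreeAsync(scratch, stream);     // ordered after the kernels above
    hipStreamSynchronize(stream);
}
```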
/**
@@ -3412,10 +3413,10 @@ hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream);
* @see hipMallocFromPoolAsync, hipMallocAsync, hipMemPoolTrimTo, hipMemPoolGetAttribute,
* hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream);
/**
@@ -3426,8 +3427,8 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream);
* The allocator cannot release OS allocations that back outstanding asynchronous allocations.
* The OS allocations may happen at different granularity from the user allocations.
*
- * @note: Allocations that have not been freed count as outstanding.
- * @note: Allocations that have been asynchronously freed but whose completion has
+ * @note Allocations that have not been freed count as outstanding.
+ * @note Allocations that have been asynchronously freed but whose completion has
 * not been observed on the host (e.g., by a synchronize) can count as outstanding.
*
* @param[in] mem_pool The memory pool to trim allocations
@@ -3440,10 +3441,10 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream);
* @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
* hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolTrimTo(hipMemPool_t mem_pool, size_t min_bytes_to_hold);
/**
@@ -3479,10 +3480,10 @@ hipError_t hipMemPoolTrimTo(hipMemPool_t mem_pool, size_t min_bytes_to_hold);
* @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
* hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolSetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, void* value);
/**
@@ -3518,10 +3519,10 @@ hipError_t hipMemPoolSetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, vo
* @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync,
* hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolGetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, void* value);
/**
@@ -3536,10 +3537,10 @@ hipError_t hipMemPoolGetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, vo
* @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
* hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolSetAccess(hipMemPool_t mem_pool, const hipMemAccessDesc* desc_list, size_t count);
/**
@@ -3556,10 +3557,10 @@ hipError_t hipMemPoolSetAccess(hipMemPool_t mem_pool, const hipMemAccessDesc* de
* @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute,
* hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolGetAccess(hipMemAccessFlags* flags, hipMemPool_t mem_pool, hipMemLocation* location);
/**
@@ -3580,10 +3581,10 @@ hipError_t hipMemPoolGetAccess(hipMemAccessFlags* flags, hipMemPool_t mem_pool,
* @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolDestroy,
* hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolCreate(hipMemPool_t* mem_pool, const hipMemPoolProps* pool_props);
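A hedged sketch of creating an explicit pool on device 0; the property fields mirror the CUDA-style pool descriptor, and the POSIX file descriptor handle type is what makes later IPC export possible:

```cpp
#include <hip/hip_runtime.h>

// Create an IPC-capable memory pool that allocates on device 0.
void makeExportablePool(hipMemPool_t* pool)
{
    hipMemPoolProps props{};
    props.allocType     = hipMemAllocationTypePinned;
    props.handleTypes   = hipMemHandleTypePosixFileDescriptor;
    props.location.type = hipMemLocationTypeDevice;
    props.location.id   = 0;

    hipMemPoolCreate(pool, &props);
}
```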
/**
@@ -3607,10 +3608,10 @@ hipError_t hipMemPoolCreate(hipMemPool_t* mem_pool, const hipMemPoolProps* pool_
* @see hipMallocFromPoolAsync, hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolCreate
* hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolDestroy(hipMemPool_t mem_pool);
/**
@@ -3641,10 +3642,10 @@ hipError_t hipMemPoolDestroy(hipMemPool_t mem_pool);
* @see hipMallocAsync, hipFreeAsync, hipMemPoolGetAttribute, hipMemPoolCreate
* hipMemPoolTrimTo, hipDeviceSetMemPool, hipMemPoolSetAttribute, hipMemPoolSetAccess, hipMemPoolGetAccess,
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_pool, hipStream_t stream);
/**
@@ -3656,7 +3657,7 @@ hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_
* The implementation of what the shareable handle is and how it can be transferred is defined by the requested
* handle type.
*
- * @note: To create an IPC capable mempool, create a mempool with a @p hipMemAllocationHandleType other
+ * @note To create an IPC capable mempool, create a mempool with a @p hipMemAllocationHandleType other
* than @p hipMemHandleTypeNone.
*
* @param [out] shared_handle Pointer to the location in which to store the requested handle
@@ -3668,10 +3669,10 @@ hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_
*
* @see hipMemPoolImportFromShareableHandle
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolExportToShareableHandle(
void* shared_handle,
@@ -3696,10 +3697,10 @@ hipError_t hipMemPoolExportToShareableHandle(
*
* @see hipMemPoolExportToShareableHandle
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolImportFromShareableHandle(
hipMemPool_t* mem_pool,
@@ -3720,10 +3721,10 @@ hipError_t hipMemPoolImportFromShareableHandle(
*
* @see hipMemPoolImportPointer
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData* export_data, void* dev_ptr);
/**
@@ -3749,10 +3750,10 @@ hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData* export_data, void* d
*
* @see hipMemPoolExportPointer
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemPoolImportPointer(
void** dev_ptr,
@@ -3814,7 +3815,7 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr);
*
* After registering the memory, use #hipHostGetDevicePointer to obtain the mapped device pointer.
* On many systems, the mapped device pointer will have a different value than the mapped host
- * pointer. Applications must use the device pointer in device code, and the host pointer in device
+ * pointer. Applications must use the device pointer in device code, and the host pointer in host
* code.
*
* On some systems, registered memory is pinned. On some systems, registered memory may not be
@@ -5581,7 +5582,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne
* @param [in] kernelParams A list of kernel arguments.
*
* Please note, HIP does not support kernel launch with total work items defined in dimension with
- * size gridDim x blockDim >= 2^32.
+ * size \f$ gridDim \cdot blockDim \geq 2^{32} \f$.
*
* @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
* #hipErrorInvalidHandle, #hipErrorInvalidImage, #hipErrorInvalidValue,
@@ -5611,8 +5612,8 @@ hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
unsigned int numDevices,
unsigned int flags);
/**
- * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
- * to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute
+ * @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed
+ * to kernelParams or extra, where thread blocks can cooperate and synchronize as they execute.
*
* @param [in] f Kernel to launch.
* @param [in] gridDim Grid dimensions specified as multiple of blockDim.
@@ -5624,7 +5625,7 @@ hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
* default stream is used with associated synchronization rules.
*
* Please note, HIP does not support kernel launch with total work items defined in dimension with
- * size gridDim x blockDim >= 2^32.
+ * size \f$ gridDim \cdot blockDim \geq 2^{32} \f$.
*
* @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue, #hipErrorCooperativeLaunchTooLarge
*/
@@ -5789,7 +5790,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
* @brief Start recording of profiling information
* When using this API, start the profiler with profiling disabled. (--startdisabled)
* @returns #hipErrorNotSupported
- * @warning : hipProfilerStart API is deprecated, use roctracer/rocTX instead.
+ * @warning The hipProfilerStart API is deprecated; use roctracer/rocTX instead.
*/
DEPRECATED("use roctracer/rocTX instead")
hipError_t hipProfilerStart();
@@ -6145,7 +6146,7 @@ hipError_t hipTexObjectGetTextureDesc(
*
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation
*
- * @note This API is implemented on Windows, under development on Linux.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*
*/
hipError_t hipMallocMipmappedArray(
@@ -6162,7 +6163,7 @@ hipError_t hipMallocMipmappedArray(
*
* @return #hipSuccess, #hipErrorInvalidValue
*
- * @note This API is implemented on Windows, under development on Linux.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*
*/
hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray);
@@ -6176,7 +6177,7 @@ hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray);
*
* @return #hipSuccess, #hipErrorInvalidValue
*
- * @note This API is implemented on Windows, under development on Linux.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*
*/
hipError_t hipGetMipmappedArrayLevel(
@@ -6193,7 +6194,7 @@ hipError_t hipGetMipmappedArrayLevel(
*
* @returns #hipSuccess, #hipErrorNotSupported, #hipErrorInvalidValue
*
- * @note This API is implemented on Windows, under development on Linux.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMipmappedArrayCreate(
hipMipmappedArray_t* pHandle,
@@ -6207,7 +6208,7 @@ hipError_t hipMipmappedArrayCreate(
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @note This API is implemented on Windows, under development on Linux.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*
*/
hipError_t hipMipmappedArrayDestroy(hipMipmappedArray_t hMipmappedArray);
@@ -6221,7 +6222,7 @@ hipError_t hipMipmappedArrayDestroy(hipMipmappedArray_t hMipmappedArray);
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @note This API is implemented on Windows, under development on Linux.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*
*/
hipError_t hipMipmappedArrayGetLevel(
@@ -6843,8 +6844,8 @@ int hipGetStreamDeviceId(hipStream_t stream);
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode);
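A capture-and-replay sketch tying this call to the instantiate and launch APIs documented below (the captured work is elided; error handling is omitted for brevity):

```cpp
#include <hip/hip_runtime.h>

// Record the work submitted to `stream` into a graph, then replay it.
void captureAndReplay(hipStream_t stream)
{
    hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal);
    // ... enqueue kernels and copies on `stream`; nothing executes yet ...
    hipGraph_t graph;
    hipStreamEndCapture(stream, &graph);

    hipGraphExec_t exec;
    hipGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
    hipGraphLaunch(exec, stream);

    hipGraphExecDestroy(exec);
    hipGraphDestroy(graph);
}
```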
@@ -6863,7 +6864,7 @@ are not safe.
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
-* @warning : param "const hipGraphEdgeData* dependencyData" is currently not supported and has to
+* @warning param "const hipGraphEdgeData* dependencyData" is currently not supported and has to be
passed as nullptr. This API is marked as beta, meaning, while this is feature complete, it is still
open to changes and may have outstanding issues.
*
@@ -6881,8 +6882,8 @@ hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph,
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph);
@@ -6896,8 +6897,8 @@ hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph);
*
* @returns #hipSuccess, #hipErrorStreamCaptureImplicit
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipStreamGetCaptureInfo(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus,
@@ -6915,8 +6916,8 @@ hipError_t hipStreamGetCaptureInfo(hipStream_t stream, hipStreamCaptureStatus* p
*
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorStreamCaptureImplicit
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipStreamGetCaptureInfo_v2(hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
@@ -6933,8 +6934,8 @@ hipError_t hipStreamGetCaptureInfo_v2(hipStream_t stream, hipStreamCaptureStatus
*
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorStreamCaptureImplicit
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipStreamIsCapturing(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus);
@@ -6949,8 +6950,8 @@ hipError_t hipStreamIsCapturing(hipStream_t stream, hipStreamCaptureStatus* pCap
* #hipStreamUpdateCaptureDependenciesFlags
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorIllegalState
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t* dependencies,
@@ -6963,8 +6964,8 @@ hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t
* @param [in] mode - Pointer to mode value to swap with the current mode
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipThreadExchangeStreamCaptureMode(hipStreamCaptureMode* mode);
@@ -6977,8 +6978,8 @@ hipError_t hipThreadExchangeStreamCaptureMode(hipStreamCaptureMode* mode);
*
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags);
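The explicit alternative to stream capture is to build the graph by hand. A minimal sketch with a single kernel node; `myKernel` and the launch geometry are placeholders:

```cpp
#include <hip/hip_runtime.h>

__global__ void myKernel(float* data);   // placeholder kernel

// Build a one-node graph: create it, describe the kernel launch, add the node.
void buildGraph(float* data)
{
    hipGraph_t graph;
    hipGraphCreate(&graph, 0);

    void* args[] = { &data };
    hipKernelNodeParams params{};
    params.func           = reinterpret_cast<void*>(myKernel);
    params.gridDim        = dim3(64);
    params.blockDim       = dim3(256);
    params.sharedMemBytes = 0;
    params.kernelParams   = args;

    hipGraphNode_t node;
    hipGraphAddKernelNode(&node, graph, nullptr, 0, &params);
    // ... instantiate and launch as in the capture example, then clean up ...
    hipGraphDestroy(graph);
}
```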
@@ -6990,8 +6991,8 @@ hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags);
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphDestroy(hipGraph_t graph);
@@ -7005,8 +7006,8 @@ hipError_t hipGraphDestroy(hipGraph_t graph);
* @param [in] numDependencies - the number of dependencies to add.
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphAddDependencies(hipGraph_t graph, const hipGraphNode_t* from,
@@ -7021,8 +7022,8 @@ hipError_t hipGraphAddDependencies(hipGraph_t graph, const hipGraphNode_t* from,
* @param [in] numDependencies - the number of dependencies to remove.
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* from,
@@ -7041,8 +7042,8 @@ hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* fr
* numEdges. Otherwise, numEdges entries will be filled in. If numEdges is higher than the actual
* number of edges, the remaining entries in from and to will be set to NULL, and the number of
 * edges actually returned will be written to numEdges.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphGetEdges(hipGraph_t graph, hipGraphNode_t* from, hipGraphNode_t* to,
@@ -7060,8 +7061,8 @@ hipError_t hipGraphGetEdges(hipGraph_t graph, hipGraphNode_t* from, hipGraphNode
* Otherwise, numNodes entries will be filled in. If numNodes is higher than the actual number of
* nodes, the remaining entries in nodes will be set to NULL, and the number of nodes actually
* obtained will be returned in numNodes.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphGetNodes(hipGraph_t graph, hipGraphNode_t* nodes, size_t* numNodes);
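This is the usual two-call pattern shared by the introspection APIs here: query the count first, then fetch into a sized buffer. A short sketch:

```cpp
#include <hip/hip_runtime.h>
#include <vector>

// Return every node in a graph.
std::vector<hipGraphNode_t> allNodes(hipGraph_t graph)
{
    size_t count = 0;
    hipGraphGetNodes(graph, nullptr, &count);       // first call: count only

    std::vector<hipGraphNode_t> nodes(count);
    hipGraphGetNodes(graph, nodes.data(), &count);  // second call: fill array
    return nodes;
}
```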
@@ -7078,8 +7079,8 @@ hipError_t hipGraphGetNodes(hipGraph_t graph, hipGraphNode_t* nodes, size_t* num
* pNumRootNodes. Otherwise, pNumRootNodes entries will be filled in. If pNumRootNodes is higher
* than the actual number of root nodes, the remaining entries in pRootNodes will be set to NULL,
* and the number of nodes actually obtained will be returned in pNumRootNodes.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphGetRootNodes(hipGraph_t graph, hipGraphNode_t* pRootNodes,
@@ -7097,8 +7098,8 @@ hipError_t hipGraphGetRootNodes(hipGraph_t graph, hipGraphNode_t* pRootNodes,
* pNumDependencies. Otherwise, pNumDependencies entries will be filled in. If pNumDependencies is
* higher than the actual number of dependencies, the remaining entries in pDependencies will be set
* to NULL, and the number of nodes actually obtained will be returned in pNumDependencies.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphNodeGetDependencies(hipGraphNode_t node, hipGraphNode_t* pDependencies,
@@ -7117,8 +7118,8 @@ hipError_t hipGraphNodeGetDependencies(hipGraphNode_t node, hipGraphNode_t* pDep
* pNumDependentNodes is higher than the actual number of dependent nodes, the remaining entries in
* pDependentNodes will be set to NULL, and the number of nodes actually obtained will be returned
* in pNumDependentNodes.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node, hipGraphNode_t* pDependentNodes,
@@ -7131,8 +7132,8 @@ hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node, hipGraphNode_t* pD
* @param [out] pType - pointer to the return the type
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType);
@@ -7143,8 +7144,8 @@ hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType);
* @param [in] node - graph node to remove
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphDestroyNode(hipGraphNode_t node);
@@ -7156,8 +7157,8 @@ hipError_t hipGraphDestroyNode(hipGraphNode_t node);
* @param [in] originalGraph - original graph to clone from.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph);
@@ -7170,8 +7171,8 @@ hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph);
* @param [in] clonedGraph - Cloned graph to query.
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphNodeFindInClone(hipGraphNode_t* pNode, hipGraphNode_t originalNode,
@@ -7189,8 +7190,8 @@ hipError_t hipGraphNodeFindInClone(hipGraphNode_t* pNode, hipGraphNode_t origina
*
* @returns #hipSuccess, #hipErrorOutOfMemory
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
*/
hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
@@ -7204,9 +7205,9 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
* @param [in] flags - Flags to control instantiation.
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.It does not support
- * any of flag and is behaving as hipGraphInstantiate.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues. It does not support any flags
+ * and behaves like hipGraphInstantiate.
*/
hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
unsigned long long flags);
@@ -7219,8 +7220,8 @@ hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t g
* @param [in] instantiateParams - Graph Instantiate Params
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
hipGraphInstantiateParams *instantiateParams);
@@ -7231,8 +7232,8 @@ hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t
* @param [in] stream - instance of stream in which to launch executable graph.
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream);
@@ -7243,8 +7244,8 @@ hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream);
* @param [in] stream - instance of stream in which to launch executable graph.
* @returns #hipSuccess, #hipErrorInvalidValue
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream);
@@ -7257,8 +7258,8 @@ hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream);
* @param [in] numDependencies - the number of the dependencies.
* @param [in] nodeParams - pointer to the parameters for the node.
* @returns #hipSuccess, #hipErrorInvalidValue.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
const hipGraphNode_t *pDependencies, size_t numDependencies,
@@ -7270,8 +7271,8 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
* @param [in] graphExec - Executable graph to get the flags.
* @param [out] flags - Flags used to instantiate this executable graph.
* @returns #hipSuccess, #hipErrorInvalidValue.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* flags);
@@ -7281,8 +7282,8 @@ hipError_t hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* fl
* @param [in] node - instance of the node to set parameters to.
* @param [in] nodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction, #hipErrorNotSupported.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams *nodeParams);
@@ -7293,8 +7294,8 @@ hipError_t hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams *nodePa
* @param [in] node - instance of the node to set parameters to.
* @param [in] nodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction, #hipErrorNotSupported.
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t node, hipGraphNodeParams* nodeParams);
@@ -7305,8 +7306,8 @@ hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t no
*
* @returns #hipSuccess.
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecDestroy(hipGraphExec_t graphExec);
@@ -7321,8 +7322,8 @@ hipError_t hipGraphExecDestroy(hipGraphExec_t graphExec);
* @param [in] updateResult_out - Whether the graph update was permitted.
* @returns #hipSuccess, #hipErrorGraphExecUpdateFailure
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph,
hipGraphNode_t* hErrorNode_out,
@@ -7337,8 +7338,8 @@ hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph,
* @param [in] numDependencies - the number of the dependencies.
* @param [in] pNodeParams - pointer to the parameters to the kernel execution node on the GPU.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -7350,8 +7351,8 @@ hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
* @param [in] node - instance of the node to get parameters from.
* @param [out] pNodeParams - pointer to the parameters
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphKernelNodeGetParams(hipGraphNode_t node, hipKernelNodeParams* pNodeParams);
@@ -7361,8 +7362,8 @@ hipError_t hipGraphKernelNodeGetParams(hipGraphNode_t node, hipKernelNodeParams*
* @param [in] node - instance of the node to set parameters to.
* @param [in] pNodeParams - const pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphKernelNodeSetParams(hipGraphNode_t node, const hipKernelNodeParams* pNodeParams);
@@ -7373,8 +7374,8 @@ hipError_t hipGraphKernelNodeSetParams(hipGraphNode_t node, const hipKernelNodeP
* @param [in] node - instance of the node to set parameters to.
* @param [in] pNodeParams - const pointer to the kernel node parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
const hipKernelNodeParams* pNodeParams);
@@ -7389,8 +7390,8 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
* @param [in] copyParams - const pointer to the parameters for the memory copy.
 * @param [in] ctx - context related to the current device.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
const hipGraphNode_t* dependencies,
@@ -7405,8 +7406,8 @@ hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGra
* @param [in] numDependencies - the number of the dependencies.
* @param [in] pCopyParams - const pointer to the parameters for the memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -7417,8 +7418,8 @@ hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
* @param [in] node - instance of the node to get parameters from.
* @param [out] pNodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemcpyNodeGetParams(hipGraphNode_t node, hipMemcpy3DParms* pNodeParams);
@@ -7428,8 +7429,8 @@ hipError_t hipGraphMemcpyNodeGetParams(hipGraphNode_t node, hipMemcpy3DParms* pN
* @param [in] node - instance of the node to set parameters to.
* @param [in] pNodeParams - const pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemcpyNodeSetParams(hipGraphNode_t node, const hipMemcpy3DParms* pNodeParams);
@@ -7440,8 +7441,8 @@ hipError_t hipGraphMemcpyNodeSetParams(hipGraphNode_t node, const hipMemcpy3DPar
* @param [in] attr - the attribute node is set to.
* @param [in] value - const pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr,
const hipKernelNodeAttrValue* value);
@@ -7452,8 +7453,8 @@ hipError_t hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, hipKernelNodeAtt
* @param [in] attr - the attribute node is set to.
* @param [in] value - const pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr,
hipKernelNodeAttrValue* value);
@@ -7464,8 +7465,8 @@ hipError_t hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, hipKernelNodeAtt
* @param [in] node - instance of the node to set parameters to.
* @param [in] pNodeParams - const pointer to the kernel node parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
hipMemcpy3DParms* pNodeParams);
@@ -7482,8 +7483,8 @@ hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
* @param [in] count - the size of the memory to copy.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
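+ *
+ * A minimal usage sketch; `graph`, `d_dst`, `h_src`, and `numBytes` are
+ * assumed to be defined by the surrounding code:
+ * @code{.cpp}
+ * hipGraphNode_t memcpyNode;
+ * hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, d_dst, h_src,
+ *                         numBytes, hipMemcpyHostToDevice);
+ * @endcode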
*/
hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -7498,8 +7499,8 @@ hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph,
* @param [in] count - the size of the memory to copy.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst, const void* src,
size_t count, hipMemcpyKind kind);
@@ -7515,8 +7516,8 @@ hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst, const v
* @param [in] count - the size of the memory to copy.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraphNode_t node,
void* dst, const void* src, size_t count,
@@ -7535,8 +7536,8 @@ hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraph
* @param [in] offset - Offset from start of symbol in bytes.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
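+ *
+ * A minimal sketch; `d_data` is an illustrative `__device__` array and
+ * `h_buf` a host buffer of matching size:
+ * @code{.cpp}
+ * hipGraphNode_t copyNode;
+ * hipGraphAddMemcpyNodeFromSymbol(&copyNode, graph, nullptr, 0, h_buf,
+ *                                 HIP_SYMBOL(d_data), sizeof(d_data), 0,
+ *                                 hipMemcpyDeviceToHost);
+ * @endcode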
*/
hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies,
@@ -7553,8 +7554,8 @@ hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode, hipGraph_
* @param [in] offset - Offset from start of symbol in bytes.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst, const void* symbol,
size_t count, size_t offset, hipMemcpyKind kind);
@@ -7571,8 +7572,8 @@ hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst,
* @param [in] offset - Offset from start of symbol in bytes.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node,
void* dst, const void* symbol, size_t count,
@@ -7591,8 +7592,8 @@ hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec,
* @param [in] offset - Offset from start of symbol in bytes.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddMemcpyNodeToSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies,
@@ -7610,8 +7611,8 @@ hipError_t hipGraphAddMemcpyNodeToSymbol(hipGraphNode_t* pGraphNode, hipGraph_t
* @param [in] offset - Offset from start of symbol in bytes.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemcpyNodeSetParamsToSymbol(hipGraphNode_t node, const void* symbol,
const void* src, size_t count, size_t offset,
@@ -7629,8 +7630,8 @@ hipError_t hipGraphMemcpyNodeSetParamsToSymbol(hipGraphNode_t node, const void*
* @param [in] offset - Offset from start of symbol in bytes.
* @param [in] kind - the type of memory copy.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node,
const void* symbol, const void* src,
@@ -7645,8 +7646,8 @@ hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hi
- * @param [in] numDependencies - the number of the dependencies.
+ * @param [in] numDependencies - the number of dependencies.
* @param [in] pMemsetParams - const pointer to the parameters for the memory set.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
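+ *
+ * A minimal sketch that zero-fills `numElems` 4-byte elements at the
+ * illustrative device pointer `devPtr`:
+ * @code{.cpp}
+ * hipMemsetParams p = {};
+ * p.dst = devPtr;
+ * p.value = 0;
+ * p.elementSize = 4;  // must be 1, 2, or 4 bytes
+ * p.width = numElems;
+ * p.height = 1;       // 1D set
+ * hipGraphNode_t memsetNode;
+ * hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, &p);
+ * @endcode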
*/
hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -7658,8 +7659,8 @@ hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
- * @param [in] node - instane of the node to get parameters from.
+ * @param [in] node - instance of the node to get parameters from.
* @param [out] pNodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemsetNodeGetParams(hipGraphNode_t node, hipMemsetParams* pNodeParams);
@@ -7669,8 +7670,8 @@ hipError_t hipGraphMemsetNodeGetParams(hipGraphNode_t node, hipMemsetParams* pNo
* @param [in] node - instance of the node to set parameters to.
* @param [in] pNodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemsetNodeSetParams(hipGraphNode_t node, const hipMemsetParams* pNodeParams);
@@ -7681,8 +7682,8 @@ hipError_t hipGraphMemsetNodeSetParams(hipGraphNode_t node, const hipMemsetParam
* @param [in] node - instance of the node to set parameters to.
* @param [in] pNodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
const hipMemsetParams* pNodeParams);
@@ -7696,8 +7697,8 @@ hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
- * @param [in] numDependencies - the number of the dependencies.
- * @param [in] pNodeParams -pointer to the parameters.
+ * @param [in] numDependencies - the number of dependencies.
+ * @param [in] pNodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
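+ *
+ * A minimal sketch; `hostCallback` is an illustrative function with the
+ * ::hipHostFn_t signature:
+ * @code{.cpp}
+ * hipHostNodeParams hp = {};
+ * hp.fn = hostCallback;   // void hostCallback(void* userData)
+ * hp.userData = nullptr;
+ * hipGraphNode_t hostNode;
+ * hipGraphAddHostNode(&hostNode, graph, nullptr, 0, &hp);
+ * @endcode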
*/
hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -7709,8 +7710,8 @@ hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
- * @param [in] node - instane of the node to get parameters from.
+ * @param [in] node - instance of the node to get parameters from.
* @param [out] pNodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphHostNodeGetParams(hipGraphNode_t node, hipHostNodeParams* pNodeParams);
@@ -7720,8 +7721,8 @@ hipError_t hipGraphHostNodeGetParams(hipGraphNode_t node, hipHostNodeParams* pNo
* @param [in] node - instance of the node to set parameters to.
* @param [in] pNodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node, const hipHostNodeParams* pNodeParams);
@@ -7732,8 +7733,8 @@ hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node, const hipHostNodeParam
* @param [in] node - instance of the node to set parameters to.
* @param [in] pNodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
const hipHostNodeParams* pNodeParams);
@@ -7747,8 +7748,8 @@ hipError_t hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode
- * @param [in] numDependencies - the number of the dependencies.
+ * @param [in] numDependencies - the number of dependencies.
* @param [in] childGraph - the graph to clone into this node
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
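+ *
+ * A minimal sketch embedding a previously built `childGraph` into
+ * `parentGraph` (both illustrative):
+ * @code{.cpp}
+ * hipGraphNode_t childNode;
+ * hipGraphAddChildGraphNode(&childNode, parentGraph, nullptr, 0, childGraph);
+ * @endcode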
*/
hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -7760,8 +7761,8 @@ hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t grap
- * @param [in] node - instane of the node to get child graph.
+ * @param [in] node - instance of the node to get the child graph from.
* @param [out] pGraph - pointer to get the graph.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph);
@@ -7772,8 +7773,8 @@ hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGrap
* @param [in] node - node from the graph which was used to instantiate graphExec.
* @param [in] childGraph - child graph with updated parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
hipGraph_t childGraph);
@@ -7786,8 +7787,8 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra
- * @param [in] pDependencies - const pointer to the node dependenties.
+ * @param [in] pDependencies - const pointer to the node dependencies.
* @param [in] numDependencies - the number of dependencies.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddEmptyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies);
@@ -7802,8 +7803,8 @@ hipError_t hipGraphAddEmptyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
* @param [in] numDependencies - the number of dependencies.
* @param [in] event - Event for the node.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
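+ *
+ * A minimal sketch:
+ * @code{.cpp}
+ * hipEvent_t event;
+ * hipEventCreate(&event);
+ * hipGraphNode_t recordNode;
+ * hipGraphAddEventRecordNode(&recordNode, graph, nullptr, 0, event);
+ * @endcode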
*/
hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -7815,8 +7816,8 @@ hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t gra
- * @param [in] node - instane of the node to get event from.
+ * @param [in] node - instance of the node to get the event from.
* @param [out] event_out - Pointer to return the event.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphEventRecordNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out);
@@ -7826,8 +7827,8 @@ hipError_t hipGraphEventRecordNodeGetEvent(hipGraphNode_t node, hipEvent_t* even
- * @param [in] node - instane of the node to set event to.
+ * @param [in] node - instance of the node to set the event to.
* @param [in] event - pointer to the event.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphEventRecordNodeSetEvent(hipGraphNode_t node, hipEvent_t event);
@@ -7838,8 +7839,8 @@ hipError_t hipGraphEventRecordNodeSetEvent(hipGraphNode_t node, hipEvent_t event
* @param [in] hNode - node from the graph which was used to instantiate graphExec.
* @param [in] event - pointer to the event.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecEventRecordNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
hipEvent_t event);
@@ -7853,8 +7854,8 @@ hipError_t hipGraphExecEventRecordNodeSetEvent(hipGraphExec_t hGraphExec, hipGra
* @param [in] numDependencies - the number of dependencies.
* @param [in] event - Event for the node.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddEventWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -7867,8 +7868,8 @@ hipError_t hipGraphAddEventWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph
- * @param [in] node - instane of the node to get event from.
+ * @param [in] node - instance of the node to get the event from.
* @param [out] event_out - Pointer to return the event.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphEventWaitNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out);
@@ -7878,8 +7879,8 @@ hipError_t hipGraphEventWaitNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_
- * @param [in] node - instane of the node to set event to.
+ * @param [in] node - instance of the node to set the event to.
* @param [in] event - pointer to the event.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event);
@@ -7890,8 +7891,8 @@ hipError_t hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event);
* @param [in] hNode - node from the graph which was used to instantiate graphExec.
* @param [in] event - pointer to the event.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
hipEvent_t event);
@@ -7905,8 +7906,8 @@ hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraphExec, hipGraph
* @param [in] numDependencies - The number of dependencies
* @param [in] pNodeParams - Node parameters for memory allocation
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
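+ *
+ * A minimal sketch, assuming device 0 and the field names of
+ * ::hipMemAllocNodeParams:
+ * @code{.cpp}
+ * hipMemAllocNodeParams ap = {};
+ * ap.poolProps.allocType = hipMemAllocationTypePinned;
+ * ap.poolProps.location.type = hipMemLocationTypeDevice;
+ * ap.poolProps.location.id = 0;   // device ordinal
+ * ap.bytesize = numBytes;         // illustrative size
+ * hipGraphNode_t allocNode;
+ * hipGraphAddMemAllocNode(&allocNode, graph, nullptr, 0, &ap);
+ * // On success, ap.dptr holds the address the node allocates.
+ * @endcode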
*/
hipError_t hipGraphAddMemAllocNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies, hipMemAllocNodeParams* pNodeParams);
@@ -7917,8 +7918,8 @@ hipError_t hipGraphAddMemAllocNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
* @param [in] node - Memory allocation node for a query
* @param [out] pNodeParams - Parameters for the specified memory allocation node
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemAllocNodeGetParams(hipGraphNode_t node, hipMemAllocNodeParams* pNodeParams);
@@ -7931,8 +7932,8 @@ hipError_t hipGraphMemAllocNodeGetParams(hipGraphNode_t node, hipMemAllocNodePar
* @param [in] numDependencies - The number of dependencies
* @param [in] dev_ptr - Pointer to the memory to be freed
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies, void* dev_ptr);
@@ -7943,8 +7944,8 @@ hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
* @param [in] node - Memory free node for a query
* @param [out] dev_ptr - Device pointer for the specified memory free node
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphMemFreeNodeGetParams(hipGraphNode_t node, void* dev_ptr);
@@ -7955,8 +7956,8 @@ hipError_t hipGraphMemFreeNodeGetParams(hipGraphNode_t node, void* dev_ptr);
- * @param [in] attr - attr to get.
- * @param [out] value - value for specific attr.
+ * @param [in] attr - the memory attribute to get.
+ * @param [out] value - pointer to return the value of the attribute.
* @returns #hipSuccess, #hipErrorInvalidDevice
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
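+ *
+ * A minimal sketch querying the memory currently used by graphs on device 0
+ * (the attribute name is assumed from ::hipGraphMemAttributeType):
+ * @code{.cpp}
+ * size_t usedMem = 0;
+ * hipDeviceGetGraphMemAttribute(0, hipGraphMemAttrUsedMemCurrent, &usedMem);
+ * @endcode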
*/
hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value);
@@ -7967,8 +7968,8 @@ hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType at
- * @param [in] attr - attr to set.
- * @param [in] value - value for specific attr.
+ * @param [in] attr - the memory attribute to set.
+ * @param [in] value - pointer to the value for the attribute.
* @returns #hipSuccess, #hipErrorInvalidDevice
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value);
@@ -7978,8 +7979,8 @@ hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType at
- * @param [in] device - device the memory is used for graphs
+ * @param [in] device - device whose unused graph memory is freed.
* @returns #hipSuccess, #hipErrorInvalidDevice
*
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDeviceGraphMemTrim(int device);
@@ -7992,8 +7993,8 @@ hipError_t hipDeviceGraphMemTrim(int device);
- * @param [in] initialRefcount - reference to resource.
+ * @param [in] initialRefcount - initial reference count for the object.
* @param [in] flags - flags passed to API.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
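+ *
+ * A minimal sketch; `state` and `destroyCb` (a ::hipHostFn_t that frees
+ * `state`) are illustrative, and the flag name is assumed from
+ * the user object flags:
+ * @code{.cpp}
+ * hipUserObject_t obj;
+ * hipUserObjectCreate(&obj, state, destroyCb, 1, hipUserObjectNoDestructorSync);
+ * // ... hand the reference to a graph, or release it when done:
+ * hipUserObjectRelease(obj, 1);
+ * @endcode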
*/
hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn_t destroy,
unsigned int initialRefcount, unsigned int flags);
@@ -8004,8 +8005,8 @@ hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn
- * @param [in] object - pointer to instace of userobj.
- * @param [in] count - reference to resource to be retained.
+ * @param [in] object - pointer to an instance of the user object.
+ * @param [in] count - number of references to release.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count __dparm(1));
@@ -8015,8 +8016,8 @@ hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count __dpa
- * @param [in] object - pointer to instace of userobj.
- * @param [in] count - reference to resource to be retained.
+ * @param [in] object - pointer to an instance of the user object.
+ * @param [in] count - number of references to retain.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipUserObjectRetain(hipUserObject_t object, unsigned int count __dparm(1));
@@ -8028,8 +8029,8 @@ hipError_t hipUserObjectRetain(hipUserObject_t object, unsigned int count __dpar
- * @param [in] count - reference to resource to be retained.
+ * @param [in] count - number of references to retain.
* @param [in] flags - flags passed to API.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphRetainUserObject(hipGraph_t graph, hipUserObject_t object,
unsigned int count __dparm(1), unsigned int flags __dparm(0));
@@ -8041,8 +8042,8 @@ hipError_t hipGraphRetainUserObject(hipGraph_t graph, hipUserObject_t object,
- * @param [in] object - pointer to instace of userobj.
- * @param [in] count - reference to resource to be retained.
+ * @param [in] object - pointer to an instance of the user object.
+ * @param [in] count - number of references to release.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphReleaseUserObject(hipGraph_t graph, hipUserObject_t object,
unsigned int count __dparm(1));
@@ -8054,8 +8055,8 @@ hipError_t hipGraphReleaseUserObject(hipGraph_t graph, hipUserObject_t object,
* @param [in] path - path to write the DOT file.
* @param [in] flags - Flags from hipGraphDebugDotFlags to get additional node information.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOperatingSystem
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
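+ *
+ * A minimal sketch writing the graph topology to a Graphviz file:
+ * @code{.cpp}
+ * hipGraphDebugDotPrint(graph, "graph.dot", 0);
+ * @endcode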
*/
hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags);
@@ -8070,8 +8071,8 @@ hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned in
- * For list of attributes see ::hipKernelNodeAttrID.
+ * For the list of attributes, see ::hipKernelNodeAttrID.
*
* @returns #hipSuccess, #hipErrorInvalidContext
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t hDst);
@@ -8094,8 +8095,8 @@ hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t
* @param [in] isEnabled - Node is enabled if != 0, otherwise the node is disabled.
*
* @returns #hipSuccess, #hipErrorInvalidValue,
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
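+ *
+ * A minimal sketch that skips `node` for one launch without
+ * re-instantiating the graph:
+ * @code{.cpp}
+ * hipGraphNodeSetEnabled(graphExec, node, 0);  // disable
+ * hipGraphLaunch(graphExec, stream);
+ * hipGraphNodeSetEnabled(graphExec, node, 1);  // re-enable
+ * @endcode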
*/
hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
unsigned int isEnabled);
@@ -8116,8 +8117,8 @@ hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNod
* @param [out] isEnabled - Location to return the enabled status of the node.
*
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
unsigned int* isEnabled);
@@ -8131,8 +8132,8 @@ hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNod
- * @param [in] numDependencies - the number of the dependencies.
- * @param [in] nodeParams -pointer to the parameters.
+ * @param [in] numDependencies - the number of dependencies.
+ * @param [in] nodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddExternalSemaphoresWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -8147,8 +8148,8 @@ hipError_t hipGraphAddExternalSemaphoresWaitNode(hipGraphNode_t* pGraphNode, hip
- * @param [in] numDependencies - the number of the dependencies.
- * @param [in] nodeParams -pointer to the parameters.
+ * @param [in] numDependencies - the number of dependencies.
+ * @param [in] nodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphAddExternalSemaphoresSignalNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -8159,8 +8160,8 @@ hipError_t hipGraphAddExternalSemaphoresSignalNode(hipGraphNode_t* pGraphNode, h
* @param [in] hNode - Node from the graph from which graphExec was instantiated.
* @param [in] nodeParams - Pointer to the params to be set.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(hipGraphNode_t hNode,
const hipExternalSemaphoreSignalNodeParams* nodeParams);
@@ -8170,8 +8171,8 @@ hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(hipGraphNode_t hNode,
* @param [in] hNode - Node from the graph from which graphExec was instantiated.
* @param [in] nodeParams - Pointer to the params to be set.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(hipGraphNode_t hNode,
const hipExternalSemaphoreWaitNodeParams* nodeParams);
@@ -8181,8 +8182,8 @@ hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(hipGraphNode_t hNode,
* @param [in] hNode - Node from the graph from which graphExec was instantiated.
* @param [out] params_out - Pointer to params.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(hipGraphNode_t hNode,
hipExternalSemaphoreSignalNodeParams* params_out);
@@ -8192,8 +8193,8 @@ hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(hipGraphNode_t hNode,
* @param [in] hNode - Node from the graph from which graphExec was instantiated.
* @param [out] params_out - Pointer to params.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(hipGraphNode_t hNode,
hipExternalSemaphoreWaitNodeParams* params_out);
@@ -8204,8 +8205,8 @@ hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(hipGraphNode_t hNode,
* @param [in] hNode - Node from the graph from which graphExec was instantiated.
* @param [in] nodeParams - Pointer to the params to be set.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
const hipExternalSemaphoreSignalNodeParams* nodeParams);
@@ -8216,8 +8217,8 @@ hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(hipGraphExec_t hGra
* @param [in] hNode - Node from the graph from which graphExec was instantiated.
* @param [in] nodeParams - Pointer to the params to be set.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
const hipExternalSemaphoreWaitNodeParams* nodeParams);
@@ -8228,8 +8229,8 @@ hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(hipGraphExec_t hGraph
* @param [in] hNode - instance of the node to get parameters from.
* @param [out] nodeParams - pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDrvGraphMemcpyNodeGetParams(hipGraphNode_t hNode, HIP_MEMCPY3D* nodeParams);
@@ -8239,8 +8240,8 @@ hipError_t hipDrvGraphMemcpyNodeGetParams(hipGraphNode_t hNode, HIP_MEMCPY3D* no
- * @param [in] hNode - instance of the node to Set parameters for.
- * @param [out] nodeParams - pointer to the parameters.
+ * @param [in] hNode - instance of the node to set parameters for.
+ * @param [in] nodeParams - const pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDrvGraphMemcpyNodeSetParams(hipGraphNode_t hNode, const HIP_MEMCPY3D* nodeParams);
@@ -8254,8 +8255,8 @@ hipError_t hipDrvGraphMemcpyNodeSetParams(hipGraphNode_t hNode, const HIP_MEMCPY
* @param [in] memsetParams - const pointer to the parameters for the memory set.
- * @param [in] ctx - cotext related to current device.
+ * @param [in] ctx - context related to the current device.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
const hipGraphNode_t* dependencies, size_t numDependencies,
@@ -8270,8 +8271,8 @@ hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGra
* @param [in] numDependencies - The number of dependencies
* @param [in] dptr - Pointer to the memory to be freed
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
const hipGraphNode_t* dependencies, size_t numDependencies,
@@ -8285,8 +8286,8 @@ hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGr
* @param [in] copyParams - const pointer to the memcpy node params.
- * @param [in] ctx - cotext related to current device.
+ * @param [in] ctx - context related to the current device.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
const HIP_MEMCPY3D* copyParams, hipCtx_t ctx);
@@ -8299,8 +8300,8 @@ hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGrap
* @param [in] memsetParams - pointer to the parameters.
- * @param [in] ctx - cotext related to current device.
+ * @param [in] ctx - context related to the current device.
* @returns #hipSuccess, #hipErrorInvalidValue
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*/
hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
const HIP_MEMSET_NODE_PARAMS* memsetParams, hipCtx_t ctx);
@@ -8328,10 +8329,10 @@ hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGrap
* @param [in] devPtr - starting address of the range.
* @param [in] size - size of the range.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemAddressFree(void* devPtr, size_t size);
@@ -8344,10 +8345,10 @@ hipError_t hipMemAddressFree(void* devPtr, size_t size);
* @param [in] addr - requested starting address of the range.
* @param [in] flags - currently unused, must be zero.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void* addr, unsigned long long flags);
@@ -8359,10 +8360,10 @@ hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void*
* @param [in] prop - properties of the allocation.
* @param [in] flags - currently unused, must be zero.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, const hipMemAllocationProp* prop, unsigned long long flags);
@@ -8374,10 +8375,10 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, co
* @param [in] handleType - type of the shareable handle.
* @param [in] flags - currently unused, must be zero.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemExportToShareableHandle(void* shareableHandle, hipMemGenericAllocationHandle_t handle, hipMemAllocationHandleType handleType, unsigned long long flags);
@@ -8388,10 +8389,10 @@ hipError_t hipMemExportToShareableHandle(void* shareableHandle, hipMemGenericAll
* @param [in] location - target location.
* @param [in] ptr - address to check the access flags.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* location, void* ptr);
@@ -8402,10 +8403,10 @@ hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* loca
* @param [in] prop - location properties.
* @param [in] option - determines which granularity to return.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*
*/
hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAllocationProp* prop, hipMemAllocationGranularity_flags option);
@@ -8416,10 +8417,10 @@ hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAlloc
* @param [out] prop - properties of the given handle.
* @param [in] handle - handle to perform the query on.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop, hipMemGenericAllocationHandle_t handle);
@@ -8430,10 +8431,10 @@ hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop, h
* @param [in] osHandle - shareable handle representing the memory allocation.
* @param [in] shHandleType - handle type.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* handle, void* osHandle, hipMemAllocationHandleType shHandleType);
@@ -8446,10 +8447,10 @@ hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* hand
* @param [in] handle - memory allocation to be mapped.
* @param [in] flags - currently unused, must be zero.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
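+ *
+ * An end-to-end sketch of the virtual memory management flow on device 0;
+ * the enum and struct field names are assumed to match their documented
+ * definitions:
+ * @code{.cpp}
+ * hipMemAllocationProp prop = {};
+ * prop.type = hipMemAllocationTypePinned;
+ * prop.location.type = hipMemLocationTypeDevice;
+ * prop.location.id = 0;
+ * size_t granularity = 0;
+ * hipMemGetAllocationGranularity(&granularity, &prop,
+ *                                hipMemAllocationGranularityMinimum);
+ * size_t size = granularity;  // sizes must be granularity multiples
+ * hipMemGenericAllocationHandle_t handle;
+ * hipMemCreate(&handle, size, &prop, 0);
+ * void* va = nullptr;
+ * hipMemAddressReserve(&va, size, 0, nullptr, 0);
+ * hipMemMap(va, size, 0, handle, 0);
+ * hipMemAccessDesc access = {};
+ * access.location = prop.location;
+ * access.flags = hipMemAccessFlagsProtReadWrite;
+ * hipMemSetAccess(va, size, &access, 1);
+ * // ... use va ...
+ * hipMemUnmap(va, size);
+ * hipMemRelease(handle);
+ * hipMemAddressFree(va, size);
+ * @endcode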
*/
hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocationHandle_t handle, unsigned long long flags);
@@ -8460,10 +8461,10 @@ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocat
* @param [in] count - number of hipArrayMapInfo in mapInfoList.
* @param [in] stream - stream identifier for the stream to use for map or unmap operations.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count, hipStream_t stream);
@@ -8472,10 +8473,10 @@ hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count
*
* @param [in] handle - handle of the memory allocation.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle);
@@ -8485,10 +8486,10 @@ hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle);
* @param [out] handle - handle representing addr.
* @param [in] addr - address to look up.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle, void* addr);
@@ -8500,10 +8501,10 @@ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle,
* @param [in] desc - array of hipMemAccessDesc.
* @param [in] count - number of hipMemAccessDesc in desc.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, size_t count);
@@ -8513,10 +8514,10 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc,
* @param [in] ptr - starting address of the range to unmap.
* @param [in] size - size of the virtual address range.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported
- * @warning : This API is marked as beta, meaning, while this is feature complete,
- * it is still open to changes and may have outstanding issues.
+ * @warning This API is marked as Beta. While this feature is complete, it can
+ * change and might have outstanding issues.
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
hipError_t hipMemUnmap(void* ptr, size_t size);
@@ -8637,7 +8638,7 @@ hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);
#ifdef __cplusplus
#if defined(__clang__) && defined(__HIP__)
template <typename T>
-static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, reinterpret_cast<const void*>(f), dynSharedMemPerBlk, blockSizeLimit);
}
@@ -8932,12 +8933,16 @@ return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize,(hipFunction_t)kern
*
* @ingroup Execution
*
- * @param [in] f device function symbol
- * @param [in] gridDim grid dimentions
- * @param [in] blockDim block dimentions
- * @param [in] kernelParams kernel parameters
- * @param [in] sharedMemBytes shared memory in bytes
- * @param [in] stream stream on which kernel launched
+ * @tparam T The type of the kernel function.
+ *
+ * @param [in] f Kernel function to launch.
+ * @param [in] gridDim Grid dimensions specified as a multiple of blockDim.
+ * @param [in] blockDim Block dimensions specified in work-items.
+ * @param [in] kernelParams A list of kernel arguments.
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for
+ * this kernel. The HIP-Clang compiler provides
+ * support for extern shared declarations.
+ * @param [in] stream Stream on which the kernel is launched.
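+ *
+ * A minimal launch sketch; `myKernel`, its arguments `d_data` and `n`, and
+ * `stream` are illustrative:
+ * @code{.cpp}
+ * void* args[] = {&d_data, &n};
+ * hipLaunchCooperativeKernel(myKernel, dim3(numBlocks),
+ *                            dim3(threadsPerBlock), args, 0, stream);
+ * @endcode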
*
* @return #hipSuccess, #hipErrorLaunchFailure, #hipErrorInvalidValue,
* #hipErrorInvalidResourceHandle
@@ -8950,14 +8955,14 @@ inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
blockDim, kernelParams, sharedMemBytes, stream);
}
/**
- * @brief Launches device function on multiple devices where thread blocks can cooperate and
- * synchronize on execution.
+ * @brief Launches kernel function on multiple devices, where thread blocks can
+ * cooperate and synchronize on execution.
*
* @ingroup Execution
*
- * @param [in] launchParamsList list of kernel launch parameters, one per device
- * @param [in] numDevices size of launchParamsList array
- * @param [in] flags flag to handle launch behavior
+ * @param [in] launchParamsList List of kernel launch parameters, one per device.
+ * @param [in] numDevices Size of launchParamsList array.
+ * @param [in] flags Flag to handle launch behavior.
*
* @return #hipSuccess, #hipErrorLaunchFailure, #hipErrorInvalidValue,
* #hipErrorInvalidResourceHandle
@@ -9210,7 +9215,7 @@ static inline hipError_t hipUnbindTexture(
*
* @see hipMallocFromPoolAsync
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
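+ *
+ * A minimal sketch; the allocation and free are ordered on `stream`:
+ * @code{.cpp}
+ * void* devPtr = nullptr;
+ * hipMallocAsync(&devPtr, numBytes, stream);
+ * // ... enqueue kernels that use devPtr on the same stream ...
+ * hipFreeAsync(devPtr, stream);
+ * @endcode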
*/
static inline hipError_t hipMallocAsync(
void** dev_ptr,
@@ -9227,7 +9232,7 @@ static inline hipError_t hipMallocAsync(
*
* @see hipMallocFromPoolAsync
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
template <typename T>
static inline hipError_t hipMallocAsync(
@@ -9245,7 +9250,7 @@ static inline hipError_t hipMallocAsync(
*
* @see hipMallocFromPoolAsync
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
template <typename T>
static inline hipError_t hipMallocAsync(
@@ -9262,7 +9267,7 @@ static inline hipError_t hipMallocAsync(
*
* @see hipMallocFromPoolAsync
*
- * @note This API is implemented on Linux, under development on Windows.
+ * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/
template <typename T>
static inline hipError_t hipMallocFromPoolAsync(