diff --git a/.wordlist.txt b/.wordlist.txt index 2488764c1d..cb743c85a5 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -65,7 +65,9 @@ HIPRTC icc IILE iGPU +inlined inplace +interop Interoperation interoperate Intrinsics @@ -123,6 +125,7 @@ scalarizing sceneries shaders SIMT +SOMA SPMV structs SYCL diff --git a/docs/.gitignore b/docs/.gitignore index 53b7787fbd..f43f04af9f 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -5,4 +5,4 @@ /_templates /doxygen/html /doxygen/xml -/sphinx/_toc.yml +/sphinx/_toc.yml \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 82bcefee89..2db96905c9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -47,7 +47,6 @@ numfig = False - exclude_patterns = [ "doxygen/mainpage.md", "understand/glossary.md" diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio b/docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.drawio similarity index 100% rename from docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio rename to docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.drawio diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg b/docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.svg similarity index 100% rename from docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg rename to docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.svg diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio b/docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.drawio similarity index 100% rename from docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio rename to docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.drawio diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg 
b/docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.svg similarity index 100% rename from docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg rename to docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.svg diff --git a/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.drawio b/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.drawio new file mode 100644 index 0000000000..602c7e501d --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.drawio @@ -0,0 +1,106 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg b/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg new file mode 100644 index 0000000000..8ffb8aa965 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg @@ -0,0 +1 @@ +Pageable data transfer
Pinned memory
Pinned memory
Pinned data transfer
Pageable memory
Pageable memory
Device memory
Device memory
Pinned memory
Pinned memory
Device memory
Device memory
Host
Host
Device
Device
Device
Device
Host
Host
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/understand/textures/border.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/border.png similarity index 100% rename from docs/data/understand/textures/border.png rename to docs/data/how-to/hip_runtime_api/memory_management/textures/border.png diff --git a/docs/data/understand/textures/clamp.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/clamp.png similarity index 100% rename from docs/data/understand/textures/clamp.png rename to docs/data/how-to/hip_runtime_api/memory_management/textures/clamp.png diff --git a/docs/data/understand/textures/linear.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/linear.png similarity index 100% rename from docs/data/understand/textures/linear.png rename to docs/data/how-to/hip_runtime_api/memory_management/textures/linear.png diff --git a/docs/data/understand/textures/mirror.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/mirror.png similarity index 100% rename from docs/data/understand/textures/mirror.png rename to docs/data/how-to/hip_runtime_api/memory_management/textures/mirror.png diff --git a/docs/data/understand/textures/nearest.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/nearest.png similarity index 100% rename from docs/data/understand/textures/nearest.png rename to docs/data/how-to/hip_runtime_api/memory_management/textures/nearest.png diff --git a/docs/data/understand/textures/original.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/original.png similarity index 100% rename from docs/data/understand/textures/original.png rename to docs/data/how-to/hip_runtime_api/memory_management/textures/original.png diff --git a/docs/data/understand/textures/wrap.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/wrap.png similarity index 100% rename from docs/data/understand/textures/wrap.png rename to 
docs/data/how-to/hip_runtime_api/memory_management/textures/wrap.png diff --git a/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.drawio b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.drawio new file mode 100644 index 0000000000..aa6c1bedba --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.drawio @@ -0,0 +1,1880 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg new file mode 100644 index 0000000000..3941cc635b --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg @@ -0,0 +1,9 @@ +Explicit Memory Management
CPU cores
CPU cores +
CPU
CPU +
GPU Memory
(HBM)
GPU Memory...
Unified Memory Management
Unified Memory
Unified Memory
CPU Memory (RAM)
CPU Memory (RAM)
GPU
GPU +
GPU cores
GPU cores +
GPU
GPU +
CPU cores
CPU cores +
CPU
CPU +
GPU cores
GPU cores +
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um_old.drawio b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um_old.drawio new file mode 100644 index 0000000000..0aeb6695a9 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um_old.drawio @@ -0,0 +1,1875 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um_old.svg b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um_old.svg new file mode 100644 index 0000000000..2798637bcb --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um_old.svg @@ -0,0 +1,5 @@ +Explicit Memory Management
CPU cores
CPU cores +
CPU
CPU +
GPU Memory
(HBM)
GPU Memory...
Unified Memory Management
Unified Memory
(HBM)
Unified Memory...
GPU cores
GPU cores
GPU cores
GPU cores +
CPU cores
CPU cores
GPU
GPU +
APU
APU
CPU Memory
(RAM)
CPU Memory...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/runtimes.drawio b/docs/data/how-to/hip_runtime_api/runtimes.drawio new file mode 100644 index 0000000000..ee1425b2ae --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/runtimes.drawio @@ -0,0 +1,130 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/runtimes.svg b/docs/data/how-to/hip_runtime_api/runtimes.svg new file mode 100644 index 0000000000..12edbdf831 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/runtimes.svg @@ -0,0 +1,2 @@ +Runtimes
HIP Runtime API
HIP Runtime API
CUDA Driver API
CUDA Driver API
CUDA runtime
CUDA runtime
ROCr runtime
ROCr runtime
PAL
PAL
CLR
CLR
AMD Platform
AMD Platform +
NVIDIA Platform
NVIDIA Platform
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/stream_management.drawio b/docs/data/how-to/hip_runtime_api/stream_management.drawio new file mode 100644 index 0000000000..2b443fe3f0 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/stream_management.drawio @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/stream_management.svg b/docs/data/how-to/hip_runtime_api/stream_management.svg new file mode 100644 index 0000000000..c7a05657f1 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/stream_management.svg @@ -0,0 +1 @@ +Stream 1
Kernel A
Kernel A
Stream 2
Memory Copy
Memory Copy
hipDeviceSynchronize
hipDeviceSynchronize
Kernel B
Kernel B
Kernel C
Kernel C
Memory Copy
Memory Copy
Memory Copy
Memory Copy
Memory Copy
Memory Copy
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/unified_memory/um.drawio b/docs/data/unified_memory/um.drawio deleted file mode 100644 index fac74f4b60..0000000000 --- a/docs/data/unified_memory/um.drawio +++ /dev/null @@ -1,1878 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
diff --git a/docs/data/unified_memory/um.svg b/docs/data/unified_memory/um.svg deleted file mode 100644 index 748949b271..0000000000 --- a/docs/data/unified_memory/um.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -Explicit Memory Management
CPU cores
CPU cores
CPU
CPU
GPU Memory
(HBM)
GPU Memory...
Unified Memory Management
Unified Memory
(HBM)
Unified Memory...
GPU cores
GPU cores
CPU Memory (DRAM)
CPU Memory (DR...
GPU cores
GPU cores
CPU cores
CPU cores
GPU
GPU
APU
APU
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/how-to/hip_runtime_api.rst b/docs/how-to/hip_runtime_api.rst new file mode 100644 index 0000000000..cad6c38bdc --- /dev/null +++ b/docs/how-to/hip_runtime_api.rst @@ -0,0 +1,37 @@ +.. meta:: + :description: This chapter describes the HIP runtime API and shows + how to use it in AMD HIP. + :keywords: AMD, ROCm, HIP, CUDA, HIP runtime API How to, + +.. _hip_runtime_api_how-to: + +******************************************************************************** +HIP Runtime API +******************************************************************************** + +The HIP runtime API provides C and C++ functionality to manage GPUs, like event, +stream and memory management. On AMD platforms the HIP runtime uses the +:doc:`Common Language Runtime (CLR) `, while on NVIDIA +platforms it is only a thin layer over the CUDA runtime or Driver API. + +- **CLR** contains source code for AMD's compute language runtimes: ``HIP`` and + ``OpenCL™``. CLR includes the implementation of the ``HIP`` language on the + AMD platform `hipamd `_ and + the Radeon Open Compute Common Language Runtime (rocclr). rocclr is a virtual + device interface that enables the HIP runtime to interact with different + backends such as ROCr on Linux or PAL on Windows. CLR also includes the + implementation of the `OpenCL runtime `_. +- The **CUDA runtime** is built on top of the CUDA driver API, which is a C API + with lower-level access to NVIDIA GPUs. For further information about the CUDA + driver and runtime API and its relation to HIP check the :doc:`CUDA driver API porting guide`. + On non-AMD platforms, the HIP runtime determines if CUDA is available and can be + used. + +The relation between the different runtimes and their backends is presented in +the following figure. + +.. figure:: ../data/how-to/hip_runtime_api/runtimes.svg + +.. note:: + + The CUDA-specific headers can be found in the `hipother repository `_. 
diff --git a/docs/how-to/cooperative_groups.rst b/docs/how-to/hip_runtime_api/cooperative_groups.rst similarity index 95% rename from docs/how-to/cooperative_groups.rst rename to docs/how-to/hip_runtime_api/cooperative_groups.rst index 370d6dc729..9acc5da149 100644 --- a/docs/how-to/cooperative_groups.rst +++ b/docs/how-to/hip_runtime_api/cooperative_groups.rst @@ -8,9 +8,16 @@ Cooperative groups ******************************************************************************* -Cooperative groups API is an extension to the HIP programming model, which provides developers with a flexible, dynamic grouping mechanism for the communicating threads. Cooperative groups let you define your own set of thread groups which may fit your user-cases better than those defined by the hardware. This lets you specify the level of granularity for thread communication which can lead to more efficient parallel decompositions. +Cooperative groups API is an extension to the HIP programming model, which +provides developers with a flexible, dynamic grouping mechanism for the +communicating threads. Cooperative groups let you define your own set of thread +groups which may fit your user-cases better than those defined by the hardware. +This lets you specify the level of granularity for thread communication which +can lead to more efficient parallel decompositions. -The API is accessible in the ``cooperative_groups`` namespace after the ``hip_cooperative_groups.h`` is included. The header contains the following elements: +The API is accessible in the ``cooperative_groups`` namespace after the +``hip_cooperative_groups.h`` is included. The header contains the following +elements: * Static functions to create groups and subgroups. * Hardware-accelerated operations over the whole group, like shuffles. @@ -19,13 +26,13 @@ The API is accessible in the ``cooperative_groups`` namespace after the ``hip_c * Get group properties member functions. 
Cooperative groups thread model -=============================== +================================================================================ The thread hierarchy abstraction of cooperative groups are in :ref:`grid hierarchy ` and :ref:`block hierarchy `. .. _coop_thread_top_hierarchy: -.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg +.. figure:: ../../data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.svg :alt: Diagram depicting nested rectangles of varying color. The outermost one titled "Grid", inside sets of different sized rectangles layered on one another titled "Block". Each "Block" containing sets of uniform @@ -48,7 +55,7 @@ The **block** is the same as the :ref:`inherent_thread_model` block entity. .. _coop_thread_bottom_hierarchy: -.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg +.. figure:: ../../data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.svg :alt: The new level between block thread and threads. Cooperative group thread hierarchy in blocks. diff --git a/docs/how-to/hip_runtime_api/memory_management.rst b/docs/how-to/hip_runtime_api/memory_management.rst new file mode 100644 index 0000000000..59d7ae1ed5 --- /dev/null +++ b/docs/how-to/hip_runtime_api/memory_management.rst @@ -0,0 +1,406 @@ +.. meta:: + :description: This chapter introduces memory management and shows how to use + it in AMD HIP. + :keywords: AMD, ROCm, HIP, CUDA, memory management + +******************************************************************************** +Memory management +******************************************************************************** + +Memory management is an important part of the HIP runtime API, when creating +high-performance applications. Both allocating and copying memory can result in +bottlenecks, which can significantly impact performance. 
+ +The programming model is based on a system with a host and a device, each having +its own distinct memory. Kernels execute mainly on device memory, the runtime +offers functions for allocating, deallocating, and copying device memory, along +with transferring data between host and device memory. + +Device memory +================================================================================ + +Device memory exists on the device, e.g. on GPUs in the video random access +memory (VRAM), and is accessible by the kernels operating on the device. Recent +architectures use graphics double data rate (GDDR) synchronous dynamic +random-access memory (SDRAM) such as GDDR6, or high-bandwidth memory (HBM) such +as HBM2e. Device memory can be allocated as global memory, constant, texture or +surface memory. + +Global memory +-------------------------------------------------------------------------------- + +Read-write storage visible to all threads on a given device. There are +specialized versions of global memory with different usage semantics which are +typically backed by the same hardware, but can use different caching paths. + +Constant memory +-------------------------------------------------------------------------------- + +Read-only storage visible to all threads on a given device. It is a limited +segment backed by device memory with queryable size. It needs to be set by the +host before kernel execution. Constant memory provides the best performance +benefit when all threads within a warp access the same address. + +Texture memory +-------------------------------------------------------------------------------- + +Read-only storage visible to all threads on a given device and accessible +through additional APIs. Its origins come from graphics APIs, and provides +performance benefits when accessing memory in a pattern where the +addresses are close to each other in a 2D representation of the memory. 
+ +The texture management module of the HIP runtime API contains the functions for +texture memory. + +Surface memory +-------------------------------------------------------------------------------- + +A read-write version of texture memory, which can be useful for applications +that require direct manipulation of 1D, 2D, or 3D hipArray_t. + +The surface objects module of the HIP runtime API contains the functions to +create, destroy, read and write surface memory. + +Host Memory +================================================================================ + +Host memory resides on the host (e.g. CPU) of the machine, in random access memory +(RAM). Host memory has three main types in HIP: + +* Pageable memory +* Pinned memory +* Unified Memory + +These different types of memory are suited to different use cases. Pageable and +pinned memory use explicit memory management, where the +developers have direct control over memory operations, while with unified +memory the developer gets a simplified memory model with less control over +low-level memory operations. + +The data transfer differences between pageable and pinned memory usage are +represented in the following figure. + +.. figure:: ../../data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg + +The main difference between unified memory management and explicit memory management +is highlighted in the following figure. + +.. figure:: ../../data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg + +Unified memory management is described in :doc:`/how-to/hip_runtime_api/memory_management/unified_memory`. + +Pageable memory +-------------------------------------------------------------------------------- + +Pageable memory exists on "pages" (blocks of memory), which can be +migrated to other memory storage. 
For example, migrating memory between CPU +sockets on a motherboard, or a system that runs out of space in RAM and starts +dumping pages of RAM into the swap partition of your hard drive. + +Pageable memory is usually obtained when calling ``malloc`` or ``new`` in a C++ +application. The following example shows the pageable host memory usage in HIP. + +.. code-block:: cpp + + #define ELEMENT_NUMBER 100 + + int *host_input, *host_output; + // Host allocation + host_input = new int[ELEMENT_NUMBER]; + host_output = new int[ELEMENT_NUMBER]; + + // Host data preparation + for (int i = 0; i < ELEMENT_NUMBER; i++) { + host_input[i] = i; + } + memset(host_output, 0, ELEMENT_NUMBER * sizeof(int)); + + int *device_input, *device_output; + + // Device allocation + hipMalloc((int **)&device_input, ELEMENT_NUMBER * sizeof(int)); + hipMalloc((int **)&device_output, ELEMENT_NUMBER * sizeof(int)); + + // Device data preparation + hipMemcpy(device_input, host_input, ELEMENT_NUMBER * sizeof(int), hipMemcpyHostToDevice); + hipMemset(device_output, 0, ELEMENT_NUMBER * sizeof(int)); + + // Run the kernel + ... + + hipMemcpy(host_output, device_output, ELEMENT_NUMBER * sizeof(int), hipMemcpyDeviceToHost); + + // Free host memory + delete[] host_input; + delete[] host_output; + + // Free device memory + hipFree(device_input); + hipFree(device_output); + +.. note:: + + :cpp:func:`hipMalloc` and :cpp:func:`hipFree` are blocking calls, however, HIP + recently added non-blocking versions :cpp:func:`hipMallocAsync` and + :cpp:func:`hipFreeAsync` which take in a stream as an additional argument. + +Pinned memory +-------------------------------------------------------------------------------- + +Pinned memory (or page-locked memory, or non-pageable memory) is host memory +that is mapped into the address space of all GPUs, meaning that the pointer can +be used on both host and device. Accessing host-resident pinned memory in device +kernels is generally not recommended for performance, as it can force the data +to traverse the host-device interconnect (e.g. 
PCIe), which is much slower than +the on-device bandwidth (>40x on MI200). + +Much like how a process can be locked to a CPU core by setting affinity, a +pinned memory allocator does this with the memory storage system. On multi-socket +systems it is important to ensure that pinned memory is located on the same +socket as the owning process, or else each cache line will be moved through the +CPU-CPU interconnect, thereby increasing latency and potentially decreasing +bandwidth. + +Advantage of pinned memory is the improved transfer times between host and +device. For transfer operations, such as :cpp:func:`hipMemcpy` or :cpp:func:`hipMemcpyAsync`, +using pinned memory instead of pageable memory on host can lead to a ~3x +improvement in bandwidth. + +Disadvantage of pinned memory that it reduces the available RAM for other +operations, such as paging, which can negatively impact the overall performance +of the host. + +The example code how to use pinned memory in HIP showed at the following example. + +.. code-block:: cpp + + #define ELEMENT_NUMBER 100 + + int *host_input, *host_output; + // Host allocation + hipHostMalloc((int **)&host_input, ELEMENT_NUMBER * sizeof(int)); + hipHostMalloc((int **)&host_output, ELEMENT_NUMBER * sizeof(int)); + + // Host data preparation + for (int i = 0; i < ELEMENT_NUMBER; i++) { + host_input[i] = i; + } + memset(host_output, 0, ELEMENT_NUMBER * sizeof(int)); + + int *device_input, *device_output; + + // Device allocation + hipMalloc((int **)&device_input, ELEMENT_NUMBER * sizeof(int)); + hipMalloc((int **)&device_output, ELEMENT_NUMBER * sizeof(int)); + + // Device data preparation + hipMemcpy(device_input, host_input, ELEMENT_NUMBER * sizeof(int), cudaMemcpyHostToDevice); + hipMemset() + + // Run the kernel + ... 
hipMemcpy(host_output, device_output, ELEMENT_NUMBER * sizeof(int), hipMemcpyDeviceToHost);
All allocation flags are independent and can be used in most combinations
note:: + + In order to achieve this fine-grained coherence, many AMD GPUs use a limited + cache policy, such as leaving these allocations uncached by the GPU, or making + them read-only. + +.. TODO: Is this still valid? What about Mi300? +Developers should use coarse-grained coherence where they can to reduce +host-device interconnect communication and also Mi200 accelerators hardware +based floating point instructions are working on coarse grained memory regions. + +The availability of fine- and coarse-grained memory pools can be checked with +``rocminfo``. + +.. list-table:: Memory coherence control + :widths: 25, 35, 20, 20 + :header-rows: 1 + :align: center + + * - API + - Flag + - :cpp:func:`hipMemAdvise` call with argument + - Coherence + * - ``hipHostMalloc`` + - ``hipHostMallocDefault`` + - + - Fine-grained + * - ``hipHostMalloc`` + - ``hipHostMallocNonCoherent`` :sup:`1` + - + - Coarse-grained + * - ``hipExtMallocWithFlags`` + - ``hipDeviceMallocDefault`` + - + - Coarse-grained + * - ``hipExtMallocWithFlags`` + - ``hipDeviceMallocFinegrained`` + - + - Fine-grained + * - ``hipMallocManaged`` + - + - + - Fine-grained + * - ``hipMallocManaged`` + - + - ``hipMemAdviseSetCoarseGrain`` + - Coarse-grained + * - ``malloc`` + - + - + - Fine-grained + * - ``malloc`` + - + - ``hipMemAdviseSetCoarseGrain`` + - Coarse-grained + +:sup:`1` The :cpp:func:`hipHostMalloc` memory allocation coherence mode can be +affected by the ``HIP_HOST_COHERENT`` environment variable, if the +``hipHostMallocCoherent=0``, ``hipHostMallocNonCoherent=0``, +``hipHostMallocMapped=0`` and one of the other flag is set to 1. At this case, +if the ``HIP_HOST_COHERENT`` is not defined, or defined as 0, the host memory +allocation is coarse-grained. + +.. note:: + + * At ``hipHostMallocMapped=1`` case the allocated host memory is + fine-grained and the ``hipHostMallocNonCoherent`` flag is ignored. + * The ``hipHostMallocCoherent=1`` and ``hipHostMallocNonCoherent=1`` state is + illegal. 
+ +Visibility of synchronization functions +-------------------------------------------------------------------------------- + +The fine-grained coherence memory is visible at synchronization points, however +at coarse-grained coherence, it depends on the used synchronization function. +The synchronization functions effect and visibility on different coherence +memory types collected in the following table. + +.. list-table:: HIP API + + * - HIP API + - ``hipStreamSynchronize`` + - ``hipDeviceSynchronize`` + - ``hipEventSynchronize`` + - ``hipStreamWaitEvent`` + * - Synchronization Effect + - host waits for all commands in the specified stream to complete + - host waits for all commands in all streams on the specified device to complete + - host waits for the specified event to complete + - stream waits for the specified event to complete + * - Fence + - system-scope release + - system-scope release + - system-scope release + - none + * - Fine-grained host memory visibility + - yes + - yes + - yes + - yes + * - Coarse-grained host memory visibility + - yes + - yes + - depends on the used event. + - no + +Developers can control the release scope for :cpp:func:`hipEvents`: + +* By default, the GPU performs a device-scope acquire and release operation + with each recorded event. This will make host and device memory visible to + other commands executing on the same device. + +A stronger system-level fence can be specified when the event is created with +:cpp:func:`hipEventCreateWithFlags`: + +* :cpp:func:`hipEventReleaseToSystem`: Perform a system-scope release operation + when the event is recorded. This will make **both fine-grained and + coarse-grained host memory visible to other agents in the system**, but may + involve heavyweight operations such as cache flushing. Fine-grained memory + will typically use lighter-weight in-kernel synchronization mechanisms such as + an atomic operation and thus does not need to use. + :cpp:func:`hipEventReleaseToSystem`. 
+* :cpp:func:`hipEventDisableTiming`: Events created with this flag will not + record profiling data and provide the best performance if used for + synchronization. diff --git a/docs/understand/texture_fetching.rst b/docs/how-to/hip_runtime_api/memory_management/texture_fetching.rst similarity index 91% rename from docs/understand/texture_fetching.rst rename to docs/how-to/hip_runtime_api/memory_management/texture_fetching.rst index 498e5723f3..4b93748e39 100644 --- a/docs/understand/texture_fetching.rst +++ b/docs/how-to/hip_runtime_api/memory_management/texture_fetching.rst @@ -7,7 +7,7 @@ Texture fetching ******************************************************************************* -`Textures <../doxygen/html/group___texture.html>`_ are more than just a buffer +`Textures <../../../doxygen/html/group___texture.html>`_ are more than just a buffer interpreted as a 1D, 2D, or 3D array. As textures are associated with graphics, they are indexed using floating-point @@ -32,7 +32,7 @@ sections. Here is the sample texture used in this document for demonstration purposes. It is 2x2 texels and indexed in the [0 to 1] range. -.. figure:: ../data/understand/textures/original.png +.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/textures/original.png :width: 150 :alt: Sample texture :align: center @@ -66,7 +66,7 @@ The following image shows a texture stretched to a 4x4 pixel quad but still indexed in the [0 to 1] range. The in-between values are the same as the values of the nearest texel. -.. figure:: ../data/understand/textures/nearest.png +.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/textures/nearest.png :width: 300 :alt: Texture upscaled with nearest point sampling :align: center @@ -97,7 +97,7 @@ This following image shows a texture stretched out to a 4x4 pixel quad, but still indexed in the [0 to 1] range. The in-between values are interpolated between the neighboring texels. -.. 
figure:: ../data/understand/textures/linear.png +.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/textures/linear.png :width: 300 :alt: Texture upscaled with linear filtering :align: center @@ -124,7 +124,7 @@ bounds. The border value must be set before texture fetching. The following image shows the texture on a 4x4 pixel quad, indexed in the [0 to 3] range. The out-of-bounds values are the border color, which is yellow. -.. figure:: ../data/understand/textures/border.png +.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/textures/border.png :width: 300 :alt: Texture with yellow border color :align: center @@ -147,7 +147,7 @@ The following image shows the texture on a 4x4 pixel quad, indexed in the [0 to 3] range. The out-of-bounds values are repeating the values at the edge of the texture. -.. figure:: ../data/understand/textures/clamp.png +.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/textures/clamp.png :width: 300 :alt: Texture with clamp addressing :align: center @@ -172,7 +172,7 @@ This creates a repeating image effect. The following image shows the texture on a 4x4 pixel quad, indexed in the [0 to 3] range. The out-of-bounds values are repeating the original texture. -.. figure:: ../data/understand/textures/wrap.png +.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/textures/wrap.png :width: 300 :alt: Texture with wrap addressing :align: center @@ -201,7 +201,7 @@ The following image shows the texture on a 4x4 pixel quad, indexed in The [0 to 3] range. The out-of-bounds values are repeating the original texture, but mirrored. -.. figure:: ../data/understand/textures/mirror.png +.. 
figure:: ../../../data/how-to/hip_runtime_api/memory_management/textures/mirror.png :width: 300 :alt: Texture with mirror addressing :align: center diff --git a/docs/how-to/unified_memory.rst b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst similarity index 99% rename from docs/how-to/unified_memory.rst rename to docs/how-to/hip_runtime_api/memory_management/unified_memory.rst index f64189454c..3eeba0e171 100644 --- a/docs/how-to/unified_memory.rst +++ b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst @@ -17,12 +17,13 @@ and promise increased efficiency and innovation. Unified memory ============== + Unified Memory is a single memory address space accessible from any processor within a system. This setup simplifies memory management processes and enables applications to allocate data that can be read or written by code running on either CPUs or GPUs. The Unified memory model is shown in the following figure. -.. figure:: ../data/unified_memory/um.svg +.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/unified_memory/um_old.svg AMD Accelerated Processing Unit (APU) is a typical example of a Unified Memory Architecture. On a single die, a central processing unit (CPU) is combined @@ -35,6 +36,7 @@ throughput (data processed by unit time). System requirements =================== + Unified memory is supported on Linux by all modern AMD GPUs from the Vega series onward. Unified memory management can be achieved with managed memory allocation and, for the latest GPUs, with a system allocator. @@ -108,6 +110,7 @@ system requirements` and :ref:`checking unified memory management support`. Checking unified memory management support ------------------------------------------ + Some device attributes can offer information about which :ref:`unified memory programming models` are supported. The attribute value is 1 if the functionality is supported, and 0 if it is not supported. 
diff --git a/docs/how-to/virtual_memory.rst b/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst similarity index 89% rename from docs/how-to/virtual_memory.rst rename to docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst index 3e56bfb4fe..7f6880a639 100644 --- a/docs/how-to/virtual_memory.rst +++ b/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst @@ -9,11 +9,21 @@ Virtual memory management ***************************** -Memory management is important when creating high-performance applications in the HIP ecosystem. Both allocating and copying memory can result in bottlenecks, which can significantly impact performance. - -Global memory allocation in HIP uses the C language style allocation function. This works fine for simple cases but can cause problems if your memory needs change. If you need to increase the size of your memory, you must allocate a second larger buffer and copy the data to it before you can free the original buffer. This increases overall memory usage and causes unnecessary ``memcpy`` calls. Another solution is to allocate a larger buffer than you initially need. However, this isn't an efficient way to handle resources and doesn't solve the issue of reallocation when the extra buffer runs out. - -Virtual memory management solves these memory management problems. It helps to reduce memory usage and unnecessary ``memcpy`` calls. +Memory management is important when creating high-performance applications in +the HIP ecosystem. Both allocating and copying memory can result in bottlenecks, +which can significantly impact performance. + +Global memory allocation in HIP uses the C language style allocation function. +This works fine for simple cases but can cause problems if your memory needs +change. If you need to increase the size of your memory, you must allocate a +second larger buffer and copy the data to it before you can free the original +buffer. 
This increases overall memory usage and causes unnecessary ``memcpy`` +calls. Another solution is to allocate a larger buffer than you initially need. +However, this isn't an efficient way to handle resources and doesn't solve the +issue of reallocation when the extra buffer runs out. + +Virtual memory management solves these memory management problems. It helps to +reduce memory usage and unnecessary ``memcpy`` calls. .. _memory_allocation_virtual_memory: diff --git a/docs/how-to/programming_manual.md b/docs/how-to/programming_manual.md deleted file mode 100644 index 33ab58de93..0000000000 --- a/docs/how-to/programming_manual.md +++ /dev/null @@ -1,212 +0,0 @@ -# HIP programming manual - -## Host Memory - -### Introduction - -`hipHostMalloc` allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can be accessed directly by the GPU device, and can be read or written with much higher bandwidth than pageable memory obtained with functions such as `malloc()`. -There are two use cases for this host memory: - -* Faster `HostToDevice` and `DeviceToHost` Data Transfers: -The runtime tracks the `hipHostMalloc` allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with `--unpinned` and `--pinned` switches for the `hipBusBandwidth` tool. -* Zero-Copy GPU Access: -GPU can directly access the host memory over the CPU/GPU interconnect, without need to copy the data. This avoids the need for the copy, but during the kernel access each memory access must traverse the interconnect, which can be tens of times slower than accessing the GPU's local device memory. Zero-copy memory can be a good choice when the memory accesses are infrequent (perhaps only once). Zero-copy memory is typically "Coherent" and thus not cached by the GPU but this can be overridden if desired. 
- -### Memory allocation flags - -There are flags parameter which can specify options how to allocate the memory, for example, -`hipHostMallocPortable`, the memory is considered allocated by all contexts, not just the one on which the allocation is made. -`hipHostMallocMapped`, will map the allocation into the address space for the current device, and the device pointer can be obtained with the API `hipHostGetDevicePointer()`. -`hipHostMallocNumaUser` is the flag to allow host memory allocation to follow Numa policy by user. Please note this flag is currently only applicable on Linux, under development on Windows. - -All allocation flags are independent, and can be used in any combination without restriction, for instance, `hipHostMalloc` can be called with both `hipHostMallocPortable` and `hipHostMallocMapped` flags set. Both usage models described above use the same allocation flags, and the difference is in how the surrounding code uses the host memory. - -### Numa-aware host memory allocation - -Numa policy determines how memory is allocated. -Target of Numa policy is to select a CPU that is closest to each GPU. -Numa distance is the measurement of how far between GPU and CPU devices. - -By default, each GPU selects a Numa CPU node that has the least Numa distance between them, that is, host memory will be automatically allocated closest on the memory pool of Numa node of the current GPU device. Using `hipSetDevice` API to a different GPU will still be able to access the host allocation, but can have longer Numa distance. -Note, Numa policy is so far implemented on Linux, and under development on Windows. - -### Coherency Controls - -ROCm defines two coherency options for host memory: - -* Coherent memory : Supports fine-grain synchronization while the kernel is running. For example, a kernel can perform atomic operations that are visible to the host CPU or to other (peer) GPUs. 
Synchronization instructions include `threadfence_system` and C++11-style atomic operations. -In order to achieve this fine-grained coherence, many AMD GPUs use a limited cache policy, such as leaving these allocations uncached by the GPU, or making them read-only. - -* Non-coherent memory : Can be cached by GPU, but cannot support synchronization while the kernel is running. Non-coherent memory can be optionally synchronized only at command (end-of-kernel or copy command) boundaries. This memory is appropriate for high-performance access when fine-grain synchronization is not required. - -HIP provides the developer with controls to select which type of memory is used via allocation flags passed to `hipHostMalloc` and the `HIP_HOST_COHERENT` environment variable. By default, the environment variable HIP_HOST_COHERENT is set to 0 in HIP. -The control logic in the current version of HIP is as follows: - -* No flags are passed in: the host memory allocation is coherent, the HIP_HOST_COHERENT environment variable is ignored. -* `hipHostMallocCoherent=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored. -* `hipHostMallocMapped=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored. -* `hipHostMallocNonCoherent=1`, `hipHostMallocCoherent=0`, and `hipHostMallocMapped=0`: The host memory will be non-coherent, the HIP_HOST_COHERENT environment variable is ignored. -* `hipHostMallocCoherent=0`, `hipHostMallocNonCoherent=0`, `hipHostMallocMapped=0`, but one of the other `HostMalloc` flags is set: - * If `HIP_HOST_COHERENT` is defined as 1, the host memory allocation is coherent. - * If `HIP_HOST_COHERENT` is not defined, or defined as 0, the host memory allocation is non-coherent. -* `hipHostMallocCoherent=1`, `hipHostMallocNonCoherent=1`: Illegal. - -### Visibility of Zero-Copy Host Memory - -Coherent host memory is automatically visible at synchronization points. 
-Non-coherent - -| HIP API | Synchronization Effect | Fence | Coherent Host Memory Visibility | Non-Coherent Host Memory Visibility| -| --- | --- | --- | --- | --- | -| `hipStreamSynchronize` | host waits for all commands in the specified stream to complete | system-scope release | yes | yes | -| `hipDeviceSynchronize` | host waits for all commands in all streams on the specified device to complete | system-scope release | yes | yes | -| `hipEventSynchronize` | host waits for the specified event to complete | device-scope release | yes | depends - see below| -| `hipStreamWaitEvent` | stream waits for the specified event to complete | none | yes | no | - -### `hipEventSynchronize` - -Developers can control the release scope for `hipEvents`: - -* By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. - -A stronger system-level fence can be specified when the event is created with `hipEventCreateWithFlags`: - -* `hipEventReleaseToSystem`: Perform a system-scope release operation when the event is recorded. This will make both Coherent and Non-Coherent host memory visible to other agents in the system, but may involve heavyweight operations such as cache flushing. Coherent memory will typically use lighter-weight in-kernel synchronization mechanisms such as an atomic operation and thus does not need to use `hipEventReleaseToSystem`. -* `hipEventDisableTiming`: Events created with this flag will not record profiling data and provide the best performance if used for synchronization. - -### Summary and Recommendations - -* Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points. This memory allows in-kernel synchronization commands such as `threadfence_system` to work transparently. 
-* HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization. - -### Managed memory allocation - -Managed memory, including the `__managed__` keyword, is supported in HIP combined host/device compilation, on Linux, not on Windows (under development). - -Managed memory, via unified memory allocation, allows data be shared and accessible to both the CPU and GPU using a single pointer. -The allocation will be managed by AMD GPU driver using the Linux HMM (Heterogeneous Memory Management) mechanism, the user can call managed memory API `hipMallocManaged` to allocate a large chunk of HMM memory, execute kernels on device and fetch data between the host and device as needed. - -In HIP application, it is recommended to do the capability check before calling the managed memory APIs. For example: - -```cpp -int managed_memory = 0; -HIPCHECK(hipDeviceGetAttribute(&managed_memory, - hipDeviceAttributeManagedMemory,p_gpuDevice)); - -if (!managed_memory ) { - printf ("info: managed memory access not supported on the device %d\n Skipped\n", p_gpuDevice); -} -else { - HIPCHECK(hipSetDevice(p_gpuDevice)); - HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T))); -. . . -} -``` - -Please note, the managed memory capability check may not be necessary, but if HMM is not supported, then managed malloc will fall back to using system memory and other managed memory API calls will have undefined behavior. - -Note, managed memory management is implemented on Linux, not supported on Windows yet. - -### HIP Stream Memory Operations - -HIP supports Stream Memory Operations to enable direct synchronization between Network Nodes and GPU. 
Following new APIs are added, - `hipStreamWaitValue32` - `hipStreamWaitValue64` - `hipStreamWriteValue32` - `hipStreamWriteValue64` - -Note, CPU access to the semaphore's memory requires volatile keyword to disable CPU compiler's optimizations on memory access. -For more details, please check the documentation `HIP-API.pdf`. - -Please note, HIP stream does not guarantee concurrency on AMD hardware for the case of multiple (at least 6) long-running streams executing concurrently, using `hipStreamSynchronize(nullptr)` for synchronization. - -## Direct Dispatch - -HIP runtime has Direct Dispatch enabled by default in ROCM 4.4 on Linux. -With this feature we move away from our conventional producer-consumer model where the runtime creates a worker thread(consumer) for each HIP Stream, and the host thread(producer) enqueues commands to a command queue(per stream). - -For Direct Dispatch, HIP runtime would directly enqueue a packet to the AQL queue (user mode queue on GPU) on the Dispatch API call from the application. That has shown to reduce the latency to launch the first wave on the idle GPU and total time of tiny dispatches synchronized with the host. - -In addition, eliminating the threads in runtime has reduced the variance in the dispatch numbers as the thread scheduling delays and atomics/locks synchronization latencies are reduced. - -This feature can be disabled via setting the following environment variable, -AMD_DIRECT_DISPATCH=0 - -Note, Direct Dispatch is implemented on Linux. It is currently not supported on Windows. - -## HIP Runtime Compilation - -HIP now supports runtime compilation (HIP RTC), the usage of which will provide the possibility of optimizations and performance improvement compared with other APIs via regular offline static compilation. - -HIP RTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes. 
- -For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](../doxygen/html/index). - -For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example how to program HIP application using runtime compilation mechanism, and a detailed [HIP RTC programming guide](./hip_rtc) is also available. - -## HIP Graph - -HIP graph is supported. For more details, refer to the HIP API Guide. - -## Device-Side Malloc - -HIP-Clang now supports device-side malloc and free. -This implementation does not require the use of `hipDeviceSetLimit(hipLimitMallocHeapSize,value)` nor respects any setting. The heap is fully dynamic and can grow until the available free memory on the device is consumed. - -## Use of Per-thread default stream - -The per-thread default stream is supported in HIP. It is an implicit stream local to both the thread and the current device. This means that the command issued to the per-thread default stream by the thread does not implicitly synchronize with other streams (like explicitly created streams), or default per-thread stream on other threads. -The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program. -The per-thread default stream can be enabled via adding a compilation option, -`-fgpu-default-stream=per-thread`. - -And users can explicitly use `hipStreamPerThread` as per-thread default stream handle as input in API commands. There are test codes as examples in the [link](https://github.com/ROCm/hip-tests/tree/develop/catch/unit/streamperthread). - -## Use of Long Double Type - -In HIP-Clang, long double type is 80-bit extended precision format for x86_64, which is not supported by AMDGPU. HIP-Clang treats long double type as IEEE double type for AMDGPU. 
Using long double type in HIP source code will not cause issue as long as data of long double type is not transferred between host and device. However, long double type should not be used as kernel argument type. - -## Use of `_Float16` Type - -If a host function is to be used between clang (or hipcc) and gcc for x86_64, i.e. its definition is compiled by one compiler but the caller is compiled by a different compiler, `_Float16` or aggregates containing `_Float16` should not be used as function argument or return type. This is due to lack of stable ABI for `_Float16` on x86_64. Passing `_Float16` or aggregates containing `_Float16` between clang and gcc could cause undefined behavior. - -## FMA and contractions - -By default HIP-Clang assumes `-ffp-contract=fast-honor-pragmas`. -Users can use `#pragma clang fp contract(on|off|fast)` to control `fp` contraction of a block of code. -For x86_64, FMA is off by default since the generic x86_64 target does not -support FMA by default. To turn on FMA on x86_64, either use `-mfma` or `-march=native` -on CPU's supporting FMA. - -When contractions are enabled and the CPU has not enabled FMA instructions, the -GPU can produce different numerical results than the CPU for expressions that -can be contracted. Tolerance should be used for floating point comparisons. - -## Math functions with special rounding modes - -Note: Currently, HIP only supports basic math functions with rounding modern (round to nearest). HIP does not support basic math functions with rounding modes `ru` (round up), `rd` (round down), and `rz` (round towards zero). - -## Creating Static Libraries - -HIP-Clang supports generating two types of static libraries. The first type of static library does not export device functions, and only exports and launches host functions within the same library. The advantage of this type is the ability to link with a non-hipcc compiler such as gcc. 
The second type exports device functions to be linked by other code objects. However, this requires using hipcc as the linker. - -In addition, the first type of library contains host objects with device code embedded as fat binaries. It is generated using the flag --emit-static-lib. The second type of library contains relocatable device objects and is generated using `ar`. - -Here is an example to create and use static libraries: - -* Type 1 using `--emit-static-lib`: - - ```cpp - hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a - gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out - ``` - -* Type 2 using system `ar`: - - ```cpp - hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o - ar rcsD libHipDevice.a hipDevice.o - hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out - ``` - -For more information, please see [HIP samples host functions](https://github.com/ROCm/hip-tests/tree/develop/samples/2_Cookbook/15_static_library/host_functions) and [device_functions](https://github.com/ROCm/hip-tests/tree/rocm-5.5.x/samples/2_Cookbook/15_static_library/device_functions). 
diff --git a/docs/index.md b/docs/index.md index a659b9b83a..9c154588f1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -32,22 +32,21 @@ On non-AMD platforms, like NVIDIA, HIP provides header files required to support * {doc}`./understand/programming_model` * {doc}`./understand/hardware_implementation` * {doc}`./understand/amd_clr` -* {doc}`./understand/texture_fetching` +* {doc}`./understand/compilers` ::: :::{grid-item-card} How to -* [Programming manual](./how-to/programming_manual) +* {doc}`./how-to/hip_runtime_api` + * {doc}`./how-to/hip_runtime_api/memory_management` + * {doc}`./how-to/hip_runtime_api/cooperative_groups` * [HIP porting guide](./how-to/hip_porting_guide) * [HIP porting: driver API guide](./how-to/hip_porting_driver_api) * {doc}`./how-to/hip_rtc` * {doc}`./how-to/performance_guidelines` * [Debugging with HIP](./how-to/debugging) * {doc}`./how-to/logging` -* [Unified memory](./how-to/unified_memory) -* [Virtual memory](./how-to/virtual_memory) -* [Cooperative groups](./how-to/cooperative_groups) * {doc}`./how-to/faq` ::: diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 6a70b9e2ad..013a36cc1a 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -18,23 +18,26 @@ subtrees: - file: understand/programming_model - file: understand/hardware_implementation - file: understand/amd_clr - - file: understand/texture_fetching - title: Texture fetching + - file: understand/compilers - caption: How to entries: - - file: how-to/programming_manual + - file: how-to/hip_runtime_api + subtrees: + - entries: + - file: how-to/hip_runtime_api/memory_management + subtrees: + - entries: + - file: how-to/hip_runtime_api/memory_management/unified_memory + - file: how-to/hip_runtime_api/memory_management/virtual_memory + - file: how-to/hip_runtime_api/memory_management/texture_fetching + - file: how-to/hip_runtime_api/cooperative_groups - file: how-to/hip_porting_guide - file: how-to/hip_porting_driver_api - file: how-to/hip_rtc - 
file: how-to/performance_guidelines - file: how-to/debugging - file: how-to/logging - - file: how-to/cooperative_groups - - file: how-to/unified_memory - title: Unified memory - - file: how-to/virtual_memory - title: Virtual memory - file: how-to/faq - caption: Reference diff --git a/docs/tutorial/saxpy.rst b/docs/tutorial/saxpy.rst index 91ecc10be7..c3dc766102 100644 --- a/docs/tutorial/saxpy.rst +++ b/docs/tutorial/saxpy.rst @@ -143,10 +143,12 @@ Retrieval of the result from the device is done much like input data copy. In th HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost)); +.. _compiling_on_the_command_line: + Compiling on the command line ============================= -.. _setting_up_the_command-line: +.. _setting_up_the_command_line: Setting up the command line --------------------------- diff --git a/docs/understand/compilers.rst b/docs/understand/compilers.rst new file mode 100644 index 0000000000..e51ce63d5d --- /dev/null +++ b/docs/understand/compilers.rst @@ -0,0 +1,86 @@ +.. meta:: + :description: This chapter describes the compilation workflow of the HIP + compilers. + :keywords: AMD, ROCm, HIP, CUDA, HIP runtime API + +.. _hip_compilers: + +******************************************************************************** +HIP compilers +******************************************************************************** + +ROCm provides the compiler driver ``hipcc``, that can be used on AMD and NVIDIA +platforms. ``hipcc`` takes care of setting the default library and include paths +for HIP, as well as some environment variables, and takes care of invoking the +appropriate compiler - ``amdclang++`` on AMD platforms and ``nvcc`` on NVIDIA +platforms. ``amdclang++`` is based on the ``clang++`` compiler. For further +details, check :doc:`the llvm project`. 
+ +HIP compilation workflow +================================================================================ + +Offline compilation +-------------------------------------------------------------------------------- + +The compilation of HIP code is separated into a host- and a device-code +compilation stage. + +The compiled device code is embedded into the host object file. Depending on the +platform, the device code can be compiled into assembly or binary. ``nvcc`` and +``amdclang++`` target different architectures and use different code object +formats: ``nvcc`` uses the binary ``cubin`` or the assembly ``PTX`` files, while +the ``amdclang++`` path is the binary ``hsaco`` format. On NVIDIA platforms the +driver takes care of compiling the PTX files to executable code during runtime. + +On the host side ``nvcc`` only replaces the ``<<<...>>>`` kernel launch syntax +with the appropriate CUDA runtime function call and the modified host code is +passed to the default host compiler. ``hipcc`` or ``amdclang++`` can compile the +host code in one step without other C++ compilers. + +An example for how to compile HIP from the command line can be found in the +:ref:`SAXPY tutorial` . + +Runtime compilation +-------------------------------------------------------------------------------- + +HIP lets you compile kernels at runtime with the ``hiprtc*`` API. Kernels are +stored as a text string that are then passed to HIPRTC alongside options to +guide the compilation. + +For further details, check the +:doc:`how-to section for the HIP runtime compilation<../how-to/hip_rtc>`. + +Static Libraries +================================================================================ + +``hipcc`` supports generating two types of static libraries. The first type of +static library does not export device functions, and only exports and launches +host functions within the same library. The advantage of this type is the +ability to link with a non-hipcc compiler such as gcc. 
The second type exports +device functions to be linked by other code objects. However, this requires +using ``hipcc`` as the linker. + +In addition, the first type of library contains host objects with device code +embedded as fat binaries. It is generated using the flag ``--emit-static-lib``. +The second type of library contains relocatable device objects and is generated +using ``ar``. + +Here is an example to create and use static libraries: + +* Type 1 using `--emit-static-lib`: + + .. code-block:: cpp + + hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a + gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out + +* Type 2 using system `ar`: + + .. code-block:: cpp + + hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o + ar rcsD libHipDevice.a hipDevice.o + hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out + +For more information, please see `HIP samples host functions `_ +and `device_functions `_. diff --git a/docs/understand/programming_model.rst b/docs/understand/programming_model.rst index 410448434d..8f80f83f34 100644 --- a/docs/understand/programming_model.rst +++ b/docs/understand/programming_model.rst @@ -225,7 +225,7 @@ better than the defaults defined by the hardware. The implicit groups defined by kernel launch parameters are still available when working with cooperative groups. -For further information, see :doc:`Cooperative groups `. +For further information, see :doc:`Cooperative groups `. Memory model ============ diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 798ba63bf0..bce1264a9e 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -717,7 +717,7 @@ enum hipLimit_t { /** Allocates the memory as write-combined. On some system configurations, write-combined allocation * may be transferred faster across the PCI Express bus, however, could have low read efficiency by - * most CPUs. 
It's a good option for data tranfer from host to device via mapped pinned memory.*/ + * most CPUs. It's a good option for data transfer from host to device via mapped pinned memory.*/ #define hipHostMallocWriteCombined 0x4 #define hipHostAllocWriteCombined 0x4 @@ -728,11 +728,11 @@ enum hipLimit_t { #define hipHostMallocNumaUser 0x20000000 #define hipExtHostAllocNumaUser 0x20000000 -/** Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.*/ +/** Allocate coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/ #define hipHostMallocCoherent 0x40000000 #define hipExtHostAllocCoherent 0x40000000 -/** Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.*/ +/** Allocate non-coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/ #define hipHostMallocNonCoherent 0x80000000 #define hipExtHostAllocNonCoherent 0x80000000