From 206d266288316aa7e66b8a5fa921c61b22ee87a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= <matyas@streamhpc.com>
Date: Thu, 30 May 2024 13:50:34 +0200
Subject: [PATCH] Update based on reviews

---
 docs/reference/unified_memory.rst | 199 ++++++++++++++++++++++++++++--
 1 file changed, 188 insertions(+), 11 deletions(-)

diff --git a/docs/reference/unified_memory.rst b/docs/reference/unified_memory.rst
index a4fc1e45ad..0aa210054b 100644
--- a/docs/reference/unified_memory.rst
+++ b/docs/reference/unified_memory.rst
@@ -27,25 +27,113 @@ either CPUs or GPUs. The Unified memory model is shown in the figure below.
 AMD Accelerated Processing Unit (APU) is a typical example of a Unified Memory
 Architecture. On a single die, a central processing unit (CPU) is combined with
 an integrated graphics processing unit (iGPU) and both have the access for a
-high bandwidth memory module, named as Unified Memory. The CPU enables
+high bandwidth memory (HBM) module, named as Unified Memory. The CPU enables
 high-performance low latency operations, while the GPU is optimized for
 high-throughput (data processed by unit time).
 
-List of managed memory functions
-================================
+How-to use?
+===========
 
-.. doxygengroup:: MemoryM
-   :content-only:
+Unified Memory Management (UMM) is a feature that can simplify the complexities
+of memory management in GPU computing. It is particularly useful in
+heterogeneous computing environments with heavy memory usage with both a CPU
+and a GPU which would require large memory transfers. Here are some areas where
+UMM can be beneficial:
+
+- **Simplification of Memory Management**:
+UMM can help to simplify the complexities of memory management. This can make
+it easier for developers to write code without having to worry about the
+details of memory allocation and deallocation.
+
+- **Data Migration**:
+UMM allows for efficient data migration between the host (CPU) and the device
+(GPU). This can be particularly useful for applications that need to move data
+back and forth between the device and host.
+
+- **Improved Programming Productivity**:
+As a positive side effect, the use of UMM can reduce the lines of code,
+thereby improving programming productivity.
+
+In HIP, pinned memory allocations are coherent by default. Pinned memory is
+host memory that is mapped into the address space of all GPUs, meaning that the
+pointer can be used on both host and device. Using pinned memory instead of
+pageable memory on the host can lead an improvement in bandwidth.
+
+While UMM can provide numerous benefits, it is also important
+to be aware of the potential performance overhead associated with UMM.
+Therefore, it is recommended to thoroughly test and profile your code to
+ensure it is indeed the most suitable choice for your specific use case.
+
+System Requirements
+===================
+Unified memory is supported on Linux by all modern AMD GPUs from the Vega
+series onwards. Unified memory management can be achieved with managed memory
+allocation and, for the latest GPUs, with a system allocator.
+
+The table below lists the supported allocators. The allocators are described in
+the next chapter.
+
+.. csv-table::
+    :widths: 50, 10, 10, 10
+    :header: "GPU", "hipMallocManaged", "managed", "malloc"
+
+        "MI200, MI 300 Series", "✅" , "✅" , "✅:sup:`1`"
+        "MI100", "✅" , "✅" , "❌"
+        "RDNA (Navi) Series", "✅" , "✅" , "❌"
+        "GCN5 (Vega) Series", "✅" , "✅" , "❌"
+
+✅: **Supported**
+
+❌: **Unsupported**
+
+:sup:`1` Works only with ``XNACK=1``. First GPU access causes recoverable page-fault.
+
+Unified Memory Programming Models
+=================================
+
+- **HIP Managed Memory Allocation API**:
+The ``hipMallocManaged()`` is a dynamic memory allocator available at all GPUs
+with unified memory support.
+
+- **HIP Managed Variables**:
+The ``__managed__`` declaration specifier, which serves as its counterpart, is
+supported across all modern AMD cards and can be utilized for static
+allocation.
+
+- **System Allocation API**:
+Starting with the MI300 series, it is also possible to reserve unified memory
+via the ``malloc()`` system allocator.
+
+If it is wondered whether the GPU and the environment are capable of supporting
+unified memory management, the ``hipDeviceAttributeConcurrentManagedAccess``
+device attribute can answer it:
+
+.. code:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    int main() {
+        int d;
+        hipGetDevice(&d);
+
+        int is_cma = 0;
+        hipDeviceGetAttribute(&is_cma, hipDeviceAttributeConcurrentManagedAccess, d);
+        std::cout << "HIP Managed Memory: " << (is_cma == 1 ? "is" : "NOT") << " supported" << std::endl;
+        return 0;
+    }
 
 Example for Unified Memory Management
-=====================================
+-------------------------------------
 
-The following HIP program with unified memory management shows the addition of
-two integers. In the other tab we can compare it to explicit memory management.
+The following example shows how to use unified memory management with
+``hipMallocManaged()``, function, with ``__managed__`` attribute for static
+allocation and standard  ``malloc()`` allocation. The Explicit Memory
+Management is presented for comparison.
 
 .. tab-set::
 
-    .. tab-item:: Unified Memory Management
+    .. tab-item:: hipMallocManaged()
 
         .. code:: cpp
 
@@ -60,7 +148,7 @@ two integers. In the other tab we can compare it to explicit memory management.
             int main() {
                 int *a, *b, *c;
 
-                // Allocate device copies of a, b and c.
+                // Allocate memory for a, b and c that is accessible to both device and host codes.
                 hipMallocManaged(&a, sizeof(*a));
                 hipMallocManaged(&b, sizeof(*b));
                 hipMallocManaged(&c, sizeof(*c));
@@ -87,6 +175,81 @@ two integers. In the other tab we can compare it to explicit memory management.
             }
 
 
+    .. tab-item:: __managed__
+
+        .. code:: cpp
+
+            #include <hip/hip_runtime.h>
+            #include <iostream>
+
+            // Addition of two values.
+            __global__ void add(int *a, int *b, int *c) {
+                *c = *a + *b;
+            }
+
+            // Declare a, b and c as static variables.
+            __managed__ int a, b, c;
+
+            int main() {
+                // Setup input values.
+                a = 1;
+                b = 2;
+
+                // Launch add() kernel on GPU.
+                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c);
+
+                // Wait for GPU to finish before accessing on host.
+                hipDeviceSynchronize();
+
+                // Prints the result.
+                std::cout << a << " + " << b << " = " << c << std::endl;
+
+                return 0;
+            }
+
+
+    .. tab-item:: malloc()
+
+        .. code:: cpp
+
+            #include <hip/hip_runtime.h>
+            #include <iostream>
+
+            // Addition of two values.
+            __global__ void add(int* a, int* b, int* c) {
+                *c = *a + *b;
+            }
+
+            int main() {
+                int* a, * b, * c;
+
+                // Allocate memory for a, b, and c.
+                a = (int*)malloc(sizeof(*a));
+                b = (int*)malloc(sizeof(*b));
+                c = (int*)malloc(sizeof(*c));
+
+                // Setup input values.
+                *a = 1;
+                *b = 2;
+
+                // Launch add() kernel on GPU.
+                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+                // Wait for GPU to finish before accessing on host.
+                hipDeviceSynchronize();
+
+                // Prints the result.
+                std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+                // Cleanup allocated memory.
+                free(a);
+                free(b);
+                free(c);
+
+                return 0;
+            }
+
+
     .. tab-item:: Explicit Memory Management
 
         .. code:: cpp
@@ -107,7 +270,7 @@ two integers. In the other tab we can compare it to explicit memory management.
                 a = 1;
                 b = 2;
 
-                // Allocate device copies of a, b and c
+                // Allocate device copies of a, b and c.
                 hipMalloc(&d_a, sizeof(*d_a));
                 hipMalloc(&d_b, sizeof(*d_b));
                 hipMalloc(&d_c, sizeof(*d_c));
@@ -133,3 +296,17 @@ two integers. In the other tab we can compare it to explicit memory management.
                 return 0;
             }
 
+
+Missing features
+================
+
+
+List of HIP Managed Memory Allocation API
+=========================================
+
+.. doxygenfunction:: hipMallocManaged
+
+.. doxygengroup:: MemoryM
+   :content-only:
+
+.. doxygenfunction:: hipPointerSetAttribute