Optimized Jacobi second-step vertex.
The second step of the Jacobi algorithm is a typical "sparse" access
update pattern, which cannot simply be optimized with an `rpt` loop.

The loop is intrinsically limited by the number of load/store
operations. Nevertheless, by combining the `aop` outer-product
intrinsic with a 2-step unroll, the bundling of operations in the loop
can be massively improved, leading to a 40% decrease in cycle count.

Additionally, this PR adds a thin AMP C++ abstraction, allowing a
single generic implementation to be shared between IPU hardware and
the IPU model.
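
For illustration, a minimal sketch of the unrolled inner-loop shape described above, written against the AMP abstraction added in this commit (hypothetical code, not the actual vertex: `lhs`, `rhs` and `N` are made-up names):

    // Accumulate 2x2 outer products on the AMP unit, unrolled by 2 so the
    // loads and `aop` issues of consecutive iterations can share bundles
    // (the loop is otherwise bound by load/store slots).
    ipu::AMP<float> amp;
    amp.aaccZero();
    for (unsigned i = 0; i < N; i += 2) {
      amp.aop(lhs[i], rhs[i]);          // AACC += outer(lhs[i], rhs[i])
      amp.aop(lhs[i + 1], rhs[i + 1]);  // second unrolled step
    }
    // Drain with `gina`: r0 = {sum x0*y0, sum x1*y0}, then the propagated
    // r1 = {sum x0*y1, sum x1*y1}, following the model semantics below.
    float2 r0 = amp.gina<0>(float2{0.0f, 0.0f});
    float2 r1 = amp.gina<0>(float2{0.0f, 0.0f});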
balancap committed Oct 17, 2023
1 parent 5063716 commit 1335dbb
Showing 8 changed files with 339 additions and 55 deletions.
10 changes: 10 additions & 0 deletions tessellate_ipu/core/tile_array.py
@@ -1,8 +1,10 @@
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import itertools
from dataclasses import dataclass
from typing import Any, Sequence, Tuple, Union

import chex
import jax.lax
import numpy as np
from jax.core import ShapedArray
from jax.interpreters.xla import DeviceArray
@@ -185,6 +187,14 @@ def __getitem__(self, key: Union[SliceType, MultiSliceType]) -> "TileShardedArray":
check_tile_array_multi_slice(key, self.array.shape)
return TileShardedArray(array=self.array[key], tiles=self.tiles[key[0]]) # type:ignore

@classmethod
def concatenate(cls, arrays: Sequence["TileShardedArray"]) -> "TileShardedArray":
"""Concatenate tile sharded arrays along the first axis."""
assert all([isinstance(v, TileShardedArray) for v in arrays])
outarray = jax.lax.concatenate([v.array for v in arrays], dimension=0)
outtiles = tuple(itertools.chain(*[v.tiles for v in arrays]))
return TileShardedArray(array=outarray, tiles=outtiles)


def tile_put_sharded(array: DeviceArray, tiles: Sequence[int]) -> TileShardedArray:
"""Shard a JAX array over tiles on the first axis.
35 changes: 26 additions & 9 deletions tessellate_ipu/core/tile_interpreter_vertex_utils.py
@@ -1,6 +1,6 @@
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import math
from typing import List
from typing import List, Optional

import numpy as np
from numpy.typing import DTypeLike, NDArray
@@ -26,36 +26,53 @@ def make_num_elements_per_worker(N: int, num_workers: int) -> NDArray[np.int32]:


def make_ipu_vector1d_worker_offsets(
size: int, vector_size: int = 2, num_workers: int = 6, wdtype: DTypeLike = np.uint16
size: int,
vector_size: int = 2,
num_workers: int = 6,
wdtype: DTypeLike = np.uint16,
allow_overlap: bool = False,
grain_size: Optional[int] = None,
) -> NDArray[np.int_]:
"""Make the QR householder row update worker sizes, i.e. how many
"""Make worker sizes/offsets for a 1D array workload, i.e. how many
data vectors per worker thread?
Args:
size: Size of the vector to divide.
vector_size: Vector size (2: float, 4: half).
num_workers: Number of workers.
wdtype: Worklists dtype.
allow_overlap: Allow overlap between workers. Makes it easier to handle the remainder term.
grain_size: Optional grain size. Defaults to vector_size.
Returns:
(num_workers + 1,) cumulative offsets (in data vectors) per worker thread.
"""
grain_size = grain_size or vector_size
grain_scale = grain_size // vector_size

def make_offsets_fn(sizes):
sizes = [0] + sizes
offsets = np.cumsum(np.array(sizes, wdtype), dtype=wdtype)
offsets = np.cumsum(np.array(sizes, wdtype) * grain_scale, dtype=wdtype)
return offsets

assert size % vector_size == 0
# TODO: properly support odd sizes.
assert size % 2 == 0, "Odd sizes not supported at the moment."
# Base checks!
assert grain_size % vector_size == 0
assert size >= grain_size, f"Requires at least a size of {grain_size}."
assert (
size % grain_size == 0 or allow_overlap
), f"Requires the size, {size}, to be divisible by the grain size {grain_size} (or overlap to be allowed)."

# Base worksize on the first few workers.
base_worksize: int = math.ceil(size / (vector_size * num_workers))
num_base_workers = size // (vector_size * base_worksize)
base_worksize: int = math.ceil(size / (grain_size * num_workers))
num_base_workers = size // (grain_size * base_worksize)
worker_sizes: List[int] = [base_worksize] * num_base_workers
if num_base_workers == num_workers:
return make_offsets_fn(worker_sizes)

# Remainder term, for the next thread.
rem_worksize = size - base_worksize * vector_size * num_base_workers
rem_worksize = rem_worksize // vector_size
rem_worksize = size - base_worksize * grain_size * num_base_workers
rem_worksize = rem_worksize // grain_size
worker_sizes += [rem_worksize]
# Fill the rest with zeros.
unused_workers = num_workers - num_base_workers - 1
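A worked example of the splitting scheme (illustrative numbers, not from the commit): with size=24, vector_size=2, grain_size=4 and num_workers=6, we get grain_scale = 2, base_worksize = ceil(24 / (4 * 6)) = 1 grain per worker and num_base_workers = 24 // 4 = 6 = num_workers, so the returned offsets are cumsum([0, 1, 1, 1, 1, 1, 1] * 2) = [0, 2, 4, 6, 8, 10, 12]: each worker thread starts 2 float2 vectors after the previous one, covering the whole array with no remainder.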
15 changes: 15 additions & 0 deletions tessellate_ipu/core/vertex/intrinsics_utils.hpp
@@ -64,6 +64,7 @@ ALWAYS_INLINE T ipu_div_by_6(T n) noexcept {
*/
ALWAYS_INLINE void __builtin_ipu_put_tas(float v) noexcept {
// TAS register, used for __builtin_ipu_f32v2axpy.
// TODO: use `__builtin_ipu_uput`?
asm volatile(
R"l( uput $TAS, %[sv]
)l"
@@ -72,6 +73,20 @@ ALWAYS_INLINE void __builtin_ipu_put_tas(float v) noexcept {
:);
}

/**
* @brief Zero AACC registers.
*/
ALWAYS_INLINE void __builtin_ipu_aacc_zero() {
asm (R"(
setzi $a0, 0x8
uput $FP_CLR, $a0
)"
:
:
: "$a0");
}


/**
* @brief IPU cmac f32 instruction.
*/
127 changes: 127 additions & 0 deletions tessellate_ipu/core/vertex/ipu_amp.hpp
@@ -0,0 +1,127 @@
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#pragma once
#include <type_traits>

#include "intrinsics_utils.hpp"
#include "ipu_model_types.hpp"

namespace ipu {

/**
* @brief Thin abstraction of the IPU AMP unit(s) and registers, allowing
* generic code to be written that compiles on both the IPU model and IPU hardware.
*
* NOTE: zero-cost abstraction on IPU hardware.
*
* On the IPU model, the AMP class models the AACC registers as well as the
* AMP unit instructions, reproducing the expected behaviour of the hardware.
*/
template <typename T>
class AMP {
public:
// TODO: support half as well.
static_assert(std::is_same_v<T, float>);
using FPType = T;
/** Number of AACC registers available in hardware. */
// TODO: use TFPU_AMP_UNITS_PER_SET and TFPU_AACC_PER_AMP_UNIT;
static constexpr unsigned NumAACC = 16;

// TODO: random initialization on IPU model of registers.
AMP() noexcept = default;
// No copy + no move allowed!
AMP(const AMP&) = delete;
AMP(AMP&&) = delete;

/**
* @brief Set the value of the TAS register, used in
* `axpy` operation.
*/
ALWAYS_INLINE void tas(FPType val) noexcept {
#ifdef __IPU__
__builtin_ipu_put_tas(val);
#else
m_tas = val;
#endif
}
/**
* @brief Zero AACC registers.
*/
ALWAYS_INLINE void aaccZero() noexcept {
#ifdef __IPU__
__builtin_ipu_aacc_zero();
#else
for (unsigned idx = 0; idx < NumAACC; ++idx) {
m_aacc[idx] = 0;
}
#endif
}

/**
* @brief Scaled-add `axpy` intrinsic. Only supported on FP32.
* NOTE: acts as a 1-stage pipeline, storing results in AACC[0] and AACC[2].
*/
ALWAYS_INLINE float2 axpy(float2 x, float2 y) noexcept {
using T2 = float2;
#ifdef __IPU__
// Weird ordering here? Bug in the intrinsic definition?
return __builtin_ipu_f32v2axpy(y, x);
#else
// Simulating pipeline with storing in AACC[0] and AACC[2].
const auto res = T2{m_aacc[0], m_aacc[2]};
// FIXME/TODO: understand ordering!?
m_aacc[0] = m_tas * x[0] + y[0];
m_aacc[2] = m_tas * x[1] + y[1];
return res;
#endif
}

/**
* @brief Outer-product `aop` intrinsic. Only supported on FP32.
* Stores results in AACC[0], AACC[2], AACC[4] and AACC[6].
*/
void aop(float2 x, float2 y) noexcept {
#ifdef __IPU__
// Note: third argument not used by hw.
__builtin_ipu_f32v2aop(x, y, 0);
#else
// Multiply + accumulate.
m_aacc[0] += x[0] * y[0];
m_aacc[2] += x[1] * y[0];
m_aacc[4] += x[0] * y[1];
m_aacc[6] += x[1] * y[1];
#endif
}

/**
* @brief `gina` instruction: get AACC registers + propagate accumulator state.
* FIXME: support non-zero flag/index.
*/
template <unsigned int FLAG>
float2 gina(float2 val) noexcept {
using T2 = float2;
#ifdef __IPU__
return __builtin_ipu_f32v2gina(val, 0);
#else
// TODO: implement GINA_IMMFLAGS__SET__GET
const auto res = T2{m_aacc[0], m_aacc[2]};
// Propagate accumulator states.
for (unsigned idx = 4; idx < NumAACC; idx += 4) {
m_aacc[idx - 4] = m_aacc[idx];
m_aacc[idx - 2] = m_aacc[idx + 2];
}
m_aacc[NumAACC - 4] = val[0];
m_aacc[NumAACC - 2] = val[1];
return res;
#endif
}

private:
#ifndef __IPU__
// Simulating AACC registers on IPU model.
FPType m_aacc[NumAACC];
// Simulating TAS register on IPU model.
FPType m_tas;
#endif
};

} // namespace ipu
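
As a usage illustration of the abstraction above, a minimal sketch of a scaled vector update through the `axpy` path (hypothetical code: `x`, `y`, `out`, `N` and `scale` are made-up names). Since `axpy` acts as a 1-stage pipeline, each call returns the previous call's result, so the loop is primed and drained explicitly:

    // out = scale * x + y, two floats at a time via the TAS register.
    ipu::AMP<float> amp;
    amp.tas(scale);
    amp.axpy(x[0], y[0]);  // Prime the pipeline; returned value is stale.
    for (unsigned i = 1; i < N; ++i) {
      out[i - 1] = amp.axpy(x[i], y[i]);  // Result of iteration i-1 pops out.
    }
    out[N - 1] = amp.axpy(float2{0.0f, 0.0f}, float2{0.0f, 0.0f});  // Drain.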
