From b4f3cc6c9ad39dbef33f6e6ca29298f7cc002a88 Mon Sep 17 00:00:00 2001
From: Paul Balanca <paulb@graphcore.ai>
Date: Fri, 20 Oct 2023 09:38:13 +0000
Subject: [PATCH] Remove TAS register + axpy instrinsic wrapper class

Replacing the use with `ipu::AMP<T>`, which is much more general
and should be able to properly model any IPU AMP unit instruction.
---
 .../core/vertex/intrinsics_utils.hpp          | 53 ++-----------------
 .../core/vertex/tile_hessenberg_vertex.cpp    | 20 ++++---
 .../core/vertex/tile_jacobi_vertex.cpp        |  2 +-
 tessellate_ipu/core/vertex/tile_qr_vertex.cpp | 24 ++++-----
 tessellate_ipu/core/vertex/tile_small_dot.hpp |  7 +--
 5 files changed, 28 insertions(+), 78 deletions(-)
diff --git a/tessellate_ipu/core/vertex/intrinsics_utils.hpp b/tessellate_ipu/core/vertex/intrinsics_utils.hpp
index 7860ac2..f089cdd 100644
--- a/tessellate_ipu/core/vertex/intrinsics_utils.hpp
+++ b/tessellate_ipu/core/vertex/intrinsics_utils.hpp
@@ -112,13 +112,6 @@ ALWAYS_INLINE float ld32(const T* address, unsigned offset) {
   return result;
 }
 
-struct __ipu_and_ipumodel_tas {
-  void put(float v) { __builtin_ipu_put_tas(v); }
-  float2 f32v2axpy(float2 const& x, float2 const& y) {
-    return __builtin_ipu_f32v2axpy(x, y);
-  }
-};
-
 #else
 
 #include <limits>
@@ -152,47 +145,7 @@ IpuVector<T, N> fma(IpuVector<T, N> const& x, IpuVector<T, N> const& y,
 
 }  // namespace ipu
 
-// Reflect IPU's AXPY semantics in a way that is IPUModel compatible
-// IPU-only usage:
-//   __builtin_ipu_put_tas(v);
-//   z_prev = __builtin_ipu_f32v2axpy(x, y)
-//
-// IPUModel-compatible usage:
-//   __ipu_and_ipumodel_tas tas;
-//   tas.put(v);
-//   z_prev = tas.f32v2axpy(x, y)
-//
-// https://docs.graphcore.ai/projects/poplar-api/en/latest/ipu_intrinsics/ipu_builtins.html#_CPPv423__builtin_ipu_f32v2axpy6float26float2
-struct __ipu_and_ipumodel_tas {
-  float tas;
-  float2 prev;
-
-  __ipu_and_ipumodel_tas() : tas{0}, prev{0, 0} {}
-
-  void put(float v) { tas = v; }
-
-  float2 f32v2axpy(float2 const& x, float2 const& y) {
-    const auto res = prev;
-    prev = float2{
-        // TODO: understand ordering!?
-        // tas * x[0] + y[0],
-        // tas * x[1] + y[1],
-        tas * y[0] + x[0],
-        tas * y[1] + x[1],
-    };
-    return res;
-  }
-};
-
-// And give useful error messages when people port from IPU to IPUModel, e.g.
-/* clang-format off */ // need these error messages on one line
-/*
-/workspaces/tessellate-ipu/tessellate/tile/vertex/intrinsics_utils.hpp:166:3: error: static_assert failed due to requirement '__ipu_false<IpuVector<float, 2>>()': *** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.
-  static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
-  ^             ~~~~~~~~~~~~~~~~
-/workspaces/tessellate-ipu/tessellate/tile/vertex/tile_qr_vertex.cpp:231:12: note: in instantiation of function template specialization '__builtin_ipu_f32v2axpy<IpuVector<float, 2>>' requested here
-    rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
-*/
+// And give useful error messages when people port from IPU to IPUModel.
 template <typename T>
 constexpr bool __ipu_false() {
   return !std::is_same<T, T>::value;
@@ -200,12 +153,12 @@ constexpr bool __ipu_false() {
 
 template <typename T>
 void __builtin_ipu_put_tas(T v) {
-  static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_put_tas with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
+  static_assert(__ipu_false<T>(), "*** Please use `ipu::AMP` class for TAS handling on IPUModel.");
 }
 
 template <typename T>
 T __builtin_ipu_f32v2axpy(T const& x, T const& y) {
-  static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
+  static_assert(__ipu_false<T>(), "*** Please use `ipu::AMP::axpy` for `f32v2axpy` intrinsic on IPUModel.");
   return T{};
 }
 // clang-format on
diff --git a/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp b/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp
index 08249a0..253fc53 100644
--- a/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp
+++ b/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp
@@ -3,6 +3,7 @@
 #include <poplar/Vertex.hpp>
 
 #include "intrinsics_utils.hpp"
+#include "ipu_amp.hpp"
 
 using namespace poplar;
 
@@ -162,9 +163,10 @@ class [[poplar::constraint(
 
     // Set the $TAS register with the proper scale.
     const T s = -scale1[0] * scale2[0];
-    // __builtin_ipu_put_tas(s);
-    __ipu_and_ipumodel_tas tas;
-    tas.put(s);
+    // Basic AMP usage with TAS + axpy instruction.
+    // AMP code using this abstraction is compatible with IPU hw & model.
+    ipu::AMP<T> amp;
+    amp.tas(s);
 
     // Nothing to do in this worker thread.
     if (wstart == wend) {
@@ -183,20 +185,16 @@ class [[poplar::constraint(
     vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step);
     // TODO: use ld2x64pace + tapack instructions.
     for (IndexType idx = 1; idx != wsize; ++idx) {
-      rtmp = tas.f32v2axpy(xin, vin);
-      // rtmp = __builtin_ipu_f32v2axpy(xin, vin);
+      rtmp = amp.axpy(vin, xin);
       // Grouping here seems to help the compiler optimising loads?
       xin = ipu::load_postinc(&ptr_inxdata_f2, ptr_step);
       vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step);
-      rout = tas.f32v2axpy(rtmp, rtmp);
-      // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
+      rout = amp.axpy(rtmp, rtmp);
       ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step);
     }
     // Finish the loop, getting the last computation.
-    // rtmp = __builtin_ipu_f32v2axpy(xin, vin);
-    // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
-    rtmp = tas.f32v2axpy(xin, vin);
-    rout = tas.f32v2axpy(rtmp, rtmp);
+    rtmp = amp.axpy(vin, xin);
+    rout = amp.axpy(rtmp, rtmp);
     ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step);
 
     return true;
diff --git a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
index b7aced0..47f9800 100644
--- a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
+++ b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
@@ -391,7 +391,7 @@ void jacob_update_eigenvectors(const T* vpcol, const T* vqcol, T* vpcol_updated,
  * See:  Gene H. Golub, Charles F. Van Loan, MATRIX COMPUTATIONS, 3rd edition,
  * Johns Hopkins Chapter 8.
  */
-class  [[poplar::constraint(
+class [[poplar::constraint(
     "elem(*vpcol) != elem(*vpcol_out)",
     "elem(*vqcol) != elem(*vqcol_out)")]] JacobiUpdateEigenvectors
     : public MultiVertex {
diff --git a/tessellate_ipu/core/vertex/tile_qr_vertex.cpp b/tessellate_ipu/core/vertex/tile_qr_vertex.cpp
index f6aa71a..51a1962 100644
--- a/tessellate_ipu/core/vertex/tile_qr_vertex.cpp
+++ b/tessellate_ipu/core/vertex/tile_qr_vertex.cpp
@@ -3,6 +3,7 @@
 #include <poplar/Vertex.hpp>
 
 #include "intrinsics_utils.hpp"
+#include "ipu_amp.hpp"
 
 using namespace poplar;
 
@@ -165,8 +166,8 @@ float QRCorrectionVectorVertex::shared_partial_sqnorms[6] = {-1};
  * NOTE: poplar::constraint here to make sure x and v are not part of the same
  * memory bank, allowing simultaneous loads (see `ld2x64pace` instruction).
  */
-class [[poplar::constraint(
-    "elem(*x) != elem(*v)")]] QRHouseholderRowUpdateVertex
+class [
+    [poplar::constraint("elem(*x) != elem(*v)")]] QRHouseholderRowUpdateVertex
     : public MultiVertex {
  public:
   using T = float;
@@ -199,9 +200,10 @@ class [[poplar::constraint(
 
     // Set the $TAS register with the proper scale.
     const T s = -scale1[0] * scale2[0];
-    // __builtin_ipu_put_tas(s);
-    __ipu_and_ipumodel_tas tas;
-    tas.put(s);
+    // Basic AMP usage with TAS + axpy instruction.
+    // AMP code using this abstraction is compatible with IPU hw & model.
+    ipu::AMP<T> amp;
+    amp.tas(s);
 
     // Nothing to do in this worker thread.
     if (wstart == wend) {
@@ -220,20 +222,16 @@ class [[poplar::constraint(
     vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step);
     // TODO: use ld2x64pace + tapack instructions.
     for (IndexType idx = 1; idx != wsize; ++idx) {
-      rtmp = tas.f32v2axpy(xin, vin);
-      // rtmp = __builtin_ipu_f32v2axpy(xin, vin);
+      rtmp = amp.axpy(vin, xin);
       // Grouping here seems to help the compiler optimising loads?
       xin = ipu::load_postinc(&ptr_inxdata_f2, ptr_step);
       vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step);
-      rout = tas.f32v2axpy(rtmp, rtmp);
-      // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
+      rout = amp.axpy(rtmp, rtmp);
       ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step);
     }
     // Finish the loop, getting the last computation.
-    // rtmp = __builtin_ipu_f32v2axpy(xin, vin);
-    // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
-    rtmp = tas.f32v2axpy(xin, vin);
-    rout = tas.f32v2axpy(rtmp, rtmp);
+    rtmp = amp.axpy(vin, xin);
+    rout = amp.axpy(rtmp, rtmp);
     ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step);
 
     return true;
diff --git a/tessellate_ipu/core/vertex/tile_small_dot.hpp b/tessellate_ipu/core/vertex/tile_small_dot.hpp
index 0b380fd..7aa6fbf 100644
--- a/tessellate_ipu/core/vertex/tile_small_dot.hpp
+++ b/tessellate_ipu/core/vertex/tile_small_dot.hpp
@@ -72,9 +72,10 @@ inline void axplusby_f32(float a, float b, const float2 *x, const float2 *y,
   // Necessary if using unsigned `nblocks`.
   // __builtin_assume(nblocks < 4096);
   using T2 = float2;
-  // Using TAS register for the scalar `b`.
-  __ipu_and_ipumodel_tas tas;
-  tas.put(b);
+  // Basic AMP usage with TAS + axpy instruction.
+  ipu::AMP<float> amp;
+  amp.tas(b);
+
 
   T2 av = {a, a};
   // Explicit variables passed to inline assembly.