From b4f3cc6c9ad39dbef33f6e6ca29298f7cc002a88 Mon Sep 17 00:00:00 2001 From: Paul Balanca Date: Fri, 20 Oct 2023 09:38:13 +0000 Subject: [PATCH] Remove TAS register + axpy intrinsic wrapper class Replacing the use with `ipu::AMP`, which is much more general and should be able to properly model any IPU AMP unit instruction. --- .../core/vertex/intrinsics_utils.hpp | 53 ++----------------- .../core/vertex/tile_hessenberg_vertex.cpp | 20 ++++--- .../core/vertex/tile_jacobi_vertex.cpp | 2 +- tessellate_ipu/core/vertex/tile_qr_vertex.cpp | 24 ++++----- tessellate_ipu/core/vertex/tile_small_dot.hpp | 7 +-- 5 files changed, 28 insertions(+), 78 deletions(-) diff --git a/tessellate_ipu/core/vertex/intrinsics_utils.hpp b/tessellate_ipu/core/vertex/intrinsics_utils.hpp index 7860ac2..f089cdd 100644 --- a/tessellate_ipu/core/vertex/intrinsics_utils.hpp +++ b/tessellate_ipu/core/vertex/intrinsics_utils.hpp @@ -112,13 +112,6 @@ ALWAYS_INLINE float ld32(const T* address, unsigned offset) { return result; } -struct __ipu_and_ipumodel_tas { - void put(float v) { __builtin_ipu_put_tas(v); } - float2 f32v2axpy(float2 const& x, float2 const& y) { - return __builtin_ipu_f32v2axpy(x, y); - } -}; - #else #include @@ -152,47 +145,7 @@ IpuVector fma(IpuVector const& x, IpuVector const& y, } // namespace ipu -// Reflect IPU's AXPY semantics in a way that is IPUModel compatible -// IPU-only usage: -// __builtin_ipu_put_tas(v); -// z_prev = __builtin_ipu_f32v2axpy(x, y) -// -// IPUModel-compatible usage: -// __ipu_and_ipumodel_tas tas; -// tas.put(v); -// z_prev = tas.f32v2axpy(x, y) -// -// https://docs.graphcore.ai/projects/poplar-api/en/latest/ipu_intrinsics/ipu_builtins.html#_CPPv423__builtin_ipu_f32v2axpy6float26float2 -struct __ipu_and_ipumodel_tas { - float tas; - float2 prev; - - __ipu_and_ipumodel_tas() : tas{0}, prev{0, 0} {} - - void put(float v) { tas = v; } - - float2 f32v2axpy(float2 const& x, float2 const& y) { - const auto res = prev; - prev = float2{ - // TODO: understand 
ordering!? - // tas * x[0] + y[0], - // tas * x[1] + y[1], - tas * y[0] + x[0], - tas * y[1] + x[1], - }; - return res; - } -}; - -// And give useful error messages when people port from IPU to IPUModel, e.g. -/* clang-format off */ // need these error messages on one line -/* -/workspaces/tessellate-ipu/tessellate/tile/vertex/intrinsics_utils.hpp:166:3: error: static_assert failed due to requirement '__ipu_false>()': *** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel. - static_assert(__ipu_false(), "*** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel."); - ^ ~~~~~~~~~~~~~~~~ -/workspaces/tessellate-ipu/tessellate/tile/vertex/tile_qr_vertex.cpp:231:12: note: in instantiation of function template specialization '__builtin_ipu_f32v2axpy>' requested here - rout = __builtin_ipu_f32v2axpy(rtmp, rtmp); -*/ +// And give useful error messages when people port from IPU to IPUModel. template constexpr bool __ipu_false() { return !std::is_same::value; @@ -200,12 +153,12 @@ constexpr bool __ipu_false() { template void __builtin_ipu_put_tas(T v) { - static_assert(__ipu_false(), "*** Replace __builtin_ipu_put_tas with __ipu_and_ipumodel_tas for TAS handling on IPUModel."); + static_assert(__ipu_false(), "*** Please use `ipu::AMP` class for TAS handling on IPUModel."); } template T __builtin_ipu_f32v2axpy(T const& x, T const& y) { - static_assert(__ipu_false(), "*** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel."); + static_assert(__ipu_false(), "*** Please use `ipu::AMP::axpy` for `f32v2axpy` intrinsic on IPUModel."); return T{}; } // clang-format on diff --git a/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp b/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp index 08249a0..253fc53 100644 --- a/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp +++ b/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp @@ -3,6 +3,7 @@ #include #include 
"intrinsics_utils.hpp" +#include "ipu_amp.hpp" using namespace poplar; @@ -162,9 +163,10 @@ class [[poplar::constraint( // Set the $TAS register with the proper scale. const T s = -scale1[0] * scale2[0]; - // __builtin_ipu_put_tas(s); - __ipu_and_ipumodel_tas tas; - tas.put(s); + // Basic AMP usage with TAS + axpy instruction. + // AMP code using this abstraction is compatible with IPU hw & model. + ipu::AMP amp; + amp.tas(s); // Nothing to do in this worker thread. if (wstart == wend) { @@ -183,20 +185,16 @@ class [[poplar::constraint( vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step); // TODO: use ld2x64pace + tapack instructions. for (IndexType idx = 1; idx != wsize; ++idx) { - rtmp = tas.f32v2axpy(xin, vin); - // rtmp = __builtin_ipu_f32v2axpy(xin, vin); + rtmp = amp.axpy(vin, xin); // Grouping here seems to help the compiler optimising loads? xin = ipu::load_postinc(&ptr_inxdata_f2, ptr_step); vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step); - rout = tas.f32v2axpy(rtmp, rtmp); - // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp); + rout = amp.axpy(rtmp, rtmp); ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step); } // Finish the loop, getting the last computation. - // rtmp = __builtin_ipu_f32v2axpy(xin, vin); - // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp); - rtmp = tas.f32v2axpy(xin, vin); - rout = tas.f32v2axpy(rtmp, rtmp); + rtmp = amp.axpy(vin, xin); + rout = amp.axpy(rtmp, rtmp); ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step); return true; diff --git a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp index b7aced0..47f9800 100644 --- a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp +++ b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp @@ -391,7 +391,7 @@ void jacob_update_eigenvectors(const T* vpcol, const T* vqcol, T* vpcol_updated, * See: Gene H. Golub, Charles F. Van Loan, MATRIX COMPUTATIONS, 3rd edition, * Johns Hopkins Chapter 8. 
*/ -class [[poplar::constraint( +class [[poplar::constraint( "elem(*vpcol) != elem(*vpcol_out)", "elem(*vqcol) != elem(*vqcol_out)")]] JacobiUpdateEigenvectors : public MultiVertex { diff --git a/tessellate_ipu/core/vertex/tile_qr_vertex.cpp b/tessellate_ipu/core/vertex/tile_qr_vertex.cpp index f6aa71a..51a1962 100644 --- a/tessellate_ipu/core/vertex/tile_qr_vertex.cpp +++ b/tessellate_ipu/core/vertex/tile_qr_vertex.cpp @@ -3,6 +3,7 @@ #include #include "intrinsics_utils.hpp" +#include "ipu_amp.hpp" using namespace poplar; @@ -165,8 +166,8 @@ float QRCorrectionVectorVertex::shared_partial_sqnorms[6] = {-1}; * NOTE: poplar::constraint here to make sure x and v are not part of the same * memory bank, allowing simultaneous loads (see `ld2x64pace` instruction). */ -class [[poplar::constraint( - "elem(*x) != elem(*v)")]] QRHouseholderRowUpdateVertex +class [ + [poplar::constraint("elem(*x) != elem(*v)")]] QRHouseholderRowUpdateVertex : public MultiVertex { public: using T = float; @@ -199,9 +200,10 @@ class [[poplar::constraint( // Set the $TAS register with the proper scale. const T s = -scale1[0] * scale2[0]; - // __builtin_ipu_put_tas(s); - __ipu_and_ipumodel_tas tas; - tas.put(s); + // Basic AMP usage with TAS + axpy instruction. + // AMP code using this abstraction is compatible with IPU hw & model. + ipu::AMP amp; + amp.tas(s); // Nothing to do in this worker thread. if (wstart == wend) { @@ -220,20 +222,16 @@ class [[poplar::constraint( vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step); // TODO: use ld2x64pace + tapack instructions. for (IndexType idx = 1; idx != wsize; ++idx) { - rtmp = tas.f32v2axpy(xin, vin); - // rtmp = __builtin_ipu_f32v2axpy(xin, vin); + rtmp = amp.axpy(vin, xin); // Grouping here seems to help the compiler optimising loads? 
xin = ipu::load_postinc(&ptr_inxdata_f2, ptr_step); vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step); - rout = tas.f32v2axpy(rtmp, rtmp); - // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp); + rout = amp.axpy(rtmp, rtmp); ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step); } // Finish the loop, getting the last computation. - // rtmp = __builtin_ipu_f32v2axpy(xin, vin); - // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp); - rtmp = tas.f32v2axpy(xin, vin); - rout = tas.f32v2axpy(rtmp, rtmp); + rtmp = amp.axpy(vin, xin); + rout = amp.axpy(rtmp, rtmp); ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step); return true; diff --git a/tessellate_ipu/core/vertex/tile_small_dot.hpp b/tessellate_ipu/core/vertex/tile_small_dot.hpp index 0b380fd..7aa6fbf 100644 --- a/tessellate_ipu/core/vertex/tile_small_dot.hpp +++ b/tessellate_ipu/core/vertex/tile_small_dot.hpp @@ -72,9 +72,10 @@ inline void axplusby_f32(float a, float b, const float2 *x, const float2 *y, // Necessary if using unsigned `nblocks`. // __builtin_assume(nblocks < 4096); using T2 = float2; - // Using TAS register for the scalar `b`. - __ipu_and_ipumodel_tas tas; - tas.put(b); + // Basic AMP usage with TAS + axpy instruction. + ipu::AMP amp; + amp.tas(b); + T2 av = {a, a}; // Explicit variables passed to inline assembly.