graphcore-research · balancap · Oct 19, 2023 · Oct 18, 2023
diff --git a/tessellate_ipu/core/__init__.py b/tessellate_ipu/core/__init__.py
@@ -45,6 +45,7 @@
     primitive_clone,
     primitive_num_inout_alias_args,
 )
+from .tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets, make_ipu_vector1d_worker_offsets_and_sizes
 
 
 def tessellate_ipu_cleanup():

diff --git a/tessellate_ipu/core/tile_interpreter_vertex_utils.py b/tessellate_ipu/core/tile_interpreter_vertex_utils.py
@@ -25,26 +25,79 @@ def make_num_elements_per_worker(N: int, num_workers: int) -> NDArray[np.int32]:
     return num_elements
 
 
-def make_ipu_vector1d_worker_offsets(
+def make_ipu_vector1d_worker_offsets_and_sizes(
     size: int,
     vector_size: int = 2,
     num_workers: int = 6,
     wdtype: DTypeLike = np.uint16,
     allow_overlap: bool = False,
     grain_size: Optional[int] = None,
 ) -> NDArray[np.int_]:
-    """Make worker sizes/offsets for a 1D array workload, i.e. how many
-    data vectors per worker thread?
+    """Make worker sizes + offsets for a 1D array workload, i.e. how many
+    data vectors per worker thread (with starting offset)?
 
     Args:
         size: Size of the vector to divide.
         vector_size: Vector size (2: float, 4: half).
         num_workers: Number of workers.
         wdtype: Worklists dtype.
         allow_overlap: Allowing overlap between workers. Make it easier to deal with remainer term.
+        grain_size: Optional grain size. vector_size by default. Minimal size per thread.
+    Returns:
+        (NUM_WORKERS, 2) data offset + size per worker thread.
+
+            NOTE: offsets and sizes expressed in vector size unit!
+    """
+    grain_size = grain_size or vector_size
+    grain_scale = grain_size // vector_size
+    # TODO: properly support odd sizes.
+    assert size % 2 == 0, "Not supporting odd sizing at the moment."
+    # Base checks!
+    assert grain_size % vector_size == 0
+    assert size >= grain_size, f"Requires at least a size of {grain_size}."
+    assert (
+        size % grain_size == 0 or allow_overlap
+    ), f"Requires the size, {size}, divisible by the grain size {grain_size} (or overlap allowed)."
+
+    # Offset+size array to build.
+    offset_size_arr = np.zeros((num_workers, 2), dtype=np.int32)
+
+    # Base worksize on the first few workers.
+    base_worksize: int = math.ceil(size / (grain_size * num_workers))
+    num_base_workers = size // (grain_size * base_worksize)
+    # Offsets + size
+    offset_size_arr[:num_base_workers, 0] = np.arange(num_base_workers) * base_worksize * grain_scale
+    offset_size_arr[:num_base_workers, 1] = base_worksize * grain_scale
+    if num_base_workers == num_workers:
+        return offset_size_arr.astype(wdtype)
+
+    # Remainder term, for the next thread => everything that is left, with potential overlap.
+    rem_worksize = size - base_worksize * grain_size * num_base_workers
+    rem_worksize = math.ceil(rem_worksize / grain_size)
+    offset_size_arr[num_base_workers, 0] = size / vector_size - rem_worksize * grain_scale
+    offset_size_arr[num_base_workers, 1] = rem_worksize * grain_scale
+    # Rest already filled with zeros...
+    return offset_size_arr.astype(wdtype)
+
+
+def make_ipu_vector1d_worker_offsets(
+    size: int,
+    vector_size: int = 2,
+    num_workers: int = 6,
+    wdtype: DTypeLike = np.uint16,
+    grain_size: Optional[int] = None,
+) -> NDArray[np.int_]:
+    """Make worker offsets for a 1D array workload, i.e. at which data
+    vector does each worker thread start (with one additional trailing entry)?
+
+    Args:
+        size: Size of the vector to divide.
+        vector_size: Vector size (2: float, 4: half).
+        num_workers: Number of workers.
+        wdtype: Worklists dtype.
         grain_size: Optional grain size. vector_size by default.
     Returns:
-        (6,) number of data vectors per thread.
+        (NUM_WORKERS + 1,) data offset per worker thread.
     """
     grain_size = grain_size or vector_size
     grain_scale = grain_size // vector_size
@@ -59,9 +112,7 @@ def make_offsets_fn(sizes):
     # Base checks!
     assert grain_size % vector_size == 0
     assert size >= grain_size, f"Requires at least a size of {grain_size}."
-    assert (
-        size % grain_size == 0 or allow_overlap
-    ), f"Requires the size, {size}, divisible by the grain size {grain_size}, (or allowing overlap)."
+    assert size % grain_size == 0, f"Requires the size, {size}, divisible by the grain size {grain_size}."
 
     # Base worksize on the first few workers.
     base_worksize: int = math.ceil(size / (grain_size * num_workers))

diff --git a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
@@ -194,9 +194,8 @@ template <typename T>
 inline void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
                                       const T* cs_arr, const T* pcol,
                                       const T* qcol, T* pcol_updated,
-                                      T* qcol_updated, unsigned wstart,
-                                      unsigned wend) noexcept {
-  const unsigned wsize = (wend - wstart) / 2;
+                                      T* qcol_updated, const unsigned wstart,
+                                      const unsigned wsize) noexcept {
   // Necessary for generating `rpt` loop.
   __builtin_assume(wsize < 4096);
   using T2 = float2;
@@ -324,7 +323,7 @@ class JacobiUpdateSecondStep : public MultiVertex {
       rotset_idx_ignored;  // (1,) index in rotset to ignore.
 
   Input<Vector<IndexType, poplar::VectorLayout::ONE_PTR>>
-      worker_offsets;  // (7,) threads work size + 1.
+      worker_offsets_sizes;  // (6, 2) worker (offset, size) pairs.
 
   Input<Vector<T, poplar::VectorLayout::ONE_PTR, 8>> pcol;  // (N+2,) p column
   Input<Vector<T, poplar::VectorLayout::ONE_PTR, 8>> qcol;  // (N+2,) q column
@@ -339,9 +338,9 @@ class JacobiUpdateSecondStep : public MultiVertex {
   bool compute(unsigned wid) {
     // Size of the index prefix in pcol and qcol.
     constexpr unsigned INDEX_PREFIX = 2;
-    // Worker load: start + end vectorized indexes.
-    const unsigned wstart = worker_offsets[wid];
-    const unsigned wend = worker_offsets[wid + 1];
+    // Worker load: start + size vectorized indexes.
+    const unsigned wstart = worker_offsets_sizes[2 * wid];
+    const unsigned wsize = worker_offsets_sizes[2 * wid + 1];
 
     // Forward pq indices.
     pcol_updated[0] = pcol[0];
@@ -359,7 +358,7 @@ class JacobiUpdateSecondStep : public MultiVertex {
 
     jacobi_update_second_step(rotset_sorted_arr.data(), cs_arr.data(), pcol_ptr,
                               qcol_ptr, pcol_updated_ptr, qcol_updated_ptr,
-                              wstart, wend);
+                              wstart, wsize);
     return true;
   }
 };

diff --git a/tessellate_ipu/lax/tile_lax_small_dot.py b/tessellate_ipu/lax/tile_lax_small_dot.py
@@ -5,8 +5,7 @@
 import numpy as np
 from jax.core import ShapedArray
 
-from tessellate_ipu.core import declare_ipu_tile_primitive
-from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
+from tessellate_ipu.core import declare_ipu_tile_primitive, make_ipu_vector1d_worker_offsets
 
 
 def get_small_dot_vertex_gp_filename() -> str:

diff --git a/tessellate_ipu/linalg/tile_linalg_hessenberg.py b/tessellate_ipu/linalg/tile_linalg_hessenberg.py
@@ -15,7 +15,7 @@
     tile_put_replicated,
     tile_put_sharded,
 )
-from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
+from tessellate_ipu.core import make_ipu_vector1d_worker_offsets
 
 from .tile_linalg_qr import dot_product1d_p
 

diff --git a/tessellate_ipu/linalg/tile_linalg_jacobi.py b/tessellate_ipu/linalg/tile_linalg_jacobi.py
@@ -18,7 +18,7 @@
     tile_put_replicated,
     tile_put_sharded,
 )
-from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
+from tessellate_ipu.core import make_ipu_vector1d_worker_offsets, make_ipu_vector1d_worker_offsets_and_sizes
 from tessellate_ipu.lax import tile_fill
 from tessellate_ipu.utils import NDArray
 
@@ -71,10 +71,11 @@ def get_jacobi_vertex_gp_filename() -> str:
     outputs={"cs_arr": 0, "pcol_updated": 3, "qcol_updated": 4},
     constants={
         # NOTE: using grain_size=4 because of partial loop unrolling
-        # TODO: support overlap properly.
-        "worker_offsets": lambda inavals, *_: make_ipu_vector1d_worker_offsets(
-            inavals[3].size - INDEX_PREFIX, vector_size=2, wdtype=np.uint16, allow_overlap=False, grain_size=4
+        # Rescale the size column to be directly in grain size units (offsets are left unchanged).
+        "worker_offsets_sizes": lambda inavals, *_: make_ipu_vector1d_worker_offsets_and_sizes(
+            inavals[3].size - INDEX_PREFIX, vector_size=2, grain_size=4, wdtype=np.uint16, allow_overlap=True
         )
+        // np.array([[1, 2]], dtype=np.uint16)
     },
     gp_filename=get_jacobi_vertex_gp_filename(),
     perf_estimate=200,

diff --git a/tessellate_ipu/linalg/tile_linalg_qr.py b/tessellate_ipu/linalg/tile_linalg_qr.py
@@ -7,7 +7,7 @@
 from jax.core import ShapedArray
 
 from tessellate_ipu import TileShardedArray, create_ipu_tile_primitive, tile_map, tile_put_replicated, tile_put_sharded
-from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
+from tessellate_ipu.core import make_ipu_vector1d_worker_offsets
 
 Array = Any
 

diff --git a/tests/core/test_tile_interpreter_vertex_utils.py b/tests/core/test_tile_interpreter_vertex_utils.py
@@ -7,6 +7,7 @@
 
 from tessellate_ipu.core.tile_interpreter_vertex_utils import (
     make_ipu_vector1d_worker_offsets,
+    make_ipu_vector1d_worker_offsets_and_sizes,
     make_num_elements_per_worker,
 )
 
@@ -45,3 +46,22 @@ def test__tile_vertex_utils__make_num_elements_per_worker(self, N, expected_num_
         num_elements = make_num_elements_per_worker(N, num_workers)
         assert np.sum(num_elements) == N
         npt.assert_array_equal(num_elements, expected_num_elements)
+
+    @parameterized.parameters(
+        {"N": 4, "expected_offsets": [0, 2, 0, 0, 0, 0], "expected_sizes": [2, 0, 0, 0, 0, 0]},
+        {"N": 6, "expected_offsets": [0, 1, 0, 0, 0, 0], "expected_sizes": [2, 2, 0, 0, 0, 0]},
+        {"N": 24, "expected_offsets": [0, 2, 4, 6, 8, 10], "expected_sizes": [2, 2, 2, 2, 2, 2]},
+        {"N": 30, "expected_offsets": [0, 4, 8, 11, 0, 0], "expected_sizes": [4, 4, 4, 4, 0, 0]},
+        {"N": 128, "expected_offsets": [0, 12, 24, 36, 48, 60], "expected_sizes": [12, 12, 12, 12, 12, 4]},
+    )
+    def test__tile_vertex_utils__make_ipu_vector1d_worker_offsets_and_sizes(self, N, expected_offsets, expected_sizes):
+        vector_size = 2
+        grain_size = 4
+        num_workers = 6
+        woffsets_sizes = make_ipu_vector1d_worker_offsets_and_sizes(
+            N, vector_size, num_workers=num_workers, wdtype=np.int16, grain_size=grain_size, allow_overlap=True
+        )
+        assert woffsets_sizes.shape == (num_workers, 2)
+        assert woffsets_sizes.dtype == np.int16
+        npt.assert_array_equal(woffsets_sizes[:, 0], expected_offsets)
+        npt.assert_array_equal(woffsets_sizes[:, 1], expected_sizes)
diff --git a/tests/linalg/test_tile_linalg_jacobi.py b/tests/linalg/test_tile_linalg_jacobi.py
@@ -177,8 +177,11 @@ def test__jacobi_eigh__single_iteration(self):
         npt.assert_array_almost_equal(np.linalg.eigh(A)[0], np.linalg.eigh(x)[0], decimal=5)
 
     @unittest.skipUnless(ipu_num_tiles >= 16, "Requires IPU with 16 tiles")
-    def test__jacobi_eigh_raw__proper_eigh_result(self):
-        N = 12
+    @parameterized.parameters(
+        {"N": 10},  # N not a multiple of 4: testing Jacobi 2nd update where grain size=4
+        {"N": 12},
+    )
+    def test__jacobi_eigh_raw__proper_eigh_result(self, N):
         x = np.random.randn(N, N).astype(np.float32)
         x = (x + x.T) / 2.0