Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix IPU Jacobi eigh algorithm when size % 4 == 2 #50

Merged
merged 1 commit into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tessellate_ipu/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
primitive_clone,
primitive_num_inout_alias_args,
)
from .tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets, make_ipu_vector1d_worker_offsets_and_sizes


def tessellate_ipu_cleanup():
Expand Down
65 changes: 58 additions & 7 deletions tessellate_ipu/core/tile_interpreter_vertex_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,79 @@ def make_num_elements_per_worker(N: int, num_workers: int) -> NDArray[np.int32]:
return num_elements


def make_ipu_vector1d_worker_offsets(
def make_ipu_vector1d_worker_offsets_and_sizes(
    size: int,
    vector_size: int = 2,
    num_workers: int = 6,
    wdtype: DTypeLike = np.uint16,
    allow_overlap: bool = False,
    grain_size: Optional[int] = None,
) -> NDArray[np.int_]:
    """Make worker sizes + offsets for a 1D array workload, i.e. how many
    data vectors per worker thread (with starting offset)?

    The first workers all receive the same "base" number of grains. Whatever
    is left is given to the next worker, whose range is anchored on the end
    of the array: when the leftover is not a whole number of grains, that
    range is rounded up to full grains and therefore overlaps the previous
    worker. All remaining workers keep a zero offset and a zero size.

    Args:
        size: Size of the vector to divide.
        vector_size: Vector size (2: float, 4: half).
        num_workers: Number of workers.
        wdtype: Worklists dtype.
        allow_overlap: Allowing overlap between workers. Makes it easier to
            deal with the remainder term.
        grain_size: Optional grain size. vector_size by default. Minimal size
            per thread.
    Returns:
        (NUM_WORKERS, 2) data offset + size per worker thread
        (column 0: offset, column 1: size).

    NOTE: offsets and sizes expressed in vector size unit!
    """
    grain_size = grain_size or vector_size
    # Number of data vectors in a single grain.
    grain_scale = grain_size // vector_size
    # TODO: support properly odd size.
    assert size % 2 == 0, "Not supporting odd sizing at the moment."
    # Base checks!
    assert grain_size % vector_size == 0
    assert size >= grain_size, f"Requires at least a size of {grain_size}."
    assert (
        size % grain_size == 0 or allow_overlap
    ), f"Requires the size, {size}, divisible by the grain size {grain_size} (or overlap allowed)."

    # Offset+size array to build. Built as int32, cast to `wdtype` on return.
    offset_size_arr = np.zeros((num_workers, 2), dtype=np.int32)

    # Base worksize (in grains) on the first few workers.
    base_worksize: int = math.ceil(size / (grain_size * num_workers))
    num_base_workers = size // (grain_size * base_worksize)
    # Offsets + size
    offset_size_arr[:num_base_workers, 0] = np.arange(num_base_workers) * base_worksize * grain_scale
    offset_size_arr[:num_base_workers, 1] = base_worksize * grain_scale
    if num_base_workers == num_workers:
        return offset_size_arr.astype(wdtype)

    # Remainder term, for the next thread => all which is left, with potential overlap.
    rem_worksize = size - base_worksize * grain_size * num_base_workers
    rem_worksize = math.ceil(rem_worksize / grain_size)
    # Anchor the remainder worker on the end of the array. Integer division
    # keeps the offset an exact integer instead of relying on the implicit
    # float -> int32 truncation done by the array assignment.
    offset_size_arr[num_base_workers, 0] = size // vector_size - rem_worksize * grain_scale
    offset_size_arr[num_base_workers, 1] = rem_worksize * grain_scale
    # Rest already filled with zeros...
    return offset_size_arr.astype(wdtype)


def make_ipu_vector1d_worker_offsets(
size: int,
vector_size: int = 2,
num_workers: int = 6,
wdtype: DTypeLike = np.uint16,
grain_size: Optional[int] = None,
) -> NDArray[np.int_]:
"""Make worker offsets (with additional padding) i.e. how many
data vectors per worker thread?

Args:
size: Size of the vector to divide.
vector_size: Vector size (2: float, 4: half).
num_workers: Number of workers.
wdtype: Worklists dtype.
grain_size: Optional grain size. vector_size by default.
Returns:
(6,) number of data vectors per thread.
(NUM_WORKERS + 1,) data offset per worker thread.
"""
grain_size = grain_size or vector_size
grain_scale = grain_size // vector_size
Expand All @@ -59,9 +112,7 @@ def make_offsets_fn(sizes):
# Base checks!
assert grain_size % vector_size == 0
assert size >= grain_size, f"Requires at least a size of {grain_size}."
assert (
size % grain_size == 0 or allow_overlap
), f"Requires the size, {size}, divisible by the grain size {grain_size}, (or allowing overlap)."
assert size % grain_size == 0, f"Requires the size, {size}, divisible by the grain size {grain_size}."

# Base worksize on the first few workers.
base_worksize: int = math.ceil(size / (grain_size * num_workers))
Expand Down
15 changes: 7 additions & 8 deletions tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,9 +194,8 @@ template <typename T>
inline void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
const T* cs_arr, const T* pcol,
const T* qcol, T* pcol_updated,
T* qcol_updated, unsigned wstart,
unsigned wend) noexcept {
const unsigned wsize = (wend - wstart) / 2;
T* qcol_updated, const unsigned wstart,
const unsigned wsize) noexcept {
// Necessary for generating `rpt` loop.
__builtin_assume(wsize < 4096);
using T2 = float2;
Expand Down Expand Up @@ -324,7 +323,7 @@ class JacobiUpdateSecondStep : public MultiVertex {
rotset_idx_ignored; // (1,) index in rotset to ignore.

Input<Vector<IndexType, poplar::VectorLayout::ONE_PTR>>
worker_offsets; // (7,) threads work size + 1.
worker_offsets_sizes; // (6, 2) flattened: [2 * wid] = offset, [2 * wid + 1] = size

Input<Vector<T, poplar::VectorLayout::ONE_PTR, 8>> pcol; // (N+2,) p column
Input<Vector<T, poplar::VectorLayout::ONE_PTR, 8>> qcol; // (N+2,) q column
Expand All @@ -339,9 +338,9 @@ class JacobiUpdateSecondStep : public MultiVertex {
bool compute(unsigned wid) {
// Size of the index prefix in pcol and qcol.
constexpr unsigned INDEX_PREFIX = 2;
// Worker load: start + end vectorized indexes.
const unsigned wstart = worker_offsets[wid];
const unsigned wend = worker_offsets[wid + 1];
// Worker load: start + size vectorized indexes.
const unsigned wstart = worker_offsets_sizes[2 * wid];
const unsigned wsize = worker_offsets_sizes[2 * wid + 1];

// Forward pq indices.
pcol_updated[0] = pcol[0];
Expand All @@ -359,7 +358,7 @@ class JacobiUpdateSecondStep : public MultiVertex {

jacobi_update_second_step(rotset_sorted_arr.data(), cs_arr.data(), pcol_ptr,
qcol_ptr, pcol_updated_ptr, qcol_updated_ptr,
wstart, wend);
wstart, wsize);
return true;
}
};
Expand Down
3 changes: 1 addition & 2 deletions tessellate_ipu/lax/tile_lax_small_dot.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
import numpy as np
from jax.core import ShapedArray

from tessellate_ipu.core import declare_ipu_tile_primitive
from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
from tessellate_ipu.core import declare_ipu_tile_primitive, make_ipu_vector1d_worker_offsets


def get_small_dot_vertex_gp_filename() -> str:
Expand Down
2 changes: 1 addition & 1 deletion tessellate_ipu/linalg/tile_linalg_hessenberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
tile_put_replicated,
tile_put_sharded,
)
from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
from tessellate_ipu.core import make_ipu_vector1d_worker_offsets

from .tile_linalg_qr import dot_product1d_p

Expand Down
9 changes: 5 additions & 4 deletions tessellate_ipu/linalg/tile_linalg_jacobi.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
tile_put_replicated,
tile_put_sharded,
)
from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
from tessellate_ipu.core import make_ipu_vector1d_worker_offsets, make_ipu_vector1d_worker_offsets_and_sizes
from tessellate_ipu.lax import tile_fill
from tessellate_ipu.utils import NDArray

Expand Down Expand Up @@ -71,10 +71,11 @@ def get_jacobi_vertex_gp_filename() -> str:
outputs={"cs_arr": 0, "pcol_updated": 3, "qcol_updated": 4},
constants={
# NOTE: using grain_size=4 because of partial loop unrolling
# TODO: support overlap properly.
"worker_offsets": lambda inavals, *_: make_ipu_vector1d_worker_offsets(
inavals[3].size - INDEX_PREFIX, vector_size=2, wdtype=np.uint16, allow_overlap=False, grain_size=4
# Rescale the size to be directly in grain size unit.
"worker_offsets_sizes": lambda inavals, *_: make_ipu_vector1d_worker_offsets_and_sizes(
inavals[3].size - INDEX_PREFIX, vector_size=2, grain_size=4, wdtype=np.uint16, allow_overlap=True
)
// np.array([[1, 2]], dtype=np.uint16)
},
gp_filename=get_jacobi_vertex_gp_filename(),
perf_estimate=200,
Expand Down
2 changes: 1 addition & 1 deletion tessellate_ipu/linalg/tile_linalg_qr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from jax.core import ShapedArray

from tessellate_ipu import TileShardedArray, create_ipu_tile_primitive, tile_map, tile_put_replicated, tile_put_sharded
from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
from tessellate_ipu.core import make_ipu_vector1d_worker_offsets

Array = Any

Expand Down
20 changes: 20 additions & 0 deletions tests/core/test_tile_interpreter_vertex_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from tessellate_ipu.core.tile_interpreter_vertex_utils import (
make_ipu_vector1d_worker_offsets,
make_ipu_vector1d_worker_offsets_and_sizes,
make_num_elements_per_worker,
)

Expand Down Expand Up @@ -45,3 +46,22 @@ def test__tile_vertex_utils__make_num_elements_per_worker(self, N, expected_num_
num_elements = make_num_elements_per_worker(N, num_workers)
assert np.sum(num_elements) == N
npt.assert_array_equal(num_elements, expected_num_elements)

@parameterized.parameters(
    {"N": 4, "expected_offsets": [0, 2, 0, 0, 0, 0], "expected_sizes": [2, 0, 0, 0, 0, 0]},
    {"N": 6, "expected_offsets": [0, 1, 0, 0, 0, 0], "expected_sizes": [2, 2, 0, 0, 0, 0]},
    {"N": 24, "expected_offsets": [0, 2, 4, 6, 8, 10], "expected_sizes": [2, 2, 2, 2, 2, 2]},
    {"N": 30, "expected_offsets": [0, 4, 8, 11, 0, 0], "expected_sizes": [4, 4, 4, 4, 0, 0]},
    {"N": 128, "expected_offsets": [0, 12, 24, 36, 48, 60], "expected_sizes": [12, 12, 12, 12, 12, 4]},
)
def test__tile_vertex_utils__make_ipu_vector1d_worker_offsets_and_sizes(self, N, expected_offsets, expected_sizes):
    """Check per-worker (offset, size) pairs for float vectors with a grain of 4 and overlap allowed."""
    num_workers = 6
    worklist = make_ipu_vector1d_worker_offsets_and_sizes(
        N,
        vector_size=2,
        num_workers=num_workers,
        wdtype=np.int16,
        allow_overlap=True,
        grain_size=4,
    )
    # One (offset, size) row per worker, in the requested worklist dtype.
    assert worklist.shape == (num_workers, 2)
    assert worklist.dtype == np.int16
    offsets, sizes = worklist[:, 0], worklist[:, 1]
    npt.assert_array_equal(offsets, expected_offsets)
    npt.assert_array_equal(sizes, expected_sizes)
7 changes: 5 additions & 2 deletions tests/linalg/test_tile_linalg_jacobi.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,11 @@ def test__jacobi_eigh__single_iteration(self):
npt.assert_array_almost_equal(np.linalg.eigh(A)[0], np.linalg.eigh(x)[0], decimal=5)

@unittest.skipUnless(ipu_num_tiles >= 16, "Requires IPU with 16 tiles")
def test__jacobi_eigh_raw__proper_eigh_result(self):
N = 12
@parameterized.parameters(
{"N": 10}, # testing Jacobi 2nd update where grain size=4
{"N": 12},
)
def test__jacobi_eigh_raw__proper_eigh_result(self, N):
x = np.random.randn(N, N).astype(np.float32)
x = (x + x.T) / 2.0

Expand Down