Skip to content

Commit

Permalink
Completing the API documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
JOOpdenhoevel committed Sep 2, 2021
1 parent 18726b5 commit 9c6a86e
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 83 deletions.
34 changes: 32 additions & 2 deletions StencilStream/MonotileExecutor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,50 @@ template <typename T, uindex_t stencil_radius, typename TransFunc, uindex_t pipe
/**
* \brief An executor that follows \ref monotile.
*
* The feature that distincts this executor from \ref StencilExecutor is that it works with exactly
* one tile. This means the grid range may not exceed the set tile range, but it uses less resources
* and time per kernel execution.
*
* \tparam T The cell type.
* \tparam stencil_radius The radius of the stencil buffer supplied to the transition function.
* \tparam TransFunc The type of the transition function.
* \tparam pipeline_length The number of hardware execution stages per kernel. Must be at least 1.
* Defaults to 1.
* \tparam tile_width The number of columns in a tile and maximum number of columns in a grid.
* Defaults to 1024.
* \tparam tile_height The number of rows in a tile and maximum number of rows in a grid. Defaults
* to 1024.
*/
class MonotileExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc> {
public:
static constexpr uindex_t burst_length = std::min<uindex_t>(1, burst_size / sizeof(T));
static constexpr uindex_t halo_radius = stencil_radius * pipeline_length;
/**
* \brief Shorthand for the parent class.
*/
using Parent = SingleQueueExecutor<T, stencil_radius, TransFunc>;

/**
* \brief Create a new executor.
*
* \param halo_value The value of cells in the grid halo.
* \param trans_func An instance of the transition function type.
*/
MonotileExecutor(T halo_value, TransFunc trans_func)
: Parent(halo_value, trans_func), tile_buffer(cl::sycl::range<2>(tile_width, tile_height)) {
auto ac = tile_buffer.template get_access<cl::sycl::access::mode::discard_write>();
ac[0][0] = halo_value;
}

/**
* \brief Set the internal state of the grid.
*
* This will copy the contents of the buffer to an internal representation. The buffer may be
* used for other purposes later. It must not reset the generation index. The range of the input
* buffer will be used as the new grid range.
*
* \throws std::range_error Thrown if the number of width or height of the buffer exceeds the
* set width and height of the tile. \param input_buffer The source buffer of the new grid
* state.
*/
void set_input(cl::sycl::buffer<T, 2> input_buffer) override {
if (input_buffer.get_range()[0] > tile_width && input_buffer.get_range()[1] > tile_height) {
throw std::range_error("The grid is bigger than the tile. The monotile architecture "
Expand Down
37 changes: 27 additions & 10 deletions StencilStream/RuntimeSample.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,7 @@

namespace stencil {
/**
* \brief Collection of SYCL events from a \ref StencilExecutor.run invocation.
*
* It's interface might be extended in the future to support more in-depth analysis. Now, it can
* analyse the runtime of a single execution kernel invocation as well as the total runtime of all
* passes.
* \brief Collection of runtime performance information for a \ref SingleQueueExecutor.
*/
class RuntimeSample {
public:
Expand All @@ -37,34 +33,55 @@ class RuntimeSample {
*/
RuntimeSample() : makespan(0.0), n_passes(0.0) {}

/**
* \brief Retrieve the starting timestamp of the event, in seconds.
*
* This will fail with an exception if the event's queue has not been configured to collect
* runtime information.
*/
static double start_of_event(cl::sycl::event event) {
return double(event.get_profiling_info<cl::sycl::info::event_profiling::command_start>()) /
timesteps_per_second;
}

/**
* \brief Retrieve the ending timestamp of the event, in seconds.
*
* This will fail with an exception if the event's queue has not been configured to collect
* runtime information.
*/
static double end_of_event(cl::sycl::event event) {
return double(event.get_profiling_info<cl::sycl::info::event_profiling::command_end>()) /
timesteps_per_second;
}

/**
* \brief Retrieve the runtime/makespan of the event, in seconds.
*
* This will fail with an exception if the event's queue has not been configured to collect
* runtime information.
*/
static double runtime_of_event(cl::sycl::event event) {
return end_of_event(event) - start_of_event(event);
}

/**
* \brief Add the runtime/makespan of a single pass over the grid to the database.
*/
void add_pass(double pass_runtime) {
makespan += pass_runtime;
n_passes += 1.0;
}

/**
* \brief Add the event of a single pass over the grid to the database.
*/
void add_pass(cl::sycl::event event) { add_pass(runtime_of_event(event)); }

/**
* \brief Calculate the total runtime of all passes.
*
* This iterates through all known events, finds the earliest start and latest end and returns
* the time difference between them.
* \brief Get the makespan of all grid passes.
*
* \return The total wall time in seconds it took to execute all passes.
* \return The makespan in seconds it took to execute all passes.
*/
double get_total_runtime() { return makespan; }

Expand Down
82 changes: 27 additions & 55 deletions StencilStream/StencilExecutor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,54 +24,44 @@

namespace stencil {
/**
* \brief Execution coordinator.
* \brief The default stencil executor.
*
* The `StencilExecutor` binds the different parts of StencilStream together and provides a unified
* interface for applications. Users create a stencil executor, configure it and then use it to run
* the payload computations. It has multiple logical attributes that can be configured:
* Unlike \ref MonotileExecutor, this executors supports any grid range by tiling the grid as
* described in \ref tiling, at the cost of complexer IO and a computational overhead for every
* tile.
*
* ### Grid
* This executor is called `StencilExecutor` since was the first one in StencilStream 2.x. A better
* name for it would be `TilingExecutor`.
*
* The grid is the logical array of cells, set with \ref StencilExecutor.set_input A stencil
* executor does not work in place and a buffer used to initialize the grid can be used for other
* tasks afterwards. The \ref StencilExecutor.run method alters the state of the grid and the grid
* can be copied back to a given buffer using \ref StencilExecutor.copy_output.
*
* ### SYCL queue
*
* The queue provides the OpenCL platform, device, context and queue to execute the kernels. A
* stencil executor does not have a queue when constructed and \ref StencilExecutor.run tries to
* select the FPGA emulation device if no queue has been configured yet. \ref
* StencilExecutor.set_queue is used to directly set the queue, but \ref
* StencilExecutor.select_emulator and \ref StencilExecutor.select_fpga can be used to automatically
* configure the FPGA emulator or the FPGA, respectively.
*
* ### Transition Function
*
* A stencil executor stores an instance of the transition function since it may require some
* configuration and runtime-dynamic parameters too. An instance is required for the initialization,
* but it may be replaced at any time with \ref StencilExecutor.set_trans_func.
*
* ### Generation Index
*
* This is the generation index of the current state of the grid. \ref StencilExecutor.run updates
* and therefore, it can be ignored in most instances. However, it can be reset if a transition
* function needs it.
*
* \tparam T Cell value type.
* \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the
* central cell. Must be at least 1. \tparam TransFunc An invocable type that maps a \ref Stencil to
* the next generation of the stencil's central cell. \tparam pipeline_length The number of hardware
* execution stages. Must be at least 1. Defaults to 1. \tparam tile_width The number of columns in
* a tile. Defaults to 1024. \tparam tile_height The number of rows in a tile. Defaults to 1024.
* \tparam T The cell type.
* \tparam stencil_radius The radius of the stencil buffer supplied to the transition function.
* \tparam TransFunc The type of the transition function.
* \tparam pipeline_length The number of hardware execution stages per kernel. Must be at least 1.
* Defaults to 1.
* \tparam tile_width The number of columns in a tile and maximum number of columns in a grid.
* Defaults to 1024.
* \tparam tile_height The number of rows in a tile and maximum number of rows in a grid. Defaults
* to 1024.
* \tparam burst_size The number of bytes to load/store in one burst. Defaults to 1024.
*/
template <typename T, uindex_t stencil_radius, typename TransFunc, uindex_t pipeline_length = 1,
uindex_t tile_width = 1024, uindex_t tile_height = 1024, uindex_t burst_size = 1024>
class StencilExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc> {
public:
/**
* \brief The number of cells that can be transfered in a single burst.
*/
static constexpr uindex_t burst_length = std::min<uindex_t>(1, burst_size / sizeof(T));

/**
* \brief The number of cells that have be added to the tile in every direction to form the
* complete input.
*/
static constexpr uindex_t halo_radius = stencil_radius * pipeline_length;

/**
* \brief Shorthand for the parent class.
*/
using Parent = SingleQueueExecutor<T, stencil_radius, TransFunc>;

/**
Expand All @@ -84,32 +74,14 @@ class StencilExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc>
: Parent(halo_value, trans_func),
input_grid(cl::sycl::buffer<T, 2>(cl::sycl::range<2>(0, 0))) {}

/**
* \brief Set the state of the grid.
*
* \param input_buffer A buffer containing the new state of the grid.
*/
void set_input(cl::sycl::buffer<T, 2> input_buffer) override {
this->input_grid = GridImpl(input_buffer);
}

/**
* \brief Copy the current state of the grid to the buffer.
*
* The `output_buffer` has to have the exact range as returned by \ref
* StencilExecutor.get_grid_range.
*
* \param output_buffer Copy the state of the grid to this buffer.
*/
void copy_output(cl::sycl::buffer<T, 2> output_buffer) override {
input_grid.copy_to(output_buffer);
}

/**
* \brief Return the range of the grid.
*
* \return The range of the grid.
*/
UID get_grid_range() const override { return input_grid.get_grid_range(); }

void run(uindex_t n_generations) override {
Expand Down
25 changes: 15 additions & 10 deletions StencilStream/tiling/ExecutionKernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ namespace tiling {
* \tparam TransFunc The type of transition function to use.
* \tparam T Cell value type.
* \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the
* central cell \tparam pipeline_length The number of pipeline stages to use. Similar to an unroll
* factor for a loop. \tparam output_tile_width The number of columns in a grid tile. \tparam
* output_tile_height The number of rows in a grid tile. \tparam in_pipe The pipe to read from.
* central cell
* \tparam pipeline_length The number of pipeline stages to use. Similar to an unroll
* factor for a loop.
* \tparam output_tile_width The number of columns in a grid tile.
* \tparam output_tile_height The number of rows in a grid tile.
* \tparam in_pipe The pipe to read from.
* \tparam out_pipe The pipe to write to.
*/
template <typename TransFunc, typename T, uindex_t stencil_radius, uindex_t pipeline_length,
Expand Down Expand Up @@ -80,13 +83,15 @@ class ExecutionKernel {
* \param trans_func The instance of the transition function to use.
* \param i_generation The generation index of the input cells.
* \param target_i_generation The number of generations to compute. If this number is bigger
* than `pipeline_length`, only `pipeline_length` generations will be computed. \param
* grid_c_offset The column offset of the processed tile relative to the grid's origin, not
* including the halo. For example, for the most north-western tile the offset will always be
* (0,0), not
* (-halo_radius,-halo_radius) \param grid_r_offset The row offset of the processed tile
* relative to the grid's origin. See `grid_c_offset` for details. \param grid_width The number
* of cell columns in the grid. \param grid_height The number of cell rows in the grid.
* than `pipeline_length`, only `pipeline_length` generations will be computed.
* \param grid_c_offset The column offset of the processed tile relative to the grid's origin,
* not including the halo. For example, for the most north-western tile the offset will always
* be (0,0), not (-halo_radius,-halo_radius)
* \param grid_r_offset The row offset of the processed tile relative to the grid's origin. See
* `grid_c_offset` for details.
* \param grid_width The number of cell columns in the grid.
* \param grid_height The number of cell rows in the grid.
* \param halo_value The value of cells in the grid halo.
*/
ExecutionKernel(TransFunc trans_func, uindex_t i_generation, uindex_t target_i_generation,
uindex_t grid_c_offset, uindex_t grid_r_offset, uindex_t grid_width,
Expand Down
3 changes: 2 additions & 1 deletion StencilStream/tiling/Grid.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ class Grid {
/**
* \brief Copy the contents of the grid to a given buffer.
*
* This buffer has to exactly have the size of the grid, otherwise a range error is thrown.
* This buffer has to exactly have the size of the grid, otherwise a `std::range_error` is
* thrown.
*
* \param out_buffer The buffer to copy the cells to.
* \throws std::range_error The buffer's size is not the same as the grid's size.
Expand Down
15 changes: 10 additions & 5 deletions StencilStream/tiling/IOKernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,22 @@ namespace tiling {
*
* \tparam T Cell value type.
* \tparam halo_height The radius (aka width and height) of the tile halo (and therefore both
* dimensions of a corner buffer). \tparam core_height The height of the core buffer. \tparam
* burst_length The number of elements that can be read or written in a burst. \tparam pipe The pipe
* to read or write to. \tparam n_halo_height_buffers The number of buffers to accept, in addition
* to \tparam access_mode The access mode to expect for the buffer accessor. \tparam access_target
* The access target to expect for the buffer accessor.
* dimensions of a corner buffer).
* \tparam core_height The height of the core buffer.
* \tparam burst_length The number of elements that can be read or written in a burst.
* \tparam pipe The pipe to read or write to.
* \tparam n_halo_height_buffers The number of buffers to accept, in addition to the core buffer.
* \tparam access_mode The access mode to expect for the buffer accessor.
* \tparam access_target The access target to expect for the buffer accessor.
*/
template <typename T, uindex_t halo_height, uindex_t core_height, uindex_t burst_length,
typename pipe, uindex_t n_halo_height_buffers, cl::sycl::access::mode access_mode,
cl::sycl::access::target access_target = cl::sycl::access::target::global_buffer>
class IOKernel {
public:
/**
* \brief The exact accessor type required by the IO kernel.
*/
using Accessor = cl::sycl::accessor<T, 2, access_mode, access_target>;

/**
Expand Down

0 comments on commit 9c6a86e

Please sign in to comment.