From 9c6a86ed7784a999881346eba713f532b22d59b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Oliver=20Opdenh=C3=B6vel?= Date: Thu, 2 Sep 2021 15:04:42 +0200 Subject: [PATCH] Completing the API documentation --- StencilStream/MonotileExecutor.hpp | 34 +++++++++- StencilStream/RuntimeSample.hpp | 37 ++++++++--- StencilStream/StencilExecutor.hpp | 82 ++++++++---------------- StencilStream/tiling/ExecutionKernel.hpp | 25 +++++--- StencilStream/tiling/Grid.hpp | 3 +- StencilStream/tiling/IOKernel.hpp | 15 +++-- 6 files changed, 113 insertions(+), 83 deletions(-) diff --git a/StencilStream/MonotileExecutor.hpp b/StencilStream/MonotileExecutor.hpp index 7b5cbbf..0e79833 100644 --- a/StencilStream/MonotileExecutor.hpp +++ b/StencilStream/MonotileExecutor.hpp @@ -30,20 +30,50 @@ template { public: - static constexpr uindex_t burst_length = std::min(1, burst_size / sizeof(T)); - static constexpr uindex_t halo_radius = stencil_radius * pipeline_length; + /** + * \brief Shorthand for the parent class. + */ using Parent = SingleQueueExecutor; + /** + * \brief Create a new executor. + * + * \param halo_value The value of cells in the grid halo. + * \param trans_func An instance of the transition function type. + */ MonotileExecutor(T halo_value, TransFunc trans_func) : Parent(halo_value, trans_func), tile_buffer(cl::sycl::range<2>(tile_width, tile_height)) { auto ac = tile_buffer.template get_access(); ac[0][0] = halo_value; } + /** + * \brief Set the internal state of the grid. + * + * This will copy the contents of the buffer to an internal representation. The buffer may be + * used for other purposes later. It must not reset the generation index. The range of the input + * buffer will be used as the new grid range. + * + * \throws std::range_error Thrown if the number of width or height of the buffer exceeds the + * set width and height of the tile. \param input_buffer The source buffer of the new grid + * state. + */ void set_input(cl::sycl::buffer input_buffer) override { if (input_buffer.get_range()[0] > tile_width && input_buffer.get_range()[1] > tile_height) { throw std::range_error("The grid is bigger than the tile. The monotile architecture " diff --git a/StencilStream/RuntimeSample.hpp b/StencilStream/RuntimeSample.hpp index 32250b1..b806269 100644 --- a/StencilStream/RuntimeSample.hpp +++ b/StencilStream/RuntimeSample.hpp @@ -24,11 +24,7 @@ namespace stencil { /** - * \brief Collection of SYCL events from a \ref StencilExecutor.run invocation. - * - * It's interface might be extended in the future to support more in-depth analysis. Now, it can - * analyse the runtime of a single execution kernel invocation as well as the total runtime of all - * passes. + * \brief Collection of runtime performance information for a \ref SingleQueueExecutor. */ class RuntimeSample { public: @@ -37,34 +33,55 @@ class RuntimeSample { */ RuntimeSample() : makespan(0.0), n_passes(0.0) {} + /** + * \brief Retrieve the starting timestamp of the event, in seconds. + * + * This will fail with an exception if the event's queue has not been configured to collect + * runtime information. + */ static double start_of_event(cl::sycl::event event) { return double(event.get_profiling_info()) / timesteps_per_second; } + /** + * \brief Retrieve the ending timestamp of the event, in seconds. + * + * This will fail with an exception if the event's queue has not been configured to collect + * runtime information. + */ static double end_of_event(cl::sycl::event event) { return double(event.get_profiling_info()) / timesteps_per_second; } + /** + * \brief Retrieve the runtime/makespan of the event, in seconds. + * + * This will fail with an exception if the event's queue has not been configured to collect + * runtime information. + */ static double runtime_of_event(cl::sycl::event event) { return end_of_event(event) - start_of_event(event); } + /** + * \brief Add the runtime/makespan of a single pass over the grid to the database. + */ void add_pass(double pass_runtime) { makespan += pass_runtime; n_passes += 1.0; } + /** + * \brief Add the event of a single pass over the grid to the database. + */ void add_pass(cl::sycl::event event) { add_pass(runtime_of_event(event)); } /** - * \brief Calculate the total runtime of all passes. - * - * This iterates through all known events, finds the earliest start and latest end and returns - * the time difference between them. + * \brief Get the makespan of all grid passes. * - * \return The total wall time in seconds it took to execute all passes. + * \return The makespan in seconds it took to execute all passes. */ double get_total_runtime() { return makespan; } diff --git a/StencilStream/StencilExecutor.hpp b/StencilStream/StencilExecutor.hpp index a26ec6f..f950492 100644 --- a/StencilStream/StencilExecutor.hpp +++ b/StencilStream/StencilExecutor.hpp @@ -24,54 +24,44 @@ namespace stencil { /** - * \brief Execution coordinator. + * \brief The default stencil executor. * - * The `StencilExecutor` binds the different parts of StencilStream together and provides a unified - * interface for applications. Users create a stencil executor, configure it and then use it to run - * the payload computations. It has multiple logical attributes that can be configured: + * Unlike \ref MonotileExecutor, this executors supports any grid range by tiling the grid as + * described in \ref tiling, at the cost of complexer IO and a computational overhead for every + * tile. * - * ### Grid + * This executor is called `StencilExecutor` since was the first one in StencilStream 2.x. A better + * name for it would be `TilingExecutor`. * - * The grid is the logical array of cells, set with \ref StencilExecutor.set_input A stencil - * executor does not work in place and a buffer used to initialize the grid can be used for other - * tasks afterwards. The \ref StencilExecutor.run method alters the state of the grid and the grid - * can be copied back to a given buffer using \ref StencilExecutor.copy_output. - * - * ### SYCL queue - * - * The queue provides the OpenCL platform, device, context and queue to execute the kernels. A - * stencil executor does not have a queue when constructed and \ref StencilExecutor.run tries to - * select the FPGA emulation device if no queue has been configured yet. \ref - * StencilExecutor.set_queue is used to directly set the queue, but \ref - * StencilExecutor.select_emulator and \ref StencilExecutor.select_fpga can be used to automatically - * configure the FPGA emulator or the FPGA, respectively. - * - * ### Transition Function - * - * A stencil executor stores an instance of the transition function since it may require some - * configuration and runtime-dynamic parameters too. An instance is required for the initialization, - * but it may be replaced at any time with \ref StencilExecutor.set_trans_func. - * - * ### Generation Index - * - * This is the generation index of the current state of the grid. \ref StencilExecutor.run updates - * and therefore, it can be ignored in most instances. However, it can be reset if a transition - * function needs it. - * - * \tparam T Cell value type. - * \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the - * central cell. Must be at least 1. \tparam TransFunc An invocable type that maps a \ref Stencil to - * the next generation of the stencil's central cell. \tparam pipeline_length The number of hardware - * execution stages. Must be at least 1. Defaults to 1. \tparam tile_width The number of columns in - * a tile. Defaults to 1024. \tparam tile_height The number of rows in a tile. Defaults to 1024. + * \tparam T The cell type. + * \tparam stencil_radius The radius of the stencil buffer supplied to the transition function. + * \tparam TransFunc The type of the transition function. + * \tparam pipeline_length The number of hardware execution stages per kernel. Must be at least 1. + * Defaults to 1. + * \tparam tile_width The number of columns in a tile and maximum number of columns in a grid. + * Defaults to 1024. + * \tparam tile_height The number of rows in a tile and maximum number of rows in a grid. Defaults + * to 1024. * \tparam burst_size The number of bytes to load/store in one burst. Defaults to 1024. */ template class StencilExecutor : public SingleQueueExecutor { public: + /** + * \brief The number of cells that can be transfered in a single burst. + */ static constexpr uindex_t burst_length = std::min(1, burst_size / sizeof(T)); + + /** + * \brief The number of cells that have be added to the tile in every direction to form the + * complete input. + */ static constexpr uindex_t halo_radius = stencil_radius * pipeline_length; + + /** + * \brief Shorthand for the parent class. + */ using Parent = SingleQueueExecutor; /** @@ -84,32 +74,14 @@ class StencilExecutor : public SingleQueueExecutor : Parent(halo_value, trans_func), input_grid(cl::sycl::buffer(cl::sycl::range<2>(0, 0))) {} - /** - * \brief Set the state of the grid. - * - * \param input_buffer A buffer containing the new state of the grid. - */ void set_input(cl::sycl::buffer input_buffer) override { this->input_grid = GridImpl(input_buffer); } - /** - * \brief Copy the current state of the grid to the buffer. - * - * The `output_buffer` has to have the exact range as returned by \ref - * StencilExecutor.get_grid_range. - * - * \param output_buffer Copy the state of the grid to this buffer. - */ void copy_output(cl::sycl::buffer output_buffer) override { input_grid.copy_to(output_buffer); } - /** - * \brief Return the range of the grid. - * - * \return The range of the grid. - */ UID get_grid_range() const override { return input_grid.get_grid_range(); } void run(uindex_t n_generations) override { diff --git a/StencilStream/tiling/ExecutionKernel.hpp b/StencilStream/tiling/ExecutionKernel.hpp index e05fb01..403592e 100644 --- a/StencilStream/tiling/ExecutionKernel.hpp +++ b/StencilStream/tiling/ExecutionKernel.hpp @@ -36,9 +36,12 @@ namespace tiling { * \tparam TransFunc The type of transition function to use. * \tparam T Cell value type. * \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the - * central cell \tparam pipeline_length The number of pipeline stages to use. Similar to an unroll - * factor for a loop. \tparam output_tile_width The number of columns in a grid tile. \tparam - * output_tile_height The number of rows in a grid tile. \tparam in_pipe The pipe to read from. + * central cell + * \tparam pipeline_length The number of pipeline stages to use. Similar to an unroll + * factor for a loop. + * \tparam output_tile_width The number of columns in a grid tile. + * \tparam output_tile_height The number of rows in a grid tile. + * \tparam in_pipe The pipe to read from. * \tparam out_pipe The pipe to write to. */ template class IOKernel { public: + /** + * \brief The exact accessor type required by the IO kernel. + */ using Accessor = cl::sycl::accessor; /**