diff --git a/.vscode/settings.json b/.vscode/settings.json
index 553995a..1e273b7 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -76,15 +76,6 @@
         "codecvt": "cpp",
         "shared_mutex": "cpp"
     },
-    "C_Cpp.clang_format_style": "{ BasedOnStyle: LLVM, UseTab: Never, IndentWidth: 4, TabWidth: 4, BreakBeforeBraces: Allman, AllowShortIfStatementsOnASingleLine: false, IndentCaseLabels: false, ColumnLimit: 0, AccessModifierOffset: -4 }",
+    "C_Cpp.clang_format_style": "{ BasedOnStyle: LLVM, IndentWidth: 4, TabWidth: 4, ColumnLimit: 100, IndentPPDirectives: BeforeHash }",
     "python.pythonPath": "/usr/bin/python3",
-    "spellright.language": [
-        "English (American)"
-    ],
-    "spellright.documentTypes": [
-        "markdown",
-        "latex",
-        "plaintext",
-        "cpp"
-    ]
 }
\ No newline at end of file
diff --git a/Doxyfile b/Doxyfile
index d7fdba1..5452467 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = "StencilStream"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = v2.0.0
+PROJECT_NUMBER         = v2.1.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -829,7 +829,7 @@ WARN_LOGFILE           =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = StencilStream README.md docs
+INPUT                  = StencilStream StencilStream/monotile StencilStream/tiling README.md docs
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/README.md b/README.md
index e4227c7..4ce38c5 100644
--- a/README.md
+++ b/README.md
@@ -45,26 +45,22 @@ Next are some important definitions: The cell type, the value of cells in the gr
 This is everything we need to define the transition function, so let's do it now:
 
 ``` C++
-auto conway = [](stencil::Stencil<Cell, stencil_radius> const &stencil, stencil::StencilInfo const &info)
-{
+auto conway = [](stencil::Stencil<Cell, stencil_radius> const &stencil) {
 ``` 
 
 As you can see, a transition function is just an invocable object. In this case, we have chosen a lambda expression, but more complicated applications may define their transition function as a class with an `operator()` method.
 
-The first argument is the stencil buffer itself and the second argument is a struct with useful information about the current invocation, like the coordinates of the central cell of the stencil buffer. This is the cell we are going to replace.
+The only argument is the stencil buffer itself. It also contains useful information about the current invocation, like the coordinates of the central cell of the stencil buffer. This is the cell we are going to replace.
 
 ``` C++
-    stencil::ID idx = info.center_cell_id;
+    stencil::ID idx = stencil.id;
 
     uint8_t alive_neighbours = 0;
 #pragma unroll
-    for (stencil::index_t c = -stencil_radius; c <= stencil::index_t(stencil_radius); c++)
-    {
+    for (stencil::index_t c = -stencil_radius; c <= stencil::index_t(stencil_radius); c++) {
 #pragma unroll
-        for (stencil::index_t r = -stencil_radius; r <= stencil::index_t(stencil_radius); r++)
-        {
-            if (stencil[stencil::ID(c, r)] && !(c == 0 && r == 0))
-            {
+        for (stencil::index_t r = -stencil_radius; r <= stencil::index_t(stencil_radius); r++) {
+            if (stencil[stencil::ID(c, r)] && !(c == 0 && r == 0)) {
                 alive_neighbours += 1;
             }
         }
@@ -74,12 +70,9 @@ The first argument is the stencil buffer itself and the second argument is a str
 First, we count the living neighbors since their numbers decides the fate of our cell. The `for`-loops for that are completely unrolled, which means that these evaluations will be carried out in parallel.
 
 ``` C++
-    if (stencil[stencil::ID(0, 0)])
-    {
+    if (stencil[stencil::ID(0, 0)]) {
         return alive_neighbours == 2 || alive_neighbours == 3;
-    }
-    else
-    {
+    } else {
         return alive_neighbours == 3;
     }
 };
@@ -88,15 +81,12 @@ First, we count the living neighbors since their numbers decides the fate of our
 Now we know how many of our neighbors are alive and can therefore return the new cell value according to [the rules of the game](https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life#Rules).
 
 ``` C++
-cl::sycl::buffer<Cell, 2> read(stencil::uindex_t width, stencil::uindex_t height)
-{
+cl::sycl::buffer<Cell, 2> read(stencil::uindex_t width, stencil::uindex_t height) {
     cl::sycl::buffer<Cell, 2> input_buffer(cl::sycl::range<2>(width, height));
     auto buffer_ac = input_buffer.get_access<cl::sycl::access::mode::write>();
 
-    for (stencil::uindex_t r = 0; r < height; r++)
-    {
-        for (stencil::uindex_t c = 0; c < width; c++)
-        {
+    for (stencil::uindex_t r = 0; r < height; r++) {
+        for (stencil::uindex_t c = 0; c < width; c++) {
             char Cell;
             std::cin >> Cell;
             assert(Cell == 'X' || Cell == '.');
@@ -107,23 +97,17 @@ cl::sycl::buffer<Cell, 2> read(stencil::uindex_t width, stencil::uindex_t height
     return input_buffer;
 }
 
-void write(cl::sycl::buffer<Cell, 2> output_buffer)
-{
+void write(cl::sycl::buffer<Cell, 2> output_buffer) {
     auto buffer_ac = output_buffer.get_access<cl::sycl::access::mode::read>();
 
     stencil::uindex_t width = output_buffer.get_range()[0];
     stencil::uindex_t height = output_buffer.get_range()[1];
 
-    for (stencil::uindex_t r = 0; r < height; r++)
-    {
-        for (stencil::uindex_t c = 0; c < width; c++)
-        {
-            if (buffer_ac[c][r])
-            {
+    for (stencil::uindex_t r = 0; r < height; r++) {
+        for (stencil::uindex_t c = 0; c < width; c++) {
+            if (buffer_ac[c][r]) {
                 std::cout << "X";
-            }
-            else
-            {
+            } else {
                 std::cout << ".";
             }
         }
@@ -132,15 +116,13 @@ void write(cl::sycl::buffer<Cell, 2> output_buffer)
 }
 ```
 
-The next part is some boilerplate code to read the input from stdin and write the output to stdout. Nothing to spectacular.
+The next part is some boilerplate code to read the input from stdin and write the output to stdout. Nothing too spectacular.
 
 The only thing left is to run the calculations. We do this like this:
 
 ``` C++
-int main(int argc, char **argv)
-{
-    if (argc != 4)
-    {
+int main(int argc, char **argv) {
+    if (argc != 4) {
         std::cerr << "Usage: " << argv[0] << " <width> <height> <n_generations>" << std::endl;
         return 1;
     }
@@ -156,7 +138,7 @@ int main(int argc, char **argv)
     executor.set_input(grid_buffer);
 ```
 
-After checking and parsing the arguments, we read the input data and initialize the executor. This is the central API facade to control the calculations. In it's simplest form, it only requires cell type, the radius of the stencil and the type of the transition function as template arguments. It has more template arguments, but these are performance parameters. We are looking into them later. The actual constructor arguments are only the initial data, the halo value and an instance of the transition function.
+After checking and parsing the arguments, we read the input data. Then, we pick and initialize an executor. Executors are the user-facing facades of StencilStream and the library offers different executors that are optimized for different scenarios. In this case, we pick the `StencilExecutor`, which is the most universal executor. The static operation and performance parameters are defined as template parameters to an executor. In it's simplest form, it only requires our cell type, the radius of the stencil and the type of the transition function. 
 
 ``` C++
 #ifdef HARDWARE
@@ -199,15 +181,17 @@ ifdef AOCL_BOARD_PACKAGE_ROOT
 endif
 
 EMU_ARGS = $(ARGS)
-HW_ARGS = $(ARGS) -DHARDWARE -Xshardware 
+HW_ARGS = $(ARGS) -DHARDWARE -Xshardware
+
+RESOURCES = conway.cpp $(wildcard StencilStream/*) Makefile
 
-conway_emu: conway.cpp Makefile
+conway_emu: $(RESOURCES)
 	$(CC) $(EMU_ARGS) conway.cpp -o conway_emu
 
-conway_hw: conway.cpp Makefile
+conway_hw: $(RESOURCES)
 	$(CC) $(HW_ARGS) conway.cpp -o conway_hw
 
-conway_hw.report.tar.gz: conway.cpp Makefile
+conway_hw.report.tar.gz: $(RESOURCES)
 	rm -f conway_hw
 	$(CC) $(HW_ARGS) -fsycl-link conway.cpp -o conway_hw
 	tar -caf conway_hw.report.tar.gz conway_hw.prj/reports
diff --git a/StencilStream/AbstractExecutor.hpp b/StencilStream/AbstractExecutor.hpp
new file mode 100644
index 0000000..d1b5d87
--- /dev/null
+++ b/StencilStream/AbstractExecutor.hpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+#include "GenericID.hpp"
+#include "Index.hpp"
+#include <CL/sycl.hpp>
+
+namespace stencil {
+/**
+ * \brief Base class for all execution managers.
+ *
+ * Executors are the user-facing facades of StencilStream that orchestrate the computations.
+ * Different executors may use different architectures and strategies to apply transition functions
+ * to cells. Application code that may work with any executor can use this base class to access
+ * them. It has multiple logical attributes that can be configured:
+ *
+ * ### Grid
+ *
+ * The grid is the logical array of cells, set with \ref AbstractExecutor.set_input. A stencil
+ * executor does not work in place and a buffer used to initialize the grid can be used for other
+ * tasks afterwards. The \ref AbstractExecutor.run method alters the state of the grid and the grid
+ * can be copied back to a given buffer using \ref AbstractExecutor.copy_output.
+ *
+ * ### Transition Function
+ *
+ * A stencil executor stores an instance of the transition function since it may require some
+ * configuration and runtime-dynamic parameters too. An instance is required for the initialization,
+ * but it may be replaced at any time with \ref AbstractExecutor.set_trans_func.
+ *
+ * ### Generation Index
+ *
+ * This is the generation index of the current state of the grid. \ref AbstractExecutor.run updates
+ * and therefore, it can be ignored in most instances. However, it can be reset if a transition
+ * function needs it.
+ *
+ * \tparam T The cell type.
+ * \tparam stencil_radius The radius of the stencil buffer supplied to the transition function.
+ * \tparam TransFunc The type of the transition function.
+ */
+template <typename T, uindex_t stencil_radius, typename TransFunc> class AbstractExecutor {
+  public:
+    /**
+     * \brief Create a new abstract executor.
+     * \param halo_value The value of cells that are outside the grid.
+     * \param trans_func The instance of the transition function that should be used to calculate
+     * new generations.
+     */
+    AbstractExecutor(T halo_value, TransFunc trans_func)
+        : halo_value(halo_value), trans_func(trans_func), i_generation(0) {}
+
+    /**
+     * \brief Compute the next generations of the grid and store it internally.
+     *
+     * This will use the transition function to compute the next `n_generations` generations of the
+     * grid and store the new state of the grid internally. The resulting grid state can be
+     * retrieved with \ref AbstractExecutor.copy_output.
+     *
+     * \param n_generations The number of generations to calculate.
+     */
+    virtual void run(uindex_t n_generations) = 0;
+
+    /**
+     * \brief Set the internal state of the grid.
+     *
+     * This will copy the contents of the buffer to an internal representation. The buffer may be
+     * used for other purposes later. It must not reset the generation index. The range of the input
+     * buffer will be used as the new grid range.
+     *
+     * \param input_buffer The source buffer of the new grid state.
+     */
+    virtual void set_input(cl::sycl::buffer<T, 2> input_buffer) = 0;
+
+    /**
+     * \brief Copy the state of the grid to a buffer.
+     *
+     * This will copy the cells of the internal grid representation to the buffer. The range of the
+     * output buffer must be equal to the grid range (retrievable with \ref
+     * AbstractExecutor.get_grid_range).
+     *
+     * \param output_buffer The target buffer.
+     */
+    virtual void copy_output(cl::sycl::buffer<T, 2> output_buffer) = 0;
+
+    /**
+     * \brief Get the range of the internal grid.
+     */
+    virtual UID get_grid_range() const = 0;
+
+    /**
+     * \brief Get the value of cells outside of the grid.
+     */
+    T const get_halo_value() const { return halo_value; }
+
+    /**
+     * \brief Set the value of cells outside of the grid.
+     */
+    void set_halo_value(T halo_value) { this->halo_value = halo_value; }
+
+    /**
+     * \brief Get the configured transition function instance.
+     */
+    TransFunc get_trans_func() const { return trans_func; }
+
+    /**
+     * \brief Set the transition function instance.
+     */
+    void set_trans_func(TransFunc trans_func) { this->trans_func = trans_func; }
+
+    /**
+     * \brief Get the generation index of the grid.
+     */
+    uindex_t get_i_generation() const { return i_generation; }
+
+    /**
+     * \brief Set the generation index of the grid.
+     */
+    void set_i_generation(uindex_t i_generation) { this->i_generation = i_generation; }
+
+    /**
+     * \brief Increase the generation index of the grid by a certain delta.
+     */
+    void inc_i_generation(index_t delta) { this->i_generation += delta; }
+
+  private:
+    T halo_value;
+    TransFunc trans_func;
+    uindex_t i_generation;
+};
+} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/CounterID.hpp b/StencilStream/CounterID.hpp
index 7118c0c..8900069 100644
--- a/StencilStream/CounterID.hpp
+++ b/StencilStream/CounterID.hpp
@@ -1,67 +1,71 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include "GenericID.hpp"
 #include "Index.hpp"
 
-namespace stencil
-{
+namespace stencil {
 /**
  * \brief A subclass of the `GenericID<T>` with two-dimensional index counting.
- * 
- * It additionally requires bounds for the column and row counters and provides increment and decrement
- * operators. The row counter is always the fastest: For example, the increment operators first increase
- * the row counter and when it reaches the row bound, it is reset to zero and the column counter is
- * increased. When the column counter reaches it's bound, it is also reset to zero. One can therefore
- * use this counter to safely iterate over a two-dimensional grid even with multiple passes.
+ *
+ * It additionally requires bounds for the column and row counters and provides increment and
+ * decrement operators. The row counter is always the fastest: For example, the increment operators
+ * first increase the row counter and when it reaches the row bound, it is reset to zero and the
+ * column counter is increased. When the column counter reaches it's bound, it is also reset to
+ * zero. One can therefore use this counter to safely iterate over a two-dimensional grid even with
+ * multiple passes.
  */
-template <typename T>
-class CounterID : public GenericID<T>
-{
-public:
+template <typename T> class CounterID : public GenericID<T> {
+  public:
     /**
      * \brief Create a new counter with uninitialized column and row counters.
-     * 
+     *
      * The column and row bounds may not be uninitialized and are therefore required.
      */
-    CounterID(T column_bound, T row_bound) : GenericID<T>(), c_bound(column_bound), r_bound(row_bound) {}
+    CounterID(T column_bound, T row_bound)
+        : GenericID<T>(), c_bound(column_bound), r_bound(row_bound) {}
 
     /**
      * \brief Create a new counter with the given column/row counters and bounds.
      */
-    CounterID(T column, T row, T column_bound, T row_bound) : GenericID<T>(column, row), c_bound(column_bound), r_bound(row_bound) {}
+    CounterID(T column, T row, T column_bound, T row_bound)
+        : GenericID<T>(column, row), c_bound(column_bound), r_bound(row_bound) {}
 
     /**
      * \brief Create a new counter from the given SYCL id and bounds.
      */
-    CounterID(cl::sycl::id<2> sycl_id, T column_bound, T row_bound) : GenericID<T>(sycl_id), c_bound(column_bound), r_bound(row_bound) {}
+    CounterID(cl::sycl::id<2> sycl_id, T column_bound, T row_bound)
+        : GenericID<T>(sycl_id), c_bound(column_bound), r_bound(row_bound) {}
 
     /**
      * \brief Increase the counters.
      */
-    CounterID &operator++()
-    {
-        if (this->r == r_bound - 1)
-        {
+    CounterID &operator++() {
+        if (this->r == r_bound - 1) {
             this->r = 0;
-            if (this->c == c_bound - 1)
-            {
+            if (this->c == c_bound - 1) {
                 this->c = 0;
-            }
-            else
-            {
+            } else {
                 this->c++;
             }
-        }
-        else
-        {
+        } else {
             this->r++;
         }
         return *this;
@@ -70,22 +74,15 @@ class CounterID : public GenericID<T>
     /**
      * \brief Decrease the counters.
      */
-    CounterID &operator--()
-    {
-        if (this->r == 0)
-        {
+    CounterID &operator--() {
+        if (this->r == 0) {
             this->r = r_bound - 1;
-            if (this->c == 0)
-            {
+            if (this->c == 0) {
                 this->c = c_bound - 1;
-            }
-            else
-            {
+            } else {
                 this->c--;
             }
-        }
-        else
-        {
+        } else {
             this->r--;
         }
         return *this;
@@ -94,8 +91,7 @@ class CounterID : public GenericID<T>
     /**
      * \brief Increase the counters.
      */
-    CounterID operator++(int)
-    {
+    CounterID operator++(int) {
         CounterID copy = *this;
         this->operator++();
         return copy;
@@ -104,14 +100,13 @@ class CounterID : public GenericID<T>
     /**
      * \brief Decrease the counters.
      */
-    CounterID operator--(int)
-    {
+    CounterID operator--(int) {
         CounterID copy = *this;
         this->operator--();
         return copy;
     }
 
-private:
+  private:
     T c_bound;
     T r_bound;
 };
diff --git a/StencilStream/ExecutionKernel.hpp b/StencilStream/ExecutionKernel.hpp
deleted file mode 100644
index e46ec6e..0000000
--- a/StencilStream/ExecutionKernel.hpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#pragma once
-#include "GenericID.hpp"
-#include "Index.hpp"
-#include "Stencil.hpp"
-#include <optional>
-
-namespace stencil
-{
-
-/**
- * \brief A kernel that executes a stencil transition function on a tile.
- * 
- * It receives the contents of a tile and it's halo from the `in_pipe`, applies the transition function when applicable and writes the result to the `out_pipe`.
- * 
- * \tparam TransFunc The type of transition function to use.
- * \tparam T Cell value type.
- * \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the central cell
- * \tparam pipeline_length The number of pipeline stages to use. Similar to an unroll factor for a loop.
- * \tparam output_tile_width The number of columns in a grid tile.
- * \tparam output_tile_height The number of rows in a grid tile.
- * \tparam in_pipe The pipe to read from.
- * \tparam out_pipe The pipe to write to.
- */
-template <typename TransFunc, typename T, uindex_t stencil_radius, uindex_t pipeline_length, uindex_t output_tile_width, uindex_t output_tile_height, typename in_pipe, typename out_pipe>
-class ExecutionKernel
-{
-public:
-    static_assert(
-        std::is_invocable_r<T, TransFunc const, Stencil<T, stencil_radius> const &>::
-            value);
-    static_assert(stencil_radius >= 1);
-
-    /**
-     * \brief The width and height of the stencil buffer.
-     */
-    const static uindex_t stencil_diameter = Stencil<T, stencil_radius>::diameter;
-
-    /**
-     * \brief The total number of cells to read from the `in_pipe`.
-     */
-    const static uindex_t n_input_cells = (2 * stencil_radius * pipeline_length + output_tile_width) * (2 * stencil_radius * pipeline_length + output_tile_height);
-
-    /**
-     * \brief The width of the processed tile with the tile halo attached.
-     */
-    const static uindex_t input_tile_width = 2 * stencil_radius * pipeline_length + output_tile_width;
-
-    /**
-     * \brief The height of the processed tile with the tile halo attached.
-     */
-    const static uindex_t input_tile_height = 2 * stencil_radius * pipeline_length + output_tile_height;
-
-    /**
-     * \brief Create and configure the execution kernel.
-     * 
-     * \param trans_func The instance of the transition function to use.
-     * \param i_generation The generation index of the input cells.
-     * \param n_generations The number of generations to compute. If this number is bigger than `pipeline_length`, only `pipeline_length` generations will be computed.
-     * \param grid_c_offset The column offset of the processed tile relative to the grid's origin, not including the halo. For example, for the most north-western tile the offset will always be (0,0), not (-halo_radius,-halo_radius)
-     * \param grid_r_offset The row offset of the processed tile relative to the grid's origin. See `grid_c_offset` for details.
-     * \param grid_width The number of cell columns in the grid.
-     * \param grid_height The number of cell rows in the grid.
-     */
-    ExecutionKernel(
-        TransFunc trans_func,
-        uindex_t i_generation,
-        uindex_t n_generations,
-        uindex_t grid_c_offset,
-        uindex_t grid_r_offset,
-        uindex_t grid_width,
-        uindex_t grid_height,
-        T halo_value) : trans_func(trans_func),
-                        i_generation(i_generation),
-                        n_generations(n_generations),
-                        grid_c_offset(grid_c_offset),
-                        grid_r_offset(grid_r_offset),
-                        grid_width(grid_width),
-                        grid_height(grid_height),
-                        halo_value(halo_value)
-    {
-    }
-
-    /**
-     * \brief Execute the configured operations.
-     */
-    void operator()() const
-    {
-        uindex_t input_tile_c = 0;
-        uindex_t input_tile_r = 0;
-
-        [[intel::fpga_memory, intel::numbanks(2 * pipeline_length)]] T cache[2][input_tile_height][pipeline_length][stencil_diameter - 1];
-        uindex_t active_cache = 0;
-        [[intel::fpga_register]] T stencil_buffer[pipeline_length][stencil_diameter][stencil_diameter];
-
-        for (uindex_t i = 0; i < n_input_cells; i++)
-        {
-            T value = in_pipe::read();
-
-#pragma unroll
-            for (uindex_t stage = 0; stage < pipeline_length; stage++)
-            {
-                /*
-                 * Shift up every value in the stencil_buffer.
-                 * This operation does not touch the values in the bottom row, which will be filled
-                 * from the cache and the new input value later.
-                 */
-#pragma unroll
-                for (uindex_t r = 0; r < stencil_diameter - 1; r++)
-                {
-#pragma unroll
-                    for (uindex_t c = 0; c < stencil_diameter; c++)
-                    {
-                        stencil_buffer[stage][c][r] = stencil_buffer[stage][c][r + 1];
-                    }
-                }
-
-                index_t input_grid_c = grid_c_offset + index_t(input_tile_c) - (stencil_diameter - 1) - (pipeline_length + stage - 2) * stencil_radius;
-                index_t input_grid_r = grid_r_offset + index_t(input_tile_r) - (stencil_diameter - 1) - (pipeline_length + stage - 2) * stencil_radius;
-
-                // Update the stencil buffer and cache with previous cache contents and the new input cell.
-#pragma unroll
-                for (uindex_t cache_c = 0; cache_c < stencil_diameter; cache_c++)
-                {
-                    T new_value;
-                    if (cache_c == stencil_diameter - 1)
-                    {
-                        if (input_grid_c < 0 || input_grid_r < 0 || input_grid_c >= grid_width || input_grid_r >= grid_height)
-                        {
-                            new_value = halo_value;
-                        }
-                        else
-                        {
-                            new_value = value;
-                        }
-                    }
-                    else
-                    {
-                        new_value = cache[active_cache][input_tile_r][stage][cache_c];
-                    }
-
-                    stencil_buffer[stage][cache_c][stencil_diameter - 1] = new_value;
-                    if (cache_c > 0)
-                    {
-                        cache[active_cache == 0 ? 1 : 0][input_tile_r][stage][cache_c - 1] = new_value;
-                    }
-                }
-
-                index_t output_grid_c = input_grid_c - stencil_radius;
-                index_t output_grid_r = input_grid_r - stencil_radius;
-                Stencil<T, stencil_radius> stencil(
-                    ID(output_grid_c, output_grid_r),
-                    i_generation + stage,
-                    stage,
-                    stencil_buffer[stage],
-                    UID(grid_width, grid_height));
-
-                if (i_generation + stage < n_generations)
-                {
-                    value = trans_func(stencil);
-                }
-                else
-                {
-                    value = stencil_buffer[stage][stencil_radius][stencil_radius];
-                }
-            }
-
-            bool is_valid_output = input_tile_c >= (stencil_diameter - 1) * pipeline_length;
-            is_valid_output &= input_tile_r >= (stencil_diameter - 1) * pipeline_length;
-
-            if (is_valid_output)
-            {
-                out_pipe::write(value);
-            }
-
-            if (input_tile_r == input_tile_height - 1)
-            {
-                active_cache = active_cache == 0 ? 1 : 0;
-                input_tile_r = 0;
-                if (input_tile_c == input_tile_width - 1)
-                {
-                    input_tile_c = 0;
-                }
-                else
-                {
-                    input_tile_c++;
-                }
-            }
-            else
-            {
-                input_tile_r++;
-            }
-        }
-    }
-
-private:
-    TransFunc trans_func;
-    uindex_t i_generation;
-    uindex_t n_generations;
-    uindex_t grid_c_offset;
-    uindex_t grid_r_offset;
-    uindex_t grid_width;
-    uindex_t grid_height;
-    T halo_value;
-};
-
-} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/GenericID.hpp b/StencilStream/GenericID.hpp
index dd7cc23..ab038bf 100644
--- a/StencilStream/GenericID.hpp
+++ b/StencilStream/GenericID.hpp
@@ -1,28 +1,36 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include "Index.hpp"
 #include <CL/sycl/id.hpp>
 
-namespace stencil
-{
+namespace stencil {
 
 /**
  * \brief A generic, two-dimensional index.
- * 
- * \tparam The index type. It can be anything as long as it can be constructed from a dimension of `cl::sycl::id` and tested for equality.
+ *
+ * \tparam The index type. It can be anything as long as it can be constructed from a dimension of
+ * `cl::sycl::id` and tested for equality.
  */
-template <typename T>
-class GenericID
-{
-public:
+template <typename T> class GenericID {
+  public:
     /**
      * \brief Create a new index with undefined contents.
      */
@@ -46,12 +54,11 @@ class GenericID
     /**
      * \brief Test if the other generic ID has equivalent coordinates to this ID.
      */
-    bool operator==(GenericID const &other) const
-    {
+    bool operator==(GenericID const &other) const {
         return this->c == other.c && this->r == other.r;
     }
 
-    /** 
+    /**
      * \brief The column index.
      */
     T c;
diff --git a/StencilStream/Helpers.hpp b/StencilStream/Helpers.hpp
index ceeaf52..01036ef 100644
--- a/StencilStream/Helpers.hpp
+++ b/StencilStream/Helpers.hpp
@@ -1,41 +1,55 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include "Index.hpp"
 #include <CL/sycl/access/access.hpp>
 #include <CL/sycl/id.hpp>
 
-namespace stencil
-{
-inline cl::sycl::range<2> burst_partitioned_range(uindex_t width, uindex_t height, uindex_t burst_length)
-{
+namespace stencil {
+inline cl::sycl::range<2> burst_partitioned_range(uindex_t width, uindex_t height,
+                                                  uindex_t burst_length) {
     uindex_t nCells = width * height;
     cl::sycl::range<2> range(nCells / burst_length, burst_length);
-    if (nCells % burst_length != 0)
-    {
+    if (nCells % burst_length != 0) {
         range[0] += 1;
     }
     return range;
 }
 
-inline constexpr bool is_mode_readable(cl::sycl::access::mode access_mode)
-{
+inline constexpr bool is_mode_readable(cl::sycl::access::mode access_mode) {
     return access_mode == cl::sycl::access::mode::read ||
            access_mode == cl::sycl::access::mode::read_write;
 }
 
-inline constexpr bool is_mode_writable(cl::sycl::access::mode access_mode)
-{
+inline constexpr bool is_mode_writable(cl::sycl::access::mode access_mode) {
     return access_mode == cl::sycl::access::mode::write ||
            access_mode == cl::sycl::access::mode::read_write ||
            access_mode == cl::sycl::access::mode::discard_write ||
            access_mode == cl::sycl::access::mode::discard_read_write;
 }
+
+inline constexpr uindex_t next_power_of_two(uindex_t value) {
+    uindex_t next_power_of_two = 1;
+    while (next_power_of_two < value) {
+        next_power_of_two = next_power_of_two << 1;
+    }
+    return next_power_of_two;
+}
 } // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/IOKernel.hpp b/StencilStream/IOKernel.hpp
deleted file mode 100644
index ebc63bb..0000000
--- a/StencilStream/IOKernel.hpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#pragma once
-#include "CounterID.hpp"
-#include "Index.hpp"
-#include <CL/sycl.hpp>
-#include <CL/sycl/accessor.hpp>
-#include <array>
-
-namespace stencil
-{
-
-/**
- * \brief Generic Input/Output kernel for use with the \ref ExecutionKernel and \ref Grid.
- * 
- * This kernel provides IO services to the execution kernel by writing the contents of a input tile with halo to the input pipe and writing the output of the execution kernel to an output tile. The input and output code only differs by one line. Therefore, both services are provided by the same class. Unlike \ref ExecutionKernel, this kernel is supposed to be constructed by a lambda expression that then either executes \ref IOKernel.read or \ref IOKernel.write.
- * 
- * Logically, an instance of the IO kernel receives access to a horizontal slice of the input or output and processes this slice in the \ref indexingorder. Due to the \ref tiling, this slice is partioned vertically into 2*n + 1 buffers, where n is the `n_halo_height_buffers` template parameter. For the input buffer, n should be 2, and for the output buffer, n should be 1. All buffers are expected to have a static height and dynamic width. The upper n and lower n buffers are all expected be `halo_height` cells high, while the buffer in the middle is expected to have be `core_height` cells high.
- * 
- * \tparam T Cell value type.
- * \tparam halo_height The radius (aka width and height) of the tile halo (and therefore both dimensions of a corner buffer).
- * \tparam core_height The height of the core buffer.
- * \tparam burst_length The number of elements that can be read or written in a burst.
- * \tparam pipe The pipe to read or write to.
- * \tparam n_halo_height_buffers The number of buffers to accept, in addition to 
- * \tparam access_mode The access mode to expect for the buffer accessor.
- * \tparam access_target The access target to expect for the buffer accessor.
- */
-template <typename T, uindex_t halo_height, uindex_t core_height, uindex_t burst_length, typename pipe, uindex_t n_halo_height_buffers, cl::sycl::access::mode access_mode, cl::sycl::access::target access_target = cl::sycl::access::target::global_buffer>
-class IOKernel
-{
-public:
-    using Accessor = cl::sycl::accessor<T, 2, access_mode, access_target>;
-
-    /**
-     * \brief The total number of buffers in a slice/column.
-     */
-    static constexpr uindex_t n_buffers = 2 * n_halo_height_buffers + 1;
-    /**
-     * \brief The total number of cell rows in the (logical) slice/column.
-     */
-    static constexpr uindex_t n_rows = 2 * n_halo_height_buffers * halo_height + core_height;
-
-    /**
-     * \brief Get the height of a buffer.
-     * 
-     * As described in the description, the first and last n buffers are `halo_height` cells high,
-     * while the middle buffer is `core_height` cells high.
-     */
-    static constexpr uindex_t get_buffer_height(uindex_t index)
-    {
-        if (index == n_halo_height_buffers)
-        {
-            return core_height;
-        }
-        else
-        {
-            return halo_height;
-        }
-    }
-
-    /**
-     * \brief Create a new IOKernel instance.
-     * 
-     * The created instance is not invocable. You need to construct it inside a lambda function (or equivalent) and then call either \ref IOKernel.read or \ref IOKernel.write.
-     * 
-     * \param accessor The slice/column of buffer accessors to process.
-     * \param n_columns The width of every buffer passed in `accessor`.
-     */
-    IOKernel(std::array<Accessor, n_buffers> accessor, uindex_t n_columns) : accessor(accessor), n_columns(n_columns)
-    {
-#ifndef __SYCL_DEVICE_ONLY__
-        for (uindex_t i = 0; i < n_buffers; i++)
-        {
-            assert(accessor[i].get_range()[1] == burst_length);
-            assert(get_buffer_height(i) * n_columns <= accessor[i].get_range()[0] * accessor[i].get_range()[1]);
-        }
-#endif
-    }
-
-    /**
-     * \brief Read the cells from the buffers and write them to the pipe.
-     */
-    void read()
-    {
-        static_assert(access_mode == cl::sycl::access::mode::read || access_mode == cl::sycl::access::mode::read_write);
-        run([](Accessor &accessor, uindex_t burst_i, uindex_t cell_i)
-            { pipe::write(accessor[burst_i][cell_i]); });
-    }
-
-    /**
-     * \brief Read the cells from the pipe and write them to the buffers.
-     */
-    void write()
-    {
-        static_assert(access_mode == cl::sycl::access::mode::write || access_mode == cl::sycl::access::mode::discard_write || access_mode == cl::sycl::access::mode::read_write || access_mode == cl::sycl::access::mode::discard_read_write);
-        run([](Accessor &accessor, uindex_t burst_i, uindex_t cell_i)
-            { accessor[burst_i][cell_i] = pipe::read(); });
-    }
-
-private:
-    template <typename Action>
-    void run(Action action)
-    {
-        static_assert(std::is_invocable<Action, Accessor &, uindex_t, uindex_t>::value);
-
-        uindex_t burst_i[n_buffers] = {0};
-        uindex_t cell_i[n_buffers] = {0};
-
-        for (uindex_t c = 0; c < n_columns; c++)
-        {
-            uindex_t buffer_i = 0;
-            uindex_t next_bound = get_buffer_height(0);
-            for (uindex_t r = 0; r < n_rows; r++)
-            {
-                if (r == next_bound)
-                {
-                    buffer_i++;
-                    next_bound += get_buffer_height(buffer_i);
-                }
-
-                action(accessor[buffer_i], burst_i[buffer_i], cell_i[buffer_i]);
-                if (cell_i[buffer_i] == burst_length - 1)
-                {
-                    cell_i[buffer_i] = 0;
-                    burst_i[buffer_i]++;
-                }
-                else
-                {
-                    cell_i[buffer_i]++;
-                }
-            }
-        }
-    }
-
-    std::array<Accessor, n_buffers> accessor;
-    uindex_t n_columns;
-};
-
-} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/Index.hpp b/StencilStream/Index.hpp
index b4a472c..89d4753 100644
--- a/StencilStream/Index.hpp
+++ b/StencilStream/Index.hpp
@@ -1,32 +1,41 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include <boost/preprocessor/cat.hpp>
 #include <cstdint>
 
-namespace stencil
-{
+namespace stencil {
 #ifndef STENCIL_INDEX_WIDTH
-#define STENCIL_INDEX_WIDTH 64
+    #define STENCIL_INDEX_WIDTH 64
 #endif
 
 /**
  * Integer types for indexing.
- * 
+ *
  * There is always a signed version, `index_t`, and an unsigned version, `uindex_t`. Their width is
  * defined by the `STENCIL_INDEX_WIDTH` macro. The default is 64 and can be increased to allow
  * bigger buffers or decreased to reduce the complexity and resource requirements.
- * 
+ *
  * Static asserts throughout the library ensure that the index type is wide enough. Therefore, you
  * can decrease the until you get compilation errors.
  */
 typedef BOOST_PP_CAT(BOOST_PP_CAT(uint, STENCIL_INDEX_WIDTH), _t) uindex_t;
 typedef BOOST_PP_CAT(BOOST_PP_CAT(int, STENCIL_INDEX_WIDTH), _t) index_t;
-}
\ No newline at end of file
+} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/MonotileExecutor.hpp b/StencilStream/MonotileExecutor.hpp
new file mode 100644
index 0000000..0e79833
--- /dev/null
+++ b/StencilStream/MonotileExecutor.hpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel
+ * Computing, Paderborn University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the “Software”), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include "SingleQueueExecutor.hpp"
+#include "monotile/ExecutionKernel.hpp"
+
+namespace stencil {
+template <typename T, uindex_t stencil_radius, typename TransFunc, uindex_t pipeline_length = 1,
+          uindex_t tile_width = 1024, uindex_t tile_height = 1024, uindex_t burst_size = 1024>
+/**
+ * \brief An executor that follows \ref monotile.
+ *
+ * The feature that distincts this executor from \ref StencilExecutor is that it works with exactly
+ * one tile. This means the grid range may not exceed the set tile range, but it uses less resources
+ * and time per kernel execution.
+ *
+ * \tparam T The cell type.
+ * \tparam stencil_radius The radius of the stencil buffer supplied to the transition function.
+ * \tparam TransFunc The type of the transition function.
+ * \tparam pipeline_length The number of hardware execution stages per kernel. Must be at least 1.
+ * Defaults to 1.
+ * \tparam tile_width The number of columns in a tile and maximum number of columns in a grid.
+ * Defaults to 1024.
+ * \tparam tile_height The number of rows in a tile and maximum number of rows in a grid. Defaults
+ * to 1024.
+ */
+class MonotileExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc> {
+  public:
+    /**
+     * \brief Shorthand for the parent class.
+     */
+    using Parent = SingleQueueExecutor<T, stencil_radius, TransFunc>;
+
+    /**
+     * \brief Create a new executor.
+     *
+     * \param halo_value The value of cells in the grid halo.
+     * \param trans_func An instance of the transition function type.
+     */
+    MonotileExecutor(T halo_value, TransFunc trans_func)
+        : Parent(halo_value, trans_func), tile_buffer(cl::sycl::range<2>(tile_width, tile_height)) {
+        auto ac = tile_buffer.template get_access<cl::sycl::access::mode::discard_write>();
+        ac[0][0] = halo_value;
+    }
+
+    /**
+     * \brief Set the internal state of the grid.
+     *
+     * This will copy the contents of the buffer to an internal representation. The buffer may be
+     * used for other purposes later. It must not reset the generation index. The range of the input
+     * buffer will be used as the new grid range.
+     *
+     * \throws std::range_error Thrown if the number of width or height of the buffer exceeds the
+     * set width and height of the tile. \param input_buffer The source buffer of the new grid
+     * state.
+     */
+    void set_input(cl::sycl::buffer<T, 2> input_buffer) override {
+        if (input_buffer.get_range()[0] > tile_width && input_buffer.get_range()[1] > tile_height) {
+            throw std::range_error("The grid is bigger than the tile. The monotile architecture "
+                                   "requires that grid ranges are smaller or equal to the tile "
+                                   "range");
+        }
+        auto in_ac = input_buffer.template get_access<cl::sycl::access::mode::read>();
+        tile_buffer = cl::sycl::buffer<T, 2>(input_buffer.get_range());
+        auto tile_ac = tile_buffer.template get_access<cl::sycl::access::mode::discard_write>();
+        for (uindex_t c = 0; c < input_buffer.get_range()[0]; c++) {
+            for (uindex_t r = 0; r < input_buffer.get_range()[1]; r++) {
+                tile_ac[c][r] = in_ac[c][r];
+            }
+        }
+    }
+
+    void copy_output(cl::sycl::buffer<T, 2> output_buffer) override {
+        if (output_buffer.get_range() != tile_buffer.get_range()) {
+            throw std::range_error("The output buffer is not the same size as the grid");
+        }
+        auto in_ac = tile_buffer.template get_access<cl::sycl::access::mode::read>();
+        auto out_ac = output_buffer.template get_access<cl::sycl::access::mode::discard_write>();
+        for (uindex_t c = 0; c < tile_buffer.get_range()[0]; c++) {
+            for (uindex_t r = 0; r < tile_buffer.get_range()[1]; r++) {
+                out_ac[c][r] = in_ac[c][r];
+            }
+        }
+    }
+
+    UID get_grid_range() const override {
+        return UID(tile_buffer.get_range()[0], tile_buffer.get_range()[1]);
+    }
+
+    void run(uindex_t n_generations) override {
+        using in_pipe = cl::sycl::pipe<class monotile_in_pipe, T>;
+        using out_pipe = cl::sycl::pipe<class monotile_out_pipe, T>;
+        using ExecutionKernelImpl =
+            monotile::ExecutionKernel<TransFunc, T, stencil_radius, pipeline_length, tile_width,
+                                      tile_height, in_pipe, out_pipe>;
+
+        cl::sycl::queue &queue = this->get_queue();
+
+        uindex_t target_i_generation = this->get_i_generation() + n_generations;
+        uindex_t grid_width = tile_buffer.get_range()[0];
+        uindex_t grid_height = tile_buffer.get_range()[1];
+
+        while (this->get_i_generation() < target_i_generation) {
+            cl::sycl::buffer<T, 2> out_buffer(tile_buffer.get_range());
+
+            queue.submit([&](cl::sycl::handler &cgh) {
+                auto ac = tile_buffer.template get_access<cl::sycl::access::mode::read>(cgh);
+                T halo_value = this->get_halo_value();
+
+                cgh.single_task<class MonotileInputKernel>([=]() {
+                    [[intel::loop_coalesce(2)]] for (uindex_t c = 0; c < tile_width; c++) {
+                        for (uindex_t r = 0; r < tile_height; r++) {
+                            T value;
+                            if (c < grid_width && r < grid_height) {
+                                value = ac[c][r];
+                            } else {
+                                value = halo_value;
+                            }
+
+                            in_pipe::write(value);
+                        }
+                    }
+                });
+            });
+
+            cl::sycl::event computation_event = queue.submit([&](cl::sycl::handler &cgh) {
+                cgh.single_task(ExecutionKernelImpl(
+                    this->get_trans_func(), this->get_i_generation(), target_i_generation,
+                    grid_width, grid_height, this->get_halo_value()));
+            });
+
+            queue.submit([&](cl::sycl::handler &cgh) {
+                auto ac =
+                    out_buffer.template get_access<cl::sycl::access::mode::discard_write>(cgh);
+                T halo_value = this->get_halo_value();
+
+                cgh.single_task<class MonotileInputKernel>([=]() {
+                    [[intel::loop_coalesce(2)]] for (uindex_t c = 0; c < tile_width; c++) {
+                        for (uindex_t r = 0; r < tile_height; r++) {
+                            T value = out_pipe::read();
+                            if (c < grid_width && r < grid_height) {
+                                ac[c][r] = value;
+                            }
+                        }
+                    }
+                });
+            });
+
+            tile_buffer = out_buffer;
+
+            if (this->is_runtime_analysis_enabled()) {
+                this->get_runtime_sample().add_pass(computation_event);
+            }
+
+            this->inc_i_generation(
+                std::min(target_i_generation - this->get_i_generation(), pipeline_length));
+        }
+    }
+
+  private:
+    cl::sycl::buffer<T, 2> tile_buffer;
+    UID grid_range;
+};
+} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/RuntimeSample.hpp b/StencilStream/RuntimeSample.hpp
index 52545f2..b806269 100644
--- a/StencilStream/RuntimeSample.hpp
+++ b/StencilStream/RuntimeSample.hpp
@@ -1,111 +1,100 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include "Index.hpp"
-#include "boost/multi_array.hpp"
 #include <CL/sycl/buffer.hpp>
 #include <CL/sycl/event.hpp>
 
-namespace stencil
-{
+namespace stencil {
 /**
- * \brief Collection of SYCL events from a \ref StencilExecutor.run invocation.
- * 
- * It's interface might be extended in the future to support more in-depth analysis. Now, it can analyse the runtime of a single execution kernel invocation as well as the total runtime of all passes.
+ * \brief Collection of runtime performance information for a \ref SingleQueueExecutor.
  */
-class RuntimeSample
-{
-public:
+class RuntimeSample {
+  public:
     /**
      * \brief Prepare the collection of SYCL events.
-     * 
-     * \param n_passes The number times the execution kernel is invoced per tile.
-     * \param n_tile_columns The number of tile columns.
-     * \param n_tile_rows The number of tile rows.
      */
-    RuntimeSample(uindex_t n_passes, uindex_t n_tile_columns, uindex_t n_tile_rows) : events(boost::extents[n_passes][n_tile_columns][n_tile_rows]) {}
+    RuntimeSample() : makespan(0.0), n_passes(0.0) {}
 
     /**
-     * \brief Add an event to the collection.
-     * 
-     * \param event The event to add.
-     * \param i_pass The index of the grid pass.
-     * \param i_column The index of the tile column.
-     * \param i_row The index of the tile row.
+     * \brief Retrieve the starting timestamp of the event, in seconds.
+     *
+     * This will fail with an exception if the event's queue has not been configured to collect
+     * runtime information.
      */
-    void add_event(cl::sycl::event event, uindex_t i_pass, uindex_t i_column, uindex_t i_row)
-    {
-        events[i_pass][i_column][i_row] = event;
+    static double start_of_event(cl::sycl::event event) {
+        return double(event.get_profiling_info<cl::sycl::info::event_profiling::command_start>()) /
+               timesteps_per_second;
     }
 
     /**
-     * \brief Calculate the runtime of the specified event.
-     * 
-     * If the event has not been added with \ref RuntimeSample.add_event before, the resulting value is undefined.
-     * 
-     * \param i_pass The index of the grid pass.
-     * \param i_column The index of the tile column.
-     * \param i_row The index of the tile row.
-     * \return The runtime of the specified event, in seconds.
+     * \brief Retrieve the ending timestamp of the event, in seconds.
+     *
+     * This will fail with an exception if the event's queue has not been configured to collect
+     * runtime information.
      */
-    double get_runtime(uindex_t i_pass, uindex_t i_column, uindex_t i_row)
-    {
-        unsigned long event_start = events[i_pass][i_column][i_row].get_profiling_info<cl::sycl::info::event_profiling::command_start>();
-        unsigned long event_end = events[i_pass][i_column][i_row].get_profiling_info<cl::sycl::info::event_profiling::command_end>();
-        return (event_end - event_start) / timesteps_per_second;
+    static double end_of_event(cl::sycl::event event) {
+        return double(event.get_profiling_info<cl::sycl::info::event_profiling::command_end>()) /
+               timesteps_per_second;
     }
 
     /**
-     * \brief Calculate the total runtime of all passes.
-     * 
-     * This iterates through all known events, finds the earliest start and latest end and returns the time difference between them.
-     * 
-     * \return The total wall time in seconds it took to execute all passes.
+     * \brief Retrieve the runtime/makespan of the event, in seconds.
+     *
+     * This will fail with an exception if the event's queue has not been configured to collect
+     * runtime information.
      */
-    double get_total_runtime()
-    {
-        unsigned long earliest_start = std::numeric_limits<unsigned long>::max();
-        unsigned long latest_end = std::numeric_limits<unsigned long>::min();
-
-        for (uindex_t i_pass = 0; i_pass < events.shape()[0]; i_pass++)
-        {
-            for (uindex_t i_column = 0; i_column < events.shape()[1]; i_column++)
-            {
-                for (uindex_t i_row = 0; i_row < events.shape()[2]; i_row++)
-                {
-                    unsigned long event_start = events[i_pass][i_column][i_row].get_profiling_info<cl::sycl::info::event_profiling::command_start>();
-                    unsigned long event_end = events[i_pass][i_column][i_row].get_profiling_info<cl::sycl::info::event_profiling::command_end>();
-                    if (event_start < earliest_start)
-                        earliest_start = event_start;
-                    if (event_end > latest_end)
-                        latest_end = event_end;
-                }
-            }
-        }
+    static double runtime_of_event(cl::sycl::event event) {
+        return end_of_event(event) - start_of_event(event);
+    }
 
-        return (latest_end - earliest_start) / timesteps_per_second;
+    /**
+     * \brief Add the runtime/makespan of a single pass over the grid to the database.
+     */
+    void add_pass(double pass_runtime) {
+        makespan += pass_runtime;
+        n_passes += 1.0;
     }
 
+    /**
+     * \brief Add the event of a single pass over the grid to the database.
+     */
+    void add_pass(cl::sycl::event event) { add_pass(runtime_of_event(event)); }
+
+    /**
+     * \brief Get the makespan of all grid passes.
+     *
+     * \return The makespan in seconds it took to execute all passes.
+     */
+    double get_total_runtime() { return makespan; }
+
     /**
      * \brief Calculate the mean execution speed in passes per second.
-     * 
+     *
      * \return The mean execution speed in passes per second.
      */
-    double get_mean_speed()
-    {
-        return get_total_runtime() / double(events.shape()[0]);
-    }
+    double get_mean_speed() { return makespan / n_passes; }
 
-private:
+  private:
     static constexpr double timesteps_per_second = 1000000000.0;
 
-    boost::multi_array<cl::sycl::event, 3> events;
+    double makespan, n_passes;
 };
 } // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/SingleQueueExecutor.hpp b/StencilStream/SingleQueueExecutor.hpp
new file mode 100644
index 0000000..1322140
--- /dev/null
+++ b/StencilStream/SingleQueueExecutor.hpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+#include "AbstractExecutor.hpp"
+#include "RuntimeSample.hpp"
+#include <CL/sycl/INTEL/fpga_extensions.hpp>
+#include <optional>
+
+namespace stencil {
+/**
+ * \brief An abstract executor with common code for executors that work with a single SYCL queue.
+ *
+ * This class contains common code for executors that work with execution kernels running on a
+ * single queue. This includes queue selection and management as well as event-based runtime
+ * analysis.
+ *
+ * User code that may work with any kind of executor should use pointers to the general \ref
+ * AbstractExecutor.
+ *
+ * \tparam T The cell type.
+ * \tparam stencil_radius The radius of the stencil buffer supplied to the transition function.
+ * \tparam TransFunc The type of the transition function.
+ */
+template <typename T, uindex_t stencil_radius, typename TransFunc>
+class SingleQueueExecutor : public AbstractExecutor<T, stencil_radius, TransFunc> {
+  public:
+    /**
+     * \brief Create a new executor.
+     * \param halo_value The value of cells that are outside the grid.
+     * \param trans_func The instance of the transition function that should be used to calculate
+     * new generations.
+     */
+    SingleQueueExecutor(T halo_value, TransFunc trans_func)
+        : AbstractExecutor<T, stencil_radius, TransFunc>(halo_value, trans_func),
+          queue(std::nullopt), runtime_sample() {}
+
+    /**
+     * \brief Return the configured queue.
+     *
+     * If no queue has been configured yet, this method will configure and return a queue targeting
+     * the FPGA emulator, without runtime analysis.
+     */
+    cl::sycl::queue &get_queue() {
+        if (!this->queue.has_value()) {
+            select_emulator(false);
+        }
+        return *queue;
+    }
+
+    /**
+     * \brief Manually set the SYCL queue to use for execution.
+     *
+     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for
+     * the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will
+     * lead to exceptions.
+     *
+     * In order to use runtime analysis features, the queue has to be configured with the
+     * `cl::sycl::property::queue::enable_profiling` property. A `std::runtime_error` is thrown if
+     * `runtime_analysis` is true and the passed queue does not have this property.
+     *
+     * \deprecated This method is deprecated since the `runtime_analysis` flag is redundant by now.
+     * Use the other variant without the `runtime_analysis` flag instead.
+     *
+     * \param queue The new SYCL queue to use for execution.
+     * \param runtime_analysis Enable event-level runtime analysis.
+     */
+    [[deprecated("Use set_queue(cl::sycl::queue) instead")]] void set_queue(cl::sycl::queue queue,
+                                                                            bool runtime_analysis) {
+        if (runtime_analysis &&
+            !queue.has_property<cl::sycl::property::queue::enable_profiling>()) {
+            throw std::runtime_error(
+                "Runtime analysis is enabled, but the queue does not support it.");
+        }
+        this->queue = queue;
+    }
+
+    /**
+     * \brief Manually set the SYCL queue to use for execution.
+     *
+     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for
+     * the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will
+     * lead to exceptions.
+     *
+     * Runtime analysis is enabled by configuring the queue with the
+     * `cl::sycl::property::queue::enable_profiling` property.
+     *
+     * \param queue The new SYCL queue to use for execution.
+     */
+    void set_queue(cl::sycl::queue queue) { this->queue = queue; }
+
+    /**
+     * \brief Set up a SYCL queue with the FPGA emulator device, without runtime analysis.
+     *
+     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for
+     * the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will
+     * lead to exceptions.
+     */
+    void select_emulator() { select_emulator(false); }
+
+    /**
+     * \brief Set up a SYCL queue with an FPGA device, without runtime analysis.
+     *
+     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for
+     * the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will
+     * lead to exceptions.
+     */
+    void select_fpga() { select_fpga(false); }
+
+    /**
+     * \brief Set up a SYCL queue with the FPGA emulator device and optional runtime analysis.
+     *
+     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for
+     * the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will
+     * lead to exceptions.
+     *
+     * \param runtime_analysis Enable event-level runtime analysis.
+     */
+    void select_emulator(bool runtime_analysis) {
+        this->queue = cl::sycl::queue(cl::sycl::INTEL::fpga_emulator_selector(),
+                                      get_queue_properties(runtime_analysis));
+    }
+
+    /**
+     * \brief Set up a SYCL queue with an FPGA device and optional runtime analysis.
+     *
+     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for
+     * the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will
+     * lead to exceptions.
+     *
+     * \param runtime_analysis Enable event-level runtime analysis.
+     */
+    void select_fpga(bool runtime_analysis) {
+        this->queue = cl::sycl::queue(cl::sycl::INTEL::fpga_selector(),
+                                      get_queue_properties(runtime_analysis));
+    }
+
+    /**
+     * \brief Check if the configured queue supports runtime analysis.
+     *
+     * False if no queue has been configured yet.
+     */
+    bool is_runtime_analysis_enabled() const {
+        if (queue.has_value()) {
+            return queue->has_property<cl::sycl::property::queue::enable_profiling>();
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * \brief Return a reference to the runtime information struct.
+     *
+     * \return The collected runtime information.
+     */
+    RuntimeSample &get_runtime_sample() { return runtime_sample; }
+
+  private:
+    static cl::sycl::property_list get_queue_properties(bool runtime_analysis) {
+        cl::sycl::property_list properties;
+        if (runtime_analysis) {
+            properties = {cl::sycl::property::queue::enable_profiling{}};
+        } else {
+            properties = {};
+        }
+        return properties;
+    }
+
+    std::optional<cl::sycl::queue> queue;
+    RuntimeSample runtime_sample;
+};
+} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/Stencil.hpp b/StencilStream/Stencil.hpp
index 02a0590..84f7a58 100644
--- a/StencilStream/Stencil.hpp
+++ b/StencilStream/Stencil.hpp
@@ -1,32 +1,41 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include "GenericID.hpp"
 #include "Index.hpp"
 
-namespace stencil
-{
+namespace stencil {
 
 /**
  * \brief The stencil buffer.
- * 
- * The stencil buffer contains the extended Moore neighborhood of a central cell and is used by the transition function to
- * calculate the next generation of the central cell.
- * 
- * This implementation provides two ways to index the stencil: With an `ID` and a `UID`. Since `ID` is signed, the column and row axes are within the range of [-radius : radius].
- * Therefore, (0,0) points to the central cell. `UID` is unsigned and the column and row axes are within the range of [0 : 2*radius + 1). Therefore, (0,0) points to the north-western corner of the stencil.
+ *
+ * The stencil buffer contains the extended Moore neighborhood of a central cell and is used by the
+ * transition function to calculate the next generation of the central cell.
+ *
+ * This implementation provides two ways to index the stencil: With an `ID` and a `UID`. Since `ID`
+ * is signed, the column and row axes are within the range of [-radius : radius]. Therefore, (0,0)
+ * points to the central cell. `UID` is unsigned and the column and row axes are within the range of
+ * [0 : 2*radius + 1). Therefore, (0,0) points to the north-western corner of the stencil.
  */
-template <typename T, uindex_t radius>
-class Stencil
-{
-public:
+template <typename T, uindex_t radius> class Stencil {
+  public:
     /**
      * \brief The diameter (aka width and height) of the stencil buffer.
      */
@@ -37,31 +46,30 @@ class Stencil
 
     /**
      * \brief Create a new stencil with an uninitialized buffer.
-     * 
+     *
      * \param id The position of the central cell in the global grid.
      * \param generation The present generation index of the central cell.
      * \param stage The index of the pipeline stage that calls the transition function.
      * \param grid_range The range of the stencil's grid.
      */
-    Stencil(ID id, uindex_t generation, uindex_t stage, UID grid_range) : id(id), generation(generation), stage(stage), internal(), grid_range(grid_range) {}
+    Stencil(ID id, uindex_t generation, uindex_t stage, UID grid_range)
+        : id(id), generation(generation), stage(stage), internal(), grid_range(grid_range) {}
 
     /**
      * \brief Create a new stencil from the raw buffer.
-     * 
+     *
      * \param id The position of the central cell in the global grid.
      * \param generation The present generation index of the central cell.
      * \param stage The index of the pipeline stage that calls the transition function.
      * \param raw A raw array containing cells.
      * \param grid_range The range of the stencil's grid.
      */
-    Stencil(ID id, uindex_t generation, uindex_t stage, T raw[diameter][diameter], UID grid_range) : id(id), generation(generation), stage(stage), internal(), grid_range(grid_range)
-    {
+    Stencil(ID id, uindex_t generation, uindex_t stage, T raw[diameter][diameter], UID grid_range)
+        : id(id), generation(generation), stage(stage), internal(), grid_range(grid_range) {
 #pragma unroll
-        for (uindex_t c = 0; c < diameter; c++)
-        {
+        for (uindex_t c = 0; c < diameter; c++) {
 #pragma unroll
-            for (uindex_t r = 0; r < diameter; r++)
-            {
+            for (uindex_t r = 0; r < diameter; r++) {
                 internal[c][r] = raw[c][r];
             }
         }
@@ -69,29 +77,31 @@ class Stencil
 
     /**
      * \brief Access a cell in the stencil.
-     * 
+     *
      * Since the indices in `id` are signed, the origin of this index operator is the central cell.
      */
     T const &operator[](ID id) const { return internal[id.c + radius][id.r + radius]; }
 
     /**
      * \brief Access a cell in the stencil.
-     * 
+     *
      * Since the indices in `id` are signed, the origin of this index operator is the central cell.
      */
     T &operator[](ID id) { return internal[id.c + radius][id.r + radius]; }
 
     /**
      * \brief Access a cell in the stencil.
-     * 
-     * Since the indices in `id` are unsigned, the origin of this index operator is the north-western corner.
+     *
+     * Since the indices in `id` are unsigned, the origin of this index operator is the
+     * north-western corner.
      */
     T const &operator[](UID id) const { return internal[id.c][id.r]; }
 
     /**
      * \brief Access a cell in the stencil.
-     * 
-     * Since the indices in `id` are unsigned, the origin of this index operator is the north-western corner.
+     *
+     * Since the indices in `id` are unsigned, the origin of this index operator is the
+     * north-western corner.
      */
     T &operator[](UID id) { return internal[id.c][id.r]; }
 
@@ -107,7 +117,7 @@ class Stencil
 
     /**
      * \brief The index of the pipeline stage that calls the transition function.
-     * 
+     *
      * The stage index is added to the generation index of the input tile to get the generation
      * index of this stencil. Under the assumption that the pipeline was always fully executed, it
      * equals to `generation % pipeline_length`. Since it is hard coded in the final design, it can
@@ -122,8 +132,8 @@ class Stencil
      *   return bar(stencil);
      * }
      * ```
-     * `foo` will only be synthesized for even pipeline stages, and `bar` will only be synthesized for
-     * odd pipeline stages.
+     * `foo` will only be synthesized for even pipeline stages, and `bar` will only be synthesized
+     * for odd pipeline stages.
      */
     const uindex_t stage;
 
@@ -132,7 +142,7 @@ class Stencil
      */
     const UID grid_range;
 
-private:
+  private:
     T internal[diameter][diameter];
 };
 
diff --git a/StencilStream/StencilExecutor.hpp b/StencilStream/StencilExecutor.hpp
index 824d662..f950492 100644
--- a/StencilStream/StencilExecutor.hpp
+++ b/StencilStream/StencilExecutor.hpp
@@ -1,338 +1,144 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
-#include "ExecutionKernel.hpp"
-#include "Grid.hpp"
-#include "RuntimeSample.hpp"
-#include <CL/sycl.hpp>
-#include <CL/sycl/INTEL/fpga_extensions.hpp>
+#include "SingleQueueExecutor.hpp"
+#include "tiling/ExecutionKernel.hpp"
+#include "tiling/Grid.hpp"
 
-namespace stencil
-{
+namespace stencil {
 /**
- * \brief Execution coordinator.
- * 
- * The `StencilExecutor` binds the different parts of StencilStream together and provides a unified
- * interface for applications. Users create a stencil executor, configure it and then use it to run
- * the payload computations. It has multiple logical attributes that can be configured:
- * 
- * ### Grid 
- * 
- * The grid is the logical array of cells, set with \ref StencilExecutor.set_input A stencil
- * executor does not work in place and a buffer used to initialize the grid can be used for other
- * tasks afterwards. The \ref StencilExecutor.run method alters the state of the grid and the grid
- * can be copied back to a given buffer using \ref StencilExecutor.copy_output.
- * 
- * ### SYCL queue
- * 
- * The queue provides the OpenCL platform, device, context and queue to execute the kernels. A stencil
- * executor does not have a queue when constructed and \ref StencilExecutor.run tries to select the
- * FPGA emulation device if no queue has been configured yet. \ref StencilExecutor.set_queue is used
- * to directly set the queue, but \ref StencilExecutor.select_emulator and
- * \ref StencilExecutor.select_fpga can be used to automatically configure the FPGA emulator or the
- * FPGA, respectively.
- * 
- * ### Transition Function
- * 
- * A stencil executor stores an instance of the transition function since it may require some
- * configuration and runtime-dynamic parameters too. An instance is required for the initialization,
- * but it may be replaced at any time with \ref StencilExecutor.set_trans_func.
- * 
- * ### Generation Index
- * 
- * This is the generation index of the current state of the grid. \ref StencilExecutor.run updates and therefore, it can be ignored in most instances. However, it can be reset if a transition function needs it.
- * 
- * \tparam T Cell value type.
- * \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the central cell. Must be at least 1.
- * \tparam TransFunc An invocable type that maps a \ref Stencil to the next generation of the stencil's central cell.
- * \tparam pipeline_length The number of hardware execution stages. Must be at least 1. Defaults to 1.
- * \tparam tile_width The number of columns in a tile. Defaults to 1024.
- * \tparam tile_height The number of rows in a tile. Defaults to 1024.
+ * \brief The default stencil executor.
+ *
+ * Unlike \ref MonotileExecutor, this executors supports any grid range by tiling the grid as
+ * described in \ref tiling, at the cost of complexer IO and a computational overhead for every
+ * tile.
+ *
+ * This executor is called `StencilExecutor` since was the first one in StencilStream 2.x. A better
+ * name for it would be `TilingExecutor`.
+ *
+ * \tparam T The cell type.
+ * \tparam stencil_radius The radius of the stencil buffer supplied to the transition function.
+ * \tparam TransFunc The type of the transition function.
+ * \tparam pipeline_length The number of hardware execution stages per kernel. Must be at least 1.
+ * Defaults to 1.
+ * \tparam tile_width The number of columns in a tile and maximum number of columns in a grid.
+ * Defaults to 1024.
+ * \tparam tile_height The number of rows in a tile and maximum number of rows in a grid. Defaults
+ * to 1024.
  * \tparam burst_size The number of bytes to load/store in one burst. Defaults to 1024.
  */
-template <typename T, uindex_t stencil_radius, typename TransFunc, uindex_t pipeline_length = 1, uindex_t tile_width = 1024, uindex_t tile_height = 1024, uindex_t burst_size = 1024>
-class StencilExecutor
-{
-public:
-    static constexpr uindex_t burst_length = std::min<uindex_t>(1, burst_size / sizeof(T));
-    static constexpr uindex_t halo_radius = stencil_radius * pipeline_length;
-
+template <typename T, uindex_t stencil_radius, typename TransFunc, uindex_t pipeline_length = 1,
+          uindex_t tile_width = 1024, uindex_t tile_height = 1024, uindex_t burst_size = 1024>
+class StencilExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc> {
+  public:
     /**
-     * \brief Create a new stencil executor.
-     * 
-     * \param halo_value The value of cells in the grid halo.
-     * \param trans_func An instance of the transition function type.
+     * \brief The number of cells that can be transfered in a single burst.
      */
-    StencilExecutor(T halo_value, TransFunc trans_func) : input_grid(cl::sycl::buffer<T, 2>(cl::sycl::range<2>(0, 0))), queue(), trans_func(trans_func), i_generation(0), halo_value(halo_value), runtime_analysis_enabled(false)
-    {
-    }
+    static constexpr uindex_t burst_length = std::min<uindex_t>(1, burst_size / sizeof(T));
 
     /**
-     * \brief Compute the next generations.
-     * 
-     * It uses the transition function to advance the state of the grid.
-     * 
-     * \param n_generations The number of generations to advance the state of the grid by.
+     * \brief The number of cells that have be added to the tile in every direction to form the
+     * complete input.
      */
-    void run(uindex_t n_generations)
-    {
-        using in_pipe = cl::sycl::pipe<class in_pipe_id, T>;
-        using out_pipe = cl::sycl::pipe<class out_pipe_id, T>;
-        using ExecutionKernelImpl = ExecutionKernel<TransFunc, T, stencil_radius, pipeline_length, tile_width, tile_height, in_pipe, out_pipe>;
-
-        if (!this->queue.has_value())
-        {
-            select_emulator(false);
-        }
-
-        cl::sycl::queue queue = *(this->queue);
-
-        uindex_t n_passes = n_generations / pipeline_length;
-        if (n_generations % pipeline_length != 0)
-        {
-            n_passes += 1;
-        }
-
-        if (runtime_analysis_enabled)
-        {
-            runtime_sample = std::nullopt;
-            runtime_sample = RuntimeSample(n_passes, input_grid.get_tile_range().c, input_grid.get_tile_range().r);
-        }
-
-        uindex_t grid_width = input_grid.get_grid_range().c;
-        uindex_t grid_height = input_grid.get_grid_range().r;
-
-        for (uindex_t i = 0; i < n_passes; i++)
-        {
-            Grid output_grid = input_grid.make_output_grid();
-
-            for (uindex_t c = 0; c < input_grid.get_tile_range().c; c++)
-            {
-                for (uindex_t r = 0; r < input_grid.get_tile_range().r; r++)
-                {
-                    input_grid.template submit_tile_input<in_pipe>(queue, UID(c, r));
-
-                    cl::sycl::event computation_event = queue.submit([&](cl::sycl::handler &cgh)
-                                                                     { cgh.single_task(ExecutionKernelImpl(
-                                                                           trans_func,
-                                                                           i_generation,
-                                                                           n_generations,
-                                                                           c * tile_width,
-                                                                           r * tile_height,
-                                                                           grid_width,
-                                                                           grid_height,
-                                                                           halo_value)); });
-
-                    if (runtime_analysis_enabled)
-                    {
-                        runtime_sample->add_event(computation_event, i, c, r);
-                    }
-
-                    output_grid.template submit_tile_output<out_pipe>(queue, UID(c, r));
-                }
-            }
-            input_grid = output_grid;
-            i_generation += std::min(n_generations - i_generation, pipeline_length);
-        }
-    }
+    static constexpr uindex_t halo_radius = stencil_radius * pipeline_length;
 
     /**
-     * \brief Set the state of the grid.
-     * 
-     * \param input_buffer A buffer containing the new state of the grid.
+     * \brief Shorthand for the parent class.
      */
-    void set_input(cl::sycl::buffer<T, 2> input_buffer)
-    {
-        this->input_grid = GridImpl(input_buffer);
-    }
+    using Parent = SingleQueueExecutor<T, stencil_radius, TransFunc>;
 
     /**
-     * \brief Copy the current state of the grid to the buffer.
-     * 
-     * The \ref output_buffer has to have the exact range as returned by \ref StencilExecutor.get_grid_range.
-     * 
-     * \param output_buffer Copy the state of the grid to this buffer.
+     * \brief Create a new stencil executor.
+     *
+     * \param halo_value The value of cells in the grid halo.
+     * \param trans_func An instance of the transition function type.
      */
-    void copy_output(cl::sycl::buffer<T, 2> output_buffer)
-    {
-        input_grid.copy_to(output_buffer);
-    }
+    StencilExecutor(T halo_value, TransFunc trans_func)
+        : Parent(halo_value, trans_func),
+          input_grid(cl::sycl::buffer<T, 2>(cl::sycl::range<2>(0, 0))) {}
 
-    /**
-     * \brief Return the range of the grid.
-     * 
-     * \return The range of the grid.
-     */
-    UID get_grid_range() const
-    {
-        return input_grid.get_grid_range();
+    void set_input(cl::sycl::buffer<T, 2> input_buffer) override {
+        this->input_grid = GridImpl(input_buffer);
     }
 
-    /**
-     * \brief Return the value of cells in the grid halo.
-     * 
-     * \return The value of cells in the grid halo.
-     */
-    T get_halo_value() const
-    {
-        return halo_value;
+    void copy_output(cl::sycl::buffer<T, 2> output_buffer) override {
+        input_grid.copy_to(output_buffer);
     }
 
-    /**
-     * \brief Set the value of cells in the grid halo.
-     * 
-     * \param halo_value The new value of cells in the grid halo.
-     */
-    void set_halo_value(T halo_value)
-    {
-        this->halo_value = halo_value;
-    }
+    UID get_grid_range() const override { return input_grid.get_grid_range(); }
 
-    /**
-     * \brief Manually set the SYCL queue to use for execution.
-     * 
-     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will lead to exceptions.
-     * 
-     * In order to use runtime analysis features, the queue has to be configured with the `cl::sycl::property::queue::enable_profiling` property.
-     * 
-     * \param queue The new SYCL queue to use for execution.
-     * \param runtime_analysis Enable event-level runtime analysis.
-     */
-    void set_queue(cl::sycl::queue queue, bool runtime_analysis)
-    {
-        runtime_analysis_enabled = runtime_analysis;
-        this->queue = queue;
-        verify_queue_properties();
-    }
+    void run(uindex_t n_generations) override {
+        using in_pipe = cl::sycl::pipe<class tiling_in_pipe, T>;
+        using out_pipe = cl::sycl::pipe<class tiling_out_pipe, T>;
+        using ExecutionKernelImpl =
+            tiling::ExecutionKernel<TransFunc, T, stencil_radius, pipeline_length, tile_width,
+                                    tile_height, in_pipe, out_pipe>;
 
-    /**
-     * \brief Set up a SYCL queue with the FPGA emulator device.
-     * 
-     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will lead to exceptions.
-     */
-    void select_emulator()
-    {
-        select_emulator(false);
-    }
+        cl::sycl::queue &queue = this->get_queue();
 
-    /**
-     * \brief Set up a SYCL queue with an FPGA device.
-     * 
-     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will lead to exceptions.
-     */
-    void select_fpga()
-    {
-        select_fpga(false);
-    }
+        uindex_t target_i_generation = this->get_i_generation() + n_generations;
+        uindex_t grid_width = input_grid.get_grid_range().c;
+        uindex_t grid_height = input_grid.get_grid_range().r;
 
-    /**
-     * \brief Set up a SYCL queue with the FPGA emulator device and optional runtime analysis.
-     * 
-     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will lead to exceptions.
-     * 
-     * \param runtime_analysis Enable event-level runtime analysis.
-     */
-    void select_emulator(bool runtime_analysis)
-    {
-        runtime_analysis_enabled = runtime_analysis;
-        this->queue = cl::sycl::queue(cl::sycl::INTEL::fpga_emulator_selector(), get_queue_properties());
-    }
+        while (this->get_i_generation() < target_i_generation) {
+            GridImpl output_grid = input_grid.make_output_grid();
 
-    /**
-     * \brief Set up a SYCL queue with an FPGA device and optional runtime analysis.
-     * 
-     * Note that as of OneAPI Version 2021.1.1, device code is usually built either for CPU/GPU, for the FPGA emulator or for a specific FPGA. Using the wrong queue with the wrong device will lead to exceptions.
-     * 
-     * \param runtime_analysis Enable event-level runtime analysis.
-     */
-    void select_fpga(bool runtime_analysis)
-    {
-        runtime_analysis_enabled = runtime_analysis;
-        this->queue = cl::sycl::queue(cl::sycl::INTEL::fpga_selector(), get_queue_properties());
-    }
+            std::vector<cl::sycl::event> events;
+            events.reserve(input_grid.get_tile_range().c * input_grid.get_tile_range().r);
 
-    /**
-     * \brief Update the transition function instance.
-     * 
-     * \param trans_func The new transition function instance.
-     */
-    void set_trans_func(TransFunc trans_func)
-    {
-        this->trans_func = trans_func;
-    }
+            for (uindex_t c = 0; c < input_grid.get_tile_range().c; c++) {
+                for (uindex_t r = 0; r < input_grid.get_tile_range().r; r++) {
+                    input_grid.template submit_tile_input<in_pipe>(queue, UID(c, r));
 
-    /**
-     * \brief Get the current generation index of the grid.
-     * 
-     * \return The current generation index of the grid.
-     */
-    uindex_t get_i_generation() const
-    {
-        return i_generation;
-    }
+                    cl::sycl::event computation_event = queue.submit([&](cl::sycl::handler &cgh) {
+                        cgh.single_task(ExecutionKernelImpl(
+                            this->get_trans_func(), this->get_i_generation(), target_i_generation,
+                            c * tile_width, r * tile_height, grid_width, grid_height,
+                            this->get_halo_value()));
+                    });
+                    events.push_back(computation_event);
 
-    /**
-     * \brief Set the generation index of the grid.
-     * 
-     * \param i_generation The new generation index of the grid.
-     */
-    void set_i_generation(uindex_t i_generation)
-    {
-        this->i_generation = i_generation;
-    }
+                    output_grid.template submit_tile_output<out_pipe>(queue, UID(c, r));
+                }
+            }
 
-    /**
-     * \brief Return the runtime information collected from the last \ref StencilExecutor.run call.
-     * 
-     * \return The collected runtime information. May be `nullopt` if no runtime analysis was configured.
-     */
-    std::optional<RuntimeSample> get_runtime_sample()
-    {
-        return runtime_sample;
-    }
+            input_grid = output_grid;
 
-private:
-    cl::sycl::property_list get_queue_properties()
-    {
-        cl::sycl::property_list properties;
-        if (runtime_analysis_enabled)
-        {
-            properties = {cl::sycl::property::queue::enable_profiling{}};
-        }
-        else
-        {
-            properties = {};
-        }
-        return properties;
-    }
+            if (this->is_runtime_analysis_enabled()) {
+                double earliest_start = std::numeric_limits<double>::max();
+                double latest_end = std::numeric_limits<double>::min();
 
-    void verify_queue_properties()
-    {
-        if (!queue.has_value())
-            return;
+                for (cl::sycl::event event : events) {
+                    earliest_start = std::min(earliest_start, RuntimeSample::start_of_event(event));
+                    latest_end = std::max(latest_end, RuntimeSample::end_of_event(event));
+                }
+                this->get_runtime_sample().add_pass(latest_end - earliest_start);
+            }
 
-        if (runtime_analysis_enabled && !queue->has_property<cl::sycl::property::queue::enable_profiling>())
-        {
-            throw std::runtime_error("Runtime analysis is enabled, but the queue does not support it.");
+            this->inc_i_generation(
+                std::min(target_i_generation - this->get_i_generation(), pipeline_length));
         }
     }
 
-    using GridImpl = Grid<T, tile_width, tile_height, halo_radius, burst_length>;
-
+  private:
+    using GridImpl = tiling::Grid<T, tile_width, tile_height, halo_radius, burst_length>;
     GridImpl input_grid;
-    std::optional<cl::sycl::queue> queue;
-    TransFunc trans_func;
-    uindex_t i_generation;
-    T halo_value;
-
-    bool runtime_analysis_enabled;
-    std::optional<RuntimeSample> runtime_sample;
 };
 } // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/monotile/ExecutionKernel.hpp b/StencilStream/monotile/ExecutionKernel.hpp
new file mode 100644
index 0000000..9fc95c1
--- /dev/null
+++ b/StencilStream/monotile/ExecutionKernel.hpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+#include "../GenericID.hpp"
+#include "../Helpers.hpp"
+#include "../Index.hpp"
+#include "../Stencil.hpp"
+#include <optional>
+
+namespace stencil {
+namespace monotile {
+
+/**
+ * \brief A kernel that executes a stencil transition function using the monotile approach.
+ *
+ * It receives the contents of a tile and it's halo from the `in_pipe`, applies the transition
+ * function when applicable and writes the result to the `out_pipe`.
+ *
+ * With the monotile approach, the whole grid fits in one tile. This eliminates the need to
+ * calculate the cells of the tile halo, reducing the cache size and number of loop iterations. More
+ * is described in \ref monotile.
+ *
+ * \tparam TransFunc The type of transition function to use.
+ * \tparam T Cell value type.
+ * \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the
+ * central cell \tparam pipeline_length The number of pipeline stages to use. Similar to an unroll
+ * factor for a loop. \tparam output_tile_width The number of columns in a grid tile. \tparam
+ * output_tile_height The number of rows in a grid tile. \tparam in_pipe The pipe to read from.
+ * \tparam out_pipe The pipe to write to.
+ */
+template <typename TransFunc, typename T, uindex_t stencil_radius, uindex_t pipeline_length,
+          uindex_t tile_width, uindex_t tile_height, typename in_pipe, typename out_pipe>
+class ExecutionKernel {
+  public:
+    static_assert(
+        std::is_invocable_r<T, TransFunc const, Stencil<T, stencil_radius> const &>::value);
+    static_assert(stencil_radius >= 1);
+
+    /**
+     * \brief The width and height of the stencil buffer.
+     */
+    const static uindex_t stencil_diameter = Stencil<T, stencil_radius>::diameter;
+
+    /**
+     * \brief The number of cells in the tile.
+     */
+    const static uindex_t n_cells = tile_width * tile_height;
+
+    /**
+     * \brief The number of cells that need to be fed into a stage before it produces correct
+     * values.
+     */
+    const static uindex_t stage_latency = stencil_radius * (tile_height + 1);
+
+    /**
+     * \brief The number of cells that need to be fed into the pipeline before it produces correct
+     * values.
+     */
+    const static uindex_t pipeline_latency = pipeline_length * stage_latency;
+
+    /**
+     * \brief The total number of loop iterations.
+     */
+    const static uindex_t n_iterations = pipeline_latency + n_cells;
+
+    /**
+     * \brief Create and configure the execution kernel.
+     *
+     * \param trans_func The instance of the transition function to use.
+     * \param i_generation The generation index of the input cells.
+     * \param n_generations The number of generations to compute. If this number is bigger than
+     * `pipeline_length`, only `pipeline_length` generations will be computed.
+     * \param grid_width The number of cell columns in the grid.
+     * \param grid_height The number of cell rows in the grid.
+     * \param halo_value The value of cells outside the grid.
+     */
+    ExecutionKernel(TransFunc trans_func, uindex_t i_generation, uindex_t n_generations,
+                    uindex_t grid_width, uindex_t grid_height, T halo_value)
+        : trans_func(trans_func), i_generation(i_generation), n_generations(n_generations),
+          grid_width(grid_width), grid_height(grid_height), halo_value(halo_value) {}
+
+    /**
+     * \brief Execute the kernel.
+     */
+    void operator()() const {
+        [[intel::fpga_register]] index_t c[pipeline_length];
+        [[intel::fpga_register]] index_t r[pipeline_length];
+
+        // Initializing (output) column and row counters.
+        index_t prev_c = 0;
+        index_t prev_r = 0;
+#pragma unroll
+        for (uindex_t i = 0; i < pipeline_length; i++) {
+            c[i] = prev_c - stencil_radius;
+            r[i] = prev_r - stencil_radius;
+            if (r[i] < index_t(0)) {
+                r[i] += tile_height;
+                c[i] -= 1;
+            }
+            prev_c = c[i];
+            prev_r = r[i];
+        }
+
+        /*
+         * The intel::numbanks attribute requires a power of two as it's argument and if the
+         * pipeline length isn't a power of two, it would produce an error. Therefore, we calculate
+         * the next power of two and use it to allocate the cache. The compiler is smart enough to
+         * see that these additional banks in the cache aren't used and therefore optimizes them
+         * away.
+         */
+        [[intel::fpga_memory, intel::numbanks(2 * next_power_of_two(pipeline_length))]] T
+            cache[2][tile_height][next_power_of_two(pipeline_length)][stencil_diameter - 1];
+        [[intel::fpga_register]] T stencil_buffer[pipeline_length][stencil_diameter]
+                                                 [stencil_diameter];
+
+        for (uindex_t i = 0; i < n_iterations; i++) {
+            T value;
+            if (i < n_cells) {
+                value = in_pipe::read();
+            } else {
+                value = halo_value;
+            }
+
+#pragma unroll
+            for (uindex_t stage = 0; stage < pipeline_length; stage++) {
+#pragma unroll
+                for (uindex_t r = 0; r < stencil_diameter - 1; r++) {
+#pragma unroll
+                    for (uindex_t c = 0; c < stencil_diameter; c++) {
+                        stencil_buffer[stage][c][r] = stencil_buffer[stage][c][r + 1];
+                    }
+                }
+
+                // Update the stencil buffer and cache with previous cache contents and the new
+                // input cell.
+#pragma unroll
+                for (uindex_t cache_c = 0; cache_c < stencil_diameter; cache_c++) {
+                    T new_value;
+                    if (cache_c == stencil_diameter - 1) {
+                        new_value = value;
+                    } else {
+                        new_value = cache[c[stage] & 0b1][r[stage]][stage][cache_c];
+                    }
+
+                    stencil_buffer[stage][cache_c][stencil_diameter - 1] = new_value;
+                    if (cache_c > 0) {
+                        cache[(~c[stage]) & 0b1][r[stage]][stage][cache_c - 1] = new_value;
+                    }
+                }
+
+                if (i_generation + stage < n_generations) {
+                    if (id_in_grid(c[stage], r[stage])) {
+                        Stencil<T, stencil_radius> stencil(ID(c[stage], r[stage]),
+                                                           i_generation + stage, stage,
+                                                           UID(grid_width, grid_height));
+
+#pragma unroll
+                        for (index_t cell_c = -stencil_radius; cell_c <= index_t(stencil_radius);
+                             cell_c++) {
+#pragma unroll
+                            for (index_t cell_r = -stencil_radius;
+                                 cell_r <= index_t(stencil_radius); cell_r++) {
+                                if (id_in_grid(cell_c + c[stage], cell_r + r[stage])) {
+                                    stencil[ID(cell_c, cell_r)] =
+                                        stencil_buffer[stage][cell_c + stencil_radius]
+                                                      [cell_r + stencil_radius];
+                                } else {
+                                    stencil[ID(cell_c, cell_r)] = halo_value;
+                                }
+                            }
+                        }
+
+                        value = trans_func(stencil);
+                    } else {
+                        value = halo_value;
+                    }
+                } else {
+                    value = stencil_buffer[stage][stencil_radius][stencil_radius];
+                }
+
+                r[stage] += 1;
+                if (r[stage] == tile_height) {
+                    r[stage] = 0;
+                    c[stage] += 1;
+                }
+            }
+
+            if (i >= pipeline_latency) {
+                out_pipe::write(value);
+            }
+        }
+    }
+
+  private:
+    bool id_in_grid(index_t c, index_t r) const {
+        return c >= index_t(0) && r >= index_t(0) && c < index_t(grid_width) &&
+               r < index_t(grid_height);
+    }
+
+    TransFunc trans_func;
+    uindex_t i_generation;
+    uindex_t n_generations;
+    uindex_t grid_width;
+    uindex_t grid_height;
+    T halo_value;
+};
+
+} // namespace monotile
+} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/tiling/ExecutionKernel.hpp b/StencilStream/tiling/ExecutionKernel.hpp
new file mode 100644
index 0000000..403592e
--- /dev/null
+++ b/StencilStream/tiling/ExecutionKernel.hpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+#include "../GenericID.hpp"
+#include "../Helpers.hpp"
+#include "../Index.hpp"
+#include "../Stencil.hpp"
+#include <optional>
+
+namespace stencil {
+namespace tiling {
+
+/**
+ * \brief A kernel that executes a stencil transition function on a tile.
+ *
+ * It receives the contents of a tile and it's halo from the `in_pipe`, applies the transition
+ * function when applicable and writes the result to the `out_pipe`.
+ *
+ * \tparam TransFunc The type of transition function to use.
+ * \tparam T Cell value type.
+ * \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the
+ * central cell
+ * \tparam pipeline_length The number of pipeline stages to use. Similar to an unroll
+ * factor for a loop.
+ * \tparam output_tile_width The number of columns in a grid tile.
+ * \tparam output_tile_height The number of rows in a grid tile.
+ * \tparam in_pipe The pipe to read from.
+ * \tparam out_pipe The pipe to write to.
+ */
+template <typename TransFunc, typename T, uindex_t stencil_radius, uindex_t pipeline_length,
+          uindex_t output_tile_width, uindex_t output_tile_height, typename in_pipe,
+          typename out_pipe>
+class ExecutionKernel {
+  public:
+    static_assert(
+        std::is_invocable_r<T, TransFunc const, Stencil<T, stencil_radius> const &>::value);
+    static_assert(stencil_radius >= 1);
+
+    /**
+     * \brief The width and height of the stencil buffer.
+     */
+    const static uindex_t stencil_diameter = Stencil<T, stencil_radius>::diameter;
+
+    /**
+     * \brief The total number of cells to read from the `in_pipe`.
+     */
+    const static uindex_t n_input_cells =
+        (2 * stencil_radius * pipeline_length + output_tile_width) *
+        (2 * stencil_radius * pipeline_length + output_tile_height);
+
+    /**
+     * \brief The width of the processed tile with the tile halo attached.
+     */
+    const static uindex_t input_tile_width =
+        2 * stencil_radius * pipeline_length + output_tile_width;
+
+    /**
+     * \brief The height of the processed tile with the tile halo attached.
+     */
+    const static uindex_t input_tile_height =
+        2 * stencil_radius * pipeline_length + output_tile_height;
+
+    /**
+     * \brief Create and configure the execution kernel.
+     *
+     * \param trans_func The instance of the transition function to use.
+     * \param i_generation The generation index of the input cells.
+     * \param target_i_generation The number of generations to compute. If this number is bigger
+     * than `pipeline_length`, only `pipeline_length` generations will be computed.
+     * \param grid_c_offset The column offset of the processed tile relative to the grid's origin,
+     * not including the halo. For example, for the most north-western tile the offset will always
+     * be (0,0), not (-halo_radius,-halo_radius)
+     * \param grid_r_offset The row offset of the processed tile relative to the grid's origin. See
+     * `grid_c_offset` for details.
+     * \param grid_width The number of cell columns in the grid.
+     * \param grid_height The number of cell rows in the grid.
+     * \param halo_value The value of cells in the grid halo.
+     */
+    ExecutionKernel(TransFunc trans_func, uindex_t i_generation, uindex_t target_i_generation,
+                    uindex_t grid_c_offset, uindex_t grid_r_offset, uindex_t grid_width,
+                    uindex_t grid_height, T halo_value)
+        : trans_func(trans_func), i_generation(i_generation),
+          target_i_generation(target_i_generation), grid_c_offset(grid_c_offset),
+          grid_r_offset(grid_r_offset), grid_width(grid_width), grid_height(grid_height),
+          halo_value(halo_value) {}
+
+    /**
+     * \brief Execute the configured operations.
+     */
+    void operator()() const {
+        uindex_t input_tile_c = 0;
+        uindex_t input_tile_r = 0;
+
+        /*
+         * The intel::numbanks attribute requires a power of two as it's argument and if the
+         * pipeline length isn't a power of two, it would produce an error. Therefore, we calculate
+         * the next power of two and use it to allocate the cache. The compiler is smart enough to
+         * see that these additional banks in the cache aren't used and therefore optimizes them
+         * away.
+         */
+        [[intel::fpga_memory, intel::numbanks(2 * next_power_of_two(pipeline_length))]] T
+            cache[2][input_tile_height][next_power_of_two(pipeline_length)][stencil_diameter - 1];
+        [[intel::fpga_register]] T stencil_buffer[pipeline_length][stencil_diameter]
+                                                 [stencil_diameter];
+
+        for (uindex_t i = 0; i < n_input_cells; i++) {
+            T value = in_pipe::read();
+
+#pragma unroll
+            for (uindex_t stage = 0; stage < pipeline_length; stage++) {
+                /*
+                 * Shift up every value in the stencil_buffer.
+                 * This operation does not touch the values in the bottom row, which will be filled
+                 * from the cache and the new input value later.
+                 */
+#pragma unroll
+                for (uindex_t r = 0; r < stencil_diameter - 1; r++) {
+#pragma unroll
+                    for (uindex_t c = 0; c < stencil_diameter; c++) {
+                        stencil_buffer[stage][c][r] = stencil_buffer[stage][c][r + 1];
+                    }
+                }
+
+                index_t input_grid_c = grid_c_offset + index_t(input_tile_c) -
+                                       (stencil_diameter - 1) -
+                                       (pipeline_length + stage - 2) * stencil_radius;
+                index_t input_grid_r = grid_r_offset + index_t(input_tile_r) -
+                                       (stencil_diameter - 1) -
+                                       (pipeline_length + stage - 2) * stencil_radius;
+
+                // Update the stencil buffer and cache with previous cache contents and the new
+                // input cell.
+#pragma unroll
+                for (uindex_t cache_c = 0; cache_c < stencil_diameter; cache_c++) {
+                    T new_value;
+                    if (cache_c == stencil_diameter - 1) {
+                        if (input_grid_c < 0 || input_grid_r < 0 || input_grid_c >= grid_width ||
+                            input_grid_r >= grid_height) {
+                            new_value = halo_value;
+                        } else {
+                            new_value = value;
+                        }
+                    } else {
+                        new_value = cache[input_tile_c & 0b1][input_tile_r][stage][cache_c];
+                    }
+
+                    stencil_buffer[stage][cache_c][stencil_diameter - 1] = new_value;
+                    if (cache_c > 0) {
+                        cache[(~input_tile_c) & 0b1][input_tile_r][stage][cache_c - 1] = new_value;
+                    }
+                }
+
+                index_t output_grid_c = input_grid_c - stencil_radius;
+                index_t output_grid_r = input_grid_r - stencil_radius;
+                Stencil<T, stencil_radius> stencil(
+                    ID(output_grid_c, output_grid_r), i_generation + stage, stage,
+                    stencil_buffer[stage], UID(grid_width, grid_height));
+
+                if (i_generation + stage < target_i_generation) {
+                    value = trans_func(stencil);
+                } else {
+                    value = stencil_buffer[stage][stencil_radius][stencil_radius];
+                }
+            }
+
+            bool is_valid_output = input_tile_c >= (stencil_diameter - 1) * pipeline_length;
+            is_valid_output &= input_tile_r >= (stencil_diameter - 1) * pipeline_length;
+
+            if (is_valid_output) {
+                out_pipe::write(value);
+            }
+
+            if (input_tile_r == input_tile_height - 1) {
+                input_tile_r = 0;
+                input_tile_c++;
+            } else {
+                input_tile_r++;
+            }
+        }
+    }
+
+  private:
+    TransFunc trans_func;
+    uindex_t i_generation;
+    uindex_t target_i_generation;
+    uindex_t grid_c_offset;
+    uindex_t grid_r_offset;
+    uindex_t grid_width;
+    uindex_t grid_height;
+    T halo_value;
+};
+
+} // namespace tiling
+} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/Grid.hpp b/StencilStream/tiling/Grid.hpp
similarity index 55%
rename from StencilStream/Grid.hpp
rename to StencilStream/tiling/Grid.hpp
index 038c86e..cfdbc29 100644
--- a/StencilStream/Grid.hpp
+++ b/StencilStream/tiling/Grid.hpp
@@ -1,14 +1,24 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
-#include "GenericID.hpp"
+#include "../GenericID.hpp"
 #include "IOKernel.hpp"
 #include "Tile.hpp"
 #include <CL/sycl/accessor.hpp>
@@ -17,73 +27,75 @@
 #include <memory>
 #include <vector>
 
-namespace stencil
-{
+namespace stencil {
+namespace tiling {
 
 /**
- * \brief A rectangular container of cells with a dynamic, arbitrary size.
- * 
- * It logically contains the grid the transition function is applied to. As described in \ref tiling, it partitions the grid into tiles of static size, which are the units the \ref ExecutionKernel works on.
- * 
- * Apart from providing copy operations to and from monolithic grid buffers, it also handles the input and output kernel submission for a given tile.
- * 
+ * \brief A rectangular container of cells with a dynamic, arbitrary size, used by the \ref
+ * StencilExecutor.
+ *
+ * This class is part of the \ref tiling architecture. It logically contains the grid the transition
+ * function is applied to and it partitions the grid into tiles of static size. These are the units
+ * the \ref ExecutionKernel works on.
+ *
+ * Apart from providing copy operations to and from monolithic grid buffers, it also handles the
+ * input and output kernel submission for a given tile.
+ *
  * \tparam T Cell value type.
  * \tparam tile_width The number of columns of a tile.
  * \tparam tile_height The number of rows of a tile.
  * \tparam halo_radius The radius (aka width and height) of the tile halo.
  * \tparam burst_length The number of elements that can be read or written in a burst.
  */
-template <typename T, uindex_t tile_width, uindex_t tile_height, uindex_t halo_radius, uindex_t burst_length>
-class Grid
-{
-public:
+template <typename T, uindex_t tile_width, uindex_t tile_height, uindex_t halo_radius,
+          uindex_t burst_length>
+class Grid {
+  private:
     using Tile = Tile<T, tile_width, tile_height, halo_radius, burst_length>;
 
+  public:
     /**
      * \brief Create a grid with undefined contents.
-     * 
-     * This constructor is used to create the output grid of a \ref ExecutionKernel invocation. It's contents do not need to be initialized or copied from another buffer since it will override cell values from the execution kernel anyway.
-     * 
+     *
+     * This constructor is used to create the output grid of a \ref ExecutionKernel
+     * invocation. It's contents do not need to be initialized or copied from another buffer since
+     * it will override cell values from the execution kernel anyway.
+     *
      * \param width The number of columns of the grid.
      * \param height The number of rows of the grid.
      */
-    Grid(uindex_t width, uindex_t height) : tiles(), grid_range(width, height)
-    {
-        allocate_tiles();
-    }
+    Grid(uindex_t width, uindex_t height) : tiles(), grid_range(width, height) { allocate_tiles(); }
 
     /**
      * \brief Create a grid that contains the cells of a buffer.
-     * 
-     * This will allocate enough resources to store the content of the buffer and than copy those cells into the data layout of the grid.
-     * 
+     *
+     * This will allocate enough resources to store the content of the buffer and than copy those
+     * cells into the data layout of the grid.
+     *
      * \param in_buffer The buffer to copy the cells from.
      */
-    Grid(cl::sycl::buffer<T, 2> in_buffer) : tiles(), grid_range(in_buffer.get_range())
-    {
+    Grid(cl::sycl::buffer<T, 2> in_buffer) : tiles(), grid_range(in_buffer.get_range()) {
         copy_from(in_buffer);
     }
 
     /**
      * \brief Copy the contents of the grid to a given buffer.
-     * 
-     * This buffer has to exactly have the size of the grid, otherwise a range error is thrown.
-     * 
+     *
+     * This buffer has to exactly have the size of the grid, otherwise a `std::range_error` is
+     * thrown.
+     *
      * \param out_buffer The buffer to copy the cells to.
      * \throws std::range_error The buffer's size is not the same as the grid's size.
      */
-    void copy_to(cl::sycl::buffer<T, 2> &out_buffer)
-    {
-        if (out_buffer.get_range() != grid_range)
-        {
+    void copy_to(cl::sycl::buffer<T, 2> &out_buffer) {
+        if (out_buffer.get_range() != grid_range) {
             throw std::range_error("The target buffer has not the same size as the grid");
         }
 
-        for (uindex_t tile_column = 1; tile_column < tiles.size() - 1; tile_column++)
-        {
-            for (uindex_t tile_row = 1; tile_row < tiles[tile_column].size() - 1; tile_row++)
-            {
-                cl::sycl::id<2> offset((tile_column - 1) * tile_width, (tile_row - 1) * tile_height);
+        for (uindex_t tile_column = 1; tile_column < tiles.size() - 1; tile_column++) {
+            for (uindex_t tile_row = 1; tile_row < tiles[tile_column].size() - 1; tile_row++) {
+                cl::sycl::id<2> offset((tile_column - 1) * tile_width,
+                                       (tile_row - 1) * tile_height);
                 tiles[tile_column][tile_row].copy_to(out_buffer, offset);
             }
         }
@@ -91,13 +103,12 @@ class Grid
 
     /**
      * \brief Create a new grid that can be used as an output target.
-     * 
+     *
      * This grid will have the same range as the original grid and will use new buffers.
-     * 
+     *
      * \return The new grid.
      */
-    Grid make_output_grid() const
-    {
+    Grid make_output_grid() const {
         Grid output_grid(grid_range[0], grid_range[1]);
 
         return output_grid;
@@ -105,39 +116,35 @@ class Grid
 
     /**
      * \brief Return the range of (central) tiles of the grid.
-     * 
-     * This is not the range of a single tile nor is the range of the grid. It is the range of valid arguments for \ref Grid.get_tile. For example, if the grid is 60 by 60 cells in size and a tile is 32 by 32 cells in size, the tile range would be 2 by 2 tiles.
-     * 
+     *
+     * This is not the range of a single tile nor is the range of the grid. It is the range of valid
+     * arguments for \ref Grid.get_tile. For example, if the grid is 60 by 60 cells in size and a
+     * tile is 32 by 32 cells in size, the tile range would be 2 by 2 tiles.
+     *
      * \return The range of tiles of the grid.
      */
-    UID get_tile_range() const
-    {
-        return UID(tiles.size() - 2, tiles[0].size() - 2);
-    }
+    UID get_tile_range() const { return UID(tiles.size() - 2, tiles[0].size() - 2); }
 
     /**
      * \brief Return the range of the grid in cells.
-     * 
-     * If the grid was constructed from a buffer, this will be the range of this buffer. If the grid was constructed with width and height arguments, this will be this exact range.
-     * 
+     *
+     * If the grid was constructed from a buffer, this will be the range of this buffer. If the grid
+     * was constructed with width and height arguments, this will be this exact range.
+     *
      * \return The range of cells of the grid.
      */
-    UID get_grid_range() const
-    {
-        return grid_range;
-    }
+    UID get_grid_range() const { return grid_range; }
 
     /**
      * \brief Get the tile at the given index.
-     * 
+     *
      * \param tile_id The id of the tile to return.
      * \return The tile.
-     * \throws std::out_of_range Thrown if the tile id is outside the range of tiles, as returned by \ref Grid.get_tile_range.
+     * \throws std::out_of_range Thrown if the tile id is outside the range of tiles, as returned by
+     * \ref Grid.get_tile_range.
      */
-    Tile &get_tile(UID tile_id)
-    {
-        if (tile_id.c > get_tile_range().c || tile_id.r > get_tile_range().r)
-        {
+    Tile &get_tile(UID tile_id) {
+        if (tile_id.c > get_tile_range().c || tile_id.r > get_tile_range().r) {
             throw std::out_of_range("Tile index out of range");
         }
 
@@ -149,34 +156,34 @@ class Grid
 
     /**
      * \brief Submit the input kernels required for one execution of the \ref ExecutionKernel.
-     * 
-     * This will submit five \ref IOKernel invocations in total, which are executed in order. Those kernels write the contents of a tile and it's halo to the `in_pipe`. 
-     * 
+     *
+     * This will submit five \ref IOKernel invocations in total, which are executed in order. Those
+     * kernels write the contents of a tile and it's halo to the `in_pipe`.
+     *
      * \tparam in_pipe The pipe to write the cells to.
      * \param fpga_queue The configured SYCL queue for submissions.
      * \param tile_id The id of the tile to read.
-     * \throws std::out_of_range Thrown if the tile id is outside the range of tiles, as returned by \ref Grid.get_tile_range.
+     * \throws std::out_of_range Thrown if the tile id is outside the range of tiles, as returned by
+     * \ref Grid.get_tile_range.
      */
-    template <typename in_pipe>
-    void submit_tile_input(cl::sycl::queue fpga_queue, UID tile_id)
-    {
-        if (tile_id.c > get_tile_range().c || tile_id.r > get_tile_range().r)
-        {
+    template <typename in_pipe> void submit_tile_input(cl::sycl::queue fpga_queue, UID tile_id) {
+        if (tile_id.c > get_tile_range().c || tile_id.r > get_tile_range().r) {
             throw std::out_of_range("Tile index out of range");
         }
 
         uindex_t tile_c = tile_id.c + 1;
         uindex_t tile_r = tile_id.r + 1;
 
-        submit_input_kernel<in_pipe>(fpga_queue,
-                                     std::array<cl::sycl::buffer<T, 2>, 5>{
-                                         tiles[tile_c - 1][tile_r - 1][Tile::Part::SOUTH_EAST_CORNER],
-                                         tiles[tile_c - 1][tile_r][Tile::Part::NORTH_EAST_CORNER],
-                                         tiles[tile_c - 1][tile_r][Tile::Part::EAST_BORDER],
-                                         tiles[tile_c - 1][tile_r][Tile::Part::SOUTH_EAST_CORNER],
-                                         tiles[tile_c - 1][tile_r + 1][Tile::Part::NORTH_EAST_CORNER],
-                                     },
-                                     halo_radius);
+        submit_input_kernel<in_pipe>(
+            fpga_queue,
+            std::array<cl::sycl::buffer<T, 2>, 5>{
+                tiles[tile_c - 1][tile_r - 1][Tile::Part::SOUTH_EAST_CORNER],
+                tiles[tile_c - 1][tile_r][Tile::Part::NORTH_EAST_CORNER],
+                tiles[tile_c - 1][tile_r][Tile::Part::EAST_BORDER],
+                tiles[tile_c - 1][tile_r][Tile::Part::SOUTH_EAST_CORNER],
+                tiles[tile_c - 1][tile_r + 1][Tile::Part::NORTH_EAST_CORNER],
+            },
+            halo_radius);
 
         submit_input_kernel<in_pipe>(fpga_queue,
                                      std::array<cl::sycl::buffer<T, 2>, 5>{
@@ -208,32 +215,33 @@ class Grid
                                      },
                                      halo_radius);
 
-        submit_input_kernel<in_pipe>(fpga_queue,
-                                     std::array<cl::sycl::buffer<T, 2>, 5>{
-                                         tiles[tile_c + 1][tile_r - 1][Tile::Part::SOUTH_WEST_CORNER],
-                                         tiles[tile_c + 1][tile_r][Tile::Part::NORTH_WEST_CORNER],
-                                         tiles[tile_c + 1][tile_r][Tile::Part::WEST_BORDER],
-                                         tiles[tile_c + 1][tile_r][Tile::Part::SOUTH_WEST_CORNER],
-                                         tiles[tile_c + 1][tile_r + 1][Tile::Part::NORTH_WEST_CORNER],
-                                     },
-                                     halo_radius);
+        submit_input_kernel<in_pipe>(
+            fpga_queue,
+            std::array<cl::sycl::buffer<T, 2>, 5>{
+                tiles[tile_c + 1][tile_r - 1][Tile::Part::SOUTH_WEST_CORNER],
+                tiles[tile_c + 1][tile_r][Tile::Part::NORTH_WEST_CORNER],
+                tiles[tile_c + 1][tile_r][Tile::Part::WEST_BORDER],
+                tiles[tile_c + 1][tile_r][Tile::Part::SOUTH_WEST_CORNER],
+                tiles[tile_c + 1][tile_r + 1][Tile::Part::NORTH_WEST_CORNER],
+            },
+            halo_radius);
     }
 
     /**
-     * \brief Submit the output kernels required for one execution of the \ref ExecutionKernel.
-     * 
-     * This will submit three \ref IOKernel invocations in total, which are executed in order. Those kernels will write cells from the `out_pipe` to one of the tiles.
-     * 
+     * \brief Submit the output kernels required for one execution of the \ref
+     * ExecutionKernel.
+     *
+     * This will submit three \ref IOKernel invocations in total, which are executed in order. Those
+     * kernels will write cells from the `out_pipe` to one of the tiles.
+     *
      * \tparam out_pipe The pipe to read the cells from.
      * \param fpga_queue The configured SYCL queue for submissions.
      * \param tile_id The id of the tile to write to.
-     * \throws std::out_of_range Thrown if the tile id is outside the range of tiles, as returned by \ref Grid.get_tile_range.
+     * \throws std::out_of_range Thrown if the tile id is outside the range of tiles, as returned by
+     * \ref Grid.get_tile_range.
      */
-    template <typename out_pipe>
-    void submit_tile_output(cl::sycl::queue fpga_queue, UID tile_id)
-    {
-        if (tile_id.c > get_tile_range().c || tile_id.r > get_tile_range().r)
-        {
+    template <typename out_pipe> void submit_tile_output(cl::sycl::queue fpga_queue, UID tile_id) {
+        if (tile_id.c > get_tile_range().c || tile_id.r > get_tile_range().r) {
             throw std::out_of_range("Tile index out of range");
         }
 
@@ -265,102 +273,81 @@ class Grid
                                        halo_radius);
     }
 
-private:
+  private:
     static constexpr uindex_t core_height = tile_height - 2 * halo_radius;
     static constexpr uindex_t core_width = tile_width - 2 * halo_radius;
 
     template <typename pipe>
-    void submit_input_kernel(cl::sycl::queue fpga_queue, std::array<cl::sycl::buffer<T, 2>, 5> buffer, uindex_t buffer_width)
-    {
-        using InputKernel = IOKernel<
-            T,
-            halo_radius,
-            core_height,
-            burst_length,
-            pipe,
-            2,
-            cl::sycl::access::mode::read>;
-
-        fpga_queue.submit([&](cl::sycl::handler &cgh)
-                          {
-                              std::array<typename InputKernel::Accessor, 5> accessor{
-                                  buffer[0].template get_access<cl::sycl::access::mode::read>(cgh),
-                                  buffer[1].template get_access<cl::sycl::access::mode::read>(cgh),
-                                  buffer[2].template get_access<cl::sycl::access::mode::read>(cgh),
-                                  buffer[3].template get_access<cl::sycl::access::mode::read>(cgh),
-                                  buffer[4].template get_access<cl::sycl::access::mode::read>(cgh),
-                              };
-
-                              cgh.single_task<class InputKernelLambda>([=]()
-                                                                       { InputKernel(accessor, buffer_width).read(); });
-                          });
+    void submit_input_kernel(cl::sycl::queue fpga_queue,
+                             std::array<cl::sycl::buffer<T, 2>, 5> buffer, uindex_t buffer_width) {
+        using InputKernel = IOKernel<T, halo_radius, core_height, burst_length, pipe, 2,
+                                     cl::sycl::access::mode::read>;
+
+        fpga_queue.submit([&](cl::sycl::handler &cgh) {
+            std::array<typename InputKernel::Accessor, 5> accessor{
+                buffer[0].template get_access<cl::sycl::access::mode::read>(cgh),
+                buffer[1].template get_access<cl::sycl::access::mode::read>(cgh),
+                buffer[2].template get_access<cl::sycl::access::mode::read>(cgh),
+                buffer[3].template get_access<cl::sycl::access::mode::read>(cgh),
+                buffer[4].template get_access<cl::sycl::access::mode::read>(cgh),
+            };
+
+            cgh.single_task<class InputKernelLambda>(
+                [=]() { InputKernel(accessor, buffer_width).read(); });
+        });
     }
 
     template <typename pipe>
-    void submit_output_kernel(cl::sycl::queue fpga_queue, std::array<cl::sycl::buffer<T, 2>, 3> buffer, uindex_t buffer_width)
-    {
-        using OutputKernel = IOKernel<T,
-                                      halo_radius,
-                                      core_height,
-                                      burst_length,
-                                      pipe,
-                                      1,
+    void submit_output_kernel(cl::sycl::queue fpga_queue,
+                              std::array<cl::sycl::buffer<T, 2>, 3> buffer, uindex_t buffer_width) {
+        using OutputKernel = IOKernel<T, halo_radius, core_height, burst_length, pipe, 1,
                                       cl::sycl::access::mode::discard_write>;
 
-        fpga_queue.submit([&](cl::sycl::handler &cgh)
-                          {
-                              std::array<typename OutputKernel::Accessor, 3> accessor{
-                                  buffer[0].template get_access<cl::sycl::access::mode::discard_write>(cgh),
-                                  buffer[1].template get_access<cl::sycl::access::mode::discard_write>(cgh),
-                                  buffer[2].template get_access<cl::sycl::access::mode::discard_write>(cgh),
-                              };
-
-                              cgh.single_task<class OutputKernelLambda>([=]()
-                                                                        { OutputKernel(accessor, buffer_width).write(); });
-                          });
+        fpga_queue.submit([&](cl::sycl::handler &cgh) {
+            std::array<typename OutputKernel::Accessor, 3> accessor{
+                buffer[0].template get_access<cl::sycl::access::mode::discard_write>(cgh),
+                buffer[1].template get_access<cl::sycl::access::mode::discard_write>(cgh),
+                buffer[2].template get_access<cl::sycl::access::mode::discard_write>(cgh),
+            };
+
+            cgh.single_task<class OutputKernelLambda>(
+                [=]() { OutputKernel(accessor, buffer_width).write(); });
+        });
     }
 
-    void copy_from(cl::sycl::buffer<T, 2> in_buffer)
-    {
-        if (in_buffer.get_range() != grid_range)
-        {
+    void copy_from(cl::sycl::buffer<T, 2> in_buffer) {
+        if (in_buffer.get_range() != grid_range) {
             throw std::range_error("The target buffer has not the same size as the grid");
         }
 
         allocate_tiles();
 
-        for (uindex_t tile_column = 1; tile_column < tiles.size() - 1; tile_column++)
-        {
-            for (uindex_t tile_row = 1; tile_row < tiles[tile_column].size() - 1; tile_row++)
-            {
-                cl::sycl::id<2> offset((tile_column - 1) * tile_width, (tile_row - 1) * tile_height);
+        for (uindex_t tile_column = 1; tile_column < tiles.size() - 1; tile_column++) {
+            for (uindex_t tile_row = 1; tile_row < tiles[tile_column].size() - 1; tile_row++) {
+                cl::sycl::id<2> offset((tile_column - 1) * tile_width,
+                                       (tile_row - 1) * tile_height);
                 tiles[tile_column][tile_row].copy_from(in_buffer, offset);
             }
         }
     }
 
-    void allocate_tiles()
-    {
+    void allocate_tiles() {
         tiles.clear();
 
         uindex_t n_tile_columns = grid_range[0] / tile_width;
-        if (grid_range[0] % tile_width != 0)
-        {
+        if (grid_range[0] % tile_width != 0) {
             n_tile_columns++;
         }
         uindex_t n_tile_rows = grid_range[1] / tile_height;
-        if (grid_range[1] % tile_height != 0)
-        {
+        if (grid_range[1] % tile_height != 0) {
             n_tile_rows++;
         }
 
         tiles.reserve(n_tile_columns + 2);
-        for (uindex_t i_column = 0; i_column < n_tile_columns + 2; i_column++)
-        {
+        for (uindex_t i_column = 0; i_column < n_tile_columns + 2; i_column++) {
             std::vector<Tile> column;
             column.reserve(n_tile_rows + 2);
-            for (uindex_t i_row = 0; i_row < n_tile_rows + 2; i_row++)
-            {
+            for (uindex_t i_row = 0; i_row < n_tile_rows + 2; i_row++) {
                 column.push_back(Tile());
             }
             tiles.push_back(column);
@@ -371,4 +358,5 @@ class Grid
     cl::sycl::range<2> grid_range;
 };
 
+} // namespace tiling
 } // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/tiling/IOKernel.hpp b/StencilStream/tiling/IOKernel.hpp
new file mode 100644
index 0000000..316a84e
--- /dev/null
+++ b/StencilStream/tiling/IOKernel.hpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+#include "../CounterID.hpp"
+#include "../Index.hpp"
+#include <CL/sycl.hpp>
+#include <CL/sycl/accessor.hpp>
+#include <array>
+
+namespace stencil {
+namespace tiling {
+
+/**
+ * \brief Generic Input/Output kernel for use with the \ref ExecutionKernel and \ref Grid.
+ *
+ * This kernel provides IO services to the execution kernel by writing the contents of a input tile
+ * with halo to the input pipe and writing the output of the execution kernel to an output tile. The
+ * input and output code only differs by one line. Therefore, both services are provided by the same
+ * class. Unlike \ref ExecutionKernel, this kernel is supposed to be constructed by a lambda
+ * expression that then either executes \ref IOKernel.read or \ref IOKernel.write.
+ *
+ * Logically, an instance of the IO kernel receives access to a horizontal slice of the input or
+ * output and processes this slice in the \ref indexingorder. Due to the \ref tiling, this slice is
+ * partioned vertically into 2*n + 1 buffers, where n is the `n_halo_height_buffers` template
+ * parameter. For the input buffer, n should be 2, and for the output buffer, n should be 1. All
+ * buffers are expected to have a static height and dynamic width. The upper n and lower n buffers
+ * are all expected be `halo_height` cells high, while the buffer in the middle is expected to have
+ * be `core_height` cells high.
+ *
+ * \tparam T Cell value type.
+ * \tparam halo_height The radius (aka width and height) of the tile halo (and therefore both
+ * dimensions of a corner buffer).
+ * \tparam core_height The height of the core buffer.
+ * \tparam burst_length The number of elements that can be read or written in a burst.
+ * \tparam pipe The pipe to read or write to.
+ * \tparam n_halo_height_buffers The number of buffers to accept, in addition to the core buffer.
+ * \tparam access_mode The access mode to expect for the buffer accessor.
+ * \tparam access_target The access target to expect for the buffer accessor.
+ */
+template <typename T, uindex_t halo_height, uindex_t core_height, uindex_t burst_length,
+          typename pipe, uindex_t n_halo_height_buffers, cl::sycl::access::mode access_mode,
+          cl::sycl::access::target access_target = cl::sycl::access::target::global_buffer>
+class IOKernel {
+  public:
+    /**
+     * \brief The exact accessor type required by the IO kernel.
+     */
+    using Accessor = cl::sycl::accessor<T, 2, access_mode, access_target>;
+
+    /**
+     * \brief The total number of buffers in a slice/column.
+     */
+    static constexpr uindex_t n_buffers = 2 * n_halo_height_buffers + 1;
+    /**
+     * \brief The total number of cell rows in the (logical) slice/column.
+     */
+    static constexpr uindex_t n_rows = 2 * n_halo_height_buffers * halo_height + core_height;
+
+    /**
+     * \brief Get the height of a buffer.
+     *
+     * As described in the description, the first and last n buffers are `halo_height` cells high,
+     * while the middle buffer is `core_height` cells high.
+     */
+    static constexpr uindex_t get_buffer_height(uindex_t index) {
+        if (index == n_halo_height_buffers) {
+            return core_height;
+        } else {
+            return halo_height;
+        }
+    }
+
+    /**
+     * \brief Create a new IOKernel instance.
+     *
+     * The created instance is not invocable. You need to construct it inside a lambda function (or
+     * equivalent) and then call either \ref IOKernel.read or \ref IOKernel.write.
+     *
+     * \param accessor The slice/column of buffer accessors to process.
+     * \param n_columns The width of every buffer passed in `accessor`.
+     */
+    IOKernel(std::array<Accessor, n_buffers> accessor, uindex_t n_columns)
+        : accessor(accessor), n_columns(n_columns) {
+#ifndef __SYCL_DEVICE_ONLY__
+        for (uindex_t i = 0; i < n_buffers; i++) {
+            assert(accessor[i].get_range()[1] == burst_length);
+            assert(get_buffer_height(i) * n_columns <=
+                   accessor[i].get_range()[0] * accessor[i].get_range()[1]);
+        }
+#endif
+    }
+
+    /**
+     * \brief Read the cells from the buffers and write them to the pipe.
+     */
+    void read() {
+        static_assert(access_mode == cl::sycl::access::mode::read ||
+                      access_mode == cl::sycl::access::mode::read_write);
+        run([](Accessor &accessor, uindex_t burst_i, uindex_t cell_i) {
+            pipe::write(accessor[burst_i][cell_i]);
+        });
+    }
+
+    /**
+     * \brief Read the cells from the pipe and write them to the buffers.
+     */
+    void write() {
+        static_assert(access_mode == cl::sycl::access::mode::write ||
+                      access_mode == cl::sycl::access::mode::discard_write ||
+                      access_mode == cl::sycl::access::mode::read_write ||
+                      access_mode == cl::sycl::access::mode::discard_read_write);
+        run([](Accessor &accessor, uindex_t burst_i, uindex_t cell_i) {
+            accessor[burst_i][cell_i] = pipe::read();
+        });
+    }
+
+  private:
+    template <typename Action> void run(Action action) {
+        static_assert(std::is_invocable<Action, Accessor &, uindex_t, uindex_t>::value);
+
+        uindex_t burst_i[n_buffers] = {0};
+        uindex_t cell_i[n_buffers] = {0};
+
+        for (uindex_t c = 0; c < n_columns; c++) {
+            uindex_t buffer_i = 0;
+            uindex_t next_bound = get_buffer_height(0);
+            for (uindex_t r = 0; r < n_rows; r++) {
+                if (r == next_bound) {
+                    buffer_i++;
+                    next_bound += get_buffer_height(buffer_i);
+                }
+
+                action(accessor[buffer_i], burst_i[buffer_i], cell_i[buffer_i]);
+                if (cell_i[buffer_i] == burst_length - 1) {
+                    cell_i[buffer_i] = 0;
+                    burst_i[buffer_i]++;
+                } else {
+                    cell_i[buffer_i]++;
+                }
+            }
+        }
+    }
+
+    std::array<Accessor, n_buffers> accessor;
+    uindex_t n_columns;
+};
+
+} // namespace tiling
+} // namespace stencil
\ No newline at end of file
diff --git a/StencilStream/Tile.hpp b/StencilStream/tiling/Tile.hpp
similarity index 67%
rename from StencilStream/Tile.hpp
rename to StencilStream/tiling/Tile.hpp
index 5e04ea9..041db2b 100644
--- a/StencilStream/Tile.hpp
+++ b/StencilStream/tiling/Tile.hpp
@@ -1,29 +1,44 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
-#include "Helpers.hpp"
-#include "Index.hpp"
+#include "../Helpers.hpp"
+#include "../Index.hpp"
 #include <CL/sycl.hpp>
 #include <optional>
 #include <stdexcept>
 
-namespace stencil
-{
+namespace stencil {
+namespace tiling {
 
 /**
  * \brief A rectangular container of cells with a static size
- * 
- * StencilStream tiles the grid and the \ref ExecutionKernel works with those tiles: it receives the content of a tile (together with it's halo) and emits the contents of a tile. A tile is partitioned in four corner buffers, four border buffers and one core buffer. This is done to provide easy access to the halo of a tile. Have a look at the \ref Architecture for more details of the data layout.
- * 
- * This tile manager supports copy operations to and from monolithic, user-supplied buffers, as well as an index operation to access the individual parts.
- * 
+ *
+ * StencilStream tiles the grid and the \ref ExecutionKernel works with those tiles: it
+ * receives the content of a tile (together with it's halo) and emits the contents of a tile. A tile
+ * is partitioned in four corner buffers, four border buffers and one core buffer. This is done to
+ * provide easy access to the halo of a tile. Have a look at the \ref Architecture for more details
+ * of the data layout.
+ *
+ * This tile manager supports copy operations to and from monolithic, user-supplied buffers, as well
+ * as an index operation to access the individual parts.
+ *
  * \tparam T Cell value type.
  * \tparam width The number of columns of the tile.
  * \tparam height The number of rows of the tile.
@@ -31,24 +46,23 @@ namespace stencil
  * \tparam burst_length The number of elements that can be read or written in a burst.
  */
 template <typename T, uindex_t width, uindex_t height, uindex_t halo_radius, uindex_t burst_length>
-class Tile
-{
+class Tile {
     static_assert(width > 2 * halo_radius);
     static_assert(height > 2 * halo_radius);
 
-public:
+  public:
     /**
      * \brief Create a new tile.
-     * 
-     * Logically, the contents of the newly created tile are undefined since no memory resources are allocated during construction and no initialization is done by the indexing operation.
+     *
+     * Logically, the contents of the newly created tile are undefined since no memory resources are
+     * allocated during construction and no initialization is done by the indexing operation.
      */
     Tile() : part{std::nullopt} {}
 
     /**
      * \brief Enumeration to address individual parts of the tile.
      */
-    enum class Part
-    {
+    enum class Part {
         NORTH_WEST_CORNER,
         NORTH_BORDER,
         NORTH_EAST_CORNER,
@@ -64,27 +78,19 @@ class Tile
      * \brief Array with all \ref Part variants to allow iteration over them.
      */
     static constexpr Part all_parts[] = {
-        Part::NORTH_WEST_CORNER,
-        Part::NORTH_BORDER,
-        Part::NORTH_EAST_CORNER,
-        Part::EAST_BORDER,
-        Part::SOUTH_EAST_CORNER,
-        Part::SOUTH_BORDER,
-        Part::SOUTH_WEST_CORNER,
-        Part::WEST_BORDER,
-        Part::CORE,
+        Part::NORTH_WEST_CORNER, Part::NORTH_BORDER,      Part::NORTH_EAST_CORNER,
+        Part::EAST_BORDER,       Part::SOUTH_EAST_CORNER, Part::SOUTH_BORDER,
+        Part::SOUTH_WEST_CORNER, Part::WEST_BORDER,       Part::CORE,
     };
 
     /**
      * \brief Calculate the range of a given Part.
-     * 
+     *
      * \param part The part to calculate the range for.
      * \return The range of the part, used for example to allocate it.
      */
-    static cl::sycl::range<2> get_part_range(Part part)
-    {
-        switch (part)
-        {
+    static cl::sycl::range<2> get_part_range(Part part) {
+        switch (part) {
         case Part::NORTH_WEST_CORNER:
         case Part::SOUTH_WEST_CORNER:
         case Part::SOUTH_EAST_CORNER:
@@ -104,15 +110,14 @@ class Tile
     }
 
     /**
-     * \brief Calculate the index offset of a given part relative to the north-western corner of the tile.
-     * 
+     * \brief Calculate the index offset of a given part relative to the north-western corner of the
+     * tile.
+     *
      * \param part The part to calculate the offset for.
      * \return The offset of the part.
      */
-    static cl::sycl::id<2> get_part_offset(Part part)
-    {
-        switch (part)
-        {
+    static cl::sycl::id<2> get_part_offset(Part part) {
+        switch (part) {
         case Part::NORTH_WEST_CORNER:
             return cl::sycl::id<2>(0, 0);
         case Part::NORTH_BORDER:
@@ -138,17 +143,19 @@ class Tile
 
     /**
      * \brief Return the buffer with the contents of the given part.
-     * 
-     * If the part has not been accessed before, it will allocate the part's buffer. Note however that this method does not initialize the buffer. Please also note that the buffer is burst-aligned: The height of the returned buffer (the second value of the range) is always `burst_length` and the width is big enough to store all required cells of the part. For more information, read about \ref burstalignment.
-     * 
+     *
+     * If the part has not been accessed before, it will allocate the part's buffer. Note however
+     * that this method does not initialize the buffer. Please also note that the buffer is
+     * burst-aligned: The height of the returned buffer (the second value of the range) is always
+     * `burst_length` and the width is big enough to store all required cells of the part. For more
+     * information, read about \ref burstalignment.
+     *
      * \param tile_part The part to access.
      * \return The buffer of the part.
      */
-    cl::sycl::buffer<T, 2> operator[](Part tile_part)
-    {
+    cl::sycl::buffer<T, 2> operator[](Part tile_part) {
         uindex_t part_column, part_row;
-        switch (tile_part)
-        {
+        switch (tile_part) {
         case Part::NORTH_WEST_CORNER:
             part_column = 0;
             part_row = 0;
@@ -193,8 +200,7 @@ class Tile
         uindex_t part_width = part_range[0];
         uindex_t part_height = part_range[1];
 
-        if (!part[part_column][part_row].has_value())
-        {
+        if (!part[part_column][part_row].has_value()) {
             auto part_range = burst_partitioned_range(part_width, part_height, burst_length);
             cl::sycl::buffer<T, 2> new_part(part_range);
             part[part_column][part_row] = new_part;
@@ -204,14 +210,16 @@ class Tile
 
     /**
      * \brief Copy the contents of a buffer into the tile.
-     * 
-     * This will take the buffer section `[offset[0] : min(grid_width, offset[0] + tile_width)) x [offset[1] : min(grid_height, offset[1] + tile_height))` and copy it to the tile. This means that if the grid does not contain enough cells to fill the whole tile, those missing cells are left as-is.
-     * 
+     *
+     * This will take the buffer section `[offset[0] : min(grid_width, offset[0] + tile_width)) x
+     * [offset[1] : min(grid_height, offset[1] + tile_height))` and copy it to the tile. This means
+     * that if the grid does not contain enough cells to fill the whole tile, those missing cells
+     * are left as-is.
+     *
      * \param buffer The buffer to copy the data from.
      * \param offset The offset of the buffer section relative to the origin of the buffer.
      */
-    void copy_from(cl::sycl::buffer<T, 2> buffer, cl::sycl::id<2> offset)
-    {
+    void copy_from(cl::sycl::buffer<T, 2> buffer, cl::sycl::id<2> offset) {
         auto accessor = buffer.template get_access<cl::sycl::access::mode::read_write>();
         copy_part(accessor, Part::NORTH_WEST_CORNER, offset, true);
         copy_part(accessor, Part::NORTH_BORDER, offset, true);
@@ -226,14 +234,17 @@ class Tile
 
     /**
      * \brief Copy the contents of the tile to a buffer.
-     * 
-     * This will take the tile section `[0 : min(tile_width, grid_width - offset[0])) x [0 : min(tile_height, grid_height - offset[1]))` and copy it to the buffer section `[offset[0] : min(grid_width, offset[0] + tile_width)) x [offset[1] : min(grid_height, offset[1] + tile_height))`. This means that if the tile plus the offset is wider or higher than the grid, those superfluous cells are ignored.
-     * 
+     *
+     * This will take the tile section `[0 : min(tile_width, grid_width - offset[0])) x [0 :
+     * min(tile_height, grid_height - offset[1]))` and copy it to the buffer section `[offset[0] :
+     * min(grid_width, offset[0] + tile_width)) x [offset[1] : min(grid_height, offset[1] +
+     * tile_height))`. This means that if the tile plus the offset is wider or higher than the grid,
+     * those superfluous cells are ignored.
+     *
      * \param buffer The buffer to copy the data to.
      * \param offset The offset of the buffer section relative to the origin of the buffer.
      */
-    void copy_to(cl::sycl::buffer<T, 2> buffer, cl::sycl::id<2> offset)
-    {
+    void copy_to(cl::sycl::buffer<T, 2> buffer, cl::sycl::id<2> offset) {
         auto accessor = buffer.template get_access<cl::sycl::access::mode::read_write>();
         copy_part(accessor, Part::NORTH_WEST_CORNER, offset, false);
         copy_part(accessor, Part::NORTH_BORDER, offset, false);
@@ -246,19 +257,16 @@ class Tile
         copy_part(accessor, Part::CORE, offset, false);
     }
 
-private:
+  private:
     /**
      * \brief Helper function to copy a part to or from a buffer.
      */
-    void copy_part(
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::host_buffer> accessor,
-        Part part,
-        cl::sycl::id<2> global_offset,
-        bool buffer_to_part)
-    {
+    void copy_part(cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write,
+                                      cl::sycl::access::target::host_buffer>
+                       accessor,
+                   Part part, cl::sycl::id<2> global_offset, bool buffer_to_part) {
         cl::sycl::id<2> offset = global_offset + get_part_offset(part);
-        if (offset[0] >= accessor.get_range()[0] || offset[1] >= accessor.get_range()[1])
-        {
+        if (offset[0] >= accessor.get_range()[0] || offset[1] >= accessor.get_range()[1]) {
             // Nothing to do here. There is no data in the buffer for this part.
             return;
         }
@@ -268,29 +276,21 @@ class Tile
 
         uindex_t i_burst = 0;
         uindex_t i_cell = 0;
-        for (uindex_t c = 0; c < part_range[0]; c++)
-        {
-            for (uindex_t r = 0; r < part_range[1]; r++)
-            {
-                if (c + offset[0] < accessor.get_range()[0] && r + offset[1] < accessor.get_range()[1])
-                {
-                    if (buffer_to_part)
-                    {
+        for (uindex_t c = 0; c < part_range[0]; c++) {
+            for (uindex_t r = 0; r < part_range[1]; r++) {
+                if (c + offset[0] < accessor.get_range()[0] &&
+                    r + offset[1] < accessor.get_range()[1]) {
+                    if (buffer_to_part) {
                         part_ac[i_burst][i_cell] = accessor[c + offset[0]][r + offset[1]];
-                    }
-                    else
-                    {
+                    } else {
                         accessor[c + offset[0]][r + offset[1]] = part_ac[i_burst][i_cell];
                     }
                 }
 
-                if (i_cell == burst_length - 1)
-                {
+                if (i_cell == burst_length - 1) {
                     i_cell = 0;
                     i_burst++;
-                }
-                else
-                {
+                } else {
                     i_cell++;
                 }
             }
@@ -299,4 +299,6 @@ class Tile
 
     std::optional<cl::sycl::buffer<T, 2>> part[3][3];
 };
+
+} // namespace tiling
 } // namespace stencil
\ No newline at end of file
diff --git a/docs/architecture.md b/docs/architecture.md
index 39b3f89..d01b9c8 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -52,13 +52,11 @@ for (uindex_t c = 0; c < grid_width; c++)
 }
 ```
 
-### General Approach {#approach}
+### Tile-wise computation {#tilecomputation}
 
 The general goal of a stencil execution engine like StencilStream is to calculate a certain generation of a grid of cells where a single generation step of a single cell is computed from cell's neighborhood (contained in the stencil). This is repeated iteratively over all cells of the grid and for all generations up to the desired generation.
 
-However, the naive approach of storing all cells in a buffer and reading the neihborhood directly from this buffer doesn't work well for FPGAs. StencilStream therefore uses an approach introduced by [Hamid Reza Zohouri, Artur Podobas and Satoshi Matsuoka](https://dl.acm.org/doi/pdf/10.1145/3174243.3174248) that uses a spatially tiled buffer and temporal caching to perform the computations.
-
-#### Tile-wise computation {#tilecomputation}
+However, the naive approach of storing all cells in a buffer and reading the neihborhood directly from this buffer doesn't work well for FPGAs. StencilStream therefore uses two different architectures inspired by [Hamid Reza Zohouri, Artur Podobas and Satoshi Matsuoka](https://dl.acm.org/doi/pdf/10.1145/3174243.3174248) that usetemporal caching and in one case spatially tiled buffers to perform the computations.
 
 Let's look at the simplest case first: There is a tile and we want to calculate the it's next generation. StencilStream splits up this task into an input kernel, an execution kernel and an output kernel. They communicate via on-chip FIFO pipe and the input kernel receives access to the current generation of the tile while the output kernel receives access to buffers for the next generation of the tile. The input kernel reads the cells from the buffer column-wise (as discussed in ["Indexing and Iteration order"](#index)) and sends them to the execution kernel.
 
@@ -70,15 +68,15 @@ After these shifts, the stencil buffer contains the correct neighborhood of a ce
 
 Of course, the input and output of an execution stage do not necessarily have to come from the input or output kernel. StencilStream arranges multiple execution stages into a pipeline. This means that for a given pipeline length of `p`, the grid is only written to global memory every `p` generations and since the main loop of the execution kernel is pipelined itself, all of these `p` generations are calculated in parallel, utilizing the full potential of the FPGA. This long stream of values is also where the name "StencilStream" comes from.
 
-#### Grid Tiling {#tiling}
+### Grid Tiling {#tiling}
 
-In order to calculate the next generation of a cell, you need it's neighborhood. This neighborhood is included in the tile for almost all cells, but not for those on the edge of the tile. This means that in order to calculate the next generation of a tile, cells from neighboring tiles are needed too. These cells are known as the halo of a tile. Since more cells are needed in the halo for every generation that is computed, the input tile has to contain `stencil_radius` additional cells in every cardinal direction. This leads us to an input tile with `tile_width + 2 * stencil_radius * pipeline_lenth` columns and `tile_height + 2 * stencil_radius * pipeline_length` rows.
+Per definition, the range of tiles is static and the range of grids is dynamic. This means that the range of a grid may exceed the range of a tile. StencilStream therefore partitions the grid into tiles and computes the next generations tile by tile. In order to calculate the next generation of a cell, you need it's neighborhood. For most cells in a tile, this neighborhood is included in the tile, but not for those on the edge of the tile. This means that in order to calculate the next generation of a tile, cells from neighboring tiles are needed too. These cells are known as the halo of a tile. Since more cells are needed in the halo for every generation that is computed, the input tile has to contain `stencil_radius` additional cells in every cardinal direction. This leads us to an input tile with `tile_width + 2 * stencil_radius * pipeline_lenth` columns and `tile_height + 2 * stencil_radius * pipeline_length` rows.
 
-The execution kernel described above works on tiles, which have a static size, but the user provides a grid, which has a dynamic size. Therefore, StencilStream needs to partition the grid into tiles. Since every tile also needs parts of neighboring tiles for their halo, these tiles are partitioned into buffers too. Every tile has four corner buffers, four edge buffers and a core buffer. The following figure illustrates this partition, where the buffer borders are marked in black, tile borders are marked in red and the grid border is marked in yellow. Note that the shapes of tiles and their buffers is static, but the number of tiles is dynamic and adapted to contain the whole grid.
+In order to allow easy access to a tile's halo, tiles are partitioned into buffers too. Every tile has four corner buffers, four edge buffers and a core buffer. The following figure illustrates the final partition of a grid, where the buffer borders are marked in black, tile borders are marked in red and the grid border is marked in yellow. Note that the shapes of the tiles and their buffers is static, but the number of tiles is dynamic and adapted to contain the whole grid.
 
 ![Partition](partition.svg)
 
-The resulting input for a submission of the execution kernel is therefore partitioned into 5x5 buffers and the output is partitioned into 3x3 buffers. The IO code in use by StencilStream groups these buffers into buffer columns (five buffer columns for the input and three for the output) and submits the input/output kernel for every single column. Therefore, every invocation has access to a number of full cell columns from the input or output which it can process. Also note that the height of the buffers is equal for every buffer column, which is therefore used as a constant parameter of the design.
+The resulting input for a submission of the execution kernel is therefore partitioned into 5x5 buffers and the output is partitioned into 3x3 buffers. The IO code used by StencilStream groups these buffers into buffer columns (five buffer columns for the input and three for the output) and submits the input/output kernel for every single column. Therefore, every invocation has access to a number of full cell columns from the input or output which it can process. Also note that the height of the buffers is equal for every buffer column, which is therefore used as a constant parameter of the design.
 
 #### Halo/Edge handling {#halo}
 
@@ -88,4 +86,10 @@ StencilStream currently only supports transition functions with a default value,
 
 #### Burst-aligned buffers {#burstalignment}
 
-One last concept of note is the layout of the buffers themselves: The global memory interface of most FPGAs support burst accesses where a specific number of bytes can be read or written in one transaction. Therefore, those interfaces are most efficient when all memory accesses are organized in such bursts. StencilStream ensures this by using two-dimensional buffers with the "height" of one memory burst.
\ No newline at end of file
+One last concept of note is the layout of the buffers themselves: The global memory interface of most FPGAs support burst accesses where a specific number of bytes can be read or written in one transaction. Therefore, those interfaces are most efficient when all memory accesses are organized in such bursts. StencilStream ensures this by using two-dimensional buffers with the "height" of one memory burst.
+
+### The Monotile Architecture {#monotile}
+
+The architecture and buffer layout described above introduces complex grid partitioning in order to work on grids with arbitrary ranges. However, there are applications where the possible grid ranges are known at compilation time and where the biggest grid may fit on the FPGA as a single tile. Grid tiling is unnecessary in this case and StencilStream offers an executor without it: The \ref stencil::MonotileExecutor. As the name indicates, the monotile executor stores the grid in a single buffer and computes the next generations of the whole grid in one kernel invocation.
+
+This approach uses less FPGA resources than the tiling architecture for the same tile range and pipeline length since the IO kernels are simpler and the caches are smaller. The monotile execution kernel also has a lower latency and runtime than the tiled execution kernel since less main loop iterations are required. However, the runtime does not scale well for varying grid ranges. Both of StencilStreams's execution kernels use the same amount time for every invocation, regardless whether most of the tile cells are within the grid or not. Therefore, the runtime of the tiled architecture with many small tiles actually scales with the grid range, while the monotile architecture with a single big tile does not.
\ No newline at end of file
diff --git a/examples/build.sh b/examples/build.sh
new file mode 100755
index 0000000..79bb1d7
--- /dev/null
+++ b/examples/build.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+source /cm/shared/opt/intel_oneapi/2021.3/setvars.sh
+ml compiler/GCC nalla_pcie/20.4.0
+
+make all -j$(nprocs)
\ No newline at end of file
diff --git a/examples/conway/Makefile b/examples/conway/Makefile
index 20ec444..0e693fe 100644
--- a/examples/conway/Makefile
+++ b/examples/conway/Makefile
@@ -19,15 +19,17 @@ ifdef AOCL_BOARD_PACKAGE_ROOT
 endif
 
 EMU_ARGS = $(ARGS)
-HW_ARGS = $(ARGS) -DHARDWARE -Xshardware 
+HW_ARGS = $(ARGS) -DHARDWARE -Xshardware
 
-conway_emu: conway.cpp Makefile
+RESOURCES = conway.cpp $(wildcard StencilStream/*) Makefile
+
+conway_emu: $(RESOURCES)
 	$(CC) $(EMU_ARGS) conway.cpp -o conway_emu
 
-conway_hw: conway.cpp Makefile
+conway_hw: $(RESOURCES)
 	$(CC) $(HW_ARGS) conway.cpp -o conway_hw
 
-conway_hw.report.tar.gz: conway.cpp Makefile
+conway_hw.report.tar.gz: $(RESOURCES)
 	rm -f conway_hw
 	$(CC) $(HW_ARGS) -fsycl-link conway.cpp -o conway_hw
 	tar -caf conway_hw.report.tar.gz conway_hw.prj/reports
diff --git a/examples/conway/conway.cpp b/examples/conway/conway.cpp
index 95078f4..ba91749 100644
--- a/examples/conway/conway.cpp
+++ b/examples/conway/conway.cpp
@@ -1,11 +1,21 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #include <CL/sycl/INTEL/fpga_extensions.hpp>
 #include <StencilStream/StencilExecutor.hpp>
@@ -14,43 +24,33 @@ using Cell = bool;
 const Cell halo_value = false;
 const stencil::uindex_t stencil_radius = 1;
 
-auto conway = [](stencil::Stencil<Cell, stencil_radius> const &stencil)
-{
+auto conway = [](stencil::Stencil<Cell, stencil_radius> const &stencil) {
     stencil::ID idx = stencil.id;
 
     uint8_t alive_neighbours = 0;
 #pragma unroll
-    for (stencil::index_t c = -stencil_radius; c <= stencil::index_t(stencil_radius); c++)
-    {
+    for (stencil::index_t c = -stencil_radius; c <= stencil::index_t(stencil_radius); c++) {
 #pragma unroll
-        for (stencil::index_t r = -stencil_radius; r <= stencil::index_t(stencil_radius); r++)
-        {
-            if (stencil[stencil::ID(c, r)] && !(c == 0 && r == 0))
-            {
+        for (stencil::index_t r = -stencil_radius; r <= stencil::index_t(stencil_radius); r++) {
+            if (stencil[stencil::ID(c, r)] && !(c == 0 && r == 0)) {
                 alive_neighbours += 1;
             }
         }
     }
 
-    if (stencil[stencil::ID(0, 0)])
-    {
+    if (stencil[stencil::ID(0, 0)]) {
         return alive_neighbours == 2 || alive_neighbours == 3;
-    }
-    else
-    {
+    } else {
         return alive_neighbours == 3;
     }
 };
 
-cl::sycl::buffer<Cell, 2> read(stencil::uindex_t width, stencil::uindex_t height)
-{
+cl::sycl::buffer<Cell, 2> read(stencil::uindex_t width, stencil::uindex_t height) {
     cl::sycl::buffer<Cell, 2> input_buffer(cl::sycl::range<2>(width, height));
     auto buffer_ac = input_buffer.get_access<cl::sycl::access::mode::write>();
 
-    for (stencil::uindex_t r = 0; r < height; r++)
-    {
-        for (stencil::uindex_t c = 0; c < width; c++)
-        {
+    for (stencil::uindex_t r = 0; r < height; r++) {
+        for (stencil::uindex_t c = 0; c < width; c++) {
             char Cell;
             std::cin >> Cell;
             assert(Cell == 'X' || Cell == '.');
@@ -61,23 +61,17 @@ cl::sycl::buffer<Cell, 2> read(stencil::uindex_t width, stencil::uindex_t height
     return input_buffer;
 }
 
-void write(cl::sycl::buffer<Cell, 2> output_buffer)
-{
+void write(cl::sycl::buffer<Cell, 2> output_buffer) {
     auto buffer_ac = output_buffer.get_access<cl::sycl::access::mode::read>();
 
     stencil::uindex_t width = output_buffer.get_range()[0];
     stencil::uindex_t height = output_buffer.get_range()[1];
 
-    for (stencil::uindex_t r = 0; r < height; r++)
-    {
-        for (stencil::uindex_t c = 0; c < width; c++)
-        {
-            if (buffer_ac[c][r])
-            {
+    for (stencil::uindex_t r = 0; r < height; r++) {
+        for (stencil::uindex_t c = 0; c < width; c++) {
+            if (buffer_ac[c][r]) {
                 std::cout << "X";
-            }
-            else
-            {
+            } else {
                 std::cout << ".";
             }
         }
@@ -85,10 +79,8 @@ void write(cl::sycl::buffer<Cell, 2> output_buffer)
     }
 }
 
-int main(int argc, char **argv)
-{
-    if (argc != 4)
-    {
+int main(int argc, char **argv) {
+    if (argc != 4) {
         std::cerr << "Usage: " << argv[0] << " <width> <height> <n_generations>" << std::endl;
         return 1;
     }
diff --git a/examples/emulation_env.sh b/examples/emulation_env.sh
new file mode 100644
index 0000000..083667e
--- /dev/null
+++ b/examples/emulation_env.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+export CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=8MB
+export CL_CONFIG_CPU_FORCE_LOCAL_MEM_SIZE=8MB
\ No newline at end of file
diff --git a/examples/fdtd/.gitignore b/examples/fdtd/.gitignore
index 267b5f1..72ab1cf 100644
--- a/examples/fdtd/.gitignore
+++ b/examples/fdtd/.gitignore
@@ -1,6 +1,5 @@
 # executables
-fdtd_emu
-fdtd_hw
+fdtd_*
 
 # Output
 *.csv
diff --git a/examples/fdtd/Makefile b/examples/fdtd/Makefile
index 3a1a135..419872a 100644
--- a/examples/fdtd/Makefile
+++ b/examples/fdtd/Makefile
@@ -8,7 +8,7 @@
 CC = dpcpp
 STENCIL_STREAM_PATH = ./
 
-ARGS = -fintelfpga -Xsv -Xsprofile -std=c++17 -DSTENCIL_INDEX_WIDTH=64 -DFDTD_BURST_SIZE=1024 -I$(STENCIL_STREAM_PATH) -O3 -lpthread
+ARGS = -fintelfpga -Xsv -Xsprofile -std=c++17 -DSTENCIL_INDEX_WIDTH=64 -DFDTD_BURST_SIZE=1024 -I$(STENCIL_STREAM_PATH) -O3
 
 ifdef EBROOTGCC
 	ARGS += --gcc-toolchain=$(EBROOTGCC)
@@ -18,22 +18,37 @@ ifdef AOCL_BOARD_PACKAGE_ROOT
 	ARGS += -Xsboard=$(FPGA_BOARD_NAME) -Xsboard-package=$(AOCL_BOARD_PACKAGE_ROOT)
 endif
 
-EMU_ARGS = $(ARGS)
-HW_ARGS = $(ARGS) -DHARDWARE -Xshardware 
+EMU =
+HW = -DHARDWARE -Xshardware 
+MONOTILE = -DMONOTILE
+TILING =
 
 SOURCES = $(wildcard src/*.cpp)
 RESOURCES = $(wildcard src/*.hpp) $(wildcard StencilStream/*) Makefile
 
-fdtd_emu: $(SOURCES) $(RESOURCES)
-	$(CC) $(EMU_ARGS) $(SOURCES) -o fdtd_emu
+all: fdtd_mono_emu fdtd_tiling_emu fdtd_mono_hw fdtd_tiling_hw
 
-fdtd_hw: $(SOURCES) $(RESOURCES)
-	$(CC) $(HW_ARGS) $(SOURCES) -o fdtd_hw
+fdtd_mono_emu: $(SOURCES) $(RESOURCES)
+	$(CC) $(ARGS) $(MONOTILE) $(EMU) $(SOURCES) -o fdtd_mono_emu
 
-fdtd_hw.report.tar.gz: src/*.cpp src/*.hpp Makefile
-	rm -f fdtd_hw
-	$(CC) $(HW_ARGS) -fsycl-link $(SOURCES) -o fdtd_hw
-	tar -caf fdtd_hw.report.tar.gz fdtd_hw.prj/reports
+fdtd_tiling_emu: $(SOURCES) $(RESOURCES)
+	$(CC) $(ARGS) $(TILING) $(EMU) $(SOURCES) -o fdtd_tiling_emu
+
+fdtd_mono_hw: $(SOURCES) $(RESOURCES)
+	$(CC) $(ARGS) $(MONOTILE) $(HW) $(SOURCES) -o fdtd_mono_hw
+
+fdtd_mono_hw.report.tar.gz: $(SOURCES) $(RESOURCES)
+	rm -f fdtd_mono_hw
+	$(CC) $(ARGS) $(MONOTILE) $(HW) -fsycl-link $(SOURCES) -o fdtd_mono_hw
+	tar -caf fdtd_mono_hw.report.tar.gz fdtd_mono_hw.prj/reports
+
+fdtd_tiling_hw: $(SOURCES) $(RESOURCES)
+	$(CC) $(ARGS) $(TILING) $(HW) $(SOURCES) -o fdtd_tiling_hw
+
+fdtd_tiling_hw.report.tar.gz: $(SOURCES) $(RESOURCES)
+	rm -f fdtd_tiling_hw
+	$(CC) $(ARGS) $(TILING) $(HW) -fsycl-link $(SOURCES) -o fdtd_tiling_hw
+	tar -caf fdtd_tiling_hw.report.tar.gz fdtd_tiling_hw.prj/reports
 
 clean:
 	git clean -dXf
diff --git a/examples/fdtd/plot_frames.py b/examples/fdtd/plot_frames.py
index abd3216..1498c70 100755
--- a/examples/fdtd/plot_frames.py
+++ b/examples/fdtd/plot_frames.py
@@ -16,8 +16,11 @@
 
 def plot_frame(path):
     with open(path) as frame:
-        array = np.asarray([[float(cell) for cell in row] for row in csv.reader(frame)], dtype=np.float32)
+        array = np.asarray([[abs(float(cell)) for cell in row] for row in csv.reader(frame)], dtype=np.float32)
     pyplot.pcolormesh(array)
+    ax = pyplot.gca()
+    ax.set_ylim(ax.get_ylim()[::-1])
+    ax.xaxis.tick_top()
     path = path.with_suffix(".png")
     pyplot.savefig(path, format="png")
 
diff --git a/examples/fdtd/simulate.sh b/examples/fdtd/simulate.sh
deleted file mode 100755
index c4588c7..0000000
--- a/examples/fdtd/simulate.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-#SBATCH -J fdtd-simulate
-#SBATCH -p fpga
-#SBATCH --time=00:20:00
-#SBATCH -o simulate.log
-#SBATCH --constraint=19.4.0_max
-#SBATCH --mail-type=all
-
-source /cm/shared/opt/intel_oneapi/2021.1.1/setvars.sh
-ml compiler/GCC nalla_pcie
-
-mkdir -p output
-../fdtd_hw -i 0.1 -o output
\ No newline at end of file
diff --git a/examples/fdtd/src/defines.hpp b/examples/fdtd/src/defines.hpp
index 6f4e7d7..b1e0fd3 100644
--- a/examples/fdtd/src/defines.hpp
+++ b/examples/fdtd/src/defines.hpp
@@ -1,11 +1,21 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include <CL/sycl/INTEL/fpga_extensions.hpp>
@@ -46,12 +56,7 @@ constexpr float sqrt_2 = 1.4142135623730951;
 constexpr uindex_t tile_height = 512;
 constexpr uindex_t tile_width = 512;
 constexpr uindex_t stencil_radius = 1;
-
-#ifdef HARDWARE
-constexpr uindex_t pipeline_length = 128;
-#else
 constexpr uindex_t pipeline_length = 16;
-#endif
 
 std::string description = "\
 This application simulates a nano-photonic disk cavity.\n\
@@ -83,98 +88,79 @@ Operation Parameters:\n\
 -i <float>: Write a snapshot of the magnetic field to the output directory every x multiples of tau (default: disabled).\n\
 ";
 
-struct FDTDCell
-{
+struct FDTDCell {
     float ex, ey, hz, hz_sum, distance, pad0, pad1, pad2;
 };
 
 // The coefficients that describe the properties of a material.
-struct Material
-{
+struct Material {
     float ca;
     float cb;
     float da;
     float db;
 };
 
-struct Parameters
-{
-    Parameters(int argc, char **argv) : t_cutoff_factor(7.0),
-    t_detect_factor(14.0),
-    t_max_factor(15.0),
-    frequency(120e12),
-    t_0_factor(3.0),
-    disk_radius(800e-9),
-    dx(10e-9),
-    tau(100e-15),
-    out_dir("."),
-    interval_factor(std::nullopt)
-    {
+struct Parameters {
+    Parameters(int argc, char **argv)
+        : t_cutoff_factor(7.0), t_detect_factor(14.0), t_max_factor(15.0), frequency(120e12),
+          t_0_factor(3.0), disk_radius(800e-9), dx(10e-9), tau(100e-15), out_dir("."),
+          interval_factor(std::nullopt) {
         int c;
-        while ((c = getopt(argc, argv, "hc:d:e:f:p:r:s:t:o:i:")) != -1)
-        {
-            switch (c)
-            {
+        while ((c = getopt(argc, argv, "hc:d:e:f:p:r:s:t:o:i:")) != -1) {
+            switch (c) {
             case 'c':
                 t_cutoff_factor = stof(optarg);
-                if (t_cutoff_factor < 0.0)
-                {
+                if (t_cutoff_factor < 0.0) {
                     cerr << "Error: t_cutoff may not be negative!" << std::endl;
                     exit(1);
                 }
                 break;
             case 'd':
                 t_detect_factor = stof(optarg);
-                if (t_detect_factor < 0.0)
-                {
+                if (t_detect_factor < 0.0) {
                     cerr << "Error: t_detect may not be negative!" << std::endl;
                     exit(1);
                 }
                 break;
             case 'e':
                 t_max_factor = stof(optarg);
-                if (t_max_factor < 0.0)
-                {
+                if (t_max_factor < 0.0) {
                     cerr << "Error: t_max may not not be negative!" << std::endl;
                     exit(1);
                 }
                 break;
             case 'f':
                 frequency = stof(optarg);
-                if (frequency < 0.0)
-                {
-                    cerr << "Error: The frequency of the source wave may not be negative" << std::endl;
+                if (frequency < 0.0) {
+                    cerr << "Error: The frequency of the source wave may not be negative"
+                         << std::endl;
                     exit(1);
                 }
                 break;
             case 'p':
                 t_0_factor = stof(optarg);
-                if (t_0_factor < 0.0)
-                {
+                if (t_0_factor < 0.0) {
                     cerr << "Error: The phase of the source wave may not be negative" << std::endl;
                     exit(1);
                 }
                 break;
             case 'r':
                 disk_radius = stof(optarg);
-                if (disk_radius < 0.0)
-                {
+                if (disk_radius < 0.0) {
                     cerr << "Error: The disk radius wave may not be negative" << std::endl;
                     exit(1);
                 }
                 break;
             case 's':
                 dx = stof(optarg);
-                if (dx < 0.0)
-                {
+                if (dx < 0.0) {
                     cerr << "Error: The spatial resolution may not be negative" << std::endl;
                     exit(1);
                 }
                 break;
             case 't':
                 tau = stof(optarg);
-                if (tau < 0.0)
-                {
+                if (tau < 0.0) {
                     cerr << "Error: Tau may not be negative" << std::endl;
                     exit(1);
                 }
@@ -214,81 +200,57 @@ struct Parameters
 
     std::optional<float> interval_factor;
 
-    float t_cutoff() const
-    {
-        return t_cutoff_factor * tau;
-    }
+    float t_cutoff() const { return t_cutoff_factor * tau; }
 
-    float t_detect() const
-    {
-        return t_detect_factor * tau;
-    }
+    float t_detect() const { return t_detect_factor * tau; }
 
-    float t_max() const
-    {
-        return t_max_factor * tau;
-    }
+    float t_max() const { return t_max_factor * tau; }
 
-    float t_0() const
-    {
-        return t_0_factor * tau;
-    }
+    float t_0() const { return t_0_factor * tau; }
 
-    float dt() const
-    {
-        return (dx / float(c0 * sqrt_2)) * 0.99;
-    }
+    float dt() const { return (dx / float(c0 * sqrt_2)) * 0.99; }
 
-    uindex_t n_timesteps() const
-    {
-        return 2 * uindex_t(std::ceil(t_max() / dt()));
-    }
+    uindex_t n_timesteps() const { return 2 * uindex_t(std::ceil(t_max() / dt())); }
 
     // Omega (?) in Hz.
-    float omega() const
-    {
-        return 2.0 * M_PI * frequency;
-    }
+    float omega() const { return 2.0 * M_PI * frequency; }
 
-    cl::sycl::range<2> grid_range() const
-    {
+    cl::sycl::range<2> grid_range() const {
         uindex_t width = uindex_t(std::ceil((2 * disk_radius / dx) + 2));
         uindex_t height = width;
         return cl::sycl::range<2>(width, height);
     }
 
-    Material vacuum() const
-    {
+    Material vacuum() const {
         const Material vacuum{
-            (1 - (sigma * dt()) / (2 * eps_0 * eps_r)) / (1 + (sigma * dt()) / (2 * eps_0 * eps_r)), // ca
-            (dt() / (eps_0 * eps_r * dx)) / (1 + (sigma * dt()) / (2 * eps_0 * eps_r)),              // cb
-            (1 - (sigma * dt()) / (2 * mu_0)) / (1 + (sigma * dt()) / (2 * mu_0)),                   // da
-            (dt() / (mu_0 * dx)) / (1 + (sigma * dt()) / (2 * mu_0)),                                // db
+            (1 - (sigma * dt()) / (2 * eps_0 * eps_r)) /
+                (1 + (sigma * dt()) / (2 * eps_0 * eps_r)),                             // ca
+            (dt() / (eps_0 * eps_r * dx)) / (1 + (sigma * dt()) / (2 * eps_0 * eps_r)), // cb
+            (1 - (sigma * dt()) / (2 * mu_0)) / (1 + (sigma * dt()) / (2 * mu_0)),      // da
+            (dt() / (mu_0 * dx)) / (1 + (sigma * dt()) / (2 * mu_0)),                   // db
         };
         return vacuum;
     }
 
-    std::optional<uindex_t> interval() const
-    {
-        if (interval_factor.has_value())
-        {
+    std::optional<uindex_t> interval() const {
+        if (interval_factor.has_value()) {
             return uindex_t(std::ceil((*interval_factor * tau) / dt()));
-        }
-        else
-        {
+        } else {
             return std::nullopt;
         }
     }
 
-    void print_configuration() const
-    {
+    void print_configuration() const {
         std::cout << "Simulation Configuration:" << std::endl;
         std::cout << std::endl;
         std::cout << "# Timing" << std::endl;
         std::cout << "tau           = " << tau << " s" << std::endl;
-        std::cout << "t_cutoff      = " << t_cutoff_factor << " tau = " << t_cutoff() << " s" << std::endl;
-        std::cout << "t_detect      = " << t_detect_factor << " tau = " << t_detect() << " s" << std::endl;
-        std::cout << "t_max         = " << t_max_factor << " tau = " << t_max() << " s" << std::endl;
+        std::cout << "t_cutoff      = " << t_cutoff_factor << " tau = " << t_cutoff() << " s"
+                  << std::endl;
+        std::cout << "t_detect      = " << t_detect_factor << " tau = " << t_detect() << " s"
+                  << std::endl;
+        std::cout << "t_max         = " << t_max_factor << " tau = " << t_max() << " s"
+                  << std::endl;
         std::cout << std::endl;
         std::cout << "# Source Wave" << std::endl;
         std::cout << "phase         = " << t_0_factor << " tau = " << t_0() << " s" << std::endl;
diff --git a/examples/fdtd/src/fdtd.cpp b/examples/fdtd/src/fdtd.cpp
index 285ff74..cd11a02 100644
--- a/examples/fdtd/src/fdtd.cpp
+++ b/examples/fdtd/src/fdtd.cpp
@@ -1,93 +1,141 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #include "simulation.hpp"
+#include <StencilStream/MonotileExecutor.hpp>
 #include <StencilStream/StencilExecutor.hpp>
 #include <deque>
 
-auto exception_handler = [](cl::sycl::exception_list exceptions)
-{
-    for (std::exception_ptr const &e : exceptions)
-    {
-        try
-        {
+#ifdef MONOTILE
+using Executor = MonotileExecutor<FDTDCell, stencil_radius, FDTDKernel, pipeline_length, tile_width,
+                                  tile_height>;
+#else
+using Executor = StencilExecutor<FDTDCell, stencil_radius, FDTDKernel, pipeline_length, tile_width,
+                                 tile_height, FDTD_BURST_SIZE>;
+#endif
+
+#ifdef HARDWARE
+using selector = INTEL::fpga_selector;
+#else
+using selector = INTEL::fpga_emulator_selector;
+#endif
+
+auto exception_handler = [](cl::sycl::exception_list exceptions) {
+    for (std::exception_ptr const &e : exceptions) {
+        try {
             std::rethrow_exception(e);
-        }
-        catch (cl::sycl::exception const &e)
-        {
-            std::cout << "Caught asynchronous SYCL exception:\n"
-                      << e.what() << "\n";
+        } catch (cl::sycl::exception const &e) {
+            std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << "\n";
             std::terminate();
         }
     }
 };
 
-void save_frame(cl::sycl::buffer<FDTDCell, 2> frame_buffer, uindex_t generation_index, bool snapshot, Parameters const &parameters)
-{
+enum class CellField {
+    EX,
+    EY,
+    HZ,
+    HZ_SUM,
+    DISTANCE,
+};
+
+void save_frame(cl::sycl::buffer<FDTDCell, 2> frame_buffer, uindex_t generation_index,
+                CellField field, Parameters const &parameters) {
     auto frame = frame_buffer.get_access<access::mode::read>();
 
     ostringstream frame_path;
-    frame_path << parameters.out_dir;
-    if (snapshot)
-    {
-        frame_path << "/hz." << generation_index << ".csv";
-    }
-    else
-    {
-        frame_path << "/hz_sum.csv";
+    frame_path << parameters.out_dir << "/";
+    switch (field) {
+    case CellField::EX:
+        frame_path << "ex";
+        break;
+    case CellField::EY:
+        frame_path << "ey";
+        break;
+    case CellField::HZ:
+        frame_path << "hz";
+        break;
+    case CellField::HZ_SUM:
+        frame_path << "hz_sum";
+        break;
+    case CellField::DISTANCE:
+        frame_path << "distance";
+        break;
+    default:
+        break;
     }
+    frame_path << "." << generation_index << ".csv";
     std::ofstream out(frame_path.str());
 
-    for (uindex_t r = 0; r < frame.get_range()[1]; r++)
-    {
-        for (uindex_t c = 0; c < frame.get_range()[0]; c++)
-        {
-            if (snapshot)
-            {
+    for (uindex_t r = 0; r < frame.get_range()[1]; r++) {
+        for (uindex_t c = 0; c < frame.get_range()[0]; c++) {
+            switch (field) {
+            case CellField::EX:
+                out << frame[c][r].ex;
+                break;
+            case CellField::EY:
+                out << frame[c][r].ey;
+                break;
+            case CellField::HZ:
                 out << frame[c][r].hz;
-            }
-            else
-            {
+                break;
+            case CellField::HZ_SUM:
                 out << frame[c][r].hz_sum;
+                break;
+            case CellField::DISTANCE:
+                out << frame[c][r].distance;
+                break;
+            default:
+                break;
             }
 
-            if (c != frame.get_range()[0] - 1)
-            {
+            if (c != frame.get_range()[0] - 1) {
                 out << ",";
             }
         }
-        if (r != frame.get_range()[1] - 1)
-        {
+        if (r != frame.get_range()[1] - 1) {
             out << std::endl;
         }
     }
 }
 
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
     Parameters parameters(argc, argv);
     parameters.print_configuration();
 
-#ifdef HARDWARE
-    INTEL::fpga_selector device_selector;
-#else
-    INTEL::fpga_emulator_selector device_selector;
+#ifdef MONOTILE
+    if (parameters.grid_range()[0] > tile_width || parameters.grid_range()[1] > tile_height) {
+        std::cerr << "Error: The grid may not exceed the size of the tile (" << tile_width << " by "
+                  << tile_height << " cells) when using the monotile architecture." << std::endl;
+        exit(1);
+    }
 #endif
-    cl::sycl::queue fpga_queue(device_selector, exception_handler, {property::queue::enable_profiling{}});
+
+    selector device_selector;
+    cl::sycl::queue fpga_queue(device_selector, exception_handler,
+                               {property::queue::enable_profiling{}});
 
     cl::sycl::buffer<FDTDCell, 2> grid_buffer(parameters.grid_range());
     {
         auto init_ac = grid_buffer.get_access<cl::sycl::access::mode::discard_write>();
-        for (uindex_t c = 0; c < parameters.grid_range()[0]; c++)
-        {
-            for (uindex_t r = 0; r < parameters.grid_range()[1]; r++)
-            {
+        for (uindex_t c = 0; c < parameters.grid_range()[0]; c++) {
+            for (uindex_t r = 0; r < parameters.grid_range()[1]; r++) {
                 init_ac[c][r] = FDTDKernel::halo();
 
                 float a = float(c) - float(parameters.grid_range()[0]) / 2.0;
@@ -97,27 +145,29 @@ int main(int argc, char **argv)
         }
     }
 
-    StencilExecutor<FDTDCell, stencil_radius, FDTDKernel, pipeline_length, tile_width, tile_height, FDTD_BURST_SIZE> executor(FDTDKernel::halo(), FDTDKernel(parameters));
+    Executor executor(FDTDKernel::halo(), FDTDKernel(parameters));
     executor.set_input(grid_buffer);
-    executor.set_queue(fpga_queue, true);
+    executor.set_queue(fpga_queue);
 
     uindex_t n_timesteps = parameters.n_timesteps();
 
     std::cout << "Simulating..." << std::endl;
-    if (parameters.interval().has_value())
-    {
+    if (parameters.interval().has_value()) {
         uindex_t interval = 2 * *(parameters.interval());
         double runtime = 0.0;
 
-        while (executor.get_i_generation() + interval < n_timesteps)
-        {
+        while (executor.get_i_generation() + interval < n_timesteps) {
             executor.run(interval);
             executor.copy_output(grid_buffer);
 
             uindex_t i_generation = executor.get_i_generation();
-            save_frame(grid_buffer, i_generation, true, parameters);
+            save_frame(grid_buffer, i_generation, CellField::EX, parameters);
+            save_frame(grid_buffer, i_generation, CellField::EY, parameters);
+            save_frame(grid_buffer, i_generation, CellField::HZ, parameters);
+            save_frame(grid_buffer, i_generation, CellField::HZ_SUM, parameters);
+            save_frame(grid_buffer, i_generation, CellField::DISTANCE, parameters);
 
-            runtime += executor.get_runtime_sample()->get_total_runtime();
+            runtime += executor.get_runtime_sample().get_total_runtime();
             double progress = 100.0 * (double(i_generation) / double(n_timesteps));
             double speed = double(i_generation) / runtime;
             double time_remaining = double(n_timesteps - i_generation) / speed;
@@ -127,19 +177,16 @@ int main(int argc, char **argv)
             std::cout << ", ~" << time_remaining << "s left." << std::endl;
         }
 
-        if (n_timesteps % interval != 0)
-        {
+        if (n_timesteps % interval != 0) {
             executor.run(n_timesteps % interval);
         }
-    }
-    else
-    {
+    } else {
         executor.run(n_timesteps);
     }
     std::cout << "Simulation complete!" << std::endl;
 
     executor.copy_output(grid_buffer);
-    save_frame(grid_buffer, n_timesteps, false, parameters);
+    save_frame(grid_buffer, n_timesteps, CellField::HZ_SUM, parameters);
 
     return 0;
 }
diff --git a/examples/fdtd/src/simulation.hpp b/examples/fdtd/src/simulation.hpp
index 5343a38..5d95888 100644
--- a/examples/fdtd/src/simulation.hpp
+++ b/examples/fdtd/src/simulation.hpp
@@ -1,18 +1,27 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include "defines.hpp"
 #include <StencilStream/Stencil.hpp>
 
-class FDTDKernel
-{
+class FDTDKernel {
     float disk_radius;
     float tau;
     float omega;
@@ -23,21 +32,13 @@ class FDTDKernel
     float dt;
     Material vacuum;
 
-public:
-    FDTDKernel(Parameters const &parameters) : disk_radius(parameters.disk_radius),
-                                               tau(parameters.tau),
-                                               omega(parameters.omega()),
-                                               t_0(parameters.t_0()),
-                                               t_cutoff(parameters.t_cutoff()),
-                                               t_detect(parameters.t_detect()),
-                                               dx(parameters.dx),
-                                               dt(parameters.dt()),
-                                               vacuum(parameters.vacuum())
-    {
-    }
+  public:
+    FDTDKernel(Parameters const &parameters)
+        : disk_radius(parameters.disk_radius), tau(parameters.tau), omega(parameters.omega()),
+          t_0(parameters.t_0()), t_cutoff(parameters.t_cutoff()), t_detect(parameters.t_detect()),
+          dx(parameters.dx), dt(parameters.dt()), vacuum(parameters.vacuum()) {}
 
-    static FDTDCell halo()
-    {
+    static FDTDCell halo() {
         FDTDCell new_cell;
         new_cell.ex = 0;
         new_cell.ey = 0;
@@ -47,34 +48,29 @@ class FDTDKernel
         return new_cell;
     }
 
-    FDTDCell operator()(Stencil<FDTDCell, stencil_radius> const &stencil) const
-    {
+    FDTDCell operator()(Stencil<FDTDCell, stencil_radius> const &stencil) const {
         FDTDCell cell = stencil[ID(0, 0)];
 
-        if (cell.distance < disk_radius)
-        {
-            if ((stencil.stage & 0b1) == 0)
-            {
+        if (cell.distance < disk_radius) {
+            if ((stencil.stage & 0b1) == 0) {
                 cell.ex *= vacuum.ca;
                 cell.ex += vacuum.cb * (stencil[ID(0, 0)].hz - stencil[ID(0, -1)].hz);
 
                 cell.ey *= vacuum.ca;
                 cell.ey += vacuum.cb * (stencil[ID(-1, 0)].hz - stencil[ID(0, 0)].hz);
-            }
-            else
-            {
+            } else {
                 cell.hz *= vacuum.da;
-                cell.hz += vacuum.db * (stencil[ID(0, 1)].ex - stencil[ID(0, 0)].ex + stencil[ID(0, 0)].ey - stencil[ID(1, 0)].ey);
+                cell.hz += vacuum.db * (stencil[ID(0, 1)].ex - stencil[ID(0, 0)].ex +
+                                        stencil[ID(0, 0)].ey - stencil[ID(1, 0)].ey);
 
                 float current_time = (stencil.generation >> 1) * dt;
-                if (cell.distance < dx && current_time < t_cutoff)
-                {
+                if (cell.distance < dx && current_time < t_cutoff) {
                     float wave_progress = (current_time - t_0) / tau;
-                    cell.hz += cl::sycl::cos(omega * current_time) * cl::sycl::exp(-1 * wave_progress * wave_progress);
+                    cell.hz += cl::sycl::cos(omega * current_time) *
+                               cl::sycl::exp(-1 * wave_progress * wave_progress);
                 }
 
-                if (current_time > t_detect)
-                {
+                if (current_time > t_detect) {
                     cell.hz_sum += cell.hz * cell.hz;
                 }
             }
diff --git a/examples/fdtd/synthesize.sh b/examples/fdtd/synthesize.sh
deleted file mode 100755
index 8d3f10c..0000000
--- a/examples/fdtd/synthesize.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env bash
-#SBATCH -J fdtd-synthesis
-#SBATCH -o synthesize.log
-#SBATCH -p fpgasyn
-#SBATCH --mail-type=all
-#SBATCH --exclusive
-#SBATCH --time=24:00:00
-
-source /cm/shared/opt/intel_oneapi/2021.1.1/setvars.sh
-ml compiler/GCC nalla_pcie/19.4.0
-
-make fdtd_hw
\ No newline at end of file
diff --git a/examples/hotspot/.gitignore b/examples/hotspot/.gitignore
index 6e6a2ed..4c3d92c 100644
--- a/examples/hotspot/.gitignore
+++ b/examples/hotspot/.gitignore
@@ -1,7 +1,5 @@
 # executables
-hotspot_emu
-hotspot_hw
-hotspot_rodinia
+hotspot_*
 
 # build directories
 rodinia_3.1
diff --git a/examples/hotspot/Makefile b/examples/hotspot/Makefile
index 4f62ca2..1fa06a1 100644
--- a/examples/hotspot/Makefile
+++ b/examples/hotspot/Makefile
@@ -18,19 +18,37 @@ ifdef AOCL_BOARD_PACKAGE_ROOT
 	ARGS += -Xsboard=$(FPGA_BOARD_NAME) -Xsboard-package=$(AOCL_BOARD_PACKAGE_ROOT)
 endif
 
-EMU_ARGS = $(ARGS)
-HW_ARGS = $(ARGS) -DHARDWARE -Xshardware 
+EMU =
+HW = -DHARDWARE -Xshardware 
+MONOTILE = -DMONOTILE
+TILING =
 
-hotspot_emu: hotspot.cpp Makefile
-	$(CC) $(EMU_ARGS) hotspot.cpp -o hotspot_emu
+SOURCES = hotspot.cpp
+RESOURCES = $(wildcard StencilStream/*) Makefile
 
-hotspot_hw: hotspot.cpp Makefile
-	$(CC) $(HW_ARGS) hotspot.cpp -o hotspot_hw
+all: hotspot_mono_emu hotspot_tiling_emu hotspot_mono_hw hotspot_tiling_hw hotspot_rodinia
 
-hotspot_hw.report.tar.gz: hotspot.cpp Makefile
-	rm -f hotspot_hw
-	$(CC) $(HW_ARGS) -fsycl-link hotspot.cpp -o hotspot_hw
-	tar -caf hotspot_hw.report.tar.gz hotspot_hw.prj/reports
+hotspot_mono_emu: $(SOURCES) $(RESOURCES)
+	$(CC) $(ARGS) $(MONOTILE) $(EMU) $(SOURCES) -o hotspot_mono_emu
+
+hotspot_tiling_emu: $(SOURCES) $(RESOURCES)
+	$(CC) $(ARGS) $(TILING) $(EMU) $(SOURCES) -o hotspot_tiling_emu
+
+hotspot_mono_hw: $(SOURCES) $(RESOURCES)
+	$(CC) $(ARGS) $(MONOTILE) $(HW) $(SOURCES) -o hotspot_mono_hw
+
+hotspot_mono_hw.report.tar.gz: $(SOURCES) $(RESOURCES)
+	rm -f hotspot_mono_hw
+	$(CC) $(ARGS) $(MONOTILE) $(HW) -fsycl-link $(SOURCES) -o hotspot_mono_hw
+	tar -caf hotspot_mono_hw.report.tar.gz hotspot_mono_hw.prj/reports
+
+hotspot_tiling_hw: $(SOURCES) $(RESOURCES)
+	$(CC) $(ARGS) $(TILING) $(HW) $(SOURCES) -o hotspot_tiling_hw
+
+hotspot_tiling_hw.report.tar.gz: $(SOURCES) $(RESOURCES)
+	rm -f hotspot_tiling_hw
+	$(CC) $(ARGS) $(TILING) $(HW) -fsycl-link $(SOURCES) -o hotspot_tiling_hw
+	tar -caf hotspot_tiling_hw.report.tar.gz hotspot_tiling_hw.prj/reports
 
 hotspot_rodinia: Makefile
 	./scripts/build_rodinia.sh
diff --git a/examples/hotspot/hotspot.cpp b/examples/hotspot/hotspot.cpp
index 61fd9b7..ac90bba 100644
--- a/examples/hotspot/hotspot.cpp
+++ b/examples/hotspot/hotspot.cpp
@@ -1,14 +1,25 @@
 /*
- * Copyright © 2020-2021Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #include <CL/sycl.hpp>
 #include <CL/sycl/INTEL/fpga_extensions.hpp>
+#include <StencilStream/MonotileExecutor.hpp>
 #include <StencilStream/StencilExecutor.hpp>
 #include <fstream>
 
@@ -37,23 +48,22 @@ const FLOAT amb_temp = 80.0;
 
 /* stencil parameters */
 const uindex_t stencil_radius = 1;
-#ifdef HARDWARE
-const uindex_t pipeline_length = 256;
-#else
 const uindex_t pipeline_length = 32;
-#endif
 const uindex_t tile_width = 1024;
 const uindex_t tile_height = 1024;
 const uindex_t burst_size = 1024;
 
-/* Number of simulations to run in benchmark mode */
-const uindex_t n_simulations = 10;
+#ifdef HARDWARE
+using selector = INTEL::fpga_selector;
+#else
+using selector = INTEL::fpga_emulator_selector;
+#endif
+
+using Cell = vec<FLOAT, 2>;
 
-void write_output(buffer<vec<FLOAT, 2>, 2> vect, string file)
-{
+void write_output(buffer<Cell, 2> vect, string file) {
     fstream out(file, out.out | out.trunc);
-    if (!out.is_open())
-    {
+    if (!out.is_open()) {
         throw std::runtime_error("The file was not opened\n");
     }
 
@@ -62,10 +72,8 @@ void write_output(buffer<vec<FLOAT, 2>, 2> vect, string file)
     auto vect_ac = vect.get_access<access::mode::read>();
 
     int i = 0;
-    for (index_t r = 0; r < n_rows; r++)
-    {
-        for (index_t c = 0; c < n_columns; c++)
-        {
+    for (index_t r = 0; r < n_rows; r++) {
+        for (index_t c = 0; c < n_columns; c++) {
             out << i << "\t" << vect_ac[id<2>(c, r)][0] << std::endl;
             i++;
         }
@@ -74,31 +82,26 @@ void write_output(buffer<vec<FLOAT, 2>, 2> vect, string file)
     out.close();
 }
 
-buffer<vec<FLOAT, 2>, 2>
-read_input(string temp_file, string power_file, range<2> buffer_range)
-{
+buffer<Cell, 2> read_input(string temp_file, string power_file, range<2> buffer_range) {
     fstream temp(temp_file, temp.in);
     fstream power(power_file, power.in);
-    if (!temp.is_open() || !power.is_open())
-    {
+    if (!temp.is_open() || !power.is_open()) {
         throw std::runtime_error("file could not be opened for reading");
     }
 
     uindex_t n_columns = buffer_range[0];
     uindex_t n_rows = buffer_range[1];
-    buffer<vec<FLOAT, 2>, 2> vect(buffer_range);
+    buffer<Cell, 2> vect(buffer_range);
 
     {
         auto vect_ac = vect.get_access<access::mode::write>();
 
         FLOAT tmp_temp, tmp_power;
-        for (index_t r = 0; r < n_rows; r++)
-        {
-            for (index_t c = 0; c < n_columns; c++)
-            {
+        for (index_t r = 0; r < n_rows; r++) {
+            for (index_t c = 0; c < n_columns; c++) {
                 temp >> tmp_temp;
                 power >> tmp_power;
-                vect_ac[id<2>(c, r)] = vec<FLOAT, 2>(tmp_temp, tmp_power);
+                vect_ac[id<2>(c, r)] = Cell(tmp_temp, tmp_power);
             }
         }
     }
@@ -108,38 +111,37 @@ read_input(string temp_file, string power_file, range<2> buffer_range)
     return vect;
 }
 
-void usage(int argc, char **argv)
-{
-    std::cerr << "Usage: " << argv[0] << "  <grid_rows> <grid_cols> <sim_time> <temp_file> <power_file> <output_file>" << std::endl;
-    std::cerr << "    <grid_rows>      - number of rows in the grid (positive integer)" << std::endl;
-    std::cerr << "    <grid_cols>      - number of columns in the grid (positive integer)" << std::endl;
+void usage(int argc, char **argv) {
+    std::cerr << "Usage: " << argv[0]
+              << "  <grid_rows> <grid_cols> <sim_time> <temp_file> <power_file> <output_file>"
+              << std::endl;
+    std::cerr << "    <grid_rows>      - number of rows in the grid (positive integer)"
+              << std::endl;
+    std::cerr << "    <grid_cols>      - number of columns in the grid (positive integer)"
+              << std::endl;
     std::cerr << "    <sim_time>       - number of iterations (positive integer)" << std::endl;
-    std::cerr << "    <temp_file>      - name of the file containing the initial temperature values of each cell" << std::endl;
-    std::cerr << "    <power_file>     - name of the file containing the dissipated power values of each cell" << std::endl;
+    std::cerr << "    <temp_file>      - name of the file containing the initial temperature "
+                 "values of each cell"
+              << std::endl;
+    std::cerr << "    <power_file>     - name of the file containing the dissipated power values "
+                 "of each cell"
+              << std::endl;
     std::cerr << "    <output_file>    - name of the output file" << std::endl;
-    std::cerr << "                       " << n_simulations << " simulations with i*<sim_time> generations will be executed in total, where i is the index of the simulation." << std::endl;
     exit(1);
 }
 
-auto exception_handler = [](cl::sycl::exception_list exceptions)
-{
-    for (std::exception_ptr const &e : exceptions)
-    {
-        try
-        {
+auto exception_handler = [](cl::sycl::exception_list exceptions) {
+    for (std::exception_ptr const &e : exceptions) {
+        try {
             std::rethrow_exception(e);
-        }
-        catch (cl::sycl::exception const &e)
-        {
-            std::cout << "Caught asynchronous SYCL exception:\n"
-                      << e.what() << "\n";
+        } catch (cl::sycl::exception const &e) {
+            std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << "\n";
             std::terminate();
         }
     }
 };
 
-double run_simulation(cl::sycl::queue working_queue, buffer<vec<FLOAT, 2>, 2> temp, uindex_t sim_time)
-{
+double run_simulation(cl::sycl::queue working_queue, buffer<Cell, 2> temp, uindex_t sim_time) {
     uindex_t n_columns = temp.get_range()[0];
     uindex_t n_rows = temp.get_range()[1];
 
@@ -159,12 +161,12 @@ double run_simulation(cl::sycl::queue working_queue, buffer<vec<FLOAT, 2>, 2> te
     FLOAT Rz_1 = 1.f / Rz;
     FLOAT Cap_1 = step / Cap;
 
-    auto kernel =
-        [=](Stencil<vec<FLOAT, 2>, stencil_radius> const &temp)
-    {
+    auto kernel = [=](Stencil<Cell, stencil_radius> const &temp) {
         ID idx = temp.id;
         index_t c = idx.c;
         index_t r = idx.r;
+        uindex_t width = temp.grid_range.c;
+        uindex_t height = temp.grid_range.r;
 
         FLOAT power = temp[ID(0, 0)][1];
         FLOAT old = temp[ID(0, 0)][0];
@@ -173,13 +175,35 @@ double run_simulation(cl::sycl::queue working_queue, buffer<vec<FLOAT, 2>, 2> te
         FLOAT top = temp[ID(0, -1)][0];
         FLOAT bottom = temp[ID(0, 1)][0];
 
+        if (c == 0) {
+            left = old;
+        } else if (c == width - 1) {
+            right = old;
+        }
+
+        if (r == 0) {
+            top = old;
+        } else if (r == height - 1) {
+            bottom = old;
+        }
+
         // As in the OpenCL version of the rodinia "hotspot" benchmark.
-        FLOAT new_temp = old + Cap_1 * (power + (bottom + top - 2.f * old) * Ry_1 + (right + left - 2.f * old) * Rx_1 + (amb_temp - old) * Rz_1);
+        FLOAT new_temp =
+            old + Cap_1 * (power + (bottom + top - 2.f * old) * Ry_1 +
+                           (right + left - 2.f * old) * Rx_1 + (amb_temp - old) * Rz_1);
 
         return vec(new_temp, power);
     };
 
-    StencilExecutor<vec<FLOAT, 2>, stencil_radius, decltype(kernel), pipeline_length, tile_width, tile_height, burst_size> executor(vec<FLOAT, 2>(0.0, 0.0), kernel);
+#ifdef MONOTILE
+    using Executor = MonotileExecutor<Cell, stencil_radius, decltype(kernel), pipeline_length,
+                                      tile_width, tile_height>;
+#else
+    using Executor = StencilExecutor<Cell, stencil_radius, decltype(kernel), pipeline_length,
+                                     tile_width, tile_height, burst_size>;
+#endif
+
+    Executor executor(Cell(0.0, 0.0), kernel);
     executor.set_input(temp);
 
 #ifdef HARDWARE
@@ -190,21 +214,19 @@ double run_simulation(cl::sycl::queue working_queue, buffer<vec<FLOAT, 2>, 2> te
 
     executor.run(sim_time);
 
-    return executor.get_runtime_sample().value().get_total_runtime();
+    executor.copy_output(temp);
+
+    return executor.get_runtime_sample().get_total_runtime();
 }
 
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
     int n_rows, n_columns, sim_time;
     char *tfile, *pfile, *ofile;
     bool benchmark_mode = false;
 
-#ifdef HARDWARE
-    INTEL::fpga_selector device_selector;
-#else
-    INTEL::fpga_emulator_selector device_selector;
-#endif
-    cl::sycl::queue working_queue(device_selector, exception_handler, {property::queue::enable_profiling{}});
+    selector device_selector;
+    cl::sycl::queue working_queue(device_selector, exception_handler,
+                                  {property::queue::enable_profiling{}});
 
     /* check validity of inputs	*/
     if (argc != 7)
@@ -216,11 +238,19 @@ int main(int argc, char **argv)
     if ((sim_time = atoi(argv[3])) <= 0)
         usage(argc, argv);
 
+#ifdef MONOTILE
+    if (n_columns > tile_width || n_rows > tile_height) {
+        std::cerr << "Error: The grid may not exceed the size of the tile (" << tile_width << " by "
+                  << tile_height << " cells) when using the monotile architecture." << std::endl;
+        exit(1);
+    }
+#endif
+
     /* read initial temperatures and input power	*/
     tfile = argv[4];
     pfile = argv[5];
     ofile = argv[6];
-    buffer<vec<FLOAT, 2>, 2> temp = read_input(string(tfile), string(pfile), range<2>(n_columns, n_rows));
+    buffer<Cell, 2> temp = read_input(string(tfile), string(pfile), range<2>(n_columns, n_rows));
 
     printf("Start computing the transient temperature\n");
 
diff --git a/examples/hotspot/synthesize.sh b/examples/hotspot/synthesize.sh
deleted file mode 100644
index f1748e2..0000000
--- a/examples/hotspot/synthesize.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env bash
-#SBATCH -J hotspot-synthesis
-#SBATCH -o synthesize.log
-#SBATCH -p fpgasyn
-#SBATCH --mail-type=all
-#SBATCH --mem=120000MB
-#SBATCH --time=48:00:00
-
-source /cm/shared/opt/intel_oneapi/2021.1.1/setvars.sh
-ml compiler/GCC nalla_pcie/19.4.0
-
-make hotspot_hw
\ No newline at end of file
diff --git a/tests/Makefile b/tests/Makefile
index 785e062..6bf3e61 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -8,7 +8,7 @@
 CC = dpcpp
 STENCIL_PATH = ../
 
-ARGS = -std=c++17 -g -I$(STENCIL_PATH) -fintelfpga -DSTENCIL_INDEX_WIDTH=32
+ARGS = -std=c++17 -g -I$(STENCIL_PATH) -Isrc/ -fintelfpga -DSTENCIL_INDEX_WIDTH=32
 
 ifdef OPTIMIZE
 	ARGS += -O3
@@ -19,7 +19,8 @@ ifdef EBROOTGCC
 endif
 
 UNIT_ARGS = $(ARGS)
-UNIT_OBJECTS = $(patsubst %.cpp,%.o,$(wildcard src/units/*.cpp))
+UNIT_SOURCES = $(wildcard src/units/*.cpp src/units/*/*.cpp)
+UNIT_OBJECTS = $(patsubst %.cpp,%.o,$(UNIT_SOURCES))
 RESOURCES = src/res/*.hpp ../StencilStream/* Makefile
 
 SYNTH_ARGS = $(ARGS) -Xsv
diff --git a/tests/src/res/FPGATransFunc.hpp b/tests/src/res/FPGATransFunc.hpp
deleted file mode 100644
index 59806ac..0000000
--- a/tests/src/res/FPGATransFunc.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#pragma once
-#include <CL/sycl.hpp>
-#include <StencilStream/GenericID.hpp>
-#include <StencilStream/Index.hpp>
-#include <StencilStream/Stencil.hpp>
-
-enum class CellStatus
-{
-    Normal,
-    Invalid,
-    Halo,
-};
-
-struct Cell
-{
-    stencil::index_t c;
-    stencil::index_t r;
-    stencil::index_t i_generation;
-    CellStatus status;
-};
-
-template <stencil::uindex_t radius>
-class FPGATransFunc
-{
-public:
-    using Cell = Cell;
-
-    static Cell halo() { return Cell{0, 0, 0, CellStatus::Halo}; }
-
-    Cell operator()(stencil::Stencil<Cell, radius> const &stencil) const
-    {
-        Cell new_cell = stencil[stencil::ID(0, 0)];
-
-        stencil::index_t center_column = stencil.id.c;
-        stencil::index_t center_row = stencil.id.r;
-
-        bool is_valid = true;
-#pragma unroll
-        for (stencil::index_t c = -stencil::index_t(radius); c <= stencil::index_t(radius); c++)
-        {
-#pragma unroll
-            for (stencil::index_t r = -stencil::index_t(radius); r <= stencil::index_t(radius); r++)
-            {
-                Cell old_cell = stencil[stencil::ID(c, r)];
-                is_valid &= (old_cell.c == c + center_column && old_cell.r == r + center_row && old_cell.i_generation == stencil.generation) || (old_cell.status == CellStatus::Halo);
-            }
-        }
-
-        if (new_cell.status == CellStatus::Normal)
-        {
-            if (!is_valid)
-            {
-                new_cell.status = CellStatus::Invalid;
-            }
-            new_cell.i_generation += 1;
-        }
-
-        return new_cell;
-    }
-};
\ No newline at end of file
diff --git a/tests/src/res/HostPipe.hpp b/tests/src/res/HostPipe.hpp
index c3701db..1dc5ed0 100644
--- a/tests/src/res/HostPipe.hpp
+++ b/tests/src/res/HostPipe.hpp
@@ -1,71 +1,59 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include <StencilStream/Index.hpp>
 #include <deque>
 #include <stdexcept>
 
-template <typename id, typename T>
-class HostPipe
-{
-public:
-    static T read()
-    {
-        return HostPipeImpl::instance().read();
-    }
+template <typename id, typename T> class HostPipe {
+  public:
+    static T read() { return HostPipeImpl::instance().read(); }
 
-    static void write(T new_value)
-    {
-        HostPipeImpl::instance().write(new_value);
-    }
+    static void write(T new_value) { HostPipeImpl::instance().write(new_value); }
 
-    static bool empty()
-    {
-        return HostPipeImpl::instance().empty();
-    }
+    static bool empty() { return HostPipeImpl::instance().empty(); }
 
-private:
-    class HostPipeImpl
-    {
-    public:
-        static HostPipeImpl &instance()
-        {
+  private:
+    class HostPipeImpl {
+      public:
+        static HostPipeImpl &instance() {
             static HostPipeImpl _instance;
             return _instance;
         }
 
-        T read()
-        {
-            if (queue.empty())
-            {
-                throw std::runtime_error("Try to read from empty pipe (blocking is not implemented).");
-            }
-            else
-            {
+        T read() {
+            if (queue.empty()) {
+                throw std::runtime_error(
+                    "Try to read from empty pipe (blocking is not implemented).");
+            } else {
                 T new_value = queue.back();
                 queue.pop_back();
                 return new_value;
             }
         }
 
-        void write(T new_value)
-        {
-            queue.push_front(new_value);
-        }
+        void write(T new_value) { queue.push_front(new_value); }
 
-        bool empty()
-        {
-            return queue.empty();
-        }
+        bool empty() { return queue.empty(); }
 
-    private:
+      private:
         HostPipeImpl() : queue() {}
         HostPipeImpl(const HostPipeImpl &);
         HostPipeImpl &operator=(const HostPipeImpl &);
diff --git a/tests/src/res/HostTransFunc.hpp b/tests/src/res/HostTransFunc.hpp
deleted file mode 100644
index e04a470..0000000
--- a/tests/src/res/HostTransFunc.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#pragma once
-#include <StencilStream/GenericID.hpp>
-#include <StencilStream/Index.hpp>
-#include <StencilStream/Stencil.hpp>
-#include <StencilStream/StencilInfo.hpp>
-
-template <stencil::uindex_t radius>
-class HostTransFunc
-{
-public:
-    struct Cell
-    {
-        Cell() : cell_id(0, 0), generation(0) {}
-
-        Cell(stencil::ID cell_id, stencil::uindex_t generation) : cell_id(cell_id), generation(generation) {}
-
-        Cell(Cell const &other_cell) : cell_id(other_cell.cell_id), generation(other_cell.generation) {}
-
-        Cell &operator=(Cell const &other_cell)
-        {
-            cell_id = other_cell.cell_id;
-            generation = other_cell.generation;
-            return *this;
-        }
-
-        stencil::ID cell_id;
-        stencil::uindex_t generation;
-    };
-
-    HostTransFunc(stencil::uindex_t pipeline_length) : pipeline_length(pipeline_length) {}
-
-    Cell operator()(stencil::Stencil<Cell, radius> const &stencil, stencil::StencilInfo const &info) const
-    {
-        stencil::index_t center_column = info.center_cell_id.c;
-        stencil::index_t center_row = info.center_cell_id.r;
-        stencil::index_t corner_id = -radius * (pipeline_length - info.cell_generation - 1);
-
-        if (center_column >= corner_id && center_row >= corner_id)
-        {
-            for (stencil::index_t c = -stencil::index_t(radius); c <= stencil::index_t(radius); c++)
-            {
-                for (stencil::index_t r = -stencil::index_t(radius); r <= stencil::index_t(radius); r++)
-                {
-                    REQUIRE(stencil[stencil::ID(c, r)].cell_id.c == stencil::index_t(c + center_column));
-                    REQUIRE(stencil[stencil::ID(c, r)].cell_id.r == stencil::index_t(r + center_row));
-                }
-            }
-            REQUIRE(stencil[stencil::ID(0, 0)].generation == info.cell_generation);
-        }
-
-        Cell new_cell = stencil[stencil::ID(0, 0)];
-        new_cell.generation += 1;
-        return new_cell;
-    }
-
-private:
-    stencil::uindex_t pipeline_length;
-};
\ No newline at end of file
diff --git a/tests/src/res/TransFuncs.hpp b/tests/src/res/TransFuncs.hpp
new file mode 100644
index 0000000..5a4b3c5
--- /dev/null
+++ b/tests/src/res/TransFuncs.hpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+#include "catch.hpp"
+#include <CL/sycl.hpp>
+#include <StencilStream/GenericID.hpp>
+#include <StencilStream/Index.hpp>
+#include <StencilStream/Stencil.hpp>
+
+enum class CellStatus
+{
+    Normal,
+    Invalid,
+    Halo,
+};
+
+struct Cell
+{
+    stencil::index_t c;
+    stencil::index_t r;
+    stencil::index_t i_generation;
+    CellStatus status;
+
+    static Cell halo() { return Cell{0, 0, 0, CellStatus::Halo}; }
+};
+
+template <stencil::uindex_t radius>
+class FPGATransFunc
+{
+public:
+    Cell operator()(stencil::Stencil<Cell, radius> const &stencil) const
+    {
+        Cell new_cell = stencil[stencil::ID(0, 0)];
+
+        bool is_valid = true;
+#pragma unroll
+        for (stencil::index_t c = -stencil::index_t(radius); c <= stencil::index_t(radius); c++)
+        {
+#pragma unroll
+            for (stencil::index_t r = -stencil::index_t(radius); r <= stencil::index_t(radius); r++)
+            {
+                Cell old_cell = stencil[stencil::ID(c, r)];
+                stencil::index_t cell_c = stencil.id.c + c;
+                stencil::index_t cell_r = stencil.id.r + r;
+                if (cell_c >= 0 && cell_r >= 0 && cell_c < stencil.grid_range.c && cell_r < stencil.grid_range.r)
+                {
+                    is_valid &= old_cell.c == cell_c;
+                    is_valid &= old_cell.r == cell_r;
+                    is_valid &= old_cell.i_generation == stencil.generation;
+                    is_valid &= old_cell.status == CellStatus::Normal;
+                }
+                else
+                {
+                    is_valid &= old_cell.c == Cell::halo().c;
+                    is_valid &= old_cell.r == Cell::halo().r;
+                    is_valid &= old_cell.i_generation == Cell::halo().i_generation;
+                    is_valid &= old_cell.status == Cell::halo().status;
+                }
+            }
+        }
+
+        new_cell.status = is_valid ? CellStatus::Normal : CellStatus::Invalid;
+        new_cell.i_generation += 1;
+
+        return new_cell;
+    }
+};
+
+template <stencil::uindex_t radius>
+class HostTransFunc
+{
+public:
+    Cell operator()(stencil::Stencil<Cell, radius> const &stencil) const
+    {
+        Cell new_cell = stencil[stencil::ID(0, 0)];
+
+#pragma unroll
+        for (stencil::index_t c = -stencil::index_t(radius); c <= stencil::index_t(radius); c++)
+        {
+#pragma unroll
+            for (stencil::index_t r = -stencil::index_t(radius); r <= stencil::index_t(radius); r++)
+            {
+                Cell old_cell = stencil[stencil::ID(c, r)];
+                stencil::index_t cell_c = stencil.id.c + c;
+                stencil::index_t cell_r = stencil.id.r + r;
+                if (cell_c >= 0 && cell_r >= 0 && cell_c < stencil.grid_range.c && cell_r < stencil.grid_range.r)
+                {
+                    REQUIRE(old_cell.c == cell_c);
+                    REQUIRE(old_cell.r == cell_r);
+                    REQUIRE(old_cell.i_generation == stencil.generation);
+                    REQUIRE(old_cell.status == CellStatus::Normal);
+                }
+                else
+                {
+                    REQUIRE(old_cell.c == Cell::halo().c);
+                    REQUIRE(old_cell.r == Cell::halo().r);
+                    REQUIRE(old_cell.i_generation == Cell::halo().i_generation);
+                    REQUIRE(old_cell.status == Cell::halo().status);
+                }
+            }
+        }
+        
+        new_cell.i_generation += 1;
+
+        return new_cell;
+    }
+};
\ No newline at end of file
diff --git a/tests/src/res/constants.hpp b/tests/src/res/constants.hpp
index 0b3fd0e..0efb704 100644
--- a/tests/src/res/constants.hpp
+++ b/tests/src/res/constants.hpp
@@ -1,11 +1,21 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #pragma once
 #include <StencilStream/Helpers.hpp>
@@ -21,10 +31,14 @@ const stencil::uindex_t core_width = tile_width - 2 * halo_radius;
 const stencil::uindex_t core_height = tile_height - 2 * halo_radius;
 
 const stencil::uindex_t burst_length = 32;
-const stencil::uindex_t corner_bursts = stencil::burst_partitioned_range(halo_radius, halo_radius, burst_length)[0];
-const stencil::uindex_t horizontal_border_bursts = stencil::burst_partitioned_range(core_width, halo_radius, burst_length)[0];
-const stencil::uindex_t vertical_border_bursts = stencil::burst_partitioned_range(halo_radius, core_height, burst_length)[0];
-const stencil::uindex_t core_bursts = stencil::burst_partitioned_range(core_width, core_height, burst_length)[0];
+const stencil::uindex_t corner_bursts =
+    stencil::burst_partitioned_range(halo_radius, halo_radius, burst_length)[0];
+const stencil::uindex_t horizontal_border_bursts =
+    stencil::burst_partitioned_range(core_width, halo_radius, burst_length)[0];
+const stencil::uindex_t vertical_border_bursts =
+    stencil::burst_partitioned_range(halo_radius, core_height, burst_length)[0];
+const stencil::uindex_t core_bursts =
+    stencil::burst_partitioned_range(core_width, core_height, burst_length)[0];
 
 const stencil::uindex_t grid_width = 128;
 const stencil::uindex_t grid_height = 64;
\ No newline at end of file
diff --git a/tests/src/synthesis/main.cpp b/tests/src/synthesis/main.cpp
index 8d76adb..9b9d3a6 100644
--- a/tests/src/synthesis/main.cpp
+++ b/tests/src/synthesis/main.cpp
@@ -1,13 +1,23 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-#include "../res/FPGATransFunc.hpp"
+#include "../res/TransFuncs.hpp"
 #include <CL/sycl.hpp>
 #include <CL/sycl/INTEL/fpga_extensions.hpp>
 #include <StencilStream/StencilExecutor.hpp>
@@ -17,7 +27,7 @@ using namespace cl::sycl;
 using namespace stencil;
 
 const uindex_t stencil_radius = 1;
-const uindex_t pipeline_length = 32;
+const uindex_t pipeline_length = 34;
 const uindex_t tile_width = 1024;
 const uindex_t tile_height = 1024;
 const uindex_t grid_width = 2 * tile_width;
@@ -25,27 +35,21 @@ const uindex_t grid_height = 2 * tile_height;
 const uindex_t burst_size = 1024;
 
 using TransFunc = FPGATransFunc<stencil_radius>;
-using Executor = StencilExecutor<Cell, stencil_radius, TransFunc, pipeline_length, tile_width, tile_height, burst_size>;
+using Executor = StencilExecutor<Cell, stencil_radius, TransFunc, pipeline_length, tile_width,
+                                 tile_height, burst_size>;
 
-void exception_handler(cl::sycl::exception_list exceptions)
-{
-    for (std::exception_ptr const &e : exceptions)
-    {
-        try
-        {
+void exception_handler(cl::sycl::exception_list exceptions) {
+    for (std::exception_ptr const &e : exceptions) {
+        try {
             std::rethrow_exception(e);
-        }
-        catch (cl::sycl::exception const &e)
-        {
-            std::cout << "Caught asynchronous SYCL exception:\n"
-                      << e.what() << "\n";
+        } catch (cl::sycl::exception const &e) {
+            std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << "\n";
             std::terminate();
         }
     }
 }
 
-int main()
-{
+int main() {
 #ifdef HARDWARE
     INTEL::fpga_selector device_selector;
 #else
@@ -57,10 +61,8 @@ int main()
     {
         auto in_buffer_ac = in_buffer.get_access<access::mode::discard_write>();
 
-        for (uindex_t c = 0; c < grid_width; c++)
-        {
-            for (uindex_t r = 0; r < grid_height; r++)
-            {
+        for (uindex_t c = 0; c < grid_width; c++) {
+            for (uindex_t r = 0; r < grid_height; r++) {
                 in_buffer_ac[c][r] = Cell{index_t(c), index_t(r), 0, CellStatus::Normal};
             }
         }
@@ -68,9 +70,9 @@ int main()
 
     std::cout << "Input loaded" << std::endl;
 
-    Executor executor(TransFunc::halo(), TransFunc());
+    Executor executor(Cell::halo(), TransFunc());
     executor.set_input(in_buffer);
-    executor.set_queue(working_queue, false);
+    executor.set_queue(working_queue);
 
     executor.run(2 * pipeline_length);
 
@@ -82,10 +84,8 @@ int main()
     {
         auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
 
-        for (uindex_t c = 0; c < grid_width; c++)
-        {
-            for (uindex_t r = 0; r < grid_height; r++)
-            {
+        for (uindex_t c = 0; c < grid_width; c++) {
+            for (uindex_t r = 0; r < grid_height; r++) {
                 assert(out_buffer_ac[c][r].c == c);
                 assert(out_buffer_ac[c][r].r == r);
                 assert(out_buffer_ac[c][r].i_generation == 2 * pipeline_length);
@@ -94,5 +94,7 @@ int main()
         }
     }
 
+    std::cout << "Test finished successfully!" << std::endl;
+
     return 0;
 }
\ No newline at end of file
diff --git a/tests/src/units/CounterID.cpp b/tests/src/units/CounterID.cpp
index ef5aac3..9994189 100644
--- a/tests/src/units/CounterID.cpp
+++ b/tests/src/units/CounterID.cpp
@@ -1,28 +1,35 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-#include "../res/catch.hpp"
-#include "../res/constants.hpp"
 #include <StencilStream/CounterID.hpp>
+#include <res/catch.hpp>
+#include <res/constants.hpp>
 
 using namespace std;
 using namespace stencil;
 
 using CID = CounterID<uindex_t>;
 
-TEST_CASE("CounterID::operator++", "[CounterID]")
-{
+TEST_CASE("CounterID::operator++", "[CounterID]") {
     CID counter(0, 0, tile_width, tile_height);
-    for (uindex_t c = 0; c < tile_width; c++)
-    {
-        for (uindex_t r = 0; r < tile_height; r++)
-        {
+    for (uindex_t c = 0; c < tile_width; c++) {
+        for (uindex_t r = 0; r < tile_height; r++) {
             REQUIRE(counter.c == c);
             REQUIRE(counter.r == r);
             counter++;
@@ -32,13 +39,10 @@ TEST_CASE("CounterID::operator++", "[CounterID]")
     REQUIRE(counter.r == 0);
 }
 
-TEST_CASE("CounterID::operator--", "[CounterID]")
-{
+TEST_CASE("CounterID::operator--", "[CounterID]") {
     CID counter(tile_width - 1, tile_height - 1, tile_width, tile_height);
-    for (index_t c = tile_width - 1; c >= 0; c--)
-    {
-        for (index_t r = tile_height - 1; r >= 0; r--)
-        {
+    for (index_t c = tile_width - 1; c >= 0; c--) {
+        for (index_t r = tile_height - 1; r >= 0; r--) {
             REQUIRE(counter.c == c);
             REQUIRE(counter.r == r);
             counter--;
diff --git a/tests/src/units/Grid.cpp b/tests/src/units/Grid.cpp
deleted file mode 100644
index 041c5cd..0000000
--- a/tests/src/units/Grid.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#include "../res/catch.hpp"
-#include "../res/constants.hpp"
-#include <CL/sycl.hpp>
-#include <CL/sycl/INTEL/fpga_extensions.hpp>
-#include <StencilStream/Grid.hpp>
-#include <unordered_set>
-
-using namespace stencil;
-using namespace cl::sycl;
-using namespace std;
-
-const uindex_t add_grid_width = grid_width + 1;
-const uindex_t add_grid_height = grid_height + 1;
-
-using TestGrid = Grid<ID, tile_width, tile_height, halo_radius, burst_length>;
-
-TEST_CASE("Grid::Grid(uindex_t, uindex_t, T)", "[Grid]")
-{
-    TestGrid grid(add_grid_width, add_grid_height);
-
-    UID tile_range = grid.get_tile_range();
-    REQUIRE(tile_range.c == add_grid_width / tile_width + 1);
-    REQUIRE(tile_range.r == add_grid_height / tile_height + 1);
-}
-
-TEST_CASE("Grid::Grid(cl::sycl::buffer<T, 2>, T)", "[Grid]")
-{
-    buffer<ID, 2> in_buffer(range<2>(add_grid_width, add_grid_height));
-    {
-        auto in_buffer_ac = in_buffer.get_access<access::mode::discard_write>();
-        for (uindex_t c = 0; c < add_grid_width; c++)
-        {
-            for (uindex_t r = 0; r < add_grid_height; r++)
-            {
-                in_buffer_ac[c][r] = ID(c, r);
-            }
-        }
-    }
-
-    TestGrid grid(in_buffer);
-
-    UID tile_range = grid.get_tile_range();
-    REQUIRE(tile_range.c == add_grid_width / tile_width + 1);
-    REQUIRE(tile_range.r == add_grid_height / tile_height + 1);
-
-    buffer<ID, 2> out_buffer(range<2>(add_grid_width, add_grid_height));
-    grid.copy_to(out_buffer);
-
-    {
-        auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
-        for (uindex_t c = 0; c < add_grid_width; c++)
-        {
-            for (uindex_t r = 0; r < add_grid_height; r++)
-            {
-                REQUIRE(out_buffer_ac[c][r].c == c);
-                REQUIRE(out_buffer_ac[c][r].r == r);
-            }
-        }
-    }
-}
-
-TEST_CASE("Grid::submit_tile_input", "[Grid]")
-{
-    using grid_in_pipe = pipe<class grid_in_pipe_id, ID>;
-
-    buffer<ID, 2> in_buffer(range<2>(3 * tile_width, 3 * tile_height));
-    buffer<ID, 2> out_buffer(range<2>(2 * halo_radius + tile_width, 2 * halo_radius + tile_height));
-
-#ifdef HARDWARE
-    INTEL::fpga_selector device_selector;
-#else
-    INTEL::fpga_emulator_selector device_selector;
-#endif
-    cl::sycl::queue working_queue(device_selector);
-
-    {
-        auto in_buffer_ac = in_buffer.get_access<access::mode::discard_write>();
-        for (uindex_t c = 0; c < 3 * tile_width; c++)
-        {
-            for (uindex_t r = 0; r < 3 * tile_height; r++)
-            {
-                in_buffer_ac[c][r] = ID(c, r);
-            }
-        }
-    }
-
-    TestGrid grid(in_buffer);
-    grid.submit_tile_input<grid_in_pipe>(working_queue, UID(1, 1));
-
-    working_queue.submit([&](handler &cgh)
-                         {
-                             auto out_buffer_ac = out_buffer.get_access<access::mode::discard_write>(cgh);
-
-                             cgh.single_task<class input_test_kernel>([=]()
-                                                                      {
-                                                                          for (uindex_t c = 0; c < 2 * halo_radius + tile_width; c++)
-                                                                          {
-                                                                              for (uindex_t r = 0; r < 2 * halo_radius + tile_height; r++)
-                                                                              {
-                                                                                  out_buffer_ac[c][r] = grid_in_pipe::read();
-                                                                              }
-                                                                          }
-                                                                      });
-                         });
-
-    auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
-
-    for (uindex_t c = 0; c < 2 * halo_radius + tile_width; c++)
-    {
-        for (uindex_t r = 0; r < 2 * halo_radius + tile_height; r++)
-        {
-            REQUIRE(out_buffer_ac[c][r].c == c + tile_width - halo_radius);
-            REQUIRE(out_buffer_ac[c][r].r == r + tile_height - halo_radius);
-        }
-    }
-}
-
-TEST_CASE("Grid::submit_tile_output", "[Grid]")
-{
-    using grid_out_pipe = pipe<class grid_out_pipe_id, ID>;
-
-    TestGrid grid(tile_width, tile_height);
-
-#ifdef HARDWARE
-    INTEL::fpga_selector device_selector;
-#else
-    INTEL::fpga_emulator_selector device_selector;
-#endif
-    cl::sycl::queue working_queue(device_selector);
-
-    working_queue.submit([&](handler &cgh)
-                         {
-                             cgh.single_task<class output_test_kernel>([=]()
-                                                                       {
-                                                                           for (uindex_t c = 0; c < tile_width; c++)
-                                                                           {
-                                                                               for (uindex_t r = 0; r < tile_height; r++)
-                                                                               {
-                                                                                   grid_out_pipe::write(ID(c, r));
-                                                                               }
-                                                                           }
-                                                                       });
-                         });
-
-    grid.submit_tile_output<grid_out_pipe>(working_queue, UID(0, 0));
-
-    buffer<ID, 2> out_buffer(range<2>(tile_width, tile_height));
-    grid.copy_to(out_buffer);
-
-    auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
-    for (uindex_t c = 0; c < tile_width; c++)
-    {
-        for (uindex_t r = 0; r < tile_height; r++)
-        {
-            REQUIRE(out_buffer_ac[c][r].r == r);
-            REQUIRE(out_buffer_ac[c][r].c == c);
-        }
-    }
-}
diff --git a/tests/src/units/HostPipe.cpp b/tests/src/units/HostPipe.cpp
index ba6344e..4127dee 100644
--- a/tests/src/units/HostPipe.cpp
+++ b/tests/src/units/HostPipe.cpp
@@ -1,44 +1,48 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-#include "../res/HostPipe.hpp"
-#include "../res/catch.hpp"
+#include <res/HostPipe.hpp>
+#include <res/catch.hpp>
 
 using namespace stencil;
 
-TEST_CASE("HostPipe (normal)", "[HostPipe]")
-{
+TEST_CASE("HostPipe (normal)", "[HostPipe]") {
     using MyPipe = HostPipe<class MyPipeID, uindex_t>;
-    for (uindex_t i = 0; i < 256; i++)
-    {
+    for (uindex_t i = 0; i < 256; i++) {
         MyPipe::write(i);
     }
-    for (uindex_t i = 0; i < 256; i++)
-    {
+    for (uindex_t i = 0; i < 256; i++) {
         REQUIRE(MyPipe::read() == i);
     }
     REQUIRE(MyPipe::empty());
 }
 
-TEST_CASE("HostPipe (separated pipes)", "[HostPipe]")
-{
+TEST_CASE("HostPipe (separated pipes)", "[HostPipe]") {
     using PipeA = HostPipe<class PipeAID, index_t>;
     using PipeB = HostPipe<class PipeBID, index_t>;
 
-    for (index_t i = 0; i < 256; i++)
-    {
+    for (index_t i = 0; i < 256; i++) {
         PipeA::write(i);
         PipeB::write(-1 * i);
     }
 
-    for (index_t i = 0; i < 256; i++)
-    {
+    for (index_t i = 0; i < 256; i++) {
         REQUIRE(PipeA::read() == i);
         REQUIRE(PipeB::read() == -1 * i);
     }
@@ -47,15 +51,13 @@ TEST_CASE("HostPipe (separated pipes)", "[HostPipe]")
     REQUIRE(PipeB::empty());
 }
 
-TEST_CASE("HostPipe (continous read/write)", "[HostPipe]")
-{
+TEST_CASE("HostPipe (continous read/write)", "[HostPipe]") {
     using PipeC = HostPipe<class PipeCID, uindex_t>;
 
     PipeC::write(0);
     PipeC::write(1);
 
-    for (uindex_t i = 0; i < 256; i++)
-    {
+    for (uindex_t i = 0; i < 256; i++) {
         REQUIRE(PipeC::read() == i);
         PipeC::write(i + 2);
     }
diff --git a/tests/src/units/Stencil.cpp b/tests/src/units/Stencil.cpp
index b62a451..bed26ad 100644
--- a/tests/src/units/Stencil.cpp
+++ b/tests/src/units/Stencil.cpp
@@ -1,63 +1,62 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-#include "../res/catch.hpp"
-#include "../res/constants.hpp"
 #include <StencilStream/Stencil.hpp>
+#include <res/catch.hpp>
+#include <res/constants.hpp>
 
 using namespace stencil;
 
-TEST_CASE("Stencil::diameter", "[Stencil]")
-{
+TEST_CASE("Stencil::diameter", "[Stencil]") {
     Stencil<index_t, 2> stencil(ID(0, 0), 0, 0, UID(42, 42));
 
     REQUIRE(stencil.diameter == Stencil<index_t, 2>::diameter);
     REQUIRE(stencil.diameter == 2 * stencil_radius + 1);
 };
 
-TEST_CASE("Stencil::operator[](ID)", "[Stencil]")
-{
+TEST_CASE("Stencil::operator[](ID)", "[Stencil]") {
     Stencil<index_t, 2> stencil(ID(0, 0), 0, 0, UID(42, 42));
 
-    for (index_t c = -stencil_radius; c <= index_t(stencil_radius); c++)
-    {
-        for (index_t r = -stencil_radius; r <= index_t(stencil_radius); r++)
-        {
+    for (index_t c = -stencil_radius; c <= index_t(stencil_radius); c++) {
+        for (index_t r = -stencil_radius; r <= index_t(stencil_radius); r++) {
             stencil[ID(c, r)] = c + r;
         }
     }
 
-    for (uindex_t c = 0; c < stencil.diameter; c++)
-    {
-        for (uindex_t r = 0; r < stencil.diameter; r++)
-        {
+    for (uindex_t c = 0; c < stencil.diameter; c++) {
+        for (uindex_t r = 0; r < stencil.diameter; r++) {
             REQUIRE(stencil[UID(c, r)] == index_t(c) + index_t(r) - 2 * stencil_radius);
         }
     }
 };
 
-TEST_CASE("Stencil::operator[](UID)", "[Stencil]")
-{
+TEST_CASE("Stencil::operator[](UID)", "[Stencil]") {
     Stencil<index_t, 2> stencil(ID(0, 0), 0, 0, UID(42, 42));
 
-    for (uindex_t c = 0; c < stencil.diameter; c++)
-    {
-        for (uindex_t r = 0; r < stencil.diameter; r++)
-        {
+    for (uindex_t c = 0; c < stencil.diameter; c++) {
+        for (uindex_t r = 0; r < stencil.diameter; r++) {
             stencil[UID(c, r)] = c + r;
         }
     }
 
-    for (index_t c = -stencil_radius; c <= stencil_radius; c++)
-    {
-        for (index_t r = -stencil_radius; r <= stencil_radius; r++)
-        {
+    for (index_t c = -stencil_radius; c <= stencil_radius; c++) {
+        for (index_t r = -stencil_radius; r <= stencil_radius; r++) {
             REQUIRE(stencil[ID(c, r)] == index_t(c) + index_t(r) + 2 * stencil_radius);
         }
     }
diff --git a/tests/src/units/StencilExecutor.cpp b/tests/src/units/StencilExecutor.cpp
index 1459e71..dc3ddf2 100644
--- a/tests/src/units/StencilExecutor.cpp
+++ b/tests/src/units/StencilExecutor.cpp
@@ -1,94 +1,106 @@
 
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-#include "../res/FPGATransFunc.hpp"
-#include "../res/catch.hpp"
-#include "../res/constants.hpp"
+#include <StencilStream/MonotileExecutor.hpp>
 #include <StencilStream/StencilExecutor.hpp>
+#include <res/TransFuncs.hpp>
+#include <res/catch.hpp>
+#include <res/constants.hpp>
 
 using namespace std;
 using namespace stencil;
 using namespace cl::sycl;
 
 using TransFunc = FPGATransFunc<stencil_radius>;
+using SingleQueueExecutorImpl = SingleQueueExecutor<Cell, stencil_radius, TransFunc>;
+using StencilExecutorImpl = StencilExecutor<Cell, stencil_radius, TransFunc, pipeline_length>;
+using MonotileExecutorImpl = MonotileExecutor<Cell, stencil_radius, TransFunc, pipeline_length>;
 
-TEST_CASE("StencilExecutor::copy_output(cl::sycl::buffer<T, 2>)", "[StencilExecutor]")
-{
+void test_executor_set_input_copy_output(SingleQueueExecutorImpl *executor, uindex_t grid_width,
+                                         uindex_t grid_height) {
     buffer<Cell, 2> in_buffer(range<2>(grid_width, grid_height));
     {
         auto in_buffer_ac = in_buffer.get_access<access::mode::discard_write>();
-        for (uindex_t c = 0; c < grid_width; c++)
-        {
-            for (uindex_t r = 0; r < grid_height; r++)
-            {
+        for (uindex_t c = 0; c < grid_width; c++) {
+            for (uindex_t r = 0; r < grid_height; r++) {
                 in_buffer_ac[c][r] = Cell{index_t(c), index_t(r), 0, CellStatus::Normal};
             }
         }
     }
 
-    StencilExecutor<Cell, stencil_radius, TransFunc> executor(TransFunc::halo(), TransFunc());
-    executor.set_input(in_buffer);
+    executor->set_input(in_buffer);
 
     buffer<Cell, 2> out_buffer(range<2>(grid_width, grid_height));
-    executor.copy_output(out_buffer);
+    executor->copy_output(out_buffer);
 
-    {
-        auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
-        for (uindex_t c = 0; c < grid_width; c++)
-        {
-            for (uindex_t r = 0; r < grid_height; r++)
-            {
-                REQUIRE(out_buffer_ac[c][r].c == c);
-                REQUIRE(out_buffer_ac[c][r].r == r);
-                REQUIRE(out_buffer_ac[c][r].i_generation == 0);
-                REQUIRE(out_buffer_ac[c][r].status == CellStatus::Normal);
-            }
+    auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
+    for (uindex_t c = 0; c < grid_width; c++) {
+        for (uindex_t r = 0; r < grid_height; r++) {
+            REQUIRE(out_buffer_ac[c][r].c == c);
+            REQUIRE(out_buffer_ac[c][r].r == r);
+            REQUIRE(out_buffer_ac[c][r].i_generation == 0);
+            REQUIRE(out_buffer_ac[c][r].status == CellStatus::Normal);
         }
     }
 }
 
-TEST_CASE("StencilExecutor::run(uindex_t)", "[StencilExecutor]")
-{
+TEST_CASE("StencilExecutor::copy_output(cl::sycl::buffer<T, 2>)", "[StencilExecutor]") {
+    StencilExecutorImpl executor(Cell::halo(), TransFunc());
+    test_executor_set_input_copy_output(&executor, grid_width, grid_height);
+}
+
+TEST_CASE("MonotileExecutor::copy_output(cl::sycl::buffer<T, 2>)", "[MonotileExecutor]") {
+    MonotileExecutorImpl executor(Cell::halo(), TransFunc());
+    test_executor_set_input_copy_output(&executor, tile_width - 1, tile_height - 1);
+}
+
+void test_executor_run(SingleQueueExecutorImpl *executor, uindex_t grid_width,
+                       uindex_t grid_height) {
     uindex_t n_generations = 2 * pipeline_length + 1;
 
     buffer<Cell, 2> in_buffer(range<2>(grid_width, grid_height));
     {
         auto in_buffer_ac = in_buffer.get_access<access::mode::discard_write>();
-        for (uindex_t c = 0; c < grid_width; c++)
-        {
-            for (uindex_t r = 0; r < grid_height; r++)
-            {
+        for (uindex_t c = 0; c < grid_width; c++) {
+            for (uindex_t r = 0; r < grid_height; r++) {
                 in_buffer_ac[c][r] = Cell{index_t(c), index_t(r), 0, CellStatus::Normal};
             }
         }
     }
 
-    StencilExecutor<Cell, stencil_radius, TransFunc, pipeline_length> executor(TransFunc::halo(), TransFunc());
-    REQUIRE(executor.get_i_generation() == 0);
+    REQUIRE(executor->get_i_generation() == 0);
 
-    executor.set_input(in_buffer);
-    REQUIRE(executor.get_grid_range().c == grid_width);
-    REQUIRE(executor.get_grid_range().r == grid_height);
+    executor->set_input(in_buffer);
+    REQUIRE(executor->get_grid_range().c == grid_width);
+    REQUIRE(executor->get_grid_range().r == grid_height);
 
-    executor.run(n_generations);
-    REQUIRE(executor.get_i_generation() == n_generations);
+    executor->run(n_generations);
+    REQUIRE(executor->get_i_generation() == n_generations);
 
     buffer<Cell, 2> out_buffer(range<2>(grid_width, grid_height));
-    executor.copy_output(out_buffer);
+    executor->copy_output(out_buffer);
 
     {
         auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
-        for (uindex_t c = 0; c < grid_width; c++)
-        {
-            for (uindex_t r = 0; r < grid_height; r++)
-            {
+        for (uindex_t c = 0; c < grid_width; c++) {
+            for (uindex_t r = 0; r < grid_height; r++) {
                 REQUIRE(out_buffer_ac[c][r].c == c);
                 REQUIRE(out_buffer_ac[c][r].r == r);
                 REQUIRE(out_buffer_ac[c][r].i_generation == n_generations);
@@ -96,4 +108,31 @@ TEST_CASE("StencilExecutor::run(uindex_t)", "[StencilExecutor]")
             }
         }
     }
+
+    // Now, a second run to show that behavior is still correct when i_generation != 0:
+    executor->run(n_generations);
+    REQUIRE(executor->get_i_generation() == 2 * n_generations);
+
+    executor->copy_output(out_buffer);
+    {
+        auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
+        for (uindex_t c = 0; c < grid_width; c++) {
+            for (uindex_t r = 0; r < grid_height; r++) {
+                REQUIRE(out_buffer_ac[c][r].c == c);
+                REQUIRE(out_buffer_ac[c][r].r == r);
+                REQUIRE(out_buffer_ac[c][r].i_generation == 2 * n_generations);
+                REQUIRE(out_buffer_ac[c][r].status == CellStatus::Normal);
+            }
+        }
+    }
+}
+
+TEST_CASE("StencilExecutor::run", "[StencilExecutor]") {
+    StencilExecutorImpl executor(Cell::halo(), TransFunc());
+    test_executor_run(&executor, grid_width, grid_height);
+}
+
+TEST_CASE("MonotileExecutor::run", "[MonotileExecutor]") {
+    MonotileExecutorImpl executor(Cell::halo(), TransFunc());
+    test_executor_run(&executor, grid_width, grid_height);
 }
\ No newline at end of file
diff --git a/tests/src/units/main.cpp b/tests/src/units/main.cpp
index 9fb0461..cf04247 100644
--- a/tests/src/units/main.cpp
+++ b/tests/src/units/main.cpp
@@ -1,11 +1,21 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #define CATCH_CONFIG_MAIN
-#include "../res/catch.hpp"
\ No newline at end of file
+#include <res/catch.hpp>
\ No newline at end of file
diff --git a/tests/src/units/monotile/ExecutionKernel.cpp b/tests/src/units/monotile/ExecutionKernel.cpp
new file mode 100644
index 0000000..b6db0c0
--- /dev/null
+++ b/tests/src/units/monotile/ExecutionKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <StencilStream/monotile/ExecutionKernel.hpp>
+#include <res/HostPipe.hpp>
+#include <res/TransFuncs.hpp>
+#include <res/catch.hpp>
+#include <res/constants.hpp>
+
+using namespace stencil;
+using namespace std;
+using namespace cl::sycl;
+
+void test_monotile_kernel(uindex_t n_generations) {
+    using TransFunc = HostTransFunc<stencil_radius>;
+    using in_pipe = HostPipe<class MonotileExecutionKernelInPipeID, Cell>;
+    using out_pipe = HostPipe<class MonotileExecutionKernelOutPipeID, Cell>;
+    using TestExecutionKernel =
+        monotile::ExecutionKernel<TransFunc, Cell, stencil_radius, pipeline_length, tile_width,
+                                tile_height, in_pipe, out_pipe>;
+
+    for (uindex_t c = 0; c < tile_width; c++) {
+        for (uindex_t r = 0; r < tile_height; r++) {
+            in_pipe::write(Cell{index_t(c), index_t(r), 0, CellStatus::Normal});
+        }
+    }
+
+    TestExecutionKernel(TransFunc(), 0, n_generations, tile_width, tile_height, Cell::halo())();
+
+    buffer<Cell, 2> output_buffer(range<2>(tile_width, tile_height));
+
+    {
+        auto output_buffer_ac = output_buffer.get_access<access::mode::discard_write>();
+        for (uindex_t c = 0; c < tile_width; c++) {
+            for (uindex_t r = 0; r < tile_height; r++) {
+                output_buffer_ac[c][r] = out_pipe::read();
+            }
+        }
+    }
+
+    REQUIRE(in_pipe::empty());
+    REQUIRE(out_pipe::empty());
+
+    auto output_buffer_ac = output_buffer.get_access<access::mode::read>();
+    for (uindex_t c = 1; c < tile_width; c++) {
+        for (uindex_t r = 1; r < tile_height; r++) {
+            Cell cell = output_buffer_ac[c][r];
+            REQUIRE(cell.c == c);
+            REQUIRE(cell.r == r);
+            REQUIRE(cell.i_generation == n_generations);
+            REQUIRE(cell.status == CellStatus::Normal);
+        }
+    }
+}
+
+TEST_CASE("monotile::ExecutionKernel", "[monotile::ExecutionKernel]") {
+    test_monotile_kernel(pipeline_length);
+}
+
+TEST_CASE("monotile::ExecutionKernel (partial pipeline)", "[monotile::ExecutionKernel]") {
+    static_assert(pipeline_length != 1);
+    test_monotile_kernel(pipeline_length - 1);
+}
+
+TEST_CASE("monotile::ExecutionKernel (noop)", "[monotile::ExecutionKernel]") {
+    test_monotile_kernel(0);
+}
+
+TEST_CASE("monotile::ExecutionKernel: Incomplete Pipeline with i_generation != 0",
+          "[monotile::ExecutionKernel]") {
+    using Cell = uint8_t;
+    auto trans_func = [](Stencil<Cell, 1> const &stencil) { return stencil[ID(0, 0)] + 1; };
+
+    using in_pipe = HostPipe<class IncompletePipelineInPipeID, Cell>;
+    using out_pipe = HostPipe<class IncompletePipelineOutPipeID, Cell>;
+    using TestExecutionKernel =
+        monotile::ExecutionKernel<decltype(trans_func), Cell, 1, 16, 64, 64, in_pipe, out_pipe>;
+
+    for (int c = 0; c < 64; c++) {
+        for (int r = 0; r < 64; r++) {
+            in_pipe::write(0);
+        }
+    }
+
+    TestExecutionKernel kernel(trans_func, 16, 20, 64, 64, 0);
+    kernel.operator()();
+
+    REQUIRE(in_pipe::empty());
+
+    for (int c = 0; c < 64; c++) {
+        for (int r = 0; r < 64; r++) {
+            REQUIRE(out_pipe::read() == 4);
+        }
+    }
+
+    REQUIRE(out_pipe::empty());
+}
\ No newline at end of file
diff --git a/tests/src/units/ExecutionKernel.cpp b/tests/src/units/tiling/ExecutionKernel.cpp
similarity index 50%
rename from tests/src/units/ExecutionKernel.cpp
rename to tests/src/units/tiling/ExecutionKernel.cpp
index eb83976..3dbd960 100644
--- a/tests/src/units/ExecutionKernel.cpp
+++ b/tests/src/units/tiling/ExecutionKernel.cpp
@@ -1,54 +1,60 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-#include "../res/FPGATransFunc.hpp"
-#include "../res/HostPipe.hpp"
-#include "../res/catch.hpp"
-#include "../res/constants.hpp"
-#include <StencilStream/ExecutionKernel.hpp>
+#include <StencilStream/tiling/ExecutionKernel.hpp>
+#include <res/HostPipe.hpp>
+#include <res/TransFuncs.hpp>
+#include <res/catch.hpp>
+#include <res/constants.hpp>
 
 using namespace stencil;
 using namespace std;
+using namespace stencil::tiling;
 using namespace cl::sycl;
 
-void test_kernel(uindex_t n_generations)
-{
+void test_tiling_kernel(uindex_t n_generations) {
     using TransFunc = FPGATransFunc<stencil_radius>;
-    using in_pipe = HostPipe<class ExecutionKernelInPipeID, Cell>;
-    using out_pipe = HostPipe<class ExecutionKernelOutPipeID, Cell>;
-    using TestExecutionKernel = ExecutionKernel<TransFunc, Cell, stencil_radius, pipeline_length, tile_width, tile_height, in_pipe, out_pipe>;
-
-    for (index_t c = -halo_radius; c < index_t(halo_radius + tile_width); c++)
-    {
-        for (index_t r = -halo_radius; r < index_t(halo_radius + tile_height); r++)
-        {
-            if (c >= index_t(0) && c < index_t(tile_width) && r >= index_t(0) && r < index_t(tile_height))
-            {
+    using in_pipe = HostPipe<class TilingExecutionKernelInPipeID, Cell>;
+    using out_pipe = HostPipe<class TilingExecutionKernelOutPipeID, Cell>;
+    using TestExecutionKernel = ExecutionKernel<TransFunc, Cell, stencil_radius, pipeline_length,
+                                                tile_width, tile_height, in_pipe, out_pipe>;
+
+    for (index_t c = -halo_radius; c < index_t(halo_radius + tile_width); c++) {
+        for (index_t r = -halo_radius; r < index_t(halo_radius + tile_height); r++) {
+            if (c >= index_t(0) && c < index_t(tile_width) && r >= index_t(0) &&
+                r < index_t(tile_height)) {
                 in_pipe::write(Cell{c, r, 0, CellStatus::Normal});
-            }
-            else
-            {
-                in_pipe::write(TransFunc::halo());
+            } else {
+                in_pipe::write(Cell::halo());
             }
         }
     }
 
-    TestExecutionKernel(TransFunc(), 0, n_generations, 0, 0, tile_width, tile_height, TransFunc::halo())();
+    TestExecutionKernel(TransFunc(), 0, n_generations, 0, 0, tile_width, tile_height,
+                        Cell::halo())();
 
     buffer<Cell, 2> output_buffer(range<2>(tile_width, tile_height));
 
     {
         auto output_buffer_ac = output_buffer.get_access<access::mode::discard_write>();
-        for (uindex_t c = 0; c < tile_width; c++)
-        {
-            for (uindex_t r = 0; r < tile_height; r++)
-            {
+        for (uindex_t c = 0; c < tile_width; c++) {
+            for (uindex_t r = 0; r < tile_height; r++) {
                 output_buffer_ac[c][r] = out_pipe::read();
             }
         }
@@ -58,10 +64,8 @@ void test_kernel(uindex_t n_generations)
     REQUIRE(out_pipe::empty());
 
     auto output_buffer_ac = output_buffer.get_access<access::mode::read>();
-    for (uindex_t c = 1; c < tile_width; c++)
-    {
-        for (uindex_t r = 1; r < tile_height; r++)
-        {
+    for (uindex_t c = 1; c < tile_width; c++) {
+        for (uindex_t r = 1; r < tile_height; r++) {
             Cell cell = output_buffer_ac[c][r];
             REQUIRE(cell.c == c);
             REQUIRE(cell.r == r);
@@ -71,43 +75,30 @@ void test_kernel(uindex_t n_generations)
     }
 }
 
-TEST_CASE("ExecutionKernel", "[ExecutionKernel]")
-{
-    test_kernel(pipeline_length);
+TEST_CASE("tiling::ExecutionKernel", "[tiling::ExecutionKernel]") {
+    test_tiling_kernel(pipeline_length);
 }
 
-TEST_CASE("ExecutionKernel (partial pipeline)", "[ExecutionKernel]")
-{
+TEST_CASE("tiling::ExecutionKernel (partial pipeline)", "[tiling::ExecutionKernel]") {
     static_assert(pipeline_length != 1);
-    test_kernel(pipeline_length - 1);
+    test_tiling_kernel(pipeline_length - 1);
 }
 
-TEST_CASE("ExecutionKernel (noop)", "[ExecutionKernel]")
-{
-    test_kernel(0);
-}
+TEST_CASE("tiling::ExecutionKernel (noop)", "[tiling::ExecutionKernel]") { test_tiling_kernel(0); }
 
-TEST_CASE("Halo values inside the pipeline are handled correctly", "[ExecutionKernel]")
-{
-    auto my_kernel = [=](Stencil<bool, stencil_radius> const &stencil)
-    {
+TEST_CASE("Halo values inside the pipeline are handled correctly", "[tiling::ExecutionKernel]") {
+    auto my_kernel = [=](Stencil<bool, stencil_radius> const &stencil) {
         ID idx = stencil.id;
         bool is_valid = true;
-        if (idx.c == 0)
-        {
+        if (idx.c == 0) {
             is_valid &= stencil[ID(-1, -1)] && stencil[ID(-1, 0)] && stencil[ID(-1, 1)];
-        }
-        else if (idx.c == tile_width - 1)
-        {
+        } else if (idx.c == tile_width - 1) {
             is_valid &= stencil[ID(1, -1)] && stencil[ID(1, 0)] && stencil[ID(1, 1)];
         }
 
-        if (idx.r == 0)
-        {
+        if (idx.r == 0) {
             is_valid &= stencil[ID(-1, -1)] && stencil[ID(0, -1)] && stencil[ID(1, -1)];
-        }
-        else if (idx.r == tile_height - 1)
-        {
+        } else if (idx.r == tile_height - 1) {
             is_valid &= stencil[ID(-1, 1)] && stencil[ID(0, 1)] && stencil[ID(1, 1)];
         }
 
@@ -116,22 +107,20 @@ TEST_CASE("Halo values inside the pipeline are handled correctly", "[ExecutionKe
 
     using in_pipe = HostPipe<class HaloValueTestInPipeID, bool>;
     using out_pipe = HostPipe<class HaloValueTestOutPipeID, bool>;
-    using TestExecutionKernel = ExecutionKernel<decltype(my_kernel), bool, stencil_radius, pipeline_length, tile_width, tile_height, in_pipe, out_pipe>;
+    using TestExecutionKernel =
+        ExecutionKernel<decltype(my_kernel), bool, stencil_radius, pipeline_length, tile_width,
+                        tile_height, in_pipe, out_pipe>;
 
-    for (index_t c = -halo_radius; c < index_t(halo_radius + tile_width); c++)
-    {
-        for (index_t r = -halo_radius; r < index_t(halo_radius + tile_height); r++)
-        {
+    for (index_t c = -halo_radius; c < index_t(halo_radius + tile_width); c++) {
+        for (index_t r = -halo_radius; r < index_t(halo_radius + tile_height); r++) {
             in_pipe::write(false);
         }
     }
 
     TestExecutionKernel(my_kernel, 0, pipeline_length, 0, 0, tile_width, tile_height, true)();
 
-    for (uindex_t c = 0; c < tile_width; c++)
-    {
-        for (uindex_t r = 0; r < tile_height; r++)
-        {
+    for (uindex_t c = 0; c < tile_width; c++) {
+        for (uindex_t r = 0; r < tile_height; r++) {
             REQUIRE(out_pipe::read());
         }
     }
@@ -140,19 +129,17 @@ TEST_CASE("Halo values inside the pipeline are handled correctly", "[ExecutionKe
     REQUIRE(out_pipe::empty());
 }
 
-TEST_CASE("Incomplete Pipeline with i_generation != 0", "[ExecutionKernel]")
-{
+TEST_CASE("Incomplete Pipeline with i_generation != 0", "[tiling::ExecutionKernel]") {
     using Cell = uint8_t;
-    auto trans_func = [](Stencil<Cell, 1> const &stencil) {
-        return stencil[ID(0,0)] + 1;
-    };
+    auto trans_func = [](Stencil<Cell, 1> const &stencil) { return stencil[ID(0, 0)] + 1; };
 
     using in_pipe = HostPipe<class IncompletePipelineInPipeID, Cell>;
     using out_pipe = HostPipe<class IncompletePipelineOutPipeID, Cell>;
-    using TestExecutionKernel = ExecutionKernel<decltype(trans_func), Cell, 1, 16, 64, 64, in_pipe, out_pipe>;
+    using TestExecutionKernel =
+        ExecutionKernel<decltype(trans_func), Cell, 1, 16, 64, 64, in_pipe, out_pipe>;
 
-    for (int c = -16; c < 16+64; c++) {
-        for (int r = -16; r < 16+64; r++) {
+    for (int c = -16; c < 16 + 64; c++) {
+        for (int r = -16; r < 16 + 64; r++) {
             in_pipe::write(0);
         }
     }
diff --git a/tests/src/units/tiling/Grid.cpp b/tests/src/units/tiling/Grid.cpp
new file mode 100644
index 0000000..6517522
--- /dev/null
+++ b/tests/src/units/tiling/Grid.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <CL/sycl.hpp>
+#include <CL/sycl/INTEL/fpga_extensions.hpp>
+#include <StencilStream/tiling/Grid.hpp>
+#include <res/catch.hpp>
+#include <res/constants.hpp>
+#include <unordered_set>
+
+using namespace stencil;
+using namespace stencil::tiling;
+using namespace cl::sycl;
+using namespace std;
+
+const uindex_t add_grid_width = grid_width + 1;
+const uindex_t add_grid_height = grid_height + 1;
+
+using TestGrid = Grid<ID, tile_width, tile_height, halo_radius, burst_length>;
+
+TEST_CASE("Grid::Grid(uindex_t, uindex_t, T)", "[Grid]") {
+    TestGrid grid(add_grid_width, add_grid_height);
+
+    UID tile_range = grid.get_tile_range();
+    REQUIRE(tile_range.c == add_grid_width / tile_width + 1);
+    REQUIRE(tile_range.r == add_grid_height / tile_height + 1);
+}
+
+TEST_CASE("Grid::Grid(cl::sycl::buffer<T, 2>, T)", "[Grid]") {
+    buffer<ID, 2> in_buffer(range<2>(add_grid_width, add_grid_height));
+    {
+        auto in_buffer_ac = in_buffer.get_access<access::mode::discard_write>();
+        for (uindex_t c = 0; c < add_grid_width; c++) {
+            for (uindex_t r = 0; r < add_grid_height; r++) {
+                in_buffer_ac[c][r] = ID(c, r);
+            }
+        }
+    }
+
+    TestGrid grid(in_buffer);
+
+    UID tile_range = grid.get_tile_range();
+    REQUIRE(tile_range.c == add_grid_width / tile_width + 1);
+    REQUIRE(tile_range.r == add_grid_height / tile_height + 1);
+
+    buffer<ID, 2> out_buffer(range<2>(add_grid_width, add_grid_height));
+    grid.copy_to(out_buffer);
+
+    {
+        auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
+        for (uindex_t c = 0; c < add_grid_width; c++) {
+            for (uindex_t r = 0; r < add_grid_height; r++) {
+                REQUIRE(out_buffer_ac[c][r].c == c);
+                REQUIRE(out_buffer_ac[c][r].r == r);
+            }
+        }
+    }
+}
+
+TEST_CASE("Grid::submit_tile_input", "[Grid]") {
+    using grid_in_pipe = pipe<class grid_in_pipe_id, ID>;
+
+    buffer<ID, 2> in_buffer(range<2>(3 * tile_width, 3 * tile_height));
+    buffer<ID, 2> out_buffer(range<2>(2 * halo_radius + tile_width, 2 * halo_radius + tile_height));
+
+#ifdef HARDWARE
+    INTEL::fpga_selector device_selector;
+#else
+    INTEL::fpga_emulator_selector device_selector;
+#endif
+    cl::sycl::queue working_queue(device_selector);
+
+    {
+        auto in_buffer_ac = in_buffer.get_access<access::mode::discard_write>();
+        for (uindex_t c = 0; c < 3 * tile_width; c++) {
+            for (uindex_t r = 0; r < 3 * tile_height; r++) {
+                in_buffer_ac[c][r] = ID(c, r);
+            }
+        }
+    }
+
+    TestGrid grid(in_buffer);
+    grid.submit_tile_input<grid_in_pipe>(working_queue, UID(1, 1));
+
+    working_queue.submit([&](handler &cgh) {
+        auto out_buffer_ac = out_buffer.get_access<access::mode::discard_write>(cgh);
+
+        cgh.single_task<class input_test_kernel>([=]() {
+            for (uindex_t c = 0; c < 2 * halo_radius + tile_width; c++) {
+                for (uindex_t r = 0; r < 2 * halo_radius + tile_height; r++) {
+                    out_buffer_ac[c][r] = grid_in_pipe::read();
+                }
+            }
+        });
+    });
+
+    auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
+
+    for (uindex_t c = 0; c < 2 * halo_radius + tile_width; c++) {
+        for (uindex_t r = 0; r < 2 * halo_radius + tile_height; r++) {
+            REQUIRE(out_buffer_ac[c][r].c == c + tile_width - halo_radius);
+            REQUIRE(out_buffer_ac[c][r].r == r + tile_height - halo_radius);
+        }
+    }
+}
+
+TEST_CASE("Grid::submit_tile_output", "[Grid]") {
+    using grid_out_pipe = pipe<class grid_out_pipe_id, ID>;
+
+    TestGrid grid(tile_width, tile_height);
+
+#ifdef HARDWARE
+    INTEL::fpga_selector device_selector;
+#else
+    INTEL::fpga_emulator_selector device_selector;
+#endif
+    cl::sycl::queue working_queue(device_selector);
+
+    working_queue.submit([&](handler &cgh) {
+        cgh.single_task<class output_test_kernel>([=]() {
+            for (uindex_t c = 0; c < tile_width; c++) {
+                for (uindex_t r = 0; r < tile_height; r++) {
+                    grid_out_pipe::write(ID(c, r));
+                }
+            }
+        });
+    });
+
+    grid.submit_tile_output<grid_out_pipe>(working_queue, UID(0, 0));
+
+    buffer<ID, 2> out_buffer(range<2>(tile_width, tile_height));
+    grid.copy_to(out_buffer);
+
+    auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
+    for (uindex_t c = 0; c < tile_width; c++) {
+        for (uindex_t r = 0; r < tile_height; r++) {
+            REQUIRE(out_buffer_ac[c][r].r == r);
+            REQUIRE(out_buffer_ac[c][r].c == c);
+        }
+    }
+}
diff --git a/tests/src/units/IOKernel.cpp b/tests/src/units/tiling/IOKernel.cpp
similarity index 52%
rename from tests/src/units/IOKernel.cpp
rename to tests/src/units/tiling/IOKernel.cpp
index 7f38449..b2a8971 100644
--- a/tests/src/units/IOKernel.cpp
+++ b/tests/src/units/tiling/IOKernel.cpp
@@ -1,21 +1,32 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-#include "../res/catch.hpp"
-#include "../res/constants.hpp"
+#include <res/HostPipe.hpp>
+#include <res/catch.hpp>
+#include <res/constants.hpp>
 #include <CL/sycl/buffer.hpp>
 #include <CL/sycl/range.hpp>
 #include <StencilStream/GenericID.hpp>
-#include "../res/HostPipe.hpp"
-#include <StencilStream/IOKernel.hpp>
+#include <StencilStream/tiling/IOKernel.hpp>
 
 using namespace cl::sycl;
+using namespace stencil::tiling;
 using namespace stencil;
 using namespace std;
 
@@ -23,27 +34,23 @@ constexpr uindex_t n_bursts = 10;
 constexpr uindex_t n_halo_cells = 10;
 static_assert(burst_length > n_halo_cells);
 
-TEST_CASE("IOKernel::read()", "[IOKernel]")
-{
-    buffer<UID, 2> in_buffer[5] = {
-        buffer<UID, 2>(range<2>(corner_bursts, burst_length)),
-        buffer<UID, 2>(range<2>(corner_bursts, burst_length)),
-        buffer<UID, 2>(range<2>(vertical_border_bursts, burst_length)),
-        buffer<UID, 2>(range<2>(corner_bursts, burst_length)),
-        buffer<UID, 2>(range<2>(corner_bursts, burst_length))};
+TEST_CASE("IOKernel::read()", "[IOKernel]") {
+    buffer<UID, 2> in_buffer[5] = {buffer<UID, 2>(range<2>(corner_bursts, burst_length)),
+                                   buffer<UID, 2>(range<2>(corner_bursts, burst_length)),
+                                   buffer<UID, 2>(range<2>(vertical_border_bursts, burst_length)),
+                                   buffer<UID, 2>(range<2>(corner_bursts, burst_length)),
+                                   buffer<UID, 2>(range<2>(corner_bursts, burst_length))};
 
     {
-        accessor<UID, 2, access::mode::discard_write, access::target::host_buffer> in_buffer_ac[5] = {
-            in_buffer[0].get_access<access::mode::discard_write>(),
-            in_buffer[1].get_access<access::mode::discard_write>(),
-            in_buffer[2].get_access<access::mode::discard_write>(),
-            in_buffer[3].get_access<access::mode::discard_write>(),
-            in_buffer[4].get_access<access::mode::discard_write>()};
-
-        for (uindex_t burst_i = 0; burst_i < corner_bursts; burst_i++)
-        {
-            for (uindex_t cell_i = 0; cell_i < burst_length; cell_i++)
-            {
+        accessor<UID, 2, access::mode::discard_write, access::target::host_buffer> in_buffer_ac[5] =
+            {in_buffer[0].get_access<access::mode::discard_write>(),
+             in_buffer[1].get_access<access::mode::discard_write>(),
+             in_buffer[2].get_access<access::mode::discard_write>(),
+             in_buffer[3].get_access<access::mode::discard_write>(),
+             in_buffer[4].get_access<access::mode::discard_write>()};
+
+        for (uindex_t burst_i = 0; burst_i < corner_bursts; burst_i++) {
+            for (uindex_t cell_i = 0; cell_i < burst_length; cell_i++) {
                 in_buffer_ac[0][burst_i][cell_i] = UID(burst_i, cell_i);
                 in_buffer_ac[1][burst_i][cell_i] = UID(burst_i, cell_i);
                 in_buffer_ac[3][burst_i][cell_i] = UID(burst_i, cell_i);
@@ -51,33 +58,23 @@ TEST_CASE("IOKernel::read()", "[IOKernel]")
             }
         }
 
-        for (uindex_t burst_i = 0; burst_i < vertical_border_bursts; burst_i++)
-        {
-            for (uindex_t cell_i = 0; cell_i < burst_length; cell_i++)
-            {
+        for (uindex_t burst_i = 0; burst_i < vertical_border_bursts; burst_i++) {
+            for (uindex_t cell_i = 0; cell_i < burst_length; cell_i++) {
                 in_buffer_ac[2][burst_i][cell_i] = UID(burst_i, cell_i);
             }
         }
     }
 
     using in_pipe = HostPipe<class BufferedInputKernelPipeID, UID>;
-    using InputKernel = IOKernel<
-        UID,
-        halo_radius,
-        core_height,
-        burst_length,
-        in_pipe,
-        2,
-        access::mode::read,
-        access::target::host_buffer>;
+    using InputKernel = IOKernel<UID, halo_radius, core_height, burst_length, in_pipe, 2,
+                                 access::mode::read, access::target::host_buffer>;
 
     {
-        array<InputKernel::Accessor, 5> accessors = {
-            in_buffer[0].get_access<access::mode::read>(),
-            in_buffer[1].get_access<access::mode::read>(),
-            in_buffer[2].get_access<access::mode::read>(),
-            in_buffer[3].get_access<access::mode::read>(),
-            in_buffer[4].get_access<access::mode::read>()};
+        array<InputKernel::Accessor, 5> accessors = {in_buffer[0].get_access<access::mode::read>(),
+                                                     in_buffer[1].get_access<access::mode::read>(),
+                                                     in_buffer[2].get_access<access::mode::read>(),
+                                                     in_buffer[3].get_access<access::mode::read>(),
+                                                     in_buffer[4].get_access<access::mode::read>()};
 
         InputKernel kernel(accessors, halo_radius);
         kernel.read();
@@ -85,20 +82,16 @@ TEST_CASE("IOKernel::read()", "[IOKernel]")
 
     buffer<UID, 2> out_buffer(range<2>(halo_radius, 2 * halo_radius + tile_height));
     auto out_buffer_ac = out_buffer.get_access<access::mode::read_write>();
-    for (uindex_t c = 0; c < halo_radius; c++)
-    {
-        for (uindex_t r = 0; r < 2 * halo_radius + tile_height; r++)
-        {
+    for (uindex_t c = 0; c < halo_radius; c++) {
+        for (uindex_t r = 0; r < 2 * halo_radius + tile_height; r++) {
             out_buffer_ac[c][r] = in_pipe::read();
         }
     }
     REQUIRE(in_pipe::empty());
 
     CounterID<uindex_t> counter(0, 0, corner_bursts, burst_length);
-    for (uindex_t c = 0; c < halo_radius; c++)
-    {
-        for (uindex_t r = 0; r < halo_radius; r++)
-        {
+    for (uindex_t c = 0; c < halo_radius; c++) {
+        for (uindex_t r = 0; r < halo_radius; r++) {
             REQUIRE(out_buffer_ac[c][r].c == counter.c);
             REQUIRE(out_buffer_ac[c][r].r == counter.r);
             REQUIRE(out_buffer_ac[c][r + halo_radius].c == counter.c);
@@ -112,10 +105,8 @@ TEST_CASE("IOKernel::read()", "[IOKernel]")
     }
 
     counter = CounterID<uindex_t>(0, 0, vertical_border_bursts, burst_length);
-    for (uindex_t c = 0; c < halo_radius; c++)
-    {
-        for (uindex_t r = 0; r < core_height; r++)
-        {
+    for (uindex_t c = 0; c < halo_radius; c++) {
+        for (uindex_t r = 0; r < core_height; r++) {
             REQUIRE(out_buffer_ac[c][r + 2 * halo_radius].c == counter.c);
             REQUIRE(out_buffer_ac[c][r + 2 * halo_radius].r == counter.r);
             counter++;
@@ -123,16 +114,13 @@ TEST_CASE("IOKernel::read()", "[IOKernel]")
     }
 }
 
-TEST_CASE("IOKernel::write()", "[IOKernel]")
-{
+TEST_CASE("IOKernel::write()", "[IOKernel]") {
     buffer<UID, 2> in_buffer(range<2>(halo_radius, tile_height));
     auto in_buffer_ac = in_buffer.get_access<access::mode::read_write>();
 
     CounterID<uindex_t> counter(0, 0, corner_bursts, burst_length);
-    for (uindex_t c = 0; c < halo_radius; c++)
-    {
-        for (uindex_t r = 0; r < halo_radius; r++)
-        {
+    for (uindex_t c = 0; c < halo_radius; c++) {
+        for (uindex_t r = 0; r < halo_radius; r++) {
             in_buffer_ac[c][r] = counter;
             in_buffer_ac[c][r + halo_radius + core_height] = counter;
             counter++;
@@ -140,10 +128,8 @@ TEST_CASE("IOKernel::write()", "[IOKernel]")
     }
 
     counter = CounterID<uindex_t>(0, 0, vertical_border_bursts, burst_length);
-    for (uindex_t c = 0; c < halo_radius; c++)
-    {
-        for (uindex_t r = 0; r < core_height; r++)
-        {
+    for (uindex_t c = 0; c < halo_radius; c++) {
+        for (uindex_t r = 0; r < core_height; r++) {
             in_buffer_ac[c][r + halo_radius] = counter;
             counter++;
         }
@@ -151,28 +137,18 @@ TEST_CASE("IOKernel::write()", "[IOKernel]")
 
     using out_pipe = HostPipe<class BufferedOutputKernelPipeID, UID>;
 
-    for (uindex_t c = 0; c < halo_radius; c++)
-    {
-        for (uindex_t r = 0; r < tile_height; r++)
-        {
+    for (uindex_t c = 0; c < halo_radius; c++) {
+        for (uindex_t r = 0; r < tile_height; r++) {
             out_pipe::write(in_buffer_ac[c][r]);
         }
     }
 
-    buffer<UID, 2> out_buffer[3] = {
-        buffer<UID, 2>(range<2>(corner_bursts, burst_length)),
-        buffer<UID, 2>(range<2>(vertical_border_bursts, burst_length)),
-        buffer<UID, 2>(range<2>(corner_bursts, burst_length))};
-
-    using OutputKernel = IOKernel<
-        UID,
-        halo_radius,
-        core_height,
-        burst_length,
-        out_pipe,
-        1,
-        access::mode::read_write,
-        access::target::host_buffer>;
+    buffer<UID, 2> out_buffer[3] = {buffer<UID, 2>(range<2>(corner_bursts, burst_length)),
+                                    buffer<UID, 2>(range<2>(vertical_border_bursts, burst_length)),
+                                    buffer<UID, 2>(range<2>(corner_bursts, burst_length))};
+
+    using OutputKernel = IOKernel<UID, halo_radius, core_height, burst_length, out_pipe, 1,
+                                  access::mode::read_write, access::target::host_buffer>;
 
     array<OutputKernel::Accessor, 3> out_buffer_ac = {
         out_buffer[0].get_access<access::mode::read_write>(),
@@ -185,10 +161,8 @@ TEST_CASE("IOKernel::write()", "[IOKernel]")
     REQUIRE(out_pipe::empty());
 
     counter = CounterID<uindex_t>(0, 0, corner_bursts, burst_length);
-    for (uindex_t c = 0; c < halo_radius; c++)
-    {
-        for (uindex_t r = 0; r < halo_radius; r++)
-        {
+    for (uindex_t c = 0; c < halo_radius; c++) {
+        for (uindex_t r = 0; r < halo_radius; r++) {
             REQUIRE(in_buffer_ac[c][r].c == counter.c);
             REQUIRE(in_buffer_ac[c][r].r == counter.r);
             REQUIRE(in_buffer_ac[c][r + halo_radius + core_height].c == counter.c);
@@ -198,10 +172,8 @@ TEST_CASE("IOKernel::write()", "[IOKernel]")
     }
 
     counter = CounterID<uindex_t>(0, 0, vertical_border_bursts, burst_length);
-    for (uindex_t c = 0; c < halo_radius; c++)
-    {
-        for (uindex_t r = 0; r < core_height; r++)
-        {
+    for (uindex_t c = 0; c < halo_radius; c++) {
+        for (uindex_t r = 0; r < core_height; r++) {
             REQUIRE(in_buffer_ac[c][r + halo_radius].c == counter.c);
             REQUIRE(in_buffer_ac[c][r + halo_radius].r == counter.r);
             counter++;
diff --git a/tests/src/units/Tile.cpp b/tests/src/units/tiling/Tile.cpp
similarity index 59%
rename from tests/src/units/Tile.cpp
rename to tests/src/units/tiling/Tile.cpp
index 3d16979..cb693eb 100644
--- a/tests/src/units/Tile.cpp
+++ b/tests/src/units/tiling/Tile.cpp
@@ -1,30 +1,39 @@
 /*
- * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn University
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * Copyright © 2020-2021 Jan-Oliver Opdenhövel, Paderborn Center for Parallel Computing, Paderborn
+ * University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ * associated documentation files (the “Software”), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-#include "../res/catch.hpp"
-#include "../res/constants.hpp"
 #include <CL/sycl.hpp>
 #include <StencilStream/GenericID.hpp>
-#include <StencilStream/Tile.hpp>
+#include <StencilStream/tiling/Tile.hpp>
+#include <res/catch.hpp>
+#include <res/constants.hpp>
 
 using namespace stencil;
 using namespace cl::sycl;
+using namespace stencil::tiling;
 using namespace std;
 
 using TileImpl = Tile<ID, tile_width, tile_height, halo_radius, burst_length>;
 
-TEST_CASE("Tile::operator[]", "[Tile]")
-{
+TEST_CASE("Tile::operator[]", "[Tile]") {
     TileImpl tile;
 
-    for (TileImpl::Part part_type : TileImpl::all_parts)
-    {
+    for (TileImpl::Part part_type : TileImpl::all_parts) {
         auto part = tile[part_type];
 
         REQUIRE(part.get_range()[1] == burst_length);
@@ -36,16 +45,13 @@ TEST_CASE("Tile::operator[]", "[Tile]")
     }
 }
 
-void copy_from_test_impl(uindex_t tile_width, uindex_t tile_height)
-{
+void copy_from_test_impl(uindex_t tile_width, uindex_t tile_height) {
     buffer<ID, 2> in_buffer(range<2>(tile_width, tile_height));
 
     {
         auto in_buffer_ac = in_buffer.get_access<access::mode::discard_write>();
-        for (uindex_t c = 0; c < tile_width; c++)
-        {
-            for (uindex_t r = 0; r < tile_height; r++)
-            {
+        for (uindex_t c = 0; c < tile_width; c++) {
+            for (uindex_t r = 0; r < tile_height; r++) {
                 in_buffer_ac[c][r] = ID(c, r);
             }
         }
@@ -54,31 +60,24 @@ void copy_from_test_impl(uindex_t tile_width, uindex_t tile_height)
     TileImpl tile;
     tile.copy_from(in_buffer, id<2>(0, 0));
 
-    for (TileImpl::Part part : TileImpl::all_parts)
-    {
+    for (TileImpl::Part part : TileImpl::all_parts) {
         auto true_range = TileImpl::get_part_range(part);
         auto content_offset = TileImpl::get_part_offset(part);
         auto part_ac = tile[part].get_access<access::mode::read>();
         uindex_t i_cell = 0;
         uindex_t i_burst = 0;
 
-        for (uindex_t c = 0; c < true_range[0]; c++)
-        {
-            for (uindex_t r = 0; r < true_range[1]; r++)
-            {
-                if (c + content_offset[0] < tile_width && r + content_offset[1] < tile_height)
-                {
+        for (uindex_t c = 0; c < true_range[0]; c++) {
+            for (uindex_t r = 0; r < true_range[1]; r++) {
+                if (c + content_offset[0] < tile_width && r + content_offset[1] < tile_height) {
                     REQUIRE(part_ac[i_burst][i_cell].c == c + content_offset[0]);
                     REQUIRE(part_ac[i_burst][i_cell].r == r + content_offset[1]);
                 }
 
-                if (i_cell == burst_length - 1)
-                {
+                if (i_cell == burst_length - 1) {
                     i_cell = 0;
                     i_burst++;
-                }
-                else
-                {
+                } else {
                     i_cell++;
                 }
             }
@@ -86,39 +85,31 @@ void copy_from_test_impl(uindex_t tile_width, uindex_t tile_height)
     }
 }
 
-TEST_CASE("Tile::copy_from", "[Tile]")
-{
+TEST_CASE("Tile::copy_from", "[Tile]") {
     // Test for the full tile.
     copy_from_test_impl(tile_width, tile_height);
     // Test for a partial buffer.
     copy_from_test_impl(tile_width - 2 * halo_radius, tile_height - 2 * halo_radius);
 }
 
-void copy_to_test_impl(uindex_t tile_width, uindex_t tile_height)
-{
+void copy_to_test_impl(uindex_t tile_width, uindex_t tile_height) {
     TileImpl tile;
 
-    for (TileImpl::Part part : TileImpl::all_parts)
-    {
+    for (TileImpl::Part part : TileImpl::all_parts) {
         auto true_range = TileImpl::get_part_range(part);
         auto content_offset = TileImpl::get_part_offset(part);
         auto part_ac = tile[part].get_access<access::mode::read_write>();
         uindex_t i_cell = 0;
         uindex_t i_burst = 0;
 
-        for (uindex_t c = 0; c < true_range[0]; c++)
-        {
-            for (uindex_t r = 0; r < true_range[1]; r++)
-            {
+        for (uindex_t c = 0; c < true_range[0]; c++) {
+            for (uindex_t r = 0; r < true_range[1]; r++) {
                 part_ac[i_burst][i_cell] = ID(c + content_offset[0], r + content_offset[1]);
 
-                if (i_cell == burst_length - 1)
-                {
+                if (i_cell == burst_length - 1) {
                     i_cell = 0;
                     i_burst++;
-                }
-                else
-                {
+                } else {
                     i_cell++;
                 }
             }
@@ -129,18 +120,15 @@ void copy_to_test_impl(uindex_t tile_width, uindex_t tile_height)
     tile.copy_to(out_buffer, id<2>(0, 0));
 
     auto out_buffer_ac = out_buffer.get_access<access::mode::read>();
-    for (uindex_t c = 0; c < tile_width; c++)
-    {
-        for (uindex_t r = 0; r < tile_height; r++)
-        {
+    for (uindex_t c = 0; c < tile_width; c++) {
+        for (uindex_t r = 0; r < tile_height; r++) {
             REQUIRE(out_buffer_ac[c][r].c == c);
             REQUIRE(out_buffer_ac[c][r].r == r);
         }
     }
 }
 
-TEST_CASE("Tile::copy_to", "[Tile]")
-{
+TEST_CASE("Tile::copy_to", "[Tile]") {
     // Test for the full tile.
     copy_to_test_impl(tile_width, tile_height);
     // Test with a partial buffer.