diff --git a/PTRANS/CHANGELOG b/PTRANS/CHANGELOG
index c7665867..e0c91f51 100644
--- a/PTRANS/CHANGELOG
+++ b/PTRANS/CHANGELOG
@@ -2,6 +2,11 @@
 
 This file contains all changes made to the source code for each release.
 
+## 1.1
+
+#### Added:
+- Optimized kernel for Xilinx Vitis toolchain
+
 ## 1.0.1
 
 #### Added:
diff --git a/PTRANS/CMakeLists.txt b/PTRANS/CMakeLists.txt
index 95326050..4af8b3ab 100755
--- a/PTRANS/CMakeLists.txt
+++ b/PTRANS/CMakeLists.txt
@@ -1,8 +1,8 @@
 cmake_minimum_required(VERSION 3.1)
-project(PTRANS VERSION 1.0.1)
+project(PTRANS VERSION 1.1)
 
 set(KERNEL_NAME transpose CACHE STRING "Name of the OpenCL kernel")
-set(DEFAULT_MATRIX_SIZE 4096 CACHE STRING "Default size of the used matrices")
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices")
 set(BLOCK_SIZE 512 CACHE STRING "Block size used in the FPGA kernel")
 set(GLOBAL_MEM_UNROLL 16 CACHE STRING "Unrolling factor used to stream data")
 
diff --git a/PTRANS/README.md b/PTRANS/README.md
index 13b837a2..584c401a 100644
--- a/PTRANS/README.md
+++ b/PTRANS/README.md
@@ -69,25 +69,28 @@ For execution of the benchmark run:
     
 For more information on available input parameters run
 
-    ./trans_intel -h
-    
+    $./trans_xilinx -h
     Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA.
-    Version: 0.1.2
+    Version: 1.0.1
+
     Usage:
-      ./trans_intel [OPTION...]
-    
-      -f, --file arg        Kernel file name
-      -n, arg               Number of repetitions (default: 10)
-      -m, arg               Matrix size (default: 4096)
-          --kernel arg      Name of the kernel (default: transpose)
-      -i, --nointerleaving  Disable memory interleaving
-          --device arg      Index of the device that has to be used. If not given
+    ./trans_xilinx [OPTION...]
+
+    -f, --file arg        Kernel file name
+    -n, arg               Number of repetitions (default: 10)
+    -m, arg               Matrix size in number of blocks in one dimension
+                            (default: 8)
+    -b, arg               Block size in number of values in one dimension
+                            (default: 512)
+        --kernel arg      Name of the kernel (default: transpose)
+    -i, --nointerleaving  Disable memory interleaving
+        --device arg      Index of the device that has to be used. If not given
                             you will be asked which device to use if there are
                             multiple devices available. (default: -1)
-          --platform arg    Index of the platform that has to be used. If not
+        --platform arg    Index of the platform that has to be used. If not
                             given you will be asked which platform to use if there
                             are multiple platforms available. (default: -1)
-      -h, --help            Print this help
+    -h, --help            Print this help
     
 
     
diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt
index ea702d0c..4bf2e7da 100644
--- a/PTRANS/src/device/CMakeLists.txt
+++ b/PTRANS/src/device/CMakeLists.txt
@@ -3,15 +3,15 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake)
 
 if (INTELFPGAOPENCL_FOUND)
     generate_kernel_targets_intel(transpose_optimized)
-    add_test(NAME test_emulation_optimized_intel COMMAND trans_intel -f transpose_optimized_emulate.aocx -n 1 -m ${BLOCK_SIZE} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
-    add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_intel -f transpose_optimized_emulate.aocx -n 1 -m ${BLOCK_SIZE}
+    add_test(NAME test_emulation_optimized_intel COMMAND trans_intel -f transpose_optimized_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
+    add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_intel -f transpose_optimized_emulate.aocx -n 1 -m 1
             WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
 endif()
 
 if (VITIS_FOUND)
     generate_kernel_targets_xilinx(transpose_optimized)
-    add_test(NAME test_emulation_optimized_xilinx COMMAND trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m ${BLOCK_SIZE} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
-    add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m ${BLOCK_SIZE}
+    add_test(NAME test_emulation_optimized_xilinx COMMAND trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
+    add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1
             WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
 endif()
 
diff --git a/PTRANS/src/device/transpose_optimized.cl b/PTRANS/src/device/transpose_optimized.cl
index 06496878..723f1c13 100644
--- a/PTRANS/src/device/transpose_optimized.cl
+++ b/PTRANS/src/device/transpose_optimized.cl
@@ -25,11 +25,11 @@
 __attribute__((max_global_work_dim(0)))
 __kernel
 void transpose(__global DEVICE_DATA_TYPE *restrict A,
-        __global DEVICE_DATA_TYPE *restrict B,
-__global DEVICE_DATA_TYPE *restrict A_out,
-        uint matrixSize) {
+            __global DEVICE_DATA_TYPE *restrict B,
+            __global DEVICE_DATA_TYPE *restrict A_out,
+            const uint number_of_blocks) {
 
-    const unsigned number_of_blocks = matrixSize / BLOCK_SIZE;
+    const unsigned matrixSize = number_of_blocks * BLOCK_SIZE;
 
     // transpose the matrix block-wise from global memory
 #pragma loop_coalesce 2
@@ -37,7 +37,7 @@ __global DEVICE_DATA_TYPE *restrict A_out,
         for (int block_col = 0; block_col < number_of_blocks; block_col++) {
 
             // local memory buffer for a matrix block
-            DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / GLOBAL_MEM_UNROLL][GLOBAL_MEM_UNROLL];
+            DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / GLOBAL_MEM_UNROLL][GLOBAL_MEM_UNROLL] __attribute__((xcl_array_partition(cyclic, GLOBAL_MEM_UNROLL,1))) __attribute__((xcl_array_partition(cyclic, GLOBAL_MEM_UNROLL,2)));
 
             // read in block from global memory and store it in a memory efficient manner
 #pragma loop_coalesce 2
diff --git a/PTRANS/src/host/execution.h b/PTRANS/src/host/execution.h
index e640ec74..de460742 100644
--- a/PTRANS/src/host/execution.h
+++ b/PTRANS/src/host/execution.h
@@ -40,6 +40,7 @@ namespace bm_execution {
         std::string kernelName;
         uint repetitons;
         cl_uint matrixSize;
+        cl_uint blockSize;
         bool useMemInterleaving;
     };
 
diff --git a/PTRANS/src/host/execution_default.cpp b/PTRANS/src/host/execution_default.cpp
index 36f10a86..b5267d2e 100644
--- a/PTRANS/src/host/execution_default.cpp
+++ b/PTRANS/src/host/execution_default.cpp
@@ -55,7 +55,7 @@ namespace bm_execution {
         transposeKernel.setArg(0, bufferA);
         transposeKernel.setArg(1, bufferB);
         transposeKernel.setArg(2, bufferA_out);
-        transposeKernel.setArg(3, config->matrixSize);
+        transposeKernel.setArg(3, config->matrixSize / config->blockSize);
 
         cl::CommandQueue queue(config->context);
 
diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp
index c2ed8a96..07d64b98 100644
--- a/PTRANS/src/host/main.cpp
+++ b/PTRANS/src/host/main.cpp
@@ -37,7 +37,8 @@ main(int argc, char *argv[]) {
                     context, device, program,
                     programSettings->kernelName,
                     programSettings->numRepetitions,
-                    programSettings->matrixSize,
+                    programSettings->matrixSize * programSettings->blockSize,
+                    programSettings->blockSize,
                     programSettings->useMemInterleaving
             });
 
diff --git a/PTRANS/src/host/transpose_functionality.cpp b/PTRANS/src/host/transpose_functionality.cpp
index cb832794..f4d98494 100644
--- a/PTRANS/src/host/transpose_functionality.cpp
+++ b/PTRANS/src/host/transpose_functionality.cpp
@@ -56,8 +56,10 @@ parseProgramParameters(int argc, char *argv[]) {
             ("f,file", "Kernel file name", cxxopts::value<std::string>())
             ("n", "Number of repetitions",
              cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_REPETITIONS)))
-            ("m", "Matrix size",
+            ("m", "Matrix size in number of blocks in one dimension",
              cxxopts::value<cl_uint>()->default_value(std::to_string(DEFAULT_MATRIX_SIZE)))
+            ("b", "Block size in number of values in one dimension",
+             cxxopts::value<cl_uint>()->default_value(std::to_string(BLOCK_SIZE)))
             ("kernel", "Name of the kernel",
              cxxopts::value<std::string>()->default_value(KERNEL_NAME))
             ("i,nointerleaving", "Disable memory interleaving")
@@ -88,6 +90,7 @@ parseProgramParameters(int argc, char *argv[]) {
     std::shared_ptr<ProgramSettings> sharedSettings(
             new ProgramSettings{result["n"].as<uint>(),
                                 result["m"].as<cl_uint>(),
+                                result["b"].as<cl_uint>(),
                                 result["platform"].as<int>(),
                                 result["device"].as<int>(),
                                 static_cast<bool>(result.count("i") <= 0),
@@ -178,8 +181,7 @@ void printFinalConfiguration(const std::shared_ptr<ProgramSettings> &programSett
     std::cout << "Summary:" << std::endl
               << "Repetitions:         " << programSettings->numRepetitions
               << std::endl
-              << "Matrix Size:         " << programSettings->matrixSize
-              << "x" << programSettings->matrixSize
+              << "Matrix Size:         " << programSettings->matrixSize * programSettings->blockSize
               << std::endl
               << "Memory Interleaving: " << (programSettings->useMemInterleaving ? "Yes" : "No")
               << std::endl
diff --git a/PTRANS/src/host/transpose_functionality.hpp b/PTRANS/src/host/transpose_functionality.hpp
index 53001d4b..69c63375 100644
--- a/PTRANS/src/host/transpose_functionality.hpp
+++ b/PTRANS/src/host/transpose_functionality.hpp
@@ -46,6 +46,7 @@ Moreover the version and build time is also compiled into the description.
 struct ProgramSettings {
     uint numRepetitions;
     cl_uint matrixSize;
+    cl_uint blockSize;
     int defaultPlatform;
     int defaultDevice;
     bool useMemInterleaving;
diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
index 0aee88ca..d442f2a6 100644
--- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
@@ -54,6 +54,7 @@ struct OpenCLKernelTest : testing::Test {
                         KERNEL_NAME,
                         1,
                         matrix_size,
+                        BLOCK_SIZE,
                         false
                 });
         generateInputData(matrix_size, A, B);