diff --git a/CMakeLists.txt b/CMakeLists.txt index f9d1337..91e00d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,7 +141,14 @@ add_test(TestSimulation TestSimulation) enable_testing() # Synthesis flags -set(MM_SYNTHESIS_FLAGS "${MM_SYNTHESIS_FLAGS} -std=c++11 -O3 -DMM_SYNTHESIS -DHLSLIB_SYNTHESIS -DVITIS_MAJOR_VERSION=${Vitis_MAJOR_VERSION}") +set(MM_SYNTHESIS_FLAGS "${MM_SYNTHESIS_FLAGS} -std=c++11 -O3 -DMM_SYNTHESIS -DHLSLIB_SYNTHESIS -DVITIS_MAJOR_VERSION=${Vitis_MAJOR_VERSION} -DVITIS_MINOR_VERSION=${Vitis_MINOR_VERSION} -DVITIS_VERSION=${Vitis_VERSION}") +# Vitis 2020.1 introduces breaking changes to hls::stream +if(NOT Vitis_USE_VITIS_HLS) + set(MM_SYNTHESIS_FLAGS "${MM_SYNTHESIS_FLAGS} -D__VIVADO_HLS__") +else() + set(MM_SYNTHESIS_FLAGS "${MM_SYNTHESIS_FLAGS} -D__VITIS_HLS__") +endif() + if(MM_ADD_RESOURCE) set(MM_SYNTHESIS_FLAGS "${MM_SYNTHESIS_FLAGS} -DMM_ADD_RESOURCE=${MM_ADD_RESOURCE}") endif() diff --git a/hlslib b/hlslib index 753fd53..2a56a40 160000 --- a/hlslib +++ b/hlslib @@ -1 +1 @@ -Subproject commit 753fd5375ae9e911afbf49ce8481f75e3d256a55 +Subproject commit 2a56a401b8999d1f684a083e8a2321e3e178869f diff --git a/include/Compute.h b/include/Compute.h index f3efab1..5f9a6c8 100644 --- a/include/Compute.h +++ b/include/Compute.h @@ -5,10 +5,10 @@ #include "MatrixMultiplication.h" -void ProcessingElement(Stream &aIn, - Stream &aOut, - Stream &bIn, - Stream &bOut, +void ProcessingElement(Stream &aIn, + Stream &aOut, + Stream &bIn, + Stream &bOut, Stream &cOut, Stream &cIn, const unsigned locationN, const unsigned size_n, const unsigned size_k, diff --git a/include/Memory.h b/include/Memory.h index dcf047c..60ddb04 100644 --- a/include/Memory.h +++ b/include/Memory.h @@ -1,5 +1,4 @@ /// @author Johannes de Fine Licht (definelicht@inf.ethz.ch) -/// @date August 2017 /// @copyright This software is copyrighted under the BSD 3-Clause License. #pragma once @@ -11,68 +10,59 @@ // Read wide bursts from memory, then distribute it into separate column // buffers, which will be read out in column-major order and sent to the kernel -void ReadA(MemoryPackK_t const a[], - Stream aSplit[kTransposeWidth], +void ReadA(MemoryPackK_t const a[], Stream aSplit[kTransposeWidth], unsigned n, unsigned k, unsigned m); // We pop from the column buffers in column-major order, funneling the // transposed data to the kernel #ifdef MM_CONVERT_A -void TransposeA(Stream aSplit[kTransposeWidth], - Stream &toKernel, - unsigned n, unsigned k, unsigned m); +void TransposeA(Stream aSplit[kTransposeWidth], + Stream &toKernel, unsigned n, unsigned k, unsigned m); -void ConvertWidthA(Stream &narrow, - Stream &wide, +void ConvertWidthA(Stream &narrow, Stream &wide, unsigned, unsigned k, unsigned m); #else -void TransposeA(Stream aSplit[kTransposeWidth], - Stream &toKernel, - unsigned n, unsigned k, unsigned m); +void TransposeA(Stream aSplit[kTransposeWidth], + Stream &toKernel, unsigned n, unsigned k, + unsigned m); #endif -#else // MM_TRANSPOSED_A +#else // MM_TRANSPOSED_A -void ReadATransposed(MemoryPackN_t const memory[], - Stream &pipe, +void ReadATransposed(MemoryPackN_t const memory[], Stream &pipe, const unsigned size_n, const unsigned size_k, const unsigned size_m); -void ConvertWidthATransposed( - Stream &pipe_in, - Stream &pipe_out, const unsigned size_n, - const unsigned size_k, const unsigned size_m); +void ConvertWidthATransposed(Stream &pipe_in, + Stream &pipe_out, + const unsigned size_n, const unsigned size_k, + const unsigned size_m); #endif -void ReadB(MemoryPackM_t const memory[], - Stream &pipe, +void ReadB(MemoryPackM_t const memory[], Stream &pipe, unsigned n, unsigned k, unsigned m); #ifdef MM_CONVERT_B -void ConvertWidthB(Stream &wide, - Stream &narrow, +void ConvertWidthB(Stream &wide, Stream &narrow, unsigned n, unsigned k, unsigned m); -void FeedB(Stream &converted, - Stream &toKernel, +void FeedB(Stream &converted, Stream &toKernel, unsigned n, unsigned k, unsigned m); #else -void FeedB(Stream &fromMemory, - Stream &toKernel, +void FeedB(Stream &fromMemory, Stream &toKernel, unsigned n, unsigned k, unsigned m); #endif -void ConvertWidthC(Stream &narrow, - Stream &wide, +void ConvertWidthC(Stream &narrow, Stream &wide, unsigned n, unsigned k, unsigned m); -void WriteC(Stream &pipe, - MemoryPackM_t memory[], unsigned n, unsigned k, unsigned m); +void WriteC(Stream &pipe, MemoryPackM_t memory[], unsigned n, + unsigned k, unsigned m); diff --git a/kernel/Compute.cpp b/kernel/Compute.cpp index 7b5239d..b9e1b6c 100644 --- a/kernel/Compute.cpp +++ b/kernel/Compute.cpp @@ -8,15 +8,14 @@ #include "Memory.h" #include -void ProcessingElement(Stream &aIn, - Stream &aOut, - Stream &bIn, - Stream &bOut, +void ProcessingElement(Stream &aIn, + Stream &aOut, + Stream &bIn, + Stream &bOut, Stream &cOut, Stream &cIn, const unsigned locationN, const unsigned size_n, const unsigned size_k, const unsigned size_m) { - // A is double-buffered, such that new values can be read while the // previous outer product is being computed. This is required to achieve // a perfect pipeline across the K-dimension, which is necessary for diff --git a/kernel/Memory.cpp b/kernel/Memory.cpp index cd7667a..3ced8e9 100644 --- a/kernel/Memory.cpp +++ b/kernel/Memory.cpp @@ -1,6 +1,5 @@ /// @author Johannes de Fine Licht (definelicht@inf.ethz.ch) -/// @date June 2018 -/// @copyright This software is copyrighted under the BSD 3-Clause License. +/// @copyright This software is copyrighted under the BSD 3-Clause License. #include "Memory.h" #include @@ -20,7 +19,7 @@ unsigned IndexA(const unsigned n0, const unsigned n1, const unsigned n2, return index; } -#else // MM_TRANSPOSED_A +#else // MM_TRANSPOSED_A unsigned IndexATransposed(const unsigned k, const unsigned n0, const unsigned n1m, const unsigned size_n, @@ -32,7 +31,7 @@ unsigned IndexATransposed(const unsigned k, const unsigned n0, return index; } -#endif // MM_TRANSPOSED_A +#endif // MM_TRANSPOSED_A unsigned IndexB(const unsigned k, const unsigned m0, const unsigned m1m, const unsigned size_n, const unsigned size_k, @@ -57,28 +56,28 @@ unsigned IndexC(const unsigned n0, const unsigned n1, const unsigned m0, #ifndef MM_TRANSPOSED_A void _ReadAInner(MemoryPackK_t const a[], - Stream aSplit[kTransposeWidth], - const unsigned n0, const unsigned n1, const unsigned n2, - const unsigned k0, const unsigned k1, const unsigned size_n, + Stream aSplit[kTransposeWidth], const unsigned n0, + const unsigned n1, const unsigned n2, const unsigned k0, + const unsigned k1, const unsigned size_n, const unsigned size_k, const unsigned size_m) { #pragma HLS INLINE auto pack = a[IndexA(n0, n1, n2, k0, k1, size_n, size_k, size_m)]; ReadA_Unroll: for (unsigned w = 0; w < kMemoryWidthK; ++w) { #pragma HLS UNROLL - aSplit[k1 * kMemoryWidthK + w].Push(pack[w]); + aSplit[k1 * kMemoryWidthK + w].Push(pack[w]); } } template -void _ReadAInnerLoop( - MemoryPackK_t const a[], - Stream aSplit[kTransposeWidth], unsigned n0, - unsigned n1, unsigned n2, unsigned k0, const unsigned size_n, - const unsigned size_k, const unsigned size_m) { +void _ReadAInnerLoop(MemoryPackK_t const a[], + Stream aSplit[kTransposeWidth], unsigned n0, + unsigned n1, unsigned n2, unsigned k0, + const unsigned size_n, const unsigned size_k, + const unsigned size_m) { #pragma HLS INLINE ReadA_TransposeWidth: - for (unsigned k1 = 0; k1 < (kTransposeWidth / kMemoryWidthK); ++k1) { + for (unsigned k1 = 0; k1 < (kTransposeWidth / kMemoryWidthK); ++k1) { #pragma HLS PIPELINE II=1 #pragma HLS LOOP_FLATTEN _ReadAInner(a, aSplit, n0, n1, n2, k0, k1, size_n, size_k, size_m); @@ -88,22 +87,20 @@ void _ReadAInnerLoop( // Need a special case for kMemoryWidthK == kTransposeWidth, as Vivado HLS // otherwise doesn't pipeline the loops (because the inner trip count is 1). template <> -void _ReadAInnerLoop<1>( - MemoryPackK_t const a[], - Stream aSplit[kTransposeWidth], - const unsigned n0, const unsigned n1, const unsigned n2, const unsigned k0, - const unsigned size_n, const unsigned size_k, const unsigned size_m) { +void _ReadAInnerLoop<1>(MemoryPackK_t const a[], + Stream aSplit[kTransposeWidth], + const unsigned n0, const unsigned n1, const unsigned n2, + const unsigned k0, const unsigned size_n, + const unsigned size_k, const unsigned size_m) { #pragma HLS INLINE #pragma HLS PIPELINE II=1 #pragma HLS LOOP_FLATTEN _ReadAInner(a, aSplit, n0, n1, n2, k0, 0, size_n, size_k, size_m); } -void ReadA(MemoryPackK_t const a[], - Stream aSplit[kTransposeWidth], +void ReadA(MemoryPackK_t const a[], Stream aSplit[kTransposeWidth], const unsigned size_n, const unsigned size_k, const unsigned size_m) { - assert((static_cast(OuterTilesN(size_n)) * OuterTilesM(size_m) * (size_k / kTransposeWidth) * kInnerTilesN * kInnerTileSizeN * (kTransposeWidth / kMemoryWidthK) * @@ -129,9 +126,8 @@ void ReadA(MemoryPackK_t const a[], } template -void _TransposeAInner( - Stream aSplit[kTransposeWidth], - Stream &toKernel, const unsigned k) { +void _TransposeAInner(Stream aSplit[kTransposeWidth], + Stream &toKernel, const unsigned k) { #pragma HLS INLINE for (unsigned n1 = 0; n1 < kOuterTileSizeN / kComputeTileSizeN; ++n1) { ComputePackN_t pack; @@ -149,9 +145,8 @@ void _TransposeAInner( } template <> -void _TransposeAInner<1>( - Stream aSplit[kTransposeWidth], - Stream &toKernel, const unsigned k) { +void _TransposeAInner<1>(Stream aSplit[kTransposeWidth], + Stream &toKernel, const unsigned k) { #pragma HLS INLINE for (unsigned n1 = 0; n1 < kOuterTileSizeN; ++n1) { #pragma HLS PIPELINE II=1 @@ -164,11 +159,9 @@ void _TransposeAInner<1>( // We pop from the column buffers in column-major order, funneling the // transposed data to the kernel -void TransposeA(Stream aSplit[kTransposeWidth], - Stream &toKernel, - const unsigned size_n, const unsigned size_k, - const unsigned size_m) { - +void TransposeA(Stream aSplit[kTransposeWidth], + Stream &toKernel, const unsigned size_n, + const unsigned size_k, const unsigned size_m) { assert((static_cast(OuterTilesN(size_n)) * OuterTilesM(size_m) * size_k * kOuterTileSizeN) == TotalReadsFromA(size_n, size_k, size_m)); @@ -186,8 +179,7 @@ void TransposeA(Stream aSplit[kTransposeWidth], } #ifdef MM_CONVERT_A -void ConvertWidthA(Stream &narrow, - Stream &wide, +void ConvertWidthA(Stream &narrow, Stream &wide, const unsigned size_n, const unsigned size_k, const unsigned size_m) { ConvertWidthA_Outer: @@ -206,13 +198,11 @@ void ConvertWidthA(Stream &narrow, } #endif -#else // MM_TRANSPOSED_A == true +#else // MM_TRANSPOSED_A == true -void ReadATransposed(MemoryPackN_t const memory[], - Stream &pipe, +void ReadATransposed(MemoryPackN_t const memory[], Stream &pipe, const unsigned size_n, const unsigned size_k, const unsigned size_m) { - assert((static_cast(OuterTilesN(size_n)) * OuterTilesM(size_m) * size_k * kOuterTileSizeNMemory * MemoryPackN_t::kWidth) == TotalReadsFromA(size_n, size_k, size_m)); @@ -223,7 +213,6 @@ void ReadATransposed(MemoryPackN_t const memory[], for (unsigned m0 = 0; m0 < OuterTilesM(size_m); ++m0) { ReadA_K: for (unsigned k = 0; k < size_k; ++k) { - ReadA_BufferA_N1: for (unsigned n1m = 0; n1m < kOuterTileSizeNMemory; ++n1m) { #pragma HLS PIPELINE II=1 @@ -231,17 +220,15 @@ void ReadATransposed(MemoryPackN_t const memory[], pipe.Push( memory[IndexATransposed(k, n0, n1m, size_n, size_k, size_m)]); } - } } } } -void ConvertWidthATransposed( - Stream &wide, - Stream &narrow, const unsigned size_n, - const unsigned size_k, const unsigned size_m) { - +void ConvertWidthATransposed(Stream &wide, + Stream &narrow, + const unsigned size_n, const unsigned size_k, + const unsigned size_m) { static_assert(kMemoryWidthN % kComputeTileSizeN == 0, "Tile size must be a multiple of memory width."); @@ -266,18 +253,16 @@ void ConvertWidthATransposed( narrow.Push(computePack); } #else - narrow.Push(wide.Pop()); + narrow.Push(wide.Pop()); #endif } } -#endif // MM_TRANSPOSED_A == true +#endif // MM_TRANSPOSED_A == true -void ReadB(MemoryPackM_t const memory[], - Stream &pipe, +void ReadB(MemoryPackM_t const memory[], Stream &pipe, const unsigned size_n, const unsigned size_k, const unsigned size_m) { - assert((static_cast(OuterTilesN(size_n)) * OuterTilesM(size_m) * size_k * kOuterTileSizeMMemory * MemoryPackM_t::kWidth) == TotalReadsFromB(size_n, size_k, size_m)); @@ -288,23 +273,20 @@ void ReadB(MemoryPackM_t const memory[], for (unsigned m0 = 0; m0 < OuterTilesM(size_m); ++m0) { ReadB_K: for (unsigned k = 0; k < size_k; ++k) { - ReadB_BufferB_M1: for (unsigned m1m = 0; m1m < kOuterTileSizeMMemory; ++m1m) { #pragma HLS PIPELINE II=1 #pragma HLS LOOP_FLATTEN - pipe.Push(memory[IndexB(k, m0, m1m, size_n, size_k, size_m)]); + pipe.Push(memory[IndexB(k, m0, m1m, size_n, size_k, size_m)]); } - } } } } -void ConvertWidthB(Stream &wide, - Stream &narrow, const unsigned size_n, - const unsigned size_k, const unsigned size_m) { - +void ConvertWidthB(Stream &wide, Stream &narrow, + const unsigned size_n, const unsigned size_k, + const unsigned size_m) { assert(kMemoryWidthM % kComputeTileSizeM == 0); assert(((TotalReadsFromB(size_n, size_k, size_m) / kMemoryWidthM) * @@ -336,8 +318,7 @@ void ConvertWidthB(Stream &wide, } } -void ConvertWidthC(Stream &narrow, - Stream &wide, +void ConvertWidthC(Stream &narrow, Stream &wide, const unsigned size_n, const unsigned size_k, const unsigned size_m) { assert(kMemoryWidthM % ComputePackM_t::kWidth == 0); @@ -375,10 +356,9 @@ void ConvertWidthC(Stream &narrow, } } -void WriteC(Stream &pipe, - MemoryPackM_t memory[], const unsigned size_n, - const unsigned size_k, const unsigned size_m) { - +void WriteC(Stream &pipe, MemoryPackM_t memory[], + const unsigned size_n, const unsigned size_k, + const unsigned size_m) { // assert((OuterTilesN(size_n) * OuterTilesM(size_m) * kOuterTileSizeN * // kOuterTileSizeMMemory * MemoryPackM_t::kWidth) == size_n * size_m); @@ -401,7 +381,8 @@ void WriteC(Stream &pipe, } #ifndef MM_SYNTHESIS std::cout << "Finished tile (" << n0 << ", " << m0 << ") of (" - << OuterTilesN(size_n) - 1 << ", " << OuterTilesM(size_m) - 1 << ")\n" + << OuterTilesN(size_n) - 1 << ", " << OuterTilesM(size_m) - 1 + << ")\n" << std::flush; #endif } @@ -409,13 +390,13 @@ void WriteC(Stream &pipe, } #ifndef MM_CONVERT_B -void FeedB(Stream &fromMemory, - Stream &toKernel, const unsigned size_n, - const unsigned size_k, const unsigned size_m) { +void FeedB(Stream &fromMemory, Stream &toKernel, + const unsigned size_n, const unsigned size_k, + const unsigned size_m) { #else -void FeedB(Stream &fromMemory, - Stream &toKernel, const unsigned size_n, - const unsigned size_k, const unsigned size_m) { +void FeedB(Stream &fromMemory, Stream &toKernel, + const unsigned size_n, const unsigned size_k, + const unsigned size_m) { #endif assert(static_cast(OuterTilesN(size_n)) * OuterTilesM(size_m) * @@ -431,7 +412,6 @@ void FeedB(Stream &fromMemory, for (unsigned m0 = 0; m0 < bound_m; ++m0) { FeedB_K: for (unsigned k = 0; k < size_k; ++k) { - ComputePackM_t buffer[kInnerTilesM]; FeedB_Pipeline_N: @@ -450,7 +430,6 @@ void FeedB(Stream &fromMemory, toKernel.Push(val); } } - } } }