From 4d9f40cb5d31a14e0130356a4e2520ed36413023 Mon Sep 17 00:00:00 2001 From: Carlos Date: Mon, 13 May 2024 15:55:34 +0100 Subject: [PATCH] func(omp): make repeated reduce function more flexible (#125) * func(omp): set number of threads from input for repeated reduce * func(omp): make repeated reduce function more flexible --- .gitignore | 4 +- func/omp/repeated_reduce.cpp | 73 +++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 8f017a6..28e8ca5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ -build-wasm/ -build-native/ -build-native-shared/ +build-*/ compile_commands.json .clangd/ diff --git a/func/omp/repeated_reduce.cpp b/func/omp/repeated_reduce.cpp index 66a49a8..edc9beb 100644 --- a/func/omp/repeated_reduce.cpp +++ b/func/omp/repeated_reduce.cpp @@ -1,16 +1,21 @@ +#include #include #include #include #include #include +#include -bool doReduce() +// This reduce method is called with a varying number of threads, but with +// a maximum of 10. In addition, the inner parallel for pragma may be +// elastically scaled from nThreads, all the way up to 10. +bool doReduce(int numThreads) { - int nThreads = 10; int chunkSize = 1000; - int loopSize = nThreads * chunkSize; - int counts[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int loopSize = numThreads * chunkSize; + int maxNumThreads = 10; + std::vector counts(maxNumThreads, 0); int reducedA = 0; int reducedB = 0; @@ -20,7 +25,7 @@ bool doReduce() FAASM_REDUCE(reducedA, FAASM_TYPE_INT, FAASM_OP_SUM) FAASM_REDUCE(reducedB, FAASM_TYPE_INT, FAASM_OP_SUM) -#pragma omp parallel for num_threads(nThreads) default(none) \ +#pragma omp parallel for num_threads(numThreads) default(none) \ shared(counts, loopSize, success) reduction(+ : reducedA, reducedB) for (int i = 0; i < loopSize; i++) { int threadNum = omp_get_thread_num(); @@ -49,17 +54,53 @@ bool doReduce() return 1; } - // Check counts - for (int t = 0; t < nThreads; t++) { - if (counts[t] != chunkSize) { - printf( - "Loop count for thread %i: %i != %i\n", t, counts[t], chunkSize); + // First, work out how many threads actually executed the loop, by checking + // how many threads wrote to the counts array + int actualNumThreads = 0; + for (int i = 0; i < counts.size(); i++) { + if (counts.at(i) != 0) { + actualNumThreads++; + } + } + + if ((actualNumThreads < numThreads) || (actualNumThreads > maxNumThreads)) { + printf("Actual number of threads outside valid range: %i \\not \\in " + "[%i, %i]\n", + actualNumThreads, + numThreads, + maxNumThreads); + + // Exit fast in this case as posterior checks may seg-fault + return false; + } + + // Check counts (only count the aggregate, and a uniform distribution, as + // we may elastically change the parallelism of the loop) + int actualChunkSize = (int)loopSize / actualNumThreads; + int total = 0; + for (int tNum = 0; tNum < actualNumThreads; tNum++) { + if (counts[tNum] != actualChunkSize) { + printf("Loop count for thread %i: %i != %i\n", + tNum, + counts[tNum], + actualChunkSize); success = false; } + + total += counts[tNum]; } - int expectedFinalReducedA = 550000; - int expectedFinalReducedB = 825000; + if (total != loopSize) { + printf("Total loop count failed: %i != %i\n", total, loopSize); + success = false; + } + + // The expected final value is: constant (10/15) * (sum [1, nThreads]) * + // chunkSize + int expectedFinalReducedA = + (int)10 * actualNumThreads * (actualNumThreads + 1) / 2 * actualChunkSize; + int expectedFinalReducedB = + (int)15 * actualNumThreads * (actualNumThreads + 1) / 2 * actualChunkSize; if (reducedA != expectedFinalReducedA) { printf("reducedA %i != %i\n", reducedA, expectedFinalReducedA); @@ -76,10 +117,16 @@ bool doReduce() int main(int argc, char* argv[]) { + int numThreads = faasm::getIntInput(); + if (numThreads <= 0) { + printf("Incorrect number of threads passed as input: %i\n", numThreads); + return 1; + } + // Run reduce in a loop and check each iteration is correct int nLoops = 10; for (int i = 0; i < nLoops; i++) { - bool success = doReduce(); + bool success = doReduce(numThreads); if (!success) { printf("Repeated reduce failed on loop %i\n", i); return 1;