Skip to content
This repository has been archived by the owner on Jul 16, 2024. It is now read-only.

Commit

Permalink
Merge pull request #7 from markdryan/benchmarking
Browse files Browse the repository at this point in the history
Add benchmark code and fix various issues
  • Loading branch information
Mark Ryan authored Mar 1, 2022
2 parents ab388ad + 673a807 commit 61cdceb
Show file tree
Hide file tree
Showing 329 changed files with 7,395 additions and 487 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ jobs:
- uses: actions/checkout@v2

# Runs a set of commands using the runners shell

# Refresh the package index first: the index on a runner image can be stale,
# which makes apt-get install fail to fetch packages.
- name: Install dependencies
  run: sudo apt-get update && sudo apt-get install -y libbenchmark-dev elfutils

- name: build code
run: ./verify.sh

Expand Down
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
cmake_minimum_required (VERSION 3.16.3)
project(optimization C CXX ASM)

# Google Benchmark is an optional dependency: QUIET suppresses the message
# when it is not installed, and the example subdirectories check
# benchmark_FOUND before adding their benchmark executables.
find_package(benchmark QUIET)

if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
enable_language(ASM_MASM)
endif()
Expand Down
4 changes: 3 additions & 1 deletion CONTRIBUTORS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Intel
- [email protected]
- [email protected]
- [email protected]
- [email protected]
- [email protected]
22 changes: 15 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ To run the unit tests
5. make && make test

GCC 8.1 or higher is required to build the unit tests. The unit tests are
compiled with --march=haswell and so a Haswell CPU or later is required to run
them. Tests that execute instructions not present on Haswell will be
compiled with -march=haswell and so a fourth-generation Intel® Core™ (Haswell)
CPU or later is required to run them. Tests that execute instructions not present
on fourth-generation Intel® Core™ (Haswell) will be
skipped if the CPU on which they are run does not support those instructions.

The code samples can also be compiled with clang:
Expand All @@ -44,18 +45,25 @@ Dependency- Visual Studio 2019
5. To Build- build "ALL_BUILD" project
6. To Run tests- build "RUN_TESTS" project.

## Building the Benchmarks

Benchmark code is supplied for some of the code samples. These benchmarks are
built using [Google's Benchmark project](https://github.com/google/benchmark).
If Benchmark is installed and discoverable by CMake, the benchmarks for the code
samples will be automatically built when you type make.

## CPU Requirements

The code samples assume that they are being run on a Haswell processor
The code samples assume that they are being run on a fourth-generation Intel® Core™ (Haswell) processor
or later and do not perform runtime checks for the instructions that
they use that are present in Haswell, for example, FMA or AVX-2.
they use that are present in fourth-generation Intel® Core™ (Haswell), for example, FMA or AVX2.
Some of the code samples may then crash if they are run
on a device that does not support these instructions.

The code samples do however check for post Haswell instruction sets such as AVX-512 and VNNI
before running. Tests will skip if they detect that the post Haswell instructions
The code samples do however check for post fourth-generation Intel® Core™ (Haswell) instruction sets such as AVX-512 and VNNI
before running. Tests will skip if they detect that the post fourth-generation Intel® Core™ (Haswell) instructions
they need are not present. Some of the newest examples use new instructions only found
in SkylakeX or later processors. If you have an older CPU
in seventh-generation Intel® Core™ (SkylakeX) or later processors. If you have an older CPU
in your PC you may find that everything builds on your system
but that some of the tests are skipped or crash (if you don't have AVX2) when run. In this case,
to fully run the tests, you need to run them under the SDE.
Expand Down
9 changes: 8 additions & 1 deletion chap15/ex1/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
add_executable(avx_ex1_tests ex1_test.cpp transform_sse.c transform_avx.c)
set(avx_ex1_srcs transform_sse.c transform_avx.c)
add_executable(avx_ex1_tests ex1_test.cpp ${avx_ex1_srcs})
target_link_libraries(avx_ex1_tests gtest_main)

# Build the benchmark executable only when Google Benchmark was located by
# find_package(benchmark); it reuses the same transform sources as the tests.
if(benchmark_FOUND)
  add_executable(avx_ex1_bench ex1_bench.cpp ${avx_ex1_srcs})
  target_link_libraries(avx_ex1_bench PRIVATE benchmark::benchmark)
endif()

add_test(NAME avx_ex1_test COMMAND avx_ex1_tests)
84 changes: 84 additions & 0 deletions chap15/ex1/ex1_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (C) 2021 by Intel Corporation
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
* INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
* LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
* OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
* PERFORMANCE OF THIS SOFTWARE.
*/

#include <benchmark/benchmark.h>
#include <xmmintrin.h>

#include "transform_avx.h"
#include "transform_sse.h"

// Benchmarks transform_sse() on a buffer of state.range(0) floats.
//
// The input and output buffers are allocated with 16-byte alignment and the
// input is filled with 1.0f before the timed loop. Throughput is reported as
// len * sizeof(float) bytes per iteration, i.e. only the input buffer size is
// counted. If either allocation fails the benchmark is reported as an error
// instead of dereferencing a null pointer.
static void BM_transform_sse(benchmark::State &state)
{
    // state.range() returns a 64-bit value; the transform takes an int length.
    const int len = static_cast<int>(state.range(0));

    // Dynamic memory allocation with 16-byte alignment
    float *pInVector = (float *)_mm_malloc(len * sizeof(float), 16);
    float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 16);
    if (pInVector == nullptr || pOutVector == nullptr) {
        state.SkipWithError("_mm_malloc failed");
        if (pInVector != nullptr)
            _mm_free(pInVector);
        if (pOutVector != nullptr)
            _mm_free(pOutVector);
        return;
    }

    // init data
    for (int i = 0; i < len; i++)
        pInVector[i] = 1;
    float cos_teta = 0.8660254037;
    float sin_teta = 0.5;

    for (auto _ : state) {
        transform_sse(sin_teta, cos_teta, pInVector, pOutVector, len);
    }
    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) *
                            int64_t(sizeof(pInVector[0])));

    _mm_free(pInVector);
    _mm_free(pOutVector);
}

// Benchmarks transform_avx() on a buffer of state.range(0) floats.
//
// The input and output buffers are allocated with 32-byte alignment and the
// input is filled with 1.0f before the timed loop. Throughput is reported as
// len * sizeof(float) bytes per iteration, i.e. only the input buffer size is
// counted. If either allocation fails the benchmark is reported as an error
// instead of dereferencing a null pointer.
static void BM_transform_avx(benchmark::State &state)
{
    // state.range() returns a 64-bit value; the transform takes an int length.
    const int len = static_cast<int>(state.range(0));

    // Dynamic memory allocation with 32-byte alignment
    float *pInVector = (float *)_mm_malloc(len * sizeof(float), 32);
    float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 32);
    if (pInVector == nullptr || pOutVector == nullptr) {
        state.SkipWithError("_mm_malloc failed");
        if (pInVector != nullptr)
            _mm_free(pInVector);
        if (pOutVector != nullptr)
            _mm_free(pOutVector);
        return;
    }

    // init data
    for (int i = 0; i < len; i++)
        pInVector[i] = 1;
    float cos_teta = 0.8660254037;
    float sin_teta = 0.5;

    for (auto _ : state) {
        transform_avx(sin_teta, cos_teta, pInVector, pOutVector, len);
    }
    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) *
                            int64_t(sizeof(pInVector[0])));

    _mm_free(pInVector);
    _mm_free(pOutVector);
}

// Register both transform benchmarks with the same set of buffer lengths:
// 2^6 up to 2^18 floats, growing by a factor of four each step.
BENCHMARK(BM_transform_sse)
->Arg(1 << 6)
->Arg(1 << 8)
->Arg(1 << 10)
->Arg(1 << 12)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
BENCHMARK(BM_transform_avx)
->Arg(1 << 6)
->Arg(1 << 8)
->Arg(1 << 10)
->Arg(1 << 12)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
// Expands to the main() entry point of the benchmark executable.
BENCHMARK_MAIN();
34 changes: 16 additions & 18 deletions chap15/ex1/ex1_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,14 @@ TEST(avx_1, transform_sse)
true);

for (int i = 0; i < len; i += 2) {
if (i & 1) {
float cosx = pInVector[i + 1] * cos_teta;
float sinx = pInVector[i + 1] * sin_teta;
ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i]);
} else {
float cosx = pInVector[i] * cos_teta;
float sinx = pInVector[i] * sin_teta;
ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]);
}
// Assert X'
float cosx = pInVector[i] * cos_teta;
float siny = pInVector[i + 1] * sin_teta;
ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]);
// Assert Y'
float sinx = pInVector[i] * sin_teta;
float cosy = pInVector[i + 1] * cos_teta;
ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]);
}

ASSERT_EQ(
Expand Down Expand Up @@ -91,15 +90,14 @@ TEST(avx_1, transform_avx)
true);

for (int i = 0; i < len; i += 2) {
if (i & 1) {
float cosx = pInVector[i + 1] * cos_teta;
float sinx = pInVector[i + 1] * sin_teta;
ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i]);
} else {
float cosx = pInVector[i] * cos_teta;
float sinx = pInVector[i] * sin_teta;
ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]);
}
// Assert X'
float cosx = pInVector[i] * cos_teta;
float siny = pInVector[i + 1] * sin_teta;
ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]);
// Assert Y'
float sinx = pInVector[i] * sin_teta;
float cosy = pInVector[i + 1] * cos_teta;
ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]);
}

ASSERT_EQ(
Expand Down
13 changes: 9 additions & 4 deletions chap15/ex10/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
set(avx_ex10_srcs ex10_test.cpp saxpy32.c)
if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
set(avx_ex10_srcs ${avx_ex10_srcs} saxpy32.s)
set(avx_ex10_ass saxpy32.s)
elseif(MSVC)
set(avx_ex10_srcs ${avx_ex10_srcs} saxpy32.asm)
set(avx_ex10_ass saxpy32.asm)
endif()
add_executable(avx_ex10_tests ${avx_ex10_srcs})
add_executable(avx_ex10_tests ex10_test.cpp saxpy32.c ${avx_ex10_ass})
target_link_libraries(avx_ex10_tests gtest_main)

# Build the benchmark executable only when Google Benchmark was located by
# find_package(benchmark); it is built from the benchmark driver plus the
# platform-specific assembly source selected above.
if(benchmark_FOUND)
  add_executable(avx_ex10_bench ex10_bench.cpp ${avx_ex10_ass})
  target_link_libraries(avx_ex10_bench PRIVATE benchmark::benchmark)
endif()

add_test(NAME avx_ex10_test COMMAND avx_ex10_tests)
120 changes: 120 additions & 0 deletions chap15/ex10/ex10_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Copyright (C) 2022 by Intel Corporation
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
* INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
* LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
* OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
* PERFORMANCE OF THIS SOFTWARE.
*/

#include <benchmark/benchmark.h>
#include <xmmintrin.h>

#include "saxpy32.h"

// Fills the two input arrays with a deterministic ramp:
// src[k] = 2 * k and src2[k] = 3 * k for every k in [0, len).
static void init_sources(float *src, float *src2, int len)
{
    int idx = 0;
    while (idx < len) {
        // Same int-to-float conversion the multiplication would apply.
        const float pos = static_cast<float>(idx);
        src[idx] = 2.0f * pos;
        src2[idx] = 3.0f * pos;
        ++idx;
    }
}

// Benchmarks saxpy32() with all three buffers 32-byte aligned.
//
// state.range(0) is the element count; saxpy32() is passed the length in
// bytes (len * sizeof(float)). Throughput is reported as two floats per
// element per iteration (presumably the two source reads; the destination
// write is not counted). If any allocation fails the benchmark is reported
// as an error instead of dereferencing a null pointer.
static void BM_saxpy_avx_aligned(benchmark::State &state)
{
    // state.range() returns a 64-bit value; the helpers take an int length.
    const int len = static_cast<int>(state.range(0));
    float *src = (float *)_mm_malloc(len * sizeof(float), 32);
    float *src2 = (float *)_mm_malloc(len * sizeof(float), 32);
    float *dest = (float *)_mm_malloc(len * sizeof(float), 32);

    if (src == nullptr || src2 == nullptr || dest == nullptr) {
        state.SkipWithError("_mm_malloc failed");
        if (dest != nullptr)
            _mm_free(dest);
        if (src2 != nullptr)
            _mm_free(src2);
        if (src != nullptr)
            _mm_free(src);
        return;
    }

    init_sources(src, src2, len);

    for (auto _ : state) {
        saxpy32(src, src2, len * sizeof(float), dest, 10.0);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) *
                            int64_t(sizeof(float) * 2));

    _mm_free(dest);
    _mm_free(src2);
    _mm_free(src);
}

// Benchmarks saxpy32() with one misaligned buffer.
//
// The first source is allocated one element larger than needed and the
// pointer handed to saxpy32() is offset by one float from the 32-byte-aligned
// allocation; the second source and the destination stay 32-byte aligned.
// Throughput is reported as two floats per element per iteration. If any
// allocation fails the benchmark is reported as an error instead of
// dereferencing a null pointer.
static void BM_saxpy_avx_misaligned1(benchmark::State &state)
{
    // state.range() returns a 64-bit value; the helpers take an int length.
    const int len = static_cast<int>(state.range(0));
    float *src_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32);
    float *src2 = (float *)_mm_malloc(len * sizeof(float), 32);
    float *dest = (float *)_mm_malloc(len * sizeof(float), 32);

    if (src_mem == nullptr || src2 == nullptr || dest == nullptr) {
        state.SkipWithError("_mm_malloc failed");
        if (dest != nullptr)
            _mm_free(dest);
        if (src2 != nullptr)
            _mm_free(src2);
        if (src_mem != nullptr)
            _mm_free(src_mem);
        return;
    }

    // Offset only after the null check so no arithmetic is done on a null
    // pointer; src_mem (not src) is what must be freed.
    float *src = &src_mem[1];

    init_sources(src, src2, len);

    for (auto _ : state) {
        saxpy32(src, src2, len * sizeof(float), dest, 10.0);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) *
                            int64_t(sizeof(float) * 2));

    _mm_free(dest);
    _mm_free(src2);
    _mm_free(src_mem);
}

// Benchmarks saxpy32() with all three buffers misaligned.
//
// Each buffer is allocated one element larger than needed and the pointer
// handed to saxpy32() is offset by one float from the 32-byte-aligned
// allocation. Throughput is reported as two floats per element per
// iteration. If any allocation fails the benchmark is reported as an error
// instead of dereferencing a null pointer.
static void BM_saxpy_avx_misaligned3(benchmark::State &state)
{
    // state.range() returns a 64-bit value; the helpers take an int length.
    const int len = static_cast<int>(state.range(0));
    float *src_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32);
    float *src2_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32);
    float *dest_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32);

    if (src_mem == nullptr || src2_mem == nullptr || dest_mem == nullptr) {
        state.SkipWithError("_mm_malloc failed");
        if (dest_mem != nullptr)
            _mm_free(dest_mem);
        if (src2_mem != nullptr)
            _mm_free(src2_mem);
        if (src_mem != nullptr)
            _mm_free(src_mem);
        return;
    }

    // Offset only after the null check so no arithmetic is done on a null
    // pointer; the *_mem pointers are what must be freed.
    float *src = &src_mem[1];
    float *src2 = &src2_mem[1];
    float *dest = &dest_mem[1];

    init_sources(src, src2, len);

    for (auto _ : state) {
        saxpy32(src, src2, len * sizeof(float), dest, 10.0);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) *
                            int64_t(sizeof(float) * 2));

    _mm_free(dest_mem);
    _mm_free(src2_mem);
    _mm_free(src_mem);
}

// Register the three saxpy benchmarks with the same set of element counts:
// 2^6 up to 2^18 floats, growing by a factor of four each step.
BENCHMARK(BM_saxpy_avx_aligned)
->Arg(1 << 6)
->Arg(1 << 8)
->Arg(1 << 10)
->Arg(1 << 12)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
BENCHMARK(BM_saxpy_avx_misaligned1)
->Arg(1 << 6)
->Arg(1 << 8)
->Arg(1 << 10)
->Arg(1 << 12)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
BENCHMARK(BM_saxpy_avx_misaligned3)
->Arg(1 << 6)
->Arg(1 << 8)
->Arg(1 << 10)
->Arg(1 << 12)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
// Expands to the main() entry point of the benchmark executable.
BENCHMARK_MAIN();
3 changes: 2 additions & 1 deletion chap15/ex10/ex10_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ static float src[MAX_SIZE] __attribute__((aligned(32)));
static float dest[MAX_SIZE] __attribute__((aligned(32)));
static float src2[MAX_SIZE] __attribute__((aligned(32)));
#endif
void init_sources()

static void init_sources()
{
for (int i = 0; i < MAX_SIZE; i++) {
src[i] = 2.0f * i;
Expand Down
4 changes: 4 additions & 0 deletions chap15/ex10/saxpy32.s
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,7 @@ start_loop:
vzeroupper
pop rbx
ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
13 changes: 9 additions & 4 deletions chap15/ex12/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
set(avx_ex12_srcs ex12_test.cpp saxpy32.c saxpy16.c)
if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
set(avx_ex12_srcs ${avx_ex12_srcs} saxpy32.s saxpy16.s)
set(avx_ex12_ass saxpy32.s saxpy16.s)
elseif(MSVC)
set(avx_ex12_srcs ${avx_ex12_srcs} saxpy32.asm saxpy16.asm)
set(avx_ex12_ass saxpy32.asm saxpy16.asm)
endif()
add_executable(avx_ex12_tests ${avx_ex12_srcs})
add_executable(avx_ex12_tests ex12_test.cpp saxpy32.c saxpy16.c ${avx_ex12_ass})
target_link_libraries(avx_ex12_tests gtest_main)

# Build the benchmark executable only when Google Benchmark was located by
# find_package(benchmark); it is built from the benchmark driver plus the
# platform-specific assembly sources selected above.
if(benchmark_FOUND)
  add_executable(avx_ex12_bench ex12_bench.cpp ${avx_ex12_ass})
  target_link_libraries(avx_ex12_bench PRIVATE benchmark::benchmark)
endif()

add_test(NAME avx_ex12_test COMMAND avx_ex12_tests)
Loading

0 comments on commit 61cdceb

Please sign in to comment.