From 7d707d68f1b553bc717c92aee974078014328732 Mon Sep 17 00:00:00 2001 From: SharzyL Date: Fri, 10 May 2024 12:58:46 +0800 Subject: [PATCH] [cases] add rvv_bench --- tests/default.nix | 3 +- tests/rvv_bench/_include/bench.h | 170 ++++++++ tests/rvv_bench/_include/config.h | 25 ++ tests/rvv_bench/_include/nolibc.h | 80 ++++ tests/rvv_bench/_include/template.S | 80 ++++ tests/rvv_bench/_include/thirdparty/boring.c | 383 ++++++++++++++++++ tests/rvv_bench/_include/thirdparty/boring.h | 31 ++ .../_include/thirdparty/rvv-rollback.S | 255 ++++++++++++ .../rvv_bench/ascii_to_utf16/ascii_to_utf16.S | 68 ++++ .../rvv_bench/ascii_to_utf16/ascii_to_utf16.c | 63 +++ .../rvv_bench/ascii_to_utf32/ascii_to_utf32.S | 66 +++ .../rvv_bench/ascii_to_utf32/ascii_to_utf32.c | 63 +++ tests/rvv_bench/byteswap/byteswap.S | 81 ++++ tests/rvv_bench/byteswap/byteswap.c | 79 ++++ tests/rvv_bench/chacha20/chacha20.S | 5 + tests/rvv_bench/chacha20/chacha20.c | 61 +++ tests/rvv_bench/default.nix | 32 ++ tests/rvv_bench/mandelbrot/isFp | 0 tests/rvv_bench/mandelbrot/mandelbrot.S | 358 ++++++++++++++++ tests/rvv_bench/mandelbrot/mandelbrot.c | 94 +++++ tests/rvv_bench/memcpy/memcpy.S | 153 +++++++ tests/rvv_bench/memcpy/memcpy.c | 197 +++++++++ tests/rvv_bench/memset/memset.S | 96 +++++ tests/rvv_bench/memset/memset.c | 163 ++++++++ tests/rvv_bench/mergelines/mergelines.S | 179 ++++++++ tests/rvv_bench/mergelines/mergelines.c | 75 ++++ tests/rvv_bench/poly1305/poly1305.S | 5 + tests/rvv_bench/poly1305/poly1305.c | 64 +++ tests/rvv_bench/strlen/strlen.S | 91 +++++ tests/rvv_bench/strlen/strlen.c | 76 ++++ tests/rvv_bench/utf8_count/utf8_count.S | 213 ++++++++++ tests/rvv_bench/utf8_count/utf8_count.c | 135 ++++++ 32 files changed, 3443 insertions(+), 1 deletion(-) create mode 100644 tests/rvv_bench/_include/bench.h create mode 100644 tests/rvv_bench/_include/config.h create mode 100644 tests/rvv_bench/_include/nolibc.h create mode 100644 tests/rvv_bench/_include/template.S create mode 100644 tests/rvv_bench/_include/thirdparty/boring.c create mode 100644 tests/rvv_bench/_include/thirdparty/boring.h create mode 100644 tests/rvv_bench/_include/thirdparty/rvv-rollback.S create mode 100644 tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.S create mode 100644 tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.c create mode 100644 tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.S create mode 100644 tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.c create mode 100644 tests/rvv_bench/byteswap/byteswap.S create mode 100644 tests/rvv_bench/byteswap/byteswap.c create mode 100644 tests/rvv_bench/chacha20/chacha20.S create mode 100644 tests/rvv_bench/chacha20/chacha20.c create mode 100644 tests/rvv_bench/default.nix create mode 100644 tests/rvv_bench/mandelbrot/isFp create mode 100644 tests/rvv_bench/mandelbrot/mandelbrot.S create mode 100644 tests/rvv_bench/mandelbrot/mandelbrot.c create mode 100644 tests/rvv_bench/memcpy/memcpy.S create mode 100644 tests/rvv_bench/memcpy/memcpy.c create mode 100644 tests/rvv_bench/memset/memset.S create mode 100644 tests/rvv_bench/memset/memset.c create mode 100644 tests/rvv_bench/mergelines/mergelines.S create mode 100644 tests/rvv_bench/mergelines/mergelines.c create mode 100644 tests/rvv_bench/poly1305/poly1305.S create mode 100644 tests/rvv_bench/poly1305/poly1305.c create mode 100644 tests/rvv_bench/strlen/strlen.S create mode 100644 tests/rvv_bench/strlen/strlen.c create mode 100644 tests/rvv_bench/utf8_count/utf8_count.S create mode 100644 tests/rvv_bench/utf8_count/utf8_count.c diff --git 
a/tests/default.nix b/tests/default.nix index 027526d3a..f0fb7fa5a 100644 --- a/tests/default.nix +++ b/tests/default.nix @@ -46,11 +46,12 @@ let asm = casesSelf.callPackage ./asm { }; perf = casesSelf.callPackage ./perf { }; codegen = casesSelf.callPackage ./codegen { }; + rvv_bench = casesSelf.callPackage ./rvv_bench { }; })); # remove non-case attributes in scope scopeStripped = { - inherit (scope) mlir intrinsic asm perf codegen; + inherit (scope) mlir intrinsic asm perf codegen rvv_bench; }; all = diff --git a/tests/rvv_bench/_include/bench.h b/tests/rvv_bench/_include/bench.h new file mode 100644 index 000000000..126346d4a --- /dev/null +++ b/tests/rvv_bench/_include/bench.h @@ -0,0 +1,170 @@ +#include "config.h" +#include "nolibc.h" + +#ifndef BENCH_NEXT + #define BENCH_NEXT NEXT +#endif + +#define MX(f, F) f(F##_m1) f(F##_m2) f(F##_m4) f(F##_m8) +#define STR(x) STR_(x) +#define STR_(x) #x + +#define ROTL(x, n) (((x) << (n)) | ((x) >> (8 * sizeof(x) - (n)))) + +#if defined(__clang__) || defined(__GNUC__) || defined(__INTEL_COMPILER) + +#define BENCH_CLOBBER() ({ __asm volatile("" ::: "memory"); }) +#define BENCH_VOLATILE(x) \ + ({ __asm volatile("" : "+g"(x) : "g"(x) : "memory"); }) +#define BENCH_VOLATILE_REG(x) \ + ({ __asm volatile("" : "+r"(x) : "r"(x) : "memory"); }) +#define BENCH_VOLATILE_MEM(x) \ + ({ __asm volatile("" : "+m"(x) : "m"(x) : "memory"); }) +#define BENCH_FENCE() ({ __asm volatile("fence.i"); }) + +#define BENCH_MAY_ALIAS __attribute__((__may_alias__)) + +#else + +#define BENCH_CLOBBER() +#define BENCH_CLOBBER_WITH(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) +#define BENCH_CLOBBER_WITH_REG(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) +#define BENCH_CLOBBER_WITH_MEM(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) +static void bench_use_ptr(char const volatile *x) {} + +#define BENCH_MAY_ALIAS + +#endif + +static int compare_ux(void const *a, void const *b) { + ux A = *(ux *)a, B = *(ux *)b; + return A < B ? -1 : A > B ? 
1 : 0; +} + +typedef struct { + ux x, y, z; +} RandState; +static RandState randState = {123, 456, 789}; + +/* RomuDuoJr, see https://romu-random.org/ */ +static ux urand(void) { + ux xp = randState.x, yp = randState.y, zp = randState.z; + randState.x = 3323815723u * zp; + randState.y = ROTL(yp - xp, 6); + randState.z = ROTL(zp - yp, 22); + return xp; +} + +typedef struct { + char const *name; + void *func; +} Impl; +typedef struct { + size_t N; + char const *name; + ux (*func)(void *, size_t); +} Bench; + +static unsigned char *mem = 0; + +void bench_main(void); +ux checksum(size_t n); +void init(void); + +static void memrand(void *ptr, size_t n) { + unsigned char *p = ptr; +#ifdef __GNUC__ + typedef ux __attribute__((__may_alias__)) uxa; + for (; n && (uintptr_t)p % sizeof(uxa); --n) + *p++ = urand(); + uxa *px = (uxa *)p; + for (; n > sizeof(ux); n -= sizeof(ux)) + *px++ = urand(); + p = (unsigned char *)px; +#endif + while (n--) + *p++ = urand(); +} + +#if __STDC_HOSTED__ +#include <stdlib.h> +#else +static ux heap[1 + MAX_MEM / sizeof(ux)]; +#endif + +int test(void) { + +#if __STDC_HOSTED__ + mem = malloc(MAX_MEM); +#else + mem = (unsigned char *)heap; +#endif + + size_t x; + randState.x ^= rv_cycles() * 7; + randState.y += rv_cycles() ^ (uintptr_t)&x + 666 * (uintptr_t)mem; + + /* initialize memory */ + memrand(mem, MAX_MEM); + + init(); + bench_main(); +#if __STDC_HOSTED__ + free(mem); +#endif + return 0; +} + +static fx bench_time(size_t n, Impl impl, Bench bench) { + static ux arr[MAX_REPEATS]; + size_t total = 0, repeats = 0; + for (; repeats < MAX_REPEATS; ++repeats) { + total += arr[repeats] = bench.func(impl.func, n); + if (repeats > MIN_REPEATS && total > STOP_CYCLES) + break; + } +#if MAX_REPEATS > 4 + qsort(arr, repeats, sizeof *arr, compare_ux); + ux sum = 0, count = 0; + for (size_t i = repeats * 0.2f; i < repeats * 0.8f; ++i, ++count) + sum += arr[i]; +#else + ux sum = 0, count = repeats; + for (size_t i = 0; i < repeats; ++i) + sum += arr[i]; +#endif + return n / ((fx)sum / count); +} + +static void bench_run(size_t nImpls, Impl *impls, size_t nBenches, + Bench *benches) { + for (Bench *b = benches; b != benches + nBenches; ++b) { + size_t N = b->N; + for (Impl *i = impls; i != impls + nImpls; ++i) { + printf("["); + for (size_t n = 1; n < N; n = BENCH_NEXT(n)) { + ux si = 0, s0 = 0; + printf("%f, ", bench_time(n, *i, *b)); + } + printf("],\n"); + } + printf("]\n},\n"); + } +} + +#define TIME \ + for (ux beg = rv_cycles(), _once = 1; _once; \ + BENCH_FENCE(), _cycles += rv_cycles() - beg, _once = 0) + +#define BENCH(name) \ + ux bench_##name(void *_func, size_t n) { \ + Func *f = _func; \ + ux _cycles = 0; +#define BENCH_END \ + return _cycles; \ + } + +#define BENCH_MAIN(impls, benches) \ + void bench_main(void) { \ + bench_run(ARR_LEN(impls), impls, ARR_LEN(benches), benches); \ + } diff --git a/tests/rvv_bench/_include/config.h b/tests/rvv_bench/_include/config.h new file mode 100644 index 000000000..44f1009b0 --- /dev/null +++ b/tests/rvv_bench/_include/config.h @@ -0,0 +1,25 @@ +/* processor specific configs */ +#define HAS_E64 (__riscv_v_elen >= 64) +#define HAS_F16 0 + +/* the maximum number of bytes to allocate, minimum of 4096 */ +#define MAX_MEM (4096 * 8) +/* the byte count for the next run */ +#define NEXT(c) (c + c / 3 + 3) + +/* minimum number of repeats, to sample median from */ +#define MIN_REPEATS 1 +/* maximum number of repeats, executed until more than STOP_CYCLES has elapsed */ +#define MAX_REPEATS 1 + +/* stop repeats early after this many cycles have elapsed */
+#define STOP_CYCLES (1024 * 1024 * 500) + +/* custom scaling factors for benchmarks, these are used to make sure each + * benchmark approximately takes the same amount of time. */ + +#define SCALE_mandelbrot(N) ((N) / 10) +#define SCALE_mergelines(N) ((N) / 10) + +/* benchmark specific configurations */ +#define mandelbrot_ITER 100 diff --git a/tests/rvv_bench/_include/nolibc.h b/tests/rvv_bench/_include/nolibc.h new file mode 100644 index 000000000..88f31d136 --- /dev/null +++ b/tests/rvv_bench/_include/nolibc.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#if __riscv_xlen == 32 +typedef uint32_t ux; +typedef float fx; +#define IF64(...) +#elif __riscv_xlen == 64 +typedef uint64_t ux; +typedef double fx; +#define IF64(...) __VA_ARGS__ +#else +#error "unsupported XLEN" +#endif +#define ARR_LEN(x) (sizeof x / sizeof *(x)) + +static void memwrite(void const *ptr, size_t len) { + fwrite(ptr, 1, len, stdout); +} + +static size_t memread(void *ptr, size_t len) { + return fread(ptr, 1, len, stdin); +} + +static inline ux rv_cycles(void) { + ux cycle; + __asm volatile("csrr %0, mcycle" : "=r"(cycle)); + return cycle; +} + +static void memswap(void *a, void *b, size_t size) { + unsigned char *A = (unsigned char *)a, *B = (unsigned char *)b; + unsigned char *aEnd = A + size; + while (A < aEnd) { + unsigned char temp = *A; + *A++ = *B; + *B++ = temp; + } +} + +static ux usqrt(ux y) { + ux L = 0, R = y + 1; + while (L != R - 1) { + ux M = (L + R) / 2; + if (M * M <= y) + L = M; + else + R = M; + } + return L; +} + +static ux uhash(ux x) { +#if __riscv_xlen == 32 + /* MurmurHash3 32-bit finalizer */ + x ^= x >> 16; + x *= 0x85ebca6b; + x ^= x >> 13; + x *= 0xc2b2ae35; + x ^= x >> 16; +#else + /* splitmix64 finalizer */ + x ^= x >> 30; + x *= 0xbf58476d1ce4e5b9U; + x ^= x >> 27; + x *= 0x94d049bb133111ebU; + x ^= x >> 31; +#endif + return x; +} + +#define IFHOSTED(...) 
__VA_ARGS__ diff --git a/tests/rvv_bench/_include/template.S b/tests/rvv_bench/_include/template.S new file mode 100644 index 000000000..eabdd5017 --- /dev/null +++ b/tests/rvv_bench/_include/template.S @@ -0,0 +1,80 @@ +#define HAS_RVV_1_0 1 +#include "config.h" +.text +.balign 8 + +#define CAT_(a,b) a##b +#define CAT(a,b) CAT_(a,b) + +#define STR(x) #x +#define STRe(x) STR(x) + +#define MX_N 0 +#include STRe(INC) + +#undef MX_N + +#define MX_N 1 +#define MX8(x) x##m8 +#define MX4(x) x##m4 +#define MX2(x) x##m2 +#define MX(x) x##m1 +#if HAS_RVV_1_0 +#define MXf2(x) x##mf2 +#define MXf4(x) x##mf4 +# define MXf8(x) x##mf8 +#endif +#include STRe(INC) + +#undef MX_N +#undef MX8 +#undef MX4 +#undef MX2 +#undef MX +#undef MXf2 +#undef MXf4 +#undef MXf8 + +#define MX_N 2 +#define MX4(x) x##m8 +#define MX2(x) x##m4 +#define MX(x) x##m2 +#define MXf2(x) x##m1 +#if HAS_RVV_1_0 +#define MXf4(x) x##mf2 +# define MXf8(x) x##mf4 +#endif +#include STRe(INC) + +#undef MX_N +#undef MX4 +#undef MX2 +#undef MX +#undef MXf2 +#undef MXf4 +#undef MXf8 + +#define MX_N 4 +#define MX2(x) x##m8 +#define MX(x) x##m4 +#define MXf2(x) x##m2 +#define MXf4(x) x##m1 +#if HAS_RVV_1_0 +# define MXf8(x) x##mf2 +#endif +#include STRe(INC) + +#undef MX_N +#undef MX2 +#undef MX +#undef MXf2 +#undef MXf4 +#undef MXf8 + +#define MX_N 8 +#define MX(x) x##m8 +#define MXf2(x) x##m4 +#define MXf4(x) x##m2 +#define MXf8(x) x##m1 +#include STRe(INC) + diff --git a/tests/rvv_bench/_include/thirdparty/boring.c b/tests/rvv_bench/_include/thirdparty/boring.c new file mode 100644 index 000000000..e7cea237e --- /dev/null +++ b/tests/rvv_bench/_include/thirdparty/boring.c @@ -0,0 +1,383 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Adapted from the public domain, estream code by D. Bernstein. + +#include "boring.h" + + +extern void *memcpy(void *restrict dest, void const *restrict src, size_t n); + +#define U8TO32_LITTLE(p) \ + (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +// sigma contains the ChaCha constants, which happen to be an ASCII string. +static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', + '2', '-', 'b', 'y', 't', 'e', ' ', 'k' }; + +#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) + +// QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. 
+#define QUARTERROUND(a, b, c, d) \ + x[a] += x[b]; x[d] = ROTATE(x[d] ^ x[a], 16); \ + x[c] += x[d]; x[b] = ROTATE(x[b] ^ x[c], 12); \ + x[a] += x[b]; x[d] = ROTATE(x[d] ^ x[a], 8); \ + x[c] += x[d]; x[b] = ROTATE(x[b] ^ x[c], 7); + +#define U32TO8_LITTLE(p, v) \ + { \ + (p)[0] = (v >> 0) & 0xff; \ + (p)[1] = (v >> 8) & 0xff; \ + (p)[2] = (v >> 16) & 0xff; \ + (p)[3] = (v >> 24) & 0xff; \ + } + +// chacha_core performs 20 rounds of ChaCha on the input words in +// |input| and writes the 64 output bytes to |output|. +static void chacha_core(uint8_t output[64], const uint32_t input[16]) { + uint32_t x[16]; + int i; + + memcpy(x, input, sizeof(uint32_t) * 16); + for (i = 20; i > 0; i -= 2) { + QUARTERROUND(0, 4, 8, 12) + QUARTERROUND(1, 5, 9, 13) + QUARTERROUND(2, 6, 10, 14) + QUARTERROUND(3, 7, 11, 15) + QUARTERROUND(0, 5, 10, 15) + QUARTERROUND(1, 6, 11, 12) + QUARTERROUND(2, 7, 8, 13) + QUARTERROUND(3, 4, 9, 14) + } + + for (i = 0; i < 16; ++i) { + x[i] += input[i]; + } + for (i = 0; i < 16; ++i) { + U32TO8_LITTLE(output + 4 * i, x[i]); + } +} + +void boring_chacha20(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[12], + uint32_t counter) { + + uint32_t input[16]; + uint8_t buf[64]; + size_t todo, i; + + input[0] = U8TO32_LITTLE(sigma + 0); + input[1] = U8TO32_LITTLE(sigma + 4); + input[2] = U8TO32_LITTLE(sigma + 8); + input[3] = U8TO32_LITTLE(sigma + 12); + + input[4] = U8TO32_LITTLE(key + 0); + input[5] = U8TO32_LITTLE(key + 4); + input[6] = U8TO32_LITTLE(key + 8); + input[7] = U8TO32_LITTLE(key + 12); + + input[8] = U8TO32_LITTLE(key + 16); + input[9] = U8TO32_LITTLE(key + 20); + input[10] = U8TO32_LITTLE(key + 24); + input[11] = U8TO32_LITTLE(key + 28); + + input[12] = counter; + input[13] = U8TO32_LITTLE(nonce + 0); + input[14] = U8TO32_LITTLE(nonce + 4); + input[15] = U8TO32_LITTLE(nonce + 8); + + while (in_len > 0) { + todo = sizeof(buf); + if (in_len < todo) { + todo = in_len; + } + + chacha_core(buf, input); + for (i = 0; i < todo; i++) { + out[i] = in[i] ^ buf[i]; + } + + out += todo; + in += todo; + in_len -= todo; + + input[12]++; + } +} + +///// poly1305 + +static uint32_t U8TO32_LE(const uint8_t *m) { + uint32_t r; + memcpy(&r, m, sizeof(r)); + return r; +} + +static void U32TO8_LE(uint8_t *m, uint32_t v) { + memcpy(m, &v, sizeof(v)); +} + + +static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } + +struct poly1305_state_st { + uint32_t r0, r1, r2, r3, r4; + uint32_t s1, s2, s3, s4; + uint32_t h0, h1, h2, h3, h4; + uint8_t buf[16]; + unsigned int buf_used; + uint8_t key[16]; +}; + +static inline struct poly1305_state_st *poly1305_aligned_state( + poly1305_state *state) { + return (struct poly1305_state_st *)(((uintptr_t)state + 63) & ~63); +} + +static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t len) { + uint32_t t0, t1, t2, t3; + uint64_t t[5]; + uint32_t b; + uint64_t c; + size_t j; + uint8_t mp[16]; + + if (len < 16) { + goto poly1305_donna_atmost15bytes; + } + + poly1305_donna_16bytes: + t0 = U8TO32_LE(in); + t1 = U8TO32_LE(in + 4); + t2 = U8TO32_LE(in + 8); + t3 = U8TO32_LE(in + 12); + + in += 16; + len -= 16; + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8) | (1 << 24); + + poly1305_donna_mul: + t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, 
state->s4) + + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + + mul32x32_64(state->h4, state->s1); + t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + + mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + + mul32x32_64(state->h4, state->s2); + t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + + mul32x32_64(state->h4, state->s3); + t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + + mul32x32_64(state->h4, state->s4); + t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + + mul32x32_64(state->h4, state->r0); + + state->h0 = (uint32_t)t[0] & 0x3ffffff; + c = (t[0] >> 26); + t[1] += c; + state->h1 = (uint32_t)t[1] & 0x3ffffff; + b = (uint32_t)(t[1] >> 26); + t[2] += b; + state->h2 = (uint32_t)t[2] & 0x3ffffff; + b = (uint32_t)(t[2] >> 26); + t[3] += b; + state->h3 = (uint32_t)t[3] & 0x3ffffff; + b = (uint32_t)(t[3] >> 26); + t[4] += b; + state->h4 = (uint32_t)t[4] & 0x3ffffff; + b = (uint32_t)(t[4] >> 26); + state->h0 += b * 5; + + if (len >= 16) { + goto poly1305_donna_16bytes; + } + + // final bytes + poly1305_donna_atmost15bytes: + if (!len) { + return; + } + + for (j = 0; j < len; j++) { + mp[j] = in[j]; + } + mp[j++] = 1; + for (; j < 16; j++) { + mp[j] = 0; + } + len = 0; + + t0 = U8TO32_LE(mp + 0); + t1 = U8TO32_LE(mp + 4); + t2 = U8TO32_LE(mp + 8); + t3 = U8TO32_LE(mp + 12); + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8); + + goto poly1305_donna_mul; +} + +void boring_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + uint32_t t0, t1, t2, t3; + + t0 = U8TO32_LE(key + 0); + t1 = U8TO32_LE(key + 4); + t2 = U8TO32_LE(key + 8); + t3 = U8TO32_LE(key + 12); + + // precompute multipliers + state->r0 = t0 & 0x3ffffff; + t0 >>= 26; + t0 |= t1 << 6; + state->r1 = t0 & 0x3ffff03; + t1 >>= 20; + t1 |= t2 << 12; + state->r2 = t1 & 0x3ffc0ff; + t2 >>= 14; + t2 |= t3 << 18; + state->r3 = t2 & 0x3f03fff; + t3 >>= 8; + state->r4 = t3 & 0x00fffff; + + state->s1 = state->r1 * 5; + state->s2 = state->r2 * 5; + state->s3 = state->r3 * 5; + state->s4 = state->r4 * 5; + + // init state + state->h0 = 0; + state->h1 = 0; + state->h2 = 0; + state->h3 = 0; + state->h4 = 0; + + state->buf_used = 0; + memcpy(state->key, key + 16, sizeof(state->key)); +} + +void boring_poly1305_update(poly1305_state *statep, const uint8_t *in, + size_t in_len) { + unsigned int i; + struct poly1305_state_st *state = poly1305_aligned_state(statep); + + if (state->buf_used) { + unsigned todo = 16 - state->buf_used; + if (todo > in_len) { + todo = (unsigned)in_len; + } + for (i = 0; i < todo; i++) { + state->buf[state->buf_used + i] = in[i]; + } + state->buf_used += todo; + in_len -= todo; + in += todo; + + if (state->buf_used == 16) { + poly1305_update(state, state->buf, 16); + state->buf_used = 0; + } + } + + if (in_len >= 16) { + size_t todo = in_len & ~0xf; + poly1305_update(state, in, todo); + in += todo; + in_len &= 0xf; + } + + if (in_len) { + for (i = 0; i < in_len; i++) { + 
state->buf[i] = in[i]; + } + state->buf_used = (unsigned)in_len; + } +} + +void boring_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + uint64_t f0, f1, f2, f3; + uint32_t g0, g1, g2, g3, g4; + uint32_t b, nb; + + if (state->buf_used) { + poly1305_update(state, state->buf, state->buf_used); + } + + b = state->h0 >> 26; + state->h0 = state->h0 & 0x3ffffff; + state->h1 += b; + b = state->h1 >> 26; + state->h1 = state->h1 & 0x3ffffff; + state->h2 += b; + b = state->h2 >> 26; + state->h2 = state->h2 & 0x3ffffff; + state->h3 += b; + b = state->h3 >> 26; + state->h3 = state->h3 & 0x3ffffff; + state->h4 += b; + b = state->h4 >> 26; + state->h4 = state->h4 & 0x3ffffff; + state->h0 += b * 5; + + g0 = state->h0 + 5; + b = g0 >> 26; + g0 &= 0x3ffffff; + g1 = state->h1 + b; + b = g1 >> 26; + g1 &= 0x3ffffff; + g2 = state->h2 + b; + b = g2 >> 26; + g2 &= 0x3ffffff; + g3 = state->h3 + b; + b = g3 >> 26; + g3 &= 0x3ffffff; + g4 = state->h4 + b - (1 << 26); + + b = (g4 >> 31) - 1; + nb = ~b; + state->h0 = (state->h0 & nb) | (g0 & b); + state->h1 = (state->h1 & nb) | (g1 & b); + state->h2 = (state->h2 & nb) | (g2 & b); + state->h3 = (state->h3 & nb) | (g3 & b); + state->h4 = (state->h4 & nb) | (g4 & b); + + f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); + f1 = ((state->h1 >> 6) | (state->h2 << 20)) + + (uint64_t)U8TO32_LE(&state->key[4]); + f2 = ((state->h2 >> 12) | (state->h3 << 14)) + + (uint64_t)U8TO32_LE(&state->key[8]); + f3 = ((state->h3 >> 18) | (state->h4 << 8)) + + (uint64_t)U8TO32_LE(&state->key[12]); + + U32TO8_LE(&mac[0], f0); + f1 += (f0 >> 32); + U32TO8_LE(&mac[4], f1); + f2 += (f1 >> 32); + U32TO8_LE(&mac[8], f2); + f3 += (f2 >> 32); + U32TO8_LE(&mac[12], f3); +} diff --git a/tests/rvv_bench/_include/thirdparty/boring.h b/tests/rvv_bench/_include/thirdparty/boring.h new file mode 100644 index 000000000..3fb2300b6 --- /dev/null +++ b/tests/rvv_bench/_include/thirdparty/boring.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#include +#include + +void boring_chacha20(uint8_t *out, const uint8_t *in, + size_t in_len, const uint8_t key[32], + const uint8_t nonce[12], uint32_t counter); + +typedef uint8_t poly1305_state[512]; + +void boring_poly1305_init(poly1305_state *state, + const uint8_t key[32]); + +void boring_poly1305_update(poly1305_state *state, + const uint8_t *in, size_t in_len); + +void boring_poly1305_finish(poly1305_state *state, + uint8_t mac[16]); diff --git a/tests/rvv_bench/_include/thirdparty/rvv-rollback.S b/tests/rvv_bench/_include/thirdparty/rvv-rollback.S new file mode 100644 index 000000000..e941604bb --- /dev/null +++ b/tests/rvv_bench/_include/thirdparty/rvv-rollback.S @@ -0,0 +1,255 @@ +# rvv-rollback.S -- A minimal benchmarking library +# Olaf Bernstein +# Distributed under the MIT license, see license at the end of the file. +# New versions available at https://gist.github.com/camel-cdr/cfd9ba2b8754b521edf4892fe19c7031 +# Conversions taken from https://github.com/RISCVtestbed/rvv-rollback + +.macro vle32.v a:vararg + vlw.v \a +.endm +.macro vle16.v a:vararg + vlh.v \a +.endm +.macro vle8.v a:vararg + vlb.v \a +.endm +.macro vle32ff.v a:vararg + vlwff.v \a +.endm +.macro vle16ff.v a:vararg + vlhff.v \a +.endm +.macro vle8ff.v a:vararg + vlbff.v \a +.endm +.macro vse32.v a:vararg + vsw.v \a +.endm +.macro vse16.v a:vararg + vsh.v \a +.endm +.macro vse8.v a:vararg + vsb.v \a +.endm +.macro vluxei32.v a:vararg + vlxw.v \a +.endm +.macro vluxei16.v a:vararg + vlxh.v \a +.endm +.macro vluxei8.v a:vararg + vlxb.v \a +.endm +.macro vsuxei32.v a:vararg + vsuxw.v \a +.endm +.macro vsuxei16.v a:vararg + vsuxh.v \a +.endm +.macro vsuxei8.v a:vararg + vsuxb.v \a +.endm +.macro vlse32.v a:vararg + vlsw.v \a +.endm +.macro vlse16.v a:vararg + vlsh.v \a +.endm +.macro vlse8.v a:vararg + vlsb.v \a +.endm +.macro vsse32.v a:vararg + vssw.v \a +.endm +.macro vsse16.v a:vararg + vssh.v \a +.endm +.macro vsse8.v a:vararg + vssb.v \a +.endm +.macro vloxei32.v a:vararg + vlxw.v \a +.endm +.macro vloxei16.v a:vararg + vlxh.v \a +.endm +.macro vloxei8.v a:vararg + vlxb.v \a +.endm +.macro vsoxei32.v a:vararg + vsxw.v \a +.endm +.macro vsoxei16.v a:vararg + vsxh.v \a +.endm +.macro vsoxei8.v a:vararg + vsxb.v \a +.endm +.macro vfncvt.xu.f.w a:vararg + vfncvt.xu.f.v \a +.endm +.macro vfncvt.x.f.w a:vararg + vfncvt.x.f.v \a +.endm +.macro vfncvt.f.xu.w a:vararg + vfncvt.f.xu.v \a +.endm +.macro vfncvt.f.x.w a:vararg + vfncvt.f.x.v \a +.endm +.macro vfncvt.f.f.w a:vararg + vfncvt.f.f.v \a +.endm +.macro vfredusum a:vararg + vfredsum \a +.endm +.macro vfwredusum.vs a:vararg + vfwredsum.vs \a +.endm +.macro vnclip.wv a:vararg + vnclip.vv \a +.endm +.macro vnclip.wx a:vararg + vnclip.vx \a +.endm +.macro vnclip.wi a:vararg + vnclip.vi \a +.endm +.macro vnclipu.wv a:vararg + vnclipu.vv \a +.endm +.macro vnclipu.wx a:vararg + vnclipu.vx \a +.endm +.macro vnclipu.wi a:vararg + vnclipu.vi \a +.endm +.macro vnsra.wv a:vararg + vnsra.vv \a +.endm +.macro vnsra.wx a:vararg + vnsra.vx \a +.endm +.macro vnsra.wi a:vararg + vnsra.vi \a +.endm +.macro vnsrl.wv a:vararg + vnsrl.vv \a +.endm +.macro vnsrl.wx a:vararg + vnsrl.vx \a +.endm +.macro vnsrl.wi a:vararg + vnsrl.vi \a +.endm +.macro vmandn.mm a:vararg + vmandnot.mm \a +.endm +.macro vmorn.mm a:vararg + vmornot.mm \a +.endm +.macro vmmv.m a:vararg + vmcpy.m \a +.endm +.macro vcpop.m a:vararg + vmpopc.m \a +.endm +.macro vpop.m a:vararg + vmpopc.m \a +.endm +.macro vfirst.m a:vararg + vmfirst.m \a +.endm + +.macro define_for_all_nf prefix suffix prefix2 suffix2 + 
.macro \prefix\()2\suffix a:vararg + \prefix2\()2\suffix2 \a + .endm + .macro \prefix\()3\suffix a:vararg + \prefix2\()3\suffix2 \a + .endm + .macro \prefix\()4\suffix a:vararg + \prefix2\()4\suffix2 \a + .endm + .macro \prefix\()5\suffix a:vararg + \prefix2\()5\suffix2 \a + .endm + .macro \prefix\()6\suffix a:vararg + \prefix2\()6\suffix2 \a + .endm + .macro \prefix\()7\suffix a:vararg + \prefix2\()7\suffix2 \a + .endm + .macro \prefix\()8\suffix a:vararg + \prefix2\()8\suffix2 \a + .endm +.endm +define_for_all_nf vlseg e8.v vlseg b.v +define_for_all_nf vlseg e16.v vlseg h.v +define_for_all_nf vlseg e32.v vlseg w.v + +define_for_all_nf vsseg e8.v vsseg b.v +define_for_all_nf vsseg e16.v vsseg h.v +define_for_all_nf vsseg e32.v vsseg w.v + +define_for_all_nf vlsseg e8.v vlsseg bu.v +define_for_all_nf vlsseg e16.v vlsseg hu.v +define_for_all_nf vlsseg e32.v vlsseg wu.v + +define_for_all_nf vssseg e8.v vssseg b.v +define_for_all_nf vssseg e16.v vssseg h.v +define_for_all_nf vssseg e32.v vssseg w.v + +define_for_all_nf vloxseg e8.v vlxseg b.v +define_for_all_nf vloxseg e16.v vlxseg h.v +define_for_all_nf vloxseg e32.v vlxseg w.v +define_for_all_nf vluxseg e8.v vlxseg b.v +define_for_all_nf vluxseg e16.v vlxseg h.v +define_for_all_nf vluxseg e32.v vlxseg w.v + +define_for_all_nf vsoxseg e8.v vsxseg b.v +define_for_all_nf vsoxseg e16.v vsxseg h.v +define_for_all_nf vsoxseg e32.v vsxseg w.v +define_for_all_nf vsuxseg e8.v vsxseg b.v +define_for_all_nf vsuxseg e16.v vsxseg h.v +define_for_all_nf vsuxseg e32.v vsxseg w.v + + +.macro vsetvl0p7 rd, rs1, rs2, T=1, M=1 + vsetvl \rd, \rs1, \rs2 +.endm +.macro vsetvli0p7 rd, rs1, e=e8, m=m1, T=1, M=1 + .ifc \m, mf2 + NOT SUPPORTED IN rvv0.7 + .endif + .ifc \m, mf4 + NOT SUPPORTED IN rvv0.7 + .endif + .ifc \m, mf8 + NOT SUPPORTED IN rvv0.7 + .endif + vsetvli \rd, \rs1, \e, \m +.endm + +#define vsetvl vsetvl0p7 +#define vsetvli vsetvli0p7 + + + +# Copyright (c) 2023 Olaf Berstein +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
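Note (illustration only, not part of the diff): the _include machinery above is easiest to see with a small sketch. template.S preprocesses the benchmark file named by the INC macro five times: once with MX undefined (MX_N = 0), for code that must be emitted only once (e.g. chacha20.S below), and once per LMUL (MX_N = 1, 2, 4, 8), where MX() appends the LMUL suffix of the current pass, MX2() the doubled one, and MXf2() the halved one where representable; rvv-rollback.S additionally maps RVV 1.0 mnemonics back to their 0.7 spellings for pre-1.0 targets. The kernel below is hypothetical (the name copy_example_rvv_ is illustrative; it mirrors MX(memcpy_rvv_) later in this patch) and is written once, then stamped out for every LMUL:

#ifdef MX
# a0 = dest, a1 = src, a2 = len in bytes
.global MX(copy_example_rvv_)          # expands to copy_example_rvv_m1/_m2/_m4/_m8
MX(copy_example_rvv_):
1:
	vsetvli t0, a2, e8, MX(), ta, ma   # MX() expands to the LMUL of the current pass
	vle8.v v0, (a1)
	vse8.v v0, (a0)
	add a1, a1, t0
	sub a2, a2, t0
	add a0, a0, t0
	bnez a2, 1b
	ret
#endif

The matching .c harness then declares the generated symbols (copy_example_rvv_m1 through _m8) in its IMPLS() list and registers them in Impl impls[], exactly as the ascii_to_utf16 and memcpy benchmarks below do; default.nix drives the instantiation by preprocessing template.S with -DINC pointing at the kernel source and assembling the result as functions.S.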
+ diff --git a/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.S b/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.S new file mode 100644 index 000000000..b363d7830 --- /dev/null +++ b/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.S @@ -0,0 +1,68 @@ +#ifdef MX + +#if MX_N == 4 || MX_N == 2 || MX_N == 1 + +.global MX(ascii_to_utf16_rvv_vsseg_) +.type MX(ascii_to_utf16_rvv_vsseg_), @function +MX(ascii_to_utf16_rvv_vsseg_): + vsetvli t0, x0, e8, MX2(), ta, ma + vmv.v.i v0, 0 +1: + vsetvli t0, a2, e8, MX(), ta, ma + vle8.v v0, (a1) + vsseg2e8.v v0, (a0) + add a1, a1, t0 + sub a2, a2, t0 + slli t0, t0, 1 + add a0, a0, t0 + bnez a2, 1b + ret + + + +.global MX(ascii_to_utf16_rvv_ext_) +.type MX(ascii_to_utf16_rvv_ext_), @function +MX(ascii_to_utf16_rvv_ext_): +1: + vsetvli t0, a2, e8, MX(), ta, ma + vle8.v v0, (a1) +#if HAS_RVV_1_0 + vsetvli x0, x0, e16, MX2(), ta, ma + vzext.vf2 v8, v0 +#else + vwaddu.vx v8, v0, x0 + vsetvli x0, a2, e16, MX2(), ta, ma +#endif + vse16.v v8, (a0) + add a1, a1, t0 + sub a2, a2, t0 + slli t0, t0, 1 + add a0, a0, t0 + bnez a2, 1b + ret + + +.global MX(ascii_to_utf16_rvv_vss_) +.type MX(ascii_to_utf16_rvv_vss_), @function +MX(ascii_to_utf16_rvv_vss_): + vsetvli t0, x0, e8, MX2(), ta, ma + vmv.v.i v0, 0 + li a3, 2 +1: + vsetvli t0, a2, e16, MX2(), ta, ma + vse16.v v0, (a0) + + vsetvli t0, a2, e8, MX(), ta, ma + vle8.v v8, (a1) + vsse8.v v8, (a0), a3 + + add a1, a1, t0 + sub a2, a2, t0 + slli t0, t0, 1 + add a0, a0, t0 + bnez a2, 1b + ret + +#endif +#endif + diff --git a/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.c b/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.c new file mode 100644 index 000000000..fc3fba747 --- /dev/null +++ b/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.c @@ -0,0 +1,63 @@ +#include "bench.h" + +void +ascii_to_utf16_scalar(uint16_t *restrict dest, uint8_t const *restrict src, size_t len) +{ + while (len--) *dest++ = *src++, BENCH_CLOBBER(); +} + +void +ascii_to_utf16_scalar_autovec(uint16_t *restrict dest, uint8_t const *restrict src, size_t len) +{ + while (len--) *dest++ = *src++; +} + +#define IMPLS(f) \ + f(scalar) f(scalar_autovec) \ + f(rvv_ext_m1) f(rvv_ext_m2) f(rvv_ext_m4) \ + f(rvv_vsseg_m1) f(rvv_vsseg_m2) f(rvv_vsseg_m4) \ + f(rvv_vss_m1) f(rvv_vss_m2) f(rvv_vss_m4) \ + +typedef void Func(uint16_t *restrict dest, uint8_t const *restrict src, size_t len); + +#define DECLARE(f) extern Func ascii_to_utf16_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &ascii_to_utf16_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +uint16_t *dest; +uint8_t *src; + +void init(void) { } + +ux checksum(size_t n) { + ux sum = 0; + for (size_t i = 0; i < n+9; ++i) + sum = uhash(sum) + dest[i]; + return sum; +} + +void common(size_t n, size_t dOff, size_t sOff) { + dest = (uint16_t*)mem + dOff/2; + src = (uint8_t*)(dest + 9 + MAX_MEM/3) + sOff; + memrand(src, n+9); + for (size_t i = 0; i < n+9; ++i) src[i] |= 0x7F; + memset(dest, 1, (n+9)*2); +} + +BENCH(base) { + common(n, urand() & 255, urand() & 255); + TIME f(dest, src, n); +} BENCH_END + +BENCH(aligned) { + common(n, 0, 0); + TIME f(dest, src, n); +} BENCH_END + +Bench benches[] = { + { MAX_MEM/3 - 512-9*2, "ascii to utf16", bench_base }, + { MAX_MEM/3 - 512-9*2, "ascii to utf16 aligned", bench_aligned }, +}; BENCH_MAIN(impls, benches) + diff --git a/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.S b/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.S new file mode 100644 index 000000000..9cf21fad3 --- /dev/null +++ b/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.S @@ -0,0 +1,66 @@ +#ifdef MX + +#if MX_N == 2 || 
MX_N == 1 + +.global MX(ascii_to_utf32_rvv_vsseg_) +MX(ascii_to_utf32_rvv_vsseg_): + vsetvli t0, x0, e8, MX4(), ta, ma + vmv.v.i v0, 0 +1: + vsetvli t0, a2, e8, MX(), ta, ma + vle8.v v0, (a1) + vsseg4e8.v v0, (a0) + add a1, a1, t0 + sub a2, a2, t0 + slli t0, t0, 2 + add a0, a0, t0 + bnez a2, 1b + ret + + +.global MX(ascii_to_utf32_rvv_ext_) +MX(ascii_to_utf32_rvv_ext_): +1: + vsetvli t0, a2, e8, MX(), ta, ma + vle8.v v0, (a1) +#if HAS_RVV_1_0 + vsetvli x0, x0, e32, MX4(), ta, ma + vzext.vf4 v8, v0 +#else + vwaddu.vx v16, v0, x0 + vsetvli x0, a2, e16, MX2(), ta, ma + vwaddu.vx v8, v16, x0 + vsetvli x0, a2, e32, MX4(), ta, ma +#endif + vse32.v v8, (a0) + add a1, a1, t0 + sub a2, a2, t0 + slli t0, t0, 2 + add a0, a0, t0 + bnez a2, 1b + ret + + +.global MX(ascii_to_utf32_rvv_vss_) +MX(ascii_to_utf32_rvv_vss_): + vsetvli t0, x0, e8, MX4(), ta, ma + vmv.v.i v0, 0 + li a3, 4 +1: + vsetvli t0, a2, e32, MX4(), ta, ma + vse32.v v0, (a0) + + vsetvli t0, a2, e8, MX(), ta, ma + vle8.v v8, (a1) + vsse8.v v8, (a0), a3 + + add a1, a1, t0 + sub a2, a2, t0 + slli t0, t0, 2 + add a0, a0, t0 + bnez a2, 1b + ret + +#endif +#endif + diff --git a/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.c b/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.c new file mode 100644 index 000000000..968493037 --- /dev/null +++ b/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.c @@ -0,0 +1,63 @@ +#include "bench.h" + +void +ascii_to_utf32_scalar(uint32_t *restrict dest, uint8_t const *restrict src, size_t len) +{ + while (len--) *dest++ = *src++, BENCH_CLOBBER(); +} + +void +ascii_to_utf32_scalar_autovec(uint32_t *restrict dest, uint8_t const *restrict src, size_t len) +{ + while (len--) *dest++ = *src++; +} + +#define IMPLS(f) \ + f(scalar) f(scalar_autovec) \ + f(rvv_ext_m1) f(rvv_ext_m2) \ + f(rvv_vsseg_m1) f(rvv_vsseg_m2) \ + f(rvv_vss_m1) f(rvv_vss_m2) \ + +typedef void Func(uint32_t *restrict dest, uint8_t const *restrict src, size_t len); + +#define DECLARE(f) extern Func ascii_to_utf32_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &ascii_to_utf32_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +uint32_t *dest; +uint8_t *src; + +void init(void) { } + +ux checksum(size_t n) { + ux sum = 0; + for (size_t i = 0; i < n+9; ++i) + sum = uhash(sum) + dest[i]; + return sum; +} + +void common(size_t n, size_t dOff, size_t sOff) { + dest = (uint32_t*)mem + dOff/4; + src = (uint8_t*)(dest + 9 + MAX_MEM/5) + sOff; + memrand(src, n+9); + for (size_t i = 0; i < n+9; ++i) src[i] |= 0x7F; + memset(dest, 1, (n+9)*4); +} + +BENCH(base) { + common(n, urand() & 255, urand() & 255); + TIME f(dest, src, n); +} BENCH_END + +BENCH(aligned) { + common(n, 0, 0); + TIME f(dest, src, n); +} BENCH_END + +Bench benches[] = { + { MAX_MEM/5 - 512-9*2, "ascii to utf32", bench_base }, + { MAX_MEM/5 - 512-9*2, "ascii to utf32 aligned", bench_aligned }, +}; BENCH_MAIN(impls, benches) + diff --git a/tests/rvv_bench/byteswap/byteswap.S b/tests/rvv_bench/byteswap/byteswap.S new file mode 100644 index 000000000..79154ef68 --- /dev/null +++ b/tests/rvv_bench/byteswap/byteswap.S @@ -0,0 +1,81 @@ +/* + * TODO: This currently only works for VLEN<=256. + * I think rvv 1.0 should only vrgatherei16.vv here in the future. 
+ */ + +#ifdef MX + + +# a0 = ptr, a1 = len +.global MX(byteswap32_rvv_gather_) +MX(byteswap32_rvv_gather_): + vsetvli t0, x0, e8, MX(), ta, ma + vid.v v0 + vand.vi v8, v0, 3 + vrsub.vi v8, v8, 3 + vsrl.vi v0, v0, 2 + vsll.vi v0, v0, 2 + vadd.vv v0, v0, v8 # i/8*8 + (7-1%8) +1: + vsetvli t0, a1, e32, MX(), ta, ma + vle32.v v8, (a0) + slli t1, t0, 2 + vsetvli x0, t1, e8, MX(), ta, ma + vrgather.vv v16, v8, v0 + vsetvli x0, t0, e32, MX(), ta, ma + vse32.v v16, (a0) + sub a1, a1, t0 + add a0, a0, t1 + bnez a1, 1b + ret +#endif + +#if MX_N == 2 + +.macro byteswap32_rvv_m1_gathers n + .global byteswap32_rvv_m1_gathers_m\n + byteswap32_rvv_m1_gathers_m\n: + vsetvli t0, x0, e8, m1, ta, ma + vid.v v0 + vand.vi v8, v0, 3 + vrsub.vi v8, v8, 3 + vsrl.vi v0, v0, 2 + vsll.vi v0, v0, 2 + vadd.vv v0, v0, v8 # i/8*8 + (7-1%8) + 1: + vsetvli t0, a1, e32, m\n, ta, ma + vle32.v v8, (a0) + vsetvli t1, x0, e8, m1, ta, ma + vrgather.vv v16, v8, v0 + .ifge \n-2 + vrgather.vv v17, v9, v0 + .ifge \n-4 + vrgather.vv v18, v10, v0 + vrgather.vv v19, v11, v0 + .ifge \n-8 + vrgather.vv v20, v12, v0 + vrgather.vv v21, v13, v0 + vrgather.vv v22, v14, v0 + vrgather.vv v23, v15, v0 + .endif + .endif + .endif + vsetvli x0, t0, e32, m\n, ta, ma + vse32.v v16, (a0) + sub a1, a1, t0 + slli t0, t0, 2 + add a0, a0, t0 + bnez a1, 1b + ret +.endm + +byteswap32_rvv_m1_gathers 2 +#endif +#if MX_N == 4 +byteswap32_rvv_m1_gathers 4 +#endif +#if MX_N == 8 +byteswap32_rvv_m1_gathers 8 +#endif + + diff --git a/tests/rvv_bench/byteswap/byteswap.c b/tests/rvv_bench/byteswap/byteswap.c new file mode 100644 index 000000000..dff204b72 --- /dev/null +++ b/tests/rvv_bench/byteswap/byteswap.c @@ -0,0 +1,79 @@ +#include "bench.h" + +void +byteswap32_scalar(uint32_t *ptr, size_t n) +{ + for (uint8_t *p = (uint8_t*)ptr; n--; p += 4) { + uint8_t p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3]; + p[3] = p0; BENCH_CLOBBER(); + p[2] = p1; BENCH_CLOBBER(); + p[1] = p2; BENCH_CLOBBER(); + p[0] = p3; BENCH_CLOBBER(); + } +} + +void +byteswap32_scalar_autovec(uint32_t *ptr, size_t n) +{ + for (uint8_t *p = (uint8_t*)ptr; n--; p += 4) { + uint8_t p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3]; + p[3] = p0; + p[2] = p1; + p[1] = p2; + p[0] = p3; + } +} + +#if __riscv_zbb +void +byteswap32_SWAR_rev8(uint32_t *ptr, size_t n) +{ + while (n--) { + *ptr = __builtin_bswap32(*ptr); + ++ptr; + BENCH_CLOBBER(); + } +} +#define REV8(f) f(SWAR_rev8) +#else +#define REV8(f) +#endif + + +#define IMPLS(f) \ + f(scalar) \ + f(scalar_autovec) \ + REV8(f) \ + MX(f, rvv_gather) \ + f(rvv_m1_gathers_m2) \ + f(rvv_m1_gathers_m4) \ + f(rvv_m1_gathers_m8) \ + +typedef void Func(uint32_t *ptr, size_t n); + +#define DECLARE(f) extern Func byteswap32_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &byteswap32_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +uint32_t *ptr; + +void init(void) { ptr = (uint32_t*)mem; } + +ux checksum(size_t n) { + ux sum = 0; + for (size_t i = 0; i < n; ++i) + sum = uhash(sum) + ptr[i]; + return sum; +} + +BENCH(base) { + memrand(ptr, n * sizeof *ptr); + TIME f(ptr, n); +} BENCH_END + +Bench benches[] = { + { MAX_MEM/4, "byteswap32", bench_base } +}; BENCH_MAIN(impls, benches) + diff --git a/tests/rvv_bench/chacha20/chacha20.S b/tests/rvv_bench/chacha20/chacha20.S new file mode 100644 index 000000000..9c62caeba --- /dev/null +++ b/tests/rvv_bench/chacha20/chacha20.S @@ -0,0 +1,5 @@ +#ifndef MX +#if __riscv_xlen >= 64 +#include "rvv-chacha-poly/vchacha.s" +#endif +#endif diff --git a/tests/rvv_bench/chacha20/chacha20.c b/tests/rvv_bench/chacha20/chacha20.c new 
file mode 100644 index 000000000..7d6328b54 --- /dev/null +++ b/tests/rvv_bench/chacha20/chacha20.c @@ -0,0 +1,61 @@ +#include "bench.h" +#if __riscv_xlen >= 64 +#include "../thirdparty/boring.h" + +uint8_t *dest, *src; +uint8_t key[32], nonce[12]; +uint32_t counter; + + +extern void vector_chacha20( + uint8_t *out, const uint8_t *in, + size_t in_len, const uint8_t key[32], + const uint8_t nonce[12], uint32_t counter); + +static void +chacha20_boring(void *restrict dest, void const *restrict src, size_t n) { + boring_chacha20(dest, src, n, key, nonce, counter); +} + +static void +chacha20_rvv(void *restrict dest, void const *restrict src, size_t n) { + vector_chacha20(dest, src, n, key, nonce, counter); +} + +typedef void *Func(void *restrict dest, void const *restrict src, size_t n); + +Impl impls[] = { + { "boring", &chacha20_boring }, + { "rvv", &chacha20_rvv }, +}; + +void init(void) { + memrand(key, sizeof key); + memrand(nonce, sizeof nonce); + counter = 0; +} + +ux checksum(size_t n) { + ux sum = 0; + for (size_t i = 0; i < n+16; ++i) + sum = uhash(sum) + mem[i]; + return sum; +} + +BENCH(aligned) { + memset(mem, 0, n+16); + TIME f(mem, mem + MAX_MEM/2 + 16, n); +} BENCH_END + +Bench benches[] = { + { MAX_MEM/2 - 16, "chacha20 aligned", bench_aligned } +}; BENCH_MAIN(impls, benches) + + +#include "../thirdparty/boring.c" +#else +void init(void) {} +Impl impls[] = {}; +Bench benches[] = {}; +BENCH_MAIN(impls, benches) +#endif diff --git a/tests/rvv_bench/default.nix b/tests/rvv_bench/default.nix new file mode 100644 index 000000000..e5376b838 --- /dev/null +++ b/tests/rvv_bench/default.nix @@ -0,0 +1,32 @@ +{ lib +, linkerScript +, makeBuilder +, findAndBuild +, t1main +}: + +let + include = ./_include; + builder = makeBuilder { casePrefix = "rvv_bench"; }; + build = { caseName, sourcePath }: + builder { + inherit caseName; + + src = sourcePath; + + isFp = lib.pathExists (lib.path.append sourcePath "isFp"); + + buildPhase = '' + runHook preBuild + + $CC -E -DINC=$PWD/${caseName}.S -E ${include}/template.S -o functions.S + $CC -I${include} ${caseName}.c -T${linkerScript} ${t1main} functions.S -o $pname.elf + + runHook postBuild + ''; + + meta.description = "test case '${caseName}', written in C intrinsic"; + }; +in + findAndBuild ./. 
build + diff --git a/tests/rvv_bench/mandelbrot/isFp b/tests/rvv_bench/mandelbrot/isFp new file mode 100644 index 000000000..e69de29bb diff --git a/tests/rvv_bench/mandelbrot/mandelbrot.S b/tests/rvv_bench/mandelbrot/mandelbrot.S new file mode 100644 index 000000000..55224666a --- /dev/null +++ b/tests/rvv_bench/mandelbrot/mandelbrot.S @@ -0,0 +1,358 @@ +#if 0 + +void +mandelbrot_rvv(size_t width, size_t maxIter, uint32_t *res) +{ + vfloat32m2_t cx, cy, zx, zy, zx2, zy2; + vuint32m2_t viter; + vbool16_t mask; + + for (size_t y = 0; y < width; ++y) { + size_t vl, x = width; + while (x > 0) { + x -= vl = __riscv_vsetvl_e32m2(x); + + mask = __riscv_vmset_m_b16(vl); + viter = __riscv_vmv_v_x_u32m2(0, vl); + + cx = __riscv_vfcvt_f_xu_v_f32m2(__riscv_vadd_vx_u32m2(__riscv_viota_m_u32m2(mask, vl), x, vl), vl); + cy = __riscv_vfmv_v_f_f32m2(y, vl); + + cx = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vf_f32m2(cx, 2.0f / width, vl), -1.5f, vl); + cy = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vf_f32m2(cy, 2.0f / width, vl), -1, vl); + + zx = zy = zx2 = zy2 = __riscv_vfmv_v_f_f32m2(0, vl); + + size_t iter = 0; + while (iter < maxIter && __riscv_vfirst_m_b16(mask, vl) >= 0) { + mask = __riscv_vmflt_vf_f32m2_b16(__riscv_vfadd_vv_f32m2(zx2, zy2, vl), 4, vl); + zx2 = __riscv_vfadd_vv_f32m2(__riscv_vfsub_vv_f32m2(zx2, zy2, vl), cx, vl); + zy = __riscv_vfmacc_vv_f32m2(cy, __riscv_vfadd_vv_f32m2(zx, zx, vl), zy, vl); + zx = zx2; + zx2 = __riscv_vfmul_vv_f32m2(zx, zx, vl); + zy2 = __riscv_vfmul_vv_f32m2(zy, zy, vl); + ++iter; + viter = __riscv_vmerge_vxm_u32m2(viter, iter, mask, vl); + } + __riscv_vse32_v_u32m2(res + x, viter, vl); + } + res += width; + } +} + +#endif + +#if MX_N > 0 && MX_N <= 2 + +#if HAS_F16 +.global MX(mandelbrot_rvv_f16_) # generated by clang +MX(rvv_f16_m1p5): + .half 0xbe00 # half -1.5 +MX(rvv_f16_m1): + .half 0xbc00 # half -1 +MX(rvv_f16_p4): + .half 0x4400 # half 4 +MX(mandelbrot_rvv_f16_): + beqz a0, MX(rvv_f16_13) + beqz a1, MX(rvv_f16_9) + li a7, 0 + fcvt.s.wu fa2, a0 + lui a3, 262144 + fmv.w.x fa1, a3 + la a3, MX(rvv_f16_m1p5) + flh fa5, (a3) + la a3, MX(rvv_f16_m1) + flh fa4, (a3) + la a3, MX(rvv_f16_p4) + flh fa3, (a3) + fdiv.s fa2, fa1, fa2 + fcvt.h.s fa2, fa2 + slli a6, a0, 2 + j MX(rvv_f16_4) +MX(rvv_f16_3): + addi a7, a7, 1 + add a2, a2, a6 + beq a7, a0, MX(rvv_f16_13) +MX(rvv_f16_4): + fcvt.s.wu fa1, a7 + fcvt.h.s fa1, fa1 + mv t0, a0 + j MX(rvv_f16_6) +MX(rvv_f16_5): + slli a3, t0, 2 + add a3, a3, a2 + vsetvli zero, zero, e32, MX2(), ta, ma + vse32.v v8, (a3) + beqz t0, MX(rvv_f16_3) +MX(rvv_f16_6): + vsetvli a3, t0, e32, MX2(), ta, ma + sub t0, t0, a3 + vmset.m v0 + vmv.v.i v8, 0 + vsetvli zero, zero, e16, MX(), ta, ma + viota.m v12, v0 + vadd.vx v12, v12, t0 + vfcvt.f.xu.v v12, v12 + vfmv.v.f v14, fa1 + vfmul.vf v12, v12, fa2 + vfadd.vf v12, v12, fa5 + vfmul.vf v14, v14, fa2 + vfadd.vf v14, v14, fa4 + vmv.v.i v20, 0 + li a4, 1 + mv a3, a1 + vmv.v.i v16, 0 + vmv.v.i v18, 0 + vmv.v.i v22, 0 +MX(rvv_f16_7): +#if HAS_RVV_1_0 || MX_N >= 2 + vsetvli zero, zero, e8, MXf2(), ta, ma +#else + vsetvli zero, zero, e8, m1, ta, ma +#endif + vfirst.m a5, v0 + bltz a5, MX(rvv_f16_5) + vsetvli zero, zero, e16, MX(), ta, ma + vfadd.vv v24, v18, v22 + vmflt.vf v0, v24, fa3 + vfsub.vv v18, v18, v22 + vfadd.vv v20, v20, v20 + vfadd.vv v24, v18, v12 + vfmadd.vv v16, v20, v14 + vfmul.vv v18, v24, v24 + vfmul.vv v22, v16, v16 + vsetvli zero, zero, e32, MX2(), ta, ma + vmerge.vxm v8, v8, a4, v0 + addi a3, a3, -1 + addi a4, a4, 1 +#if HAS_RVV_1_0 + vmv2r.v v20, v24 +#else + vsetvli zero, zero, e32, m2 + 
vmv.v.v v20, v24 +#endif + bnez a3, MX(rvv_f16_7) + j MX(rvv_f16_5) +MX(rvv_f16_9): + slli a3, a0, 2 +MX(rvv_f16_10): + mv a4, a0 +MX(rvv_f16_11): + vsetvli a5, a4, e32, MX2(), ta, ma + sub a4, a4, a5 + vmv.v.i v8, 0 + slli a5, a4, 2 + add a5, a5, a2 + vse32.v v8, (a5) + bnez a4, MX(rvv_f16_11) + addi a1, a1, 1 + add a2, a2, a3 + bne a1, a0, MX(rvv_f16_10) +MX(rvv_f16_13): + ret +#endif + + +.global MX(mandelbrot_rvv_f32_) # generated by clang +MX(mandelbrot_rvv_f32_): + beqz a0, MX(rvv_f32_13) + beqz a1, MX(rvv_f32_9) + li a7, 0 + fcvt.s.wu fa5, a0 + lui a3, 262144 + fmv.w.x fa4, a3 + fdiv.s fa5, fa4, fa5 + lui a3, 785408 + fmv.w.x fa4, a3 + lui a3, 784384 + fmv.w.x fa3, a3 + lui a3, 264192 + fmv.w.x fa2, a3 + slli a6, a0, 2 + j MX(rvv_f32_4) +MX(rvv_f32_3): + addi a7, a7, 1 + add a2, a2, a6 + beq a7, a0, MX(rvv_f32_13) +MX(rvv_f32_4): + fcvt.s.wu fa1, a7 + mv t0, a0 + j MX(rvv_f32_6) +MX(rvv_f32_5): + slli a3, t0, 2 + add a3, a3, a2 + vsetvli zero, zero, e32, MX(), ta, ma + vse32.v v8, (a3) + beqz t0, MX(rvv_f32_3) +MX(rvv_f32_6): + vsetvli t1, t0, e32, MX(), ta, ma + sub t0, t0, t1 + vmset.m v0 + vmv.v.i v8, 0 + viota.m v10, v0 + vadd.vx v10, v10, t0 + vfcvt.f.xu.v v10, v10 + vfmv.v.f v12, fa1 + vfmul.vf v10, v10, fa5 + vfadd.vf v10, v10, fa4 + vfmul.vf v12, v12, fa5 + vfadd.vf v12, v12, fa3 + vmv.v.i v18, 0 + li a3, 1 + mv a5, a1 + vmv.v.i v14, 0 + vmv.v.i v16, 0 + vmv.v.i v20, 0 +MX(rvv_f32_7): +#if HAS_RVV_1_0 + vsetvli zero, t1, e8, MXf4(), ta, ma +#else + vsetvli zero, t1, e8, m1, ta, ma +#endif + vfirst.m a4, v0 + bltz a4, MX(rvv_f32_5) + vsetvli zero, zero, e32, MX(), ta, ma + vfadd.vv v22, v16, v20 + vmflt.vf v0, v22, fa2 + vfsub.vv v16, v16, v20 + vfadd.vv v18, v18, v18 + vfadd.vv v22, v16, v10 + vfmadd.vv v14, v18, v12 + vfmul.vv v16, v22, v22 + vfmul.vv v20, v14, v14 + vmerge.vxm v8, v8, a3, v0 + addi a5, a5, -1 + addi a3, a3, 1 + vmv.v.v v18, v22 + bnez a5, MX(rvv_f32_7) + j MX(rvv_f32_5) +MX(rvv_f32_9): + slli a3, a0, 2 +MX(rvv_f32_10): + mv a4, a0 +MX(rvv_f32_11): + vsetvli a5, a4, e32, MX(), ta, ma + sub a4, a4, a5 + vmv.v.i v8, 0 + slli a5, a4, 2 + add a5, a5, a2 + vse32.v v8, (a5) + bnez a4, MX(rvv_f32_11) + addi a1, a1, 1 + add a2, a2, a3 + bne a1, a0, MX(rvv_f32_10) +MX(rvv_f32_13): + ret + +#endif + +#if MX_N == 2 && HAS_E64 + +.global MX(mandelbrot_rvv_f64_) # generated by clang +MX(rvv_f64_m1p5): + .quad 0xbff8000000000000 # double -1.5 +MX(rvv_f64_m1): + .quad 0xbff0000000000000 # double -1 +MX(rvv_f64_p4): + .quad 0x4010000000000000 # double 4 +MX(mandelbrot_rvv_f64_): + beqz a0, MX(rvv_f64_13) + beqz a1, MX(rvv_f64_9) + li a7, 0 + fcvt.s.wu fa2, a0 + lui a3, 262144 + fmv.w.x fa1, a3 + la a3, MX(rvv_f64_m1p5) + fld fa5, (a3) + la a3, MX(rvv_f64_m1) + fld fa4, (a3) + la a3, MX(rvv_f64_p4) + fld fa3, (a3) + fdiv.s fa2, fa1, fa2 + fcvt.d.s fa2, fa2 + slli a6, a0, 2 + j MX(rvv_f64_4) +MX(rvv_f64_3): + addi a7, a7, 1 + add a2, a2, a6 + beq a7, a0, MX(rvv_f64_13) +MX(rvv_f64_4): + fcvt.d.wu fa1, a7 + mv t0, a0 + j MX(rvv_f64_6) +MX(rvv_f64_5): + slli a3, t0, 2 + add a3, a3, a2 + vsetvli zero, zero, e32, m1, ta, ma + vse32.v v8, (a3) + beqz t0, MX(rvv_f64_3) +MX(rvv_f64_6): + vsetvli a3, t0, e32, m1, ta, ma + sub t0, t0, a3 + vmset.m v0 + vmv.v.i v8, 0 + vsetvli zero, zero, e64, m2, ta, ma + viota.m v10, v0 + vadd.vx v10, v10, t0 + vfcvt.f.xu.v v10, v10 + vfmv.v.f v12, fa1 + vfmul.vf v10, v10, fa2 + vfadd.vf v10, v10, fa5 + vfmul.vf v12, v12, fa2 + vfadd.vf v12, v12, fa4 + vmv.v.i v18, 0 + li a4, 1 + mv a3, a1 + vmv.v.i v14, 0 + vmv.v.i v16, 0 + vmv.v.i v20, 0 
+MX(rvv_f64_7): +#if HAS_RVV_1_0 + vsetvli zero, zero, e8, MXf8(), ta, ma +#else + vsetvli zero, t1, e8, m1, ta, ma +#endif + vfirst.m a5, v0 + bltz a5, MX(rvv_f64_5) + vsetvli zero, zero, e64, m2, ta, ma + vfadd.vv v22, v16, v20 + vmflt.vf v0, v22, fa3 + vfsub.vv v16, v16, v20 + vfadd.vv v18, v18, v18 + vfadd.vv v22, v16, v10 + vfmadd.vv v14, v18, v12 + vfmul.vv v16, v22, v22 + vfmul.vv v20, v14, v14 + vsetvli zero, zero, e32, m1, ta, ma + vmerge.vxm v8, v8, a4, v0 + addi a3, a3, -1 + addi a4, a4, 1 +#if HAS_RVV_1_0 + vmv2r.v v18, v22 +#else + vsetvli zero, zero, e32, m2 + vmv.v.v v18, v22 +#endif + bnez a3, MX(rvv_f64_7) + j MX(rvv_f64_5) +MX(rvv_f64_9): + slli a3, a0, 2 +MX(rvv_f64_10): + mv a4, a0 +MX(rvv_f64_11): + vsetvli a5, a4, e32, m1, ta, ma + sub a4, a4, a5 + vmv.v.i v8, 0 + slli a5, a4, 2 + add a5, a5, a2 + vse32.v v8, (a5) + bnez a4, MX(rvv_f64_11) + addi a1, a1, 1 + add a2, a2, a3 + bne a1, a0, MX(rvv_f64_10) +MX(rvv_f64_13): + ret + +#endif + + diff --git a/tests/rvv_bench/mandelbrot/mandelbrot.c b/tests/rvv_bench/mandelbrot/mandelbrot.c new file mode 100644 index 000000000..f182eba0f --- /dev/null +++ b/tests/rvv_bench/mandelbrot/mandelbrot.c @@ -0,0 +1,94 @@ +#include "bench.h" + +void +mandelbrot_scalar_f32(size_t width, size_t maxIter, uint32_t *res) +{ + for (size_t y = 0; y < width; ++y) + for (size_t x = 0; x < width; ++x) { + float cx = x * 2.0f / width - 1.5; + float cy = y * 2.0f / width - 1; + size_t iter = 0; + float zx = 0, zy = 0, zxS = 0, zyS = 0; + + BENCH_VOLATILE_REG(cy); + while (zxS + zyS <= 4 && iter < maxIter) { + zxS = zxS - zyS + cx; + zy = 2 * zx * zy + cy; + zx = zxS; + BENCH_VOLATILE_REG(zx); + zxS = zx*zx; + zyS = zy*zy; + ++iter; + BENCH_CLOBBER(); + } + *res++ = iter; + } +} + +#if __riscv_xlen >= 64 +void +mandelbrot_scalar_f64(size_t width, size_t maxIter, uint32_t *res) +{ + for (size_t y = 0; y < width; ++y) + for (size_t x = 0; x < width; ++x) { + double cx = x * 2.0 / width - 1.5; + double cy = y * 2.0 / width - 1; + size_t iter = 0; + double zx = 0, zy = 0, zxS = 0, zyS = 0; + + BENCH_VOLATILE_REG(cy); + while (zxS + zyS <= 4 && iter < maxIter) { + zxS = zxS - zyS + cx; + zy = 2 * zx * zy + cy; + zx = zxS; + BENCH_VOLATILE_REG(zx); + zxS = zx*zx; + zyS = zy*zy; + ++iter; + } + *res++ = iter; + } +} +#endif + +#if HAS_F16 +# define IMPLS_F16(f) f(rvv_f16_m1) f(rvv_f16_m2) +#else +# define IMPLS_F16(f) +#endif + +#define IMPLS(f) \ + f(rvv_f32_m1) \ + f(scalar_f32) \ + IF64(f(scalar_f64)) \ + IMPLS_F16(f) \ + f(rvv_f32_m2) \ + IF64(f(rvv_f64_m2)) \ + +typedef void Func(size_t width, size_t maxIter, uint32_t *res); + +#define DECLARE(f) extern Func mandelbrot_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &mandelbrot_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +uint32_t *dest; +void init(void) { memset(mem, 0, MAX_MEM); dest = (uint32_t*)mem; } + +/* disabled, because of rounding errors, please independently verify */ +ux checksum(size_t n) { return 0; } + +BENCH(base) { + n = usqrt(n); + TIME f(n, mandelbrot_ITER, dest); +} BENCH_END + +Bench benches[] = { + { + SCALE_mandelbrot(MAX_MEM / 4), + "mandelbrot "STR(mandelbrot_ITER), + bench_base + }, +}; BENCH_MAIN(impls, benches) + diff --git a/tests/rvv_bench/memcpy/memcpy.S b/tests/rvv_bench/memcpy/memcpy.S new file mode 100644 index 000000000..6511a0493 --- /dev/null +++ b/tests/rvv_bench/memcpy/memcpy.S @@ -0,0 +1,153 @@ +#if 0 +void *memcpy_rvv(void *restrict dest, void const *restrict src, size_t n) { + unsigned char *d = dest; + unsigned char const *s = src; + for (size_t vl; n > 0; 
n -= vl, s += vl, d += vl) { + vl = __riscv_vsetvl_e8m8(n); + vuint8m8_t vec_src = __riscv_vle8_v_u8m8(s, vl); + __riscv_vse8_v_u8m8(d, vec_src, vl); + } + return dest; +} +#endif + + +#ifdef MX + +# a0 = dest, a1 = src, a2 = len +.global MX(memcpy_rvv_) +MX(memcpy_rvv_): + mv a3, a0 +1: + vsetvli t0, a2, e8, MX(), ta, ma + vle8.v v0, (a1) + add a1, a1, t0 + sub a2, a2, t0 + vse8.v v0, (a3) + add a3, a3, t0 + bnez a2, 1b + ret + +.global MX(memcpy_rvv_align_dest_) +MX(memcpy_rvv_align_dest_): + mv a3, a0 +#if HAS_RVV_1_0 + csrr t0, vlenb +#else + vsetvli t0, zero, e8, m1, ta, ma # vlenb +#endif + bltu a2, t0, 2f # len < vlenb + # align dest to vlenb + sub t1, zero, a0 + addi t2, t0, -1 + and t1, t1, t2 #align = (-dest) & (vlenb-1) + vsetvli t0, t1, e8, MX(), ta, ma +1: + vle8.v v0, (a1) + add a1, a1, t0 + sub a2, a2, t0 + vse8.v v0, (a3) + add a3, a3, t0 +2: + vsetvli t0, a2, e8, MX(), ta, ma + bnez a2, 1b + ret + +.global MX(memcpy_rvv_align_src_) +MX(memcpy_rvv_align_src_): + mv a3, a0 +#if HAS_RVV_1_0 + csrr t0, vlenb +#else + vsetvli t0, zero, e8, m1, ta, ma # vlen +#endif + bltu a2, t0, 2f # len < vlen + # align src to vlen + sub t1, zero, a1 + addi t2, t0, -1 + and t1, t1, t2 # align = (-src) & (vlen-1) + vsetvli t0, t1, e8, MX(), ta, ma +1: + vle8.v v0, (a1) + add a1, a1, t0 + sub a2, a2, t0 + vse8.v v0, (a3) + add a3, a3, t0 +2: + vsetvli t0, a2, e8, MX(), ta, ma + bnez a2, 1b + ret + +# combination of memcpy_rvv_align_dest and memcpy_rvv +.global MX(memcpy_rvv_align_dest_hybrid_) +MX(memcpy_rvv_align_dest_hybrid_): + mv a3, a0 +#if HAS_RVV_1_0 + csrr t0, vlenb +#else + vsetvli t0, zero, e8, m1, ta, ma # vlen +#endif + slli t1, t0, 8 # skip costly division for more values + bltu a2, t1, 2f # len < vlen + sub t1, zero, a0 + addi t2, t0, -1 + and t1, t1, t2 # align = (-dest) & (vlen-1) + vsetvli t0, t1, e8, MX(), ta, ma # align dest to vlen +1: + vle8.v v0, (a1) + add a1, a1, t0 + sub a2, a2, t0 + vse8.v v0, (a3) + add a3, a3, t0 +2: + vsetvli t0, a2, e8, MX(), ta, ma + bnez a2, 1b + ret + + +.global MX(memcpy_rvv_tail_) +MX(memcpy_rvv_tail_): + vsetvli t0, a2, e8, MX(), ta, ma + remu a3, a2, t0 # tail = n % vlenb + sub a2, a2, a3 # n -= tail + add a4, a0, a2 # end = dest + n + mv a2, a0 # n = dest +1: + vle8.v v8, (a1) + add a1, a1, t0 # src += vlenb + vse8.v v8, (a2) + add a2, a2, t0 # dest += vlenb + bltu a2, a4, 1b # dest < end + # copy tail + vsetvli zero, a3, e8, MX(), ta, ma + vle8.v v8, (a1) + vse8.v v8, (a2) + ret + +# this is supposed to test how well the implementation handles +# operations with an vl smaller than VLMAX +.global MX(memcpy_rvv_128_) +MX(memcpy_rvv_128_): + li t0, 128/8 + bgt a2, t0, 1f + mv t0, a2 +1: + vsetvli t0, t0, e8, MX(), ta, ma + remu a3, a2, t0 # tail = n % vlenb + sub a2, a2, a3 # n -= tail + add a4, a0, a2 # end = dest + n + mv a2, a0 # n = dest +1: + vle8.v v8, (a1) + add a1, a1, t0 # src += vlenb + vse8.v v8, (a2) + add a2, a2, t0 # dest += vlenb + bltu a2, a4, 1b # dest < end + # copy tail + vsetvli zero, a3, e8, MX(), ta, ma + vle8.v v8, (a1) + vse8.v v8, (a2) + ret + +#endif + diff --git a/tests/rvv_bench/memcpy/memcpy.c b/tests/rvv_bench/memcpy/memcpy.c new file mode 100644 index 000000000..60a977c71 --- /dev/null +++ b/tests/rvv_bench/memcpy/memcpy.c @@ -0,0 +1,197 @@ +#include "bench.h" + +void * +memcpy_scalar(void *restrict dest, void const *restrict src, size_t n) +{ + unsigned char *d = dest; + unsigned char const *s = src; + while (n--) *d++ = *s++, BENCH_CLOBBER(); + return dest; +} + +void * +memcpy_scalar_autovec(void *restrict 
dest, void const *restrict src, size_t n) +{ + unsigned char *d = dest; + unsigned char const *s = src; + while (n--) *d++ = *s++; + return dest; +} + +/* https://git.musl-libc.org/cgit/musl/tree/src/string/memcpy.c */ +void * +memcpy_musl(void *restrict dest, void const *restrict src, size_t n) +{ + unsigned char *d = dest; + unsigned char const *s = src; + +#ifdef __GNUC__ + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define LS >> +#define RS << +#else +#define LS << +#define RS >> +#endif + + typedef uint32_t __attribute__((__may_alias__)) u32; + uint32_t w, x; + + for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++; + + if ((uintptr_t)d % 4 == 0) { + for (; n>=16; s+=16, d+=16, n-=16) { + *(u32 *)(d+0) = *(u32 *)(s+0); + *(u32 *)(d+4) = *(u32 *)(s+4); + *(u32 *)(d+8) = *(u32 *)(s+8); + *(u32 *)(d+12) = *(u32 *)(s+12); + } + if (n&8) { + *(u32 *)(d+0) = *(u32 *)(s+0); + *(u32 *)(d+4) = *(u32 *)(s+4); + d += 8; s += 8; + } + if (n&4) { + *(u32 *)(d+0) = *(u32 *)(s+0); + d += 4; s += 4; + } + if (n&2) { + *d++ = *s++; *d++ = *s++; + } + if (n&1) { + *d = *s; + } + return dest; + } + + if (n >= 32) switch ((uintptr_t)d % 4) { + case 1: + w = *(u32 *)s; + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + n -= 3; + for (; n>=17; s+=16, d+=16, n-=16) { + x = *(u32 *)(s+1); + *(u32 *)(d+0) = (w LS 24) | (x RS 8); + w = *(u32 *)(s+5); + *(u32 *)(d+4) = (x LS 24) | (w RS 8); + x = *(u32 *)(s+9); + *(u32 *)(d+8) = (w LS 24) | (x RS 8); + w = *(u32 *)(s+13); + *(u32 *)(d+12) = (x LS 24) | (w RS 8); + } + break; + case 2: + w = *(u32 *)s; + *d++ = *s++; + *d++ = *s++; + n -= 2; + for (; n>=18; s+=16, d+=16, n-=16) { + x = *(u32 *)(s+2); + *(u32 *)(d+0) = (w LS 16) | (x RS 16); + w = *(u32 *)(s+6); + *(u32 *)(d+4) = (x LS 16) | (w RS 16); + x = *(u32 *)(s+10); + *(u32 *)(d+8) = (w LS 16) | (x RS 16); + w = *(u32 *)(s+14); + *(u32 *)(d+12) = (x LS 16) | (w RS 16); + } + break; + case 3: + w = *(u32 *)s; + *d++ = *s++; + n -= 1; + for (; n>=19; s+=16, d+=16, n-=16) { + x = *(u32 *)(s+3); + *(u32 *)(d+0) = (w LS 8) | (x RS 24); + w = *(u32 *)(s+7); + *(u32 *)(d+4) = (x LS 8) | (w RS 24); + x = *(u32 *)(s+11); + *(u32 *)(d+8) = (w LS 8) | (x RS 24); + w = *(u32 *)(s+15); + *(u32 *)(d+12) = (x LS 8) | (w RS 24); + } + break; + } + if (n&16) { + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + } + if (n&8) { + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + } + if (n&4) { + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + } + if (n&2) { + *d++ = *s++; *d++ = *s++; + } + if (n&1) { + *d = *s; + } + return dest; +#endif + + while (n--) { *d++ = *s++; BENCH_CLOBBER(); } + return dest; +} + +#define memcpy_libc memcpy + +#define IMPLS(f) \ + IFHOSTED(f(libc)) \ + f(musl) \ + f(scalar) \ + f(scalar_autovec) \ + MX(f, rvv) \ + MX(f, rvv_align_dest) \ + MX(f, rvv_align_src) \ + MX(f, rvv_align_dest_hybrid) \ + MX(f, rvv_tail) \ + MX(f, rvv_128) \ + +typedef void *Func(void *restrict dest, void const *restrict src, size_t n); + +#define DECLARE(f) extern Func memcpy_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &memcpy_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +uint8_t *dest, *src; +ux last; + +void init(void) { } + +ux checksum(size_t n) { + ux sum = last; + for (size_t i = 0; i < n+9; ++i) + sum = uhash(sum) + dest[i]; + return sum; +} + +void common(size_t n, size_t dOff, size_t sOff) { + 
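+	/* dest and src share the benchmark buffer: src sits MAX_MEM/2 + 9
+	 * bytes past dest (plus sOff), and the n+9 bytes that checksum()
+	 * later reads back are zeroed before each run. */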
dest = mem + dOff; src = dest + MAX_MEM/2 + sOff + 9; + memset(dest, 0, n+9); +} + +BENCH(base) { + common(n, urand() & 255, urand() & 255); + TIME last = (uintptr_t)f(dest, src, n); +} BENCH_END + +BENCH(aligned) { + common(n, 0, 0); + TIME last = (uintptr_t)f(dest, src, n); +} BENCH_END + +Bench benches[] = { + { MAX_MEM/2 - 521, "memcpy", bench_base }, + { MAX_MEM/2 - 521, "memcpy aligned", bench_aligned} +}; BENCH_MAIN(impls, benches) + diff --git a/tests/rvv_bench/memset/memset.S b/tests/rvv_bench/memset/memset.S new file mode 100644 index 000000000..3d00eae62 --- /dev/null +++ b/tests/rvv_bench/memset/memset.S @@ -0,0 +1,96 @@ +#if 0 +void *memset(void *dst, int n, size_t len) { + unsigned char *d = dst; + vuint8m8_t v = __riscv_vmv_v_x_u8m8((uint8_t)n, __riscv_vsetvlmax_e8m8()); + for (size_t vl; len > 0; len -= vl, d += vl) { + vl = __riscv_vsetvl_e8m8(len); + __riscv_vse8_v_u8m8(d, v, vl); + } + return dst; +} +#endif + +#ifdef MX + +.global MX(memset_rvv_) +MX(memset_rvv_): + vsetvli a3, zero, e8, MX(), ta, ma + vmv.v.x v8, a1 + mv a1, a0 +1: + vsetvli a3, a2, e8, MX(), ta, ma + vse8.v v8, (a1) + sub a2, a2, a3 + add a1, a1, a3 + bnez a2, 1b + ret + + +.global MX(memset_rvv_align_) +MX(memset_rvv_align_): + vsetvli t0, zero, e8, m1, ta, ma # vlen + vmv.v.x v8, a1 + mv a1, a0 + vsetvli t0, zero, e8, MX(), ta, ma # vlen + bltu a2, t0, 2f # len < vlen + # align dest to vlen + sub t1, zero, a0 + remu t1, t1, t0 # align = (-dest) % vlen + vsetvli t0, t1, e8, MX(), ta, ma +1: + vse8.v v8, (a1) + sub a2, a2, t0 + add a1, a1, t0 +2: + vsetvli t0, a2, e8, MX(), ta, ma + bnez a2, 1b + ret + +.global MX(memset_rvv_tail_) +MX(memset_rvv_tail_): + vsetvli t0, a2, e8, MX(), ta, ma + vmv.v.x v8, a1 + remu a3, a2, t0 # tail = n % vlenb + sub a2, a2, a3 # n -= tail + add a4, a0, a2 # end = dest + n + mv a2, a0 # n = dest +1: + vse8.v v8, (a2) + add a2, a2, t0 # dest += vlenb + bltu a2, a4, 1b # dest < end + # handle tail + vsetvli zero, a3, e8, MX(), ta, ma + vse8.v v8, (a2) + ret + +.global MX(memset_rvv_tail_4x_) +MX(memset_rvv_tail_4x_): + vsetvli t0, a2, e8, MX(), ta, ma + vmv.v.x v8, a1 + slli t1, t0, 2 + mv a5, a0 + mv a3, a2 + bltu a2, t1, 2f + remu a3, a2, t1 # tail = n % (vlenb*4) + sub a2, a2, a3 # n -= tail + add a4, a0, a2 # end = dest + n +1: + vse8.v v8, (a5) + add a5, a5, t0 # dest += vlenb + vse8.v v8, (a5) + add a5, a5, t0 # dest += vlenb + vse8.v v8, (a5) + add a5, a5, t0 # dest += vlenb + vse8.v v8, (a5) + add a5, a5, t0 # dest += vlenb + bltu a5, a4, 1b # dest < end + # handle tail +2: + vsetvli a4, a3, e8, MX(), ta, ma + vse8.v v8, (a5) + sub a3, a3, a4 + add a5, a5, a4 + bnez a3, 2b + ret + +#endif diff --git a/tests/rvv_bench/memset/memset.c b/tests/rvv_bench/memset/memset.c new file mode 100644 index 000000000..9b2f7c463 --- /dev/null +++ b/tests/rvv_bench/memset/memset.c @@ -0,0 +1,163 @@ +#include "bench.h" + +void * +memset_scalar(void *dest, int c, size_t n) +{ + unsigned char *d = dest; + while (n--) *d++ = c, BENCH_CLOBBER(); + return dest; +} + +void * +memset_scalar_autovec(void *dest, int c, size_t n) +{ + unsigned char *d = dest; + while (n--) *d++ = c; + return dest; +} + +/* https://git.musl-libc.org/cgit/musl/tree/src/string/memset.c */ +#if __riscv_xlen >= 64 +void * +memset_musl(void *dest, int c, size_t n) +{ + unsigned char *s = dest; + size_t k; + + /* Fill head and tail with minimal branching. Each + * conditional ensures that all the subsequently used + * offsets are well-defined and in the dest region. 
*/ + + if (!n) return dest; + s[0] = c; + s[n-1] = c; + if (n <= 2) return dest; + s[1] = c; + s[2] = c; + s[n-2] = c; + s[n-3] = c; + if (n <= 6) return dest; + s[3] = c; + s[n-4] = c; + if (n <= 8) return dest; + + /* Advance pointer to align it at a 4-byte boundary, + * and truncate n to a multiple of 4. The previous code + * already took care of any head/tail that get cut off + * by the alignment. */ + + k = -(uintptr_t)s & 3; + s += k; + n -= k; + n &= -4; + +#ifdef __GNUC__ + typedef uint32_t __attribute__((__may_alias__)) u32; + typedef uint64_t __attribute__((__may_alias__)) u64; + + u32 c32 = ((u32)-1)/255 * (unsigned char)c; + + /* In preparation to copy 32 bytes at a time, aligned on + * an 8-byte bounary, fill head/tail up to 28 bytes each. + * As in the initial byte-based head/tail fill, each + * conditional below ensures that the subsequent offsets + * are valid (e.g. !(n<=24) implies n>=28). */ + + *(u32 *)(s+0) = c32; + *(u32 *)(s+n-4) = c32; + if (n <= 8) return dest; + *(u32 *)(s+4) = c32; + *(u32 *)(s+8) = c32; + *(u32 *)(s+n-12) = c32; + *(u32 *)(s+n-8) = c32; + if (n <= 24) return dest; + *(u32 *)(s+12) = c32; + *(u32 *)(s+16) = c32; + *(u32 *)(s+20) = c32; + *(u32 *)(s+24) = c32; + *(u32 *)(s+n-28) = c32; + *(u32 *)(s+n-24) = c32; + *(u32 *)(s+n-20) = c32; + *(u32 *)(s+n-16) = c32; + + /* Align to a multiple of 8 so we can fill 64 bits at a time, + * and avoid writing the same bytes twice as much as is + * practical without introducing additional branching. */ + + k = 24 + ((uintptr_t)s & 4); + s += k; + n -= k; + + /* If this loop is reached, 28 tail bytes have already been + * filled, so any remainder when n drops below 32 can be + * safely ignored. */ + + u64 c64 = c32 | ((u64)c32 << 32); + for (; n >= 32; n-=32, s+=32) { + *(u64 *)(s+0) = c64; + *(u64 *)(s+8) = c64; + *(u64 *)(s+16) = c64; + *(u64 *)(s+24) = c64; + } +#else + /* Pure C fallback with no aliasing violations. */ + while (n--) *s++ = c; +#endif + + return dest; +} +#endif + +#define memset_libc memset + +#define IMPLS(f) \ + IFHOSTED(f(libc)) \ + IF64(f(musl)) \ + f(scalar) \ + f(scalar_autovec) \ + MX(f, rvv) \ + MX(f, rvv_align) \ + MX(f, rvv_tail) \ + MX(f, rvv_tail_4x) \ + +typedef void *Func(void *dest, int c, size_t n); + +#define DECLARE(f) extern Func memset_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &memset_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +uint8_t *dest; +ux last; +char c; + +void init(void) { c = urand(); } + +ux checksum(size_t n) { + ux sum = last; + for (size_t i = 0; i < n+9; ++i) + sum = uhash(sum) + dest[i]; + return sum; +} + +void common(size_t n, size_t off) { + dest = mem + off; + memset(dest, c+3, n+9); +} + +BENCH(base) { + common(n, urand() & 511); + TIME last = (uintptr_t)f(dest, c, n); +} BENCH_END + +BENCH(aligned) { + common(n, 0); + TIME last = (uintptr_t)f(dest, c, n); +} BENCH_END + +Bench benches[] = { + { MAX_MEM - 521, "memset", bench_base }, + { MAX_MEM - 521, "memset aligned", bench_aligned} +}; BENCH_MAIN(impls, benches) + diff --git a/tests/rvv_bench/mergelines/mergelines.S b/tests/rvv_bench/mergelines/mergelines.S new file mode 100644 index 000000000..051a0d7de --- /dev/null +++ b/tests/rvv_bench/mergelines/mergelines.S @@ -0,0 +1,179 @@ +#if 0 +size_t +mergelines_rvv(char *str, size_t len) +{ + uint8_t *dest = (uint8_t*)str; + uint8_t *src = (uint8_t*)str; + char last = 0; + + vuint8m8_t v, u, d; + vbool1_t m; + + for (size_t vl, VL; len > 1; ) { + VL = vl = __riscv_vsetvl_e8m8(len); + + char next = len > vl ? 
src[vl] : 0; + v = __riscv_vle8_v_u8m8(src, vl); + u = __riscv_vslide1up_vx_u8m8(v, last, vl); + d = __riscv_vslide1down_vx_u8m8(v, next, vl); + + m = __riscv_vmor_mm_b1(__riscv_vmsne_vx_u8m8_b1(u, '\\', vl), __riscv_vmsne_vx_u8m8_b1(v, '\n', vl), vl); + #if DO_SKIP + if (likely(__riscv_vcpop_m_b1(m, vl) == vl && next != '\n')) + goto skip; + #endif + m = __riscv_vmand_mm_b1( + m, + __riscv_vmor_mm_b1(__riscv_vmsne_vx_u8m8_b1(v, '\\', vl), __riscv_vmsne_vx_u8m8_b1(d, '\n', vl), vl), + vl); + + v = __riscv_vcompress_vm_u8m8(v, m, vl); + vl = __riscv_vcpop_m_b1(m, vl); + skip: + __riscv_vse8_v_u8m8(dest, v, vl); + dest += vl; src += VL; len -= VL; + last = src[-1]; + } + + if (len > 0 && !(last == '\\' && *src == '\n')) *dest++ = *src++; + return (dest - (uint8_t*)str); +} +#endif + +#ifdef MX + +.global MX(mergelines_rvv_) # generated by clang +MX(mergelines_rvv_): + li a2, 2 + bltu a1, a2, MX(rvv_6) + li t0, 0 + li a7, 92 + li a6, 1 + mv a2, a0 + mv a4, a0 + j MX(rvv_4) +MX(rvv_2): # in Loop: Header=BB0_4 Depth=1 + add a3, a4, a5 + lbu t1, 0(a3) +MX(rvv_3): # in Loop: Header=BB0_4 Depth=1 + vle8.v v8, (a4) + add a3, a4, a5 + vslide1up.vx v16, v8, t0 + vslide1down.vx v24, v8, t1 + vmsne.vx v0, v16, a7 + vmsne.vi v16, v8, 10 + vmor.mm v16, v0, v16 + vmsne.vx v17, v8, a7 + vmsne.vi v18, v24, 10 + vmor.mm v17, v17, v18 + vmand.mm v16, v16, v17 + vcompress.vm v24, v8, v16 + vcpop.m a4, v16 + vsetvli zero, a4, e8, MX(), ta, ma + vse8.v v24, (a2) + lbu t0, -1(a3) + sub a1, a1, a5 + add a2, a2, a4 + mv a4, a3 + bgeu a6, a1, MX(rvv_8) +MX(rvv_4): # =>This Inner Loop Header: Depth=1 + vsetvli a5, a1, e8, MX(), ta, ma + bltu a5, a1, MX(rvv_2) + li t1, 0 + j MX(rvv_3) +MX(rvv_6): + mv a2, a0 + beqz a1, MX(rvv_10) + lbu a1, 0(a0) + mv a2, a0 + j MX(rvv_11) +MX(rvv_8): + beqz a1, MX(rvv_10) + lbu a1, 0(a3) + xori a3, t0, 92 + xori a4, a1, 10 + or a3, a3, a4 + bnez a3, MX(rvv_11) +MX(rvv_10): + sub a0, a2, a0 + ret +MX(rvv_11): + addi a3, a2, 1 + sb a1, 0(a2) + sub a0, a3, a0 + ret + + +.global MX(mergelines_rvv_skip_) # generated by clang +MX(mergelines_rvv_skip_): + li a2, 2 + bltu a1, a2, MX(rvv_skip_9) + li a5, 0 + li a6, 92 + li a7, 1 + mv t1, a0 + mv a3, a0 +MX(rvv_skip_2): # =>This Inner Loop Header: Depth=1 + vsetvli a4, a1, e8, MX(), ta, ma + bgeu a4, a1, MX(rvv_skip_4) + add a2, a3, a4 + lbu t0, 0(a2) + j MX(rvv_skip_5) +MX(rvv_skip_4): # in Loop: Header=BB0_2 Depth=1 + li t0, 0 +MX(rvv_skip_5): # in Loop: Header=BB0_2 Depth=1 + vle8.v v8, (a3) + vslide1up.vx v16, v8, a5 + vmsne.vx v24, v16, a6 + vmsne.vi v16, v8, 10 + vmor.mm v16, v24, v16 + vcpop.m a2, v16 + xor a2, a2, a4 + seqz a2, a2 + addi a5, t0, -10 + snez a5, a5 + and a2, a2, a5 + beqz a2, MX(rvv_skip_8) + mv a2, a4 +MX(rvv_skip_7): # in Loop: Header=BB0_2 Depth=1 + add a3, a3, a4 + vsetvli zero, a2, e8, MX(), ta, ma + vse8.v v8, (t1) + lbu a5, -1(a3) + sub a1, a1, a4 + add t1, t1, a2 + bltu a7, a1, MX(rvv_skip_2) + j MX(rvv_skip_11) +MX(rvv_skip_8): # in Loop: Header=BB0_2 Depth=1 + vslide1down.vx v24, v8, t0 + vmsne.vx v17, v8, a6 + vmsne.vi v18, v24, 10 + vmor.mm v17, v17, v18 + vmand.mm v16, v16, v17 + vcompress.vm v24, v8, v16 + vcpop.m a2, v16 + vmv.v.v v8, v24 + j MX(rvv_skip_7) +MX(rvv_skip_9): + mv t1, a0 + beqz a1, MX(rvv_skip_13) + lbu a1, 0(a0) + mv t1, a0 + j MX(rvv_skip_14) +MX(rvv_skip_11): + beqz a1, MX(rvv_skip_13) + lbu a1, 0(a3) + xori a2, a5, 92 + xori a3, a1, 10 + or a2, a2, a3 + bnez a2, MX(rvv_skip_14) +MX(rvv_skip_13): + sub a0, t1, a0 + ret +MX(rvv_skip_14): + addi a2, t1, 1 + sb a1, 0(t1) + sub a0, a2, a0 + 
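+	# a0 = (dest+1) - str: merged length including the byte just stored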
ret + +#endif diff --git a/tests/rvv_bench/mergelines/mergelines.c b/tests/rvv_bench/mergelines/mergelines.c new file mode 100644 index 000000000..2d1d2078d --- /dev/null +++ b/tests/rvv_bench/mergelines/mergelines.c @@ -0,0 +1,75 @@ +#include "bench.h" + +size_t +mergelines_scalar(char *str, size_t len) +{ + char *dest = str; + char *src = str; + + while (len > 1) { + if (src[0] == '\\' && src[1] == '\n') + src += 2, len -= 2; + else + *dest++ = *src++, --len; + BENCH_CLOBBER(); + } + if (len > 0) + *dest++ = *src++; + return dest - str; +} + +#define IMPLS(f) \ + MX(f, rvv) \ + f(scalar) \ + MX(f, rvv_skip) \ + +typedef size_t Func(char *buf, size_t len); + +#define DECLARE(f) extern Func mergelines_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &mergelines_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +char *str; +ux last; + +void init(void) { } +ux checksum(size_t n) { return last; } + +void common(size_t n, char const *chars, size_t nChars) { + str = (char*)mem + (urand() & 255); + for (size_t i = 0; i < n; ++i) + str[i] = chars[urand() % nChars]; +} + +BENCH(2_3) { + common(n, "\\\na", 3); + TIME last = (uintptr_t)f(str, n); +} BENCH_END + +BENCH(2_16) { + common(n, "\\\nabcdefgh", 16); + TIME last = (uintptr_t)f(str, n); +} BENCH_END + +BENCH(2_32) { + common(n, "\\\nabcdefgh123456789", 32); + TIME last = (uintptr_t)f(str, n); +} BENCH_END + +BENCH(2_256) { + str = (char*)mem + (urand() & 255); + for (size_t i = 0; i < n; ++i) + str[i] = urand() & 0xff; + TIME last = (uintptr_t)f(str, n); +} BENCH_END + +#define COUNT SCALE_mergelines(MAX_MEM) - 256 +Bench benches[] = { + { COUNT, "mergelines 2/3", bench_2_3 }, + { COUNT, "mergelines 2/16", bench_2_16 }, + { COUNT, "mergelines 2/32", bench_2_32 }, + { COUNT, "mergelines 2/256", bench_2_256 } +}; BENCH_MAIN(impls, benches) + diff --git a/tests/rvv_bench/poly1305/poly1305.S b/tests/rvv_bench/poly1305/poly1305.S new file mode 100644 index 000000000..e5b332e02 --- /dev/null +++ b/tests/rvv_bench/poly1305/poly1305.S @@ -0,0 +1,5 @@ +#ifndef MX +#if __riscv_xlen >= 64 +#include "rvv-chacha-poly/vpoly.s" +#endif +#endif diff --git a/tests/rvv_bench/poly1305/poly1305.c b/tests/rvv_bench/poly1305/poly1305.c new file mode 100644 index 000000000..72849ac75 --- /dev/null +++ b/tests/rvv_bench/poly1305/poly1305.c @@ -0,0 +1,64 @@ +#include "bench.h" +#if __riscv_xlen >= 64 +#include "thirdparty/boring.h" + +uint8_t *src; +uint8_t key[32], sig[16]; + +extern uint64_t +vector_poly1305(const uint8_t* in, size_t len, + const uint8_t key[32], uint8_t sig[16]); + +static void +poly1305_boring(void const *src, size_t n) { + poly1305_state state; + boring_poly1305_init(&state, key); + boring_poly1305_update(&state, src, n); + boring_poly1305_finish(&state, sig); +} + +static void +poly1305_rvv(void const *src, size_t n) { + vector_poly1305(src, n, key, sig); +} + +typedef void *Func(void const *src, size_t n); + +Impl impls[] = { + { "boring", &poly1305_boring }, +#if HAS_E64 + { "rvv", &poly1305_rvv }, +#endif +}; + +void init(void) { + memrand(key, sizeof key); + memrand(sig, sizeof sig); +} + +ux checksum(size_t n) { + ux sum = 0; + for (size_t i = 0; i < ARR_LEN(sig); ++i) + sum = uhash(sum) + sig[i]; + return sum; +} + +BENCH(aligned) { + for (size_t i = 0; i < 256; ++i) + mem[urand()%n] = urand(); + n = (15+n) & -16; + TIME f(mem, n); +} BENCH_END + +Bench benches[] = { + { MAX_MEM, "poly1305 aligned", bench_aligned } +}; BENCH_MAIN(impls, benches) + + +#include "../thirdparty/boring.c" +#else +void init(void) {} +Impl impls[] = {}; +Bench 
benches[] = {}; +BENCH_MAIN(impls, benches) +#endif diff --git a/tests/rvv_bench/strlen/strlen.S b/tests/rvv_bench/strlen/strlen.S new file mode 100644 index 000000000..d639e5a80 --- /dev/null +++ b/tests/rvv_bench/strlen/strlen.S @@ -0,0 +1,91 @@ +#if 0 +size_t strlen_rvv(char *src) { + size_t vlmax = __riscv_vsetvlmax_e8m8(); + char *p = src; + long first = -1; + size_t vl; + while (first < 0) { + vuint8m8_t v = __riscv_vle8ff_v_u8m8((uint8_t*)p, &vl, vlmax); + first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); + p += vl; + } + p -= vl - first; + return (size_t)(p - src); +} + +#define PAGE_SIZE 4096 +size_t strlen_rvv_page_aligned_(char *src) { + char *p = src; + long first = 0; + + size_t n = 0 - ((uintptr_t)src | -4096); + size_t vl; + for (; n > 0; n -= vl) { + vl = __riscv_vsetvl_e8m8(n); + vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t*)p, vl); + first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); + p += vl; + if (first >= 0) { + goto end; + } + } + vl = __riscv_vsetvlmax_e8m8(); + do { + vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t*)p, vl); + first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); + p += vl; + } while (first < 0); +end: + p -= vl - first; + return (size_t)(p - src); +} +#endif + + +#ifdef MX + +.global MX(strlen_rvv_) +MX(strlen_rvv_): + mv a3, a0 +1: + vsetvli a1, x0, e8, MX(), ta, ma + vle8ff.v v8, (a3) + csrr a1, vl + vmseq.vi v0, v8, 0 + vfirst.m a2, v0 + add a3, a3, a1 # end += vl + bltz a2, 1b + add a0, a0, a1 # start += vl + add a3, a3, a2 # end += idx + sub a0, a3, a0 # start - end + ret + +.global MX(strlen_rvv_page_aligned_) # generated by clang +MX(strlen_rvv_page_aligned_): + lui a1, 1048575 + or a1, a1, a0 + neg a4, a1 + mv a1, a0 +1: + vsetvli a2, a4, e8, MX(), ta, ma + vle8.v v8, (a1) + vmseq.vi v16, v8, 0 + vfirst.m a3, v16 + add a1, a1, a2 + bgez a3, 1f + sub a4, a4, a2 + bnez a4, 1b + vsetvli a2, zero, e8, MX(), ta, ma +2: + vle8.v v8, (a1) + vmseq.vi v16, v8, 0 + vfirst.m a3, v16 + add a1, a1, a2 + bltz a3, 2b +1: + sub a1, a1, a2 + sub a0, a3, a0 + add a0, a0, a1 + ret + +#endif diff --git a/tests/rvv_bench/strlen/strlen.c b/tests/rvv_bench/strlen/strlen.c new file mode 100644 index 000000000..709e84b6f --- /dev/null +++ b/tests/rvv_bench/strlen/strlen.c @@ -0,0 +1,76 @@ +#include "bench.h" + +size_t +strlen_scalar(char const *s) +{ + char const *a = s; + while (*s) ++s, BENCH_CLOBBER(); + return s - a; +} + +size_t +strlen_scalar_autovec(char const *s) +{ + char const *a = s; + while (*s) ++s; + return s - a; +} + +/* https://git.musl-libc.org/cgit/musl/tree/src/string/strlen.c */ +#define ONES ((size_t)-1/UCHAR_MAX) +#define HIGHS (ONES * (UCHAR_MAX/2+1)) +#define HASZERO(x) (((x)-ONES) & ~(x) & HIGHS) +size_t +strlen_musl(char const *s) +{ + char const *a = s; +#ifdef __GNUC__ + typedef size_t __attribute__((__may_alias__)) word; + word const *w; + for (; (uintptr_t)s % sizeof *w; s++) if (!*s) return s-a; + for (w = (void const*)s; !HASZERO(*w); w++); + s = (void const*)w; +#endif + for (; *s; s++); + return s-a; +} + +#define strlen_libc strlen + +#define IMPLS(f) \ + f(scalar) \ + f(scalar_autovec) \ + IFHOSTED(f(libc)) \ + f(musl) \ + MX(f, rvv_page_aligned) \ + MX(f, rvv) \ + + +typedef size_t Func(char const *s); + +#define DECLARE(f) extern Func strlen_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &strlen_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +ux last; + +void init(void) { + for (size_t i = 0; i < MAX_MEM; ++i) + mem[i] += !mem[i]; // remove null bytes +} + +ux checksum(size_t n) { 
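+	/* last holds the length returned by the most recent call,
+	 * recorded in the BENCH(base) body below */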
return last; } + +BENCH(base) { + char *p = (char*)mem + (urand() % 511); + p[n] = 0; + TIME last = f(p); + p[n] = urand() | 1; +} BENCH_END + +Bench benches[] = { + { MAX_MEM - 521, "strlen", bench_base }, +}; BENCH_MAIN(impls, benches) + diff --git a/tests/rvv_bench/utf8_count/utf8_count.S b/tests/rvv_bench/utf8_count/utf8_count.S new file mode 100644 index 000000000..41a079693 --- /dev/null +++ b/tests/rvv_bench/utf8_count/utf8_count.S @@ -0,0 +1,213 @@ +#if 0 +size_t utf8_count_rvv(char const *buf, size_t len) { + size_t sum = 0; + for (size_t vl; len > 0; len -= vl, buf += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((void*)buf, vl); + vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); + sum += __riscv_vcpop_m_b1(mask, vl); + } + return sum; +} +#endif + +#ifdef MX + +.global MX(utf8_count_rvv_) +MX(utf8_count_rvv_): + li a2, 0 + li a3, -65 +1: + vsetvli a4, a1, e8, MX(), ta, ma + vle8.v v8, (a0) + vmsgt.vx v16, v8, a3 + vcpop.m a5, v16 + add a2, a2, a5 + sub a1, a1, a4 + add a0, a0, a4 + bnez a1, 1b + mv a0, a2 + ret + +.global MX(utf8_count_rvv_align_) +MX(utf8_count_rvv_align_): + mv a2, a0 + li a0, 0 + li a3, -65 + vsetvli t0, zero, e8, MX(), ta, ma # vlen + bltu a1, t0, 2f # len < vlen + # align dest to vlen + sub t1, zero, a2 + remu t1, t1, t0 # align = (-dest) % vlen + vsetvli t0, t1, e8, MX(), ta, ma +1: + vle8.v v8,(a2) + vmsgt.vx v16, v8, a3 + vcpop.m a4, v16 + add a0, a0, a4 + sub a1, a1, t0 + add a2, a2, t0 +2: + vsetvli t0, a1, e8, MX(), ta, ma + bnez a1, 1b + ret + +.global MX(utf8_count_rvv_tail_) +MX(utf8_count_rvv_tail_): + vsetvli t0, a1, e8, MX(), ta, ma + remu a2, a1, t0 # tail = n % vlenb + sub a1, a1, a2 # n -= tail + add a3, a0, a1 # end = dest + n + mv a1, a0 # n = dest + li a0, 0 + li t1, -65 +1: + vle8.v v8, (a1) + vmsgt.vx v16, v8, t1 + vcpop.m t2, v16 + add a0, a0, t2 + add a1, a1, t0 # src += vlenb + bltu a1, a3, 1b # dest < end + # copy tail + vsetvli zero, a2, e8, MX(), ta, ma + vle8.v v8, (a1) + vmsgt.vx v16, v8, t1 + vcpop.m t2, v16 + add a0, a0, t2 + ret + +# this is supposed to test how well the implementation handles +# operations with an vl smaller than VLMAX +.global MX(utf8_count_rvv_128_) +MX(utf8_count_rvv_128_): + li t0, 128/8 + bgt a1, t0, 1f + mv t0, a1 +1: + vsetvli t0, t0, e8, MX(), ta, ma + remu a2, a1, t0 # tail = n % vlenb + sub a1, a1, a2 # n -= tail + add a3, a0, a1 # end = dest + n + mv a1, a0 # n = dest + li a0, 0 + li t1, -65 +1: + vle8.v v8, (a1) + vmsgt.vx v16, v8, t1 + vcpop.m t2, v16 + add a0, a0, t2 + add a1, a1, t0 # src += vlenb + bltu a1, a3, 1b # dest < end + # copy tail + vsetvli zero, a2, e8, MX(), ta, ma + vle8.v v8, (a1) + vmsgt.vx v16, v8, t1 + vcpop.m t2, v16 + add a0, a0, t2 + ret + + +.global MX(utf8_count_rvv_4x_) +MX(utf8_count_rvv_4x_): + mv a2, a0 + li a0, 0 + li a6, -65 +1: + vsetvli a4, a1, e8, MX(), ta, ma + vle8.v v8, (a2) + vmsgt.vx v16, v8, a6 + vcpop.m a7, v16 + sub a1, a1, a4 + add a2, a2, a4 + vsetvli a4, a1, e8, MX(), ta, ma + vle8.v v8, (a2) + vmsgt.vx v16, v8, a6 + vcpop.m a3, v16 + sub a1, a1, a4 + add a2, a2, a4 + vsetvli a4, a1, e8, MX(), ta, ma + vle8.v v8, (a2) + vmsgt.vx v16, v8, a6 + vcpop.m a5, v16 + sub a1, a1, a4 + add a2, a2, a4 + vsetvli a4, a1, e8, MX(), ta, ma + vle8.v v8, (a2) + add a0, a0, a7 + add a0, a0, a3 + add a0, a0, a5 + vmsgt.vx v16, v8, a6 + vcpop.m a3, v16 + add a0, a0, a3 + sub a1, a1, a4 + add a2, a2, a4 + bnez a1, 1b + ret + +// gcc generated from unrolled intrinsics implementation: +// https://godbolt.org/z/q75c6r3Ta +.global 
MX(utf8_count_rvv_4x_tail_) +MX(utf8_count_rvv_4x_tail_): + vsetvli a5, zero, e8, MX(), ta, ma + slli t3, a5, 2 + add a1, a0, a1 + add a2, a0, t3 + mv a4, a0 + bltu a1, a2, 5f + slli t4, a5, 1 + add t5, t4, a5 + li a0, 0 + li a6, -65 +1: + add a3, a5, a4 + vsetvli zero, zero, e8, MX(), ta, ma + add a7, t4, a4 + vle8.v v8, (a4) + vle8.v v16, (a3) + vmsgt.vx v8, v8, a6 + vmsgt.vx v16, v16, a6 + vcpop.m a3, v8 + vcpop.m t1, v16 + add a3, a3, t1 + vle8.v v8, (a7) + add a4, t5, a4 + vmsgt.vx v8, v8, a6 + vcpop.m a7, v8 + add a3, a3, a7 + vle8.v v8, (a4) + mv a4, a2 + vmsgt.vx v8, v8, a6 + add a2, a2, t3 + vcpop.m a7, v8 + add a3, a3, a7 + add a0, a0, a3 + bgeu a1, a2, 1b +2: + sub a3, a1, a4 + beq a1, a4, 4f + li a2, 0 + li a1, -65 +3: + vsetvli a5, a3, e8, MX(), ta, ma + sub a3, a3, a5 + vle8.v v8, (a4) + add a4, a4, a5 + vmsgt.vx v8, v8, a1 + vcpop.m a5, v8 + add a2, a2, a5 + bne a3, zero, 3b + add a0, a0, a2 +4: + ret +5: + li a0, 0 + j 2b + + + + +#endif + + + + diff --git a/tests/rvv_bench/utf8_count/utf8_count.c b/tests/rvv_bench/utf8_count/utf8_count.c new file mode 100644 index 000000000..ebe2e678c --- /dev/null +++ b/tests/rvv_bench/utf8_count/utf8_count.c @@ -0,0 +1,135 @@ +#include "bench.h" + +size_t +utf8_count_scalar(char const *str, size_t len) +{ + uint8_t const *p = (uint8_t const*)str; + size_t count = 0; + while (len--) count += (*p++ & 0xc0) != 0x80, BENCH_CLOBBER(); + return count; +} + +size_t +utf8_count_scalar_autovec(char const *str, size_t len) +{ + uint8_t const *p = (uint8_t const*)str; + size_t count = 0; + while (len--) count += (*p++ & 0xc0) != 0x80; + return count; +} + +#define GEN_SWAR(name, popc, clobber) \ + size_t \ + utf8_count_##name(char const *str, size_t len) \ + { \ + ux const BENCH_MAY_ALIAS *u; \ + size_t count = 0, tail = 0; \ +\ + uint8_t const *u8 = (uint8_t const*)str; \ + if (len < sizeof *u) { \ + tail = len; \ + goto skip; \ + } \ +\ + tail = sizeof *u - (uintptr_t)str % sizeof *u; \ +\ + len -= tail; \ + while (tail--) \ + count += (*u8++ & 0xC0) != 0x80, clobber; \ +\ + u = (ux const*)u8; \ + tail = len % sizeof *u; \ +\ + for (len /= sizeof *u; len--; ++u) { \ + ux b1 = ~*u & (ux)0x8080808080808080; \ + ux b2 = *u & (ux)0x4040404040404040; \ + count += popc((b1 >> 1) | b2); \ + clobber; \ + } \ +\ + u8 = (uint8_t const*)u; \ + skip: \ + while (tail--) \ + count += (*u8++ & 0xC0) != 0x80, clobber; \ + return count; \ + } + +#if __riscv_zbb +GEN_SWAR(SWAR_popc,__builtin_popcountll,BENCH_CLOBBER()) +GEN_SWAR(SWAR_popc_autovec,__builtin_popcountll,(void)0) +# define POPC(f) f(SWAR_popc) f(SWAR_popc_autovec) +#else +# define POPC(f) +#endif + +static inline int +upopcnt(ux x) +{ + /* 2-bit sums */ + x -= (x >> 1) & (-(ux)1/3); + /* 4-bit sums */ + x = (x & (-(ux)1/15*3)) + ((x >> 2) & (-(ux)1/15*3)); + /* 8-bit sums */ + x = (x + (x >> 4)) & (-(ux)1/255*15); + BENCH_VOLATILE_REG(x); + /* now we can just add the sums together, because can't overflow, + * since there can't be more than 255 bits set */ + x += (x >> 8); /* 16-bit sums */ + x += (x >> 16); /* sum 16-bit sums */ + IF64(x += (x >> 32)); /* sum 32-bit sums */ + return x & 127; +} + + +GEN_SWAR(SWAR_popc_bithack,upopcnt,BENCH_CLOBBER()) +GEN_SWAR(SWAR_popc_bithack_autovec,upopcnt,(void)0) + + +#define IMPLS(f) \ + MX(f, rvv) \ + f(scalar) \ + f(scalar_autovec) \ + POPC(f) \ + f(SWAR_popc_bithack) \ + f(SWAR_popc_bithack_autovec) \ + MX(f, rvv_align) \ + MX(f, rvv_tail) \ + MX(f, rvv_128) \ + MX(f, rvv_4x) \ + MX(f, rvv_4x_tail) \ + +typedef size_t Func(char const *str, size_t len); + 
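+/* IMPLS is an X-macro: IMPLS(DECLARE) below emits one extern declaration
+ * per implementation and IMPLS(EXTRACT) builds the {name, function}
+ * table handed to BENCH_MAIN at the bottom of the file. */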
+#define DECLARE(f) extern Func utf8_count_##f; +IMPLS(DECLARE) + +#define EXTRACT(f) { #f, &utf8_count_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; + +char *str; +ux last; + +void init(void) { } +ux checksum(size_t n) { return last; } + +void common(size_t n, size_t off) { + str = (char*)mem + off; + memrand(str, n + 9); +} + +BENCH(base) { + common(n, urand() & 511); + TIME last = (uintptr_t)f(str, n); +} BENCH_END + +BENCH(aligned) { + common(n, 0); + TIME last = (uintptr_t)f(str, n); +} BENCH_END + +Bench benches[] = { + { MAX_MEM - 521, "utf8 count", bench_base }, + { MAX_MEM - 521, "utf8 count aligned", bench_aligned } +}; BENCH_MAIN(impls, benches) + +
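Note on the byte classification used by utf8_count (a minimal standalone sketch, not part of the patch): the scalar and SWAR counters test (b & 0xC0) != 0x80, the assembly uses a signed vmsgt.vx compare against -65, and GEN_SWAR reduces the same predicate to ((~b & 0x80) >> 1) | (b & 0x40). The hosted C program below (the file name check_utf8_pred.c is only illustrative) verifies that all three formulations agree for every byte value:

/* check_utf8_pred.c: cc check_utf8_pred.c && ./a.out */
#include <stdint.h>
#include <stdio.h>

int main(void) {
	for (unsigned v = 0; v < 256; ++v) {
		uint8_t b = (uint8_t)v;
		int scalar = (b & 0xC0) != 0x80;              /* not a UTF-8 continuation byte */
		int rvv    = (int8_t)b > -65;                 /* what vmsgt.vx against -65 tests */
		int swar   = (((~b & 0x80) >> 1) | (b & 0x40)) != 0; /* GEN_SWAR per-byte bit */
		if (scalar != rvv || scalar != swar) {
			printf("mismatch at 0x%02x\n", v);
			return 1;
		}
	}
	puts("all 256 byte values agree");
	return 0;
}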