This repository has been archived by the owner on Sep 18, 2023. It is now read-only.

fast mod for shuffle splitter #1073

Draft · wants to merge 1 commit into main
14 changes: 6 additions & 8 deletions native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -1453,14 +1453,12 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch&
   }
   for (auto i = 0; i < num_rows; ++i) {
     // positive mod
-    auto pid = pid_arr->Value(i) % num_partitions_;
-    // force to generate ASM
-    __asm__(
-        "lea (%[num_partitions],%[pid],1),%[tmp]\n"
-        "test %[pid],%[pid]\n"
-        "cmovs %[tmp],%[pid]\n"
-        : [ pid ] "+r"(pid)
-        : [ num_partitions ] "r"(num_partitions_), [ tmp ] "r"(0));
+    auto localPid = pid_arr->Value(i);
+    auto positiveLocalPid = localPid < 0 ? -localPid : localPid;
+    uint64_t M = fastmod::computeM_s32(num_partitions_);  // do once
+    // auto pid = positiveLocalPid % num_partitions_;
+    auto pid = fastmod::fastmod_s32(positiveLocalPid, M, num_partitions_);
+
     partition_id_[i] = pid;
     partition_id_cnt_[pid]++;
   }
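For reference, here is a minimal standalone sketch (not part of this diff) of the fastmod-based positive-mod pattern the hunk above introduces, with the divisor-dependent constant M hoisted out of the per-row loop as the "do once" comment suggests. The function name, parameters, and container types are illustrative assumptions, not code from this PR.

#include <cstdint>
#include <vector>

#include "third_party/fastmod.h"

// Sketch only: assumes num_partitions > 0 and hashes holds raw 32-bit partition hashes.
void CountPartitionIds(const std::vector<int32_t>& hashes, int32_t num_partitions,
                       std::vector<int32_t>* partition_id,
                       std::vector<int64_t>* partition_id_cnt) {
  // Precompute the magic constant once per divisor, outside the per-row loop.
  const uint64_t M = fastmod::computeM_s32(num_partitions);
  for (size_t i = 0; i < hashes.size(); ++i) {
    const int32_t h = hashes[i];
    // Absolute value, as in the PR's positiveLocalPid (assumes h != INT32_MIN).
    const int32_t positive = h < 0 ? -h : h;
    // Equivalent to positive % num_partitions, computed with a multiply-and-shift
    // instead of a hardware divide.
    const int32_t pid = fastmod::fastmod_s32(positive, M, num_partitions);
    (*partition_id)[i] = pid;
    (*partition_id_cnt)[pid]++;
  }
}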
1 change: 1 addition & 0 deletions native-sql-engine/cpp/src/shuffle/splitter.h
@@ -31,6 +31,7 @@

 #include "shuffle/type.h"
 #include "shuffle/utils.h"
+#include "third_party/fastmod.h"

 namespace sparkcolumnarplugin {
 namespace shuffle {
200 changes: 200 additions & 0 deletions native-sql-engine/cpp/src/third_party/fastmod.h
@@ -0,0 +1,200 @@
#ifndef FASTMOD_H
#define FASTMOD_H

#ifndef __cplusplus
#include <stdbool.h>
#include <stdint.h>
#else
// In C++ <cstdbool>/<stdbool.h> are irrelevant as bool is already a type
#include <cstdint>
#endif

#ifndef __cplusplus
#define FASTMOD_API static inline
#else
// In C++ we mark all the functions inline.
// If C++14 relaxed constexpr is supported we use constexpr so functions
// can be used at compile-time.
#if __cpp_constexpr >= 201304 && !defined(_MSC_VER)
// Visual Studio does not like constexpr
#define FASTMOD_API constexpr
#define FASTMOD_CONSTEXPR constexpr
#else
#define FASTMOD_API inline
#define FASTMOD_CONSTEXPR
#endif
#endif

#ifdef _MSC_VER
#include <intrin.h>
#endif

#ifdef __cplusplus
namespace fastmod {
#endif

#ifdef _MSC_VER

// __umulh is only available in x64 mode under Visual Studio: don't compile to 32-bit!
FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
return __umulh(lowbits, d);
}

#else // _MSC_VER NOT defined

FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
return ((__uint128_t)lowbits * d) >> 64;
}

FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) {
return ((__int128_t)lowbits * d) >> 64;
}

// This is for the 64-bit functions.
FASTMOD_API uint64_t mul128_u64(__uint128_t lowbits, uint64_t d) {
__uint128_t bottom_half =
(lowbits & UINT64_C(0xFFFFFFFFFFFFFFFF)) * d; // Won't overflow
bottom_half >>= 64; // Only need the top 64 bits, as we'll shift the lower half away;
__uint128_t top_half = (lowbits >> 64) * d;
__uint128_t both_halves =
bottom_half + top_half; // Both halves are already shifted down by 64
both_halves >>= 64; // Get top half of both_halves
return (uint64_t)both_halves;
}

#endif // _MSC_VER

/**
* Unsigned integers.
* Usage:
* uint32_t d = ... ; // divisor, should be non-zero
* uint64_t M = computeM_u32(d); // do once
* fastmod_u32(a,M,d) is a % d for all 32-bit a.
*
**/

// M = ceil( (1<<64) / d ), d > 0
FASTMOD_API uint64_t computeM_u32(uint32_t d) {
return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1;
}

// fastmod computes (a % d) given precomputed M
FASTMOD_API uint32_t fastmod_u32(uint32_t a, uint64_t M, uint32_t d) {
uint64_t lowbits = M * a;
return (uint32_t)(mul128_u32(lowbits, d));
}

// fastdiv computes (a / d) given precomputed M for d>1
FASTMOD_API uint32_t fastdiv_u32(uint32_t a, uint64_t M) {
return (uint32_t)(mul128_u32(M, a));
}

// given precomputed M, checks whether n % d == 0
FASTMOD_API bool is_divisible(uint32_t n, uint64_t M) { return n * M <= M - 1; }

/**
* signed integers
* Usage:
* int32_t d = ... ; // should be non-zero and between [-2147483647,2147483647]
* int32_t positive_d = d < 0 ? -d : d; // absolute value
* uint64_t M = computeM_s32(d); // do once
* fastmod_s32(a,M,positive_d) is a % d for all 32-bit a.
**/

// M = floor( (1<<64) / d ) + 1
// d must not be 0 or -2147483648
// if d = -1 and a = -2147483648, the result is undefined
FASTMOD_API uint64_t computeM_s32(int32_t d) {
if (d < 0) d = -d;
return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1 + ((d & (d - 1)) == 0 ? 1 : 0);
}

// fastmod computes (a % d) given precomputed M,
// you should pass the absolute value of d
FASTMOD_API int32_t fastmod_s32(int32_t a, uint64_t M, int32_t positive_d) {
uint64_t lowbits = M * a;
int32_t highbits = mul128_u32(lowbits, positive_d);
return highbits - ((positive_d - 1) & (a >> 31));
}

#ifndef _MSC_VER
// fastdiv computes (a / d) given precomputed M; d must not
// be one of -1, 1, or -2147483648
FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) {
uint64_t highbits = mul128_s32(M, a);
highbits += (a < 0 ? 1 : 0);
if (d < 0) return -(int32_t)(highbits);
return (int32_t)(highbits);
}

// What follows are the 64-bit functions.
// They are currently not supported on Visual Studio
// due to the lack of a mul128_u64 function.
// They may not be faster than what the compiler
// can produce.

FASTMOD_API __uint128_t computeM_u64(uint64_t d) {
// what follows is just (((__uint128_t)0 - 1) / d) + 1 spelled out
__uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF);
M <<= 64;
M |= UINT64_C(0xFFFFFFFFFFFFFFFF);
M /= d;
M += 1;
return M;
}

FASTMOD_API __uint128_t computeM_s64(int64_t d) {
if (d < 0) d = -d;
__uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF);
M <<= 64;
M |= UINT64_C(0xFFFFFFFFFFFFFFFF);
M /= d;
M += 1;
M += ((d & (d - 1)) == 0 ? 1 : 0);
return M;
}

FASTMOD_API uint64_t fastmod_u64(uint64_t a, __uint128_t M, uint64_t d) {
__uint128_t lowbits = M * a;
return mul128_u64(lowbits, d);
}

FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) { return mul128_u64(M, a); }

// End of the 64-bit functions

#endif // #ifndef _MSC_VER

#ifdef __cplusplus

template <uint32_t d>
FASTMOD_API uint32_t fastmod(uint32_t x) {
FASTMOD_CONSTEXPR uint64_t v = computeM_u32(d);
return fastmod_u32(x, v, d);
}
template <uint32_t d>
FASTMOD_API uint32_t fastdiv(uint32_t x) {
FASTMOD_CONSTEXPR uint64_t v = computeM_u32(d);
return fastdiv_u32(x, v);
}
template <int32_t d>
FASTMOD_API int32_t fastmod(int32_t x) {
FASTMOD_CONSTEXPR uint64_t v = computeM_s32(d);
return fastmod_s32(x, v, d);
}
template <int32_t d>
FASTMOD_API int32_t fastdiv(int32_t x) {
FASTMOD_CONSTEXPR uint64_t v = computeM_s32(d);
return fastdiv_s32(x, v, d);
}

} // fastmod
#endif

// There's no reason to pollute the global scope with these macros once their use ends.
// This won't create any problems, as the preprocessor will have done its thing by
// the time it reaches this point.
#undef FASTMOD_API
#undef FASTMOD_CONSTEXPR

#endif // FASTMOD_H
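
As a standalone illustration (not part of this PR), here is a small sanity check that exercises the header's unsigned, signed, and templated entry points against the plain % operator. The include path and the divisors used below are placeholders.

#include <cassert>
#include <cstdint>

#include "fastmod.h"  // placeholder path; the PR vendors this file as third_party/fastmod.h

int main() {
  // Unsigned: precompute M once per divisor, then fastmod_u32(a, M, d) == a % d.
  const uint32_t d = 7;
  const uint64_t M = fastmod::computeM_u32(d);
  for (uint32_t a = 0; a < 1000; ++a) {
    assert(fastmod::fastmod_u32(a, M, d) == a % d);
    assert(fastmod::fastdiv_u32(a, M) == a / d);  // fastdiv_u32 requires d > 1
  }

  // Signed: compute M from d once, then pass the absolute value of d to fastmod_s32.
  const int32_t sd = 7;
  const uint64_t sM = fastmod::computeM_s32(sd);
  for (int32_t a = -1000; a < 1000; ++a) {
    // fastmod_s32 follows C/C++ truncated-remainder semantics, so negative a
    // yields a negative (or zero) remainder, just like a % sd.
    assert(fastmod::fastmod_s32(a, sM, sd) == a % sd);
  }

  // Compile-time divisor via the C++ templated helpers.
  assert(fastmod::fastmod<7U>(100U) == 100U % 7U);
  return 0;
}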