
fast mod for shuffle splitter
Signed-off-by: Yuan Zhou <[email protected]>
zhouyuan committed Aug 22, 2022
1 parent 25fa211 commit bf4a240
Showing 3 changed files with 215 additions and 8 deletions.
14 changes: 6 additions & 8 deletions native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -1453,14 +1453,12 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch&
   }
   for (auto i = 0; i < num_rows; ++i) {
     // positive mod
-    auto pid = pid_arr->Value(i) % num_partitions_;
-    // force to generate ASM
-    __asm__(
-        "lea (%[num_partitions],%[pid],1),%[tmp]\n"
-        "test %[pid],%[pid]\n"
-        "cmovs %[tmp],%[pid]\n"
-        : [ pid ] "+r"(pid)
-        : [ num_partitions ] "r"(num_partitions_), [ tmp ] "r"(0));
+    auto localPid = pid_arr->Value(i);
+    auto positiveLocalPid = localPid < 0 ? -localPid : localPid;
+    uint64_t M = fastmod::computeM_s32(num_partitions_);  // do once
+    //auto pid = positiveLocalPid % num_partitions_;
+    auto pid = fastmod::fastmod_s32(positiveLocalPid, M, num_partitions_);
+
     partition_id_[i] = pid;
     partition_id_cnt_[pid]++;
   }
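Note on the hunk above: the removed path computed value % num_partitions_ and then used the lea/test/cmovs inline asm to add num_partitions_ back for negative remainders, i.e. ((v % n) + n) % n. The added path takes the absolute value of the row's hash and reduces it with a Lemire-style fastmod (one 64-bit multiply against a precomputed magic constant M), so a negative input v now maps to |v| % n rather than ((v % n) + n) % n; both results lie in [0, n). Also note that computeM_s32 is called inside the row loop even though M depends only on num_partitions_. Below is a minimal standalone sketch of the new per-row logic with that precomputation hoisted; the function and variable names are illustrative, not taken from the patch.

#include <cstddef>
#include <cstdint>
#include <vector>

#include "third_party/fastmod.h"  // header added by this commit

// Sketch: map possibly-negative 32-bit hash values to partition ids in
// [0, num_partitions), mirroring the per-row logic of the patched loop above.
std::vector<int32_t> ComputePartitionIds(const std::vector<int32_t>& values,
                                         int32_t num_partitions) {
  // M depends only on the divisor, so it can be computed once per batch.
  const uint64_t M = fastmod::computeM_s32(num_partitions);
  std::vector<int32_t> pids(values.size());
  for (std::size_t i = 0; i < values.size(); ++i) {
    const int32_t v = values[i];
    const int32_t positive_v = v < 0 ? -v : v;  // the patch reduces |v|, not the signed value
    pids[i] = fastmod::fastmod_s32(positive_v, M, num_partitions);
  }
  return pids;
}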
2 changes: 2 additions & 0 deletions native-sql-engine/cpp/src/shuffle/splitter.h
@@ -32,6 +32,8 @@
#include "shuffle/type.h"
#include "shuffle/utils.h"

#include "third_party/fastmod.h"

namespace sparkcolumnarplugin {
namespace shuffle {

207 changes: 207 additions & 0 deletions native-sql-engine/cpp/src/third_party/fastmod.h
@@ -0,0 +1,207 @@
#ifndef FASTMOD_H
#define FASTMOD_H

#ifndef __cplusplus
#include <stdbool.h>
#include <stdint.h>
#else
// In C++ <cstdbool>/<stdbool.h> are irrelevant as bool is already a type
#include <cstdint>
#endif

#ifndef __cplusplus
#define FASTMOD_API static inline
#else
// In C++ we mark all the functions inline.
// If C++14 relaxed constexpr is supported we use constexpr so functions
// can be used at compile-time.
#if __cpp_constexpr >= 201304 && !defined(_MSC_VER)
// visual studio does not like constexpr
#define FASTMOD_API constexpr
#define FASTMOD_CONSTEXPR constexpr
#else
#define FASTMOD_API inline
#define FASTMOD_CONSTEXPR
#endif
#endif

#ifdef _MSC_VER
#include <intrin.h>
#endif

#ifdef __cplusplus
namespace fastmod {
#endif

#ifdef _MSC_VER

// __umulh is only available in x64 mode under Visual Studio: don't compile to 32-bit!
FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
return __umulh(lowbits, d);
}

#else // _MSC_VER NOT defined

FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
return ((__uint128_t)lowbits * d) >> 64;
}

FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) {
return ((__int128_t)lowbits * d) >> 64;
}

// This is for the 64-bit functions.
FASTMOD_API uint64_t mul128_u64(__uint128_t lowbits, uint64_t d) {
__uint128_t bottom_half = (lowbits & UINT64_C(0xFFFFFFFFFFFFFFFF)) * d; // Won't overflow
bottom_half >>= 64; // Only need the top 64 bits, as we'll shift the lower half away;
__uint128_t top_half = (lowbits >> 64) * d;
__uint128_t both_halves = bottom_half + top_half; // Both halves are already shifted down by 64
both_halves >>= 64; // Get top half of both_halves
return (uint64_t)both_halves;
}

#endif // _MSC_VER

/**
* Unsigned integers.
* Usage:
* uint32_t d = ... ; // divisor, should be non-zero
* uint64_t M = computeM_u32(d); // do once
* fastmod_u32(a,M,d) is a % d for all 32-bit a.
*
**/


// M = ceil( (1<<64) / d ), d > 0
FASTMOD_API uint64_t computeM_u32(uint32_t d) {
return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1;
}

// fastmod computes (a % d) given precomputed M
FASTMOD_API uint32_t fastmod_u32(uint32_t a, uint64_t M, uint32_t d) {
uint64_t lowbits = M * a;
return (uint32_t)(mul128_u32(lowbits, d));
}

// fastmod computes (a / d) given precomputed M for d>1
FASTMOD_API uint32_t fastdiv_u32(uint32_t a, uint64_t M) {
return (uint32_t)(mul128_u32(M, a));
}

// given precomputed M, checks whether n % d == 0
FASTMOD_API bool is_divisible(uint32_t n, uint64_t M) {
return n * M <= M - 1;
}

/**
* signed integers
* Usage:
* int32_t d = ... ; // should be non-zero and between [-2147483647,2147483647]
* int32_t positive_d = d < 0 ? -d : d; // absolute value
* uint64_t M = computeM_s32(d); // do once
* fastmod_s32(a,M,positive_d) is a % d for all 32-bit a.
**/

// M = floor( (1<<64) / d ) + 1
// you must have that d is different from 0 and -2147483648
// if d = -1 and a = -2147483648, the result is undefined
FASTMOD_API uint64_t computeM_s32(int32_t d) {
if (d < 0)
d = -d;
return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1 + ((d & (d - 1)) == 0 ? 1 : 0);
}

// fastmod computes (a % d) given precomputed M,
// you should pass the absolute value of d
FASTMOD_API int32_t fastmod_s32(int32_t a, uint64_t M, int32_t positive_d) {
uint64_t lowbits = M * a;
int32_t highbits = mul128_u32(lowbits, positive_d);
return highbits - ((positive_d - 1) & (a >> 31));
}

#ifndef _MSC_VER
// fastmod computes (a / d) given precomputed M; assumes d is not one of
// -1, 1, or -2147483648
FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) {
uint64_t highbits = mul128_s32(M, a);
highbits += (a < 0 ? 1 : 0);
if (d < 0)
return -(int32_t)(highbits);
return (int32_t)(highbits);
}

// What follows are the 64-bit functions.
// They are currently not supported on Visual Studio
// due to the lack of a mul128_u64 function.
// They may not be faster than what the compiler
// can produce.

FASTMOD_API __uint128_t computeM_u64(uint64_t d) {
// what follows is just (((__uint128_t)0 - 1) / d) + 1 spelled out
__uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF);
M <<= 64;
M |= UINT64_C(0xFFFFFFFFFFFFFFFF);
M /= d;
M += 1;
return M;
}

FASTMOD_API __uint128_t computeM_s64(int64_t d) {
if (d < 0)
d = -d;
__uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF);
M <<= 64;
M |= UINT64_C(0xFFFFFFFFFFFFFFFF);
M /= d;
M += 1;
M += ((d & (d - 1)) == 0 ? 1 : 0);
return M;
}

FASTMOD_API uint64_t fastmod_u64(uint64_t a, __uint128_t M, uint64_t d) {
__uint128_t lowbits = M * a;
return mul128_u64(lowbits, d);
}

FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) {
return mul128_u64(M, a);
}

// End of the 64-bit functions

#endif // #ifndef _MSC_VER

#ifdef __cplusplus

template<uint32_t d>
FASTMOD_API uint32_t fastmod(uint32_t x) {
FASTMOD_CONSTEXPR uint64_t v = computeM_u32(d);
return fastmod_u32(x, v, d);
}
template<uint32_t d>
FASTMOD_API uint32_t fastdiv(uint32_t x) {
FASTMOD_CONSTEXPR uint64_t v = computeM_u32(d);
return fastdiv_u32(x, v);
}
template<int32_t d>
FASTMOD_API int32_t fastmod(int32_t x) {
FASTMOD_CONSTEXPR uint64_t v = computeM_s32(d);
return fastmod_s32(x, v, d);
}
template<int32_t d>
FASTMOD_API int32_t fastdiv(int32_t x) {
FASTMOD_CONSTEXPR uint64_t v = computeM_s32(d);
return fastdiv_s32(x, v, d);
}

} // fastmod
#endif


// There's no reason to pollute the global scope with this macro once its use ends
// This won't create any problems as the preprocessor will have done its thing once
// it reaches this point
#undef FASTMOD_API
#undef FASTMOD_CONSTEXPR

#endif // FASTMOD_H
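
For reference, a minimal self-contained check of the API exposed by the header above (a sketch only; the expected values are ordinary integer arithmetic, and fastdiv_s32 is compiled only when _MSC_VER is not defined):

#include <cassert>
#include <cstdint>

#include "third_party/fastmod.h"

int main() {
  // Unsigned: precompute M once per divisor d, then replace a % d and a / d.
  const uint32_t d = 7;
  const uint64_t M = fastmod::computeM_u32(d);
  assert(fastmod::fastmod_u32(100u, M, d) == 100u % d);  // 2
  assert(fastmod::fastdiv_u32(100u, M) == 100u / d);     // 14
  assert(fastmod::is_divisible(98u, M));                 // 98 == 14 * 7
  assert(!fastmod::is_divisible(100u, M));

  // Signed: computeM_s32 accepts d of either sign; fastmod_s32 takes the
  // absolute value of d and returns the C truncated remainder (sign of a).
  const int32_t sd = 7;
  const uint64_t Ms = fastmod::computeM_s32(sd);
  assert(fastmod::fastmod_s32(-100, Ms, sd) == -100 % sd);  // -2
#ifndef _MSC_VER
  assert(fastmod::fastdiv_s32(-100, Ms, sd) == -100 / sd);  // -14
#endif
  return 0;
}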
