(don't merge) Re-add siphash #222

Draft
wants to merge 4 commits into base: master
129 changes: 129 additions & 0 deletions cbits/siphash-sse2.c
@@ -0,0 +1,129 @@
/*
* The original code was developed by Samuel Neves, and has been
* only lightly modified.
*
* Used with permission.
*/
#pragma GCC target("sse2")

#include <emmintrin.h>
#include "siphash.h"

#define _mm_roti_epi64(x, c) ((16 == (c)) ? _mm_shufflelo_epi16((x), _MM_SHUFFLE(2,1,0,3)) : _mm_xor_si128(_mm_slli_epi64((x), (c)), _mm_srli_epi64((x), 64-(c))))
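/* In the rotate macro above, the c == 16 case is a 16-bit word shuffle of
   the low lane (the SipHash state lives in the low 64 bits of each register
   and the high lane stays zero); other rotation counts fall back to two
   shifts and an XOR. */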

uint64_t hashable_siphash24_sse2(uint64_t ik0, uint64_t ik1, const u8 *m, size_t n)
{
__m128i v0, v1, v2, v3;
__m128i k0, k1;
__m128i mi, mask, len;
size_t i, k;
union { uint64_t gpr; __m128i xmm; } hash;
const u8 *p;

/* We used to use the _mm_set_epi32 intrinsic to initialize
SSE2 registers. This compiles to a movdqa instruction,
which requires 16-byte alignment. On 32-bit Windows, it
looks like ghc's runtime linker doesn't align ".rdata"
sections as requested, so we got segfaults for our trouble.

Now we use an intrinsic that cares less about alignment
(_mm_loadu_si128, aka movdqu) instead, and all seems
happy. */

static const u32 iv[6][4] = {
{ 0x70736575, 0x736f6d65, 0, 0 },
{ 0x6e646f6d, 0x646f7261, 0, 0 },
{ 0x6e657261, 0x6c796765, 0, 0 },
{ 0x79746573, 0x74656462, 0, 0 },
{ -1, -1, 0, 0 },
{ 255, 0, 0, 0 },
};

k0 = _mm_loadl_epi64((__m128i*)(&ik0));
k1 = _mm_loadl_epi64((__m128i*)(&ik1));

v0 = _mm_xor_si128(k0, _mm_loadu_si128((__m128i*) &iv[0]));
v1 = _mm_xor_si128(k1, _mm_loadu_si128((__m128i*) &iv[1]));
v2 = _mm_xor_si128(k0, _mm_loadu_si128((__m128i*) &iv[2]));
v3 = _mm_xor_si128(k1, _mm_loadu_si128((__m128i*) &iv[3]));

#define HALF_ROUND(a,b,c,d,s,t) \
do \
{ \
a = _mm_add_epi64(a, b); c = _mm_add_epi64(c, d); \
b = _mm_roti_epi64(b, s); d = _mm_roti_epi64(d, t); \
b = _mm_xor_si128(b, a); d = _mm_xor_si128(d, c); \
} while(0)

#define COMPRESS(v0,v1,v2,v3) \
do \
{ \
HALF_ROUND(v0,v1,v2,v3,13,16); \
v0 = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(1,0,3,2)); \
HALF_ROUND(v2,v1,v0,v3,17,21); \
v2 = _mm_shufflelo_epi16(v2, _MM_SHUFFLE(1,0,3,2)); \
} while(0)
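/* COMPRESS above performs one full SipRound on the four state registers;
   the _mm_shufflelo_epi16 calls are the 32-bit rotations of v0 and v2 from
   the reference implementation. */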

for(i = 0; i < (n-n%8); i += 8)
{
mi = _mm_loadl_epi64((__m128i*)(m + i));
v3 = _mm_xor_si128(v3, mi);
if (SIPHASH_ROUNDS == 2) {
COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
} else {
for (k = 0; k < SIPHASH_ROUNDS; ++k)
COMPRESS(v0,v1,v2,v3);
}
v0 = _mm_xor_si128(v0, mi);
}

p = m + n;

/* We must be careful to not trigger a segfault by reading an
unmapped page. So where is the end of our input? */

if (((uintptr_t) p & 4095) == 0)
/* Exactly at a page boundary: do not read past the end. */
mi = _mm_setzero_si128();
else if (((uintptr_t) p & 4095) <= 4088)
/* Inside a page: safe to read past the end, as we'll
mask out any bits we shouldn't have looked at below. */
mi = _mm_loadl_epi64((__m128i*)(m + i));
else
/* Within 8 bytes of the end of a page: ensure that
our final read re-reads some bytes so that we do
not cross the page boundary, then shift our result
right so that the re-read bytes vanish. */
mi = _mm_srli_epi64(_mm_loadl_epi64((__m128i*)(((uintptr_t) m + i) & ~7)),
8 * (((uintptr_t) m + i) % 8));

len = _mm_set_epi32(0, 0, (n&0xff) << 24, 0);
mask = _mm_srli_epi64(_mm_loadu_si128((__m128i*) &iv[4]), 8*(8-n%8));
mi = _mm_xor_si128(_mm_and_si128(mi, mask), len);

v3 = _mm_xor_si128(v3, mi);
if (SIPHASH_ROUNDS == 2) {
COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
} else {
for (k = 0; k < SIPHASH_ROUNDS; ++k)
COMPRESS(v0,v1,v2,v3);
}
v0 = _mm_xor_si128(v0, mi);

v2 = _mm_xor_si128(v2, _mm_loadu_si128((__m128i*) &iv[5]));
if (SIPHASH_FINALROUNDS == 4) {
COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
} else {
for (k = 0; k < SIPHASH_FINALROUNDS; ++k)
COMPRESS(v0,v1,v2,v3);
}

v0 = _mm_xor_si128(_mm_xor_si128(v0, v1), _mm_xor_si128(v2, v3));
hash.xmm = v0;

#undef COMPRESS
#undef HALF_ROUND
//return _mm_extract_epi32(v0, 0) | (((uint64_t)_mm_extract_epi32(v0, 1)) << 32);
return hash.gpr;
}
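The len/mask lines near the end of hashable_siphash24_sse2 build SipHash's final 8-byte block: the surviving tail bytes sit in the low positions and the message length (mod 256) goes in the most significant byte. For comparison with the reference code in cbits/siphash.c, a scalar sketch of the same construction follows; it is not part of the patch and, like the SSE2 code, assumes a little-endian host.

/* Not part of this patch: scalar sketch of the final-block construction
 * done by the len/mask lines above. Assumes a little-endian host. */
#include <stdint.h>
#include <string.h>

static uint64_t final_block_sketch(const unsigned char *m, size_t n)
{
    uint64_t last = 0;
    memcpy(&last, m + (n & ~(size_t)7), n % 8); /* the 0..7 tail bytes    */
    last |= (uint64_t)(n & 0xff) << 56;         /* length in the top byte */
    return last;
}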
86 changes: 86 additions & 0 deletions cbits/siphash-sse41.c
@@ -0,0 +1,86 @@
/*
* The original code was developed by Samuel Neves, and has been
* only lightly modified.
*
* Used with permission.
*/
#pragma GCC target("sse4.1")

#include <smmintrin.h>
#include "siphash.h"

// Specialized for siphash, do not reuse
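// (It performs the rotation only on the high 64-bit lane; HALF_ROUND's blend
// keeps just that lane from the rotate-by-16 result, and that lane is where
// state word v3 sits in this packing.)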
#define rotate16(x) _mm_shufflehi_epi16((x), _MM_SHUFFLE(2,1,0,3))

#define _mm_roti_epi64(x, c) (((c) == 16) ? rotate16((x)) : _mm_xor_si128(_mm_slli_epi64((x), (c)), _mm_srli_epi64((x), 64-(c))))
//#define _mm_roti_epi64(x, c) _mm_xor_si128(_mm_slli_epi64((x), (c)), _mm_srli_epi64((x), 64-(c)))


uint64_t hashable_siphash24_sse41(uint64_t _k0, uint64_t _k1, const unsigned char *m, size_t n)
{
__m128i v0, v1, v02, v13;
__m128i k0;
__m128i mi, mask, len, h;
const __m128i zero = _mm_setzero_si128();
size_t i, k;
union { uint64_t gpr; __m128i xmm; } hash;
unsigned char key[16];

((uint64_t *)key)[0] = _k0;
((uint64_t *)key)[1] = _k1;

k0 = _mm_loadu_si128((__m128i*)(key + 0));

v0 = _mm_xor_si128(k0, _mm_set_epi32(0x646f7261, 0x6e646f6d, 0x736f6d65, 0x70736575));
v1 = _mm_xor_si128(k0, _mm_set_epi32(0x74656462, 0x79746573, 0x6c796765, 0x6e657261));

v02 = _mm_unpacklo_epi64(v0, v1);
v13 = _mm_unpackhi_epi64(v0, v1);
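/* v02 now packs SipHash state words v0 (low lane) and v2 (high lane), and
   v13 packs v1 and v3, so each COMPRESS updates all four words with
   two-wide SIMD operations. */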

#define HALF_ROUND(a,b,s,t) \
do \
{ \
__m128i b1,b2; \
a = _mm_add_epi64(a, b); \
b1 = _mm_roti_epi64(b, s); b2 = _mm_roti_epi64(b, t); b = _mm_blend_epi16(b1, b2, 0xF0); \
b = _mm_xor_si128(b, a); \
} while(0)

#define COMPRESS(v02,v13) \
do \
{ \
HALF_ROUND(v02,v13,13,16); \
v02 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
HALF_ROUND(v02,v13,17,21); \
v02 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
} while(0)

for(i = 0; i < (n-n%8); i += 8)
{
mi = _mm_loadl_epi64((__m128i*)(m + i));
v13 = _mm_xor_si128(v13, _mm_unpacklo_epi64(zero, mi));
for(k = 0; k < SIPHASH_ROUNDS; ++k) COMPRESS(v02,v13);
v02 = _mm_xor_si128(v02, mi);
}

mi = _mm_loadl_epi64((__m128i*)(m + i));
len = _mm_set_epi32(0, 0, (n&0xff) << 24, 0);
mask = _mm_srli_epi64(_mm_set_epi32(0, 0, 0xffffffff, 0xffffffff), 8*(8-n%8));
mi = _mm_xor_si128(_mm_and_si128(mi, mask), len);

v13 = _mm_xor_si128(v13, _mm_unpacklo_epi64(zero, mi));
for(k = 0; k < SIPHASH_ROUNDS; ++k) COMPRESS(v02,v13);
v02 = _mm_xor_si128(v02, mi);

v02 = _mm_xor_si128(v02, _mm_set_epi32(0, 0xff, 0, 0));
for(k = 0; k < SIPHASH_FINALROUNDS; ++k) COMPRESS(v02,v13);

v0 = _mm_xor_si128(v02, v13);
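/* The low lane now holds v0^v1 and the high lane v2^v3; the movehl below
   folds the two lanes together into the final 64-bit digest. */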
v0 = _mm_xor_si128(v0, _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(zero), _mm_castsi128_ps(v0))));
hash.xmm = v0;

#undef COMPRESS
#undef HALF_ROUND
//return _mm_extract_epi32(v0, 0) | (((uint64_t)_mm_extract_epi32(v0, 1)) << 32);
return hash.gpr;
}
155 changes: 155 additions & 0 deletions cbits/siphash.c
@@ -0,0 +1,155 @@
/* Almost a verbatim copy of the reference implementation. */

#include "siphash.h"
#include <stddef.h>

#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

#define SIPROUND \
do { \
v[0] += v[1]; \
v[1] = ROTL(v[1], 13); \
v[1] ^= v[0]; \
v[0] = ROTL(v[0], 32); \
v[2] += v[3]; \
v[3] = ROTL(v[3], 16); \
v[3] ^= v[2]; \
v[0] += v[3]; \
v[3] = ROTL(v[3], 21); \
v[3] ^= v[0]; \
v[2] += v[1]; \
v[1] = ROTL(v[1], 17); \
v[1] ^= v[2]; \
v[2] = ROTL(v[2], 32); \
} while (0)

#if defined(__i386)
#define _siphash24 plain_siphash24
#endif

static inline uint64_t odd_read(const u8 *p, int count, uint64_t val,
int shift) {
switch (count) {
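/* Each case deliberately falls through to the next, OR-ing in one more
   trailing byte per step. */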
case 7:
val |= ((uint64_t)p[6]) << (shift + 48);
case 6:
val |= ((uint64_t)p[5]) << (shift + 40);
case 5:
val |= ((uint64_t)p[4]) << (shift + 32);
case 4:
val |= ((uint64_t)p[3]) << (shift + 24);
case 3:
val |= ((uint64_t)p[2]) << (shift + 16);
case 2:
val |= ((uint64_t)p[1]) << (shift + 8);
case 1:
val |= ((uint64_t)p[0]) << shift;
}
return val;
}

static inline void _siphash_compression
( const int c
, uint64_t v[4] // this mutates, allowing you to keep on hashing
, const u8 *str
, const size_t len
){
const u8 *p;
const u8* end;

// compress message
for (p = str, end = str + (len & ~7); p < end; p += 8) {
uint64_t m = peek_uint64_tle((uint64_t *)p);
v[3] ^= m;
for (int i = 0; i < c; i++){
SIPROUND;
}
v[0] ^= m;
}

// compress remainder
uint64_t b = odd_read(p, len & 7, ((uint64_t)len) << 56, 0);

v[3] ^= b;
for (int i = 0; i < c; i++){
SIPROUND;
}
v[0] ^= b;
}

static inline uint64_t _siphash_finalize
( const int d
, uint64_t v[4] // this mutates, allowing you to keep on hashing
){
v[2] ^= 0xff;
if (d == 4) {
SIPROUND;
SIPROUND;
SIPROUND;
SIPROUND;
} else {
for (int i = 0; i < d; i++)
SIPROUND;
}
return v[0] ^ v[1] ^ v[2] ^ v[3];
}

#if defined(__i386)
#undef _siphash24

static uint64_t (*_siphash24)(uint64_t k0, uint64_t k1, const u8 *, size_t);

static void maybe_use_sse() __attribute__((constructor));

static void maybe_use_sse() {
uint32_t eax = 1, ebx, ecx, edx;

__asm volatile("mov %%ebx, %%edi;" /* 32bit PIC: don't clobber ebx */
"cpuid;"
"mov %%ebx, %%esi;"
"mov %%edi, %%ebx;"
: "+a"(eax), "=S"(ebx), "=c"(ecx), "=d"(edx)
:
: "edi");

#if defined(HAVE_SSE2)
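/* CPUID leaf 1 feature bits: EDX bit 26 signals SSE2, ECX bit 19 signals
   SSE4.1. */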
if (edx & (1 << 26))
_siphash24 = hashable_siphash24_sse2;
#if defined(HAVE_SSE41)
else if (ecx & (1 << 19))
_siphash24 = hashable_siphash24_sse41;
#endif
else
#endif
_siphash24 = plain_siphash24;
}

#endif

/* ghci's linker fails to call static initializers. */
static inline void ensure_sse_init() {
#if defined(__i386)
if (_siphash24 == NULL)
maybe_use_sse();
#endif
}

void hashable_siphash_init(uint64_t k0, uint64_t k1, uint64_t *v) {
ensure_sse_init();
v[0] = 0x736f6d6570736575ull ^ k0;
v[1] = 0x646f72616e646f6dull ^ k1;
v[2] = 0x6c7967656e657261ull ^ k0;
v[3] = 0x7465646279746573ull ^ k1;
}

/*
* Used for ByteArray#.
*/
void hashable_siphash_compression(const int c, uint64_t v[4], const u8 *str,
size_t off, size_t len) {
_siphash_compression(c, v, str + off, len);
}

uint64_t hashable_siphash_finalize(const int d, uint64_t *v) {
return _siphash_finalize(d, v);
}
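Taken together, hashable_siphash_init, hashable_siphash_compression and hashable_siphash_finalize expose SipHash-c-d as a resumable three-step API: init seeds the four state words from the key, each compression call folds a buffer into the state (mutating v so further calls can continue), and finalize produces the 64-bit digest. The following is a minimal sketch of a caller computing SipHash-2-4 over a single buffer, assuming only the declarations in cbits/siphash.h; it is illustrative rather than part of the patch.

/* Not part of this patch: illustrative caller for the exported entry points,
 * computing SipHash-2-4 over one contiguous buffer. */
#include <stdint.h>
#include <stddef.h>
#include "siphash.h" /* u8 and the hashable_siphash_* prototypes (assumed) */

uint64_t siphash24_sketch(uint64_t k0, uint64_t k1, const u8 *buf, size_t len)
{
    uint64_t v[4];
    hashable_siphash_init(k0, k1, v);                /* seed state from the key  */
    hashable_siphash_compression(2, v, buf, 0, len); /* c = 2 compression rounds */
    return hashable_siphash_finalize(4, v);          /* d = 4 final rounds       */
}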