/*
- * Copyright (C) 2012, 2013
+ * Copyright (C) 2012, 2013, 2014
* Dale Weiler
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
#include "gmqcc.h"
#include <limits.h>
+#if defined(_MSC_VER)
+# define HASH_ROTL32(X, Y) _rotl((X), (Y))
+#elif defined (__GNUC__) && (defined(__i386__) || defined(__amd64__))
+ static GMQCC_FORCEINLINE uint32_t hash_rotl32(volatile uint32_t x, int8_t r) {
+ __asm__ __volatile__ ("roll %1,%0" : "+r"(x) : "c"(r));
+ return x;
+ }
+# define HASH_ROTL32(X, Y) hash_rotl32((volatile uint32_t)(X), (Y))
+#else
+# define HASH_ROTL32(X, Y) (((X) << (Y)) | ((X) >> (32 - (Y))))
+#endif
+
/*
* This is a version of the Murmur3 hashing function optimized for various
- * compilers/architectures. It uses the traditional Murmur2 mix stagin
+ * compilers/architectures. It uses the traditional Murmur2 mix staging
* but fixes the mix staging inner loops.
*
* Murmur 2 contains an inner loop such as:
* This means we have a 14.5% higher chance of collision. This is where
* Murmur3 comes in to save the day.
*/
-
-/*
- * Some rotation tricks:
- * MSVC one shaves off six instructions, where GCC optimized one for
- * x86 and amd64 shaves off four instructions. Native methods are often
- * optimized rather well at -O3, but not at -O2.
- */
-#if defined(_MSC_VER)
-# define HASH_ROTL32(X, Y) _rotl((X), (Y))
-#else
-static GMQCC_FORCEINLINE uint32_t hash_rotl32(volatile uint32_t x, int8_t r) {
-#if defined (__GNUC__) && (defined(__i386__) || defined(__amd64__))
- __asm__ __volatile__ ("roll %1,%0" : "+r"(x) : "c"(r));
- return x;
-#else /* ! (defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))) */
- return (x << r) | (x >> (32 - r));
-#endif
-}
-# define HASH_ROTL32(X, Y) hash_rotl32((volatile uint32_t)(X), (Y))
-#endif /* !(_MSC_VER) */
-
-static GMQCC_FORCEINLINE uint32_t hash_mix32(uint32_t hash) {
+static GMQCC_FORCEINLINE uint32_t hash_murmur_mix32(uint32_t hash) {
hash ^= hash >> 16;
hash *= 0x85EBCA6B;
hash ^= hash >> 13;
* case senario for Murmur3:
* http://code.google.com/p/smhasher/
*/
-#define HASH_MASK1 0xCC9E2D51
-#define HASH_MASK2 0x1B873593
-#define HASH_SEED 0x9747B28C
+#define HASH_MURMUR_MASK1 0xCC9E2D51
+#define HASH_MURMUR_MASK2 0x1B873593
+#define HASH_MURMUR_SEED 0x9747B28C
#if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
-# define HASH_NATIVE_SAFEREAD(PTR) (*((uint32_t*)(PTR)))
+# define HASH_MURMUR_SAFEREAD(PTR) (*((uint32_t*)(PTR)))
#elif PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_BIG
# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR >= 3))
-# define HASH_NATIVE_SAFEREAD(PTR) (__builtin_bswap32(*((uint32_t*)(PTR))))
+# define HASH_MURMUR_SAFEREAD(PTR) (__builtin_bswap32(*((uint32_t*)(PTR))))
# endif
#endif
/* Process individual bytes at this point since the endianess isn't known. */
-#ifndef HASH_NATIVE_SAFEREAD
-# define HASH_NATIVE_SAFEREAD(PTR) ((PTR)[0] | (PTR)[1] << 8 | (PTR)[2] << 16 | (PTR)[3] << 24)
+#ifndef HASH_MURMUR_SAFEREAD
+# define HASH_MURMUR_SAFEREAD(PTR) ((PTR)[0] | (PTR)[1] << 8 | (PTR)[2] << 16 | (PTR)[3] << 24)
#endif
-#define HASH_NATIVE_BLOCK(H, K) \
+#define HASH_MURMUR_BLOCK(H, K) \
do { \
- K *= HASH_MASK1; \
+ K *= HASH_MURMUR_MASK1; \
K = HASH_ROTL32(K, 15); \
- K *= HASH_MASK2; \
+ K *= HASH_MURMUR_MASK2; \
H ^= K; \
H = HASH_ROTL32(H, 13); \
H = H * 5 + 0xE6546B64; \
} while (0)
-
-#define HASH_NATIVE_BYTES(COUNT, H, C, N, PTR, LENGTH) \
+#define HASH_MURMUR_BYTES(COUNT, H, C, N, PTR, LENGTH) \
do { \
int i = COUNT; \
while (i--) { \
N++; \
LENGTH--; \
if (N == 4) { \
- HASH_NATIVE_BLOCK(H, C); \
+ HASH_MURMUR_BLOCK(H, C); \
N = 0; \
} \
} \
} while (0)
+#define HASH_MURMUR_TAIL(P, Z, H, C, N, PTR, LEN) \
+ do { \
+ LEN -= LEN/4*4; \
+ HASH_MURMUR_BYTES(LEN, H, C, N, PTR, LEN); \
+ *P = H; \
+ *Z = ((C) & ~0xFF) | (N); \
+ } while (0)
-/*
- * Highly unrolled at per-carry bit granularity instead of per-block granularity. This will achieve the
- * highest possible instruction level parallelism.
- */
-static GMQCC_FORCEINLINE void hash_native_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
+#if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
+static GMQCC_FORCEINLINE void hash_murmur_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
uint32_t h1 = *ph1;
uint32_t c = *carry;
const uint8_t *ptr = (uint8_t*)key;
const uint8_t *end;
- /* carry count from low 2 bits of carry value */
- int n = c & 3;
-
- /*
- * Unaligned word accesses are safe in LE. Thus we can obtain a little
- * more speed.
- */
-# if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
- /* Consume carry bits */
+ int n = c & 3;
int it = (4 - n) & 3;
if (it && it <= length)
- HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);
+ HASH_MURMUR_BYTES(it, h1, c, n, ptr, length);
- /* word size chunk consumption */
end = ptr + length/4*4;
for (; ptr < end; ptr += 4) {
- uint32_t k1 = HASH_NATIVE_SAFEREAD(ptr);
- HASH_NATIVE_BLOCK(h1, k1);
+ uint32_t k1 = HASH_MURMUR_SAFEREAD(ptr);
+ HASH_MURMUR_BLOCK(h1, k1);
}
-# else
- /*
- * Unsafe to assume unaligned word accesses. Thus we'll need to consume
- * to alignment then process in aligned block chunks.
- */
+ HASH_MURMUR_TAIL(ph1, carry, h1, c, n, ptr, length);
+}
+#else
+static GMQCC_FORCEINLINE void hash_murmur_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
uint32_t k1;
+ uint32_t h1 = *ph1;
+ uint32_t c = *carry;
+
+ const uint8_t *ptr = (uint8_t*)key;
+ const uint8_t *end;
+
+ int n = c & 3;
int it = -(long)ptr & 3;
if (it && it <= length)
- HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);
+ HASH_MURMUR_BYTES(it, h1, c, n, ptr, length);
- /*
- * Alignment has been reached, deal with aligned blocks, specializing for
- * all possible carry counts.
- */
end = ptr + length / 4 * 4;
switch (n) {
case 0:
for (; ptr < end; ptr += 4) {
- k1 = HASH_NATIVE_SAFEREAD(ptr);
- HASH_NATIVE_BLOCK(h1, k1);
- }
- break;
-
- case 1:
- for (; ptr < end; ptr += 4) {
- k1 = c >> 24;
- c = HASH_NATIVE_SAFEREAD(ptr);
- k1 |= c << 8;
- HASH_NATIVE_BLOCK(h1, k1);
- }
- break;
-
- case 2:
- for (; ptr < end; ptr += 4) {
- k1 = c >> 16;
- c = HASH_NATIVE_SAFEREAD(ptr);
- k1 |= c << 16;
- HASH_NATIVE_BLOCK(h1, k1);
+ k1 = HASH_MURMUR_SAFEREAD(ptr);
+ HASH_MURMUR_BLOCK(h1, k1);
}
break;
- case 3:
- for (; ptr < end; ptr += 4) {
- k1 = c >> 8;
- c = HASH_NATIVE_SAFEREAD(ptr);
- k1 |= c << 24;
- HASH_NATIVE_BLOCK(h1, k1);
- }
- break;
+# define NEXT(N, RIGHT, LEFT) \
+ case N: \
+ for (; ptr < end; ptr += 4) { \
+ k1 = c >> RIGHT; \
+ c = HASH_MURMUR_SAFEREAD(ptr); \
+ k1 |= c << LEFT; \
+ HASH_MURMUR_BLOCK(h1, k1); \
+ } \
+ break
+ NEXT(1, 24, 8);
+ NEXT(2, 16, 16);
+ NEXT(3, 8, 24);
+ #undef NEXT
}
-#endif /* misaligned reads */
-
- /*
- * Advanced over 32-bit chunks, this can possibly leave 1..3 bytes of
- * additional trailing content to process.
- */
- length -= length/4*4;
-
- HASH_NATIVE_BYTES(length, h1, c, n, ptr, length);
-
- *ph1 = h1;
- *carry = (c & ~0xFF) | n;
+ HASH_MURMUR_TAIL(ph1, carry, h1, c, n, ptr, length);
}
+#endif
-static GMQCC_FORCEINLINE uint32_t hash_native_result(uint32_t hash, uint32_t carry, size_t length) {
+static GMQCC_FORCEINLINE uint32_t hash_murmur_result(uint32_t hash, uint32_t carry, size_t length) {
uint32_t k1;
int n = carry & 3;
if (GMQCC_LIKELY(n)) {
k1 = carry >> (4 - n) * 8;
- k1 *= HASH_MASK1;
+ k1 *= HASH_MURMUR_MASK1;
k1 = HASH_ROTL32(k1, 15);
- k1 *= HASH_MASK2;
+ k1 *= HASH_MURMUR_MASK2;
hash ^= k1;
}
hash ^= length;
- hash = hash_mix32(hash);
+ hash = hash_murmur_mix32(hash);
return hash;
}
-static GMQCC_FORCEINLINE GMQCC_USED uint32_t hash_native(const void *GMQCC_RESTRICT key, size_t length) {
- uint32_t hash = HASH_SEED;
+static GMQCC_FORCEINLINE uint32_t hash_murmur(const void *GMQCC_RESTRICT key, size_t length) {
+ uint32_t hash = HASH_MURMUR_SEED;
uint32_t carry = 0;
-
- /* Seperate calls for inliner to deal with */
- hash_native_process(&hash, &carry, key, length);
- return hash_native_result(hash, carry, length);
+ hash_murmur_process(&hash, &carry, key, length);
+ return hash_murmur_result(hash, carry, length);
}
/*
- * Inline assembly optimized SSE version for when SSE is present via CPUID
- * or the host compiler has __SSE__. This is about 16 cycles faster than
- * native at -O2 for GCC and 11 cycles for -O3.
+ * The following hash function implements it's own strlen to avoid using libc's
+ * which isn't always slow but isn't always fastest either.
+ *
+ * Some things to note about this strlen that are otherwise confusing to grasp
+ * at first is that it does intentionally depend on undefined behavior.
*
- * Tested with -m32 on a Phenom II X4 with:
- * gcc version 4.8.1 20130725 (prerelease) (GCC)
+ * The first step to the strlen is to ensure alignment before checking words,
+ * without this step we risk crossing a page boundry with the word check and
+ * that would cause a crash.
+ *
+ * The second step to the strlen contains intentional undefined behavior. When
+ * accessing a word of any size, the first byte of that word is accessible if
+ * and only if the whole word is accessible because words are aligned. This is
+ * indicated by the fact that size / alignment always divides the page size.
+ * One could argue that an architecture exists where size_t and alignment are
+ * different, if that were the case, the alignment will always assume to be the
+ * size of the type (size_t). So it's always safe in that regard.
+ *
+ * In other words, an aligned 2^n load cannot cross a page boundry unless
+ * n > log2(PAGE_SIZE). There are no known architectures which support such
+ * a wide load larger than PAGE_SIZE.
+ *
+ * Valgrind and address sanatizer may choke on this because they're strictly
+ * trying to find bugs, it's a false positive to assume this is a bug when it's
+ * intentional. To prevent these false positives, both things need to be taught
+ * about the intentional behavior; for address sanatizer this can be done with
+ * a compiler attribute, effectively preventing the function from being
+ * instrumented. Valgrind requires a little more work as there is no way to
+ * downright prevent a function from being instrumented, instead we can mark
+ * + sizeof(size_t) bytes ahead of each byte we're reading as we calculate
+ * the length of the string, then we can make that additional + sizeof(size_t)
+ * on the end undefined after the length has been calculated.
+ *
+ * If the compiler doesn't have the attribute to disable address sanatizer
+ * instrumentation we fall back to using libc's strlen instead. This isn't the
+ * best solution. On windows we can assume this method always because neither
+ * address sanatizer or valgrind exist.
*/
-#if defined(__GNUC__) && defined(__i386__)
-static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) {
- uint32_t ret;
- __asm__ __volatile__ (
- " mov %%eax, %%ebx\n"
- " mov %2, %%eax\n"
- " movd %%eax, %%xmm7\n"
- " shufps $0, %%xmm7, %%xmm7\n"
- " mov %3, %%eax\n"
- " movd %%eax, %%xmm6\n"
- " shufps $0, %%xmm6, %%xmm6\n"
- " lea (%%esi, %%ecx, 1), %%edi\n"
- " jmp 2f\n"
- "1:\n"
- " movaps (%%esi), %%xmm0\n"
- " pmulld %%xmm7, %%xmm0\n"
- " movaps %%xmm0, %%xmm2\n"
- " pslld $15, %%xmm0\n"
- " psrld $17, %%xmm2\n"
- " orps %%xmm2, %%xmm0\n"
- " pmulld %%xmm6, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n"
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " add $16, %%esi\n"
- "2:\n"
- " cmp %%esi, %%edi\n"
- " jne 1b\n"
- " xor %%ecx, %%ebx\n"
- " mov %%ebx, %%eax\n"
- " shr $16, %%ebx\n"
- " xor %%ebx, %%eax\n"
- " imul $0x85EBCA6b, %%eax\n"
- " mov %%eax, %%ebx\n"
- " shr $13, %%ebx\n"
- " xor %%ebx, %%eax\n"
- " imul $0xC2B2AE35, %%eax\n"
- " mov %%eax, %%ebx\n"
- " shr $16, %%ebx\n"
- " xor %%ebx, %%eax\n"
- : "=a" (ret)
-
- : "a" (HASH_SEED),
- "i" (HASH_MASK1),
- "i" (HASH_MASK2),
- "S" (key),
- "c" (length)
- : "%ebx",
- "%edi"
- );
- return ret;
-}
+/* Some compilers expose this */
+#if defined(__has_feature)
+# if __has_feature(address_sanitizer)
+# define ASAN_DISABLE __attribute__((no_sanitize_address))
+# define HAS_ASAN_DISABLE
+# endif
#endif
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
-/*
- * Emulate MSVC _cpuid intrinsic for GCC/MinGW/Clang, this will be used
- * to determine if we should use the SSE route.
- */
-static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
- __asm__ __volatile__ (
- "cpuid"
- : "=a"(lanes[0]),
- "=b"(lanes[1]),
- "=c"(lanes[2]),
- "=d"(lanes[3])
-
- : "a" (entry)
- );
-}
-
-#endif /* !(defined(__GNUC__) && defined(__i386__) */
-
-static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
-/*
- * No host SSE instruction set assumed do runtime test instead. This
- * is for MinGW32 mostly which doesn't define SSE.
- */
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
- static bool memoize = false;
- static bool sse = false;
+/* If they don't try to find by version the attriubte was introduces */
+#if defined(__GNUC__) && __GNUC__ >= 4
+# define ASAN_DISABLE __attribute__((no_sanitize_address))
+# define HAS_ASAN_DISABLE
+#elif defined(__clang__) && __clang_major__ >= 3
+# define ASAN_DISABLE __attribute__((no_sanatize_address))
+# define HAS_ASAN_DISABLE
+/* On windows asan doesn't exist */
+#elif defined(_WIN32)
+# define ASAN_DISABLE /* nothing */
+# define HAS_ASAN_DISABLE
+#endif
- if (GMQCC_UNLIKELY(!memoize)) {
- /*
- * Only calculate SSE one time, thus it's unlikely that this branch
- * is taken more than once.
- */
- static int lanes[4];
- hash_cpuid(lanes, 0);
- /*
- * It's very likely that lanes[0] will contain a value unless it
- * isn't a modern x86.
- */
- if (GMQCC_LIKELY(*lanes >= 1))
- sse = (lanes[3] & ((int)1 << 25)) != 0;
- memoize = true;
- }
+#ifndef HAS_ASAN_DISABLE
+# include <string.h>
+#endif
- return (GMQCC_LIKELY(sse))
- ? hash_sse(key, length)
- : hash_native(key, length);
-/*
- * Same as above but this time host compiler was defined with SSE support.
- * This handles MinGW32 builds for i686+
- */
-#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__)
- return hash_sse(key, length);
+#ifndef NVALGRIND
+# include <valgrind/valgrind.h>
+# include <valgrind/memcheck.h>
#else
- /*
- * Go the native route which itself is highly optimized as well for
- * unaligned load/store when dealing with LE.
- */
- return hash_native(key, length);
+# define VALGRIND_MAKE_MEM_DEFINED(PTR, REDZONE_SIZE)
+# define VALGRIND_MAKE_MEM_NOACCESS(PTR, REDZONE_SIZE)
#endif
-}
-#define HASH_LEN_ALIGN (sizeof(size_t))
-#define HASH_LEN_ONES ((size_t)-1/UCHAR_MAX)
-#define HASH_LEN_HIGHS (HASH_LEN_ONES * (UCHAR_MAX / 2 + 1))
-#define HASH_LEN_HASZERO(X) (((X)-HASH_LEN_ONES) & ~(X) & HASH_LEN_HIGHS)
+#ifdef HAS_ASAN_DISABLE
+#define STRLEN_ALIGN (sizeof(size_t))
+#define STRLEN_ONES ((size_t)-1/UCHAR_MAX)
+#define STRLEN_HIGHS (STRLEN_ONES * (UCHAR_MAX/2+1))
+#define STRLEN_HASZERO(X) (((X)-STRLEN_ONES) & ~(X) & STRLEN_HIGHS)
-size_t hash(const char *key) {
- const char *s = key;
- const char *a = s;
+static ASAN_DISABLE size_t hash_strlen(const char *key) {
+ const char *s = key;
+ const char *a = s;
const size_t *w;
- /* Align for fast staging */
- for (; (uintptr_t)s % HASH_LEN_ALIGN; s++) {
- /* Quick stage if terminated before alignment */
- if (!*s)
- return hash_entry(key, s-a);
+ for (; (uintptr_t)s % STRLEN_ALIGN; s++) {
+ if (*s)
+ continue;
+ return s-a;
+ }
+
+ VALGRIND_MAKE_MEM_DEFINED(s, STRLEN_ALIGN);
+ for (w = (const void *)s; !STRLEN_HASZERO(*w); w++) {
+ /* Make the next word legal to access */
+ VALGRIND_MAKE_MEM_DEFINED(w + STRLEN_ALIGN, STRLEN_ALIGN);
}
- /*
- * Efficent staging of words for string length calculation, this is
- * faster than ifunc resolver of strlen call.
- *
- * On a x64 this becomes literally two masks, and a quick skip through
- * bytes along the string with the following masks:
- * movabs $0xFEFEFEFEFEFEFEFE,%r8
- * movabs $0x8080808080808080,%rsi
- */
- for (w = (const void *)s; !HASH_LEN_HASZERO(*w); w++);
for (s = (const void *)w; *s; s++);
- return hash_entry(key, s-a);
+ /* It's not legal to access this area anymore */
+ VALGRIND_MAKE_MEM_NOACCESS(s + 1, STRLEN_ALIGN);
+ return s-a;
}
+#else
+static GMQCC_INLINE size_t hash_strlen(const char *key) {
+ return strlen(key);
+}
+#endif
-#undef HASH_LEN_HASZERO
-#undef HASH_LEN_HIGHS
-#undef HASH_LEN_ONES
-#undef HASH_LEN_ALIGN
-#undef HASH_SEED
-#undef HASH_MASK2
-#undef HASH_MASK1
-#undef HASH_ROTL32
-#undef HASH_NATIVE_BLOCK
-#undef HASH_NATIVE_BYTES
-#undef HASH_NATIVE_SAFEREAD
+size_t hash(const char *key) {
+ return hash_murmur((const void *)key, hash_strlen(key));
+}