X-Git-Url: https://git.xonotic.org/?p=xonotic%2Fgmqcc.git;a=blobdiff_plain;f=hash.c;h=bc65b7f6821b2e33f0cd6434d2cef69dbb4e1da9;hp=276817e2134573c2425ddbc744901992e297f4e4;hb=337d7ddbf4c330087e9923b580c757ec0f33fcb1;hpb=31e13e6e6469dbc460f18a38e31495b0f39bfcec

diff --git a/hash.c b/hash.c
index 276817e..bc65b7f 100644
--- a/hash.c
+++ b/hash.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012, 2013
+ * Copyright (C) 2012, 2013, 2014
  *     Dale Weiler
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -23,9 +23,21 @@
 #include "gmqcc.h"
 #include <limits.h>

+#if defined(_MSC_VER)
+#   define HASH_ROTL32(X, Y) _rotl((X), (Y))
+#elif defined (__GNUC__) && (defined(__i386__) || defined(__amd64__))
+    static GMQCC_FORCEINLINE uint32_t hash_rotl32(volatile uint32_t x, int8_t r) {
+        __asm__ __volatile__ ("roll %1,%0" : "+r"(x) : "c"(r));
+        return x;
+    }
+#   define HASH_ROTL32(X, Y) hash_rotl32((volatile uint32_t)(X), (Y))
+#else
+#   define HASH_ROTL32(X, Y) (((X) << (Y)) | ((X) >> (32 - (Y))))
+#endif
+
 /*
  * This is a version of the Murmur3 hashing function optimized for various
- * compilers/architectures. It uses the traditional Murmur2 mix stagin
+ * compilers/architectures. It uses the traditional Murmur2 mix staging
  * but fixes the mix staging inner loops.
  *
  * Murmur 2 contains an inner loop such as:
@@ -67,28 +79,7 @@
  * This means we have a 14.5% higher chance of collision. This is where
  * Murmur3 comes in to save the day.
  */
-
-/*
- * Some rotation tricks:
- *  MSVC one shaves off six instructions, where GCC optimized one for
- *  x86 and amd64 shaves off four instructions. Native methods are often
- *  optimized rather well at -O3, but not at -O2.
- */
-#if defined(_MSC_VER)
-#   define HASH_ROTL32(X, Y) _rotl((X), (Y))
-#else
-static GMQCC_FORCEINLINE uint32_t hash_rotl32(volatile uint32_t x, int8_t r) {
-#if defined (__GNUC__) && (defined(__i386__) || defined(__amd64__))
-    __asm__ __volatile__ ("roll %1,%0" : "+r"(x) : "c"(r));
-    return x;
-#else /* ! (defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))) */
-    return (x << r) | (x >> (32 - r));
-#endif
-}
-#   define HASH_ROTL32(X, Y) hash_rotl32((volatile uint32_t)(X), (Y))
-#endif /* !(_MSC_VER) */
-
-static GMQCC_FORCEINLINE uint32_t hash_mix32(uint32_t hash) {
+static GMQCC_FORCEINLINE uint32_t hash_murmur_mix32(uint32_t hash) {
     hash ^= hash >> 16;
     hash *= 0x85EBCA6B;
     hash ^= hash >> 13;
@@ -102,33 +93,32 @@ static GMQCC_FORCEINLINE uint32_t hash_mix32(uint32_t hash) {
  * case senario for Murmur3:
  *  http://code.google.com/p/smhasher/
  */
-#define HASH_MASK1 0xCC9E2D51
-#define HASH_MASK2 0x1B873593
-#define HASH_SEED  0x9747B28C
+#define HASH_MURMUR_MASK1 0xCC9E2D51
+#define HASH_MURMUR_MASK2 0x1B873593
+#define HASH_MURMUR_SEED  0x9747B28C

 #if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
-#   define HASH_NATIVE_SAFEREAD(PTR) (*((uint32_t*)(PTR)))
+#   define HASH_MURMUR_SAFEREAD(PTR) (*((uint32_t*)(PTR)))
 #elif PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_BIG
 #   if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR >= 3))
-#       define HASH_NATIVE_SAFEREAD(PTR) (__builtin_bswap32(*((uint32_t*)(PTR))))
+#       define HASH_MURMUR_SAFEREAD(PTR) (__builtin_bswap32(*((uint32_t*)(PTR))))
 #   endif
 #endif
 /* Process individual bytes at this point since the endianess isn't known. */
-#ifndef HASH_NATIVE_SAFEREAD
-#   define HASH_NATIVE_SAFEREAD(PTR) ((PTR)[0] | (PTR)[1] << 8 | (PTR)[2] << 16 | (PTR)[3] << 24)
+#ifndef HASH_MURMUR_SAFEREAD
+#   define HASH_MURMUR_SAFEREAD(PTR) ((PTR)[0] | (PTR)[1] << 8 | (PTR)[2] << 16 | (PTR)[3] << 24)
 #endif

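The three HASH_MURMUR_SAFEREAD variants above differ only in how a 32-bit block is fetched: a raw load on little-endian hosts, a byte-swapped load on big-endian GCC, and a byte-by-byte assembly everywhere else. That equivalence can be sanity-checked in isolation with a small standalone program like the following sketch (illustrative only, not part of the patch; saferead_fallback is a made-up name for the byte-wise reader):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mirrors the byte-wise fallback HASH_MURMUR_SAFEREAD: assembles the word
 * least-significant byte first, regardless of host byte order. */
static uint32_t saferead_fallback(const uint8_t *p) {
    return (uint32_t)p[0] | (uint32_t)p[1] << 8 | (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

int main(void) {
    const uint8_t bytes[4] = { 0x11, 0x22, 0x33, 0x44 };
    uint32_t direct;
    memcpy(&direct, bytes, sizeof direct); /* native-endian 32-bit load */
    printf("fallback %08" PRIx32 ", direct %08" PRIx32 "\n",
           saferead_fallback(bytes), direct);
    /* Both print 44332211 on a little-endian host; on a big-endian host the
     * direct load differs, which is why the bswap and byte-wise paths exist. */
    return 0;
}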
-#define HASH_NATIVE_BLOCK(H, K)                        \
+#define HASH_MURMUR_BLOCK(H, K)                        \
     do {                                               \
-        K *= HASH_MASK1;                               \
-        K  = HASH_ROTL32(K, 15);                       \
-        K *= HASH_MASK2;                               \
+        K *= HASH_MURMUR_MASK1;                        \
+        K  = HASH_ROTL32(K, 15);                       \
+        K *= HASH_MURMUR_MASK2;                        \
         H ^= K;                                        \
         H  = HASH_ROTL32(H, 13);                       \
         H  = H * 5 + 0xE6546B64;                       \
     } while (0)
-
-#define HASH_NATIVE_BYTES(COUNT, H, C, N, PTR, LENGTH) \
+#define HASH_MURMUR_BYTES(COUNT, H, C, N, PTR, LENGTH) \
     do {                                               \
         int i = COUNT;                                 \
         while (i--) {                                  \
@@ -136,316 +126,123 @@ static GMQCC_FORCEINLINE uint32_t hash_mix32(uint32_t hash) {
             N++;                                       \
             LENGTH--;                                  \
             if (N == 4) {                              \
-                HASH_NATIVE_BLOCK(H, C);               \
+                HASH_MURMUR_BLOCK(H, C);               \
                 N = 0;                                 \
             }                                          \
         }                                              \
     } while (0)
+#define HASH_MURMUR_TAIL(P, Z, H, C, N, PTR, LEN) \
+    do {                                          \
+        LEN -= LEN/4*4;                           \
+        HASH_MURMUR_BYTES(LEN, H, C, N, PTR, LEN);\
+        *P = H;                                   \
+        *Z = ((C) & ~0xFF) | (N);                 \
+    } while (0)

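Taken together, HASH_MURMUR_BYTES feeds stray input bytes into the carry word and fires HASH_MURMUR_BLOCK once four of them have accumulated, while HASH_MURMUR_TAIL flushes whatever is left over and repacks the carry. Written out as a plain function, the round that HASH_MURMUR_BLOCK performs on each 32-bit word is simply (readability sketch, not part of the patch):

#include <stdint.h>

/* One Murmur3 block round: k is the next 32-bit word of input, h is the
 * running hash state; the constants are HASH_MURMUR_MASK1/MASK2 from above. */
static uint32_t murmur_block_round(uint32_t h, uint32_t k) {
    k *= 0xCC9E2D51u;
    k  = (k << 15) | (k >> 17);    /* HASH_ROTL32(k, 15) */
    k *= 0x1B873593u;
    h ^= k;
    h  = (h << 13) | (h >> 19);    /* HASH_ROTL32(h, 13) */
    return h * 5 + 0xE6546B64u;
}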
-/*
- * Highly unrolled at per-carry bit granularity instead of per-block granularity. This will achieve the
- * highest possible instruction level parallelism.
- */
-static GMQCC_FORCEINLINE void hash_native_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
+#if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
+static GMQCC_FORCEINLINE void hash_murmur_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
     uint32_t h1 = *ph1;
     uint32_t c  = *carry;

     const uint8_t *ptr = (uint8_t*)key;
     const uint8_t *end;

-    /* carry count from low 2 bits of carry value */
-    int n  = c & 3;
-
-    /*
-     * Unaligned word accesses are safe in LE. Thus we can obtain a little
-     * more speed.
-     */
-#   if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
-    /* Consume carry bits */
+    int n  = c & 3;
     int it = (4 - n) & 3;
     if (it && it <= length)
-        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);
+        HASH_MURMUR_BYTES(it, h1, c, n, ptr, length);

-    /* word size chunk consumption */
     end = ptr + length/4*4;
     for (; ptr < end; ptr += 4) {
-        uint32_t k1 = HASH_NATIVE_SAFEREAD(ptr);
-        HASH_NATIVE_BLOCK(h1, k1);
+        uint32_t k1 = HASH_MURMUR_SAFEREAD(ptr);
+        HASH_MURMUR_BLOCK(h1, k1);
     }
-#   else
-    /*
-     * Unsafe to assume unaligned word accesses. Thus we'll need to consume
-     * to alignment then process in aligned block chunks.
-     */
+    HASH_MURMUR_TAIL(ph1, carry, h1, c, n, ptr, length);
+}
+#else
+static GMQCC_FORCEINLINE void hash_murmur_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
     uint32_t k1;
+    uint32_t h1 = *ph1;
+    uint32_t c  = *carry;
+
+    const uint8_t *ptr = (uint8_t*)key;
+    const uint8_t *end;
+
+    int n  = c & 3;
     int it = -(long)ptr & 3;
     if (it && it <= length)
-        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);
+        HASH_MURMUR_BYTES(it, h1, c, n, ptr, length);

-    /*
-     * Alignment has been reached, deal with aligned blocks, specializing for
-     * all possible carry counts.
-     */
     end = ptr + length / 4 * 4;
     switch (n) {
         case 0:
             for (; ptr < end; ptr += 4) {
-                k1 = HASH_NATIVE_SAFEREAD(ptr);
-                HASH_NATIVE_BLOCK(h1, k1);
+                k1 = HASH_MURMUR_SAFEREAD(ptr);
+                HASH_MURMUR_BLOCK(h1, k1);
             }
             break;
-
-        case 1:
-            for (; ptr < end; ptr += 4) {
-                k1  = c >> 24;
-                c   = HASH_NATIVE_SAFEREAD(ptr);
-                k1 |= c << 8;
-                HASH_NATIVE_BLOCK(h1, k1);
-            }
-            break;
-
-        case 2:
-            for (; ptr < end; ptr += 4) {
-                k1  = c >> 16;
-                c   = HASH_NATIVE_SAFEREAD(ptr);
-                k1 |= c << 16;
-                HASH_NATIVE_BLOCK(h1, k1);
-            }
-            break;
-
-        case 3:
-            for (; ptr < end; ptr += 4) {
-                k1  = c >> 8;
-                c   = HASH_NATIVE_SAFEREAD(ptr);
-                k1 |= c << 24;
-                HASH_NATIVE_BLOCK(h1, k1);
-            }
-            break;
+#       define NEXT(N, RIGHT, LEFT)                  \
+            case N:                                  \
+                for (; ptr < end; ptr += 4) {        \
+                    k1  = c >> RIGHT;                \
+                    c   = HASH_MURMUR_SAFEREAD(ptr); \
+                    k1 |= c << LEFT;                 \
+                    HASH_MURMUR_BLOCK(h1, k1);       \
+                }                                    \
+                break
+        NEXT(1, 24, 8);
+        NEXT(2, 16, 16);
+        NEXT(3, 8,  24);
+        #undef NEXT
     }
-#endif /* misaligned reads */
-
-    /*
-     * Advanced over 32-bit chunks, this can possibly leave 1..3 bytes of
-     * additional trailing content to process.
-     */
-    length -= length/4*4;
-
-    HASH_NATIVE_BYTES(length, h1, c, n, ptr, length);
-
-    *ph1   = h1;
-    *carry = (c & ~0xFF) | n;
+    HASH_MURMUR_TAIL(ph1, carry, h1, c, n, ptr, length);
 }
+#endif

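Both variants leave up to three unconsumed bytes, plus a 2-bit count, packed in *carry, so the same key can be fed to hash_murmur_process in several pieces before hash_murmur_result performs the final mix. A sketch of that chunked use (illustrative only; hash_two_chunks is a hypothetical helper and assumes it sits inside hash.c where the static helpers and HASH_MURMUR_SEED are visible):

/* Illustrative, not part of the patch: hashing a key in two pieces is
 * expected to match hashing it in one pass, thanks to the carry word. */
static uint32_t hash_two_chunks(const void *a, size_t alen,
                                const void *b, size_t blen) {
    uint32_t h = HASH_MURMUR_SEED;
    uint32_t c = 0;
    hash_murmur_process(&h, &c, a, (int)alen);
    hash_murmur_process(&h, &c, b, (int)blen);
    /* The finalizer needs the total number of bytes fed in. */
    return hash_murmur_result(h, c, alen + blen);
}
/* e.g. hash_two_chunks("gm", 2, "qcc", 3) should match a single five-byte pass. */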
- * - * Tested with -m32 on a Phenom II X4 with: - * gcc version 4.8.1 20130725 (prerelease) (GCC) - */ -#if defined(__GNUC__) && defined(__i386__) -static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) { - uint32_t ret; - __asm__ __volatile__ ( - " mov %%eax, %%ebx\n" - " mov %2, %%eax\n" - " movd %%eax, %%xmm7\n" - " shufps $0, %%xmm7, %%xmm7\n" - " mov %3, %%eax\n" - " movd %%eax, %%xmm6\n" - " shufps $0, %%xmm6, %%xmm6\n" - " lea (%%esi, %%ecx, 1), %%edi\n" - " jmp 2f\n" - "1:\n" - " movaps (%%esi), %%xmm0\n" - " pmulld %%xmm7, %%xmm0\n" - " movaps %%xmm0, %%xmm2\n" - " pslld $15, %%xmm0\n" - " psrld $17, %%xmm2\n" - " orps %%xmm2, %%xmm0\n" - " pmulld %%xmm6, %%xmm0\n" - " movd %%xmm0, %%eax\n" - " xor %%eax, %%ebx\n" - " rol $13, %%ebx\n" - " imul $5, %%ebx\n" - " add $0xE6546B64, %%ebx\n" - " shufps $0x39, %%xmm0, %%xmm0\n" - " movd %%xmm0, %%eax\n" - " xor %%eax, %%ebx\n" - " rol $13, %%ebx\n" - " imul $5, %%ebx\n" - " add $0xE6546B64, %%ebx\n" - " shufps $0x39, %%xmm0, %%xmm0\n" - " movd %%xmm0, %%eax\n" - " xor %%eax, %%ebx\n" - " rol $13, %%ebx\n" - " imul $5, %%ebx\n" - " add $0xE6546B64, %%ebx\n" - " shufps $0x39, %%xmm0, %%xmm0\n" - " movd %%xmm0, %%eax\n" - " xor %%eax, %%ebx\n" - " rol $13, %%ebx\n" - " imul $5, %%ebx\n" - " add $0xE6546B64, %%ebx\n" - " add $16, %%esi\n" - "2:\n" - " cmp %%esi, %%edi\n" - " jne 1b\n" - " xor %%ecx, %%ebx\n" - " mov %%ebx, %%eax\n" - " shr $16, %%ebx\n" - " xor %%ebx, %%eax\n" - " imul $0x85EBCA6b, %%eax\n" - " mov %%eax, %%ebx\n" - " shr $13, %%ebx\n" - " xor %%ebx, %%eax\n" - " imul $0xC2B2AE35, %%eax\n" - " mov %%eax, %%ebx\n" - " shr $16, %%ebx\n" - " xor %%ebx, %%eax\n" - : "=a" (ret) - - : "a" (HASH_SEED), - "i" (HASH_MASK1), - "i" (HASH_MASK2), - "S" (key), - "c" (length) - - : "%ebx", - "%edi" - ); - return ret; -} -#endif - -#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__) -/* - * Emulate MSVC _cpuid intrinsic for GCC/MinGW/Clang, this will be used - * to determine if we should use the SSE route. - */ -static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) { - __asm__ __volatile__ ( - "cpuid" - : "=a"(lanes[0]), - "=b"(lanes[1]), - "=c"(lanes[2]), - "=d"(lanes[3]) - - : "a" (entry) - ); -} - -#endif /* !(defined(__GNUC__) && defined(__i386__) */ - -static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) { -/* - * No host SSE instruction set assumed do runtime test instead. This - * is for MinGW32 mostly which doesn't define SSE. - */ -#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__) - static bool memoize = false; - static bool sse = false; - - if (GMQCC_UNLIKELY(!memoize)) { - /* - * Only calculate SSE one time, thus it's unlikely that this branch - * is taken more than once. - */ - static int lanes[4]; - hash_cpuid(lanes, 0); - /* - * It's very likely that lanes[0] will contain a value unless it - * isn't a modern x86. - */ - if (GMQCC_LIKELY(*lanes >= 1)) - sse = (lanes[3] & ((int)1 << 25)) != 0; - memoize = true; - } - - return (GMQCC_LIKELY(sse)) - ? hash_sse(key, length) - : hash_native(key, length); -/* - * Same as above but this time host compiler was defined with SSE support. - * This handles MinGW32 builds for i686+ - */ -#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__) - return hash_sse(key, length); -#else - /* - * Go the native route which itself is highly optimized as well for - * unaligned load/store when dealing with LE. 
- */ - return hash_native(key, length); -#endif -} - -#define HASH_LEN_ALIGN (sizeof(size_t)) -#define HASH_LEN_ONES ((size_t)-1/UCHAR_MAX) -#define HASH_LEN_HIGHS (HASH_LEN_ONES * (UCHAR_MAX / 2 + 1)) -#define HASH_LEN_HASZERO(X) (((X)-HASH_LEN_ONES) & ~(X) & HASH_LEN_HIGHS) - size_t hash(const char *key) { const char *s = key; const char *a = s; const size_t *w; - /* Align for fast staging */ - for (; (uintptr_t)s % HASH_LEN_ALIGN; s++) { - /* Quick stage if terminated before alignment */ + for (; (uintptr_t)s % sizeof(size_t); s++) if (!*s) - return hash_entry(key, s-a); - } + return hash_murmur((const void *)key, s-a); - /* - * Efficent staging of words for string length calculation, this is - * faster than ifunc resolver of strlen call. - * - * On a x64 this becomes literally two masks, and a quick skip through - * bytes along the string with the following masks: - * movabs $0xFEFEFEFEFEFEFEFE,%r8 - * movabs $0x8080808080808080,%rsi - */ - for (w = (const void *)s; !HASH_LEN_HASZERO(*w); w++); - for (s = (const void *)w; *s; s++); + for (w = (const size_t*)s; !((*w-(size_t)-1/UCHAR_MAX) & ~*w & ((size_t)-1/UCHAR_MAX) * (UCHAR_MAX / 2 + 1)); w++); + for (s = (const char *)w; *s; s++); - return hash_entry(key, s-a); + return hash_murmur((const void *)key, s-a); } -#undef HASH_LEN_HASZERO -#undef HASH_LEN_HIGHS -#undef HASH_LEN_ONES -#undef HASH_LEN_ALIGN -#undef HASH_SEED -#undef HASH_MASK2 -#undef HASH_MASK1 #undef HASH_ROTL32 -#undef HASH_NATIVE_BLOCK -#undef HASH_NATIVE_BYTES -#undef HASH_NATIVE_SAFEREAD +#undef HASH_MURMUR_MASK1 +#undef HASH_MURMUR_MASK2 +#undef HASH_MURMUR_SEED +#undef HASH_MURMUR_SAFEREAD +#undef HASH_MURMUR_BLOCK +#undef HASH_MURMUR_BYTES +#undef HASH_MURMUR_TAIL