From 4d4851e17903a5e8b66a2eef027354594e391559 Mon Sep 17 00:00:00 2001
From: Dale Weiler
Date: Sat, 14 Dec 2013 01:23:39 -0500
Subject: [PATCH] Faster hashing reaching 16 GB/s on Phenom II X4.

---
 BSDmakefile |   1 +
 Makefile    |   1 +
 gmqcc.h     |   1 +
 hash.c      | 460 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include.mk  |   2 +-
 stat.c      | 134 +--------------
 6 files changed, 467 insertions(+), 132 deletions(-)
 create mode 100644 hash.c
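As a standalone illustration of the Murmur2 weakness that the new hash.c header comment below describes, the following sketch (an editor's aside, not part of the patch) re-implements the unrolled absorption of two identical 32-bit words and pokes at it. The helper name is hypothetical; the starting value 0x1EF0 mirrors the seed of the old util_hthash and is otherwise arbitrary, and the census loop is only meant to show how a figure like the quoted 172,013,942 can be measured, not to reproduce it exactly.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Murmur2-style absorption of two identical 32-bit words x into h, exactly
 * as unrolled in the hash.c comment.  m and r are parameters so the
 * degenerate m == 1 case can be exercised directly. */
static uint32_t absorb_twice(uint32_t h, uint32_t x, uint32_t m, int r) {
    x *= m; x ^= x >> r; x *= m;   /* premix */
    h *= m; h ^= x;                /* first word              */
    h *= m; h ^= x;                /* second (identical) word */
    return h;
}

int main(void) {
    /* With m == 1 the premix is a plain xorshift and the two h ^= x steps
     * cancel: every repeated-word key leaves h completely unchanged. */
    assert(absorb_twice(0x1EF0u, 0x11111111u, 1u, 24) ==
           absorb_twice(0x1EF0u, 0xDEADBEEFu, 1u, 24));

    /* With the real constants (m = 0x5BD1E995, r = 24) the map is merely
     * lossy; counting its distinct outputs over all 2^32 inputs (512 MiB
     * bitmap, long-running) is how a figure like 172,013,942 is measured.
     * The exact count depends on the starting h. */
    {
        uint8_t *seen   = calloc(1u << 29, 1);
        uint64_t unique = 0;
        uint32_t x      = 0;
        if (!seen)
            return 1;
        do {
            uint32_t y = absorb_twice(0x1EF0u, x, 0x5BD1E995u, 24);
            if (!(seen[y >> 3] & (1u << (y & 7)))) {
                seen[y >> 3] |= (uint8_t)(1u << (y & 7));
                unique++;
            }
        } while (++x != 0);
        printf("distinct absorptions: %llu / 4294967296\n",
               (unsigned long long)unique);
        free(seen);
    }
    return 0;
}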
diff --git a/BSDmakefile b/BSDmakefile
index 7ebc66e..812df35 100644
--- a/BSDmakefile
+++ b/BSDmakefile
@@ -115,6 +115,7 @@ exec.o: gmqcc.h opts.def
 fold.o: ast.h ir.h gmqcc.h opts.def parser.h lexer.h
 fs.o: gmqcc.h opts.def platform.h
 ftepp.o: gmqcc.h opts.def lexer.h
+hash.o: gmqcc.h opts.def
 intrin.o: parser.h gmqcc.h opts.def lexer.h ast.h ir.h
 ir.o: gmqcc.h opts.def ir.h
 lexer.o: gmqcc.h opts.def lexer.h
diff --git a/Makefile b/Makefile
index 1215107..97f16c9 100644
--- a/Makefile
+++ b/Makefile
@@ -143,6 +143,7 @@ install-doc:
 
 ansi.o: platform.h gmqcc.h opts.def
 util.o: gmqcc.h opts.def platform.h
+hash.o: gmqcc.h opts.def
 stat.o: gmqcc.h opts.def
 fs.o: gmqcc.h opts.def platform.h
 opts.o: gmqcc.h opts.def
diff --git a/gmqcc.h b/gmqcc.h
index 76289da..9317822 100644
--- a/gmqcc.h
+++ b/gmqcc.h
@@ -239,6 +239,7 @@ const char *util_ctime (const time_t *timer);
 
 typedef struct fs_file_s fs_file_t;
 bool util_isatty(fs_file_t *);
+size_t hash(const char *key);
 
 /*
  * A flexible vector implementation: all vector pointers contain some
diff --git a/hash.c b/hash.c
new file mode 100644
index 0000000..c71ba71
--- /dev/null
+++ b/hash.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2012, 2013
+ *     Dale Weiler
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gmqcc.h"
+#include <limits.h>
+
+/*
+ * This is a version of the Murmur3 hashing function optimized for various
+ * compilers/architectures. It uses the traditional Murmur2 mix staging
+ * but fixes the mix staging inner loops.
+ *
+ * Murmur 2 contains an inner loop such as:
+ * while (l >= 4) {
+ *      u32 k = *(u32*)d;
+ *      k *= m;
+ *      k ^= k >> r;
+ *      k *= m;
+ *
+ *      h *= m;
+ *      h ^= k;
+ *      d += 4;
+ *      l -= 4;
+ * }
+ *
+ * When the two u32s pulled from the data are the same value x, this premix
+ * stage produces the same result for both of them. Unrolled, this produces
+ * just:
+ *  x *= m;
+ *  x ^= x >> r;
+ *  x *= m;
+ *
+ *  h *= m;
+ *  h ^= x;
+ *  h *= m;
+ *  h ^= x;
+ *
+ * This appears to be fine, except what happens when m == 1? Well, x
+ * cancels out entirely, leaving just:
+ *  x ^= x >> r;
+ *  h ^= x;
+ *  h ^= x;
+ *
+ * So all keys hash to the same value. But how often does m == 1?
+ * Well, it turns out that testing x for all possible values yields only
+ * 172,013,942 unique results instead of 2^32, so roughly 4.6 bits are
+ * cancelled out on average!
+ *
+ * This means we have a 14.5% higher chance of collision. This is where
+ * Murmur3 comes in to save the day.
+ */
+
+#ifdef __GNUC__
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
+/*
+ * Some rotation tricks: the MSVC intrinsic shaves off six instructions and
+ * the GCC inline-assembly version for x86 and amd64 shaves off four. The
+ * portable fallback is usually optimized rather well at -O3, but not at -O2.
+ */
+#if defined(_MSC_VER)
+# define HASH_ROTL32(X, Y) _rotl((X), (Y))
+#else
+static GMQCC_FORCEINLINE uint32_t hash_rotl32(volatile uint32_t x, int8_t r) {
+#if defined (__GNUC__) && (defined(__i386__) || defined(__amd64__))
+    __asm__ __volatile__ ("roll %1,%0" : "+r"(x) : "c"(r));
+    return x;
+#else /* ! (defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))) */
+    return (x << r) | (x >> (32 - r));
+#endif
+}
+# define HASH_ROTL32(X, Y) hash_rotl32((volatile uint32_t)(X), (Y))
+#endif /* !(_MSC_VER) */
+
+static GMQCC_FORCEINLINE uint32_t hash_mix32(uint32_t hash) {
+    hash ^= hash >> 16;
+    hash *= 0x85EBCA6B;
+    hash ^= hash >> 13;
+    hash *= 0xC2B2AE35;
+    hash ^= hash >> 16;
+    return hash;
+}
+
+/*
+ * These constants were calculated with SMHasher to determine the
+ * best-case scenario for Murmur3:
+ *  http://code.google.com/p/smhasher/
+ */
+#define HASH_MASK1 0xCC9E2D51
+#define HASH_MASK2 0x1B873593
+#define HASH_SEED  0x9747B28C
+
+#if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
+# define HASH_NATIVE_SAFEREAD(PTR) (*((uint32_t*)(PTR)))
+#elif PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_BIG
+# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+#  define HASH_NATIVE_SAFEREAD(PTR) (__builtin_bswap32(*((uint32_t*)(PTR))))
+# endif
+#endif
+/* Process individual bytes at this point since the endianness isn't known. */
+#ifndef HASH_NATIVE_SAFEREAD
+# define HASH_NATIVE_SAFEREAD(PTR) ((PTR)[0] | (PTR)[1] << 8 | (PTR)[2] << 16 | (PTR)[3] << 24)
+#endif
+
+#define HASH_NATIVE_BLOCK(H, K)                        \
+    do {                                               \
+        K *= HASH_MASK1;                               \
+        K  = HASH_ROTL32(K, 15);                       \
+        K *= HASH_MASK2;                               \
+        H ^= K;                                        \
+        H  = HASH_ROTL32(H, 13);                       \
+        H  = H * 5 + 0xE6546B64;                       \
+    } while (0)
+
+#define HASH_NATIVE_BYTES(COUNT, H, C, N, PTR, LENGTH) \
+    do {                                               \
+        int i = COUNT;                                 \
+        while (i--) {                                  \
+            C = C >> 8 | *PTR++ << 24;                 \
+            N++;                                       \
+            LENGTH--;                                  \
+            if (N == 4) {                              \
+                HASH_NATIVE_BLOCK(H, C);               \
+                N = 0;                                 \
+            }                                          \
+        }                                              \
+    } while (0)
+
+/*
+ * Highly unrolled at per-carry-bit granularity instead of per-block
+ * granularity to achieve the highest possible instruction-level parallelism.
+ */
+static GMQCC_FORCEINLINE void hash_native_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
+    uint32_t h1 = *ph1;
+    uint32_t c  = *carry;
+
+    const uint8_t *ptr = (uint8_t*)key;
+    const uint8_t *end;
+
+    /* carry count from low 2 bits of carry value */
+    int n = c & 3;
+
+    /*
+     * Unaligned word accesses are safe in LE, thus we can obtain a little
+     * more speed.
+     */
+# if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
+    /* Consume carry bits */
+    int it = (4 - n) & 3;
+    if (it && it <= length)
+        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);
+
+    /* word size chunk consumption */
+    end = ptr + length/4*4;
+    for (; ptr < end; ptr += 4) {
+        uint32_t k1 = HASH_NATIVE_SAFEREAD(ptr);
+        HASH_NATIVE_BLOCK(h1, k1);
+    }
+# else
+    /*
+     * Unsafe to assume unaligned word accesses. Thus we'll need to consume
+     * to alignment then process in aligned block chunks.
+     */
+    uint32_t k1;
+    int it = -(long)ptr & 3;
+    if (it && it <= length)
+        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);
+
+    /*
+     * Alignment has been reached, deal with aligned blocks, specializing for
+     * all possible carry counts.
+     */
+    end = ptr + length / 4 * 4;
+    switch (n) {
+        case 0:
+            for (; ptr < end; ptr += 4) {
+                k1 = HASH_NATIVE_SAFEREAD(ptr);
+                HASH_NATIVE_BLOCK(h1, k1);
+            }
+            break;
+
+        case 1:
+            for (; ptr < end; ptr += 4) {
+                k1  = c >> 24;
+                c   = HASH_NATIVE_SAFEREAD(ptr);
+                k1 |= c << 8;
+                HASH_NATIVE_BLOCK(h1, k1);
+            }
+            break;
+
+        case 2:
+            for (; ptr < end; ptr += 4) {
+                k1  = c >> 16;
+                c   = HASH_NATIVE_SAFEREAD(ptr);
+                k1 |= c << 16;
+                HASH_NATIVE_BLOCK(h1, k1);
+            }
+            break;
+
+        case 3:
+            for (; ptr < end; ptr += 4) {
+                k1  = c >> 8;
+                c   = HASH_NATIVE_SAFEREAD(ptr);
+                k1 |= c << 24;
+                HASH_NATIVE_BLOCK(h1, k1);
+            }
+            break;
+    }
+#endif /* misaligned reads */
+
+    /*
+     * Advancing over the 32-bit chunks can leave 1..3 bytes of additional
+     * trailing content to process.
+     */
+    length -= length/4*4;
+
+    HASH_NATIVE_BYTES(length, h1, c, n, ptr, length);
+
+    *ph1   = h1;
+    *carry = (c & ~0xFF) | n;
+}
+
+static GMQCC_FORCEINLINE uint32_t hash_native_result(uint32_t hash, uint32_t carry, size_t length) {
+    uint32_t k1;
+    int n = carry & 3;
+    if (GMQCC_LIKELY(n)) {
+        k1    = carry >> (4 - n) * 8;
+        k1   *= HASH_MASK1;
+        k1    = HASH_ROTL32(k1, 15);
+        k1   *= HASH_MASK2;
+        hash ^= k1;
+    }
+    hash ^= length;
+    hash  = hash_mix32(hash);
+
+    return hash;
+}
+
+static GMQCC_FORCEINLINE uint32_t hash_native(const void *GMQCC_RESTRICT key, size_t length) {
+    uint32_t hash  = HASH_SEED;
+    uint32_t carry = 0;
+
+    /* Separate calls for the inliner to deal with */
+    hash_native_process(&hash, &carry, key, length);
+    return hash_native_result(hash, carry, length);
+}
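The carry splitting above is an optimization detail; the block, tail, and finalizer steps appear to follow the standard MurmurHash3 x86_32 layout with HASH_SEED as the seed, so a plain portable version is handy as a test oracle for hash_native. The sketch below is an editor's illustration rather than part of the patch; it assumes a little-endian host (so its memcpy block loads agree with HASH_NATIVE_SAFEREAD) and uses a hypothetical name.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Portable MurmurHash3 x86_32 with the patch's seed and constants. */
static uint32_t murmur3_x86_32_ref(const void *key, size_t len) {
    const uint8_t *data    = (const uint8_t *)key;
    const size_t   nblocks = len / 4;
    const uint32_t c1      = 0xCC9E2D51u;  /* HASH_MASK1 */
    const uint32_t c2      = 0x1B873593u;  /* HASH_MASK2 */
    uint32_t       h       = 0x9747B28Cu;  /* HASH_SEED  */
    size_t         i;

    for (i = 0; i < nblocks; i++) {
        uint32_t k;
        memcpy(&k, data + i * 4, 4);       /* little-endian host assumed */
        k *= c1;
        k  = (k << 15) | (k >> 17);
        k *= c2;
        h ^= k;
        h  = (h << 13) | (h >> 19);
        h  = h * 5 + 0xE6546B64u;
    }

    {   /* trailing 1..3 bytes */
        const uint8_t *tail = data + nblocks * 4;
        uint32_t k = 0;
        switch (len & 3) {
            case 3: k ^= (uint32_t)tail[2] << 16; /* fall through */
            case 2: k ^= (uint32_t)tail[1] << 8;  /* fall through */
            case 1: k ^= tail[0];
                    k *= c1;
                    k  = (k << 15) | (k >> 17);
                    k *= c2;
                    h ^= k;
        }
    }

    h ^= (uint32_t)len;                     /* finalizer, as in hash_mix32 */
    h ^= h >> 16;
    h *= 0x85EBCA6Bu;
    h ^= h >> 13;
    h *= 0xC2B2AE35u;
    h ^= h >> 16;
    return h;
}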
+
+/*
+ * An inline-assembly SSE version for when SSE is known to be present,
+ * either via CPUID at runtime or because the host compiler defines
+ * __SSE__. This is about 16 cycles faster than the native version with
+ * GCC at -O2, and about 11 cycles faster at -O3.
+ *
+ * Tested with -m32 on a Phenom II X4 with:
+ *  gcc version 4.8.1 20130725 (prerelease) (GCC)
+ */
+#if defined(__GNUC__) && defined(__i386__)
+static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) {
+    uint32_t ret;
+    __asm__ __volatile__ (
+        "   mov %%eax, %%ebx\n"
+        "   mov %2, %%eax\n"
+        "   movd %%eax, %%xmm7\n"
+        "   shufps $0, %%xmm7, %%xmm7\n"
+        "   mov %3, %%eax\n"
+        "   movd %%eax, %%xmm6\n"
+        "   shufps $0, %%xmm6, %%xmm6\n"
+        "   lea (%%esi, %%ecx, 1), %%edi\n"
+        "   jmp 2f\n"
+        "1:\n"
+        "   movaps (%%esi), %%xmm0\n"
+        "   pmulld %%xmm7, %%xmm0\n"
+        "   movaps %%xmm0, %%xmm2\n"
+        "   pslld $15, %%xmm0\n"
+        "   psrld $17, %%xmm2\n"
+        "   orps %%xmm2, %%xmm0\n"
+        "   pmulld %%xmm6, %%xmm0\n"
+        "   movd %%xmm0, %%eax\n"
+        "   xor %%eax, %%ebx\n"
+        "   rol $13, %%ebx\n"
+        "   imul $5, %%ebx\n"
+        "   add $0xE6546B64, %%ebx\n"
+        "   shufps $0x39, %%xmm0, %%xmm0\n"
+        "   movd %%xmm0, %%eax\n"
+        "   xor %%eax, %%ebx\n"
+        "   rol $13, %%ebx\n"
+        "   imul $5, %%ebx\n"
+        "   add $0xE6546B64, %%ebx\n"
+        "   shufps $0x39, %%xmm0, %%xmm0\n"
+        "   movd %%xmm0, %%eax\n"
+        "   xor %%eax, %%ebx\n"
+        "   rol $13, %%ebx\n"
+        "   imul $5, %%ebx\n"
+        "   add $0xE6546B64, %%ebx\n"
+        "   shufps $0x39, %%xmm0, %%xmm0\n"
+        "   movd %%xmm0, %%eax\n"
+        "   xor %%eax, %%ebx\n"
+        "   rol $13, %%ebx\n"
+        "   imul $5, %%ebx\n"
+        "   add $0xE6546B64, %%ebx\n"
+        "   add $16, %%esi\n"
+        "2:\n"
+        "   cmp %%esi, %%edi\n"
+        "   jne 1b\n"
+        "   xor %%ecx, %%ebx\n"
+        "   mov %%ebx, %%eax\n"
+        "   shr $16, %%ebx\n"
+        "   xor %%ebx, %%eax\n"
+        "   imul $0x85EBCA6b, %%eax\n"
+        "   mov %%eax, %%ebx\n"
+        "   shr $13, %%ebx\n"
+        "   xor %%ebx, %%eax\n"
+        "   imul $0xC2B2AE35, %%eax\n"
+        "   mov %%eax, %%ebx\n"
+        "   shr $16, %%ebx\n"
+        "   xor %%ebx, %%eax\n"
+        : "=a" (ret)
+
+        : "a" (HASH_SEED),
+          "i" (HASH_MASK1),
+          "i" (HASH_MASK2),
+          "S" (key),
+          "c" (length)
+
+        : "%ebx",
+          "%edi"
+    );
+    return ret;
+}
+#endif
+
+#if defined (__GNUC__) && defined(__i386__)
+/*
+ * Emulate the MSVC _cpuid intrinsic for GCC/MinGW/Clang; this is used
+ * to determine whether we should take the SSE route.
+ */
+static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
+    __asm__ __volatile__ (
+        "cpuid"
+        : "=a"(lanes[0]),
+          "=b"(lanes[1]),
+          "=c"(lanes[2]),
+          "=d"(lanes[3])
+
+        : "a" (entry)
+    );
+}
+
+#endif /* !(defined(__GNUC__) && defined(__i386__)) */
+
+static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
+/*
+ * No host SSE instruction set assumed; do a runtime test instead. This
+ * is mostly for MinGW32, which doesn't define __SSE__.
+ */
+#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
+    static bool memoize = false;
+    static bool sse     = false;
+
+    if (GMQCC_UNLIKELY(!memoize)) {
+        /*
+         * Only calculate SSE one time, thus it's unlikely that this branch
+         * is taken more than once.
+         */
+        static int lanes[4];
+        hash_cpuid(lanes, 0);
+        /*
+         * lanes[0] is very likely to contain a value on anything resembling
+         * a modern x86.
+         */
+        if (GMQCC_LIKELY(*lanes >= 1))
+            sse = (lanes[3] & ((int)1 << 25)) != 0;
+        memoize = true;
+    }
+
+    return (GMQCC_LIKELY(sse))
+                ? hash_sse(key, length)
+                : hash_native(key, length);
+/*
+ * Same as above, but this time the host compiler was configured with SSE
+ * support. This handles MinGW32 builds for i686+.
+ */
+#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__)
+    return hash_sse(key, length);
+#else
+    /*
+     * Go the native route, which itself is highly optimized as well for
+     * unaligned load/store when dealing with LE.
+     */
+    return hash_native(key, length);
+#endif
+}
+
+#define HASH_LEN_ALIGN      (sizeof(size_t))
+#define HASH_LEN_ONES       ((size_t)-1/UCHAR_MAX)
+#define HASH_LEN_HIGHS      (HASH_LEN_ONES * (UCHAR_MAX / 2 + 1))
+#define HASH_LEN_HASZERO(X) (((X)-HASH_LEN_ONES) & ~(X) & HASH_LEN_HIGHS)
+
+size_t hash(const char *key) {
+    const char   *s = key;
+    const char   *a = s;
+    const size_t *w;
+
+    /* Align for fast staging */
+    for (; (uintptr_t)s % HASH_LEN_ALIGN; s++) {
+        /* Quick stage if terminated before alignment */
+        if (!*s)
+            return hash_entry(key, s-a);
+    }
+
+    /*
+     * Efficient word-at-a-time staging for the string length calculation;
+     * this is faster than going through the ifunc resolver for a strlen
+     * call.
+     *
+     * On x86-64 this becomes literally two masks and a quick skip through
+     * bytes along the string with the following masks:
+     *  movabs $0xFEFEFEFEFEFEFEFE,%r8
+     *  movabs $0x8080808080808080,%rsi
+     */
+    for (w = (const void *)s; !HASH_LEN_HASZERO(*w); w++);
+    for (s = (const void *)w; *s; s++);
+
+    return hash_entry(key, s-a);
+}
+
+#undef HASH_LEN_HASZERO
+#undef HASH_LEN_HIGHS
+#undef HASH_LEN_ONES
+#undef HASH_LEN_ALIGN
+#undef HASH_SEED
+#undef HASH_MASK2
+#undef HASH_MASK1
+#undef HASH_ROTL32
+#undef HASH_NATIVE_BLOCK
+#undef HASH_NATIVE_BYTES
+#undef HASH_NATIVE_SAFEREAD
+
+#ifdef __GNUC__
+# pragma GCC diagnostic pop
+#endif
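hash() above computes the string length with the classic word-at-a-time zero-byte test: (x - ONES) & ~x & HIGHS is non-zero exactly when some byte of x is zero, and HASH_LEN_ONES/HASH_LEN_HIGHS build those masks for whatever width size_t has. A tiny standalone check of the 32-bit form of the identity, separate from the patch (macro names here are illustrative only):

#include <assert.h>

#define ONES32       0x01010101u
#define HIGHS32      0x80808080u
#define HASZERO32(X) ((((X) - ONES32) & ~(X) & HIGHS32) != 0)

int main(void) {
    assert(!HASZERO32(0x61626364u)); /* "dcba" on LE: no zero byte          */
    assert( HASZERO32(0x61006364u)); /* zero byte flags its lane (bit 23)   */
    assert( HASZERO32(0x00000000u)); /* all-zero word                       */
    assert(!HASZERO32(0x80818283u)); /* high bits alone never false-trigger */
    return 0;
}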
diff --git a/include.mk b/include.mk
index 2bc2a83..efb4a89 100644
--- a/include.mk
+++ b/include.mk
@@ -14,7 +14,7 @@ LDFLAGS +=
 LIBS += -lm
 
 #common objects
-COMMON = ansi.o util.o stat.o fs.o opts.o conout.o
+COMMON = ansi.o util.o hash.o stat.o fs.o opts.o conout.o
 
 #objects
 OBJ_C = $(COMMON) main.o lexer.o parser.o code.o ast.o ir.o ftepp.o utf8.o correct.o fold.o intrin.o
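A side note on the runtime dispatch in hash_entry() in hash.c above: with GCC 4.8+ or a recent Clang, the same SSE probe can also be expressed with __builtin_cpu_supports() instead of a hand-rolled CPUID call. This is only a hedged alternative sketch, not what the patch uses; the wrapper name is hypothetical.

#if defined(__GNUC__) && defined(__i386__) && !defined(__SSE__)
/* Hypothetical alternative to hash_cpuid(): cache the SSE capability once. */
static int have_sse(void) {
    static int cached = -1;
    if (cached < 0) {
        __builtin_cpu_init();  /* only required before constructors run; harmless here */
        cached = __builtin_cpu_supports("sse") ? 1 : 0;
    }
    return cached;
}
#endif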
- */ -#if 1 -#define GMQCC_ROTL32(X, R) (((X) << (R)) | ((X) >> (32 - (R)))) -size_t util_hthash(hash_table_t *ht, const char *key) { - const unsigned char *data = (const unsigned char *)key; - const size_t len = strlen(key); - const size_t block = len / 4; - const uint32_t mask1 = 0xCC9E2D51; - const uint32_t mask2 = 0x1B873593; - const uint32_t *blocks = (const uint32_t*)(data + block * 4); - const unsigned char *tail = (const unsigned char *)(data + block * 4); - - size_t i; - uint32_t k; - uint32_t h = 0x1EF0 ^ len; - - for (i = -((int)block); i; i++) { - k = blocks[i]; - k *= mask1; - k = GMQCC_ROTL32(k, 15); - k *= mask2; - h ^= k; - h = GMQCC_ROTL32(h, 13); - h = h * 5 + 0xE6546B64; - } - k = 0; - switch (len & 3) { - case 3: - k ^= tail[2] << 16; - case 2: - k ^= tail[1] << 8; - case 1: - k ^= tail[0]; - k *= mask1; - k = GMQCC_ROTL32(k, 15); - k *= mask2; - h ^= k; - } - - h ^= len; - h ^= h >> 16; - h *= 0x85EBCA6B; - h ^= h >> 13; - h *= 0xC2B2AE35; - h ^= h >> 16; - - return (size_t) (h % ht->size); -} -#undef GMQCC_ROTL32 -#else -/* We keep the old for reference */ -GMQCC_INLINE size_t util_hthash(hash_table_t *ht, const char *key) { - const uint32_t mix = 0x5BD1E995; - const uint32_t rot = 24; - size_t size = strlen(key); - uint32_t hash = 0x1EF0 /* LICRC TAB */ ^ size; - uint32_t alias = 0; - const unsigned char *data = (const unsigned char*)key; - - while (size >= 4) { - alias = (data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24)); - alias *= mix; - alias ^= alias >> rot; - alias *= mix; - - hash *= mix; - hash ^= alias; - - data += 4; - size -= 4; - } - - switch (size) { - case 3: hash ^= data[2] << 16; - case 2: hash ^= data[1] << 8; - case 1: hash ^= data[0]; - hash *= mix; - } - - hash ^= hash >> 13; - hash *= mix; - hash ^= hash >> 15; - - return (size_t) (hash % ht->size); +size_t hash(const char *key); +size_t util_hthash(hash_table_t *ht, const char *key) { + return hash(key) % ht->size; } -#endif static hash_node_t *_util_htnewpair(const char *key, void *value) { hash_node_t *node; -- 2.39.2