/*
 * Copyright (C) 2012, 2013
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/*
 * This is a version of the Murmur3 hashing function optimized for various
 * compilers/architectures. It uses the traditional Murmur2 mix staging,
 * but fixes the mix staging inner loops.
 *
 * Murmur 2 contains an inner loop such as:
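 * (a sketch along the lines of the MurmurHash2 reference implementation;
 * h is the running hash, m and r the Murmur2 constants, d the data
 * pointer and l the remaining length):
 *
 *  while (l >= 4) {
 *      u32 k = *(u32*)d;
 *
 *      k *= m;
 *      k ^= k >> r;
 *      k *= m;
 *
 *      h *= m;
 *      h ^= k;
 *
 *      d += 4;
 *      l -= 4;
 *  }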
 *
 * When the two u32s that form a key are the same value x, this premix
 * stage will produce the same result for both of them. Unrolled for such
 * a key, the loop above becomes:
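 * (an illustrative unrolling, reusing the names above):
 *
 *  x *= m;
 *  x ^= x >> r;
 *  x *= m;
 *
 *  h *= m;
 *  h ^= x;
 *
 *  x *= m;
 *  x ^= x >> r;
 *  x *= m;
 *
 *  h *= m;
 *  h ^= x;
 *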
 * This appears to be fine, except: what happens when m == 1? Well, x
 * cancels out entirely, leaving just:
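 * (again illustrative; the multiplications by m == 1 simply drop away):
 *
 *  x ^= x >> r;
 *  h ^= x;
 *
 *  x ^= x >> r;
 *  h ^= x;
 *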
 * So all keys hash to the same value, but how often does m == 1?
 * Well, it turns out that testing x for all possible values yields only
 * 172,013,942 unique results instead of 2^32. That is a factor of ~25
 * fewer, i.e. log2(25) ~ 4.6 bits are cancelled out on average!
 *
 * This means we have a 14.5% higher chance of collision. This is where
 * Murmur3 comes in to save the day.
 */
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
/*
 * Some rotation tricks:
 * The MSVC intrinsic shaves off six instructions, while the GCC inline
 * assembly for x86 and amd64 shaves off four. Native methods are often
 * optimized rather well at -O3, but not at -O2.
 */
#ifdef _MSC_VER
# define HASH_ROTL32(X, Y) _rotl((X), (Y))
#else
static GMQCC_FORCEINLINE uint32_t hash_rotl32(volatile uint32_t x, int8_t r) {
#if defined (__GNUC__) && (defined(__i386__) || defined(__amd64__))
    __asm__ __volatile__ ("roll %1,%0" : "+r"(x) : "c"(r));
    return x;
#else /* ! (defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))) */
    return (x << r) | (x >> (32 - r));
#endif
}

# define HASH_ROTL32(X, Y) hash_rotl32((volatile uint32_t)(X), (Y))
#endif /* !(_MSC_VER) */
static GMQCC_FORCEINLINE uint32_t hash_mix32(uint32_t hash) {
    /*
     * Standard Murmur3 32-bit finalization mix ("fmix32"); the constants
     * match the ones used by the SSE path below.
     */
    hash ^= hash >> 16;
    hash *= 0x85EBCA6B;
    hash ^= hash >> 13;
    hash *= 0xC2B2AE35;
    hash ^= hash >> 16;
    return hash;
}
/*
 * These constants were calculated with SMHasher to determine the
 * best-case scenario for Murmur3:
 * http://code.google.com/p/smhasher/
 */
#define HASH_MASK1 0xCC9E2D51
#define HASH_MASK2 0x1B873593
#define HASH_SEED  0x9747B28C
#if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
# define HASH_NATIVE_SAFEREAD(PTR) (*((uint32_t*)(PTR)))
#elif PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_BIG
# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#  define HASH_NATIVE_SAFEREAD(PTR) (__builtin_bswap32(*((uint32_t*)(PTR))))
# endif
#endif
/* Process individual bytes at this point since the endianness isn't known. */
#ifndef HASH_NATIVE_SAFEREAD
# define HASH_NATIVE_SAFEREAD(PTR) ((PTR)[0] | (PTR)[1] << 8 | (PTR)[2] << 16 | (PTR)[3] << 24)
#endif
#define HASH_NATIVE_BLOCK(H, K)                        \
    do {                                               \
        K *= HASH_MASK1;                               \
        K  = HASH_ROTL32(K, 15);                       \
        K *= HASH_MASK2;                               \
        H ^= K;                                        \
        H  = HASH_ROTL32(H, 13);                       \
        H  = H * 5 + 0xE6546B64;                       \
    } while (0)
#define HASH_NATIVE_BYTES(COUNT, H, C, N, PTR, LENGTH) \
    do {                                               \
        int _i = COUNT;                                \
        while (_i--) {                                 \
            C = C >> 8 | *PTR++ << 24;                 \
            N++;                                       \
            LENGTH--;                                  \
            if (N == 4) {                              \
                HASH_NATIVE_BLOCK(H, C);               \
                N = 0;                                 \
            }                                          \
        }                                              \
    } while (0)
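
/*
 * To illustrate the byte staging: after one byte b0, c == b0 << 24 and
 * n == 1; after a second byte b1, c == b0 << 16 | b1 << 24 and n == 2.
 * The pending bytes always occupy the top n bytes of c, which is why
 * hash_native_result() below recovers them with carry >> (4 - n) * 8.
 */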
/*
 * Highly unrolled at per-carry-bit granularity instead of per-block
 * granularity; this achieves the highest possible instruction-level
 * parallelism.
 */
static GMQCC_FORCEINLINE void hash_native_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
    uint32_t h1 = *ph1;
    uint32_t c  = *carry;

    const uint8_t *ptr = (uint8_t*)key;
    const uint8_t *end;

    /* carry count from low 2 bits of carry value */
    int n = c & 3;
    /*
     * Unaligned word accesses are safe in LE. Thus we can obtain a little
     * extra speed by reading whole words without aligning first.
     */
# if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
    /* Consume carry bytes */
    int it = (4 - n) & 3;
    if (it && it <= length)
        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);

    /* word-sized chunk consumption */
    end = ptr + length/4*4;
    for (; ptr < end; ptr += 4) {
        uint32_t k1 = HASH_NATIVE_SAFEREAD(ptr);
        HASH_NATIVE_BLOCK(h1, k1);
    }
# else
    /*
     * It's unsafe to assume unaligned word accesses here. Thus we'll need
     * to consume bytes up to alignment, then process in aligned block
     * chunks.
     */
    uint32_t k1;
    int it = -(long)ptr & 3;
    if (it && it <= length)
        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);
    /*
     * Alignment has been reached; deal with aligned blocks, specializing
     * for all possible carry counts.
     */
    end = ptr + length / 4 * 4;

    switch (n) {
    case 0: /* no carry bytes pending */
        for (; ptr < end; ptr += 4) {
            k1 = HASH_NATIVE_SAFEREAD(ptr);
            HASH_NATIVE_BLOCK(h1, k1);
        }
        break;
    case 1: /* one carry byte pending */
        for (; ptr < end; ptr += 4) {
            k1  = c >> 24;
            c   = HASH_NATIVE_SAFEREAD(ptr);
            k1 |= c << 8;
            HASH_NATIVE_BLOCK(h1, k1);
        }
        break;
    case 2: /* two carry bytes pending */
        for (; ptr < end; ptr += 4) {
            k1  = c >> 16;
            c   = HASH_NATIVE_SAFEREAD(ptr);
            k1 |= c << 16;
            HASH_NATIVE_BLOCK(h1, k1);
        }
        break;
    case 3: /* three carry bytes pending */
        for (; ptr < end; ptr += 4) {
            k1  = c >> 8;
            c   = HASH_NATIVE_SAFEREAD(ptr);
            k1 |= c << 24;
            HASH_NATIVE_BLOCK(h1, k1);
        }
        break;
    }
#endif /* misaligned reads */

    /*
     * Having advanced over the 32-bit chunks, this can possibly leave
     * 1..3 bytes of additional trailing content to process.
     */
    length -= length/4*4;

    HASH_NATIVE_BYTES(length, h1, c, n, ptr, length);

    /* Copy out the new running hash and carry */
    *ph1   = h1;
    *carry = (c & ~0xFF) | n;
}

static GMQCC_FORCEINLINE uint32_t hash_native_result(uint32_t hash, uint32_t carry, size_t length) {
    uint32_t k1;
    int n = carry & 3;

    if (GMQCC_LIKELY(n)) {
        k1  = carry >> (4 - n) * 8;
        k1 *= HASH_MASK1;
        k1  = HASH_ROTL32(k1, 15);
        k1 *= HASH_MASK2;
        hash ^= k1;
    }

    hash ^= length;
    hash  = hash_mix32(hash);

    return hash;
}

static GMQCC_FORCEINLINE uint32_t hash_native(const void *GMQCC_RESTRICT key, size_t length) {
    uint32_t hash  = HASH_SEED;
    uint32_t carry = 0;

    /* Separate calls so the inliner has an easier time dealing with them */
    hash_native_process(&hash, &carry, key, length);
    return hash_native_result(hash, carry, length);
}
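
/*
 * Since processing and finalization are split, the native route can also
 * be driven incrementally; a hypothetical two-piece example, which by
 * construction is equivalent to hashing the whole key at once:
 *
 *   uint32_t h = HASH_SEED, c = 0;
 *   hash_native_process(&h, &c, "hello ", 6);
 *   hash_native_process(&h, &c, "world", 5);
 *   h = hash_native_result(h, c, 11);    // == hash_native("hello world", 11)
 */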

/*
 * Inline assembly optimized SSE version for when SSE is present via CPUID
 * or the host compiler has __SSE__. This is about 16 cycles faster than
 * native at -O2 for GCC and 11 cycles for -O3.
 *
 * Tested with -m32 on a Phenom II X4 with:
 * gcc version 4.8.1 20130725 (prerelease) (GCC)
 */
#if defined(__GNUC__) && defined(__i386__)
static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) {
    uint32_t ret;
    const void *ptr = key;
    __asm__ __volatile__ (
        /* %ebx carries the running hash; the seed arrives in %eax */
        "    mov %%eax, %%ebx\n"
        /* broadcast HASH_MASK1 and HASH_MASK2 across %xmm7 and %xmm6 */
        "    mov $0xCC9E2D51, %%eax\n"
        "    movd %%eax, %%xmm7\n"
        "    shufps $0, %%xmm7, %%xmm7\n"
        "    mov $0x1B873593, %%eax\n"
        "    movd %%eax, %%xmm6\n"
        "    shufps $0, %%xmm6, %%xmm6\n"
        /* %edi = end of key */
        "    lea (%%esi, %%ecx, 1), %%edi\n"
        "1:\n"
        /* premix four lanes at once: k *= MASK1; k = rotl(k, 15); k *= MASK2 */
        "    movaps (%%esi), %%xmm0\n"
        "    pmulld %%xmm7, %%xmm0\n"
        "    movaps %%xmm0, %%xmm2\n"
        "    pslld $15, %%xmm0\n"
        "    psrld $17, %%xmm2\n"
        "    orps %%xmm2, %%xmm0\n"
        "    pmulld %%xmm6, %%xmm0\n"
        /* fold lane 0 into the scalar hash: h ^= k; h = rotl(h, 13); h = h * 5 + 0xE6546B64 */
        "    movd %%xmm0, %%eax\n"
        "    xor %%eax, %%ebx\n"
        "    rol $13, %%ebx\n"
        "    lea (%%ebx, %%ebx, 4), %%ebx\n"
        "    add $0xE6546B64, %%ebx\n"
        /* rotate the next lane down and repeat for lanes 1..3 */
        "    shufps $0x39, %%xmm0, %%xmm0\n"
        "    movd %%xmm0, %%eax\n"
        "    xor %%eax, %%ebx\n"
        "    rol $13, %%ebx\n"
        "    lea (%%ebx, %%ebx, 4), %%ebx\n"
        "    add $0xE6546B64, %%ebx\n"
        "    shufps $0x39, %%xmm0, %%xmm0\n"
        "    movd %%xmm0, %%eax\n"
        "    xor %%eax, %%ebx\n"
        "    rol $13, %%ebx\n"
        "    lea (%%ebx, %%ebx, 4), %%ebx\n"
        "    add $0xE6546B64, %%ebx\n"
        "    shufps $0x39, %%xmm0, %%xmm0\n"
        "    movd %%xmm0, %%eax\n"
        "    xor %%eax, %%ebx\n"
        "    rol $13, %%ebx\n"
        "    lea (%%ebx, %%ebx, 4), %%ebx\n"
        "    add $0xE6546B64, %%ebx\n"
        "    add $16, %%esi\n"
        "    cmp %%esi, %%edi\n"
        "    jne 1b\n"
        /* finalize: h ^= length, then the standard Murmur3 mix (see hash_mix32) */
        "    xor %%ecx, %%ebx\n"
        "    mov %%ebx, %%eax\n"
        "    shr $16, %%ebx\n"
        "    xor %%ebx, %%eax\n"
        "    imul $0x85EBCA6B, %%eax\n"
        "    mov %%eax, %%ebx\n"
        "    shr $13, %%ebx\n"
        "    xor %%ebx, %%eax\n"
        "    imul $0xC2B2AE35, %%eax\n"
        "    mov %%eax, %%ebx\n"
        "    shr $16, %%ebx\n"
        "    xor %%ebx, %%eax\n"
        /* operand mapping is an assumption: seed in %eax, key in %esi, length in %ecx */
        : "=a" (ret), "+S" (ptr)
        : "0" (HASH_SEED), "c" (length)
        : "%ebx", "%edi", "%xmm0", "%xmm2", "%xmm6", "%xmm7", "cc", "memory"
    );
    return ret;
}
#endif /* defined(__GNUC__) && defined(__i386__) */

#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
/*
 * Emulate the MSVC __cpuid intrinsic for GCC/MinGW/Clang; this will be
 * used to determine if we should take the SSE route.
 */
static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
    /* lanes[0..3] receive EAX, EBX, ECX, EDX for the given CPUID entry */
    __asm__ __volatile__ (
        "cpuid"
        : "=a" (lanes[0]), "=b" (lanes[1]),
          "=c" (lanes[2]), "=d" (lanes[3])
        : "a" (entry)
    );
}
#endif /* !(defined(__GNUC__) && defined(__i386__)) */

static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
    /*
     * No host SSE instruction set is assumed, so do a runtime test instead.
     * This is mostly for MinGW32, which doesn't define __SSE__.
     */
#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
    static bool memoize = false;
    static bool sse     = false;

    if (GMQCC_UNLIKELY(!memoize)) {
        /*
         * Only calculate SSE support once; thus it's unlikely that this
         * branch is taken more than once.
         */
        int lanes[4];
        hash_cpuid(lanes, 0);
        /*
         * It's very likely that lanes[0] will be non-zero, unless this
         * isn't a modern x86.
         */
        if (GMQCC_LIKELY(*lanes >= 1)) {
            /* the SSE feature flag lives in CPUID entry 1, EDX bit 25 */
            hash_cpuid(lanes, 1);
            sse = (lanes[3] & ((int)1 << 25)) != 0;
        }
        memoize = true;
    }
    return (GMQCC_LIKELY(sse))
        ? hash_sse(key, length)
        : hash_native(key, length);
/*
 * Same as above, but this time the host compiler was invoked with SSE
 * support enabled. This handles MinGW32 builds for i686+.
 */
#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__)
    return hash_sse(key, length);
#else
    /*
     * Take the native route, which is itself highly optimized as well,
     * including unaligned load/store when dealing with LE.
     */
    return hash_native(key, length);
#endif
}

#define HASH_LEN_ALIGN      (sizeof(size_t))
#define HASH_LEN_ONES       ((size_t)-1/UCHAR_MAX)
#define HASH_LEN_HIGHS      (HASH_LEN_ONES * (UCHAR_MAX / 2 + 1))
#define HASH_LEN_HASZERO(X) (((X)-HASH_LEN_ONES) & ~(X) & HASH_LEN_HIGHS)
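
/*
 * A worked example of the zero-byte test (assuming 32-bit size_t for
 * brevity): HASH_LEN_ONES is 0x01010101 and HASH_LEN_HIGHS is 0x80808080.
 * For a word X = 0x61006262, which contains one zero byte:
 *
 *   (X - ONES) & ~X & HIGHS
 *     == 0x5FFF6161 & 0x9EFF9D9D & 0x80808080
 *     == 0x00800000 != 0
 *
 * The surviving high bit marks the zero byte, so the word loop below can
 * stop there and let the byte loop pinpoint the terminator.
 */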

size_t hash(const char *key) {
    const char   *s = key;
    const char   *a = s;
    const size_t *w;

    /* Align for fast staging */
    for (; (uintptr_t)s % HASH_LEN_ALIGN; s++) {
        /* Quick stage if terminated before alignment */
        if (!*s)
            return hash_entry(key, s-a);
    }

    /*
     * Efficient staging of words for string-length calculation; this is
     * faster than going through the ifunc resolver of a strlen call.
     *
     * On x64 this becomes literally two masks, and a quick skip through
     * bytes along the string with the following masks:
     *      movabs $0xFEFEFEFEFEFEFEFE, %r8
     *      movabs $0x8080808080808080, %rsi
     */
    for (w = (const void *)s; !HASH_LEN_HASZERO(*w); w++);
    for (s = (const void *)w; *s; s++);

    return hash_entry(key, s-a);
}

#undef HASH_LEN_HASZERO
#undef HASH_LEN_HIGHS
#undef HASH_LEN_ONES
#undef HASH_LEN_ALIGN
#undef HASH_NATIVE_BLOCK
#undef HASH_NATIVE_BYTES
#undef HASH_NATIVE_SAFEREAD

# pragma GCC diagnostic pop