/*
 * Copyright (C) 2012, 2013
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/*
 * This is a version of the Murmur3 hashing function optimized for various
 * compilers/architectures. It uses the traditional Murmur2 mix staging,
 * but fixes the mix staging inner loops.
 *
 * Murmur 2 contains an inner loop such as:
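 * (a sketch along the lines of the MurmurHash2 reference implementation;
 * h is the running hash, m and r the Murmur2 constants, d the data
 * pointer and l the remaining length):
 *
 *  while (l >= 4) {
 *      u32 k = *(u32*)d;
 *
 *      k *= m;
 *      k ^= k >> r;
 *      k *= m;
 *
 *      h *= m;
 *      h ^= k;
 *
 *      d += 4;
 *      l -= 4;
 *  }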
 *
 * When the two u32s that form a key are the same value x, this premix
 * stage will produce the same result for both of them. Unrolled for such
 * a key, the loop above becomes:
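 * (an illustrative unrolling, reusing the names above):
 *
 *  x *= m;
 *  x ^= x >> r;
 *  x *= m;
 *
 *  h *= m;
 *  h ^= x;
 *
 *  x *= m;
 *  x ^= x >> r;
 *  x *= m;
 *
 *  h *= m;
 *  h ^= x;
 *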
 * This appears to be fine, except: what happens when m == 1? Well, x
 * cancels out entirely, leaving just:
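 * (again illustrative; the multiplications by m == 1 simply drop away):
 *
 *  x ^= x >> r;
 *  h ^= x;
 *
 *  x ^= x >> r;
 *  h ^= x;
 *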
 * So all keys hash to the same value, but how often does m == 1?
 * Well, it turns out that testing x for all possible values yields only
 * 172,013,942 unique results instead of 2^32. That is a factor of ~25
 * fewer, i.e. log2(25) ~ 4.6 bits are cancelled out on average!
 *
 * This means we have a 14.5% higher chance of collision. This is where
 * Murmur3 comes in to save the day.
 */
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
/*
 * Some rotation tricks:
 * The MSVC intrinsic shaves off six instructions, while the GCC inline
 * assembly for x86 and amd64 shaves off four. Native methods are often
 * optimized rather well at -O3, but not at -O2.
 */
#ifdef _MSC_VER
# define HASH_ROTL32(X, Y) _rotl((X), (Y))
#else
static GMQCC_FORCEINLINE uint32_t hash_rotl32(volatile uint32_t x, int8_t r) {
#if defined (__GNUC__) && (defined(__i386__) || defined(__amd64__))
    __asm__ __volatile__ ("roll %1,%0" : "+r"(x) : "c"(r));
    return x;
#else /* ! (defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))) */
    return (x << r) | (x >> (32 - r));
#endif
}

# define HASH_ROTL32(X, Y) hash_rotl32((volatile uint32_t)(X), (Y))
#endif /* !(_MSC_VER) */
static GMQCC_FORCEINLINE uint32_t hash_mix32(uint32_t hash) {
    /*
     * Standard Murmur3 32-bit finalization mix ("fmix32"); the constants
     * match the ones used by the SSE path below.
     */
    hash ^= hash >> 16;
    hash *= 0x85EBCA6B;
    hash ^= hash >> 13;
    hash *= 0xC2B2AE35;
    hash ^= hash >> 16;
    return hash;
}
/*
 * These constants were calculated with SMHasher to determine the
 * best-case scenario for Murmur3:
 * http://code.google.com/p/smhasher/
 */
#define HASH_MASK1 0xCC9E2D51
#define HASH_MASK2 0x1B873593
#define HASH_SEED  0x9747B28C
#if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
# define HASH_NATIVE_SAFEREAD(PTR) (*((uint32_t*)(PTR)))
#elif PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_BIG
# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#  define HASH_NATIVE_SAFEREAD(PTR) (__builtin_bswap32(*((uint32_t*)(PTR))))
# endif
#endif
/* Process individual bytes at this point since the endianness isn't known. */
#ifndef HASH_NATIVE_SAFEREAD
# define HASH_NATIVE_SAFEREAD(PTR) ((PTR)[0] | (PTR)[1] << 8 | (PTR)[2] << 16 | (PTR)[3] << 24)
#endif
#define HASH_NATIVE_BLOCK(H, K)                        \
    do {                                               \
        K *= HASH_MASK1;                               \
        K  = HASH_ROTL32(K, 15);                       \
        K *= HASH_MASK2;                               \
        H ^= K;                                        \
        H  = HASH_ROTL32(H, 13);                       \
        H  = H * 5 + 0xE6546B64;                       \
    } while (0)
#define HASH_NATIVE_BYTES(COUNT, H, C, N, PTR, LENGTH) \
    do {                                               \
        int _i = COUNT;                                \
        while (_i--) {                                 \
            C = C >> 8 | *PTR++ << 24;                 \
            N++;                                       \
            LENGTH--;                                  \
            if (N == 4) {                              \
                HASH_NATIVE_BLOCK(H, C);               \
                N = 0;                                 \
            }                                          \
        }                                              \
    } while (0)
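
/*
 * To illustrate the byte staging: after one byte b0, c == b0 << 24 and
 * n == 1; after a second byte b1, c == b0 << 16 | b1 << 24 and n == 2.
 * The pending bytes always occupy the top n bytes of c, which is why
 * hash_native_result() below recovers them with carry >> (4 - n) * 8.
 */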
/*
 * Highly unrolled at per-carry-bit granularity instead of per-block
 * granularity; this achieves the highest possible instruction-level
 * parallelism.
 */
static GMQCC_FORCEINLINE void hash_native_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
    uint32_t h1 = *ph1;
    uint32_t c  = *carry;

    const uint8_t *ptr = (uint8_t*)key;
    const uint8_t *end;

    /* carry count from low 2 bits of carry value */
    int n = c & 3;
    /*
     * Unaligned word accesses are safe in LE. Thus we can obtain a little
     * extra speed by reading whole words without aligning first.
     */
# if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
    /* Consume carry bytes */
    int it = (4 - n) & 3;
    if (it && it <= length)
        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);

    /* word-sized chunk consumption */
    end = ptr + length/4*4;
    for (; ptr < end; ptr += 4) {
        uint32_t k1 = HASH_NATIVE_SAFEREAD(ptr);
        HASH_NATIVE_BLOCK(h1, k1);
    }
# else
    /*
     * It's unsafe to assume unaligned word accesses here. Thus we'll need
     * to consume bytes up to alignment, then process in aligned block
     * chunks.
     */
    uint32_t k1;
    int it = -(long)ptr & 3;
    if (it && it <= length)
        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);
    /*
     * Alignment has been reached; deal with aligned blocks, specializing
     * for all possible carry counts.
     */
    end = ptr + length / 4 * 4;

    switch (n) {
    case 0: /* no carry bytes pending */
        for (; ptr < end; ptr += 4) {
            k1 = HASH_NATIVE_SAFEREAD(ptr);
            HASH_NATIVE_BLOCK(h1, k1);
        }
        break;
    case 1: /* one carry byte pending */
        for (; ptr < end; ptr += 4) {
            k1  = c >> 24;
            c   = HASH_NATIVE_SAFEREAD(ptr);
            k1 |= c << 8;
            HASH_NATIVE_BLOCK(h1, k1);
        }
        break;
    case 2: /* two carry bytes pending */
        for (; ptr < end; ptr += 4) {
            k1  = c >> 16;
            c   = HASH_NATIVE_SAFEREAD(ptr);
            k1 |= c << 16;
            HASH_NATIVE_BLOCK(h1, k1);
        }
        break;
    case 3: /* three carry bytes pending */
        for (; ptr < end; ptr += 4) {
            k1  = c >> 8;
            c   = HASH_NATIVE_SAFEREAD(ptr);
            k1 |= c << 24;
            HASH_NATIVE_BLOCK(h1, k1);
        }
        break;
    }
#endif /* misaligned reads */

    /*
     * Having advanced over the 32-bit chunks, this can possibly leave
     * 1..3 bytes of additional trailing content to process.
     */
    length -= length/4*4;

    HASH_NATIVE_BYTES(length, h1, c, n, ptr, length);

    /* Copy out the new running hash and carry */
    *ph1   = h1;
    *carry = (c & ~0xFF) | n;
}

static GMQCC_FORCEINLINE uint32_t hash_native_result(uint32_t hash, uint32_t carry, size_t length) {
    uint32_t k1;
    int n = carry & 3;

    if (GMQCC_LIKELY(n)) {
        k1  = carry >> (4 - n) * 8;
        k1 *= HASH_MASK1;
        k1  = HASH_ROTL32(k1, 15);
        k1 *= HASH_MASK2;
        hash ^= k1;
    }

    hash ^= length;
    hash  = hash_mix32(hash);

    return hash;
}

static GMQCC_FORCEINLINE uint32_t hash_native(const void *GMQCC_RESTRICT key, size_t length) {
    uint32_t hash  = HASH_SEED;
    uint32_t carry = 0;

    /* Separate calls so the inliner has an easier time dealing with them */
    hash_native_process(&hash, &carry, key, length);
    return hash_native_result(hash, carry, length);
}
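
/*
 * Since processing and finalization are split, the native route can also
 * be driven incrementally; a hypothetical two-piece example, which by
 * construction is equivalent to hashing the whole key at once:
 *
 *   uint32_t h = HASH_SEED, c = 0;
 *   hash_native_process(&h, &c, "hello ", 6);
 *   hash_native_process(&h, &c, "world", 5);
 *   h = hash_native_result(h, c, 11);    // == hash_native("hello world", 11)
 */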

/*
 * Inline assembly optimized SSE version for when SSE is present via CPUID
 * or the host compiler has __SSE__. This is about 16 cycles faster than
 * native at -O2 for GCC and 11 cycles for -O3.
 *
 * Tested with -m32 on a Phenom II X4 with:
 * gcc version 4.8.1 20130725 (prerelease) (GCC)
 */
#if defined(__GNUC__) && defined(__i386__)
static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) {
    uint32_t ret;
    const void *ptr = key;
    __asm__ __volatile__ (
        /* %ebx carries the running hash; the seed arrives in %eax */
        "    mov %%eax, %%ebx\n"
        /* broadcast HASH_MASK1 and HASH_MASK2 across %xmm7 and %xmm6 */
        "    mov $0xCC9E2D51, %%eax\n"
        "    movd %%eax, %%xmm7\n"
        "    shufps $0, %%xmm7, %%xmm7\n"
        "    mov $0x1B873593, %%eax\n"
        "    movd %%eax, %%xmm6\n"
        "    shufps $0, %%xmm6, %%xmm6\n"
        /* %edi = end of key */
        "    lea (%%esi, %%ecx, 1), %%edi\n"
        "1:\n"
        /* premix four lanes at once: k *= MASK1; k = rotl(k, 15); k *= MASK2 */
        "    movaps (%%esi), %%xmm0\n"
        "    pmulld %%xmm7, %%xmm0\n"
        "    movaps %%xmm0, %%xmm2\n"
        "    pslld $15, %%xmm0\n"
        "    psrld $17, %%xmm2\n"
        "    orps %%xmm2, %%xmm0\n"
        "    pmulld %%xmm6, %%xmm0\n"
        /* fold lane 0 into the scalar hash: h ^= k; h = rotl(h, 13); h = h * 5 + 0xE6546B64 */
        "    movd %%xmm0, %%eax\n"
        "    xor %%eax, %%ebx\n"
        "    rol $13, %%ebx\n"
        "    lea (%%ebx, %%ebx, 4), %%ebx\n"
        "    add $0xE6546B64, %%ebx\n"
        /* rotate the next lane down and repeat for lanes 1..3 */
        "    shufps $0x39, %%xmm0, %%xmm0\n"
        "    movd %%xmm0, %%eax\n"
        "    xor %%eax, %%ebx\n"
        "    rol $13, %%ebx\n"
        "    lea (%%ebx, %%ebx, 4), %%ebx\n"
        "    add $0xE6546B64, %%ebx\n"
        "    shufps $0x39, %%xmm0, %%xmm0\n"
        "    movd %%xmm0, %%eax\n"
        "    xor %%eax, %%ebx\n"
        "    rol $13, %%ebx\n"
        "    lea (%%ebx, %%ebx, 4), %%ebx\n"
        "    add $0xE6546B64, %%ebx\n"
        "    shufps $0x39, %%xmm0, %%xmm0\n"
        "    movd %%xmm0, %%eax\n"
        "    xor %%eax, %%ebx\n"
        "    rol $13, %%ebx\n"
        "    lea (%%ebx, %%ebx, 4), %%ebx\n"
        "    add $0xE6546B64, %%ebx\n"
        "    add $16, %%esi\n"
        "    cmp %%esi, %%edi\n"
        "    jne 1b\n"
        /* finalize: h ^= length, then the standard Murmur3 mix (see hash_mix32) */
        "    xor %%ecx, %%ebx\n"
        "    mov %%ebx, %%eax\n"
        "    shr $16, %%ebx\n"
        "    xor %%ebx, %%eax\n"
        "    imul $0x85EBCA6B, %%eax\n"
        "    mov %%eax, %%ebx\n"
        "    shr $13, %%ebx\n"
        "    xor %%ebx, %%eax\n"
        "    imul $0xC2B2AE35, %%eax\n"
        "    mov %%eax, %%ebx\n"
        "    shr $16, %%ebx\n"
        "    xor %%ebx, %%eax\n"
        /* operand mapping is an assumption: seed in %eax, key in %esi, length in %ecx */
        : "=a" (ret), "+S" (ptr)
        : "0" (HASH_SEED), "c" (length)
        : "%ebx", "%edi", "%xmm0", "%xmm2", "%xmm6", "%xmm7", "cc", "memory"
    );
    return ret;
}
#endif /* defined(__GNUC__) && defined(__i386__) */

#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
/*
 * Emulate the MSVC __cpuid intrinsic for GCC/MinGW/Clang; this will be
 * used to determine if we should take the SSE route.
 */
static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
    /* lanes[0..3] receive EAX, EBX, ECX, EDX for the given CPUID entry */
    __asm__ __volatile__ (
        "cpuid"
        : "=a" (lanes[0]), "=b" (lanes[1]),
          "=c" (lanes[2]), "=d" (lanes[3])
        : "a" (entry)
    );
}
#endif /* !(defined(__GNUC__) && defined(__i386__)) */

static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
    /*
     * No host SSE instruction set is assumed, so do a runtime test instead.
     * This is mostly for MinGW32, which doesn't define __SSE__.
     */
#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
    static bool memoize = false;
    static bool sse     = false;

    if (GMQCC_UNLIKELY(!memoize)) {
        /*
         * Only calculate SSE support once; thus it's unlikely that this
         * branch is taken more than once.
         */
        int lanes[4];
        hash_cpuid(lanes, 0);
        /*
         * It's very likely that lanes[0] will be non-zero, unless this
         * isn't a modern x86.
         */
        if (GMQCC_LIKELY(*lanes >= 1)) {
            /* the SSE feature flag lives in CPUID entry 1, EDX bit 25 */
            hash_cpuid(lanes, 1);
            sse = (lanes[3] & ((int)1 << 25)) != 0;
        }
        memoize = true;
    }
    return (GMQCC_LIKELY(sse))
        ? hash_sse(key, length)
        : hash_native(key, length);
/*
 * Same as above, but this time the host compiler was invoked with SSE
 * support enabled. This handles MinGW32 builds for i686+.
 */
#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__)
    return hash_sse(key, length);
#else
    /*
     * Take the native route, which is itself highly optimized as well,
     * including unaligned load/store when dealing with LE.
     */
    return hash_native(key, length);
#endif
}

#define HASH_LEN_ALIGN      (sizeof(size_t))
#define HASH_LEN_ONES       ((size_t)-1/UCHAR_MAX)
#define HASH_LEN_HIGHS      (HASH_LEN_ONES * (UCHAR_MAX / 2 + 1))
#define HASH_LEN_HASZERO(X) (((X)-HASH_LEN_ONES) & ~(X) & HASH_LEN_HIGHS)
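
/*
 * A worked example of the zero-byte test (assuming 32-bit size_t for
 * brevity): HASH_LEN_ONES is 0x01010101 and HASH_LEN_HIGHS is 0x80808080.
 * For a word X = 0x61006262, which contains one zero byte:
 *
 *   (X - ONES) & ~X & HIGHS
 *     == 0x5FFF6161 & 0x9EFF9D9D & 0x80808080
 *     == 0x00800000 != 0
 *
 * The surviving high bit marks the zero byte, so the word loop below can
 * stop there and let the byte loop pinpoint the terminator.
 */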

size_t hash(const char *key) {
    const char   *s = key;
    const char   *a = s;
    const size_t *w;

    /* Align for fast staging */
    for (; (uintptr_t)s % HASH_LEN_ALIGN; s++) {
        /* Quick stage if terminated before alignment */
        if (!*s)
            return hash_entry(key, s-a);
    }

    /*
     * Efficient staging of words for string-length calculation; this is
     * faster than going through the ifunc resolver of a strlen call.
     *
     * On x64 this becomes literally two masks, and a quick skip through
     * bytes along the string with the following masks:
     *      movabs $0xFEFEFEFEFEFEFEFE, %r8
     *      movabs $0x8080808080808080, %rsi
     */
    for (w = (const void *)s; !HASH_LEN_HASZERO(*w); w++);
    for (s = (const void *)w; *s; s++);

    return hash_entry(key, s-a);
}

#undef HASH_LEN_HASZERO
#undef HASH_LEN_HIGHS
#undef HASH_LEN_ONES
#undef HASH_LEN_ALIGN
#undef HASH_NATIVE_BLOCK
#undef HASH_NATIVE_BYTES
#undef HASH_NATIVE_SAFEREAD

# pragma GCC diagnostic pop