-/*
- * SSE version of the hash in inline assembly, used when SSE is detected via
- * CPUID or the host compiler defines __SSE__. It is about 16 cycles faster
- * than native with GCC at -O2, and about 11 cycles faster at -O3.
- *
- * Tested with -m32 on a Phenom II X4 with:
- * gcc version 4.8.1 20130725 (prerelease) (GCC)
- */
-#if defined(__GNUC__) && defined(__i386__)
-static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) { /*
- * Hashes `length` bytes starting at `key`, consuming 16 bytes (four 32-bit
- * words) per loop iteration, and returns the 32-bit result.
- *
- * key:    read with movaps, which requires a 16-byte aligned address.
- * length: byte count; the loop exits only when the read pointer equals
- *         key + length exactly, so it has to be a multiple of 16. It is
- *         also XORed into the hash before the final mixing steps.
- *
- * NOTE(review): pmulld is an SSE4.1 instruction rather than plain SSE;
- * confirm that whatever selects this routine checks for it.
- * NOTE(review): esi is advanced inside the asm but is declared only as an
- * input operand, and xmm0/xmm2/xmm6/xmm7 are used without appearing in the
- * clobber list; confirm against the GCC extended-asm rules.
- * NOTE(review): the constants and mixing steps look like the 32-bit
- * MurmurHash3 block/finalizer steps; confirm against the C version.
- */
- uint32_t ret; /* receives the final hash from eax */
- __asm__ __volatile__ (
- " mov %%eax, %%ebx\n" /* ebx = running hash, starts as HASH_SEED (input in eax) */
- " mov %2, %%eax\n" /* eax = HASH_MASK1 immediate */
- " movd %%eax, %%xmm7\n"
- " shufps $0, %%xmm7, %%xmm7\n" /* xmm7 = HASH_MASK1 in all four 32-bit lanes */
- " mov %3, %%eax\n" /* eax = HASH_MASK2 immediate */
- " movd %%eax, %%xmm6\n"
- " shufps $0, %%xmm6, %%xmm6\n" /* xmm6 = HASH_MASK2 in all four 32-bit lanes */
- " lea (%%esi, %%ecx, 1), %%edi\n" /* edi = key + length (end pointer) */
- " jmp 2f\n" /* enter the loop at its termination test */
- "1:\n" /* loop body: one 16-byte block */
- " movaps (%%esi), %%xmm0\n" /* aligned load of four 32-bit words */
- " pmulld %%xmm7, %%xmm0\n" /* each lane *= HASH_MASK1 */
- " movaps %%xmm0, %%xmm2\n" /* copy for the rotate below */
- " pslld $15, %%xmm0\n"
- " psrld $17, %%xmm2\n"
- " orps %%xmm2, %%xmm0\n" /* each lane rotated left by 15 bits */
- " pmulld %%xmm6, %%xmm0\n" /* each lane *= HASH_MASK2 */
- " movd %%xmm0, %%eax\n" /* eax = lane 0 */
- " xor %%eax, %%ebx\n" /* hash ^= lane */
- " rol $13, %%ebx\n" /* hash = rotate-left(hash, 13) */
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n" /* hash = hash * 5 + 0xE6546B64 */
- " shufps $0x39, %%xmm0, %%xmm0\n" /* rotate lanes so the next one sits in lane 0 */
- " movd %%xmm0, %%eax\n" /* eax = original lane 1 */
- " xor %%eax, %%ebx\n" /* same mixing steps as for lane 0 */
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n" /* eax = original lane 2 */
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " shufps $0x39, %%xmm0, %%xmm0\n"
- " movd %%xmm0, %%eax\n" /* eax = original lane 3 */
- " xor %%eax, %%ebx\n"
- " rol $13, %%ebx\n"
- " imul $5, %%ebx\n"
- " add $0xE6546B64, %%ebx\n"
- " add $16, %%esi\n" /* advance to the next 16-byte block */
- "2:\n" /* loop test */
- " cmp %%esi, %%edi\n"
- " jne 1b\n" /* repeat until esi equals the end pointer exactly */
- " xor %%ecx, %%ebx\n" /* hash ^= length */
- " mov %%ebx, %%eax\n"
- " shr $16, %%ebx\n"
- " xor %%ebx, %%eax\n" /* eax = hash ^ (hash >> 16) */
- " imul $0x85EBCA6b, %%eax\n" /* eax *= 0x85EBCA6B */
- " mov %%eax, %%ebx\n"
- " shr $13, %%ebx\n"
- " xor %%ebx, %%eax\n" /* eax ^= eax >> 13 */
- " imul $0xC2B2AE35, %%eax\n" /* eax *= 0xC2B2AE35 */
- " mov %%eax, %%ebx\n"
- " shr $16, %%ebx\n"
- " xor %%ebx, %%eax\n" /* eax ^= eax >> 16; eax now holds the result */
- : "=a" (ret) /* output %0: result read back from eax */
-
- : "a" (HASH_SEED), /* input %1: seed preloaded into eax */
- "i" (HASH_MASK1), /* input %2: immediate, first lane multiplier */
- "i" (HASH_MASK2), /* input %3: immediate, second lane multiplier */
- "S" (key), /* input %4: data pointer in esi */
- "c" (length) /* input %5: byte count in ecx */
-
- : "%ebx", /* clobbered: running hash / scratch */
- "%edi" /* clobbered: end pointer */
- );
- return ret;
-}
-#endif /* defined(__GNUC__) && defined(__i386__) */
-
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
-/*
- * Emulates the MSVC __cpuid intrinsic for GCC/MinGW/Clang. It is used to
- * determine whether the SSE route should be taken.
- *
- * lanes: receives the EAX, EBX, ECX and EDX results in lanes[0] to lanes[3].
- * entry: value loaded into EAX before CPUID executes (the leaf to query).
- *
- * NOTE(review): only EAX is supplied as an input; ECX is left unset, so
- * CPUID leaves that read a sub-leaf index from ECX would see an unspecified
- * value. Confirm callers only query leaves that ignore ECX.
- */
-static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
- __asm__ __volatile__ (
- "cpuid"
- : "=a"(lanes[0]), /* EAX result */
- "=b"(lanes[1]), /* EBX result */
- "=c"(lanes[2]), /* ECX result */
- "=d"(lanes[3]) /* EDX result */
-
- : "a" (entry) /* leaf number passed in EAX */
- );
-}
-
-#endif /* defined(__GNUC__) && defined(__i386__) && !defined(__SSE__) */
-