From: Dale Weiler
Date: Sat, 14 Dec 2013 22:30:51 +0000 (-0500)
Subject: Remove SSE hash, it's just too much effort to maintain.
X-Git-Tag: xonotic-v0.8.0~29
X-Git-Url: https://git.xonotic.org/?p=xonotic%2Fgmqcc.git;a=commitdiff_plain;h=f24bdced104d84598b9eb9fa9132a24e67477518

Remove SSE hash, it's just too much effort to maintain.
---

diff --git a/hash.c b/hash.c
index 276817e..6ef3847 100644
--- a/hash.c
+++ b/hash.c
@@ -261,149 +261,8 @@ static GMQCC_FORCEINLINE GMQCC_USED uint32_t hash_native(const void *GMQCC_RESTR
     return hash_native_result(hash, carry, length);
 }
 
-/*
- * Inline assembly optimized SSE version for when SSE is present via CPUID
- * or the host compiler has __SSE__. This is about 16 cycles faster than
- * native at -O2 for GCC and 11 cycles for -O3.
- *
- * Tested with -m32 on a Phenom II X4 with:
- * gcc version 4.8.1 20130725 (prerelease) (GCC)
- */
-#if defined(__GNUC__) && defined(__i386__)
-static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) {
-    uint32_t ret;
-    __asm__ __volatile__ (
-        "    mov %%eax, %%ebx\n"
-        "    mov %2, %%eax\n"
-        "    movd %%eax, %%xmm7\n"
-        "    shufps $0, %%xmm7, %%xmm7\n"
-        "    mov %3, %%eax\n"
-        "    movd %%eax, %%xmm6\n"
-        "    shufps $0, %%xmm6, %%xmm6\n"
-        "    lea (%%esi, %%ecx, 1), %%edi\n"
-        "    jmp 2f\n"
-        "1:\n"
-        "    movaps (%%esi), %%xmm0\n"
-        "    pmulld %%xmm7, %%xmm0\n"
-        "    movaps %%xmm0, %%xmm2\n"
-        "    pslld $15, %%xmm0\n"
-        "    psrld $17, %%xmm2\n"
-        "    orps %%xmm2, %%xmm0\n"
-        "    pmulld %%xmm6, %%xmm0\n"
-        "    movd %%xmm0, %%eax\n"
-        "    xor %%eax, %%ebx\n"
-        "    rol $13, %%ebx\n"
-        "    imul $5, %%ebx\n"
-        "    add $0xE6546B64, %%ebx\n"
-        "    shufps $0x39, %%xmm0, %%xmm0\n"
-        "    movd %%xmm0, %%eax\n"
-        "    xor %%eax, %%ebx\n"
-        "    rol $13, %%ebx\n"
-        "    imul $5, %%ebx\n"
-        "    add $0xE6546B64, %%ebx\n"
-        "    shufps $0x39, %%xmm0, %%xmm0\n"
-        "    movd %%xmm0, %%eax\n"
-        "    xor %%eax, %%ebx\n"
-        "    rol $13, %%ebx\n"
-        "    imul $5, %%ebx\n"
-        "    add $0xE6546B64, %%ebx\n"
-        "    shufps $0x39, %%xmm0, %%xmm0\n"
-        "    movd %%xmm0, %%eax\n"
-        "    xor %%eax, %%ebx\n"
-        "    rol $13, %%ebx\n"
-        "    imul $5, %%ebx\n"
-        "    add $0xE6546B64, %%ebx\n"
-        "    add $16, %%esi\n"
-        "2:\n"
-        "    cmp %%esi, %%edi\n"
-        "    jne 1b\n"
-        "    xor %%ecx, %%ebx\n"
-        "    mov %%ebx, %%eax\n"
-        "    shr $16, %%ebx\n"
-        "    xor %%ebx, %%eax\n"
-        "    imul $0x85EBCA6b, %%eax\n"
-        "    mov %%eax, %%ebx\n"
-        "    shr $13, %%ebx\n"
-        "    xor %%ebx, %%eax\n"
-        "    imul $0xC2B2AE35, %%eax\n"
-        "    mov %%eax, %%ebx\n"
-        "    shr $16, %%ebx\n"
-        "    xor %%ebx, %%eax\n"
-        : "=a" (ret)
-
-        : "a" (HASH_SEED),
-          "i" (HASH_MASK1),
-          "i" (HASH_MASK2),
-          "S" (key),
-          "c" (length)
-
-        : "%ebx",
-          "%edi"
-    );
-    return ret;
-}
-#endif
-
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
-/*
- * Emulate MSVC _cpuid intrinsic for GCC/MinGW/Clang, this will be used
- * to determine if we should use the SSE route.
- */
-static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
-    __asm__ __volatile__ (
-        "cpuid"
-        : "=a"(lanes[0]),
-          "=b"(lanes[1]),
-          "=c"(lanes[2]),
-          "=d"(lanes[3])
-
-        : "a" (entry)
-    );
-}
-
-#endif /* !(defined(__GNUC__) && defined(__i386__) */
-
 static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
-/*
- * No host SSE instruction set assumed do runtime test instead. This
- * is for MinGW32 mostly which doesn't define SSE.
- */
-#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
-    static bool memoize = false;
-    static bool sse = false;
-
-    if (GMQCC_UNLIKELY(!memoize)) {
-        /*
-         * Only calculate SSE one time, thus it's unlikely that this branch
-         * is taken more than once.
-         */
-        static int lanes[4];
-        hash_cpuid(lanes, 0);
-        /*
-         * It's very likely that lanes[0] will contain a value unless it
-         * isn't a modern x86.
-         */
-        if (GMQCC_LIKELY(*lanes >= 1))
-            sse = (lanes[3] & ((int)1 << 25)) != 0;
-        memoize = true;
-    }
-
-    return (GMQCC_LIKELY(sse))
-               ? hash_sse(key, length)
-               : hash_native(key, length);
-/*
- * Same as above but this time host compiler was defined with SSE support.
- * This handles MinGW32 builds for i686+
- */
-#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__)
-    return hash_sse(key, length);
-#else
-    /*
-     * Go the native route which itself is highly optimized as well for
-     * unaligned load/store when dealing with LE.
-     */
     return hash_native(key, length);
-#endif
 }
 
 #define HASH_LEN_ALIGN (sizeof(size_t))
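The runtime dispatch this commit removes amounted to reading CPUID leaf 1 and testing bit 25 of EDX before choosing hash_sse() over hash_native(). A minimal sketch of that same check, assuming GCC/Clang's <cpuid.h> (the __get_cpuid() call and the bit_SSE constant come from that header, not from gmqcc, and have_sse() is just an illustrative name):

    #include <stdbool.h>
    #include <cpuid.h>  /* __get_cpuid(), bit_SSE -- GCC/Clang on x86 only */

    /* Returns true when CPUID leaf 1 reports SSE (EDX bit 25), the same test
     * the removed hash_cpuid()/hash_entry() pair performed at runtime. */
    static bool have_sse(void) {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return false; /* CPUID leaf 1 unavailable: assume no SSE */
        return (edx & bit_SSE) != 0;
    }

With the SSE path gone, hash_entry() no longer needs any detection and simply calls hash_native() unconditionally.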