libs/container/hashfunc.h

   1 /*
   2    Copyright (C) 2001-2006, William Joseph.
   3    All Rights Reserved.
   4
   5    This file is part of GtkRadiant.
   6
   7    GtkRadiant is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2 of the License, or
  10    (at your option) any later version.
  11
  12    GtkRadiant is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GtkRadiant; if not, write to the Free Software
  19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #if !defined( INCLUDED_CONTAINER_HASHFUNC_H )
  23 #define INCLUDED_CONTAINER_HASHFUNC_H
  24
  25 #include <cctype>
  26 #include "string/string.h"
  27 #include "container/array.h"
  28 typedef  unsigned long int ub4;     /* unsigned 4-byte quantities */
  29 typedef  unsigned char ub1;
  30
  31 inline ub1 ub1_as_ub1_nocase( ub1 byte ){
  32         return std::tolower( byte );
  33 }
  34
  35 inline ub4 ub1x4_as_ub4_nocase( const ub1 bytes[4] ){
  36         ub4 result;
  37         reinterpret_cast<ub1*>( &result )[0] = ub1_as_ub1_nocase( bytes[0] );
  38         reinterpret_cast<ub1*>( &result )[1] = ub1_as_ub1_nocase( bytes[1] );
  39         reinterpret_cast<ub1*>( &result )[2] = ub1_as_ub1_nocase( bytes[2] );
  40         reinterpret_cast<ub1*>( &result )[3] = ub1_as_ub1_nocase( bytes[3] );
  41         return result;
  42 }
  43
  44 class ub1_default_traits
  45 {
  46 public:
  47 static ub1 as_ub1( ub1 byte ){
  48         return byte;
  49 }
  50 };
  51
  52 class ub1_nocase_traits
  53 {
  54 public:
  55 static ub1 as_ub1( ub1 byte ){
  56         return ub1_as_ub1_nocase( byte );
  57 }
  58 };
  59
  60 class ub1x4_default_traits
  61 {
  62 public:
  63 static ub4 as_ub4( const ub1 bytes[4] ){
  64         return *reinterpret_cast<const ub4*>( bytes );
  65 }
  66 };
  67
  68 class ub1x4_nocase_traits
  69 {
  70 public:
  71 static ub4 as_ub4( const ub1 bytes[4] ){
  72         return ub1x4_as_ub4_nocase( bytes );
  73 }
  74 };
  75
  76 class ub4_default_traits
  77 {
  78 public:
  79 static ub4 as_ub4( ub4 i ){
  80         return i;
  81 }
  82 };
  83
  84 class ub4_nocase_traits
  85 {
  86 public:
  87 static ub4 as_ub4( ub4 i ){
  88         return ub1x4_as_ub4_nocase( reinterpret_cast<const ub1*>( &i ) );
  89 }
  90 };
  91
  92 // lookup2.c
  93 // By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use this
  94 // code any way you wish, private, educational, or commercial.  It's free.
  95
  96 #define hashsize( n ) ( (ub4)1 << ( n ) )
  97 #define hashmask( n ) ( hashsize( n ) - 1 )
  98
  99 /*
 100    --------------------------------------------------------------------
 101    mix -- mix 3 32-bit values reversibly.
   For every delta with one or two bits set, and the deltas of all three
 103    high bits or all three low bits, whether the original value of a,b,c
 104    is almost all zero or is uniformly distributed,
 105  * If mix() is run forward or backward, at least 32 bits in a,b,c
 106    have at least 1/4 probability of changing.
 107  * If mix() is run forward, every bit of c will change between 1/3 and
 108    2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
 109    mix() was built out of 36 single-cycle latency instructions in a
   structure that could support 2x parallelism, like so:
 111       a -= b;
 112       a -= c; x = (c>>13);
 113       b -= c; a ^= x;
 114       b -= a; x = (a<<8);
 115       c -= a; b ^= x;
 116       c -= b; x = (b>>13);
 117       ...
 118    Unfortunately, superscalar Pentiums and Sparcs can't take advantage
 119    of that parallelism.  They've also turned some of those single-cycle
 120    latency instructions into multi-cycle latency instructions.  Still,
 121    this is the fastest good hash I could find.  There were about 2^^68
 122    to choose from.  I only looked at a billion or so.
 123    --------------------------------------------------------------------
 124  */
 125 #define mix( a,b,c ) \
 126         { \
 127                 a -= b; a -= c; a ^= ( c >> 13 ); \
 128                 b -= c; b -= a; b ^= ( a << 8 ); \
 129                 c -= a; c -= b; c ^= ( b >> 13 ); \
 130                 a -= b; a -= c; a ^= ( c >> 12 );  \
 131                 b -= c; b -= a; b ^= ( a << 16 ); \
 132                 c -= a; c -= b; c ^= ( b >> 5 ); \
 133                 a -= b; a -= c; a ^= ( c >> 3 );  \
 134                 b -= c; b -= a; b ^= ( a << 10 ); \
 135                 c -= a; c -= b; c ^= ( b >> 15 ); \
 136         }
 137
 138 /* same, but slower, works on systems that might have 8 byte ub4's */
 139 #define mix2( a,b,c ) \
 140         { \
 141                 a -= b; a -= c; a ^= ( c >> 13 ); \
 142                 b -= c; b -= a; b ^= ( a << 8 ); \
 143                 c -= a; c -= b; c ^= ( ( b & 0xffffffff ) >> 13 ); \
 144                 a -= b; a -= c; a ^= ( ( c & 0xffffffff ) >> 12 ); \
 145                 b -= c; b -= a; b = ( b ^ ( a << 16 ) ) & 0xffffffff; \
 146                 c -= a; c -= b; c = ( c ^ ( b >> 5 ) ) & 0xffffffff; \
 147                 a -= b; a -= c; a = ( a ^ ( c >> 3 ) ) & 0xffffffff; \
 148                 b -= c; b -= a; b = ( b ^ ( a << 10 ) ) & 0xffffffff; \
 149                 c -= a; c -= b; c = ( c ^ ( b >> 15 ) ) & 0xffffffff; \
 150         }
 151
 152 /*
 153    --------------------------------------------------------------------
 154    hash() -- hash a variable-length key into a 32-bit value
 155    k     : the key (the unaligned variable-length array of bytes)
 156    len   : the length of the key, counting by bytes
 157    level : can be any 4-byte value
 158    Returns a 32-bit value.  Every bit of the key affects every bit of
 159    the return value.  Every 1-bit and 2-bit delta achieves avalanche.
 160    About 36+6len instructions.
 161
 162    The best hash table sizes are powers of 2.  There is no need to do
 163    mod a prime (mod is sooo slow!).  If you need less than 32 bits,
 164    use a bitmask.  For example, if you need only 10 bits, do
 165    h = (h & hashmask(10));
 166    In which case, the hash table should have hashsize(10) elements.
 167
 168    If you are hashing n strings (ub1 **)k, do it like this:
 169    for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
 170
   See http://burtleburtle.net/bob/hash/evahash.html
 172    Use for hash table lookup, or anything where one collision in 2^32 is
 173    acceptable.  Do NOT use for cryptographic purposes.
 174    --------------------------------------------------------------------
 175  */
 176
 177 template<typename UB1Traits, typename UB4x1Traits>
 178 inline ub4 hash(
 179         const ub1 *k,    /* the key */
 180         ub4 length, /* the length of the key */
 181         ub4 initval, /* the previous hash, or an arbitrary value */
 182         const UB1Traits& ub1traits,
 183         const UB4x1Traits& ub4x1traits
 184         ){
 185         ub4 a,b,c,len;
 186
 187         /* Set up the internal state */
 188         len = length;
 189         a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
 190         c = initval;          /* the previous hash value */
 191
 192         /*---------------------------------------- handle most of the key */
 193         while ( len >= 12 )
 194         {
 195                 a += ( k[0] + ( ( ub4 ) UB1Traits::as_ub1( k[1] ) << 8 ) + ( ( ub4 ) UB1Traits::as_ub1( k[2] ) << 16 ) + ( ( ub4 ) UB1Traits::as_ub1( k[3] ) << 24 ) );
 196                 b += ( k[4] + ( ( ub4 ) UB1Traits::as_ub1( k[5] ) << 8 ) + ( ( ub4 ) UB1Traits::as_ub1( k[6] ) << 16 ) + ( ( ub4 ) UB1Traits::as_ub1( k[7] ) << 24 ) );
 197                 c += ( k[8] + ( ( ub4 ) UB1Traits::as_ub1( k[9] ) << 8 ) + ( ( ub4 ) UB1Traits::as_ub1( k[10] ) << 16 ) + ( ( ub4 ) UB1Traits::as_ub1( k[11] ) << 24 ) );
 198                 mix( a,b,c );
 199                 k += 12; len -= 12;
 200         }
 201
 202         /*------------------------------------- handle the last 11 bytes */
 203         c += length;
 204         switch ( len )          /* all the case statements fall through */
 205         {
 206 #if defined(__GNUC__) && __GNUC__ < 7
 207         case 11: c += ( ( ub4 ) UB1Traits::as_ub1( k[10] ) << 24 ); /* fall through */
 208         case 10: c += ( ( ub4 ) UB1Traits::as_ub1( k[9] ) << 16 ); /* fall through */
 209         case 9: c += ( ( ub4 ) UB1Traits::as_ub1( k[8] ) << 8 ); /* fall through */
 210         /* the first byte of c is reserved for the length */
 211         case 8: b += ( ( ub4 ) UB1Traits::as_ub1( k[7] ) << 24 ); /* fall through */
 212         case 7: b += ( ( ub4 ) UB1Traits::as_ub1( k[6] ) << 16 ); /* fall through */
 213         case 6: b += ( ( ub4 ) UB1Traits::as_ub1( k[5] ) << 8 ); /* fall through */
 214         case 5: b += UB1Traits::as_ub1( k[4] );  /* fall through */
 215         case 4: a += ( ( ub4 ) UB1Traits::as_ub1( k[3] ) << 24 ); /* fall through */
 216         case 3: a += ( ( ub4 ) UB1Traits::as_ub1( k[2] ) << 16 ); /* fall through */
 217         case 2: a += ( ( ub4 ) UB1Traits::as_ub1( k[1] ) << 8 ); /* fall through */
 218         case 1: a += UB1Traits::as_ub1( k[0] );
 219 #else
 220         case 11: c += ( ( ub4 ) UB1Traits::as_ub1( k[10] ) << 24 ); __attribute((fallthrough));
 221         case 10: c += ( ( ub4 ) UB1Traits::as_ub1( k[9] ) << 16 ); __attribute((fallthrough));
 222         case 9: c += ( ( ub4 ) UB1Traits::as_ub1( k[8] ) << 8 ); __attribute((fallthrough));
 223         /* the first byte of c is reserved for the length */
 224         case 8: b += ( ( ub4 ) UB1Traits::as_ub1( k[7] ) << 24 ); __attribute((fallthrough));
 225         case 7: b += ( ( ub4 ) UB1Traits::as_ub1( k[6] ) << 16 ); __attribute((fallthrough));
 226         case 6: b += ( ( ub4 ) UB1Traits::as_ub1( k[5] ) << 8 ); __attribute((fallthrough));
 227         case 5: b += UB1Traits::as_ub1( k[4] );  __attribute((fallthrough));
 228         case 4: a += ( ( ub4 ) UB1Traits::as_ub1( k[3] ) << 24 ); __attribute((fallthrough));
 229         case 3: a += ( ( ub4 ) UB1Traits::as_ub1( k[2] ) << 16 ); __attribute((fallthrough));
 230         case 2: a += ( ( ub4 ) UB1Traits::as_ub1( k[1] ) << 8 ); __attribute((fallthrough));
 231         case 1: a += UB1Traits::as_ub1( k[0] );
 232 #endif
 233                 /* case 0: nothing left to add */
 234         }
 235         mix( a,b,c );
 236         /*-------------------------------------------- report the result */
 237         return c;
 238 }
 239
 240 /*
 241    --------------------------------------------------------------------
 242    This works on all machines.  hash2() is identical to hash() on
 243    little-endian machines, except that the length has to be measured
 244    in ub4s instead of bytes.  It is much faster than hash().  It
 245    requires
 246    -- that the key be an array of ub4's, and
 247    -- that all your machines have the same endianness, and
 248    -- that the length be the number of ub4's in the key
 249    --------------------------------------------------------------------
 250  */
 251 template<typename UB4Traits>
 252 inline ub4 hash2(
 253         const ub4 *k,    /* the key */
 254         ub4 length, /* the length of the key, in ub4s */
 255         ub4 initval, /* the previous hash, or an arbitrary value */
 256         const UB4Traits& ub4traits
 257         ){
 258         ub4 a,b,c,len;
 259
 260         /* Set up the internal state */
 261         len = length;
 262         a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
 263         c = initval;          /* the previous hash value */
 264
 265         /*---------------------------------------- handle most of the key */
 266         while ( len >= 3 )
 267         {
 268                 a += UB4Traits::as_ub4( k[0] );
 269                 b += UB4Traits::as_ub4( k[1] );
 270                 c += UB4Traits::as_ub4( k[2] );
 271                 mix( a,b,c );
 272                 k += 3; len -= 3;
 273         }
 274
 275         /*-------------------------------------- handle the last 2 ub4's */
 276         c += length;
 277         switch ( len )          /* all the case statements fall through */
 278         {
 279         /* c is reserved for the length */
 280         case 2: b += UB4Traits::as_ub4( k[1] );
 281         case 1: a += UB4Traits::as_ub4( k[0] );
 282                 /* case 0: nothing left to add */
 283         }
 284         mix( a,b,c );
 285         /*-------------------------------------------- report the result */
 286         return c;
 287 }
 288
/// \brief Type of the hash values returned by the hash_* helpers below; an alias of ub4.
typedef ub4 hash_t;
 290
 291 inline hash_t hash_ub1( const ub1* key, std::size_t len, hash_t previous = 0 ){
 292         return hash( key, ub4( len ), previous, ub1_default_traits(), ub1x4_default_traits() );
 293 }
 294
 295 inline hash_t hash_ub1_nocase( const ub1* key, std::size_t len, hash_t previous = 0 ){
 296         return hash( key, ub4( len ), previous, ub1_nocase_traits(), ub1x4_nocase_traits() );
 297 }
 298
 299 template<typename UB4Traits>
 300 inline hash_t hash_ub4( const ub4* key, std::size_t len, const UB4Traits& traits, hash_t previous = 0 ){
 301         return hash2( key,ub4( len ), previous, traits );
 302 }
 303
 304 inline ub4 hash_combine( ub4 left, ub4 right ){
 305         return hash_ub1( reinterpret_cast<const ub1*>( &left ), 4, right );
 306 }
 307
 308 template<typename POD>
 309 inline hash_t pod_hash( const POD& pod ){
 310         return hash_ub1( reinterpret_cast<const ub1*>( &pod ), sizeof( POD ) );
 311 }
 312
 313 inline hash_t string_hash( const char* string, hash_t previous = 0 ){
 314         return hash_ub1( reinterpret_cast<const ub1*>( string ), string_length( string ), previous );
 315 }
 316
 317 inline hash_t string_hash_nocase( const char* string, hash_t previous = 0 ){
 318         return hash_ub1_nocase( reinterpret_cast<const ub1*>( string ), string_length( string ), previous );
 319 }
 320
 321 struct RawStringHash
 322 {
 323         typedef hash_t hash_type;
 324         hash_type operator()( const char* string ) const {
 325                 return string_hash( string );
 326         }
 327 };
 328
 329 struct HashString
 330 {
 331         typedef hash_t hash_type;
 332         hash_type operator()( const CopiedString& string ) const {
 333                 return string_hash( string.c_str() );
 334         }
 335 };
 336
 337 struct HashStringNoCase
 338 {
 339         typedef hash_t hash_type;
 340         hash_type operator()( const CopiedString& string ) const {
 341                 return string_hash_nocase( string.c_str() );
 342         }
 343 };
 344
 345 /// \brief Length of a string in ub4.
 346 /// "wibble" (6) gives 2,
 347 /// "and" (3) gives 1,
 348 /// "bleh" (4) gives 2
 349 inline std::size_t string_length_ub4( const char* string ){
 350         return ( ( string_length( string ) >> 2 ) + 1 ) << 2;
 351 }
 352
 353 /// \brief Hashable key type that stores a string as an array of ub4 - making hashing faster.
 354 /// Also caches the 32-bit result of the hash to speed up comparison of keys.
 355 template<typename UB4Traits = ub4_default_traits>
 356 class HashKey
 357 {
 358 Array<ub4> m_key;
 359 hash_t m_hash;
 360
 361 void copy( const HashKey& other ){
 362         std::copy( other.m_key.begin(), other.m_key.end(), m_key.begin() );
 363         m_hash = other.m_hash;
 364 }
 365 void copy( const char* string ){
 366         strncpy( reinterpret_cast<char*>( m_key.data() ), string, m_key.size() );
 367         for ( Array<ub4>::iterator i = m_key.begin(); i != m_key.end(); ++i )
 368         {
 369                 *i = UB4Traits::as_ub4( *i );
 370         }
 371         m_hash = hash_ub4( m_key.data(), m_key.size(), ub4_default_traits() );
 372 }
 373 bool equal( const HashKey& other ) const {
 374         return m_hash == other.m_hash && m_key.size() == other.m_key.size()
 375                    && std::equal( m_key.begin(), m_key.end(), other.m_key.begin() );
 376 }
 377
 378 public:
 379 HashKey( const HashKey& other ) : m_key( other.m_key.size() ){
 380         copy( other );
 381 }
 382 HashKey( const char* string ) : m_key( string_length_ub4( string ) ){
 383         copy( string );
 384 }
 385 HashKey& operator=( const char* string ){
 386         m_key.resize( string_length_ub4( string ) );
 387         copy( string );
 388         return *this;
 389 }
 390 bool operator==( const HashKey& other ) const {
 391         return equal( other );
 392 }
 393 bool operator!=( const HashKey& other ) const {
 394         return !equal( other );
 395 }
 396 hash_t hash() const {
 397         return m_hash;
 398 }
 399 #if 0
 400 const char* c_str() const {
 401         return reinterpret_cast<const char*>( m_key.data() );
 402 }
 403 #endif
 404 };
 405
 406 /// \brief Hash function to use with HashKey.
 407 struct HashKeyHasher
 408 {
 409         typedef hash_t hash_type;
 410         hash_type operator()( const HashKey<ub4_default_traits>& key ) const {
 411                 return key.hash();
 412         }
 413 };
 414
 415
 416
 417 #endif