/*
 * Copyright (C) 2012, 2013
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <stdint.h> /* uint32_t, uintptr_t */
#include <limits.h> /* UCHAR_MAX */

#include "gmqcc.h"  /* project header (assumed): GMQCC_* attribute macros, PLATFORM_BYTE_ORDER */
/*
 * This is a version of the Murmur3 hashing function optimized for various
 * compilers/architectures. It uses the traditional Murmur2 mix staging,
 * but fixes the weakness in Murmur2's inner mixing loop.
 *
 * Murmur2 contains an inner loop such as:
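 *
 *      (sketched from the reference MurmurHash2 implementation)
 *
 *      while (len >= 4) {
 *          u32 k = *(u32*)data;
 *
 *          k *= m;
 *          k ^= k >> r;
 *          k *= m;
 *
 *          h *= m;
 *          h ^= k;
 *
 *          data += 4;
 *          len  -= 4;
 *      }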
 *
 * When the two u32s that form an eight-byte key are the same value x,
 * this premix stage produces the same result for both words. Unrolled,
 * the two iterations become:
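 *
 *      (illustrative unrolling for a key of two identical words x)
 *
 *      x *= m;
 *      x ^= x >> r;
 *      x *= m;
 *
 *      h *= m;
 *      h ^= x;
 *
 *      h *= m;
 *      h ^= x;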
 *
 * This appears to be fine, except: what happens when m == 1? Well, x
 * cancels out entirely, leaving just:
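 *
 *      x ^= x >> r;
 *
 *      h ^= x;
 *      h ^= x;
 *
 * and the two successive XORs of the same x undo each other, so such a
 * key no longer influences h at all.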
 *
 * So all such keys hash to the same value. But how often does m == 1?
 * Well, it turns out that testing x for all possible values yields only
 * 172,013,942 unique results instead of 2^32, so nearly ~4.6 bits
 * are cancelled out on average!
 *
 * This means we have a 14.5% higher chance of collision. This is where
 * Murmur3 comes in to save the day.
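 *
 * Its per-block round mixes each word with two distinct multiplies and
 * rotates, then updates h non-linearly, so identical words no longer
 * cancel (sketched from the reference Murmur3 32-bit round):
 *
 *      k *= 0xCC9E2D51;
 *      k  = rotl32(k, 15);
 *      k *= 0x1B873593;
 *
 *      h ^= k;
 *      h  = rotl32(h, 13);
 *      h  = h * 5 + 0xE6546B64;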
 *
 * Some rotation tricks:
 *      The MSVC intrinsic shaves off six instructions, while the GCC
 *      inline-assembly one for x86 and amd64 shaves off four. The plain
 *      C rotate is often optimized rather well at -O3, but not at -O2.
 */
#ifdef _MSC_VER
# define HASH_ROTL32(X, Y) _rotl((X), (Y))
#else
static GMQCC_FORCEINLINE uint32_t hash_rotl32(volatile uint32_t x, int8_t r) {
#if defined (__GNUC__) && (defined(__i386__) || defined(__amd64__))
    __asm__ __volatile__ ("roll %1,%0" : "+r"(x) : "c"(r));
    return x;
#else /* ! (defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))) */
    return (x << r) | (x >> (32 - r));
#endif
}
# define HASH_ROTL32(X, Y) hash_rotl32((volatile uint32_t)(X), (Y))
#endif /* !(_MSC_VER) */

static GMQCC_FORCEINLINE uint32_t hash_mix32(uint32_t hash) {
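    /*
     * Standard Murmur3 32-bit finalizer (fmix32): forces avalanche over
     * all hash bits. Constants are those of the reference implementation.
     */
    hash ^= hash >> 16;
    hash *= 0x85EBCA6B;
    hash ^= hash >> 13;
    hash *= 0xC2B2AE35;
    hash ^= hash >> 16;
    return hash;
}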

/*
 * These constants were calculated with SMHasher to determine the best
 * case scenario for Murmur3:
 * http://code.google.com/p/smhasher/
 */
#define HASH_MASK1 0xCC9E2D51
#define HASH_MASK2 0x1B873593
#define HASH_SEED  0x9747B28C

#if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
# define HASH_NATIVE_SAFEREAD(PTR) (*((uint32_t*)(PTR)))
#elif PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_BIG
# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#  define HASH_NATIVE_SAFEREAD(PTR) (__builtin_bswap32(*((uint32_t*)(PTR))))
# endif
#endif

/* Process individual bytes at this point since the endianness isn't known. */
#ifndef HASH_NATIVE_SAFEREAD
# define HASH_NATIVE_SAFEREAD(PTR) ((PTR)[0] | (PTR)[1] << 8 | (PTR)[2] << 16 | (PTR)[3] << 24)
#endif

#define HASH_NATIVE_BLOCK(H, K)          \
    do {                                 \
        K *= HASH_MASK1;                 \
        K  = HASH_ROTL32(K, 15);         \
        K *= HASH_MASK2;                 \
        H ^= K;                          \
        H  = HASH_ROTL32(H, 13);         \
        H  = H * 5 + 0xE6546B64;         \
    } while (0)

#define HASH_NATIVE_BYTES(COUNT, H, C, N, PTR, LENGTH) \
    do {                                               \
        int i = COUNT;                                 \
        while (i--) {                                  \
            C = C >> 8 | *PTR++ << 24;                 \
            N++;                                       \
            LENGTH--;                                  \
            if (N == 4) {                              \
                HASH_NATIVE_BLOCK(H, C);               \
                N = 0;                                 \
            }                                          \
        }                                              \
    } while (0)

/*
 * Highly unrolled at per-carry-bit granularity instead of per-block
 * granularity. This will achieve the highest possible instruction-level
 * parallelism.
 */
static GMQCC_FORCEINLINE void hash_native_process(uint32_t *ph1, uint32_t *carry, const void *key, int length) {
    uint32_t h1 = *ph1;
    uint32_t c  = *carry;

    const uint8_t *ptr = (uint8_t*)key;
    const uint8_t *end;

    /* Extract the carry count from the low 2 bits of the carry value */
    int n = c & 3;

    /*
     * Unaligned word accesses are safe in LE. Thus we can obtain a little
     * more speed by reading whole words directly.
     */
# if PLATFORM_BYTE_ORDER == GMQCC_BYTE_ORDER_LITTLE
    /* Consume carry bits */
    int it = (4 - n) & 3;
    if (it && it <= length)
        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);

    /* word size chunk consumption */
    end = ptr + length/4*4;
    for (; ptr < end; ptr += 4) {
        uint32_t k1 = HASH_NATIVE_SAFEREAD(ptr);
        HASH_NATIVE_BLOCK(h1, k1);
    }
# else
    /*
     * Unsafe to assume unaligned word accesses. Thus we'll need to consume
     * to alignment then process in aligned block chunks.
     */
    uint32_t k1;
    int it = -(long)ptr & 3;
    if (it && it <= length)
        HASH_NATIVE_BYTES(it, h1, c, n, ptr, length);

    /*
     * Alignment has been reached, deal with aligned blocks, specializing for
     * all possible carry counts.
     */
    end = ptr + length / 4 * 4;
    switch (n) {
        case 0: /* no carried bytes */
            for (; ptr < end; ptr += 4) {
                k1 = HASH_NATIVE_SAFEREAD(ptr);
                HASH_NATIVE_BLOCK(h1, k1);
            }
            break;
        case 1: /* one carried byte */
            for (; ptr < end; ptr += 4) {
                k1  = c >> 24;
                c   = HASH_NATIVE_SAFEREAD(ptr);
                k1 |= c << 8;
                HASH_NATIVE_BLOCK(h1, k1);
            }
            break;
        case 2: /* two carried bytes */
            for (; ptr < end; ptr += 4) {
                k1  = c >> 16;
                c   = HASH_NATIVE_SAFEREAD(ptr);
                k1 |= c << 16;
                HASH_NATIVE_BLOCK(h1, k1);
            }
            break;
        case 3: /* three carried bytes */
            for (; ptr < end; ptr += 4) {
                k1  = c >> 8;
                c   = HASH_NATIVE_SAFEREAD(ptr);
                k1 |= c << 24;
                HASH_NATIVE_BLOCK(h1, k1);
            }
            break;
    }
#endif /* misaligned reads */

    /*
     * Having advanced over the data in 32-bit chunks, there may be 1..3
     * bytes of additional trailing content left to process.
     */
    length -= length/4*4;

    HASH_NATIVE_BYTES(length, h1, c, n, ptr, length);

    /* Write the hash and carry state back out for the result stage. */
    *ph1   = h1;
    *carry = (c & ~0xFF) | n;
}

static GMQCC_FORCEINLINE uint32_t hash_native_result(uint32_t hash, uint32_t carry, size_t length) {
    uint32_t k1;
    int n = carry & 3;
    if (GMQCC_LIKELY(n)) {
        k1 = carry >> (4 - n) * 8;
        k1 *= HASH_MASK1;
        k1 = HASH_ROTL32(k1, 15);
        k1 *= HASH_MASK2;
        hash ^= k1;
    }
    hash ^= length;
    hash = hash_mix32(hash);
    return hash;
}

static GMQCC_FORCEINLINE GMQCC_USED uint32_t hash_native(const void *GMQCC_RESTRICT key, size_t length) {
    uint32_t hash  = HASH_SEED;
    uint32_t carry = 0;

    /* Separate calls for the inliner to deal with */
    hash_native_process(&hash, &carry, key, length);
    return hash_native_result(hash, carry, length);
}
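
/*
 * The process/result split also allows hashing a buffer in chunks, with the
 * carry holding any partial trailing word between calls. A sketch (the call
 * site is hypothetical; the caller tracks the total length itself):
 *
 *      uint32_t h = HASH_SEED, c = 0;
 *      hash_native_process(&h, &c, "hello ", 6);
 *      hash_native_process(&h, &c, "world",  5);
 *      h = hash_native_result(h, c, 11);
 */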

static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
    return hash_native(key, length);
}

#define HASH_LEN_ALIGN (sizeof(size_t))
#define HASH_LEN_ONES ((size_t)-1/UCHAR_MAX)
#define HASH_LEN_HIGHS (HASH_LEN_ONES * (UCHAR_MAX / 2 + 1))
#define HASH_LEN_HASZERO(X) (((X)-HASH_LEN_ONES) & ~(X) & HASH_LEN_HIGHS)
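
/*
 * How HASH_LEN_HASZERO spots a NUL byte (worked on a 32-bit word for
 * brevity, where ONES == 0x01010101 and HIGHS == 0x80808080):
 *
 *      X          = 0x61006263          (contains a 0x00 byte)
 *      X - ONES   = 0x5FFF6162
 *      ~X         = 0x9EFF9D9C
 *      result     = 0x5FFF6162 & 0x9EFF9D9C & 0x80808080 = 0x00800000
 *
 * A non-zero result means some byte of X is zero, so the terminator lies
 * within this word.
 */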

size_t hash(const char *key) {
    const char   *s = key;
    const char   *a = s;
    const size_t *w;

    /* Align for fast staging */
    for (; (uintptr_t)s % HASH_LEN_ALIGN; s++) {
        /* Quick stage if terminated before alignment */
        if (!*s)
            return hash_entry(key, s-a);
    }

    /*
     * Efficient staging of words for string length calculation. This is
     * faster than going through the ifunc resolver for a strlen call.
     *
     * On x64 this becomes literally two mask loads and a quick word-sized
     * skip along the string:
     *      movabs $0xFEFEFEFEFEFEFEFE,%r8
     *      movabs $0x8080808080808080,%rsi
     */
    for (w = (const void *)s; !HASH_LEN_HASZERO(*w); w++);
    for (s = (const void *)w; *s; s++);

    return hash_entry(key, s-a);
}
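
/*
 * Typical (hypothetical) call site: selecting a bucket in a power-of-two
 * sized hash table:
 *
 *      size_t bucket = hash(name) & (buckets - 1);
 */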

#undef HASH_LEN_HASZERO
#undef HASH_LEN_HIGHS
#undef HASH_LEN_ONES
#undef HASH_LEN_ALIGN

#undef HASH_NATIVE_BLOCK
#undef HASH_NATIVE_BYTES
#undef HASH_NATIVE_SAFEREAD