[PATCH] [libc++] Remove unaligned type punning from murmurhash/cityhash
Matthew Dempsky
matthew at dempsky.org
Tue Jul 2 14:52:31 PDT 2013
I believe the code below is a conforming C++11 program. However, I'm
fairly sure that with libc++ it performs an unaligned word access (a
problem on strict-alignment architectures) and also involves type
punning (a problem under -fstrict-aliasing).
This is because libc++'s hash<basic_string<...>> calls the
murmurhash/cityhash functions directly on the string's data, and those
functions dereference the memory through size_t pointers. The allocator
below ensures the string's buffer is oddly aligned; and independent of
alignment, because the string data is a char array, dereferencing it as
a size_t or uint32_t array violates strict aliasing.
The attached patch replaces the type punning with memcpy(), which
clang on x86/x86-64 optimizes to direct word accesses anyway.
#include <string>
#include <iostream>
#include <cstdlib>

namespace {

// Allocator that hands out storage offset by one element from what
// malloc returned, so for char the string's data lands at an odd
// address and is never size_t-aligned.
template <typename T>
class SillyAlloc
{
public:
    typedef T value_type;

    SillyAlloc() = default;
    template <typename U> SillyAlloc(const SillyAlloc<U>&) noexcept {}

    T* allocate(std::size_t n)
    {
        T* p = static_cast<T*>(std::malloc((n + 1) * sizeof(T)));
        return p ? p + 1 : nullptr;
    }

    void deallocate(T* p, std::size_t) noexcept
    {
        if (p) std::free(p - 1);
    }
};

typedef std::basic_string<char, std::char_traits<char>, SillyAlloc<char>>
    SillyString;

}

int
main()
{
    SillyString s = "hello";
    std::hash<SillyString> h;
    std::cout << h(s) << std::endl;
}
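
For illustration, here is a minimal standalone sketch of the same
memcpy-based load pattern the patch introduces (load_word is just an
illustrative name, not the identifier used in the patch): copying
sizeof(T) bytes into a local is well-defined regardless of the source's
alignment or declared type, and clang on x86/x86-64 optimizes it to a
direct word access.

#include <cstdint>
#include <cstring>
#include <iostream>

// Read a T from possibly misaligned, char-typed storage.  memcpy is
// defined for any source alignment and does not violate strict
// aliasing; for a small fixed size, clang turns it into a plain load
// on x86/x86-64.
template <class T>
T load_word(const void* p)
{
    T result;
    std::memcpy(&result, p, sizeof(result));
    return result;
}

int main()
{
    char buf[16] = "xabcdefgh";
    // *(const uint32_t*)(buf + 1) would be an unaligned, aliasing-violating
    // load; the memcpy-based helper is well-defined.
    std::uint32_t k = load_word<std::uint32_t>(buf + 1);
    std::cout << std::hex << k << std::endl;
}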
Index: libcxx/include/memory
===================================================================
--- libcxx/include/memory (revision 185441)
+++ libcxx/include/memory (working copy)
@@ -3126,6 +3126,17 @@
template <class _Tp> struct hash;
+namespace {
+template <class _Size>
+inline _Size
+__loadword(const void* __p)
+{
+ _Size __r;
+ std::memcpy(&__r, __p, sizeof(__r));
+ return __r;
+}
+}
+
// We use murmur2 when size_t is 32 bits, and cityhash64 when size_t
// is 64 bits. This is because cityhash64 uses 64bit x 64bit
// multiplication, which can be very slow on 32-bit systems.
@@ -3149,7 +3160,7 @@
const unsigned char* __data = static_cast<const unsigned char*>(__key);
for (; __len >= 4; __data += 4, __len -= 4)
{
- _Size __k = *(const _Size*)__data;
+ _Size __k = __loadword<_Size>(__data);
__k *= __m;
__k ^= __k >> __r;
__k *= __m;
@@ -3208,13 +3219,13 @@
static _Size __hash_len_0_to_16(const char* __s, _Size __len) {
if (__len > 8) {
- const _Size __a = *(const _Size*)__s;
- const _Size __b = *(const _Size*)(__s + __len - 8);
+ const _Size __a = __loadword<_Size>(__s);
+ const _Size __b = __loadword<_Size>(__s + __len - 8);
return __hash_len_16(__a, __rotate_by_at_least_1(__b + __len, __len)) ^ __b;
}
if (__len >= 4) {
- const uint32_t __a = *(const uint32_t*)(__s);
- const uint32_t __b = *(const uint32_t*)(__s + __len - 4);
+ const uint32_t __a = __loadword<uint32_t>(__s);
+ const uint32_t __b = __loadword<uint32_t>(__s + __len - 4);
return __hash_len_16(__len + (__a << 3), __b);
}
if (__len > 0) {
@@ -3230,10 +3241,10 @@
}
static _Size __hash_len_17_to_32(const char *__s, _Size __len) {
- const _Size __a = *(const _Size*)(__s) * __k1;
- const _Size __b = *(const _Size*)(__s + 8);
- const _Size __c = *(const _Size*)(__s + __len - 8) * __k2;
- const _Size __d = *(const _Size*)(__s + __len - 16) * __k0;
+ const _Size __a = __loadword<_Size>(__s) * __k1;
+ const _Size __b = __loadword<_Size>(__s + 8);
+ const _Size __c = __loadword<_Size>(__s + __len - 8) * __k2;
+ const _Size __d = __loadword<_Size>(__s + __len - 16) * __k0;
return __hash_len_16(__rotate(__a - __b, 43) + __rotate(__c, 30) + __d,
__a + __rotate(__b ^ __k3, 20) - __c + __len);
}
@@ -3254,33 +3265,33 @@
// Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty.
static pair<_Size, _Size> __weak_hash_len_32_with_seeds(
const char* __s, _Size __a, _Size __b) {
- return __weak_hash_len_32_with_seeds(*(const _Size*)(__s),
- *(const _Size*)(__s + 8),
- *(const _Size*)(__s + 16),
- *(const _Size*)(__s + 24),
+ return __weak_hash_len_32_with_seeds(__loadword<_Size>(__s),
+ __loadword<_Size>(__s + 8),
+ __loadword<_Size>(__s + 16),
+ __loadword<_Size>(__s + 24),
__a,
__b);
}
// Return an 8-byte hash for 33 to 64 bytes.
static _Size __hash_len_33_to_64(const char *__s, size_t __len) {
- _Size __z = *(const _Size*)(__s + 24);
- _Size __a = *(const _Size*)(__s) +
- (__len + *(const _Size*)(__s + __len - 16)) * __k0;
+ _Size __z = __loadword<_Size>(__s + 24);
+ _Size __a = __loadword<_Size>(__s) +
+ (__len + __loadword<_Size>(__s + __len - 16)) * __k0;
_Size __b = __rotate(__a + __z, 52);
_Size __c = __rotate(__a, 37);
- __a += *(const _Size*)(__s + 8);
+ __a += __loadword<_Size>(__s + 8);
__c += __rotate(__a, 7);
- __a += *(const _Size*)(__s + 16);
+ __a += __loadword<_Size>(__s + 16);
_Size __vf = __a + __z;
_Size __vs = __b + __rotate(__a, 31) + __c;
- __a = *(const _Size*)(__s + 16) + *(const _Size*)(__s + __len - 32);
- __z += *(const _Size*)(__s + __len - 8);
+ __a = __loadword<_Size>(__s + 16) + __loadword<_Size>(__s + __len - 32);
+ __z += __loadword<_Size>(__s + __len - 8);
__b = __rotate(__a + __z, 52);
__c = __rotate(__a, 37);
- __a += *(const _Size*)(__s + __len - 24);
+ __a += __loadword<_Size>(__s + __len - 24);
__c += __rotate(__a, 7);
- __a += *(const _Size*)(__s + __len - 16);
+ __a += __loadword<_Size>(__s + __len - 16);
_Size __wf = __a + __z;
_Size __ws = __b + __rotate(__a, 31) + __c;
_Size __r = __shift_mix((__vf + __ws) * __k2 + (__wf + __vs) * __k0);
@@ -3306,26 +3317,26 @@
// For strings over 64 bytes we hash the end first, and then as we
// loop we keep 56 bytes of state: v, w, x, y, and z.
- _Size __x = *(const _Size*)(__s + __len - 40);
- _Size __y = *(const _Size*)(__s + __len - 16) +
- *(const _Size*)(__s + __len - 56);
- _Size __z = __hash_len_16(*(const _Size*)(__s + __len - 48) + __len,
- *(const _Size*)(__s + __len - 24));
+ _Size __x = __loadword<_Size>(__s + __len - 40);
+ _Size __y = __loadword<_Size>(__s + __len - 16) +
+ __loadword<_Size>(__s + __len - 56);
+ _Size __z = __hash_len_16(__loadword<_Size>(__s + __len - 48) + __len,
+ __loadword<_Size>(__s + __len - 24));
pair<_Size, _Size> __v = __weak_hash_len_32_with_seeds(__s + __len - 64, __len, __z);
pair<_Size, _Size> __w = __weak_hash_len_32_with_seeds(__s + __len - 32, __y + __k1, __x);
- __x = __x * __k1 + *(const _Size*)(__s);
+ __x = __x * __k1 + __loadword<_Size>(__s);
// Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
__len = (__len - 1) & ~static_cast<_Size>(63);
do {
- __x = __rotate(__x + __y + __v.first + *(const _Size*)(__s + 8), 37) * __k1;
- __y = __rotate(__y + __v.second + *(const _Size*)(__s + 48), 42) * __k1;
+ __x = __rotate(__x + __y + __v.first + __loadword<_Size>(__s + 8), 37) * __k1;
+ __y = __rotate(__y + __v.second + __loadword<_Size>(__s + 48), 42) * __k1;
__x ^= __w.second;
- __y += __v.first + *(const _Size*)(__s + 40);
+ __y += __v.first + __loadword<_Size>(__s + 40);
__z = __rotate(__z + __w.first, 33) * __k1;
__v = __weak_hash_len_32_with_seeds(__s, __v.second * __k1, __x + __w.first);
__w = __weak_hash_len_32_with_seeds(__s + 32, __z + __w.second,
- __y + *(const _Size*)(__s + 16));
+ __y + __loadword<_Size>(__s + 16));
std::swap(__z, __x);
__s += 64;
__len -= 64;