r187716 - Use a shuffle with undef elements instead of inserting 0s in the 128-bit to 256-bit casting intrinsics to improve performance. Thanks to Katya Romanova for identifying this issue.

Sun Aug 4 23:17:21 PDT 2013

Author: ctopper
Date: Mon Aug  5 01:17:21 2013
New Revision: 187716

URL: http://llvm.org/viewvc/llvm-project?rev=187716&view=rev
Log:
Use a shuffle with undef elements instead of inserting 0s in the 128-bit to 256-bit casting intrinsics to improve performance. Thanks to Katya Romanova for identifying this issue.

Modified:
    cfe/trunk/lib/Headers/avxintrin.h

Modified: cfe/trunk/lib/Headers/avxintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/avxintrin.h?rev=187716&r1=187715&r2=187716&view=diff
==============================================================================

--- cfe/trunk/lib/Headers/avxintrin.h (original)
+++ cfe/trunk/lib/Headers/avxintrin.h Mon Aug  5 01:17:21 2013
@@ -1137,22 +1137,19 @@ _mm256_castsi256_si128(__m256i __a)
 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
 _mm256_castpd128_pd256(__m128d __a)
 {
-  __m128d __zero = _mm_setzero_pd();
-  return __builtin_shufflevector(__a, __zero, 0, 1, 2, 2);
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
 }
 
 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
 _mm256_castps128_ps256(__m128 __a)
 {
-  __m128 __zero = _mm_setzero_ps();
-  return __builtin_shufflevector(__a, __zero, 0, 1, 2, 3, 4, 4, 4, 4);
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
 }
 
 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
 _mm256_castsi128_si256(__m128i __a)
 {
-  __m128i __zero = _mm_setzero_si128();
-  return __builtin_shufflevector(__a, __zero, 0, 1, 2, 2);
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
 }
 
 /* SIMD load ops (unaligned) */