r209846 - Implement AVX1 vbroadcast intrinsics with vector initializers

Adam Nemet anemet at apple.com
Thu May 29 13:47:29 PDT 2014


Author: anemet
Date: Thu May 29 15:47:29 2014
New Revision: 209846

URL: http://llvm.org/viewvc/llvm-project?rev=209846&view=rev
Log:
Implement AVX1 vbroadcast intrinsics with vector initializers

These intrinsics are special because they directly take a memory operand (AVX2
adds the register counterparts).  Typically, other non-memop intrinsics take
registers and then it's left to isel to fold memory operands.

In order to LICM intrinsics directly reading memory, we require that no stores
are in the loop (LICM) or that the folded load accesses constant memory
(MachineLICM).  When neither is the case we fail to hoist a loop-invariant
broadcast.

We can work around this limitation if we expose the load as a regular load and
then just implement the broadcast using the vector initializer syntax.  This
exposes the load to LICM and other optimizations.

At the IR level this is translated into a series of insertelements.  The
sequence is already recognized as a broadcast so there is no impact on the
quality of codegen.

_mm256_broadcast_pd and _mm256_broadcast_ps are not updated by this patch
because right now we lack the DAG-combiner smartness to recover the broadcast
instructions.  This will be tackled in a follow-on.

There will be completing changes on the LLVM side to remove the LLVM
intrinsics and to auto-upgrade bitcode files.

Fixes <rdar://problem/16494520>

Modified:
    cfe/trunk/include/clang/Basic/BuiltinsX86.def
    cfe/trunk/lib/Headers/avxintrin.h
    cfe/trunk/test/CodeGen/avx-shuffle-builtins.c
    cfe/trunk/test/CodeGen/builtins-x86.c

Modified: cfe/trunk/include/clang/Basic/BuiltinsX86.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsX86.def?rev=209846&r1=209845&r2=209846&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsX86.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsX86.def Thu May 29 15:47:29 2014
@@ -453,9 +453,6 @@ BUILTIN(__builtin_ia32_movmskpd256, "iV4
 BUILTIN(__builtin_ia32_movmskps256, "iV8f", "")
 BUILTIN(__builtin_ia32_vzeroall, "v", "")
 BUILTIN(__builtin_ia32_vzeroupper, "v", "")
-BUILTIN(__builtin_ia32_vbroadcastss, "V4ffC*", "")
-BUILTIN(__builtin_ia32_vbroadcastsd256, "V4ddC*", "")
-BUILTIN(__builtin_ia32_vbroadcastss256, "V8ffC*", "")
 BUILTIN(__builtin_ia32_vbroadcastf128_pd256, "V4dV2dC*", "")
 BUILTIN(__builtin_ia32_vbroadcastf128_ps256, "V8fV4fC*", "")
 BUILTIN(__builtin_ia32_storeupd256, "vd*V4d", "")

Modified: cfe/trunk/lib/Headers/avxintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/avxintrin.h?rev=209846&r1=209845&r2=209846&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/avxintrin.h (original)
+++ cfe/trunk/lib/Headers/avxintrin.h Thu May 29 15:47:29 2014
@@ -737,19 +737,22 @@ _mm256_zeroupper(void)
 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_broadcast_ss(float const *__a)
 {
-  return (__m128)__builtin_ia32_vbroadcastss(__a);
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
 }
 
 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
 _mm256_broadcast_sd(double const *__a)
 {
-  return (__m256d)__builtin_ia32_vbroadcastsd256(__a);
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
 }
 
 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
 _mm256_broadcast_ss(float const *__a)
 {
-  return (__m256)__builtin_ia32_vbroadcastss256(__a);
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
 }
 
 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

Modified: cfe/trunk/test/CodeGen/avx-shuffle-builtins.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/avx-shuffle-builtins.c?rev=209846&r1=209845&r2=209846&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/avx-shuffle-builtins.c (original)
+++ cfe/trunk/test/CodeGen/avx-shuffle-builtins.c Thu May 29 15:47:29 2014
@@ -63,3 +63,37 @@ __m256i test_mm256_permute2f128_si256(__
   // CHECK: @llvm.x86.avx.vperm2f128.si.256
   return _mm256_permute2f128_si256(a, b, 0x20);
 }
+
+__m128
+test_mm_broadcast_ss(float const *__a) {
+  // CHECK-LABEL: @test_mm_broadcast_ss
+  // CHECK: insertelement <4 x float> {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, i32 3
+  return _mm_broadcast_ss(__a);
+}
+
+__m256d
+test_mm256_broadcast_sd(double const *__a) {
+  // CHECK-LABEL: @test_mm256_broadcast_sd
+  // CHECK: insertelement <4 x double> {{.*}}, i32 0
+  // CHECK: insertelement <4 x double> {{.*}}, i32 1
+  // CHECK: insertelement <4 x double> {{.*}}, i32 2
+  // CHECK: insertelement <4 x double> {{.*}}, i32 3
+  return _mm256_broadcast_sd(__a);
+}
+
+__m256
+test_mm256_broadcast_ss(float const *__a) {
+  // CHECK-LABEL: @test_mm256_broadcast_ss
+  // CHECK: insertelement <8 x float> {{.*}}, i32 0
+  // CHECK: insertelement <8 x float> {{.*}}, i32 1
+  // CHECK: insertelement <8 x float> {{.*}}, i32 2
+  // CHECK: insertelement <8 x float> {{.*}}, i32 3
+  // CHECK: insertelement <8 x float> {{.*}}, i32 4
+  // CHECK: insertelement <8 x float> {{.*}}, i32 5
+  // CHECK: insertelement <8 x float> {{.*}}, i32 6
+  // CHECK: insertelement <8 x float> {{.*}}, i32 7
+  return _mm256_broadcast_ss(__a);
+}

Modified: cfe/trunk/test/CodeGen/builtins-x86.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/builtins-x86.c?rev=209846&r1=209845&r2=209846&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/builtins-x86.c (original)
+++ cfe/trunk/test/CodeGen/builtins-x86.c Thu May 29 15:47:29 2014
@@ -451,9 +451,6 @@ void f0() {
   tmp_i = __builtin_ia32_movmskps256(tmp_V8f);
   __builtin_ia32_vzeroall();
   __builtin_ia32_vzeroupper();
-  tmp_V4f = __builtin_ia32_vbroadcastss(tmp_fCp);
-  tmp_V4d = __builtin_ia32_vbroadcastsd256(tmp_dCp);
-  tmp_V8f = __builtin_ia32_vbroadcastss256(tmp_fCp);
   tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
   tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
   __builtin_ia32_storeupd256(tmp_dp, tmp_V4d);





More information about the cfe-commits mailing list