[PATCH] Inefficient code generation for 128-bit->256-bit typecast intrinsics (BZ #15712)

Thu Jul 18 17:32:20 PDT 2013

kromanova added you to the CC list for the revision "Inefficient code generation for 128-bit->256-bit typecast intrinsics (BZ #15712)".

Fix for BZ #15712 
http://lists.cs.uiuc.edu/pipermail/llvmbugs/2013-April/027885.html

As it stands right now, when Clang compiles 128-bit -> 256-bit typecasts (e.g., the _mm256_castsi128_si256 intrinsic), it zeroes out the upper 128 bits of the 256-bit register. As a result, Clang generates two additional instructions:

vxorps xmm2,xmm2,xmm2 
vinsertf128 ymm0,ymm2,xmm0,0x0 

Most of the industry-standard C/C++ compilers (GCC, Intel’s compiler, Visual Studio compiler) don’t generate any extra moves for the typecast intrinsics _mm256_castsi128_si256, _mm256_castps128_ps256, and _mm256_castpd128_pd256, and don’t zero-extend the upper 128 bits of the 256-bit YMM register. The upper 128 bits are left undefined.

I am proposing a fix for the poor code generation in Clang: generate calls to builtins for the typecast intrinsics in avxintrin.h, and then lower these builtins in CGBuiltin.cpp to shufflevector, using an “undef” value to fill in the upper 128 bits.

I added a new testcase “avx-cast-builtins.c” to monitor code generation. This testcase checks that Clang emits a shufflevector with undefined upper 128 bits (and therefore no additional zeroing instructions) for _mm256_castsi128_si256, _mm256_castps128_ps256, and _mm256_castpd128_pd256.


http://llvm-reviews.chandlerc.com/D1141

Files:
  test/CodeGen/avx-cast-builtins.c
  test/CodeGen/builtins-x86.c
  include/clang/Basic/BuiltinsX86.def
  lib/CodeGen/CGBuiltin.cpp
  lib/Headers/avxintrin.h

Index: test/CodeGen/avx-cast-builtins.c
===================================================================

--- test/CodeGen/avx-cast-builtins.c
+++ test/CodeGen/avx-cast-builtins.c
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 %s -O2 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+//
+// Test LLVM IR codegen of cast instructions
+//
+
+__m256i si256( __m128i vec03)
+{
+  // Check that generate a shufflevector with undefined upper 128 bits, 
+  // instead of a shuffle vector with zero upper 128 bits. 
+  // CHECK: define <4 x i64> @si256(<2 x i64> %vec03)
+  // CHECK: %cast128_256.i = shufflevector <4 x i32> %0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: ret <4 x i64> %1
+  return _mm256_castsi128_si256(vec03);
+}
+
+__m256 ps256( __m128 vec03)
+{
+  // Check that generate a shufflevector with undefined upper 128 bits,
+  // instead of a shuffle vector with zero upper 128 bits.
+  // CHECK: define <8 x float> @ps256(<4 x float> %vec03)
+  // CHECK: %cast128_256.i = shufflevector <4 x float> %vec03, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: ret <8 x float> %cast128_256.i
+  return _mm256_castps128_ps256(vec03);
+}
+
+__m256d pd256( __m128d vec03)
+{
+  // Check that generate a shufflevector with undefined upper 128 bits,
+  // instead of a shuffle vector with zero upper 128 bits.
+  // CHECK: define <4 x double> @pd256(<2 x double> %vec03)
+  // CHECK: %cast128pd_256pd.i = shufflevector <2 x double> %vec03, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: ret <4 x double> %cast128pd_256pd.i
+  return _mm256_castpd128_pd256(vec03);
+}
+
Index: test/CodeGen/builtins-x86.c
===================================================================
--- test/CodeGen/builtins-x86.c
+++ test/CodeGen/builtins-x86.c
@@ -425,6 +425,9 @@
   tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f);
   tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1);
   tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1);
+  tmp_V8i = __builtin_ia32_si256_si(tmp_V4i);
+  tmp_V8f = __builtin_ia32_ps256_ps(tmp_V4f);
+  tmp_V4d = __builtin_ia32_pd256_pd(tmp_V2d);
   tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d);
   tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d);
   tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d);
Index: include/clang/Basic/BuiltinsX86.def
===================================================================
--- include/clang/Basic/BuiltinsX86.def
+++ include/clang/Basic/BuiltinsX86.def
@@ -432,6 +432,9 @@
 BUILTIN(__builtin_ia32_rcpps256, "V8fV8f", "")
 BUILTIN(__builtin_ia32_roundpd256, "V4dV4dIi", "")
 BUILTIN(__builtin_ia32_roundps256, "V8fV8fIi", "")
+BUILTIN(__builtin_ia32_si256_si, "V8iV4i", "")
+BUILTIN(__builtin_ia32_ps256_ps, "V8fV4f", "")
+BUILTIN(__builtin_ia32_pd256_pd, "V4dV2d", "")
 BUILTIN(__builtin_ia32_vtestzpd, "iV2dV2d", "")
 BUILTIN(__builtin_ia32_vtestcpd, "iV2dV2d", "")
 BUILTIN(__builtin_ia32_vtestnzcpd, "iV2dV2d", "")
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -2631,6 +2631,33 @@
 
   switch (BuiltinID) {
   default: return 0;
+  case X86::BI__builtin_ia32_si256_si: 
+  case X86::BI__builtin_ia32_ps256_ps: {
+    SmallVector<Constant*, 8> Indices;
+    for (unsigned i = 0; i < 8; ++i) {
+      if (i < 4)
+        Indices.push_back(ConstantInt::get(Int32Ty, i));
+      else
+        Indices.push_back(ConstantInt::get(Int32Ty, 4));
+    }
+
+    Value *UndefVal = llvm::UndefValue::get(Ops[0]->getType());
+    Value *SV = llvm::ConstantVector::get(Indices);
+    return Builder.CreateShuffleVector(Ops[0], UndefVal, SV, "cast128_256");
+  }
+  case X86::BI__builtin_ia32_pd256_pd: {
+    SmallVector<Constant*, 4> Indices;
+    for (unsigned i = 0; i < 4; ++i) {
+      if (i < 2)
+        Indices.push_back(ConstantInt::get(Int32Ty, i));
+      else
+        Indices.push_back(ConstantInt::get(Int32Ty, 2));
+    }
+
+    Value *UndefVal = llvm::UndefValue::get(Ops[0]->getType());
+    Value *SV = llvm::ConstantVector::get(Indices);
+    return Builder.CreateShuffleVector(Ops[0], UndefVal, SV, "cast128pd_256pd");
+  }
   case X86::BI__builtin_ia32_vec_init_v8qi:
   case X86::BI__builtin_ia32_vec_init_v4hi:
   case X86::BI__builtin_ia32_vec_init_v2si:
Index: lib/Headers/avxintrin.h
===================================================================
--- lib/Headers/avxintrin.h
+++ lib/Headers/avxintrin.h
@@ -1134,22 +1134,19 @@
 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
 _mm256_castpd128_pd256(__m128d __a)
 {
-  __m128d __zero = _mm_setzero_pd();
-  return __builtin_shufflevector(__a, __zero, 0, 1, 2, 2);
+  return (__m256d)__builtin_ia32_pd256_pd((__v2df)__a);
 }
 
 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
 _mm256_castps128_ps256(__m128 __a)
 {
-  __m128 __zero = _mm_setzero_ps();
-  return __builtin_shufflevector(__a, __zero, 0, 1, 2, 3, 4, 4, 4, 4);
+  return (__m256)__builtin_ia32_ps256_ps((__v4sf)__a);
 }
 
 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
 _mm256_castsi128_si256(__m128i __a)
 {
-  __m128i __zero = _mm_setzero_si128();
-  return __builtin_shufflevector(__a, __zero, 0, 1, 2, 2);
+  return (__m256i)__builtin_ia32_si256_si((__v4si)__a);
 }
 
 /* SIMD load ops (unaligned) */
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D1141.1.patch
Type: text/x-patch
Size: 5578 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20130718/148a6398/attachment.bin>