[PATCH] Inefficient code generation for 128-bit->256-bit typecast intrinsics (BZ #15712)
Katya Romanova
Katya_Romanova at playstation.sony.com
Thu Jul 18 17:32:20 PDT 2013
kromanova added you to the CC list for the revision "Inefficient code generation for 128-bit->256-bit typecast intrinsics (BZ #15712)".
Fix for BZ #15712
http://lists.cs.uiuc.edu/pipermail/llvmbugs/2013-April/027885.html
As it stands right now, when Clang compiles 128-bit -> 256-bit typecasts (e.g., the _mm256_castsi128_si256 intrinsic), it finds it necessary to zero out the upper 128 bits of the 256-bit register. As a result, Clang generates two additional instructions:
vxorps xmm2,xmm2,xmm2
vinsertf128 ymm0,ymm2,xmm0,0x0
Most of the industry-standard C/C++ compilers (GCC, Intel’s compiler, Visual Studio compiler) don’t generate any extra moves for the typecast intrinsics _mm256_castsi128_si256, _mm256_castps128_ps256, and _mm256_castpd128_pd256, and don’t zero-extend the upper 128 bits of the 256-bit YMM register. The upper 128 bits are left undefined.
I am proposing a fix for the poor code generation in Clang: have the typecast intrinsics in avxintrin.h call new builtins, and have CGBuiltin.cpp lower these builtins to a shufflevector that uses an “undef” value to fill in the upper 128 bits.
I added a new testcase “avx-cast-builtins.c” to monitor code generation. This testcase checks that the Clang compiler doesn't generate any additional instructions for _mm256_castsi128_si256, _mm256_castps128_ps256, and _mm256_castpd128_pd256.
http://llvm-reviews.chandlerc.com/D1141
Files:
test/CodeGen/avx-cast-builtins.c
test/CodeGen/builtins-x86.c
include/clang/Basic/BuiltinsX86.def
lib/CodeGen/CGBuiltin.cpp
lib/Headers/avxintrin.h
Index: test/CodeGen/avx-cast-builtins.c
===================================================================
--- test/CodeGen/avx-cast-builtins.c
+++ test/CodeGen/avx-cast-builtins.c
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 %s -O2 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+//
+// Test LLVM IR codegen of cast instructions
+//
+
+__m256i si256( __m128i vec03)
+{
+ // Check that we generate a shufflevector with undefined upper 128 bits,
+ // instead of a shufflevector with zero upper 128 bits.
+ // CHECK: define <4 x i64> @si256(<2 x i64> %vec03)
+ // CHECK: %cast128_256.i = shufflevector <4 x i32> %0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ // CHECK: ret <4 x i64> %1
+ return _mm256_castsi128_si256(vec03);
+}
+
+__m256 ps256( __m128 vec03)
+{
+ // Check that we generate a shufflevector with undefined upper 128 bits,
+ // instead of a shufflevector with zero upper 128 bits.
+ // CHECK: define <8 x float> @ps256(<4 x float> %vec03)
+ // CHECK: %cast128_256.i = shufflevector <4 x float> %vec03, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ // CHECK: ret <8 x float> %cast128_256.i
+ return _mm256_castps128_ps256(vec03);
+}
+
+__m256d pd256( __m128d vec03)
+{
+ // Check that we generate a shufflevector with undefined upper 128 bits,
+ // instead of a shufflevector with zero upper 128 bits.
+ // CHECK: define <4 x double> @pd256(<2 x double> %vec03)
+ // CHECK: %cast128pd_256pd.i = shufflevector <2 x double> %vec03, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ // CHECK: ret <4 x double> %cast128pd_256pd.i
+ return _mm256_castpd128_pd256(vec03);
+}
+
Index: test/CodeGen/builtins-x86.c
===================================================================
--- test/CodeGen/builtins-x86.c
+++ test/CodeGen/builtins-x86.c
@@ -425,6 +425,9 @@
tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f);
tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1);
tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1);
+ tmp_V8i = __builtin_ia32_si256_si(tmp_V4i);
+ tmp_V8f = __builtin_ia32_ps256_ps(tmp_V4f);
+ tmp_V4d = __builtin_ia32_pd256_pd(tmp_V2d);
tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d);
tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d);
tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d);
Index: include/clang/Basic/BuiltinsX86.def
===================================================================
--- include/clang/Basic/BuiltinsX86.def
+++ include/clang/Basic/BuiltinsX86.def
@@ -432,6 +432,9 @@
BUILTIN(__builtin_ia32_rcpps256, "V8fV8f", "")
BUILTIN(__builtin_ia32_roundpd256, "V4dV4dIi", "")
BUILTIN(__builtin_ia32_roundps256, "V8fV8fIi", "")
+BUILTIN(__builtin_ia32_si256_si, "V8iV4i", "")
+BUILTIN(__builtin_ia32_ps256_ps, "V8fV4f", "")
+BUILTIN(__builtin_ia32_pd256_pd, "V4dV2d", "")
BUILTIN(__builtin_ia32_vtestzpd, "iV2dV2d", "")
BUILTIN(__builtin_ia32_vtestcpd, "iV2dV2d", "")
BUILTIN(__builtin_ia32_vtestnzcpd, "iV2dV2d", "")
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -2631,6 +2631,33 @@
switch (BuiltinID) {
default: return 0;
+ case X86::BI__builtin_ia32_si256_si:
+ case X86::BI__builtin_ia32_ps256_ps: {
+ SmallVector<Constant*, 8> Indices;
+ for (unsigned i = 0; i < 8; ++i) {
+ if (i < 4)
+ Indices.push_back(ConstantInt::get(Int32Ty, i));
+ else
+ Indices.push_back(ConstantInt::get(Int32Ty, 4));
+ }
+
+ Value *UndefVal = llvm::UndefValue::get(Ops[0]->getType());
+ Value *SV = llvm::ConstantVector::get(Indices);
+ return Builder.CreateShuffleVector(Ops[0], UndefVal, SV, "cast128_256");
+ }
+ case X86::BI__builtin_ia32_pd256_pd: {
+ SmallVector<Constant*, 4> Indices;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (i < 2)
+ Indices.push_back(ConstantInt::get(Int32Ty, i));
+ else
+ Indices.push_back(ConstantInt::get(Int32Ty, 2));
+ }
+
+ Value *UndefVal = llvm::UndefValue::get(Ops[0]->getType());
+ Value *SV = llvm::ConstantVector::get(Indices);
+ return Builder.CreateShuffleVector(Ops[0], UndefVal, SV, "cast128pd_256pd");
+ }
case X86::BI__builtin_ia32_vec_init_v8qi:
case X86::BI__builtin_ia32_vec_init_v4hi:
case X86::BI__builtin_ia32_vec_init_v2si:
Index: lib/Headers/avxintrin.h
===================================================================
--- lib/Headers/avxintrin.h
+++ lib/Headers/avxintrin.h
@@ -1134,22 +1134,19 @@
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d __a)
{
- __m128d __zero = _mm_setzero_pd();
- return __builtin_shufflevector(__a, __zero, 0, 1, 2, 2);
+ return (__m256d)__builtin_ia32_pd256_pd((__v2df)__a);
}
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 __a)
{
- __m128 __zero = _mm_setzero_ps();
- return __builtin_shufflevector(__a, __zero, 0, 1, 2, 3, 4, 4, 4, 4);
+ return (__m256)__builtin_ia32_ps256_ps((__v4sf)__a);
}
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i __a)
{
- __m128i __zero = _mm_setzero_si128();
- return __builtin_shufflevector(__a, __zero, 0, 1, 2, 2);
+ return (__m256i)__builtin_ia32_si256_si((__v4si)__a);
}
/* SIMD load ops (unaligned) */
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D1141.1.patch
Type: text/x-patch
Size: 5578 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20130718/148a6398/attachment.bin>
More information about the cfe-commits
mailing list