r272540 - [Clang][X86] Convert non-temporal store builtins to generic __builtin_nontemporal_store in headers
Simon Pilgrim via cfe-commits
cfe-commits at lists.llvm.org
Mon Jun 13 02:57:54 PDT 2016
Author: rksimon
Date: Mon Jun 13 04:57:52 2016
New Revision: 272540
URL: http://llvm.org/viewvc/llvm-project?rev=272540&view=rev
Log:
[Clang][X86] Convert non-temporal store builtins to generic __builtin_nontemporal_store in headers
We can now use __builtin_nontemporal_store instead of target specific builtins for naturally aligned nontemporal stores which avoids the need for handling in CGBuiltin.cpp
The scalar integer nontemporal (unaligned) store builtins will have to wait as __builtin_nontemporal_store currently assumes natural alignment and doesn't accept the 'packed struct' trick that we use for normal unaligned load/stores.
The nontemporal loads require further backend support before we can safely convert them to __builtin_nontemporal_load
Differential Revision: http://reviews.llvm.org/D21272
Modified:
cfe/trunk/include/clang/Basic/BuiltinsX86.def
cfe/trunk/lib/CodeGen/CGBuiltin.cpp
cfe/trunk/lib/Headers/avx512fintrin.h
cfe/trunk/lib/Headers/avxintrin.h
cfe/trunk/lib/Headers/emmintrin.h
cfe/trunk/lib/Headers/xmmintrin.h
cfe/trunk/test/CodeGen/avx512f-builtins.c
cfe/trunk/test/CodeGen/builtins-x86.c
Modified: cfe/trunk/include/clang/Basic/BuiltinsX86.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsX86.def?rev=272540&r1=272539&r2=272540&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsX86.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsX86.def Mon Jun 13 04:57:52 2016
@@ -313,7 +313,6 @@ TARGET_BUILTIN(__builtin_ia32_cvtss2si64
TARGET_BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "", "sse")
-TARGET_BUILTIN(__builtin_ia32_movntps, "vf*V4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_sfence, "v", "", "sse")
TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "", "sse")
@@ -327,8 +326,6 @@ TARGET_BUILTIN(__builtin_ia32_movmskpd,
TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_movnti64, "vLLi*LLi", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntpd, "vd*V2d", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntdq, "vV2LLi*V2LLi", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2LLiV16cV16c", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "", "sse2")
@@ -493,9 +490,6 @@ TARGET_BUILTIN(__builtin_ia32_vzeroupper
TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_pd256, "V4dV2dC*", "", "avx")
TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_ps256, "V8fV4fC*", "", "avx")
TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntdq256, "vV4LLi*V4LLi", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntpd256, "vd*V4d", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntps256, "vf*V8f", "", "avx")
TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2LLi", "", "avx")
TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "", "avx")
TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4LLi", "", "avx")
@@ -2154,10 +2148,7 @@ TARGET_BUILTIN(__builtin_ia32_kortestzhi
TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs","","avx512f")
TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs","","avx512f")
TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntdq512, "vV8LLi*V8LLi","","avx512f")
TARGET_BUILTIN(__builtin_ia32_movntdqa512, "V8LLiV8LLi*","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntpd512, "vd*V8d","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntps512, "vf*V16f","","avx512f")
TARGET_BUILTIN(__builtin_ia32_palignr512_mask, "V64cV64cV64cIiV64cULLi","","avx512bw")
TARGET_BUILTIN(__builtin_ia32_palignr128_mask, "V16cV16cV16cIiV16cUs","","avx512bw,avx512vl")
TARGET_BUILTIN(__builtin_ia32_palignr256_mask, "V32cV32cV32cIiV32cUi","","avx512bw,avx512vl")
Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=272540&r1=272539&r2=272540&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Mon Jun 13 04:57:52 2016
@@ -243,14 +243,14 @@ static Value *EmitSignBit(CodeGenFunctio
// little-Endian, the high bits in big-Endian. Therefore, on big-Endian
// we need to shift the high bits down to the low before truncating.
Width >>= 1;
- if (CGF.getTarget().isBigEndian()) {
- Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
- V = CGF.Builder.CreateLShr(V, ShiftCst);
- }
- // We are truncating value in order to extract the higher-order
- // double, which we will be using to extract the sign from.
- IntTy = llvm::IntegerType::get(C, Width);
- V = CGF.Builder.CreateTrunc(V, IntTy);
+ if (CGF.getTarget().isBigEndian()) {
+ Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
+ V = CGF.Builder.CreateLShr(V, ShiftCst);
+ }
+ // We are truncating value in order to extract the higher-order
+ // double, which we will be using to extract the sign from.
+ IntTy = llvm::IntegerType::get(C, Width);
+ V = CGF.Builder.CreateTrunc(V, IntTy);
}
Value *Zero = llvm::Constant::getNullValue(IntTy);
return CGF.Builder.CreateICmpSLT(V, Zero);
@@ -1815,13 +1815,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(
case Builtin::BI__builtin_smull_overflow:
case Builtin::BI__builtin_smulll_overflow:
IntrinsicId = llvm::Intrinsic::smul_with_overflow;
- break;
- }
-
-
- llvm::Value *Carry;
- llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
- Builder.CreateStore(Sum, SumOutPtr);
+ break;
+ }
+
+
+ llvm::Value *Carry;
+ llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
+ Builder.CreateStore(Sum, SumOutPtr);
return RValue::get(Carry);
}
@@ -3569,13 +3569,13 @@ static Value *packTBLDVectorList(CodeGen
llvm::Type *ResTy, unsigned IntID,
const char *Name) {
SmallVector<Value *, 2> TblOps;
- if (ExtOp)
- TblOps.push_back(ExtOp);
-
- // Build a vector containing sequential number like (0, 1, 2, ..., 15)
- SmallVector<uint32_t, 16> Indices;
- llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
- for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
+ if (ExtOp)
+ TblOps.push_back(ExtOp);
+
+ // Build a vector containing sequential number like (0, 1, 2, ..., 15)
+ SmallVector<uint32_t, 16> Indices;
+ llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
+ for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
Indices.push_back(2*i);
Indices.push_back(2*i+1);
}
@@ -3596,13 +3596,13 @@ static Value *packTBLDVectorList(CodeGen
ZeroTbl, Indices, Name));
}
- Function *TblF;
- TblOps.push_back(IndexOp);
- TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
-
- return CGF.EmitNeonCall(TblF, TblOps, Name);
-}
-
+ Function *TblF;
+ TblOps.push_back(IndexOp);
+ TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
+
+ return CGF.EmitNeonCall(TblF, TblOps, Name);
+}
+
Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
unsigned Value;
switch (BuiltinID) {
@@ -4102,13 +4102,13 @@ Value *CodeGenFunction::EmitARMBuiltinEx
"vsha1h");
// The ARM _MoveToCoprocessor builtins put the input register value as
- // the first argument, but the LLVM intrinsic expects it as the third one.
- case ARM::BI_MoveToCoprocessor:
- case ARM::BI_MoveToCoprocessor2: {
- Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
- Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
- return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
- Ops[3], Ops[4], Ops[5]});
+ // the first argument, but the LLVM intrinsic expects it as the third one.
+ case ARM::BI_MoveToCoprocessor:
+ case ARM::BI_MoveToCoprocessor2: {
+ Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
+ Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
+ return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
+ Ops[3], Ops[4], Ops[5]});
}
}
@@ -6701,39 +6701,27 @@ Value *CodeGenFunction::EmitX86BuiltinEx
if (Ops.size() == 3)
return Align;
- return EmitX86Select(*this, Ops[4], Align, Ops[3]);
- }
-
- case X86::BI__builtin_ia32_movntps:
- case X86::BI__builtin_ia32_movntps256:
- case X86::BI__builtin_ia32_movntpd:
- case X86::BI__builtin_ia32_movntpd256:
- case X86::BI__builtin_ia32_movntdq:
- case X86::BI__builtin_ia32_movntdq256:
- case X86::BI__builtin_ia32_movnti:
- case X86::BI__builtin_ia32_movnti64: {
- llvm::MDNode *Node = llvm::MDNode::get(
+ return EmitX86Select(*this, Ops[4], Align, Ops[3]);
+ }
+
+ case X86::BI__builtin_ia32_movnti:
+ case X86::BI__builtin_ia32_movnti64: {
+ llvm::MDNode *Node = llvm::MDNode::get(
getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
// Convert the type of the pointer to a pointer to the stored type.
Value *BC = Builder.CreateBitCast(Ops[0],
llvm::PointerType::getUnqual(Ops[1]->getType()),
"cast");
- StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC);
- SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
-
- // If the operand is an integer, we can't assume alignment. Otherwise,
- // assume natural alignment.
- QualType ArgTy = E->getArg(1)->getType();
- unsigned Align;
- if (ArgTy->isIntegerType())
- Align = 1;
- else
- Align = getContext().getTypeSizeInChars(ArgTy).getQuantity();
- SI->setAlignment(Align);
- return SI;
- }
- case X86::BI__builtin_ia32_selectb_128:
+ StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC);
+ SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
+
+ // No alignment for scalar intrinsic store.
+ QualType ArgTy = E->getArg(1)->getType();
+ SI->setAlignment(1);
+ return SI;
+ }
+ case X86::BI__builtin_ia32_selectb_128:
case X86::BI__builtin_ia32_selectb_256:
case X86::BI__builtin_ia32_selectb_512:
case X86::BI__builtin_ia32_selectw_128:
Modified: cfe/trunk/lib/Headers/avx512fintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/avx512fintrin.h?rev=272540&r1=272539&r2=272540&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/avx512fintrin.h (original)
+++ cfe/trunk/lib/Headers/avx512fintrin.h Mon Jun 13 04:57:52 2016
@@ -8866,7 +8866,7 @@ _mm512_kxor (__mmask16 __A, __mmask16 __
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_si512 (__m512i * __P, __m512i __A)
{
- __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
+ __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -8878,13 +8878,13 @@ _mm512_stream_load_si512 (void *__P)
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_pd (double *__P, __m512d __A)
{
- __builtin_ia32_movntpd512 (__P, (__v8df) __A);
+ __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_ps (float *__P, __m512 __A)
{
- __builtin_ia32_movntps512 (__P, (__v16sf) __A);
+ __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
Modified: cfe/trunk/lib/Headers/avxintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/avxintrin.h?rev=272540&r1=272539&r2=272540&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/avxintrin.h (original)
+++ cfe/trunk/lib/Headers/avxintrin.h Mon Jun 13 04:57:52 2016
@@ -2496,19 +2496,19 @@ _mm_maskstore_ps(float *__p, __m128i __m
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
- __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+ __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
}
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_pd(double *__a, __m256d __b)
{
- __builtin_ia32_movntpd256(__a, (__v4df)__b);
+ __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
}
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_ps(float *__p, __m256 __a)
{
- __builtin_ia32_movntps256(__p, (__v8sf)__a);
+ __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
}
/* Create vectors */
Modified: cfe/trunk/lib/Headers/emmintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/emmintrin.h?rev=272540&r1=272539&r2=272540&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/emmintrin.h (original)
+++ cfe/trunk/lib/Headers/emmintrin.h Mon Jun 13 04:57:52 2016
@@ -2210,13 +2210,13 @@ _mm_storel_epi64(__m128i *__p, __m128i _
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a)
{
- __builtin_ia32_movntpd(__p, (__v2df)__a);
+ __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a)
{
- __builtin_ia32_movntdq(__p, (__v2di)__a);
+ __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
}
static __inline__ void __DEFAULT_FN_ATTRS
Modified: cfe/trunk/lib/Headers/xmmintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/xmmintrin.h?rev=272540&r1=272539&r2=272540&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/xmmintrin.h (original)
+++ cfe/trunk/lib/Headers/xmmintrin.h Mon Jun 13 04:57:52 2016
@@ -2080,7 +2080,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(float *__p, __m128 __a)
{
- __builtin_ia32_movntps(__p, (__v4sf)__a);
+ __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
/// \brief Forces strong memory ordering (serialization) between store
Modified: cfe/trunk/test/CodeGen/avx512f-builtins.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/avx512f-builtins.c?rev=272540&r1=272539&r2=272540&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/avx512f-builtins.c (original)
+++ cfe/trunk/test/CodeGen/avx512f-builtins.c Mon Jun 13 04:57:52 2016
@@ -5800,7 +5800,7 @@ __mmask16 test_mm512_kxor(__mmask16 __A,
void test_mm512_stream_si512(__m512i * __P, __m512i __A) {
// CHECK-LABEL: @test_mm512_stream_si512
- // CHECK: @llvm.x86.avx512.storent.q.512
+ // CHECK: store <8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, align 64, !nontemporal
_mm512_stream_si512(__P, __A);
}
@@ -5812,13 +5812,13 @@ __m512i test_mm512_stream_load_si512(voi
void test_mm512_stream_pd(double *__P, __m512d __A) {
// CHECK-LABEL: @test_mm512_stream_pd
- // CHECK: @llvm.x86.avx512.storent.pd.512
+ // CHECK: store <8 x double> %{{.*}}, <8 x double>* %{{.*}}, align 64, !nontemporal
return _mm512_stream_pd(__P, __A);
}
void test_mm512_stream_ps(float *__P, __m512 __A) {
// CHECK-LABEL: @test_mm512_stream_ps
- // CHECK: @llvm.x86.avx512.storent.ps.512
+ // CHECK: store <16 x float> %{{.*}}, <16 x float>* %{{.*}}, align 64, !nontemporal
_mm512_stream_ps(__P, __A);
}
Modified: cfe/trunk/test/CodeGen/builtins-x86.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/builtins-x86.c?rev=272540&r1=272539&r2=272540&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/builtins-x86.c (original)
+++ cfe/trunk/test/CodeGen/builtins-x86.c Mon Jun 13 04:57:52 2016
@@ -300,7 +300,6 @@ void f0() {
(void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
tmp_i = __builtin_ia32_movmskps(tmp_V4f);
tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
- (void) __builtin_ia32_movntps(tmp_fp, tmp_V4f);
(void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
(void) __builtin_ia32_sfence();
@@ -318,8 +317,6 @@ void f0() {
#ifdef USE_64
(void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
#endif
- (void) __builtin_ia32_movntpd(tmp_dp, tmp_V2d);
- (void) __builtin_ia32_movntdq(tmp_V2LLip, tmp_V2LLi);
tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
@@ -446,9 +443,6 @@ void f0() {
tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
- __builtin_ia32_movntdq256(tmp_V4LLip, tmp_V4LLi);
- __builtin_ia32_movntpd256(tmp_dp, tmp_V4d);
- __builtin_ia32_movntps256(tmp_fp, tmp_V8f);
tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);
More information about the cfe-commits
mailing list