[llvm-branch-commits] [llvm-branch] r276990 - Merging r275981 and r276740:
Hans Wennborg via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jul 28 08:38:58 PDT 2016
Author: hans
Date: Thu Jul 28 10:38:57 2016
New Revision: 276990
URL: http://llvm.org/viewvc/llvm-project?rev=276990&view=rev
Log:
Merging r275981 and r276740:
------------------------------------------------------------------------
r275981 | rksimon | 2016-07-19 08:07:43 -0700 (Tue, 19 Jul 2016) | 13 lines
[X86][SSE] Reimplement SSE fp2si conversion intrinsics instead of using generic IR
D20859 and D20860 attempted to replace the SSE (V)CVTTPS2DQ and VCVTTPD2DQ truncating conversions with generic IR instead.
It turns out that the behaviour of these intrinsics is different enough from generic IR that this will cause problems, INF/NAN/out of range values are guaranteed to result in a 0x80000000 value - which plays havoc with constant folding which converts them to either zero or UNDEF. This is also an issue with the scalar implementations (which were already generic IR and what I was trying to match).
This patch changes both scalar and packed versions back to using x86-specific builtins.
It also deals with the other scalar conversion cases that are runtime rounding mode dependent and can have similar issues with constant folding.
A companion clang patch is at D22105
Differential Revision: https://reviews.llvm.org/D22106
------------------------------------------------------------------------
------------------------------------------------------------------------
r276740 | rksimon | 2016-07-26 03:41:28 -0700 (Tue, 26 Jul 2016) | 5 lines
[X86][SSE] Fixed issue with memory folding of (v)cvtsd2ss intrinsics
Fixed typo in the intrinsic definitions of (v)cvtsd2ss with memory folding.
This was only unearthed when rL276102 started using the intrinsic again.....
------------------------------------------------------------------------
Modified:
llvm/branches/release_39/ (props changed)
llvm/branches/release_39/include/llvm/IR/IntrinsicsX86.td
llvm/branches/release_39/lib/Analysis/ConstantFolding.cpp
llvm/branches/release_39/lib/IR/AutoUpgrade.cpp
llvm/branches/release_39/lib/Target/X86/X86InstrSSE.td
llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86.ll
llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86.ll
llvm/branches/release_39/test/Transforms/ConstProp/calls.ll
Propchange: llvm/branches/release_39/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Thu Jul 28 10:38:57 2016
@@ -1,3 +1,3 @@
/llvm/branches/Apple/Pertwee:110850,110961
/llvm/branches/type-system-rewrite:133420-134817
-/llvm/trunk:155241,275870,275879,275898,275928,275935,275946,275978,276015,276077,276109,276181,276209,276236-276237,276358,276364,276368,276389,276438,276479,276510
+/llvm/trunk:155241,275870,275879,275898,275928,275935,275946,275978,275981,276015,276077,276109,276181,276209,276236-276237,276358,276364,276368,276389,276438,276479,276510,276740
Modified: llvm/branches/release_39/include/llvm/IR/IntrinsicsX86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/include/llvm/IR/IntrinsicsX86.td?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/include/llvm/IR/IntrinsicsX86.td (original)
+++ llvm/branches/release_39/include/llvm/IR/IntrinsicsX86.td Thu Jul 28 10:38:57 2016
@@ -479,6 +479,8 @@ let TargetPrefix = "x86" in { // All in
Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+ def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
@@ -1512,8 +1514,12 @@ let TargetPrefix = "x86" in { // All in
Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+ def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
+ def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
}
// Vector bit test
Modified: llvm/branches/release_39/lib/Analysis/ConstantFolding.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/Analysis/ConstantFolding.cpp?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/lib/Analysis/ConstantFolding.cpp (original)
+++ llvm/branches/release_39/lib/Analysis/ConstantFolding.cpp Thu Jul 28 10:38:57 2016
@@ -1424,8 +1424,8 @@ Constant *ConstantFoldBinaryFP(double (*
/// integer type Ty is used to select how many bits are available for the
/// result. Returns null if the conversion cannot be performed, otherwise
/// returns the Constant value resulting from the conversion.
-Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
- Type *Ty) {
+Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
+ Type *Ty) {
// All of these conversion intrinsics form an integer of at most 64bits.
unsigned ResultWidth = Ty->getIntegerBitWidth();
assert(ResultWidth <= 64 &&
@@ -1438,7 +1438,8 @@ Constant *ConstantFoldConvertToInt(const
APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
/*isSigned=*/true, mode,
&isExact);
- if (status != APFloat::opOK && status != APFloat::opInexact)
+ if (status != APFloat::opOK &&
+ (!roundTowardZero || status != APFloat::opInexact))
return nullptr;
return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
}
@@ -1676,17 +1677,17 @@ Constant *ConstantFoldScalarCall(StringR
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2si64:
if (ConstantFP *FPOp =
- dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
- return ConstantFoldConvertToInt(FPOp->getValueAPF(),
- /*roundTowardZero=*/false, Ty);
+ dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+ return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
+ /*roundTowardZero=*/false, Ty);
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64:
if (ConstantFP *FPOp =
- dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
- return ConstantFoldConvertToInt(FPOp->getValueAPF(),
- /*roundTowardZero=*/true, Ty);
+ dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+ return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
+ /*roundTowardZero=*/true, Ty);
}
}
Modified: llvm/branches/release_39/lib/IR/AutoUpgrade.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/IR/AutoUpgrade.cpp?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/lib/IR/AutoUpgrade.cpp (original)
+++ llvm/branches/release_39/lib/IR/AutoUpgrade.cpp Thu Jul 28 10:38:57 2016
@@ -251,8 +251,6 @@ static bool UpgradeIntrinsicFunction1(Fu
Name == "sse2.cvtps2pd" ||
Name == "avx.cvtdq2.pd.256" ||
Name == "avx.cvt.ps2.pd.256" ||
- Name == "sse2.cvttps2dq" ||
- Name.startswith("avx.cvtt.") ||
Name.startswith("avx.vinsertf128.") ||
Name == "avx2.vinserti128" ||
Name.startswith("avx.vextractf128.") ||
@@ -712,12 +710,6 @@ void llvm::UpgradeIntrinsicCall(CallInst
Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
else
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
- } else if (IsX86 && (Name == "sse2.cvttps2dq" ||
- Name.startswith("avx.cvtt."))) {
- // Truncation (round to zero) float/double to i32 vector conversion.
- Value *Src = CI->getArgOperand(0);
- VectorType *DstTy = cast<VectorType>(CI->getType());
- Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
} else if (IsX86 && Name.startswith("sse4a.movnt.")) {
Module *M = F->getParent();
SmallVector<Metadata *, 1> Elts;
Modified: llvm/branches/release_39/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/Target/X86/X86InstrSSE.td?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/branches/release_39/lib/Target/X86/X86InstrSSE.td Thu Jul 28 10:38:57 2016
@@ -1820,7 +1820,7 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
Sched<[WriteCvtF2F]>;
-def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@@ -1836,7 +1836,7 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtF2F]>;
-def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (loadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ [(set VR256:$dst,
+ (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+ [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+ (loadv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
@@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
Modified: llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-fast-isel.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-fast-isel.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-fast-isel.ll Thu Jul 28 10:38:57 2016
@@ -681,10 +681,11 @@ define <2 x i64> @test_mm256_cvttpd_epi3
; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
- %cvt = fptosi <4 x double> %a0 to <4 x i32>
+ %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
%res = bitcast <4 x i32> %cvt to <2 x i64>
ret <2 x i64> %res
}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvttps_epi32:
@@ -696,10 +697,11 @@ define <4 x i64> @test_mm256_cvttps_epi3
; X64: # BB#0:
; X64-NEXT: vcvttps2dq %ymm0, %ymm0
; X64-NEXT: retq
- %cvt = fptosi <8 x float> %a0 to <8 x i32>
+ %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
%res = bitcast <8 x i32> %cvt to <4 x i64>
ret <4 x i64> %res
}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_div_pd:
Modified: llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll Thu Jul 28 10:38:57 2016
@@ -359,35 +359,12 @@ define <4 x double> @test_x86_avx_cvt_ps
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
-define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT: retl
- %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
-
-
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; add operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
Modified: llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/avx-intrinsics-x86.ll Thu Jul 28 10:38:57 2016
@@ -3431,6 +3431,39 @@ define <8 x float> @test_x86_avx_cvtdq2_
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
+define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; AVX512VL-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX512VL-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
+
+
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; AVX-LABEL: test_x86_avx_dp_ps_256:
; AVX: ## BB#0:
@@ -4552,7 +4585,7 @@ define void @movnt_dq(i8* %p, <2 x i64>
; AVX-LABEL: movnt_dq:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %ymm0, (%eax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retl
@@ -4560,7 +4593,7 @@ define void @movnt_dq(i8* %p, <2 x i64>
; AVX512VL-LABEL: movnt_dq:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
; AVX512VL-NEXT: retl
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
Modified: llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll Thu Jul 28 10:38:57 2016
@@ -6,13 +6,12 @@
define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
; X64-LABEL: test_mm_cvtsi64_ss:
; X64: # BB#0:
-; X64-NEXT: cvtsi2ssq %rdi, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: cvtsi2ssq %rdi, %xmm0
; X64-NEXT: retq
- %cvt = sitofp i64 %a1 to float
- %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
ret <4 x float> %res
}
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
; X64-LABEL: test_mm_cvtss_si64:
@@ -29,7 +28,7 @@ define i64 @test_mm_cvttss_si64(<4 x flo
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: retq
- %cvt = extractelement <4 x float> %a0, i32 0
- %res = fptosi float %cvt to i64
+ %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
ret i64 %res
}
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
Modified: llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/sse-intrinsics-fast-isel.ll Thu Jul 28 10:38:57 2016
@@ -707,20 +707,17 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x
define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_ss:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: cvtsi2ssl %eax, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_ss:
; X64: # BB#0:
-; X64-NEXT: cvtsi2ssl %edi, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: cvtsi2ssl %edi, %xmm0
; X64-NEXT: retq
- %cvt = sitofp i32 %a1 to float
- %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
ret <4 x float> %res
}
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_f32:
@@ -762,10 +759,10 @@ define i32 @test_mm_cvttss_si(<4 x float
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
- %cvt = extractelement <4 x float> %a0, i32 0
- %res = fptosi float %cvt to i32
+ %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
ret i32 %res
}
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si32:
@@ -777,8 +774,7 @@ define i32 @test_mm_cvttss_si32(<4 x flo
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
- %cvt = extractelement <4 x float> %a0, i32 0
- %res = fptosi float %cvt to i32
+ %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
ret i32 %res
}
Modified: llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll Thu Jul 28 10:38:57 2016
@@ -25,13 +25,12 @@ define i64 @test_mm_cvtsi128_si64(<2 x i
define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
; X64-LABEL: test_mm_cvtsi64_sd:
; X64: # BB#0:
-; X64-NEXT: cvtsi2sdq %rdi, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: cvtsi2sdq %rdi, %xmm0
; X64-NEXT: retq
- %cvt = sitofp i64 %a1 to double
- %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1)
ret <2 x double> %res
}
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
; X64-LABEL: test_mm_cvtsi64_si128:
@@ -48,10 +47,10 @@ define i64 @test_mm_cvttsd_si64(<2 x dou
; X64: # BB#0:
; X64-NEXT: cvttsd2si %xmm0, %rax
; X64-NEXT: retq
- %ext = extractelement <2 x double> %a0, i32 0
- %res = fptosi double %ext to i64
+ %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
ret i64 %res
}
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
; X64-LABEL: test_mm_loadu_si64:
Modified: llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll Thu Jul 28 10:38:57 2016
@@ -1208,6 +1208,39 @@ define i32 @test_mm_cvtsd_si32(<2 x doub
}
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_cvtsd_ss:
+; X32: # BB#0:
+; X32-NEXT: cvtsd2ss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_ss:
+; X64: # BB#0:
+; X64-NEXT: cvtsd2ss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) {
+; X32-LABEL: test_mm_cvtsd_ss_load:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm1
+; X32-NEXT: cvtsd2ss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_ss_load:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm1
+; X64-NEXT: cvtsd2ss %xmm1, %xmm0
+; X64-NEXT: retq
+ %a1 = load <2 x double>, <2 x double>* %p1
+ %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
+ ret <4 x float> %res
+}
+
define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_cvtsi128_si32:
; X32: # BB#0:
@@ -1303,10 +1336,11 @@ define <2 x i64> @test_mm_cvttps_epi32(<
; X64: # BB#0:
; X64-NEXT: cvttps2dq %xmm0, %xmm0
; X64-NEXT: retq
- %res = fptosi <4 x float> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
%bc = bitcast <4 x i32> %res to <2 x i64>
ret <2 x i64> %bc
}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvttsd_si32:
@@ -1318,10 +1352,10 @@ define i32 @test_mm_cvttsd_si32(<2 x dou
; X64: # BB#0:
; X64-NEXT: cvttsd2si %xmm0, %eax
; X64-NEXT: retq
- %ext = extractelement <2 x double> %a0, i32 0
- %res = fptosi double %ext to i32
+ %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
ret i32 %res
}
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_div_pd:
Modified: llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll Thu Jul 28 10:38:57 2016
@@ -66,17 +66,6 @@ define <2 x double> @test_x86_sse2_cvtps
declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
-define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttps2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-
define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_sse2_storel_dq:
; CHECK: ## BB#0:
@@ -94,7 +83,7 @@ define void @test_x86_sse2_storeu_dq(i8*
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: paddb LCPI8_0, %xmm0
+; CHECK-NEXT: paddb LCPI7_0, %xmm0
; CHECK-NEXT: movdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
Modified: llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86.ll (original)
+++ llvm/branches/release_39/test/CodeGen/X86/sse2-intrinsics-x86.ll Thu Jul 28 10:38:57 2016
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
@@ -274,6 +274,25 @@ define <4 x float> @test_x86_sse2_cvtsd2
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %p1) {
+; SSE-LABEL: test_x86_sse2_cvtsd2ss_load:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movaps (%eax), %xmm1
+; SSE-NEXT: cvtsd2ss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsd2ss_load:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vcvtsd2ss (%eax), %xmm0, %xmm0
+; KNL-NEXT: retl
+ %a1 = load <2 x double>, <2 x double>* %p1
+ %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+
define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
; SSE-LABEL: test_x86_sse2_cvtsi2sd:
; SSE: ## BB#0:
@@ -306,6 +325,25 @@ define <2 x double> @test_x86_sse2_cvtss
declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
+define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, <4 x float>* %p1) {
+; SSE-LABEL: test_x86_sse2_cvtss2sd_load:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movaps (%eax), %xmm1
+; SSE-NEXT: cvtss2sd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtss2sd_load:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vcvtss2sd (%eax), %xmm0, %xmm0
+; KNL-NEXT: retl
+ %a1 = load <4 x float>, <4 x float>* %p1
+ %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+
+
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvttpd2dq:
; SSE: ## BB#0:
@@ -322,6 +360,22 @@ define <4 x i32> @test_x86_sse2_cvttpd2d
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+; SSE-LABEL: test_x86_sse2_cvttps2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvttps2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttps2dq %xmm0, %xmm0
+; KNL-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvttsd2si:
; SSE: ## BB#0:
Modified: llvm/branches/release_39/test/Transforms/ConstProp/calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/Transforms/ConstProp/calls.ll?rev=276990&r1=276989&r2=276990&view=diff
==============================================================================
--- llvm/branches/release_39/test/Transforms/ConstProp/calls.ll (original)
+++ llvm/branches/release_39/test/Transforms/ConstProp/calls.ll Thu Jul 28 10:38:57 2016
@@ -193,11 +193,13 @@ entry:
ret i1 %b
}
-; TODO: Inexact values should not fold as they are dependent on rounding mode
+; Inexact values should not fold as they are dependent on rounding mode
define i1 @test_sse_cvts_inexact() nounwind readnone {
; CHECK-LABEL: @test_sse_cvts_inexact(
-; CHECK-NOT: call
-; CHECK: ret i1 true
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
entry:
%i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
%i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
More information about the llvm-branch-commits
mailing list