[llvm] 9e88416 - [FPEnv][X86][SystemZ] Use different algorithms for i64->double uint_to_fp under strictfp to avoid producing -0.0 when rounding toward negative infinity
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 21 18:23:40 PDT 2020
Author: Craig Topper
Date: 2020-10-21T18:12:54-07:00
New Revision: 9e884169a2723de5ad5c59af69b35b20953965fa
URL: https://github.com/llvm/llvm-project/commit/9e884169a2723de5ad5c59af69b35b20953965fa
DIFF: https://github.com/llvm/llvm-project/commit/9e884169a2723de5ad5c59af69b35b20953965fa.diff
LOG: [FPEnv][X86][SystemZ] Use different algorithms for i64->double uint_to_fp under strictfp to avoid producing -0.0 when rounding toward negative infinity
Some of our conversion algorithms produce -0.0 when converting unsigned i64 to double when the rounding mode is round toward negative infinity. This switches them to other algorithms that don't have this problem. Since it is undefined behavior to change the rounding mode with the non-strict nodes, this patch only changes the behavior for strict nodes.
There are still problems with unsigned i32 conversions as well, which I'll try to fix in another patch.
Fixes part of PR47393
Reviewed By: efriedma
Differential Revision: https://reviews.llvm.org/D87115
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll
llvm/test/CodeGen/X86/fp-intrinsics.ll
llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c22f435c740c..6328626868d0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2460,12 +2460,19 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
// TODO: Generalize this for use with other types.
- if ((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) {
- LLVM_DEBUG(dbgs() << "Converting unsigned i32/i64 to f32\n");
+ if (((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) ||
+ (SrcVT == MVT::i64 && DestVT == MVT::f64)) {
+ LLVM_DEBUG(dbgs() << "Converting unsigned i32/i64 to f32/f64\n");
// For unsigned conversions, convert them to signed conversions using the
// algorithm from the x86_64 __floatundisf in compiler_rt. That method
// should be valid for i32->f32 as well.
+ // More generally this transform should be valid if there are 3 more bits
+ // in the integer type than the significand. Rounding uses the first bit
+ // after the width of the significand and the OR of all bits after that. So
+ // we need to be able to OR the shifted out bit into one of the bits that
+ // participate in the OR.
+
// TODO: This really should be implemented using a branch rather than a
// select. We happen to get lucky and machinesink does the right
// thing most of the time. This would be a good candidate for a
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index bc81d0d8298c..58a731afee36 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6528,8 +6528,13 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
SDValue &Chain,
SelectionDAG &DAG) const {
- unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0;
- SDValue Src = Node->getOperand(OpNo);
+ // This transform is not correct for converting 0 when rounding mode is set
+ // to round toward negative infinity which will produce -0.0. So disable under
+ // strictfp.
+ if (Node->isStrictFPOpcode())
+ return false;
+
+ SDValue Src = Node->getOperand(0);
EVT SrcVT = Src.getValueType();
EVT DstVT = Node->getValueType(0);
@@ -6548,9 +6553,10 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());
// Implementation of unsigned i64 to f64 following the algorithm in
- // __floatundidf in compiler_rt. This implementation has the advantage
- // of performing rounding correctly, both in the default rounding mode
- // and in all alternate rounding modes.
+ // __floatundidf in compiler_rt. This implementation performs rounding
+ // correctly in all rounding modes with the exception of converting 0
+ // when rounding toward negative infinity. In that case the fsub will produce
+ // -0.0. This will be added to +0.0 and produce -0.0 which is incorrect.
SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
@@ -6564,18 +6570,9 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
- if (Node->isStrictFPOpcode()) {
- SDValue HiSub =
- DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other},
- {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52});
- Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other},
- {HiSub.getValue(1), LoFlt, HiSub});
- Chain = Result.getValue(1);
- } else {
- SDValue HiSub =
- DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
- Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
- }
+ SDValue HiSub =
+ DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
+ Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
return true;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 620f74d69ce9..036ad48cb935 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19885,6 +19885,10 @@ static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
+ // when converting 0 when rounding toward negative infinity. Caller will
+ // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode.
+ assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
// This algorithm is not obvious. Here it is what we're trying to output:
/*
movq %rax, %xmm0
@@ -19898,8 +19902,6 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
#endif
*/
- bool IsStrict = Op->isStrictFPOpcode();
- unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
@@ -19921,7 +19923,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
// Load the 64-bit value into an XMM register.
SDValue XR1 =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
SDValue CLod0 = DAG.getLoad(
MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
@@ -19932,35 +19934,19 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
- SDValue Sub;
- SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
- if (IsStrict) {
- Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
- {Op.getOperand(0), XR2F, CLod1});
- Chain = Sub.getValue(1);
- } else
- Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (!IsStrict && Subtarget.hasSSE3() &&
+ if (Subtarget.hasSSE3() &&
shouldUseHorizontalOp(true, DAG, Subtarget)) {
- // FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
- if (IsStrict) {
- Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
- {Chain, Shuffle, Sub});
- Chain = Result.getValue(1);
- } else
- Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
- if (IsStrict)
- return DAG.getMergeValues({Result, Chain}, dl);
-
return Result;
}
@@ -20286,11 +20272,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
- if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+ // The transform for i64->f64 isn't correct for 0 when rounding to negative
+ // infinity. It produces -0.0, so disable under strictfp.
+ if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
- if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+ if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
+ (DstVT == MVT::f32 || DstVT == MVT::f64))
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll
index a151c15e34d1..c323086b6e13 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll
@@ -22,7 +22,7 @@ define float @f1(i64 %i) #0 {
; Test i64->f64.
define double @f2(i64 %i) #0 {
; CHECK-LABEL: f2:
-; CHECK: ldgr
+; CHECK: cdgbr
; CHECK: adbr
; CHECK: br %r14
%conv = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %i,
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index b8a974f795b2..7bef1d678029 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -2492,38 +2492,54 @@ define double @uifdl(i64 %x) #0 {
;
; X86-SSE-LABEL: uifdl:
; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: subl $12, %esp
-; X86-SSE-NEXT: .cfi_def_cfa_offset 16
+; X86-SSE-NEXT: subl $28, %esp
+; X86-SSE-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X86-SSE-NEXT: subpd {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT: movapd %xmm0, %xmm1
-; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; X86-SSE-NEXT: addpd %xmm0, %xmm1
-; X86-SSE-NEXT: movlpd %xmm1, (%esp)
+; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: shrl $31, %eax
+; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; X86-SSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: wait
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd %xmm0, (%esp)
; X86-SSE-NEXT: fldl (%esp)
; X86-SSE-NEXT: wait
-; X86-SSE-NEXT: addl $12, %esp
+; X86-SSE-NEXT: addl $28, %esp
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; SSE-LABEL: uifdl:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: movq %rdi, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; SSE-NEXT: subpd {{.*}}(%rip), %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnsq %rdi, %rcx
+; SSE-NEXT: cvtsi2sd %rcx, %xmm0
+; SSE-NEXT: jns .LBB48_2
+; SSE-NEXT: # %bb.1:
+; SSE-NEXT: addsd %xmm0, %xmm0
+; SSE-NEXT: .LBB48_2: # %entry
; SSE-NEXT: retq
;
; AVX1-LABEL: uifdl:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: testq %rdi, %rdi
+; AVX1-NEXT: cmovnsq %rdi, %rcx
+; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0
+; AVX1-NEXT: jns .LBB48_2
+; AVX1-NEXT: # %bb.1:
+; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: .LBB48_2: # %entry
; AVX1-NEXT: retq
;
; AVX512-LABEL: uifdl:
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
index 21b09e2d2507..49e238df4aa2 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
@@ -1262,14 +1262,17 @@ define double @uitofp_i64tof64(i64 %x) #0 {
; SSE-X86-NEXT: movl %esp, %ebp
; SSE-X86-NEXT: .cfi_def_cfa_register %ebp
; SSE-X86-NEXT: andl $-8, %esp
-; SSE-X86-NEXT: subl $8, %esp
+; SSE-X86-NEXT: subl $24, %esp
+; SSE-X86-NEXT: movl 12(%ebp), %eax
; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; SSE-X86-NEXT: subpd {{\.LCPI.*}}, %xmm0
-; SSE-X86-NEXT: movapd %xmm0, %xmm1
-; SSE-X86-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-X86-NEXT: addpd %xmm0, %xmm1
-; SSE-X86-NEXT: movlpd %xmm1, (%esp)
+; SSE-X86-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT: shrl $31, %eax
+; SSE-X86-NEXT: fildll {{[0-9]+}}(%esp)
+; SSE-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; SSE-X86-NEXT: fstpl {{[0-9]+}}(%esp)
+; SSE-X86-NEXT: wait
+; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-X86-NEXT: movsd %xmm0, (%esp)
; SSE-X86-NEXT: fldl (%esp)
; SSE-X86-NEXT: wait
; SSE-X86-NEXT: movl %ebp, %esp
@@ -1279,12 +1282,18 @@ define double @uitofp_i64tof64(i64 %x) #0 {
;
; SSE-X64-LABEL: uitofp_i64tof64:
; SSE-X64: # %bb.0:
-; SSE-X64-NEXT: movq %rdi, %xmm1
-; SSE-X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; SSE-X64-NEXT: subpd {{.*}}(%rip), %xmm1
-; SSE-X64-NEXT: movapd %xmm1, %xmm0
-; SSE-X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE-X64-NEXT: addpd %xmm1, %xmm0
+; SSE-X64-NEXT: movq %rdi, %rax
+; SSE-X64-NEXT: shrq %rax
+; SSE-X64-NEXT: movl %edi, %ecx
+; SSE-X64-NEXT: andl $1, %ecx
+; SSE-X64-NEXT: orq %rax, %rcx
+; SSE-X64-NEXT: testq %rdi, %rdi
+; SSE-X64-NEXT: cmovnsq %rdi, %rcx
+; SSE-X64-NEXT: cvtsi2sd %rcx, %xmm0
+; SSE-X64-NEXT: jns .LBB18_2
+; SSE-X64-NEXT: # %bb.1:
+; SSE-X64-NEXT: addsd %xmm0, %xmm0
+; SSE-X64-NEXT: .LBB18_2:
; SSE-X64-NEXT: retq
;
; AVX-X86-LABEL: uitofp_i64tof64:
@@ -1295,13 +1304,17 @@ define double @uitofp_i64tof64(i64 %x) #0 {
; AVX-X86-NEXT: movl %esp, %ebp
; AVX-X86-NEXT: .cfi_def_cfa_register %ebp
; AVX-X86-NEXT: andl $-8, %esp
-; AVX-X86-NEXT: subl $8, %esp
+; AVX-X86-NEXT: subl $24, %esp
+; AVX-X86-NEXT: movl 12(%ebp), %eax
; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-X86-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX-X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX-X86-NEXT: vmovlpd %xmm0, (%esp)
+; AVX-X86-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX-X86-NEXT: shrl $31, %eax
+; AVX-X86-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; AVX-X86-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX-X86-NEXT: wait
+; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-X86-NEXT: vmovsd %xmm0, (%esp)
; AVX-X86-NEXT: fldl (%esp)
; AVX-X86-NEXT: wait
; AVX-X86-NEXT: movl %ebp, %esp
@@ -1311,11 +1324,18 @@ define double @uitofp_i64tof64(i64 %x) #0 {
;
; AVX1-X64-LABEL: uitofp_i64tof64:
; AVX1-X64: # %bb.0:
-; AVX1-X64-NEXT: vmovq %rdi, %xmm0
-; AVX1-X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX1-X64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-X64-NEXT: movq %rdi, %rax
+; AVX1-X64-NEXT: shrq %rax
+; AVX1-X64-NEXT: movl %edi, %ecx
+; AVX1-X64-NEXT: andl $1, %ecx
+; AVX1-X64-NEXT: orq %rax, %rcx
+; AVX1-X64-NEXT: testq %rdi, %rdi
+; AVX1-X64-NEXT: cmovnsq %rdi, %rcx
+; AVX1-X64-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0
+; AVX1-X64-NEXT: jns .LBB18_2
+; AVX1-X64-NEXT: # %bb.1:
+; AVX1-X64-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; AVX1-X64-NEXT: .LBB18_2:
; AVX1-X64-NEXT: retq
;
; AVX512-X64-LABEL: uitofp_i64tof64:
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
index 0f8881ad25c3..8ead5e2f4fa4 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -1262,112 +1262,218 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE-32-LABEL: uitofp_v2i64_v2f64:
; SSE-32: # %bb.0:
-; SSE-32-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
-; SSE-32-NEXT: pand %xmm0, %xmm1
-; SSE-32-NEXT: por {{\.LCPI.*}}, %xmm1
-; SSE-32-NEXT: psrlq $32, %xmm0
-; SSE-32-NEXT: por {{\.LCPI.*}}, %xmm0
-; SSE-32-NEXT: subpd {{\.LCPI.*}}, %xmm0
-; SSE-32-NEXT: addpd %xmm1, %xmm0
+; SSE-32-NEXT: pushl %ebp
+; SSE-32-NEXT: .cfi_def_cfa_offset 8
+; SSE-32-NEXT: .cfi_offset %ebp, -8
+; SSE-32-NEXT: movl %esp, %ebp
+; SSE-32-NEXT: .cfi_def_cfa_register %ebp
+; SSE-32-NEXT: andl $-8, %esp
+; SSE-32-NEXT: subl $32, %esp
+; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
+; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
+; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-32-NEXT: movd %xmm1, %eax
+; SSE-32-NEXT: shrl $31, %eax
+; SSE-32-NEXT: fildll {{[0-9]+}}(%esp)
+; SSE-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; SSE-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; SSE-32-NEXT: wait
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-32-NEXT: movd %xmm0, %eax
+; SSE-32-NEXT: shrl $31, %eax
+; SSE-32-NEXT: fildll {{[0-9]+}}(%esp)
+; SSE-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; SSE-32-NEXT: fstpl (%esp)
+; SSE-32-NEXT: wait
+; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE-32-NEXT: movl %ebp, %esp
+; SSE-32-NEXT: popl %ebp
+; SSE-32-NEXT: .cfi_def_cfa %esp, 4
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: uitofp_v2i64_v2f64:
; SSE-64: # %bb.0:
-; SSE-64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSE-64-NEXT: pand %xmm0, %xmm1
-; SSE-64-NEXT: por {{.*}}(%rip), %xmm1
-; SSE-64-NEXT: psrlq $32, %xmm0
-; SSE-64-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-64-NEXT: subpd {{.*}}(%rip), %xmm0
-; SSE-64-NEXT: addpd %xmm1, %xmm0
+; SSE-64-NEXT: movdqa %xmm0, %xmm1
+; SSE-64-NEXT: movq %xmm0, %rax
+; SSE-64-NEXT: movq %rax, %rcx
+; SSE-64-NEXT: shrq %rcx
+; SSE-64-NEXT: movl %eax, %edx
+; SSE-64-NEXT: andl $1, %edx
+; SSE-64-NEXT: orq %rcx, %rdx
+; SSE-64-NEXT: testq %rax, %rax
+; SSE-64-NEXT: cmovnsq %rax, %rdx
+; SSE-64-NEXT: xorps %xmm0, %xmm0
+; SSE-64-NEXT: cvtsi2sd %rdx, %xmm0
+; SSE-64-NEXT: jns .LBB21_2
+; SSE-64-NEXT: # %bb.1:
+; SSE-64-NEXT: addsd %xmm0, %xmm0
+; SSE-64-NEXT: .LBB21_2:
+; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-64-NEXT: movq %xmm1, %rax
+; SSE-64-NEXT: movq %rax, %rcx
+; SSE-64-NEXT: shrq %rcx
+; SSE-64-NEXT: movl %eax, %edx
+; SSE-64-NEXT: andl $1, %edx
+; SSE-64-NEXT: orq %rcx, %rdx
+; SSE-64-NEXT: testq %rax, %rax
+; SSE-64-NEXT: cmovnsq %rax, %rdx
+; SSE-64-NEXT: xorps %xmm1, %xmm1
+; SSE-64-NEXT: cvtsi2sd %rdx, %xmm1
+; SSE-64-NEXT: jns .LBB21_4
+; SSE-64-NEXT: # %bb.3:
+; SSE-64-NEXT: addsd %xmm1, %xmm1
+; SSE-64-NEXT: .LBB21_4:
+; SSE-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq
;
; SSE41-32-LABEL: uitofp_v2i64_v2f64:
; SSE41-32: # %bb.0:
-; SSE41-32-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
-; SSE41-32-NEXT: pand %xmm0, %xmm1
-; SSE41-32-NEXT: por {{\.LCPI.*}}, %xmm1
-; SSE41-32-NEXT: psrlq $32, %xmm0
-; SSE41-32-NEXT: por {{\.LCPI.*}}, %xmm0
-; SSE41-32-NEXT: subpd {{\.LCPI.*}}, %xmm0
-; SSE41-32-NEXT: addpd %xmm1, %xmm0
+; SSE41-32-NEXT: pushl %ebp
+; SSE41-32-NEXT: .cfi_def_cfa_offset 8
+; SSE41-32-NEXT: .cfi_offset %ebp, -8
+; SSE41-32-NEXT: movl %esp, %ebp
+; SSE41-32-NEXT: .cfi_def_cfa_register %ebp
+; SSE41-32-NEXT: andl $-8, %esp
+; SSE41-32-NEXT: subl $32, %esp
+; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE41-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE41-32-NEXT: movd %xmm1, %eax
+; SSE41-32-NEXT: shrl $31, %eax
+; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp)
+; SSE41-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; SSE41-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; SSE41-32-NEXT: wait
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE41-32-NEXT: movd %xmm0, %eax
+; SSE41-32-NEXT: shrl $31, %eax
+; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp)
+; SSE41-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; SSE41-32-NEXT: fstpl (%esp)
+; SSE41-32-NEXT: wait
+; SSE41-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE41-32-NEXT: movl %ebp, %esp
+; SSE41-32-NEXT: popl %ebp
+; SSE41-32-NEXT: .cfi_def_cfa %esp, 4
; SSE41-32-NEXT: retl
;
; SSE41-64-LABEL: uitofp_v2i64_v2f64:
; SSE41-64: # %bb.0:
-; SSE41-64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSE41-64-NEXT: pand %xmm0, %xmm1
-; SSE41-64-NEXT: por {{.*}}(%rip), %xmm1
-; SSE41-64-NEXT: psrlq $32, %xmm0
-; SSE41-64-NEXT: por {{.*}}(%rip), %xmm0
-; SSE41-64-NEXT: subpd {{.*}}(%rip), %xmm0
-; SSE41-64-NEXT: addpd %xmm1, %xmm0
+; SSE41-64-NEXT: movdqa %xmm0, %xmm1
+; SSE41-64-NEXT: movq %xmm0, %rax
+; SSE41-64-NEXT: movq %rax, %rcx
+; SSE41-64-NEXT: shrq %rcx
+; SSE41-64-NEXT: movl %eax, %edx
+; SSE41-64-NEXT: andl $1, %edx
+; SSE41-64-NEXT: orq %rcx, %rdx
+; SSE41-64-NEXT: testq %rax, %rax
+; SSE41-64-NEXT: cmovnsq %rax, %rdx
+; SSE41-64-NEXT: xorps %xmm0, %xmm0
+; SSE41-64-NEXT: cvtsi2sd %rdx, %xmm0
+; SSE41-64-NEXT: jns .LBB21_2
+; SSE41-64-NEXT: # %bb.1:
+; SSE41-64-NEXT: addsd %xmm0, %xmm0
+; SSE41-64-NEXT: .LBB21_2:
+; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE41-64-NEXT: movq %xmm1, %rax
+; SSE41-64-NEXT: movq %rax, %rcx
+; SSE41-64-NEXT: shrq %rcx
+; SSE41-64-NEXT: movl %eax, %edx
+; SSE41-64-NEXT: andl $1, %edx
+; SSE41-64-NEXT: orq %rcx, %rdx
+; SSE41-64-NEXT: testq %rax, %rax
+; SSE41-64-NEXT: cmovnsq %rax, %rdx
+; SSE41-64-NEXT: xorps %xmm1, %xmm1
+; SSE41-64-NEXT: cvtsi2sd %rdx, %xmm1
+; SSE41-64-NEXT: jns .LBB21_4
+; SSE41-64-NEXT: # %bb.3:
+; SSE41-64-NEXT: addsd %xmm1, %xmm1
+; SSE41-64-NEXT: .LBB21_4:
+; SSE41-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-64-NEXT: retq
;
-; AVX1-32-LABEL: uitofp_v2i64_v2f64:
-; AVX1-32: # %bb.0:
-; AVX1-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-32-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1
-; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX1-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX1-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX1-32-NEXT: retl
+; AVX-32-LABEL: uitofp_v2i64_v2f64:
+; AVX-32: # %bb.0:
+; AVX-32-NEXT: pushl %ebp
+; AVX-32-NEXT: .cfi_def_cfa_offset 8
+; AVX-32-NEXT: .cfi_offset %ebp, -8
+; AVX-32-NEXT: movl %esp, %ebp
+; AVX-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX-32-NEXT: andl $-8, %esp
+; AVX-32-NEXT: subl $32, %esp
+; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX-32-NEXT: shrl $31, %eax
+; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX-32-NEXT: wait
+; AVX-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX-32-NEXT: shrl $31, %eax
+; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; AVX-32-NEXT: fstpl (%esp)
+; AVX-32-NEXT: wait
+; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX-32-NEXT: movl %ebp, %esp
+; AVX-32-NEXT: popl %ebp
+; AVX-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX-32-NEXT: retl
;
; AVX1-64-LABEL: uitofp_v2i64_v2f64:
; AVX1-64: # %bb.0:
-; AVX1-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-64-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-64-NEXT: movq %rax, %rcx
+; AVX1-64-NEXT: shrq %rcx
+; AVX1-64-NEXT: movl %eax, %edx
+; AVX1-64-NEXT: andl $1, %edx
+; AVX1-64-NEXT: orq %rcx, %rdx
+; AVX1-64-NEXT: testq %rax, %rax
+; AVX1-64-NEXT: cmovnsq %rax, %rdx
+; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1
+; AVX1-64-NEXT: jns .LBB21_2
+; AVX1-64-NEXT: # %bb.1:
+; AVX1-64-NEXT: vaddsd %xmm1, %xmm1, %xmm1
+; AVX1-64-NEXT: .LBB21_2:
+; AVX1-64-NEXT: vmovq %xmm0, %rax
+; AVX1-64-NEXT: movq %rax, %rcx
+; AVX1-64-NEXT: shrq %rcx
+; AVX1-64-NEXT: movl %eax, %edx
+; AVX1-64-NEXT: andl $1, %edx
+; AVX1-64-NEXT: orq %rcx, %rdx
+; AVX1-64-NEXT: testq %rax, %rax
+; AVX1-64-NEXT: cmovnsq %rax, %rdx
+; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm0
+; AVX1-64-NEXT: jns .LBB21_4
+; AVX1-64-NEXT: # %bb.3:
+; AVX1-64-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; AVX1-64-NEXT: .LBB21_4:
+; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-64-NEXT: retq
;
-; AVX512F-32-LABEL: uitofp_v2i64_v2f64:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-32-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1
-; AVX512F-32-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512F-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512F-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX512F-32-NEXT: retl
-;
; AVX512F-64-LABEL: uitofp_v2i64_v2f64:
; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-64-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1
+; AVX512F-64-NEXT: vmovq %xmm0, %rax
+; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0
+; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-64-NEXT: retq
;
-; AVX512VL-32-LABEL: uitofp_v2i64_v2f64:
-; AVX512VL-32: # %bb.0:
-; AVX512VL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-32-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1
-; AVX512VL-32-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512VL-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512VL-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX512VL-32-NEXT: retl
-;
; AVX512VL-64-LABEL: uitofp_v2i64_v2f64:
; AVX512VL-64: # %bb.0:
-; AVX512VL-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-64-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-64-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512VL-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1
+; AVX512VL-64-NEXT: vmovq %xmm0, %rax
+; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0
+; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-64-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_v2i64_v2f64:
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index 00be3baedac8..82af829506bd 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -748,106 +748,154 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
}
define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
-; AVX1-32-LABEL: uitofp_v4i64_v4f64:
-; AVX1-32: # %bb.0:
-; AVX1-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-32-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-32-NEXT: vorps {{\.LCPI.*}}, %ymm1, %ymm1
-; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-32-NEXT: vorpd {{\.LCPI.*}}, %ymm0, %ymm0
-; AVX1-32-NEXT: vsubpd {{\.LCPI.*}}, %ymm0, %ymm0
-; AVX1-32-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX1-32-NEXT: retl
+; AVX-32-LABEL: uitofp_v4i64_v4f64:
+; AVX-32: # %bb.0:
+; AVX-32-NEXT: pushl %ebp
+; AVX-32-NEXT: .cfi_def_cfa_offset 8
+; AVX-32-NEXT: .cfi_offset %ebp, -8
+; AVX-32-NEXT: movl %esp, %ebp
+; AVX-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX-32-NEXT: andl $-8, %esp
+; AVX-32-NEXT: subl $64, %esp
+; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX-32-NEXT: shrl $31, %eax
+; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; AVX-32-NEXT: fstpl (%esp)
+; AVX-32-NEXT: wait
+; AVX-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX-32-NEXT: shrl $31, %eax
+; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX-32-NEXT: wait
+; AVX-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX-32-NEXT: shrl $31, %eax
+; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX-32-NEXT: wait
+; AVX-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX-32-NEXT: shrl $31, %eax
+; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX-32-NEXT: wait
+; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-32-NEXT: movl %ebp, %esp
+; AVX-32-NEXT: popl %ebp
+; AVX-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX-32-NEXT: retl
;
; AVX1-64-LABEL: uitofp_v4i64_v4f64:
; AVX1-64: # %bb.0:
; AVX1-64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-64-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-64-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-64-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
+; AVX1-64-NEXT: vmovq %xmm2, %rax
+; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
+; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX1-64-NEXT: vmovq %xmm1, %rax
+; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
+; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-64-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-64-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX1-64-NEXT: vmovq %xmm2, %rax
+; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
+; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-64-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-64-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX1-64-NEXT: vmovq %xmm0, %rax
+; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
+; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-64-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-64-NEXT: retq
;
-; AVX2-32-LABEL: uitofp_v4i64_v4f64:
-; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpsrlq $32, %ymm0, %ymm1
-; AVX2-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1
-; AVX2-32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX2-32-NEXT: vsubpd %ymm2, %ymm1, %ymm1
-; AVX2-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX2-32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX2-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0
-; AVX2-32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX2-32-NEXT: retl
-;
; AVX2-64-LABEL: uitofp_v4i64_v4f64:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX2-64-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX2-64-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX2-64-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX2-64-NEXT: vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-64-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
+; AVX2-64-NEXT: vmovq %xmm2, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
+; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX2-64-NEXT: vmovq %xmm1, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
+; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9]
+; AVX2-64-NEXT: vmulpd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; AVX2-64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-64-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX2-64-NEXT: vmovq %xmm2, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
+; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX2-64-NEXT: vmovq %xmm0, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
+; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX2-64-NEXT: retq
;
-; AVX512F-32-LABEL: uitofp_v4i64_v4f64:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vpsrlq $32, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1
-; AVX512F-32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX512F-32-NEXT: vsubpd %ymm2, %ymm1, %ymm1
-; AVX512F-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0
-; AVX512F-32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: retl
-;
; AVX512F-64-LABEL: uitofp_v4i64_v4f64:
; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX512F-64-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-64-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX512F-64-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX512F-64-NEXT: vsubpd %ymm2, %ymm0, %ymm0
-; AVX512F-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-64-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
+; AVX512F-64-NEXT: vmovq %xmm1, %rax
+; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1
+; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2
+; AVX512F-64-NEXT: vmovq %xmm0, %rax
+; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0
+; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-64-NEXT: retq
;
-; AVX512VL-32-LABEL: uitofp_v4i64_v4f64:
-; AVX512VL-32: # %bb.0:
-; AVX512VL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1
-; AVX512VL-32-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0
-; AVX512VL-32-NEXT: vsubpd {{\.LCPI.*}}{1to4}, %ymm0, %ymm0
-; AVX512VL-32-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX512VL-32-NEXT: retl
-;
; AVX512VL-64-LABEL: uitofp_v4i64_v4f64:
; AVX512VL-64: # %bb.0:
-; AVX512VL-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512VL-64-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
-; AVX512VL-64-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512VL-64-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-64-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-64-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
+; AVX512VL-64-NEXT: vmovq %xmm1, %rax
+; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1
+; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2
+; AVX512VL-64-NEXT: vmovq %xmm0, %rax
+; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0
+; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-64-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_v4i64_v4f64:
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
index 328f3c15fc48..00be17a4d162 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
@@ -362,22 +362,120 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 {
define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
; NODQ-32-LABEL: uitofp_v8i64_v8f64:
; NODQ-32: # %bb.0:
-; NODQ-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200]
-; NODQ-32-NEXT: vpternlogq $248, {{\.LCPI.*}}, %zmm0, %zmm1
-; NODQ-32-NEXT: vpsrlq $32, %zmm0, %zmm0
-; NODQ-32-NEXT: vporq {{\.LCPI.*}}, %zmm0, %zmm0
-; NODQ-32-NEXT: vsubpd {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
-; NODQ-32-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; NODQ-32-NEXT: pushl %ebp
+; NODQ-32-NEXT: .cfi_def_cfa_offset 8
+; NODQ-32-NEXT: .cfi_offset %ebp, -8
+; NODQ-32-NEXT: movl %esp, %ebp
+; NODQ-32-NEXT: .cfi_def_cfa_register %ebp
+; NODQ-32-NEXT: andl $-8, %esp
+; NODQ-32-NEXT: subl $128, %esp
+; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2
+; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3
+; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextractps $1, %xmm2, %eax
+; NODQ-32-NEXT: shrl $31, %eax
+; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: wait
+; NODQ-32-NEXT: vextractps $3, %xmm2, %eax
+; NODQ-32-NEXT: shrl $31, %eax
+; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: wait
+; NODQ-32-NEXT: vextractps $1, %xmm3, %eax
+; NODQ-32-NEXT: shrl $31, %eax
+; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: wait
+; NODQ-32-NEXT: vextractps $3, %xmm3, %eax
+; NODQ-32-NEXT: shrl $31, %eax
+; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: wait
+; NODQ-32-NEXT: vextractps $1, %xmm0, %eax
+; NODQ-32-NEXT: shrl $31, %eax
+; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; NODQ-32-NEXT: fstpl (%esp)
+; NODQ-32-NEXT: wait
+; NODQ-32-NEXT: vextractps $3, %xmm0, %eax
+; NODQ-32-NEXT: shrl $31, %eax
+; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: wait
+; NODQ-32-NEXT: vextractps $1, %xmm1, %eax
+; NODQ-32-NEXT: shrl $31, %eax
+; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: wait
+; NODQ-32-NEXT: vextractps $3, %xmm1, %eax
+; NODQ-32-NEXT: shrl $31, %eax
+; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: wait
+; NODQ-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; NODQ-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; NODQ-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; NODQ-32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; NODQ-32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NODQ-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; NODQ-32-NEXT: movl %ebp, %esp
+; NODQ-32-NEXT: popl %ebp
+; NODQ-32-NEXT: .cfi_def_cfa %esp, 4
; NODQ-32-NEXT: retl
;
; NODQ-64-LABEL: uitofp_v8i64_v8f64:
; NODQ-64: # %bb.0:
-; NODQ-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; NODQ-64-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; NODQ-64-NEXT: vpsrlq $32, %zmm0, %zmm0
-; NODQ-64-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; NODQ-64-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; NODQ-64-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-64-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
+; NODQ-64-NEXT: vmovq %xmm1, %rax
+; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1
+; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm3
+; NODQ-64-NEXT: vmovq %xmm2, %rax
+; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2
+; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm2
+; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3
+; NODQ-64-NEXT: vmovq %xmm2, %rax
+; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2
+; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3
+; NODQ-64-NEXT: vmovq %xmm0, %rax
+; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm0
+; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-64-NEXT: retq
;
; DQ-LABEL: uitofp_v8i64_v8f64:
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 1e3bc0c61c91..5f8b3b33bfdc 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6761,21 +6761,34 @@ entry:
define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rdi, %xmm1
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1
-; CHECK-NEXT: movapd %xmm1, %xmm0
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT: addpd %xmm1, %xmm0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: orq %rax, %rcx
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: cmovnsq %rdi, %rcx
+; CHECK-NEXT: cvtsi2sd %rcx, %xmm0
+; CHECK-NEXT: jns .LBB169_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addsd %xmm0, %xmm0
+; CHECK-NEXT: .LBB169_2: # %entry
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: testq %rdi, %rdi
+; AVX1-NEXT: cmovnsq %rdi, %rcx
+; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0
+; AVX1-NEXT: jns .LBB169_2
+; AVX1-NEXT: # %bb.1:
+; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: .LBB169_2: # %entry
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64:
@@ -6906,35 +6919,77 @@ entry:
define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: por {{.*}}(%rip), %xmm1
-; CHECK-NEXT: psrlq $32, %xmm0
-; CHECK-NEXT: por {{.*}}(%rip), %xmm0
-; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: addpd %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq %rcx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtsi2sd %rdx, %xmm0
+; CHECK-NEXT: jns .LBB173_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addsd %xmm0, %xmm0
+; CHECK-NEXT: .LBB173_2: # %entry
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq %rcx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: cvtsi2sd %rdx, %xmm1
+; CHECK-NEXT: jns .LBB173_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: addsd %xmm1, %xmm1
+; CHECK-NEXT: .LBB173_4: # %entry
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: cmovnsq %rax, %rdx
+; AVX1-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1
+; AVX1-NEXT: jns .LBB173_2
+; AVX1-NEXT: # %bb.1:
+; AVX1-NEXT: vaddsd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: .LBB173_2: # %entry
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: cmovnsq %rax, %rdx
+; AVX1-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm0
+; AVX1-NEXT: jns .LBB173_4
+; AVX1-NEXT: # %bb.3:
+; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: .LBB173_4: # %entry
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_uitofp_v2f64_v2i64:
@@ -7124,51 +7179,91 @@ entry:
define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rdi, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25]
-; CHECK-NEXT: subpd %xmm3, %xmm1
-; CHECK-NEXT: movapd %xmm1, %xmm0
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT: addpd %xmm1, %xmm0
-; CHECK-NEXT: movq %rsi, %xmm4
-; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-NEXT: subpd %xmm3, %xmm4
-; CHECK-NEXT: movapd %xmm4, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; CHECK-NEXT: addpd %xmm4, %xmm1
-; CHECK-NEXT: movq %rdx, %xmm4
-; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-NEXT: subpd %xmm3, %xmm4
-; CHECK-NEXT: movapd %xmm4, %xmm2
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; CHECK-NEXT: addpd %xmm4, %xmm2
-; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: orq %rax, %rcx
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: cmovnsq %rdi, %rcx
+; CHECK-NEXT: cvtsi2sd %rcx, %xmm0
+; CHECK-NEXT: jns .LBB177_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addsd %xmm0, %xmm0
+; CHECK-NEXT: .LBB177_2: # %entry
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: orq %rax, %rcx
+; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: cmovnsq %rsi, %rcx
+; CHECK-NEXT: cvtsi2sd %rcx, %xmm1
+; CHECK-NEXT: jns .LBB177_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: addsd %xmm1, %xmm1
+; CHECK-NEXT: .LBB177_4: # %entry
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: orq %rax, %rcx
+; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: cmovnsq %rdx, %rcx
+; CHECK-NEXT: cvtsi2sd %rcx, %xmm2
+; CHECK-NEXT: jns .LBB177_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: addsd %xmm2, %xmm2
+; CHECK-NEXT: .LBB177_6: # %entry
+; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25]
-; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0]
-; AVX1-NEXT: vaddpd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: cmovnsq %rax, %rdx
+; AVX1-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1
+; AVX1-NEXT: jns .LBB177_2
+; AVX1-NEXT: # %bb.1:
+; AVX1-NEXT: vaddsd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: .LBB177_2: # %entry
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: cmovnsq %rax, %rdx
+; AVX1-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm2
+; AVX1-NEXT: jns .LBB177_4
+; AVX1-NEXT: # %bb.3:
+; AVX1-NEXT: vaddsd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB177_4: # %entry
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: cmovnsq %rax, %rdx
+; AVX1-NEXT: vcvtsi2sd %rdx, %xmm3, %xmm0
+; AVX1-NEXT: jns .LBB177_6
+; AVX1-NEXT: # %bb.5:
+; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: .LBB177_6: # %entry
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64:
@@ -7381,51 +7476,117 @@ entry:
define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; CHECK-NEXT: movdqa %xmm1, %xmm3
-; CHECK-NEXT: pand %xmm2, %xmm3
-; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; CHECK-NEXT: por %xmm4, %xmm3
-; CHECK-NEXT: psrlq $32, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; CHECK-NEXT: por %xmm5, %xmm1
-; CHECK-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; CHECK-NEXT: subpd %xmm6, %xmm1
-; CHECK-NEXT: addpd %xmm3, %xmm1
-; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: por %xmm4, %xmm2
-; CHECK-NEXT: psrlq $32, %xmm0
-; CHECK-NEXT: por %xmm5, %xmm0
-; CHECK-NEXT: subpd %xmm6, %xmm0
-; CHECK-NEXT: addpd %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq %rcx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtsi2sd %rdx, %xmm0
+; CHECK-NEXT: jns .LBB181_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addsd %xmm0, %xmm0
+; CHECK-NEXT: .LBB181_2: # %entry
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; CHECK-NEXT: movq %xmm2, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq %rcx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: cvtsi2sd %rdx, %xmm3
+; CHECK-NEXT: jns .LBB181_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: addsd %xmm3, %xmm3
+; CHECK-NEXT: .LBB181_4: # %entry
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq %rcx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: cvtsi2sd %rdx, %xmm2
+; CHECK-NEXT: jns .LBB181_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: addsd %xmm2, %xmm2
+; CHECK-NEXT: .LBB181_6: # %entry
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrq %rcx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: cvtsi2sd %rdx, %xmm1
+; CHECK-NEXT: jns .LBB181_8
+; CHECK-NEXT: # %bb.7:
+; CHECK-NEXT: addsd %xmm1, %xmm1
+; CHECK-NEXT: .LBB181_8: # %entry
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; CHECK-NEXT: movapd %xmm2, %xmm1
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
+; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_uitofp_v4f64_v4i64:
More information about the llvm-commits
mailing list