[llvm] f40925a - [X86] Improve lowering of fptoui
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 7 23:50:37 PST 2020
Author: Craig Topper
Date: 2020-11-07T23:50:03-08:00
New Revision: f40925aa8b3d200b4616eb0b8a772a1812800b21
URL: https://github.com/llvm/llvm-project/commit/f40925aa8b3d200b4616eb0b8a772a1812800b21
DIFF: https://github.com/llvm/llvm-project/commit/f40925aa8b3d200b4616eb0b8a772a1812800b21.diff
LOG: [X86] Improve lowering of fptoui
Invert the select condition when masking in the sign bit of a fptoui operation. Also, rather than lowering the sign mask to select/xor and expecting the select to get cleaned up later, directly lower to shift/xor.
Patch by Layton Kifer!
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D90658
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/fp-cvt.ll
llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
llvm/test/CodeGen/X86/fp-intrinsics.ll
llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
llvm/test/CodeGen/X86/fp80-strict-scalar.ll
llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2c5175e0543e..269ef229b018 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20407,8 +20407,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
- // Adjust = (Value < Thresh) ? 0 : 0x80000000;
- // FltOfs = (Value < Thresh) ? 0 : 0x80000000;
+ // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
+ // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
// FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
@@ -20438,20 +20438,30 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
*DAG.getContext(), TheVT);
SDValue Cmp;
if (IsStrict) {
- Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
- Chain, /*IsSignaling*/ true);
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
+ /*IsSignaling*/ true);
Chain = Cmp.getValue(1);
} else {
- Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
}
- Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
- DAG.getConstant(0, DL, MVT::i64),
- DAG.getConstant(APInt::getSignMask(64),
- DL, MVT::i64));
- SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
- DAG.getConstantFP(0.0, DL, TheVT),
- ThreshVal);
+ // Our preferred lowering of
+ //
+ // (Value >= Thresh) ? 0x8000000000000000ULL : 0
+ //
+ // is
+ //
+ // (Value >= Thresh) << 63
+ //
+ // but since we can get here after LegalOperations, DAGCombine might do the
+ // wrong thing if we create a select. So, directly create the preferred
+ // version.
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
+ SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
+ Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
+
+ SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
+ DAG.getConstantFP(0.0, DL, TheVT));
if (IsStrict) {
Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
diff --git a/llvm/test/CodeGen/X86/fp-cvt.ll b/llvm/test/CodeGen/X86/fp-cvt.ll
index cedbfd2e9bff..cb438727cfba 100644
--- a/llvm/test/CodeGen/X86/fp-cvt.ll
+++ b/llvm/test/CodeGen/X86/fp-cvt.ll
@@ -451,13 +451,12 @@ define i64 @fptoui_i64_fp80(x86_fp80 %a0) nounwind {
; X86-NEXT: sahf
; X86-NEXT: setbe %al
; X86-NEXT: fldz
-; X86-NEXT: ja .LBB10_2
+; X86-NEXT: jbe .LBB10_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: fstp %st(0)
+; X86-NEXT: fstp %st(1)
; X86-NEXT: fldz
-; X86-NEXT: fxch %st(1)
; X86-NEXT: .LBB10_2:
-; X86-NEXT: fstp %st(1)
+; X86-NEXT: fstp %st(0)
; X86-NEXT: fsubrp %st, %st(1)
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
@@ -482,8 +481,7 @@ define i64 @fptoui_i64_fp80(x86_fp80 %a0) nounwind {
; X64-X87-NEXT: fucomi %st(1), %st
; X64-X87-NEXT: setbe %al
; X64-X87-NEXT: fldz
-; X64-X87-NEXT: fxch %st(1)
-; X64-X87-NEXT: fcmovnbe %st(1), %st
+; X64-X87-NEXT: fcmovbe %st(1), %st
; X64-X87-NEXT: fstp %st(1)
; X64-X87-NEXT: fsubrp %st, %st(1)
; X64-X87-NEXT: fnstcw -{{[0-9]+}}(%rsp)
@@ -504,8 +502,7 @@ define i64 @fptoui_i64_fp80(x86_fp80 %a0) nounwind {
; X64-SSSE3-NEXT: xorl %eax, %eax
; X64-SSSE3-NEXT: fucomi %st(1), %st
; X64-SSSE3-NEXT: fldz
-; X64-SSSE3-NEXT: fxch %st(1)
-; X64-SSSE3-NEXT: fcmovnbe %st(1), %st
+; X64-SSSE3-NEXT: fcmovbe %st(1), %st
; X64-SSSE3-NEXT: fstp %st(1)
; X64-SSSE3-NEXT: fsubrp %st, %st(1)
; X64-SSSE3-NEXT: fisttpll -{{[0-9]+}}(%rsp)
@@ -534,13 +531,12 @@ define i64 @fptoui_i64_fp80_ld(x86_fp80 *%a0) nounwind {
; X86-NEXT: sahf
; X86-NEXT: setbe %al
; X86-NEXT: fldz
-; X86-NEXT: ja .LBB11_2
+; X86-NEXT: jbe .LBB11_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: fstp %st(0)
+; X86-NEXT: fstp %st(1)
; X86-NEXT: fldz
-; X86-NEXT: fxch %st(1)
; X86-NEXT: .LBB11_2:
-; X86-NEXT: fstp %st(1)
+; X86-NEXT: fstp %st(0)
; X86-NEXT: fsubrp %st, %st(1)
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
@@ -565,8 +561,7 @@ define i64 @fptoui_i64_fp80_ld(x86_fp80 *%a0) nounwind {
; X64-X87-NEXT: fucomi %st(1), %st
; X64-X87-NEXT: setbe %al
; X64-X87-NEXT: fldz
-; X64-X87-NEXT: fxch %st(1)
-; X64-X87-NEXT: fcmovnbe %st(1), %st
+; X64-X87-NEXT: fcmovbe %st(1), %st
; X64-X87-NEXT: fstp %st(1)
; X64-X87-NEXT: fsubrp %st, %st(1)
; X64-X87-NEXT: fnstcw -{{[0-9]+}}(%rsp)
@@ -587,8 +582,7 @@ define i64 @fptoui_i64_fp80_ld(x86_fp80 *%a0) nounwind {
; X64-SSSE3-NEXT: xorl %eax, %eax
; X64-SSSE3-NEXT: fucomi %st(1), %st
; X64-SSSE3-NEXT: fldz
-; X64-SSSE3-NEXT: fxch %st(1)
-; X64-SSSE3-NEXT: fcmovnbe %st(1), %st
+; X64-SSSE3-NEXT: fcmovbe %st(1), %st
; X64-SSSE3-NEXT: fstp %st(1)
; X64-SSSE3-NEXT: fsubrp %st, %st(1)
; X64-SSSE3-NEXT: fisttpll -{{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
index 4f2859d4bffa..52278a6369eb 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
@@ -32,7 +32,7 @@ entry:
; CHECK: COMISDrr [[MOVSDrm_alt1]], [[MOVSDrm_alt]], implicit-def $eflags, implicit $mxcsr
; CHECK: [[FsFLD0SD:%[0-9]+]]:fr64 = FsFLD0SD
; CHECK: JCC_1
-; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[MOVSDrm_alt1]], {{.*}}, [[FsFLD0SD]], {{.*}}
+; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[FsFLD0SD]], {{.*}}, [[MOVSDrm_alt1]], {{.*}}
; CHECK: [[SUBSDrr:%[0-9]+]]:fr64 = SUBSDrr [[MOVSDrm_alt]], killed [[PHI]], implicit $mxcsr
; CHECK: MOVSDmr %stack.0, 1, $noreg, 0, $noreg, killed [[SUBSDrr]] :: (store 8 into %stack.0)
; CHECK: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 6, implicit $eflags
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 8cd021ad54e3..abe88f1ca233 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -1363,8 +1363,7 @@ define i64 @f20u64(double %x) #0 {
; X87-NEXT: wait
; X87-NEXT: setbe %dl
; X87-NEXT: fldz
-; X87-NEXT: fxch %st(1)
-; X87-NEXT: fcmovnbe %st(1), %st
+; X87-NEXT: fcmovbe %st(1), %st
; X87-NEXT: fstp %st(1)
; X87-NEXT: fsubrp %st, %st(1)
; X87-NEXT: wait
@@ -1387,12 +1386,11 @@ define i64 @f20u64(double %x) #0 {
; X86-SSE-NEXT: subl $20, %esp
; X86-SSE-NEXT: .cfi_def_cfa_offset 24
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; X86-SSE-NEXT: comisd %xmm0, %xmm2
-; X86-SSE-NEXT: xorpd %xmm1, %xmm1
-; X86-SSE-NEXT: ja .LBB25_2
+; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: comisd %xmm0, %xmm1
+; X86-SSE-NEXT: jbe .LBB25_2
; X86-SSE-NEXT: # %bb.1: # %entry
-; X86-SSE-NEXT: movapd %xmm2, %xmm1
+; X86-SSE-NEXT: xorpd %xmm1, %xmm1
; X86-SSE-NEXT: .LBB25_2: # %entry
; X86-SSE-NEXT: subsd %xmm1, %xmm0
; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
index e030a9159710..156ee617e72a 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
@@ -543,12 +543,11 @@ define i64 @fptoui_f32toi64(float %x) #0 {
; SSE-X86-NEXT: andl $-8, %esp
; SSE-X86-NEXT: subl $16, %esp
; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-X86-NEXT: comiss %xmm0, %xmm2
-; SSE-X86-NEXT: xorps %xmm1, %xmm1
-; SSE-X86-NEXT: ja .LBB9_2
+; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-X86-NEXT: comiss %xmm0, %xmm1
+; SSE-X86-NEXT: jbe .LBB9_2
; SSE-X86-NEXT: # %bb.1:
-; SSE-X86-NEXT: movaps %xmm2, %xmm1
+; SSE-X86-NEXT: xorps %xmm1, %xmm1
; SSE-X86-NEXT: .LBB9_2:
; SSE-X86-NEXT: subss %xmm1, %xmm0
; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
@@ -600,12 +599,11 @@ define i64 @fptoui_f32toi64(float %x) #0 {
; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-X86-NEXT: vcomiss %xmm0, %xmm1
-; AVX1-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-X86-NEXT: ja .LBB9_2
+; AVX1-X86-NEXT: jbe .LBB9_2
; AVX1-X86-NEXT: # %bb.1:
-; AVX1-X86-NEXT: vmovaps %xmm1, %xmm2
+; AVX1-X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-X86-NEXT: .LBB9_2:
-; AVX1-X86-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX1-X86-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX1-X86-NEXT: vmovss %xmm0, (%esp)
; AVX1-X86-NEXT: flds (%esp)
; AVX1-X86-NEXT: fisttpll (%esp)
@@ -650,16 +648,14 @@ define i64 @fptoui_f32toi64(float %x) #0 {
; AVX512-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512-X86-NEXT: xorl %edx, %edx
; AVX512-X86-NEXT: vcomiss %xmm0, %xmm1
-; AVX512-X86-NEXT: seta %al
-; AVX512-X86-NEXT: kmovw %eax, %k1
-; AVX512-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-X86-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-X86-NEXT: setbe %dl
+; AVX512-X86-NEXT: kmovw %edx, %k1
+; AVX512-X86-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
; AVX512-X86-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512-X86-NEXT: vmovss %xmm0, (%esp)
; AVX512-X86-NEXT: flds (%esp)
; AVX512-X86-NEXT: fisttpll (%esp)
; AVX512-X86-NEXT: wait
-; AVX512-X86-NEXT: setbe %dl
; AVX512-X86-NEXT: shll $31, %edx
; AVX512-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512-X86-NEXT: movl (%esp), %eax
@@ -692,13 +688,12 @@ define i64 @fptoui_f32toi64(float %x) #0 {
; X87-NEXT: sahf
; X87-NEXT: setbe %al
; X87-NEXT: fldz
-; X87-NEXT: ja .LBB9_2
+; X87-NEXT: jbe .LBB9_2
; X87-NEXT: # %bb.1:
-; X87-NEXT: fstp %st(0)
+; X87-NEXT: fstp %st(1)
; X87-NEXT: fldz
-; X87-NEXT: fxch %st(1)
; X87-NEXT: .LBB9_2:
-; X87-NEXT: fstp %st(1)
+; X87-NEXT: fstp %st(0)
; X87-NEXT: fsubrp %st, %st(1)
; X87-NEXT: wait
; X87-NEXT: fnstcw {{[0-9]+}}(%esp)
@@ -1188,12 +1183,11 @@ define i64 @fptoui_f64toi64(double %x) #0 {
; SSE-X86-NEXT: andl $-8, %esp
; SSE-X86-NEXT: subl $16, %esp
; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-X86-NEXT: comisd %xmm0, %xmm2
-; SSE-X86-NEXT: xorpd %xmm1, %xmm1
-; SSE-X86-NEXT: ja .LBB18_2
+; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-X86-NEXT: comisd %xmm0, %xmm1
+; SSE-X86-NEXT: jbe .LBB18_2
; SSE-X86-NEXT: # %bb.1:
-; SSE-X86-NEXT: movapd %xmm2, %xmm1
+; SSE-X86-NEXT: xorpd %xmm1, %xmm1
; SSE-X86-NEXT: .LBB18_2:
; SSE-X86-NEXT: subsd %xmm1, %xmm0
; SSE-X86-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
@@ -1245,12 +1239,11 @@ define i64 @fptoui_f64toi64(double %x) #0 {
; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX1-X86-NEXT: vcomisd %xmm0, %xmm1
-; AVX1-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX1-X86-NEXT: ja .LBB18_2
+; AVX1-X86-NEXT: jbe .LBB18_2
; AVX1-X86-NEXT: # %bb.1:
-; AVX1-X86-NEXT: vmovapd %xmm1, %xmm2
+; AVX1-X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-X86-NEXT: .LBB18_2:
-; AVX1-X86-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; AVX1-X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX1-X86-NEXT: vmovsd %xmm0, (%esp)
; AVX1-X86-NEXT: fldl (%esp)
; AVX1-X86-NEXT: fisttpll (%esp)
@@ -1295,16 +1288,14 @@ define i64 @fptoui_f64toi64(double %x) #0 {
; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-X86-NEXT: xorl %edx, %edx
; AVX512-X86-NEXT: vcomisd %xmm0, %xmm1
-; AVX512-X86-NEXT: seta %al
-; AVX512-X86-NEXT: kmovw %eax, %k1
-; AVX512-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX512-X86-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-X86-NEXT: setbe %dl
+; AVX512-X86-NEXT: kmovw %edx, %k1
+; AVX512-X86-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
; AVX512-X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX512-X86-NEXT: vmovsd %xmm0, (%esp)
; AVX512-X86-NEXT: fldl (%esp)
; AVX512-X86-NEXT: fisttpll (%esp)
; AVX512-X86-NEXT: wait
-; AVX512-X86-NEXT: setbe %dl
; AVX512-X86-NEXT: shll $31, %edx
; AVX512-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512-X86-NEXT: movl (%esp), %eax
@@ -1337,13 +1328,12 @@ define i64 @fptoui_f64toi64(double %x) #0 {
; X87-NEXT: sahf
; X87-NEXT: setbe %al
; X87-NEXT: fldz
-; X87-NEXT: ja .LBB18_2
+; X87-NEXT: jbe .LBB18_2
; X87-NEXT: # %bb.1:
-; X87-NEXT: fstp %st(0)
+; X87-NEXT: fstp %st(1)
; X87-NEXT: fldz
-; X87-NEXT: fxch %st(1)
; X87-NEXT: .LBB18_2:
-; X87-NEXT: fstp %st(1)
+; X87-NEXT: fstp %st(0)
; X87-NEXT: fsubrp %st, %st(1)
; X87-NEXT: wait
; X87-NEXT: fnstcw {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
index c127a25d7ca9..e55e3903c0dc 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
@@ -597,13 +597,12 @@ define i64 @fp80_to_uint64(x86_fp80 %x) #0 {
; X86-NEXT: sahf
; X86-NEXT: setbe %al
; X86-NEXT: fldz
-; X86-NEXT: ja .LBB18_2
+; X86-NEXT: jbe .LBB18_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: fstp %st(0)
+; X86-NEXT: fstp %st(1)
; X86-NEXT: fldz
-; X86-NEXT: fxch %st(1)
; X86-NEXT: .LBB18_2:
-; X86-NEXT: fstp %st(1)
+; X86-NEXT: fstp %st(0)
; X86-NEXT: fsubrp %st, %st(1)
; X86-NEXT: wait
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
@@ -632,8 +631,7 @@ define i64 @fp80_to_uint64(x86_fp80 %x) #0 {
; X64-NEXT: wait
; X64-NEXT: setbe %al
; X64-NEXT: fldz
-; X64-NEXT: fxch %st(1)
-; X64-NEXT: fcmovnbe %st(1), %st
+; X64-NEXT: fcmovbe %st(1), %st
; X64-NEXT: fstp %st(1)
; X64-NEXT: fsubrp %st, %st(1)
; X64-NEXT: wait
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
index 2405292f4efc..41345a98ab4b 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -69,16 +69,15 @@ define i64 @f_to_u64(float %a) nounwind {
; X86-AVX512F-WIN-NEXT: subl $8, %esp
; X86-AVX512F-WIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512F-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX512F-WIN-NEXT: vcmpltss %xmm1, %xmm0, %k1
-; X86-AVX512F-WIN-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X86-AVX512F-WIN-NEXT: xorl %edx, %edx
; X86-AVX512F-WIN-NEXT: vucomiss %xmm0, %xmm1
-; X86-AVX512F-WIN-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; X86-AVX512F-WIN-NEXT: setbe %dl
+; X86-AVX512F-WIN-NEXT: kmovw %edx, %k1
+; X86-AVX512F-WIN-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
; X86-AVX512F-WIN-NEXT: vsubss %xmm1, %xmm0, %xmm0
; X86-AVX512F-WIN-NEXT: vmovss %xmm0, (%esp)
; X86-AVX512F-WIN-NEXT: flds (%esp)
; X86-AVX512F-WIN-NEXT: fisttpll (%esp)
-; X86-AVX512F-WIN-NEXT: setbe %dl
; X86-AVX512F-WIN-NEXT: shll $31, %edx
; X86-AVX512F-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-AVX512F-WIN-NEXT: movl (%esp), %eax
@@ -91,16 +90,15 @@ define i64 @f_to_u64(float %a) nounwind {
; X86-AVX512F-LIN-NEXT: subl $12, %esp
; X86-AVX512F-LIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512F-LIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX512F-LIN-NEXT: vcmpltss %xmm1, %xmm0, %k1
-; X86-AVX512F-LIN-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X86-AVX512F-LIN-NEXT: xorl %edx, %edx
; X86-AVX512F-LIN-NEXT: vucomiss %xmm0, %xmm1
-; X86-AVX512F-LIN-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; X86-AVX512F-LIN-NEXT: setbe %dl
+; X86-AVX512F-LIN-NEXT: kmovw %edx, %k1
+; X86-AVX512F-LIN-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
; X86-AVX512F-LIN-NEXT: vsubss %xmm1, %xmm0, %xmm0
; X86-AVX512F-LIN-NEXT: vmovss %xmm0, (%esp)
; X86-AVX512F-LIN-NEXT: flds (%esp)
; X86-AVX512F-LIN-NEXT: fisttpll (%esp)
-; X86-AVX512F-LIN-NEXT: setbe %dl
; X86-AVX512F-LIN-NEXT: shll $31, %edx
; X86-AVX512F-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-AVX512F-LIN-NEXT: movl (%esp), %eax
@@ -115,16 +113,17 @@ define i64 @f_to_u64(float %a) nounwind {
; X86-SSE3-WIN-NEXT: subl $8, %esp
; X86-SSE3-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE3-WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE3-WIN-NEXT: movaps %xmm0, %xmm2
-; X86-SSE3-WIN-NEXT: xorl %edx, %edx
; X86-SSE3-WIN-NEXT: ucomiss %xmm0, %xmm1
-; X86-SSE3-WIN-NEXT: cmpltss %xmm1, %xmm0
-; X86-SSE3-WIN-NEXT: andnps %xmm1, %xmm0
-; X86-SSE3-WIN-NEXT: subss %xmm0, %xmm2
-; X86-SSE3-WIN-NEXT: movss %xmm2, (%esp)
+; X86-SSE3-WIN-NEXT: jbe LBB0_2
+; X86-SSE3-WIN-NEXT: # %bb.1:
+; X86-SSE3-WIN-NEXT: xorps %xmm1, %xmm1
+; X86-SSE3-WIN-NEXT: LBB0_2:
+; X86-SSE3-WIN-NEXT: subss %xmm1, %xmm0
+; X86-SSE3-WIN-NEXT: movss %xmm0, (%esp)
; X86-SSE3-WIN-NEXT: flds (%esp)
; X86-SSE3-WIN-NEXT: fisttpll (%esp)
-; X86-SSE3-WIN-NEXT: setbe %dl
+; X86-SSE3-WIN-NEXT: setbe %al
+; X86-SSE3-WIN-NEXT: movzbl %al, %edx
; X86-SSE3-WIN-NEXT: shll $31, %edx
; X86-SSE3-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-SSE3-WIN-NEXT: movl (%esp), %eax
@@ -137,16 +136,17 @@ define i64 @f_to_u64(float %a) nounwind {
; X86-SSE3-LIN-NEXT: subl $12, %esp
; X86-SSE3-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE3-LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE3-LIN-NEXT: movaps %xmm0, %xmm2
-; X86-SSE3-LIN-NEXT: xorl %edx, %edx
; X86-SSE3-LIN-NEXT: ucomiss %xmm0, %xmm1
-; X86-SSE3-LIN-NEXT: cmpltss %xmm1, %xmm0
-; X86-SSE3-LIN-NEXT: andnps %xmm1, %xmm0
-; X86-SSE3-LIN-NEXT: subss %xmm0, %xmm2
-; X86-SSE3-LIN-NEXT: movss %xmm2, (%esp)
+; X86-SSE3-LIN-NEXT: jbe .LBB0_2
+; X86-SSE3-LIN-NEXT: # %bb.1:
+; X86-SSE3-LIN-NEXT: xorps %xmm1, %xmm1
+; X86-SSE3-LIN-NEXT: .LBB0_2:
+; X86-SSE3-LIN-NEXT: subss %xmm1, %xmm0
+; X86-SSE3-LIN-NEXT: movss %xmm0, (%esp)
; X86-SSE3-LIN-NEXT: flds (%esp)
; X86-SSE3-LIN-NEXT: fisttpll (%esp)
-; X86-SSE3-LIN-NEXT: setbe %dl
+; X86-SSE3-LIN-NEXT: setbe %al
+; X86-SSE3-LIN-NEXT: movzbl %al, %edx
; X86-SSE3-LIN-NEXT: shll $31, %edx
; X86-SSE3-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-SSE3-LIN-NEXT: movl (%esp), %eax
@@ -174,23 +174,23 @@ define i64 @f_to_u64(float %a) nounwind {
; X86-SSE2-WIN-NEXT: subl $16, %esp
; X86-SSE2-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE2-WIN-NEXT: movaps %xmm0, %xmm2
-; X86-SSE2-WIN-NEXT: cmpltss %xmm1, %xmm2
-; X86-SSE2-WIN-NEXT: andnps %xmm1, %xmm2
-; X86-SSE2-WIN-NEXT: movaps %xmm0, %xmm3
-; X86-SSE2-WIN-NEXT: subss %xmm2, %xmm3
-; X86-SSE2-WIN-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT: ucomiss %xmm0, %xmm1
+; X86-SSE2-WIN-NEXT: jbe LBB0_2
+; X86-SSE2-WIN-NEXT: # %bb.1:
+; X86-SSE2-WIN-NEXT: xorps %xmm1, %xmm1
+; X86-SSE2-WIN-NEXT: LBB0_2:
+; X86-SSE2-WIN-NEXT: subss %xmm1, %xmm0
+; X86-SSE2-WIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT: setbe %al
; X86-SSE2-WIN-NEXT: flds {{[0-9]+}}(%esp)
; X86-SSE2-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
-; X86-SSE2-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-WIN-NEXT: orl $3072, %eax # imm = 0xC00
-; X86-SSE2-WIN-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-WIN-NEXT: orl $3072, %ecx # imm = 0xC00
+; X86-SSE2-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
; X86-SSE2-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-SSE2-WIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X86-SSE2-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X86-SSE2-WIN-NEXT: xorl %edx, %edx
-; X86-SSE2-WIN-NEXT: ucomiss %xmm0, %xmm1
-; X86-SSE2-WIN-NEXT: setbe %dl
+; X86-SSE2-WIN-NEXT: movzbl %al, %edx
; X86-SSE2-WIN-NEXT: shll $31, %edx
; X86-SSE2-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-SSE2-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -203,23 +203,23 @@ define i64 @f_to_u64(float %a) nounwind {
; X86-SSE2-LIN-NEXT: subl $20, %esp
; X86-SSE2-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE2-LIN-NEXT: movaps %xmm0, %xmm2
-; X86-SSE2-LIN-NEXT: cmpltss %xmm1, %xmm2
-; X86-SSE2-LIN-NEXT: andnps %xmm1, %xmm2
-; X86-SSE2-LIN-NEXT: movaps %xmm0, %xmm3
-; X86-SSE2-LIN-NEXT: subss %xmm2, %xmm3
-; X86-SSE2-LIN-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT: ucomiss %xmm0, %xmm1
+; X86-SSE2-LIN-NEXT: jbe .LBB0_2
+; X86-SSE2-LIN-NEXT: # %bb.1:
+; X86-SSE2-LIN-NEXT: xorps %xmm1, %xmm1
+; X86-SSE2-LIN-NEXT: .LBB0_2:
+; X86-SSE2-LIN-NEXT: subss %xmm1, %xmm0
+; X86-SSE2-LIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT: setbe %al
; X86-SSE2-LIN-NEXT: flds {{[0-9]+}}(%esp)
; X86-SSE2-LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
-; X86-SSE2-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-LIN-NEXT: orl $3072, %eax # imm = 0xC00
-; X86-SSE2-LIN-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-LIN-NEXT: orl $3072, %ecx # imm = 0xC00
+; X86-SSE2-LIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
; X86-SSE2-LIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-SSE2-LIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X86-SSE2-LIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X86-SSE2-LIN-NEXT: xorl %edx, %edx
-; X86-SSE2-LIN-NEXT: ucomiss %xmm0, %xmm1
-; X86-SSE2-LIN-NEXT: setbe %dl
+; X86-SSE2-LIN-NEXT: movzbl %al, %edx
; X86-SSE2-LIN-NEXT: shll $31, %edx
; X86-SSE2-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-SSE2-LIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -241,13 +241,12 @@ define i64 @f_to_u64(float %a) nounwind {
; X87-WIN-NEXT: sahf
; X87-WIN-NEXT: setbe %al
; X87-WIN-NEXT: fldz
-; X87-WIN-NEXT: ja LBB0_2
+; X87-WIN-NEXT: jbe LBB0_2
; X87-WIN-NEXT: # %bb.1:
-; X87-WIN-NEXT: fstp %st(0)
+; X87-WIN-NEXT: fstp %st(1)
; X87-WIN-NEXT: fldz
-; X87-WIN-NEXT: fxch %st(1)
; X87-WIN-NEXT: LBB0_2:
-; X87-WIN-NEXT: fstp %st(1)
+; X87-WIN-NEXT: fstp %st(0)
; X87-WIN-NEXT: fsubrp %st, %st(1)
; X87-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
@@ -276,13 +275,12 @@ define i64 @f_to_u64(float %a) nounwind {
; X87-LIN-NEXT: sahf
; X87-LIN-NEXT: setbe %al
; X87-LIN-NEXT: fldz
-; X87-LIN-NEXT: ja .LBB0_2
+; X87-LIN-NEXT: jbe .LBB0_2
; X87-LIN-NEXT: # %bb.1:
-; X87-LIN-NEXT: fstp %st(0)
+; X87-LIN-NEXT: fstp %st(1)
; X87-LIN-NEXT: fldz
-; X87-LIN-NEXT: fxch %st(1)
; X87-LIN-NEXT: .LBB0_2:
-; X87-LIN-NEXT: fstp %st(1)
+; X87-LIN-NEXT: fstp %st(0)
; X87-LIN-NEXT: fsubrp %st, %st(1)
; X87-LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
@@ -495,16 +493,15 @@ define i64 @d_to_u64(double %a) nounwind {
; X86-AVX512F-WIN-NEXT: subl $8, %esp
; X86-AVX512F-WIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-WIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1
-; X86-AVX512F-WIN-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; X86-AVX512F-WIN-NEXT: xorl %edx, %edx
; X86-AVX512F-WIN-NEXT: vucomisd %xmm0, %xmm1
-; X86-AVX512F-WIN-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; X86-AVX512F-WIN-NEXT: setbe %dl
+; X86-AVX512F-WIN-NEXT: kmovw %edx, %k1
+; X86-AVX512F-WIN-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
; X86-AVX512F-WIN-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; X86-AVX512F-WIN-NEXT: vmovsd %xmm0, (%esp)
; X86-AVX512F-WIN-NEXT: fldl (%esp)
; X86-AVX512F-WIN-NEXT: fisttpll (%esp)
-; X86-AVX512F-WIN-NEXT: setbe %dl
; X86-AVX512F-WIN-NEXT: shll $31, %edx
; X86-AVX512F-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-AVX512F-WIN-NEXT: movl (%esp), %eax
@@ -517,16 +514,15 @@ define i64 @d_to_u64(double %a) nounwind {
; X86-AVX512F-LIN-NEXT: subl $12, %esp
; X86-AVX512F-LIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-LIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-LIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1
-; X86-AVX512F-LIN-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; X86-AVX512F-LIN-NEXT: xorl %edx, %edx
; X86-AVX512F-LIN-NEXT: vucomisd %xmm0, %xmm1
-; X86-AVX512F-LIN-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; X86-AVX512F-LIN-NEXT: setbe %dl
+; X86-AVX512F-LIN-NEXT: kmovw %edx, %k1
+; X86-AVX512F-LIN-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
; X86-AVX512F-LIN-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; X86-AVX512F-LIN-NEXT: vmovsd %xmm0, (%esp)
; X86-AVX512F-LIN-NEXT: fldl (%esp)
; X86-AVX512F-LIN-NEXT: fisttpll (%esp)
-; X86-AVX512F-LIN-NEXT: setbe %dl
; X86-AVX512F-LIN-NEXT: shll $31, %edx
; X86-AVX512F-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-AVX512F-LIN-NEXT: movl (%esp), %eax
@@ -541,16 +537,17 @@ define i64 @d_to_u64(double %a) nounwind {
; X86-SSE3-WIN-NEXT: subl $8, %esp
; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE3-WIN-NEXT: movapd %xmm0, %xmm2
-; X86-SSE3-WIN-NEXT: xorl %edx, %edx
; X86-SSE3-WIN-NEXT: ucomisd %xmm0, %xmm1
-; X86-SSE3-WIN-NEXT: cmpltsd %xmm1, %xmm0
-; X86-SSE3-WIN-NEXT: andnpd %xmm1, %xmm0
-; X86-SSE3-WIN-NEXT: subsd %xmm0, %xmm2
-; X86-SSE3-WIN-NEXT: movsd %xmm2, (%esp)
+; X86-SSE3-WIN-NEXT: jbe LBB2_2
+; X86-SSE3-WIN-NEXT: # %bb.1:
+; X86-SSE3-WIN-NEXT: xorpd %xmm1, %xmm1
+; X86-SSE3-WIN-NEXT: LBB2_2:
+; X86-SSE3-WIN-NEXT: subsd %xmm1, %xmm0
+; X86-SSE3-WIN-NEXT: movsd %xmm0, (%esp)
; X86-SSE3-WIN-NEXT: fldl (%esp)
; X86-SSE3-WIN-NEXT: fisttpll (%esp)
-; X86-SSE3-WIN-NEXT: setbe %dl
+; X86-SSE3-WIN-NEXT: setbe %al
+; X86-SSE3-WIN-NEXT: movzbl %al, %edx
; X86-SSE3-WIN-NEXT: shll $31, %edx
; X86-SSE3-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-SSE3-WIN-NEXT: movl (%esp), %eax
@@ -563,16 +560,17 @@ define i64 @d_to_u64(double %a) nounwind {
; X86-SSE3-LIN-NEXT: subl $12, %esp
; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE3-LIN-NEXT: movapd %xmm0, %xmm2
-; X86-SSE3-LIN-NEXT: xorl %edx, %edx
; X86-SSE3-LIN-NEXT: ucomisd %xmm0, %xmm1
-; X86-SSE3-LIN-NEXT: cmpltsd %xmm1, %xmm0
-; X86-SSE3-LIN-NEXT: andnpd %xmm1, %xmm0
-; X86-SSE3-LIN-NEXT: subsd %xmm0, %xmm2
-; X86-SSE3-LIN-NEXT: movsd %xmm2, (%esp)
+; X86-SSE3-LIN-NEXT: jbe .LBB2_2
+; X86-SSE3-LIN-NEXT: # %bb.1:
+; X86-SSE3-LIN-NEXT: xorpd %xmm1, %xmm1
+; X86-SSE3-LIN-NEXT: .LBB2_2:
+; X86-SSE3-LIN-NEXT: subsd %xmm1, %xmm0
+; X86-SSE3-LIN-NEXT: movsd %xmm0, (%esp)
; X86-SSE3-LIN-NEXT: fldl (%esp)
; X86-SSE3-LIN-NEXT: fisttpll (%esp)
-; X86-SSE3-LIN-NEXT: setbe %dl
+; X86-SSE3-LIN-NEXT: setbe %al
+; X86-SSE3-LIN-NEXT: movzbl %al, %edx
; X86-SSE3-LIN-NEXT: shll $31, %edx
; X86-SSE3-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-SSE3-LIN-NEXT: movl (%esp), %eax
@@ -600,23 +598,23 @@ define i64 @d_to_u64(double %a) nounwind {
; X86-SSE2-WIN-NEXT: subl $16, %esp
; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE2-WIN-NEXT: movapd %xmm0, %xmm2
-; X86-SSE2-WIN-NEXT: cmpltsd %xmm1, %xmm2
-; X86-SSE2-WIN-NEXT: andnpd %xmm1, %xmm2
-; X86-SSE2-WIN-NEXT: movapd %xmm0, %xmm3
-; X86-SSE2-WIN-NEXT: subsd %xmm2, %xmm3
-; X86-SSE2-WIN-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT: ucomisd %xmm0, %xmm1
+; X86-SSE2-WIN-NEXT: jbe LBB2_2
+; X86-SSE2-WIN-NEXT: # %bb.1:
+; X86-SSE2-WIN-NEXT: xorpd %xmm1, %xmm1
+; X86-SSE2-WIN-NEXT: LBB2_2:
+; X86-SSE2-WIN-NEXT: subsd %xmm1, %xmm0
+; X86-SSE2-WIN-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT: setbe %al
; X86-SSE2-WIN-NEXT: fldl {{[0-9]+}}(%esp)
; X86-SSE2-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
-; X86-SSE2-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-WIN-NEXT: orl $3072, %eax # imm = 0xC00
-; X86-SSE2-WIN-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-WIN-NEXT: orl $3072, %ecx # imm = 0xC00
+; X86-SSE2-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
; X86-SSE2-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-SSE2-WIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X86-SSE2-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X86-SSE2-WIN-NEXT: xorl %edx, %edx
-; X86-SSE2-WIN-NEXT: ucomisd %xmm0, %xmm1
-; X86-SSE2-WIN-NEXT: setbe %dl
+; X86-SSE2-WIN-NEXT: movzbl %al, %edx
; X86-SSE2-WIN-NEXT: shll $31, %edx
; X86-SSE2-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-SSE2-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -629,23 +627,23 @@ define i64 @d_to_u64(double %a) nounwind {
; X86-SSE2-LIN-NEXT: subl $20, %esp
; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE2-LIN-NEXT: movapd %xmm0, %xmm2
-; X86-SSE2-LIN-NEXT: cmpltsd %xmm1, %xmm2
-; X86-SSE2-LIN-NEXT: andnpd %xmm1, %xmm2
-; X86-SSE2-LIN-NEXT: movapd %xmm0, %xmm3
-; X86-SSE2-LIN-NEXT: subsd %xmm2, %xmm3
-; X86-SSE2-LIN-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT: ucomisd %xmm0, %xmm1
+; X86-SSE2-LIN-NEXT: jbe .LBB2_2
+; X86-SSE2-LIN-NEXT: # %bb.1:
+; X86-SSE2-LIN-NEXT: xorpd %xmm1, %xmm1
+; X86-SSE2-LIN-NEXT: .LBB2_2:
+; X86-SSE2-LIN-NEXT: subsd %xmm1, %xmm0
+; X86-SSE2-LIN-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT: setbe %al
; X86-SSE2-LIN-NEXT: fldl {{[0-9]+}}(%esp)
; X86-SSE2-LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
-; X86-SSE2-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-LIN-NEXT: orl $3072, %eax # imm = 0xC00
-; X86-SSE2-LIN-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-LIN-NEXT: orl $3072, %ecx # imm = 0xC00
+; X86-SSE2-LIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
; X86-SSE2-LIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-SSE2-LIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X86-SSE2-LIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X86-SSE2-LIN-NEXT: xorl %edx, %edx
-; X86-SSE2-LIN-NEXT: ucomisd %xmm0, %xmm1
-; X86-SSE2-LIN-NEXT: setbe %dl
+; X86-SSE2-LIN-NEXT: movzbl %al, %edx
; X86-SSE2-LIN-NEXT: shll $31, %edx
; X86-SSE2-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-SSE2-LIN-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -667,13 +665,12 @@ define i64 @d_to_u64(double %a) nounwind {
; X87-WIN-NEXT: sahf
; X87-WIN-NEXT: setbe %al
; X87-WIN-NEXT: fldz
-; X87-WIN-NEXT: ja LBB2_2
+; X87-WIN-NEXT: jbe LBB2_2
; X87-WIN-NEXT: # %bb.1:
-; X87-WIN-NEXT: fstp %st(0)
+; X87-WIN-NEXT: fstp %st(1)
; X87-WIN-NEXT: fldz
-; X87-WIN-NEXT: fxch %st(1)
; X87-WIN-NEXT: LBB2_2:
-; X87-WIN-NEXT: fstp %st(1)
+; X87-WIN-NEXT: fstp %st(0)
; X87-WIN-NEXT: fsubrp %st, %st(1)
; X87-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
@@ -702,13 +699,12 @@ define i64 @d_to_u64(double %a) nounwind {
; X87-LIN-NEXT: sahf
; X87-LIN-NEXT: setbe %al
; X87-LIN-NEXT: fldz
-; X87-LIN-NEXT: ja .LBB2_2
+; X87-LIN-NEXT: jbe .LBB2_2
; X87-LIN-NEXT: # %bb.1:
-; X87-LIN-NEXT: fstp %st(0)
+; X87-LIN-NEXT: fstp %st(1)
; X87-LIN-NEXT: fldz
-; X87-LIN-NEXT: fxch %st(1)
; X87-LIN-NEXT: .LBB2_2:
-; X87-LIN-NEXT: fstp %st(1)
+; X87-LIN-NEXT: fstp %st(0)
; X87-LIN-NEXT: fsubrp %st, %st(1)
; X87-LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
@@ -902,8 +898,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X86-AVX512-WIN-NEXT: xorl %edx, %edx
; X86-AVX512-WIN-NEXT: fucomi %st(1), %st
; X86-AVX512-WIN-NEXT: fldz
-; X86-AVX512-WIN-NEXT: fxch %st(1)
-; X86-AVX512-WIN-NEXT: fcmovnbe %st(1), %st
+; X86-AVX512-WIN-NEXT: fcmovbe %st(1), %st
; X86-AVX512-WIN-NEXT: fstp %st(1)
; X86-AVX512-WIN-NEXT: fsubrp %st, %st(1)
; X86-AVX512-WIN-NEXT: fisttpll (%esp)
@@ -923,8 +918,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X86-AVX512-LIN-NEXT: xorl %edx, %edx
; X86-AVX512-LIN-NEXT: fucomi %st(1), %st
; X86-AVX512-LIN-NEXT: fldz
-; X86-AVX512-LIN-NEXT: fxch %st(1)
-; X86-AVX512-LIN-NEXT: fcmovnbe %st(1), %st
+; X86-AVX512-LIN-NEXT: fcmovbe %st(1), %st
; X86-AVX512-LIN-NEXT: fstp %st(1)
; X86-AVX512-LIN-NEXT: fsubrp %st, %st(1)
; X86-AVX512-LIN-NEXT: fisttpll (%esp)
@@ -943,8 +937,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X64-AVX512-WIN-NEXT: xorl %eax, %eax
; X64-AVX512-WIN-NEXT: fucomi %st(1), %st
; X64-AVX512-WIN-NEXT: fldz
-; X64-AVX512-WIN-NEXT: fxch %st(1)
-; X64-AVX512-WIN-NEXT: fcmovnbe %st(1), %st
+; X64-AVX512-WIN-NEXT: fcmovbe %st(1), %st
; X64-AVX512-WIN-NEXT: fstp %st(1)
; X64-AVX512-WIN-NEXT: fsubrp %st, %st(1)
; X64-AVX512-WIN-NEXT: fisttpll (%rsp)
@@ -961,8 +954,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X64-AVX512-LIN-NEXT: xorl %eax, %eax
; X64-AVX512-LIN-NEXT: fucomi %st(1), %st
; X64-AVX512-LIN-NEXT: fldz
-; X64-AVX512-LIN-NEXT: fxch %st(1)
-; X64-AVX512-LIN-NEXT: fcmovnbe %st(1), %st
+; X64-AVX512-LIN-NEXT: fcmovbe %st(1), %st
; X64-AVX512-LIN-NEXT: fstp %st(1)
; X64-AVX512-LIN-NEXT: fsubrp %st, %st(1)
; X64-AVX512-LIN-NEXT: fisttpll -{{[0-9]+}}(%rsp)
@@ -982,8 +974,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X86-SSE3-WIN-NEXT: xorl %edx, %edx
; X86-SSE3-WIN-NEXT: fucomi %st(1), %st
; X86-SSE3-WIN-NEXT: fldz
-; X86-SSE3-WIN-NEXT: fxch %st(1)
-; X86-SSE3-WIN-NEXT: fcmovnbe %st(1), %st
+; X86-SSE3-WIN-NEXT: fcmovbe %st(1), %st
; X86-SSE3-WIN-NEXT: fstp %st(1)
; X86-SSE3-WIN-NEXT: fsubrp %st, %st(1)
; X86-SSE3-WIN-NEXT: fisttpll (%esp)
@@ -1003,8 +994,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X86-SSE3-LIN-NEXT: xorl %edx, %edx
; X86-SSE3-LIN-NEXT: fucomi %st(1), %st
; X86-SSE3-LIN-NEXT: fldz
-; X86-SSE3-LIN-NEXT: fxch %st(1)
-; X86-SSE3-LIN-NEXT: fcmovnbe %st(1), %st
+; X86-SSE3-LIN-NEXT: fcmovbe %st(1), %st
; X86-SSE3-LIN-NEXT: fstp %st(1)
; X86-SSE3-LIN-NEXT: fsubrp %st, %st(1)
; X86-SSE3-LIN-NEXT: fisttpll (%esp)
@@ -1023,8 +1013,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X64-SSE3-WIN-NEXT: xorl %eax, %eax
; X64-SSE3-WIN-NEXT: fucomi %st(1), %st
; X64-SSE3-WIN-NEXT: fldz
-; X64-SSE3-WIN-NEXT: fxch %st(1)
-; X64-SSE3-WIN-NEXT: fcmovnbe %st(1), %st
+; X64-SSE3-WIN-NEXT: fcmovbe %st(1), %st
; X64-SSE3-WIN-NEXT: fstp %st(1)
; X64-SSE3-WIN-NEXT: fsubrp %st, %st(1)
; X64-SSE3-WIN-NEXT: fisttpll (%rsp)
@@ -1041,8 +1030,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X64-SSE3-LIN-NEXT: xorl %eax, %eax
; X64-SSE3-LIN-NEXT: fucomi %st(1), %st
; X64-SSE3-LIN-NEXT: fldz
-; X64-SSE3-LIN-NEXT: fxch %st(1)
-; X64-SSE3-LIN-NEXT: fcmovnbe %st(1), %st
+; X64-SSE3-LIN-NEXT: fcmovbe %st(1), %st
; X64-SSE3-LIN-NEXT: fstp %st(1)
; X64-SSE3-LIN-NEXT: fsubrp %st, %st(1)
; X64-SSE3-LIN-NEXT: fisttpll -{{[0-9]+}}(%rsp)
@@ -1063,8 +1051,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X86-SSE2-WIN-NEXT: fucomi %st(1), %st
; X86-SSE2-WIN-NEXT: setbe %dl
; X86-SSE2-WIN-NEXT: fldz
-; X86-SSE2-WIN-NEXT: fxch %st(1)
-; X86-SSE2-WIN-NEXT: fcmovnbe %st(1), %st
+; X86-SSE2-WIN-NEXT: fcmovbe %st(1), %st
; X86-SSE2-WIN-NEXT: fstp %st(1)
; X86-SSE2-WIN-NEXT: fsubrp %st, %st(1)
; X86-SSE2-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
@@ -1090,8 +1077,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X86-SSE2-LIN-NEXT: fucomi %st(1), %st
; X86-SSE2-LIN-NEXT: setbe %dl
; X86-SSE2-LIN-NEXT: fldz
-; X86-SSE2-LIN-NEXT: fxch %st(1)
-; X86-SSE2-LIN-NEXT: fcmovnbe %st(1), %st
+; X86-SSE2-LIN-NEXT: fcmovbe %st(1), %st
; X86-SSE2-LIN-NEXT: fstp %st(1)
; X86-SSE2-LIN-NEXT: fsubrp %st, %st(1)
; X86-SSE2-LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
@@ -1116,8 +1102,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X64-SSE2-WIN-NEXT: fucomi %st(1), %st
; X64-SSE2-WIN-NEXT: setbe %al
; X64-SSE2-WIN-NEXT: fldz
-; X64-SSE2-WIN-NEXT: fxch %st(1)
-; X64-SSE2-WIN-NEXT: fcmovnbe %st(1), %st
+; X64-SSE2-WIN-NEXT: fcmovbe %st(1), %st
; X64-SSE2-WIN-NEXT: fstp %st(1)
; X64-SSE2-WIN-NEXT: fsubrp %st, %st(1)
; X64-SSE2-WIN-NEXT: fnstcw {{[0-9]+}}(%rsp)
@@ -1140,8 +1125,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X64-SSE2-LIN-NEXT: fucomi %st(1), %st
; X64-SSE2-LIN-NEXT: setbe %al
; X64-SSE2-LIN-NEXT: fldz
-; X64-SSE2-LIN-NEXT: fxch %st(1)
-; X64-SSE2-LIN-NEXT: fcmovnbe %st(1), %st
+; X64-SSE2-LIN-NEXT: fcmovbe %st(1), %st
; X64-SSE2-LIN-NEXT: fstp %st(1)
; X64-SSE2-LIN-NEXT: fsubrp %st, %st(1)
; X64-SSE2-LIN-NEXT: fnstcw -{{[0-9]+}}(%rsp)
@@ -1170,13 +1154,12 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X87-WIN-NEXT: sahf
; X87-WIN-NEXT: setbe %al
; X87-WIN-NEXT: fldz
-; X87-WIN-NEXT: ja LBB4_2
+; X87-WIN-NEXT: jbe LBB4_2
; X87-WIN-NEXT: # %bb.1:
-; X87-WIN-NEXT: fstp %st(0)
+; X87-WIN-NEXT: fstp %st(1)
; X87-WIN-NEXT: fldz
-; X87-WIN-NEXT: fxch %st(1)
; X87-WIN-NEXT: LBB4_2:
-; X87-WIN-NEXT: fstp %st(1)
+; X87-WIN-NEXT: fstp %st(0)
; X87-WIN-NEXT: fsubrp %st, %st(1)
; X87-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
@@ -1205,13 +1188,12 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
; X87-LIN-NEXT: sahf
; X87-LIN-NEXT: setbe %al
; X87-LIN-NEXT: fldz
-; X87-LIN-NEXT: ja .LBB4_2
+; X87-LIN-NEXT: jbe .LBB4_2
; X87-LIN-NEXT: # %bb.1:
-; X87-LIN-NEXT: fstp %st(0)
+; X87-LIN-NEXT: fstp %st(1)
; X87-LIN-NEXT: fldz
-; X87-LIN-NEXT: fxch %st(1)
; X87-LIN-NEXT: .LBB4_2:
-; X87-LIN-NEXT: fstp %st(1)
+; X87-LIN-NEXT: fstp %st(0)
; X87-LIN-NEXT: fsubrp %st, %st(1)
; X87-LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index 93695371091b..4569e69b7e50 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -222,17 +222,16 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-32-NEXT: comisd %xmm2, %xmm0
-; SSE-32-NEXT: xorpd %xmm1, %xmm1
-; SSE-32-NEXT: xorpd %xmm3, %xmm3
-; SSE-32-NEXT: jb .LBB1_2
+; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-32-NEXT: comisd %xmm1, %xmm0
+; SSE-32-NEXT: movapd %xmm1, %xmm2
+; SSE-32-NEXT: jae .LBB1_2
; SSE-32-NEXT: # %bb.1:
-; SSE-32-NEXT: movapd %xmm2, %xmm3
+; SSE-32-NEXT: xorpd %xmm2, %xmm2
; SSE-32-NEXT: .LBB1_2:
-; SSE-32-NEXT: movapd %xmm0, %xmm4
-; SSE-32-NEXT: subsd %xmm3, %xmm4
-; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT: movapd %xmm0, %xmm3
+; SSE-32-NEXT: subsd %xmm2, %xmm3
+; SSE-32-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: fldl {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
@@ -244,10 +243,10 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-32-NEXT: comisd %xmm2, %xmm0
-; SSE-32-NEXT: jb .LBB1_4
+; SSE-32-NEXT: comisd %xmm1, %xmm0
+; SSE-32-NEXT: jae .LBB1_4
; SSE-32-NEXT: # %bb.3:
-; SSE-32-NEXT: movapd %xmm2, %xmm1
+; SSE-32-NEXT: xorpd %xmm1, %xmm1
; SSE-32-NEXT: .LBB1_4:
; SSE-32-NEXT: subsd %xmm1, %xmm0
; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
@@ -323,17 +322,16 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $16, %esp
-; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vcomisd %xmm1, %xmm3
-; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB1_2
+; AVX-32-NEXT: vcomisd %xmm1, %xmm2
+; AVX-32-NEXT: vmovapd %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB1_2
; AVX-32-NEXT: # %bb.1:
-; AVX-32-NEXT: vmovapd %xmm1, %xmm4
+; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB1_2:
-; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovsd %xmm3, (%esp)
+; AVX-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovsd %xmm2, (%esp)
; AVX-32-NEXT: fldl (%esp)
; AVX-32-NEXT: fisttpll (%esp)
; AVX-32-NEXT: wait
@@ -342,11 +340,11 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomisd %xmm1, %xmm0
-; AVX-32-NEXT: jb .LBB1_4
+; AVX-32-NEXT: jae .LBB1_4
; AVX-32-NEXT: # %bb.3:
-; AVX-32-NEXT: vmovapd %xmm1, %xmm2
+; AVX-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: .LBB1_4:
-; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
@@ -410,30 +408,25 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512F-32-NEXT: xorl %eax, %eax
; AVX512F-32-NEXT: vcomisd %xmm2, %xmm1
-; AVX512F-32-NEXT: setb %cl
-; AVX512F-32-NEXT: kmovw %ecx, %k1
-; AVX512F-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovapd %xmm2, %xmm4
-; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT: vsubsd %xmm4, %xmm1, %xmm1
-; AVX512F-32-NEXT: vmovsd %xmm1, (%esp)
-; AVX512F-32-NEXT: fldl (%esp)
-; AVX512F-32-NEXT: fisttpll (%esp)
-; AVX512F-32-NEXT: wait
; AVX512F-32-NEXT: setae %al
-; AVX512F-32-NEXT: shll $31, %eax
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vmovsd %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT: vsubsd %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT: vmovsd %xmm1, (%esp)
; AVX512F-32-NEXT: xorl %ecx, %ecx
; AVX512F-32-NEXT: vcomisd %xmm2, %xmm0
-; AVX512F-32-NEXT: setb %dl
-; AVX512F-32-NEXT: kmovw %edx, %k1
-; AVX512F-32-NEXT: vmovsd %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512F-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; AVX512F-32-NEXT: setae %cl
+; AVX512F-32-NEXT: kmovw %ecx, %k1
+; AVX512F-32-NEXT: vmovsd %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512F-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX512F-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fldl (%esp)
+; AVX512F-32-NEXT: fisttpll (%esp)
; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: wait
-; AVX512F-32-NEXT: setae %cl
+; AVX512F-32-NEXT: shll $31, %eax
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: shll $31, %ecx
; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -468,30 +461,25 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm1
-; AVX512VL-32-NEXT: setb %cl
-; AVX512VL-32-NEXT: kmovw %ecx, %k1
-; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovapd %xmm2, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm1, %xmm1
-; AVX512VL-32-NEXT: vmovsd %xmm1, (%esp)
-; AVX512VL-32-NEXT: fldl (%esp)
-; AVX512VL-32-NEXT: fisttpll (%esp)
-; AVX512VL-32-NEXT: wait
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT: kmovw %eax, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vmovsd %xmm1, (%esp)
; AVX512VL-32-NEXT: xorl %ecx, %ecx
; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm0
-; AVX512VL-32-NEXT: setb %dl
-; AVX512VL-32-NEXT: kmovw %edx, %k1
-; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: kmovw %ecx, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl (%esp)
+; AVX512VL-32-NEXT: fisttpll (%esp)
; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: shll $31, %eax
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: shll $31, %ecx
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -905,17 +893,16 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-32-NEXT: comiss %xmm2, %xmm0
-; SSE-32-NEXT: xorps %xmm1, %xmm1
-; SSE-32-NEXT: xorps %xmm3, %xmm3
-; SSE-32-NEXT: jb .LBB4_2
+; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT: comiss %xmm1, %xmm0
+; SSE-32-NEXT: movaps %xmm1, %xmm2
+; SSE-32-NEXT: jae .LBB4_2
; SSE-32-NEXT: # %bb.1:
-; SSE-32-NEXT: movaps %xmm2, %xmm3
+; SSE-32-NEXT: xorps %xmm2, %xmm2
; SSE-32-NEXT: .LBB4_2:
-; SSE-32-NEXT: movaps %xmm0, %xmm4
-; SSE-32-NEXT: subss %xmm3, %xmm4
-; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT: movaps %xmm0, %xmm3
+; SSE-32-NEXT: subss %xmm2, %xmm3
+; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: flds {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
@@ -927,10 +914,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-32-NEXT: comiss %xmm2, %xmm0
-; SSE-32-NEXT: jb .LBB4_4
+; SSE-32-NEXT: comiss %xmm1, %xmm0
+; SSE-32-NEXT: jae .LBB4_4
; SSE-32-NEXT: # %bb.3:
-; SSE-32-NEXT: movaps %xmm2, %xmm1
+; SSE-32-NEXT: xorps %xmm1, %xmm1
; SSE-32-NEXT: .LBB4_4:
; SSE-32-NEXT: subss %xmm1, %xmm0
; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
@@ -1006,17 +993,16 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $16, %esp
-; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB4_2
+; AVX-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX-32-NEXT: vmovaps %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB4_2
; AVX-32-NEXT: # %bb.1:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm4
+; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB4_2:
-; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
@@ -1025,11 +1011,11 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomiss %xmm1, %xmm0
-; AVX-32-NEXT: jb .LBB4_4
+; AVX-32-NEXT: jae .LBB4_4
; AVX-32-NEXT: # %bb.3:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm2
+; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: .LBB4_4:
-; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: vmovss %xmm0, (%esp)
; AVX-32-NEXT: flds (%esp)
; AVX-32-NEXT: fisttpll (%esp)
@@ -1093,30 +1079,25 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: xorl %eax, %eax
; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1
-; AVX512F-32-NEXT: setb %cl
-; AVX512F-32-NEXT: kmovw %ecx, %k1
-; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovaps %xmm2, %xmm4
-; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT: vsubss %xmm4, %xmm1, %xmm1
-; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: wait
; AVX512F-32-NEXT: setae %al
-; AVX512F-32-NEXT: shll $31, %eax
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: xorl %ecx, %ecx
; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0
-; AVX512F-32-NEXT: setb %dl
-; AVX512F-32-NEXT: kmovw %edx, %k1
-; AVX512F-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512F-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX512F-32-NEXT: setae %cl
+; AVX512F-32-NEXT: kmovw %ecx, %k1
+; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512F-32-NEXT: vmovss %xmm0, (%esp)
+; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: flds (%esp)
; AVX512F-32-NEXT: fisttpll (%esp)
; AVX512F-32-NEXT: wait
-; AVX512F-32-NEXT: setae %cl
+; AVX512F-32-NEXT: shll $31, %eax
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: shll $31, %ecx
; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1151,30 +1132,25 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1
-; AVX512VL-32-NEXT: setb %cl
-; AVX512VL-32-NEXT: kmovw %ecx, %k1
-; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm1
-; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT: kmovw %eax, %k1
+; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: xorl %ecx, %ecx
; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0
-; AVX512VL-32-NEXT: setb %dl
-; AVX512VL-32-NEXT: kmovw %edx, %k1
-; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: kmovw %ecx, %k1
+; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512VL-32-NEXT: vmovss %xmm0, (%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: flds (%esp)
; AVX512VL-32-NEXT: fisttpll (%esp)
; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: shll $31, %eax
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: shll $31, %ecx
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1225,17 +1201,16 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
; SSE-32-NEXT: subl $24, %esp
; SSE-32-NEXT: movl 8(%ebp), %eax
; SSE-32-NEXT: movaps (%eax), %xmm0
-; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-32-NEXT: comiss %xmm2, %xmm0
-; SSE-32-NEXT: xorps %xmm1, %xmm1
-; SSE-32-NEXT: xorps %xmm3, %xmm3
-; SSE-32-NEXT: jb .LBB5_2
+; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT: comiss %xmm1, %xmm0
+; SSE-32-NEXT: movaps %xmm1, %xmm2
+; SSE-32-NEXT: jae .LBB5_2
; SSE-32-NEXT: # %bb.1:
-; SSE-32-NEXT: movaps %xmm2, %xmm3
+; SSE-32-NEXT: xorps %xmm2, %xmm2
; SSE-32-NEXT: .LBB5_2:
-; SSE-32-NEXT: movaps %xmm0, %xmm4
-; SSE-32-NEXT: subss %xmm3, %xmm4
-; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT: movaps %xmm0, %xmm3
+; SSE-32-NEXT: subss %xmm2, %xmm3
+; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: flds {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
@@ -1247,10 +1222,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-32-NEXT: comiss %xmm2, %xmm0
-; SSE-32-NEXT: jb .LBB5_4
+; SSE-32-NEXT: comiss %xmm1, %xmm0
+; SSE-32-NEXT: jae .LBB5_4
; SSE-32-NEXT: # %bb.3:
-; SSE-32-NEXT: movaps %xmm2, %xmm1
+; SSE-32-NEXT: xorps %xmm1, %xmm1
; SSE-32-NEXT: .LBB5_4:
; SSE-32-NEXT: subss %xmm1, %xmm0
; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
@@ -1328,17 +1303,16 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
; AVX-32-NEXT: subl $16, %esp
; AVX-32-NEXT: movl 8(%ebp), %eax
; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB5_2
+; AVX-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX-32-NEXT: vmovaps %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB5_2
; AVX-32-NEXT: # %bb.1:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm4
+; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB5_2:
-; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
@@ -1347,11 +1321,11 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomiss %xmm1, %xmm0
-; AVX-32-NEXT: jb .LBB5_4
+; AVX-32-NEXT: jae .LBB5_4
; AVX-32-NEXT: # %bb.3:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm2
+; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: .LBB5_4:
-; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: vmovss %xmm0, (%esp)
; AVX-32-NEXT: flds (%esp)
; AVX-32-NEXT: fisttpll (%esp)
@@ -1418,30 +1392,25 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: xorl %eax, %eax
; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1
-; AVX512F-32-NEXT: setb %cl
-; AVX512F-32-NEXT: kmovw %ecx, %k1
-; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovaps %xmm2, %xmm4
-; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT: vsubss %xmm4, %xmm1, %xmm1
-; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: wait
; AVX512F-32-NEXT: setae %al
-; AVX512F-32-NEXT: shll $31, %eax
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: xorl %ecx, %ecx
; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0
-; AVX512F-32-NEXT: setb %dl
-; AVX512F-32-NEXT: kmovw %edx, %k1
-; AVX512F-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512F-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX512F-32-NEXT: setae %cl
+; AVX512F-32-NEXT: kmovw %ecx, %k1
+; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512F-32-NEXT: vmovss %xmm0, (%esp)
+; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: flds (%esp)
; AVX512F-32-NEXT: fisttpll (%esp)
; AVX512F-32-NEXT: wait
-; AVX512F-32-NEXT: setae %cl
+; AVX512F-32-NEXT: shll $31, %eax
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: shll $31, %ecx
; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1477,30 +1446,25 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1
-; AVX512VL-32-NEXT: setb %cl
-; AVX512VL-32-NEXT: kmovw %ecx, %k1
-; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm1
-; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT: kmovw %eax, %k1
+; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: xorl %ecx, %ecx
; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0
-; AVX512VL-32-NEXT: setb %dl
-; AVX512VL-32-NEXT: kmovw %edx, %k1
-; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: kmovw %ecx, %k1
+; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512VL-32-NEXT: vmovss %xmm0, (%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: flds (%esp)
; AVX512VL-32-NEXT: fisttpll (%esp)
; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: shll $31, %eax
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: shll $31, %ecx
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -2416,17 +2380,16 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-32-NEXT: comisd %xmm2, %xmm0
-; SSE-32-NEXT: xorpd %xmm1, %xmm1
-; SSE-32-NEXT: xorpd %xmm3, %xmm3
-; SSE-32-NEXT: jb .LBB19_2
+; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-32-NEXT: comisd %xmm1, %xmm0
+; SSE-32-NEXT: movapd %xmm1, %xmm2
+; SSE-32-NEXT: jae .LBB19_2
; SSE-32-NEXT: # %bb.1:
-; SSE-32-NEXT: movapd %xmm2, %xmm3
+; SSE-32-NEXT: xorpd %xmm2, %xmm2
; SSE-32-NEXT: .LBB19_2:
-; SSE-32-NEXT: movapd %xmm0, %xmm4
-; SSE-32-NEXT: subsd %xmm3, %xmm4
-; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT: movapd %xmm0, %xmm3
+; SSE-32-NEXT: subsd %xmm2, %xmm3
+; SSE-32-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: fldl {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
@@ -2438,10 +2401,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-32-NEXT: comisd %xmm2, %xmm0
-; SSE-32-NEXT: jb .LBB19_4
+; SSE-32-NEXT: comisd %xmm1, %xmm0
+; SSE-32-NEXT: jae .LBB19_4
; SSE-32-NEXT: # %bb.3:
-; SSE-32-NEXT: movapd %xmm2, %xmm1
+; SSE-32-NEXT: xorpd %xmm1, %xmm1
; SSE-32-NEXT: .LBB19_4:
; SSE-32-NEXT: subsd %xmm1, %xmm0
; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
@@ -2517,17 +2480,16 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $16, %esp
-; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vcomisd %xmm1, %xmm3
-; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB19_2
+; AVX-32-NEXT: vcomisd %xmm1, %xmm2
+; AVX-32-NEXT: vmovapd %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB19_2
; AVX-32-NEXT: # %bb.1:
-; AVX-32-NEXT: vmovapd %xmm1, %xmm4
+; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB19_2:
-; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovsd %xmm3, (%esp)
+; AVX-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovsd %xmm2, (%esp)
; AVX-32-NEXT: fldl (%esp)
; AVX-32-NEXT: fisttpll (%esp)
; AVX-32-NEXT: wait
@@ -2536,11 +2498,11 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomisd %xmm1, %xmm0
-; AVX-32-NEXT: jb .LBB19_4
+; AVX-32-NEXT: jae .LBB19_4
; AVX-32-NEXT: # %bb.3:
-; AVX-32-NEXT: vmovapd %xmm1, %xmm2
+; AVX-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: .LBB19_4:
-; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
@@ -2792,17 +2754,16 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-32-NEXT: comiss %xmm2, %xmm0
-; SSE-32-NEXT: xorps %xmm1, %xmm1
-; SSE-32-NEXT: xorps %xmm3, %xmm3
-; SSE-32-NEXT: jb .LBB21_2
+; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT: comiss %xmm1, %xmm0
+; SSE-32-NEXT: movaps %xmm1, %xmm2
+; SSE-32-NEXT: jae .LBB21_2
; SSE-32-NEXT: # %bb.1:
-; SSE-32-NEXT: movaps %xmm2, %xmm3
+; SSE-32-NEXT: xorps %xmm2, %xmm2
; SSE-32-NEXT: .LBB21_2:
-; SSE-32-NEXT: movaps %xmm0, %xmm4
-; SSE-32-NEXT: subss %xmm3, %xmm4
-; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT: movaps %xmm0, %xmm3
+; SSE-32-NEXT: subss %xmm2, %xmm3
+; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: flds {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
@@ -2814,10 +2775,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-32-NEXT: comiss %xmm2, %xmm0
-; SSE-32-NEXT: jb .LBB21_4
+; SSE-32-NEXT: comiss %xmm1, %xmm0
+; SSE-32-NEXT: jae .LBB21_4
; SSE-32-NEXT: # %bb.3:
-; SSE-32-NEXT: movaps %xmm2, %xmm1
+; SSE-32-NEXT: xorps %xmm1, %xmm1
; SSE-32-NEXT: .LBB21_4:
; SSE-32-NEXT: subss %xmm1, %xmm0
; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
@@ -2893,17 +2854,16 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $16, %esp
-; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB21_2
+; AVX-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX-32-NEXT: vmovaps %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB21_2
; AVX-32-NEXT: # %bb.1:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm4
+; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB21_2:
-; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
@@ -2912,11 +2872,11 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomiss %xmm1, %xmm0
-; AVX-32-NEXT: jb .LBB21_4
+; AVX-32-NEXT: jae .LBB21_4
; AVX-32-NEXT: # %bb.3:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm2
+; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: .LBB21_4:
-; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: vmovss %xmm0, (%esp)
; AVX-32-NEXT: flds (%esp)
; AVX-32-NEXT: fisttpll (%esp)
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index d4d285c36485..bcc14ec38e27 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -226,17 +226,16 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $32, %esp
-; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vcomisd %xmm1, %xmm3
-; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB1_2
+; AVX-32-NEXT: vcomisd %xmm1, %xmm2
+; AVX-32-NEXT: vmovapd %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB1_2
; AVX-32-NEXT: # %bb.1:
-; AVX-32-NEXT: vmovapd %xmm1, %xmm4
+; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB1_2:
-; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
@@ -244,16 +243,16 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX-32-NEXT: movzbl %al, %eax
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX-32-NEXT: vcomisd %xmm1, %xmm4
-; AVX-32-NEXT: vxorpd %xmm5, %xmm5, %xmm5
-; AVX-32-NEXT: jb .LBB1_4
+; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX-32-NEXT: vcomisd %xmm1, %xmm3
+; AVX-32-NEXT: vmovapd %xmm1, %xmm4
+; AVX-32-NEXT: jae .LBB1_4
; AVX-32-NEXT: # %bb.3:
-; AVX-32-NEXT: vmovapd %xmm1, %xmm5
+; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; AVX-32-NEXT: .LBB1_4:
-; AVX-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4
-; AVX-32-NEXT: vmovsd %xmm4, (%esp)
+; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
+; AVX-32-NEXT: vmovsd %xmm3, (%esp)
; AVX-32-NEXT: fldl (%esp)
; AVX-32-NEXT: fisttpll (%esp)
; AVX-32-NEXT: wait
@@ -261,14 +260,14 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX-32-NEXT: movzbl %cl, %ecx
; AVX-32-NEXT: shll $31, %ecx
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vcomisd %xmm1, %xmm3
-; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB1_6
+; AVX-32-NEXT: vcomisd %xmm1, %xmm2
+; AVX-32-NEXT: vmovapd %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB1_6
; AVX-32-NEXT: # %bb.5:
-; AVX-32-NEXT: vmovapd %xmm1, %xmm4
+; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB1_6:
-; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
@@ -277,11 +276,11 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX-32-NEXT: shll $31, %edx
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX-32-NEXT: vcomisd %xmm1, %xmm0
-; AVX-32-NEXT: jb .LBB1_8
+; AVX-32-NEXT: jae .LBB1_8
; AVX-32-NEXT: # %bb.7:
-; AVX-32-NEXT: vmovapd %xmm1, %xmm2
+; AVX-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: .LBB1_8:
-; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
@@ -375,83 +374,68 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512F-32-NEXT: movl %esp, %ebp
; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
; AVX512F-32-NEXT: pushl %ebx
-; AVX512F-32-NEXT: pushl %esi
; AVX512F-32-NEXT: andl $-8, %esp
-; AVX512F-32-NEXT: subl $32, %esp
-; AVX512F-32-NEXT: .cfi_offset %esi, -16
+; AVX512F-32-NEXT: subl $40, %esp
; AVX512F-32-NEXT: .cfi_offset %ebx, -12
-; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2
-; AVX512F-32-NEXT: setb %cl
-; AVX512F-32-NEXT: kmovw %ecx, %k1
-; AVX512F-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovapd %xmm1, %xmm4
-; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: wait
-; AVX512F-32-NEXT: movl $0, %eax
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512F-32-NEXT: xorl %eax, %eax
+; AVX512F-32-NEXT: vcomisd %xmm3, %xmm2
; AVX512F-32-NEXT: setae %al
-; AVX512F-32-NEXT: shll $31, %eax
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: movl %eax, %esi
-; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX512F-32-NEXT: xorl %ecx, %ecx
-; AVX512F-32-NEXT: vcomisd %xmm1, %xmm4
-; AVX512F-32-NEXT: setb %dl
-; AVX512F-32-NEXT: kmovw %edx, %k1
-; AVX512F-32-NEXT: vmovapd %xmm1, %xmm5
-; AVX512F-32-NEXT: vmovsd %xmm3, %xmm5, %xmm5 {%k1}
-; AVX512F-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4
-; AVX512F-32-NEXT: vmovsd %xmm4, (%esp)
-; AVX512F-32-NEXT: fldl (%esp)
-; AVX512F-32-NEXT: fisttpll (%esp)
-; AVX512F-32-NEXT: wait
-; AVX512F-32-NEXT: setae %cl
-; AVX512F-32-NEXT: shll $31, %ecx
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: xorl %edx, %edx
-; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2
-; AVX512F-32-NEXT: setb %bl
-; AVX512F-32-NEXT: kmovw %ebx, %k1
-; AVX512F-32-NEXT: vmovapd %xmm1, %xmm4
-; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z}
; AVX512F-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vmovsd %xmm2, (%esp)
+; AVX512F-32-NEXT: xorl %edx, %edx
+; AVX512F-32-NEXT: vcomisd %xmm3, %xmm1
; AVX512F-32-NEXT: setae %dl
-; AVX512F-32-NEXT: shll $31, %edx
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: kmovw %edx, %k1
+; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
+; AVX512F-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1
+; AVX512F-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F-32-NEXT: xorl %ecx, %ecx
+; AVX512F-32-NEXT: vcomisd %xmm3, %xmm1
+; AVX512F-32-NEXT: setae %cl
+; AVX512F-32-NEXT: kmovw %ecx, %k1
+; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
+; AVX512F-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1
+; AVX512F-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: xorl %ebx, %ebx
-; AVX512F-32-NEXT: vcomisd %xmm1, %xmm0
-; AVX512F-32-NEXT: setb %al
-; AVX512F-32-NEXT: kmovw %eax, %k1
-; AVX512F-32-NEXT: vmovsd %xmm3, %xmm1, %xmm1 {%k1}
+; AVX512F-32-NEXT: vcomisd %xmm3, %xmm0
+; AVX512F-32-NEXT: setae %bl
+; AVX512F-32-NEXT: kmovw %ebx, %k1
+; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z}
; AVX512F-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX512F-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fldl (%esp)
+; AVX512F-32-NEXT: fisttpll (%esp)
+; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: shll $31, %eax
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: shll $31, %edx
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; AVX512F-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX512F-32-NEXT: setae %bl
+; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX512F-32-NEXT: shll $31, %ecx
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: shll $31, %ebx
; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1
; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1
+; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-32-NEXT: leal -8(%ebp), %esp
-; AVX512F-32-NEXT: popl %esi
+; AVX512F-32-NEXT: leal -4(%ebp), %esp
; AVX512F-32-NEXT: popl %ebx
; AVX512F-32-NEXT: popl %ebp
; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
@@ -483,83 +467,68 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512VL-32-NEXT: movl %esp, %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp
; AVX512VL-32-NEXT: pushl %ebx
-; AVX512VL-32-NEXT: pushl %esi
; AVX512VL-32-NEXT: andl $-8, %esp
-; AVX512VL-32-NEXT: subl $32, %esp
-; AVX512VL-32-NEXT: .cfi_offset %esi, -16
+; AVX512VL-32-NEXT: subl $40, %esp
; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
-; AVX512VL-32-NEXT: setb %cl
-; AVX512VL-32-NEXT: kmovw %ecx, %k1
-; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512VL-32-NEXT: xorl %eax, %eax
+; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm2
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, %esi
-; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX512VL-32-NEXT: xorl %ecx, %ecx
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4
-; AVX512VL-32-NEXT: setb %dl
-; AVX512VL-32-NEXT: kmovw %edx, %k1
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5
-; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm4, (%esp)
-; AVX512VL-32-NEXT: fldl (%esp)
-; AVX512VL-32-NEXT: fisttpll (%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: setae %cl
-; AVX512VL-32-NEXT: shll $31, %ecx
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; AVX512VL-32-NEXT: xorl %edx, %edx
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
-; AVX512VL-32-NEXT: setb %bl
-; AVX512VL-32-NEXT: kmovw %ebx, %k1
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT: kmovw %eax, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z}
; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vmovsd %xmm2, (%esp)
+; AVX512VL-32-NEXT: xorl %edx, %edx
+; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm1
; AVX512VL-32-NEXT: setae %dl
-; AVX512VL-32-NEXT: shll $31, %edx
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; AVX512VL-32-NEXT: kmovw %edx, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-32-NEXT: xorl %ecx, %ecx
+; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm1
+; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: kmovw %ecx, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: xorl %ebx, %ebx
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0
-; AVX512VL-32-NEXT: setb %al
-; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm1, %xmm1 {%k1}
+; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm0
+; AVX512VL-32-NEXT: setae %bl
+; AVX512VL-32-NEXT: kmovw %ebx, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z}
; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl (%esp)
+; AVX512VL-32-NEXT: fisttpll (%esp)
+; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: shll $31, %eax
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT: shll $31, %edx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX512VL-32-NEXT: setae %bl
+; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX512VL-32-NEXT: shll $31, %ecx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512VL-32-NEXT: shll $31, %ebx
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1
; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-32-NEXT: leal -8(%ebp), %esp
-; AVX512VL-32-NEXT: popl %esi
+; AVX512VL-32-NEXT: leal -4(%ebp), %esp
; AVX512VL-32-NEXT: popl %ebx
; AVX512VL-32-NEXT: popl %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4
@@ -788,17 +757,16 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $32, %esp
-; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB3_2
+; AVX-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX-32-NEXT: vmovaps %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB3_2
; AVX-32-NEXT: # %bb.1:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm4
+; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB3_2:
-; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
@@ -806,15 +774,15 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
; AVX-32-NEXT: movzbl %al, %eax
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB3_4
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX-32-NEXT: vmovaps %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB3_4
; AVX-32-NEXT: # %bb.3:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm4
+; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB3_4:
-; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovss %xmm3, (%esp)
+; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovss %xmm2, (%esp)
; AVX-32-NEXT: flds (%esp)
; AVX-32-NEXT: fisttpll (%esp)
; AVX-32-NEXT: wait
@@ -822,15 +790,15 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
; AVX-32-NEXT: movzbl %cl, %ecx
; AVX-32-NEXT: shll $31, %ecx
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT: jb .LBB3_6
+; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX-32-NEXT: vmovaps %xmm1, %xmm3
+; AVX-32-NEXT: jae .LBB3_6
; AVX-32-NEXT: # %bb.5:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm4
+; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-32-NEXT: .LBB3_6:
-; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
@@ -839,11 +807,11 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
; AVX-32-NEXT: shll $31, %edx
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX-32-NEXT: vcomiss %xmm1, %xmm0
-; AVX-32-NEXT: jb .LBB3_8
+; AVX-32-NEXT: jae .LBB3_8
; AVX-32-NEXT: # %bb.7:
-; AVX-32-NEXT: vmovaps %xmm1, %xmm2
+; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: .LBB3_8:
-; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
@@ -937,83 +905,68 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
; AVX512F-32-NEXT: movl %esp, %ebp
; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
; AVX512F-32-NEXT: pushl %ebx
-; AVX512F-32-NEXT: pushl %esi
; AVX512F-32-NEXT: andl $-8, %esp
-; AVX512F-32-NEXT: subl $32, %esp
-; AVX512F-32-NEXT: .cfi_offset %esi, -16
+; AVX512F-32-NEXT: subl $40, %esp
; AVX512F-32-NEXT: .cfi_offset %ebx, -12
-; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2
-; AVX512F-32-NEXT: setb %cl
-; AVX512F-32-NEXT: kmovw %ecx, %k1
-; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: wait
-; AVX512F-32-NEXT: movl $0, %eax
+; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT: xorl %eax, %eax
+; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1
; AVX512F-32-NEXT: setae %al
-; AVX512F-32-NEXT: shll $31, %eax
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: movl %eax, %esi
-; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX512F-32-NEXT: xorl %ecx, %ecx
-; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2
-; AVX512F-32-NEXT: setb %dl
-; AVX512F-32-NEXT: kmovw %edx, %k1
-; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT: vmovss %xmm2, (%esp)
-; AVX512F-32-NEXT: flds (%esp)
-; AVX512F-32-NEXT: fisttpll (%esp)
-; AVX512F-32-NEXT: wait
-; AVX512F-32-NEXT: setae %cl
-; AVX512F-32-NEXT: shll $31, %ecx
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT: vmovss %xmm1, (%esp)
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512F-32-NEXT: xorl %edx, %edx
-; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2
-; AVX512F-32-NEXT: setb %bl
-; AVX512F-32-NEXT: kmovw %ebx, %k1
-; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1
; AVX512F-32-NEXT: setae %dl
-; AVX512F-32-NEXT: shll $31, %edx
-; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: kmovw %edx, %k1
+; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-32-NEXT: xorl %ecx, %ecx
+; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1
+; AVX512F-32-NEXT: setae %cl
+; AVX512F-32-NEXT: kmovw %ecx, %k1
+; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: xorl %ebx, %ebx
-; AVX512F-32-NEXT: vcomiss %xmm1, %xmm0
-; AVX512F-32-NEXT: setb %al
-; AVX512F-32-NEXT: kmovw %eax, %k1
-; AVX512F-32-NEXT: vmovss %xmm3, %xmm1, %xmm1 {%k1}
+; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0
+; AVX512F-32-NEXT: setae %bl
+; AVX512F-32-NEXT: kmovw %ebx, %k1
+; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512F-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: flds (%esp)
+; AVX512F-32-NEXT: fisttpll (%esp)
+; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: shll $31, %eax
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: shll $31, %edx
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; AVX512F-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX512F-32-NEXT: setae %bl
+; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX512F-32-NEXT: shll $31, %ecx
+; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: shll $31, %ebx
; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1
; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1
+; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-32-NEXT: leal -8(%ebp), %esp
-; AVX512F-32-NEXT: popl %esi
+; AVX512F-32-NEXT: leal -4(%ebp), %esp
; AVX512F-32-NEXT: popl %ebx
; AVX512F-32-NEXT: popl %ebp
; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
@@ -1045,83 +998,68 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
; AVX512VL-32-NEXT: movl %esp, %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp
; AVX512VL-32-NEXT: pushl %ebx
-; AVX512VL-32-NEXT: pushl %esi
; AVX512VL-32-NEXT: andl $-8, %esp
-; AVX512VL-32-NEXT: subl $32, %esp
-; AVX512VL-32-NEXT: .cfi_offset %esi, -16
+; AVX512VL-32-NEXT: subl $40, %esp
; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
-; AVX512VL-32-NEXT: setb %cl
-; AVX512VL-32-NEXT: kmovw %ecx, %k1
-; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
+; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT: xorl %eax, %eax
+; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, %esi
-; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX512VL-32-NEXT: xorl %ecx, %ecx
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
-; AVX512VL-32-NEXT: setb %dl
-; AVX512VL-32-NEXT: kmovw %edx, %k1
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT: vmovss %xmm2, (%esp)
-; AVX512VL-32-NEXT: flds (%esp)
-; AVX512VL-32-NEXT: fisttpll (%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: setae %cl
-; AVX512VL-32-NEXT: shll $31, %ecx
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-32-NEXT: kmovw %eax, %k1
+; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vmovss %xmm1, (%esp)
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512VL-32-NEXT: xorl %edx, %edx
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
-; AVX512VL-32-NEXT: setb %bl
-; AVX512VL-32-NEXT: kmovw %ebx, %k1
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1
; AVX512VL-32-NEXT: setae %dl
-; AVX512VL-32-NEXT: shll $31, %edx
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; AVX512VL-32-NEXT: kmovw %edx, %k1
+; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-32-NEXT: xorl %ecx, %ecx
+; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1
+; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: kmovw %ecx, %k1
+; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: xorl %ebx, %ebx
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0
-; AVX512VL-32-NEXT: setb %al
-; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovss %xmm3, %xmm1, %xmm1 {%k1}
+; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0
+; AVX512VL-32-NEXT: setae %bl
+; AVX512VL-32-NEXT: kmovw %ebx, %k1
+; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: flds (%esp)
+; AVX512VL-32-NEXT: fisttpll (%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: shll $31, %eax
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT: shll $31, %edx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX512VL-32-NEXT: setae %bl
+; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX512VL-32-NEXT: shll $31, %ecx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512VL-32-NEXT: shll $31, %ebx
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1
; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-32-NEXT: leal -8(%ebp), %esp
-; AVX512VL-32-NEXT: popl %esi
+; AVX512VL-32-NEXT: leal -4(%ebp), %esp
; AVX512VL-32-NEXT: popl %ebx
; AVX512VL-32-NEXT: popl %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
index 6f4ab5faaa3b..af52e5fa98b6 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -149,147 +149,125 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
; AVX512VL-32-NEXT: .cfi_offset %esi, -20
; AVX512VL-32-NEXT: .cfi_offset %edi, -16
; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %al
+; AVX512VL-32-NEXT: setae %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT: movl %eax, %edi
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
+; AVX512VL-32-NEXT: vmovsd %xmm3, (%esp)
+; AVX512VL-32-NEXT: xorl %eax, %eax
+; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4
-; AVX512VL-32-NEXT: setb %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5
-; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm4, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
-; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; AVX512VL-32-NEXT: movl %eax, %esi
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %al
-; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4
-; AVX512VL-32-NEXT: setb %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5
-; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm4, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
-; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, %edi
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %al
-; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
-; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, %esi
-; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX512VL-32-NEXT: xorl %edx, %edx
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4
-; AVX512VL-32-NEXT: setb %al
-; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5
-; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm4, (%esp)
-; AVX512VL-32-NEXT: fldl (%esp)
-; AVX512VL-32-NEXT: fisttpll (%esp)
-; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
; AVX512VL-32-NEXT: setae %dl
-; AVX512VL-32-NEXT: shll $31, %edx
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; AVX512VL-32-NEXT: kmovw %edx, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %cl
-; AVX512VL-32-NEXT: kmovw %ecx, %k1
-; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT: setae %al
+; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; AVX512VL-32-NEXT: kmovw %eax, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: xorl %ecx, %ecx
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0
-; AVX512VL-32-NEXT: setb %bl
+; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
+; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: kmovw %ecx, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-32-NEXT: xorl %ebx, %ebx
+; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
+; AVX512VL-32-NEXT: setae %bl
; AVX512VL-32-NEXT: kmovw %ebx, %k1
-; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: xorl %eax, %eax
+; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-32-NEXT: setae %al
+; AVX512VL-32-NEXT: kmovw %eax, %k1
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl (%esp)
+; AVX512VL-32-NEXT: fisttpll (%esp)
+; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: shll $31, %esi
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX512VL-32-NEXT: shll $31, %edi
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi
; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
+; AVX512VL-32-NEXT: shll $31, %edx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1
+; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; AVX512VL-32-NEXT: shll $31, %edx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1
+; AVX512VL-32-NEXT: shll $31, %ecx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; AVX512VL-32-NEXT: shll $31, %ecx
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm2
+; AVX512VL-32-NEXT: shll $31, %eax
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
+; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX512VL-32-NEXT: shll $31, %ebx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx
; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3
-; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
+; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm3, %xmm3
; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -463,147 +441,125 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
; AVX512VL-32-NEXT: .cfi_offset %esi, -20
; AVX512VL-32-NEXT: .cfi_offset %edi, -16
; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,3,3,3]
; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %al
+; AVX512VL-32-NEXT: setae %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT: movl %eax, %edi
+; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z}
; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
-; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT: vmovss %xmm3, (%esp)
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %al
+; AVX512VL-32-NEXT: setae %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT: movl %eax, %esi
+; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z}
; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
+; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX512VL-32-NEXT: xorl %eax, %eax
+; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z}
; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
+; AVX512VL-32-NEXT: xorl %edx, %edx
+; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX512VL-32-NEXT: setae %dl
+; AVX512VL-32-NEXT: kmovw %edx, %k1
+; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT: xorl %eax, %eax
+; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm4
-; AVX512VL-32-NEXT: setb %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5
-; AVX512VL-32-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
-; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, %edi
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %al
-; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: movl $0, %eax
+; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-32-NEXT: xorl %ecx, %ecx
+; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: kmovw %ecx, %k1
+; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX512VL-32-NEXT: xorl %ebx, %ebx
+; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
+; AVX512VL-32-NEXT: setae %bl
+; AVX512VL-32-NEXT: kmovw %ebx, %k1
+; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: xorl %eax, %eax
+; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0
; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: movl %eax, %esi
-; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,3,3,3]
-; AVX512VL-32-NEXT: xorl %edx, %edx
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm4
-; AVX512VL-32-NEXT: setb %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5
-; AVX512VL-32-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm4, (%esp)
+; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: flds (%esp)
; AVX512VL-32-NEXT: fisttpll (%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: setae %dl
-; AVX512VL-32-NEXT: shll $31, %edx
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
-; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512VL-32-NEXT: xorl %eax, %eax
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT: setb %cl
-; AVX512VL-32-NEXT: kmovw %ecx, %k1
-; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: wait
-; AVX512VL-32-NEXT: setae %al
-; AVX512VL-32-NEXT: shll $31, %eax
-; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT: xorl %ecx, %ecx
-; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0
-; AVX512VL-32-NEXT: setb %bl
-; AVX512VL-32-NEXT: kmovw %ebx, %k1
-; AVX512VL-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: shll $31, %esi
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX512VL-32-NEXT: shll $31, %edi
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi
; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
+; AVX512VL-32-NEXT: shll $31, %edx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1
+; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; AVX512VL-32-NEXT: shll $31, %edx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1
+; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1
+; AVX512VL-32-NEXT: shll $31, %ecx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512VL-32-NEXT: setae %cl
+; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; AVX512VL-32-NEXT: shll $31, %ecx
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm2
+; AVX512VL-32-NEXT: shll $31, %eax
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
+; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX512VL-32-NEXT: shll $31, %ebx
+; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx
; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3
-; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
+; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm3, %xmm3
; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
More information about the llvm-commits
mailing list