[llvm] f40925a - [X86] Improve lowering of fptoui

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Nov 7 23:50:37 PST 2020


Author: Craig Topper
Date: 2020-11-07T23:50:03-08:00
New Revision: f40925aa8b3d200b4616eb0b8a772a1812800b21

URL: https://github.com/llvm/llvm-project/commit/f40925aa8b3d200b4616eb0b8a772a1812800b21
DIFF: https://github.com/llvm/llvm-project/commit/f40925aa8b3d200b4616eb0b8a772a1812800b21.diff

LOG: [X86] Improve lowering of fptoui

Invert the select condition when masking in the sign bit of an fptoui operation. Also, rather than lowering the sign mask to select/xor and expecting the select to get cleaned up later, directly lower to shift/xor.

Patch by Layton Kifer!

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D90658

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/fp-cvt.ll
    llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
    llvm/test/CodeGen/X86/fp-intrinsics.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
    llvm/test/CodeGen/X86/fp80-strict-scalar.ll
    llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
    llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
    llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
    llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2c5175e0543e..269ef229b018 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20407,8 +20407,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     // of a signed i64.  Let Thresh be the FP equivalent of
     // 0x8000000000000000ULL.
     //
-    //  Adjust = (Value < Thresh) ? 0 : 0x80000000;
-    //  FltOfs = (Value < Thresh) ? 0 : 0x80000000;
+    //  Adjust = (Value >= Thresh) ? 0x80000000 : 0;
+    //  FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
     //  FistSrc = (Value - FltOfs);
     //  Fist-to-mem64 FistSrc
     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
@@ -20438,20 +20438,30 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                    *DAG.getContext(), TheVT);
     SDValue Cmp;
     if (IsStrict) {
-      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
-                         Chain, /*IsSignaling*/ true);
+      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
+                         /*IsSignaling*/ true);
       Chain = Cmp.getValue(1);
     } else {
-      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
+      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
     }
 
-    Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
-                           DAG.getConstant(0, DL, MVT::i64),
-                           DAG.getConstant(APInt::getSignMask(64),
-                                           DL, MVT::i64));
-    SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
-                                   DAG.getConstantFP(0.0, DL, TheVT),
-                                   ThreshVal);
+    // Our preferred lowering of
+    //
+    // (Value >= Thresh) ? 0x8000000000000000ULL : 0
+    //
+    // is
+    //
+    // (Value >= Thresh) << 63
+    //
+    // but since we can get here after LegalOperations, DAGCombine might do the
+    // wrong thing if we create a select. So, directly create the preferred
+    // version.
+    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
+    SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
+    Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
+
+    SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
+                                   DAG.getConstantFP(0.0, DL, TheVT));
 
     if (IsStrict) {
       Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},

diff  --git a/llvm/test/CodeGen/X86/fp-cvt.ll b/llvm/test/CodeGen/X86/fp-cvt.ll
index cedbfd2e9bff..cb438727cfba 100644
--- a/llvm/test/CodeGen/X86/fp-cvt.ll
+++ b/llvm/test/CodeGen/X86/fp-cvt.ll
@@ -451,13 +451,12 @@ define i64 @fptoui_i64_fp80(x86_fp80 %a0) nounwind {
 ; X86-NEXT:    sahf
 ; X86-NEXT:    setbe %al
 ; X86-NEXT:    fldz
-; X86-NEXT:    ja .LBB10_2
+; X86-NEXT:    jbe .LBB10_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fstp %st(1)
 ; X86-NEXT:    fldz
-; X86-NEXT:    fxch %st(1)
 ; X86-NEXT:  .LBB10_2:
-; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    fstp %st(0)
 ; X86-NEXT:    fsubrp %st, %st(1)
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -482,8 +481,7 @@ define i64 @fptoui_i64_fp80(x86_fp80 %a0) nounwind {
 ; X64-X87-NEXT:    fucomi %st(1), %st
 ; X64-X87-NEXT:    setbe %al
 ; X64-X87-NEXT:    fldz
-; X64-X87-NEXT:    fxch %st(1)
-; X64-X87-NEXT:    fcmovnbe %st(1), %st
+; X64-X87-NEXT:    fcmovbe %st(1), %st
 ; X64-X87-NEXT:    fstp %st(1)
 ; X64-X87-NEXT:    fsubrp %st, %st(1)
 ; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
@@ -504,8 +502,7 @@ define i64 @fptoui_i64_fp80(x86_fp80 %a0) nounwind {
 ; X64-SSSE3-NEXT:    xorl %eax, %eax
 ; X64-SSSE3-NEXT:    fucomi %st(1), %st
 ; X64-SSSE3-NEXT:    fldz
-; X64-SSSE3-NEXT:    fxch %st(1)
-; X64-SSSE3-NEXT:    fcmovnbe %st(1), %st
+; X64-SSSE3-NEXT:    fcmovbe %st(1), %st
 ; X64-SSSE3-NEXT:    fstp %st(1)
 ; X64-SSSE3-NEXT:    fsubrp %st, %st(1)
 ; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
@@ -534,13 +531,12 @@ define i64 @fptoui_i64_fp80_ld(x86_fp80 *%a0) nounwind {
 ; X86-NEXT:    sahf
 ; X86-NEXT:    setbe %al
 ; X86-NEXT:    fldz
-; X86-NEXT:    ja .LBB11_2
+; X86-NEXT:    jbe .LBB11_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fstp %st(1)
 ; X86-NEXT:    fldz
-; X86-NEXT:    fxch %st(1)
 ; X86-NEXT:  .LBB11_2:
-; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    fstp %st(0)
 ; X86-NEXT:    fsubrp %st, %st(1)
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -565,8 +561,7 @@ define i64 @fptoui_i64_fp80_ld(x86_fp80 *%a0) nounwind {
 ; X64-X87-NEXT:    fucomi %st(1), %st
 ; X64-X87-NEXT:    setbe %al
 ; X64-X87-NEXT:    fldz
-; X64-X87-NEXT:    fxch %st(1)
-; X64-X87-NEXT:    fcmovnbe %st(1), %st
+; X64-X87-NEXT:    fcmovbe %st(1), %st
 ; X64-X87-NEXT:    fstp %st(1)
 ; X64-X87-NEXT:    fsubrp %st, %st(1)
 ; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
@@ -587,8 +582,7 @@ define i64 @fptoui_i64_fp80_ld(x86_fp80 *%a0) nounwind {
 ; X64-SSSE3-NEXT:    xorl %eax, %eax
 ; X64-SSSE3-NEXT:    fucomi %st(1), %st
 ; X64-SSSE3-NEXT:    fldz
-; X64-SSSE3-NEXT:    fxch %st(1)
-; X64-SSSE3-NEXT:    fcmovnbe %st(1), %st
+; X64-SSSE3-NEXT:    fcmovbe %st(1), %st
 ; X64-SSSE3-NEXT:    fstp %st(1)
 ; X64-SSSE3-NEXT:    fsubrp %st, %st(1)
 ; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)

diff  --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
index 4f2859d4bffa..52278a6369eb 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
@@ -32,7 +32,7 @@ entry:
 ; CHECK: COMISDrr [[MOVSDrm_alt1]], [[MOVSDrm_alt]], implicit-def $eflags, implicit $mxcsr
 ; CHECK: [[FsFLD0SD:%[0-9]+]]:fr64 = FsFLD0SD
 ; CHECK: JCC_1
-; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[MOVSDrm_alt1]], {{.*}}, [[FsFLD0SD]], {{.*}}
+; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[FsFLD0SD]], {{.*}}, [[MOVSDrm_alt1]], {{.*}}
 ; CHECK: [[SUBSDrr:%[0-9]+]]:fr64 = SUBSDrr [[MOVSDrm_alt]], killed [[PHI]], implicit $mxcsr
 ; CHECK: MOVSDmr %stack.0, 1, $noreg, 0, $noreg, killed [[SUBSDrr]] :: (store 8 into %stack.0)
 ; CHECK: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 6, implicit $eflags

diff  --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 8cd021ad54e3..abe88f1ca233 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -1363,8 +1363,7 @@ define i64 @f20u64(double %x) #0 {
 ; X87-NEXT:    wait
 ; X87-NEXT:    setbe %dl
 ; X87-NEXT:    fldz
-; X87-NEXT:    fxch %st(1)
-; X87-NEXT:    fcmovnbe %st(1), %st
+; X87-NEXT:    fcmovbe %st(1), %st
 ; X87-NEXT:    fstp %st(1)
 ; X87-NEXT:    fsubrp %st, %st(1)
 ; X87-NEXT:    wait
@@ -1387,12 +1386,11 @@ define i64 @f20u64(double %x) #0 {
 ; X86-SSE-NEXT:    subl $20, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 24
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; X86-SSE-NEXT:    comisd %xmm0, %xmm2
-; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
-; X86-SSE-NEXT:    ja .LBB25_2
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT:    comisd %xmm0, %xmm1
+; X86-SSE-NEXT:    jbe .LBB25_2
 ; X86-SSE-NEXT:  # %bb.1: # %entry
-; X86-SSE-NEXT:    movapd %xmm2, %xmm1
+; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
 ; X86-SSE-NEXT:  .LBB25_2: # %entry
 ; X86-SSE-NEXT:    subsd %xmm1, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)

diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
index e030a9159710..156ee617e72a 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
@@ -543,12 +543,11 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; SSE-X86-NEXT:    andl $-8, %esp
 ; SSE-X86-NEXT:    subl $16, %esp
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-X86-NEXT:    comiss %xmm0, %xmm2
-; SSE-X86-NEXT:    xorps %xmm1, %xmm1
-; SSE-X86-NEXT:    ja .LBB9_2
+; SSE-X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-X86-NEXT:    comiss %xmm0, %xmm1
+; SSE-X86-NEXT:    jbe .LBB9_2
 ; SSE-X86-NEXT:  # %bb.1:
-; SSE-X86-NEXT:    movaps %xmm2, %xmm1
+; SSE-X86-NEXT:    xorps %xmm1, %xmm1
 ; SSE-X86-NEXT:  .LBB9_2:
 ; SSE-X86-NEXT:    subss %xmm1, %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
@@ -600,12 +599,11 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; AVX1-X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX1-X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX1-X86-NEXT:    vcomiss %xmm0, %xmm1
-; AVX1-X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-X86-NEXT:    ja .LBB9_2
+; AVX1-X86-NEXT:    jbe .LBB9_2
 ; AVX1-X86-NEXT:  # %bb.1:
-; AVX1-X86-NEXT:    vmovaps %xmm1, %xmm2
+; AVX1-X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX1-X86-NEXT:  .LBB9_2:
-; AVX1-X86-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX1-X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX1-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX1-X86-NEXT:    flds (%esp)
 ; AVX1-X86-NEXT:    fisttpll (%esp)
@@ -650,16 +648,14 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; AVX512-X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512-X86-NEXT:    xorl %edx, %edx
 ; AVX512-X86-NEXT:    vcomiss %xmm0, %xmm1
-; AVX512-X86-NEXT:    seta %al
-; AVX512-X86-NEXT:    kmovw %eax, %k1
-; AVX512-X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-X86-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-X86-NEXT:    setbe %dl
+; AVX512-X86-NEXT:    kmovw %edx, %k1
+; AVX512-X86-NEXT:    vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; AVX512-X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX512-X86-NEXT:    flds (%esp)
 ; AVX512-X86-NEXT:    fisttpll (%esp)
 ; AVX512-X86-NEXT:    wait
-; AVX512-X86-NEXT:    setbe %dl
 ; AVX512-X86-NEXT:    shll $31, %edx
 ; AVX512-X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512-X86-NEXT:    movl (%esp), %eax
@@ -692,13 +688,12 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; X87-NEXT:    sahf
 ; X87-NEXT:    setbe %al
 ; X87-NEXT:    fldz
-; X87-NEXT:    ja .LBB9_2
+; X87-NEXT:    jbe .LBB9_2
 ; X87-NEXT:  # %bb.1:
-; X87-NEXT:    fstp %st(0)
+; X87-NEXT:    fstp %st(1)
 ; X87-NEXT:    fldz
-; X87-NEXT:    fxch %st(1)
 ; X87-NEXT:  .LBB9_2:
-; X87-NEXT:    fstp %st(1)
+; X87-NEXT:    fstp %st(0)
 ; X87-NEXT:    fsubrp %st, %st(1)
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw {{[0-9]+}}(%esp)
@@ -1188,12 +1183,11 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; SSE-X86-NEXT:    andl $-8, %esp
 ; SSE-X86-NEXT:    subl $16, %esp
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-X86-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-X86-NEXT:    comisd %xmm0, %xmm2
-; SSE-X86-NEXT:    xorpd %xmm1, %xmm1
-; SSE-X86-NEXT:    ja .LBB18_2
+; SSE-X86-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-X86-NEXT:    comisd %xmm0, %xmm1
+; SSE-X86-NEXT:    jbe .LBB18_2
 ; SSE-X86-NEXT:  # %bb.1:
-; SSE-X86-NEXT:    movapd %xmm2, %xmm1
+; SSE-X86-NEXT:    xorpd %xmm1, %xmm1
 ; SSE-X86-NEXT:  .LBB18_2:
 ; SSE-X86-NEXT:    subsd %xmm1, %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
@@ -1245,12 +1239,11 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; AVX1-X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-X86-NEXT:    vcomisd %xmm0, %xmm1
-; AVX1-X86-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX1-X86-NEXT:    ja .LBB18_2
+; AVX1-X86-NEXT:    jbe .LBB18_2
 ; AVX1-X86-NEXT:  # %bb.1:
-; AVX1-X86-NEXT:    vmovapd %xmm1, %xmm2
+; AVX1-X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX1-X86-NEXT:  .LBB18_2:
-; AVX1-X86-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
+; AVX1-X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX1-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX1-X86-NEXT:    fldl (%esp)
 ; AVX1-X86-NEXT:    fisttpll (%esp)
@@ -1295,16 +1288,14 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; AVX512-X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-X86-NEXT:    xorl %edx, %edx
 ; AVX512-X86-NEXT:    vcomisd %xmm0, %xmm1
-; AVX512-X86-NEXT:    seta %al
-; AVX512-X86-NEXT:    kmovw %eax, %k1
-; AVX512-X86-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX512-X86-NEXT:    vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-X86-NEXT:    setbe %dl
+; AVX512-X86-NEXT:    kmovw %edx, %k1
+; AVX512-X86-NEXT:    vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; AVX512-X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX512-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX512-X86-NEXT:    fldl (%esp)
 ; AVX512-X86-NEXT:    fisttpll (%esp)
 ; AVX512-X86-NEXT:    wait
-; AVX512-X86-NEXT:    setbe %dl
 ; AVX512-X86-NEXT:    shll $31, %edx
 ; AVX512-X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512-X86-NEXT:    movl (%esp), %eax
@@ -1337,13 +1328,12 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; X87-NEXT:    sahf
 ; X87-NEXT:    setbe %al
 ; X87-NEXT:    fldz
-; X87-NEXT:    ja .LBB18_2
+; X87-NEXT:    jbe .LBB18_2
 ; X87-NEXT:  # %bb.1:
-; X87-NEXT:    fstp %st(0)
+; X87-NEXT:    fstp %st(1)
 ; X87-NEXT:    fldz
-; X87-NEXT:    fxch %st(1)
 ; X87-NEXT:  .LBB18_2:
-; X87-NEXT:    fstp %st(1)
+; X87-NEXT:    fstp %st(0)
 ; X87-NEXT:    fsubrp %st, %st(1)
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw {{[0-9]+}}(%esp)

diff  --git a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
index c127a25d7ca9..e55e3903c0dc 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
@@ -597,13 +597,12 @@ define i64 @fp80_to_uint64(x86_fp80 %x) #0 {
 ; X86-NEXT:    sahf
 ; X86-NEXT:    setbe %al
 ; X86-NEXT:    fldz
-; X86-NEXT:    ja .LBB18_2
+; X86-NEXT:    jbe .LBB18_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fstp %st(1)
 ; X86-NEXT:    fldz
-; X86-NEXT:    fxch %st(1)
 ; X86-NEXT:  .LBB18_2:
-; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    fstp %st(0)
 ; X86-NEXT:    fsubrp %st, %st(1)
 ; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
@@ -632,8 +631,7 @@ define i64 @fp80_to_uint64(x86_fp80 %x) #0 {
 ; X64-NEXT:    wait
 ; X64-NEXT:    setbe %al
 ; X64-NEXT:    fldz
-; X64-NEXT:    fxch %st(1)
-; X64-NEXT:    fcmovnbe %st(1), %st
+; X64-NEXT:    fcmovbe %st(1), %st
 ; X64-NEXT:    fstp %st(1)
 ; X64-NEXT:    fsubrp %st, %st(1)
 ; X64-NEXT:    wait

diff  --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
index 2405292f4efc..41345a98ab4b 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -69,16 +69,15 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-AVX512F-WIN-NEXT:    subl $8, %esp
 ; X86-AVX512F-WIN-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX512F-WIN-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX512F-WIN-NEXT:    vcmpltss %xmm1, %xmm0, %k1
-; X86-AVX512F-WIN-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX512F-WIN-NEXT:    xorl %edx, %edx
 ; X86-AVX512F-WIN-NEXT:    vucomiss %xmm0, %xmm1
-; X86-AVX512F-WIN-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; X86-AVX512F-WIN-NEXT:    setbe %dl
+; X86-AVX512F-WIN-NEXT:    kmovw %edx, %k1
+; X86-AVX512F-WIN-NEXT:    vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; X86-AVX512F-WIN-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; X86-AVX512F-WIN-NEXT:    vmovss %xmm0, (%esp)
 ; X86-AVX512F-WIN-NEXT:    flds (%esp)
 ; X86-AVX512F-WIN-NEXT:    fisttpll (%esp)
-; X86-AVX512F-WIN-NEXT:    setbe %dl
 ; X86-AVX512F-WIN-NEXT:    shll $31, %edx
 ; X86-AVX512F-WIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-AVX512F-WIN-NEXT:    movl (%esp), %eax
@@ -91,16 +90,15 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-AVX512F-LIN-NEXT:    subl $12, %esp
 ; X86-AVX512F-LIN-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX512F-LIN-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX512F-LIN-NEXT:    vcmpltss %xmm1, %xmm0, %k1
-; X86-AVX512F-LIN-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX512F-LIN-NEXT:    xorl %edx, %edx
 ; X86-AVX512F-LIN-NEXT:    vucomiss %xmm0, %xmm1
-; X86-AVX512F-LIN-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; X86-AVX512F-LIN-NEXT:    setbe %dl
+; X86-AVX512F-LIN-NEXT:    kmovw %edx, %k1
+; X86-AVX512F-LIN-NEXT:    vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; X86-AVX512F-LIN-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; X86-AVX512F-LIN-NEXT:    vmovss %xmm0, (%esp)
 ; X86-AVX512F-LIN-NEXT:    flds (%esp)
 ; X86-AVX512F-LIN-NEXT:    fisttpll (%esp)
-; X86-AVX512F-LIN-NEXT:    setbe %dl
 ; X86-AVX512F-LIN-NEXT:    shll $31, %edx
 ; X86-AVX512F-LIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-AVX512F-LIN-NEXT:    movl (%esp), %eax
@@ -115,16 +113,17 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE3-WIN-NEXT:    subl $8, %esp
 ; X86-SSE3-WIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE3-WIN-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE3-WIN-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE3-WIN-NEXT:    xorl %edx, %edx
 ; X86-SSE3-WIN-NEXT:    ucomiss %xmm0, %xmm1
-; X86-SSE3-WIN-NEXT:    cmpltss %xmm1, %xmm0
-; X86-SSE3-WIN-NEXT:    andnps %xmm1, %xmm0
-; X86-SSE3-WIN-NEXT:    subss %xmm0, %xmm2
-; X86-SSE3-WIN-NEXT:    movss %xmm2, (%esp)
+; X86-SSE3-WIN-NEXT:    jbe LBB0_2
+; X86-SSE3-WIN-NEXT:  # %bb.1:
+; X86-SSE3-WIN-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE3-WIN-NEXT:  LBB0_2:
+; X86-SSE3-WIN-NEXT:    subss %xmm1, %xmm0
+; X86-SSE3-WIN-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE3-WIN-NEXT:    flds (%esp)
 ; X86-SSE3-WIN-NEXT:    fisttpll (%esp)
-; X86-SSE3-WIN-NEXT:    setbe %dl
+; X86-SSE3-WIN-NEXT:    setbe %al
+; X86-SSE3-WIN-NEXT:    movzbl %al, %edx
 ; X86-SSE3-WIN-NEXT:    shll $31, %edx
 ; X86-SSE3-WIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE3-WIN-NEXT:    movl (%esp), %eax
@@ -137,16 +136,17 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE3-LIN-NEXT:    subl $12, %esp
 ; X86-SSE3-LIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE3-LIN-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE3-LIN-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE3-LIN-NEXT:    xorl %edx, %edx
 ; X86-SSE3-LIN-NEXT:    ucomiss %xmm0, %xmm1
-; X86-SSE3-LIN-NEXT:    cmpltss %xmm1, %xmm0
-; X86-SSE3-LIN-NEXT:    andnps %xmm1, %xmm0
-; X86-SSE3-LIN-NEXT:    subss %xmm0, %xmm2
-; X86-SSE3-LIN-NEXT:    movss %xmm2, (%esp)
+; X86-SSE3-LIN-NEXT:    jbe .LBB0_2
+; X86-SSE3-LIN-NEXT:  # %bb.1:
+; X86-SSE3-LIN-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE3-LIN-NEXT:  .LBB0_2:
+; X86-SSE3-LIN-NEXT:    subss %xmm1, %xmm0
+; X86-SSE3-LIN-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE3-LIN-NEXT:    flds (%esp)
 ; X86-SSE3-LIN-NEXT:    fisttpll (%esp)
-; X86-SSE3-LIN-NEXT:    setbe %dl
+; X86-SSE3-LIN-NEXT:    setbe %al
+; X86-SSE3-LIN-NEXT:    movzbl %al, %edx
 ; X86-SSE3-LIN-NEXT:    shll $31, %edx
 ; X86-SSE3-LIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE3-LIN-NEXT:    movl (%esp), %eax
@@ -174,23 +174,23 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE2-WIN-NEXT:    subl $16, %esp
 ; X86-SSE2-WIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE2-WIN-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE2-WIN-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-WIN-NEXT:    cmpltss %xmm1, %xmm2
-; X86-SSE2-WIN-NEXT:    andnps %xmm1, %xmm2
-; X86-SSE2-WIN-NEXT:    movaps %xmm0, %xmm3
-; X86-SSE2-WIN-NEXT:    subss %xmm2, %xmm3
-; X86-SSE2-WIN-NEXT:    movss %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT:    ucomiss %xmm0, %xmm1
+; X86-SSE2-WIN-NEXT:    jbe LBB0_2
+; X86-SSE2-WIN-NEXT:  # %bb.1:
+; X86-SSE2-WIN-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE2-WIN-NEXT:  LBB0_2:
+; X86-SSE2-WIN-NEXT:    subss %xmm1, %xmm0
+; X86-SSE2-WIN-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT:    setbe %al
 ; X86-SSE2-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-SSE2-WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; X86-SSE2-WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-WIN-NEXT:    orl $3072, %eax # imm = 0xC00
-; X86-SSE2-WIN-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-WIN-NEXT:    orl $3072, %ecx # imm = 0xC00
+; X86-SSE2-WIN-NEXT:    movw %cx, {{[0-9]+}}(%esp)
 ; X86-SSE2-WIN-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE2-WIN-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-SSE2-WIN-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE2-WIN-NEXT:    xorl %edx, %edx
-; X86-SSE2-WIN-NEXT:    ucomiss %xmm0, %xmm1
-; X86-SSE2-WIN-NEXT:    setbe %dl
+; X86-SSE2-WIN-NEXT:    movzbl %al, %edx
 ; X86-SSE2-WIN-NEXT:    shll $31, %edx
 ; X86-SSE2-WIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -203,23 +203,23 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE2-LIN-NEXT:    subl $20, %esp
 ; X86-SSE2-LIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE2-LIN-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE2-LIN-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-LIN-NEXT:    cmpltss %xmm1, %xmm2
-; X86-SSE2-LIN-NEXT:    andnps %xmm1, %xmm2
-; X86-SSE2-LIN-NEXT:    movaps %xmm0, %xmm3
-; X86-SSE2-LIN-NEXT:    subss %xmm2, %xmm3
-; X86-SSE2-LIN-NEXT:    movss %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT:    ucomiss %xmm0, %xmm1
+; X86-SSE2-LIN-NEXT:    jbe .LBB0_2
+; X86-SSE2-LIN-NEXT:  # %bb.1:
+; X86-SSE2-LIN-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE2-LIN-NEXT:  .LBB0_2:
+; X86-SSE2-LIN-NEXT:    subss %xmm1, %xmm0
+; X86-SSE2-LIN-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT:    setbe %al
 ; X86-SSE2-LIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-SSE2-LIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; X86-SSE2-LIN-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-LIN-NEXT:    orl $3072, %eax # imm = 0xC00
-; X86-SSE2-LIN-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-LIN-NEXT:    orl $3072, %ecx # imm = 0xC00
+; X86-SSE2-LIN-NEXT:    movw %cx, {{[0-9]+}}(%esp)
 ; X86-SSE2-LIN-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE2-LIN-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-SSE2-LIN-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE2-LIN-NEXT:    xorl %edx, %edx
-; X86-SSE2-LIN-NEXT:    ucomiss %xmm0, %xmm1
-; X86-SSE2-LIN-NEXT:    setbe %dl
+; X86-SSE2-LIN-NEXT:    movzbl %al, %edx
 ; X86-SSE2-LIN-NEXT:    shll $31, %edx
 ; X86-SSE2-LIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -241,13 +241,12 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X87-WIN-NEXT:    sahf
 ; X87-WIN-NEXT:    setbe %al
 ; X87-WIN-NEXT:    fldz
-; X87-WIN-NEXT:    ja LBB0_2
+; X87-WIN-NEXT:    jbe LBB0_2
 ; X87-WIN-NEXT:  # %bb.1:
-; X87-WIN-NEXT:    fstp %st(0)
+; X87-WIN-NEXT:    fstp %st(1)
 ; X87-WIN-NEXT:    fldz
-; X87-WIN-NEXT:    fxch %st(1)
 ; X87-WIN-NEXT:  LBB0_2:
-; X87-WIN-NEXT:    fstp %st(1)
+; X87-WIN-NEXT:    fstp %st(0)
 ; X87-WIN-NEXT:    fsubrp %st, %st(1)
 ; X87-WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -276,13 +275,12 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X87-LIN-NEXT:    sahf
 ; X87-LIN-NEXT:    setbe %al
 ; X87-LIN-NEXT:    fldz
-; X87-LIN-NEXT:    ja .LBB0_2
+; X87-LIN-NEXT:    jbe .LBB0_2
 ; X87-LIN-NEXT:  # %bb.1:
-; X87-LIN-NEXT:    fstp %st(0)
+; X87-LIN-NEXT:    fstp %st(1)
 ; X87-LIN-NEXT:    fldz
-; X87-LIN-NEXT:    fxch %st(1)
 ; X87-LIN-NEXT:  .LBB0_2:
-; X87-LIN-NEXT:    fstp %st(1)
+; X87-LIN-NEXT:    fstp %st(0)
 ; X87-LIN-NEXT:    fsubrp %st, %st(1)
 ; X87-LIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-LIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -495,16 +493,15 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-AVX512F-WIN-NEXT:    subl $8, %esp
 ; X86-AVX512F-WIN-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-AVX512F-WIN-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-WIN-NEXT:    vcmpltsd %xmm1, %xmm0, %k1
-; X86-AVX512F-WIN-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; X86-AVX512F-WIN-NEXT:    xorl %edx, %edx
 ; X86-AVX512F-WIN-NEXT:    vucomisd %xmm0, %xmm1
-; X86-AVX512F-WIN-NEXT:    vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; X86-AVX512F-WIN-NEXT:    setbe %dl
+; X86-AVX512F-WIN-NEXT:    kmovw %edx, %k1
+; X86-AVX512F-WIN-NEXT:    vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; X86-AVX512F-WIN-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; X86-AVX512F-WIN-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-AVX512F-WIN-NEXT:    fldl (%esp)
 ; X86-AVX512F-WIN-NEXT:    fisttpll (%esp)
-; X86-AVX512F-WIN-NEXT:    setbe %dl
 ; X86-AVX512F-WIN-NEXT:    shll $31, %edx
 ; X86-AVX512F-WIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-AVX512F-WIN-NEXT:    movl (%esp), %eax
@@ -517,16 +514,15 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-AVX512F-LIN-NEXT:    subl $12, %esp
 ; X86-AVX512F-LIN-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-AVX512F-LIN-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-LIN-NEXT:    vcmpltsd %xmm1, %xmm0, %k1
-; X86-AVX512F-LIN-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; X86-AVX512F-LIN-NEXT:    xorl %edx, %edx
 ; X86-AVX512F-LIN-NEXT:    vucomisd %xmm0, %xmm1
-; X86-AVX512F-LIN-NEXT:    vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; X86-AVX512F-LIN-NEXT:    setbe %dl
+; X86-AVX512F-LIN-NEXT:    kmovw %edx, %k1
+; X86-AVX512F-LIN-NEXT:    vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; X86-AVX512F-LIN-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; X86-AVX512F-LIN-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-AVX512F-LIN-NEXT:    fldl (%esp)
 ; X86-AVX512F-LIN-NEXT:    fisttpll (%esp)
-; X86-AVX512F-LIN-NEXT:    setbe %dl
 ; X86-AVX512F-LIN-NEXT:    shll $31, %edx
 ; X86-AVX512F-LIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-AVX512F-LIN-NEXT:    movl (%esp), %eax
@@ -541,16 +537,17 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE3-WIN-NEXT:    subl $8, %esp
 ; X86-SSE3-WIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE3-WIN-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE3-WIN-NEXT:    movapd %xmm0, %xmm2
-; X86-SSE3-WIN-NEXT:    xorl %edx, %edx
 ; X86-SSE3-WIN-NEXT:    ucomisd %xmm0, %xmm1
-; X86-SSE3-WIN-NEXT:    cmpltsd %xmm1, %xmm0
-; X86-SSE3-WIN-NEXT:    andnpd %xmm1, %xmm0
-; X86-SSE3-WIN-NEXT:    subsd %xmm0, %xmm2
-; X86-SSE3-WIN-NEXT:    movsd %xmm2, (%esp)
+; X86-SSE3-WIN-NEXT:    jbe LBB2_2
+; X86-SSE3-WIN-NEXT:  # %bb.1:
+; X86-SSE3-WIN-NEXT:    xorpd %xmm1, %xmm1
+; X86-SSE3-WIN-NEXT:  LBB2_2:
+; X86-SSE3-WIN-NEXT:    subsd %xmm1, %xmm0
+; X86-SSE3-WIN-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE3-WIN-NEXT:    fldl (%esp)
 ; X86-SSE3-WIN-NEXT:    fisttpll (%esp)
-; X86-SSE3-WIN-NEXT:    setbe %dl
+; X86-SSE3-WIN-NEXT:    setbe %al
+; X86-SSE3-WIN-NEXT:    movzbl %al, %edx
 ; X86-SSE3-WIN-NEXT:    shll $31, %edx
 ; X86-SSE3-WIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE3-WIN-NEXT:    movl (%esp), %eax
@@ -563,16 +560,17 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE3-LIN-NEXT:    subl $12, %esp
 ; X86-SSE3-LIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE3-LIN-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE3-LIN-NEXT:    movapd %xmm0, %xmm2
-; X86-SSE3-LIN-NEXT:    xorl %edx, %edx
 ; X86-SSE3-LIN-NEXT:    ucomisd %xmm0, %xmm1
-; X86-SSE3-LIN-NEXT:    cmpltsd %xmm1, %xmm0
-; X86-SSE3-LIN-NEXT:    andnpd %xmm1, %xmm0
-; X86-SSE3-LIN-NEXT:    subsd %xmm0, %xmm2
-; X86-SSE3-LIN-NEXT:    movsd %xmm2, (%esp)
+; X86-SSE3-LIN-NEXT:    jbe .LBB2_2
+; X86-SSE3-LIN-NEXT:  # %bb.1:
+; X86-SSE3-LIN-NEXT:    xorpd %xmm1, %xmm1
+; X86-SSE3-LIN-NEXT:  .LBB2_2:
+; X86-SSE3-LIN-NEXT:    subsd %xmm1, %xmm0
+; X86-SSE3-LIN-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE3-LIN-NEXT:    fldl (%esp)
 ; X86-SSE3-LIN-NEXT:    fisttpll (%esp)
-; X86-SSE3-LIN-NEXT:    setbe %dl
+; X86-SSE3-LIN-NEXT:    setbe %al
+; X86-SSE3-LIN-NEXT:    movzbl %al, %edx
 ; X86-SSE3-LIN-NEXT:    shll $31, %edx
 ; X86-SSE3-LIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE3-LIN-NEXT:    movl (%esp), %eax
@@ -600,23 +598,23 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE2-WIN-NEXT:    subl $16, %esp
 ; X86-SSE2-WIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE2-WIN-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE2-WIN-NEXT:    movapd %xmm0, %xmm2
-; X86-SSE2-WIN-NEXT:    cmpltsd %xmm1, %xmm2
-; X86-SSE2-WIN-NEXT:    andnpd %xmm1, %xmm2
-; X86-SSE2-WIN-NEXT:    movapd %xmm0, %xmm3
-; X86-SSE2-WIN-NEXT:    subsd %xmm2, %xmm3
-; X86-SSE2-WIN-NEXT:    movsd %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT:    ucomisd %xmm0, %xmm1
+; X86-SSE2-WIN-NEXT:    jbe LBB2_2
+; X86-SSE2-WIN-NEXT:  # %bb.1:
+; X86-SSE2-WIN-NEXT:    xorpd %xmm1, %xmm1
+; X86-SSE2-WIN-NEXT:  LBB2_2:
+; X86-SSE2-WIN-NEXT:    subsd %xmm1, %xmm0
+; X86-SSE2-WIN-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT:    setbe %al
 ; X86-SSE2-WIN-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-SSE2-WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; X86-SSE2-WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-WIN-NEXT:    orl $3072, %eax # imm = 0xC00
-; X86-SSE2-WIN-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-SSE2-WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-WIN-NEXT:    orl $3072, %ecx # imm = 0xC00
+; X86-SSE2-WIN-NEXT:    movw %cx, {{[0-9]+}}(%esp)
 ; X86-SSE2-WIN-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE2-WIN-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-SSE2-WIN-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE2-WIN-NEXT:    xorl %edx, %edx
-; X86-SSE2-WIN-NEXT:    ucomisd %xmm0, %xmm1
-; X86-SSE2-WIN-NEXT:    setbe %dl
+; X86-SSE2-WIN-NEXT:    movzbl %al, %edx
 ; X86-SSE2-WIN-NEXT:    shll $31, %edx
 ; X86-SSE2-WIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -629,23 +627,23 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE2-LIN-NEXT:    subl $20, %esp
 ; X86-SSE2-LIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE2-LIN-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE2-LIN-NEXT:    movapd %xmm0, %xmm2
-; X86-SSE2-LIN-NEXT:    cmpltsd %xmm1, %xmm2
-; X86-SSE2-LIN-NEXT:    andnpd %xmm1, %xmm2
-; X86-SSE2-LIN-NEXT:    movapd %xmm0, %xmm3
-; X86-SSE2-LIN-NEXT:    subsd %xmm2, %xmm3
-; X86-SSE2-LIN-NEXT:    movsd %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT:    ucomisd %xmm0, %xmm1
+; X86-SSE2-LIN-NEXT:    jbe .LBB2_2
+; X86-SSE2-LIN-NEXT:  # %bb.1:
+; X86-SSE2-LIN-NEXT:    xorpd %xmm1, %xmm1
+; X86-SSE2-LIN-NEXT:  .LBB2_2:
+; X86-SSE2-LIN-NEXT:    subsd %xmm1, %xmm0
+; X86-SSE2-LIN-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT:    setbe %al
 ; X86-SSE2-LIN-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-SSE2-LIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; X86-SSE2-LIN-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-LIN-NEXT:    orl $3072, %eax # imm = 0xC00
-; X86-SSE2-LIN-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-SSE2-LIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-LIN-NEXT:    orl $3072, %ecx # imm = 0xC00
+; X86-SSE2-LIN-NEXT:    movw %cx, {{[0-9]+}}(%esp)
 ; X86-SSE2-LIN-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE2-LIN-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-SSE2-LIN-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE2-LIN-NEXT:    xorl %edx, %edx
-; X86-SSE2-LIN-NEXT:    ucomisd %xmm0, %xmm1
-; X86-SSE2-LIN-NEXT:    setbe %dl
+; X86-SSE2-LIN-NEXT:    movzbl %al, %edx
 ; X86-SSE2-LIN-NEXT:    shll $31, %edx
 ; X86-SSE2-LIN-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -667,13 +665,12 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X87-WIN-NEXT:    sahf
 ; X87-WIN-NEXT:    setbe %al
 ; X87-WIN-NEXT:    fldz
-; X87-WIN-NEXT:    ja LBB2_2
+; X87-WIN-NEXT:    jbe LBB2_2
 ; X87-WIN-NEXT:  # %bb.1:
-; X87-WIN-NEXT:    fstp %st(0)
+; X87-WIN-NEXT:    fstp %st(1)
 ; X87-WIN-NEXT:    fldz
-; X87-WIN-NEXT:    fxch %st(1)
 ; X87-WIN-NEXT:  LBB2_2:
-; X87-WIN-NEXT:    fstp %st(1)
+; X87-WIN-NEXT:    fstp %st(0)
 ; X87-WIN-NEXT:    fsubrp %st, %st(1)
 ; X87-WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -702,13 +699,12 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X87-LIN-NEXT:    sahf
 ; X87-LIN-NEXT:    setbe %al
 ; X87-LIN-NEXT:    fldz
-; X87-LIN-NEXT:    ja .LBB2_2
+; X87-LIN-NEXT:    jbe .LBB2_2
 ; X87-LIN-NEXT:  # %bb.1:
-; X87-LIN-NEXT:    fstp %st(0)
+; X87-LIN-NEXT:    fstp %st(1)
 ; X87-LIN-NEXT:    fldz
-; X87-LIN-NEXT:    fxch %st(1)
 ; X87-LIN-NEXT:  .LBB2_2:
-; X87-LIN-NEXT:    fstp %st(1)
+; X87-LIN-NEXT:    fstp %st(0)
 ; X87-LIN-NEXT:    fsubrp %st, %st(1)
 ; X87-LIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-LIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -902,8 +898,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-AVX512-WIN-NEXT:    xorl %edx, %edx
 ; X86-AVX512-WIN-NEXT:    fucomi %st(1), %st
 ; X86-AVX512-WIN-NEXT:    fldz
-; X86-AVX512-WIN-NEXT:    fxch %st(1)
-; X86-AVX512-WIN-NEXT:    fcmovnbe %st(1), %st
+; X86-AVX512-WIN-NEXT:    fcmovbe %st(1), %st
 ; X86-AVX512-WIN-NEXT:    fstp %st(1)
 ; X86-AVX512-WIN-NEXT:    fsubrp %st, %st(1)
 ; X86-AVX512-WIN-NEXT:    fisttpll (%esp)
@@ -923,8 +918,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-AVX512-LIN-NEXT:    xorl %edx, %edx
 ; X86-AVX512-LIN-NEXT:    fucomi %st(1), %st
 ; X86-AVX512-LIN-NEXT:    fldz
-; X86-AVX512-LIN-NEXT:    fxch %st(1)
-; X86-AVX512-LIN-NEXT:    fcmovnbe %st(1), %st
+; X86-AVX512-LIN-NEXT:    fcmovbe %st(1), %st
 ; X86-AVX512-LIN-NEXT:    fstp %st(1)
 ; X86-AVX512-LIN-NEXT:    fsubrp %st, %st(1)
 ; X86-AVX512-LIN-NEXT:    fisttpll (%esp)
@@ -943,8 +937,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X64-AVX512-WIN-NEXT:    xorl %eax, %eax
 ; X64-AVX512-WIN-NEXT:    fucomi %st(1), %st
 ; X64-AVX512-WIN-NEXT:    fldz
-; X64-AVX512-WIN-NEXT:    fxch %st(1)
-; X64-AVX512-WIN-NEXT:    fcmovnbe %st(1), %st
+; X64-AVX512-WIN-NEXT:    fcmovbe %st(1), %st
 ; X64-AVX512-WIN-NEXT:    fstp %st(1)
 ; X64-AVX512-WIN-NEXT:    fsubrp %st, %st(1)
 ; X64-AVX512-WIN-NEXT:    fisttpll (%rsp)
@@ -961,8 +954,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X64-AVX512-LIN-NEXT:    xorl %eax, %eax
 ; X64-AVX512-LIN-NEXT:    fucomi %st(1), %st
 ; X64-AVX512-LIN-NEXT:    fldz
-; X64-AVX512-LIN-NEXT:    fxch %st(1)
-; X64-AVX512-LIN-NEXT:    fcmovnbe %st(1), %st
+; X64-AVX512-LIN-NEXT:    fcmovbe %st(1), %st
 ; X64-AVX512-LIN-NEXT:    fstp %st(1)
 ; X64-AVX512-LIN-NEXT:    fsubrp %st, %st(1)
 ; X64-AVX512-LIN-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
@@ -982,8 +974,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-SSE3-WIN-NEXT:    xorl %edx, %edx
 ; X86-SSE3-WIN-NEXT:    fucomi %st(1), %st
 ; X86-SSE3-WIN-NEXT:    fldz
-; X86-SSE3-WIN-NEXT:    fxch %st(1)
-; X86-SSE3-WIN-NEXT:    fcmovnbe %st(1), %st
+; X86-SSE3-WIN-NEXT:    fcmovbe %st(1), %st
 ; X86-SSE3-WIN-NEXT:    fstp %st(1)
 ; X86-SSE3-WIN-NEXT:    fsubrp %st, %st(1)
 ; X86-SSE3-WIN-NEXT:    fisttpll (%esp)
@@ -1003,8 +994,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-SSE3-LIN-NEXT:    xorl %edx, %edx
 ; X86-SSE3-LIN-NEXT:    fucomi %st(1), %st
 ; X86-SSE3-LIN-NEXT:    fldz
-; X86-SSE3-LIN-NEXT:    fxch %st(1)
-; X86-SSE3-LIN-NEXT:    fcmovnbe %st(1), %st
+; X86-SSE3-LIN-NEXT:    fcmovbe %st(1), %st
 ; X86-SSE3-LIN-NEXT:    fstp %st(1)
 ; X86-SSE3-LIN-NEXT:    fsubrp %st, %st(1)
 ; X86-SSE3-LIN-NEXT:    fisttpll (%esp)
@@ -1023,8 +1013,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X64-SSE3-WIN-NEXT:    xorl %eax, %eax
 ; X64-SSE3-WIN-NEXT:    fucomi %st(1), %st
 ; X64-SSE3-WIN-NEXT:    fldz
-; X64-SSE3-WIN-NEXT:    fxch %st(1)
-; X64-SSE3-WIN-NEXT:    fcmovnbe %st(1), %st
+; X64-SSE3-WIN-NEXT:    fcmovbe %st(1), %st
 ; X64-SSE3-WIN-NEXT:    fstp %st(1)
 ; X64-SSE3-WIN-NEXT:    fsubrp %st, %st(1)
 ; X64-SSE3-WIN-NEXT:    fisttpll (%rsp)
@@ -1041,8 +1030,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X64-SSE3-LIN-NEXT:    xorl %eax, %eax
 ; X64-SSE3-LIN-NEXT:    fucomi %st(1), %st
 ; X64-SSE3-LIN-NEXT:    fldz
-; X64-SSE3-LIN-NEXT:    fxch %st(1)
-; X64-SSE3-LIN-NEXT:    fcmovnbe %st(1), %st
+; X64-SSE3-LIN-NEXT:    fcmovbe %st(1), %st
 ; X64-SSE3-LIN-NEXT:    fstp %st(1)
 ; X64-SSE3-LIN-NEXT:    fsubrp %st, %st(1)
 ; X64-SSE3-LIN-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
@@ -1063,8 +1051,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-SSE2-WIN-NEXT:    fucomi %st(1), %st
 ; X86-SSE2-WIN-NEXT:    setbe %dl
 ; X86-SSE2-WIN-NEXT:    fldz
-; X86-SSE2-WIN-NEXT:    fxch %st(1)
-; X86-SSE2-WIN-NEXT:    fcmovnbe %st(1), %st
+; X86-SSE2-WIN-NEXT:    fcmovbe %st(1), %st
 ; X86-SSE2-WIN-NEXT:    fstp %st(1)
 ; X86-SSE2-WIN-NEXT:    fsubrp %st, %st(1)
 ; X86-SSE2-WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
@@ -1090,8 +1077,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-SSE2-LIN-NEXT:    fucomi %st(1), %st
 ; X86-SSE2-LIN-NEXT:    setbe %dl
 ; X86-SSE2-LIN-NEXT:    fldz
-; X86-SSE2-LIN-NEXT:    fxch %st(1)
-; X86-SSE2-LIN-NEXT:    fcmovnbe %st(1), %st
+; X86-SSE2-LIN-NEXT:    fcmovbe %st(1), %st
 ; X86-SSE2-LIN-NEXT:    fstp %st(1)
 ; X86-SSE2-LIN-NEXT:    fsubrp %st, %st(1)
 ; X86-SSE2-LIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
@@ -1116,8 +1102,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X64-SSE2-WIN-NEXT:    fucomi %st(1), %st
 ; X64-SSE2-WIN-NEXT:    setbe %al
 ; X64-SSE2-WIN-NEXT:    fldz
-; X64-SSE2-WIN-NEXT:    fxch %st(1)
-; X64-SSE2-WIN-NEXT:    fcmovnbe %st(1), %st
+; X64-SSE2-WIN-NEXT:    fcmovbe %st(1), %st
 ; X64-SSE2-WIN-NEXT:    fstp %st(1)
 ; X64-SSE2-WIN-NEXT:    fsubrp %st, %st(1)
 ; X64-SSE2-WIN-NEXT:    fnstcw {{[0-9]+}}(%rsp)
@@ -1140,8 +1125,7 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X64-SSE2-LIN-NEXT:    fucomi %st(1), %st
 ; X64-SSE2-LIN-NEXT:    setbe %al
 ; X64-SSE2-LIN-NEXT:    fldz
-; X64-SSE2-LIN-NEXT:    fxch %st(1)
-; X64-SSE2-LIN-NEXT:    fcmovnbe %st(1), %st
+; X64-SSE2-LIN-NEXT:    fcmovbe %st(1), %st
 ; X64-SSE2-LIN-NEXT:    fstp %st(1)
 ; X64-SSE2-LIN-NEXT:    fsubrp %st, %st(1)
 ; X64-SSE2-LIN-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
@@ -1170,13 +1154,12 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X87-WIN-NEXT:    sahf
 ; X87-WIN-NEXT:    setbe %al
 ; X87-WIN-NEXT:    fldz
-; X87-WIN-NEXT:    ja LBB4_2
+; X87-WIN-NEXT:    jbe LBB4_2
 ; X87-WIN-NEXT:  # %bb.1:
-; X87-WIN-NEXT:    fstp %st(0)
+; X87-WIN-NEXT:    fstp %st(1)
 ; X87-WIN-NEXT:    fldz
-; X87-WIN-NEXT:    fxch %st(1)
 ; X87-WIN-NEXT:  LBB4_2:
-; X87-WIN-NEXT:    fstp %st(1)
+; X87-WIN-NEXT:    fstp %st(0)
 ; X87-WIN-NEXT:    fsubrp %st, %st(1)
 ; X87-WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -1205,13 +1188,12 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X87-LIN-NEXT:    sahf
 ; X87-LIN-NEXT:    setbe %al
 ; X87-LIN-NEXT:    fldz
-; X87-LIN-NEXT:    ja .LBB4_2
+; X87-LIN-NEXT:    jbe .LBB4_2
 ; X87-LIN-NEXT:  # %bb.1:
-; X87-LIN-NEXT:    fstp %st(0)
+; X87-LIN-NEXT:    fstp %st(1)
 ; X87-LIN-NEXT:    fldz
-; X87-LIN-NEXT:    fxch %st(1)
 ; X87-LIN-NEXT:  .LBB4_2:
-; X87-LIN-NEXT:    fstp %st(1)
+; X87-LIN-NEXT:    fstp %st(0)
 ; X87-LIN-NEXT:    fsubrp %st, %st(1)
 ; X87-LIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-LIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx

diff  --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index 93695371091b..4569e69b7e50 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -222,17 +222,16 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT:    andl $-8, %esp
 ; SSE-32-NEXT:    subl $24, %esp
-; SSE-32-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-32-NEXT:    comisd %xmm2, %xmm0
-; SSE-32-NEXT:    xorpd %xmm1, %xmm1
-; SSE-32-NEXT:    xorpd %xmm3, %xmm3
-; SSE-32-NEXT:    jb .LBB1_2
+; SSE-32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-32-NEXT:    comisd %xmm1, %xmm0
+; SSE-32-NEXT:    movapd %xmm1, %xmm2
+; SSE-32-NEXT:    jae .LBB1_2
 ; SSE-32-NEXT:  # %bb.1:
-; SSE-32-NEXT:    movapd %xmm2, %xmm3
+; SSE-32-NEXT:    xorpd %xmm2, %xmm2
 ; SSE-32-NEXT:  .LBB1_2:
-; SSE-32-NEXT:    movapd %xmm0, %xmm4
-; SSE-32-NEXT:    subsd %xmm3, %xmm4
-; SSE-32-NEXT:    movsd %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    movapd %xmm0, %xmm3
+; SSE-32-NEXT:    subsd %xmm2, %xmm3
+; SSE-32-NEXT:    movsd %xmm3, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    wait
@@ -244,10 +243,10 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-32-NEXT:    comisd %xmm2, %xmm0
-; SSE-32-NEXT:    jb .LBB1_4
+; SSE-32-NEXT:    comisd %xmm1, %xmm0
+; SSE-32-NEXT:    jae .LBB1_4
 ; SSE-32-NEXT:  # %bb.3:
-; SSE-32-NEXT:    movapd %xmm2, %xmm1
+; SSE-32-NEXT:    xorpd %xmm1, %xmm1
 ; SSE-32-NEXT:  .LBB1_4:
 ; SSE-32-NEXT:    subsd %xmm1, %xmm0
 ; SSE-32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
@@ -323,17 +322,16 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $16, %esp
-; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT:    vcomisd %xmm1, %xmm3
-; AVX-32-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB1_2
+; AVX-32-NEXT:    vcomisd %xmm1, %xmm2
+; AVX-32-NEXT:    vmovapd %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB1_2
 ; AVX-32-NEXT:  # %bb.1:
-; AVX-32-NEXT:    vmovapd %xmm1, %xmm4
+; AVX-32-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB1_2:
-; AVX-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovsd %xmm3, (%esp)
+; AVX-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovsd %xmm2, (%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    wait
@@ -342,11 +340,11 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    shll $31, %eax
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vcomisd %xmm1, %xmm0
-; AVX-32-NEXT:    jb .LBB1_4
+; AVX-32-NEXT:    jae .LBB1_4
 ; AVX-32-NEXT:  # %bb.3:
-; AVX-32-NEXT:    vmovapd %xmm1, %xmm2
+; AVX-32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:  .LBB1_4:
-; AVX-32-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
@@ -410,30 +408,25 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
 ; AVX512F-32-NEXT:    xorl %eax, %eax
 ; AVX512F-32-NEXT:    vcomisd %xmm2, %xmm1
-; AVX512F-32-NEXT:    setb %cl
-; AVX512F-32-NEXT:    kmovw %ecx, %k1
-; AVX512F-32-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovapd %xmm2, %xmm4
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT:    vsubsd %xmm4, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovsd %xmm1, (%esp)
-; AVX512F-32-NEXT:    fldl (%esp)
-; AVX512F-32-NEXT:    fisttpll (%esp)
-; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %al
-; AVX512F-32-NEXT:    shll $31, %eax
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    vmovsd %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubsd %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vmovsd %xmm1, (%esp)
 ; AVX512F-32-NEXT:    xorl %ecx, %ecx
 ; AVX512F-32-NEXT:    vcomisd %xmm2, %xmm0
-; AVX512F-32-NEXT:    setb %dl
-; AVX512F-32-NEXT:    kmovw %edx, %k1
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512F-32-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
+; AVX512F-32-NEXT:    setae %cl
+; AVX512F-32-NEXT:    kmovw %ecx, %k1
+; AVX512F-32-NEXT:    vmovsd %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fldl (%esp)
+; AVX512F-32-NEXT:    fisttpll (%esp)
 ; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    wait
-; AVX512F-32-NEXT:    setae %cl
+; AVX512F-32-NEXT:    shll $31, %eax
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    shll $31, %ecx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -468,30 +461,25 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
 ; AVX512VL-32-NEXT:    xorl %eax, %eax
 ; AVX512VL-32-NEXT:    vcomisd %xmm2, %xmm1
-; AVX512VL-32-NEXT:    setb %cl
-; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovapd %xmm2, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vmovsd %xmm1, (%esp)
-; AVX512VL-32-NEXT:    fldl (%esp)
-; AVX512VL-32-NEXT:    fisttpll (%esp)
-; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT:    kmovw %eax, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vmovsd %xmm1, (%esp)
 ; AVX512VL-32-NEXT:    xorl %ecx, %ecx
 ; AVX512VL-32-NEXT:    vcomisd %xmm2, %xmm0
-; AVX512VL-32-NEXT:    setb %dl
-; AVX512VL-32-NEXT:    kmovw %edx, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512VL-32-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    kmovw %ecx, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl (%esp)
+; AVX512VL-32-NEXT:    fisttpll (%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    shll $31, %eax
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -905,17 +893,16 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT:    andl $-8, %esp
 ; SSE-32-NEXT:    subl $24, %esp
-; SSE-32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss %xmm2, %xmm0
-; SSE-32-NEXT:    xorps %xmm1, %xmm1
-; SSE-32-NEXT:    xorps %xmm3, %xmm3
-; SSE-32-NEXT:    jb .LBB4_2
+; SSE-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss %xmm1, %xmm0
+; SSE-32-NEXT:    movaps %xmm1, %xmm2
+; SSE-32-NEXT:    jae .LBB4_2
 ; SSE-32-NEXT:  # %bb.1:
-; SSE-32-NEXT:    movaps %xmm2, %xmm3
+; SSE-32-NEXT:    xorps %xmm2, %xmm2
 ; SSE-32-NEXT:  .LBB4_2:
-; SSE-32-NEXT:    movaps %xmm0, %xmm4
-; SSE-32-NEXT:    subss %xmm3, %xmm4
-; SSE-32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    movaps %xmm0, %xmm3
+; SSE-32-NEXT:    subss %xmm2, %xmm3
+; SSE-32-NEXT:    movss %xmm3, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    wait
@@ -927,10 +914,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-32-NEXT:    comiss %xmm2, %xmm0
-; SSE-32-NEXT:    jb .LBB4_4
+; SSE-32-NEXT:    comiss %xmm1, %xmm0
+; SSE-32-NEXT:    jae .LBB4_4
 ; SSE-32-NEXT:  # %bb.3:
-; SSE-32-NEXT:    movaps %xmm2, %xmm1
+; SSE-32-NEXT:    xorps %xmm1, %xmm1
 ; SSE-32-NEXT:  .LBB4_4:
 ; SSE-32-NEXT:    subss %xmm1, %xmm0
 ; SSE-32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
@@ -1006,17 +993,16 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $16, %esp
-; AVX-32-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX-32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB4_2
+; AVX-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX-32-NEXT:    vmovaps %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB4_2
 ; AVX-32-NEXT:  # %bb.1:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm4
+; AVX-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB4_2:
-; AVX-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    wait
@@ -1025,11 +1011,11 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    shll $31, %eax
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vcomiss %xmm1, %xmm0
-; AVX-32-NEXT:    jb .LBB4_4
+; AVX-32-NEXT:    jae .LBB4_4
 ; AVX-32-NEXT:  # %bb.3:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm2
+; AVX-32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:  .LBB4_4:
-; AVX-32-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
@@ -1093,30 +1079,25 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512F-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    xorl %eax, %eax
 ; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm1
-; AVX512F-32-NEXT:    setb %cl
-; AVX512F-32-NEXT:    kmovw %ecx, %k1
-; AVX512F-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovaps %xmm2, %xmm4
-; AVX512F-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT:    vsubss %xmm4, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %al
-; AVX512F-32-NEXT:    shll $31, %eax
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    xorl %ecx, %ecx
 ; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm0
-; AVX512F-32-NEXT:    setb %dl
-; AVX512F-32-NEXT:    kmovw %edx, %k1
-; AVX512F-32-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512F-32-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX512F-32-NEXT:    setae %cl
+; AVX512F-32-NEXT:    kmovw %ecx, %k1
+; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vmovss %xmm0, (%esp)
+; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
 ; AVX512F-32-NEXT:    wait
-; AVX512F-32-NEXT:    setae %cl
+; AVX512F-32-NEXT:    shll $31, %eax
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    shll $31, %ecx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1151,30 +1132,25 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    xorl %eax, %eax
 ; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm1
-; AVX512VL-32-NEXT:    setb %cl
-; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovaps %xmm2, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT:    kmovw %eax, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    xorl %ecx, %ecx
 ; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm0
-; AVX512VL-32-NEXT:    setb %dl
-; AVX512VL-32-NEXT:    kmovw %edx, %k1
-; AVX512VL-32-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    kmovw %ecx, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vmovss %xmm0, (%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
 ; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    shll $31, %eax
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1225,17 +1201,16 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
 ; SSE-32-NEXT:    subl $24, %esp
 ; SSE-32-NEXT:    movl 8(%ebp), %eax
 ; SSE-32-NEXT:    movaps (%eax), %xmm0
-; SSE-32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss %xmm2, %xmm0
-; SSE-32-NEXT:    xorps %xmm1, %xmm1
-; SSE-32-NEXT:    xorps %xmm3, %xmm3
-; SSE-32-NEXT:    jb .LBB5_2
+; SSE-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss %xmm1, %xmm0
+; SSE-32-NEXT:    movaps %xmm1, %xmm2
+; SSE-32-NEXT:    jae .LBB5_2
 ; SSE-32-NEXT:  # %bb.1:
-; SSE-32-NEXT:    movaps %xmm2, %xmm3
+; SSE-32-NEXT:    xorps %xmm2, %xmm2
 ; SSE-32-NEXT:  .LBB5_2:
-; SSE-32-NEXT:    movaps %xmm0, %xmm4
-; SSE-32-NEXT:    subss %xmm3, %xmm4
-; SSE-32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    movaps %xmm0, %xmm3
+; SSE-32-NEXT:    subss %xmm2, %xmm3
+; SSE-32-NEXT:    movss %xmm3, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    wait
@@ -1247,10 +1222,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-32-NEXT:    comiss %xmm2, %xmm0
-; SSE-32-NEXT:    jb .LBB5_4
+; SSE-32-NEXT:    comiss %xmm1, %xmm0
+; SSE-32-NEXT:    jae .LBB5_4
 ; SSE-32-NEXT:  # %bb.3:
-; SSE-32-NEXT:    movaps %xmm2, %xmm1
+; SSE-32-NEXT:    xorps %xmm1, %xmm1
 ; SSE-32-NEXT:  .LBB5_4:
 ; SSE-32-NEXT:    subss %xmm1, %xmm0
 ; SSE-32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
@@ -1328,17 +1303,16 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
 ; AVX-32-NEXT:    subl $16, %esp
 ; AVX-32-NEXT:    movl 8(%ebp), %eax
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX-32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB5_2
+; AVX-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX-32-NEXT:    vmovaps %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB5_2
 ; AVX-32-NEXT:  # %bb.1:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm4
+; AVX-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB5_2:
-; AVX-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    wait
@@ -1347,11 +1321,11 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
 ; AVX-32-NEXT:    shll $31, %eax
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vcomiss %xmm1, %xmm0
-; AVX-32-NEXT:    jb .LBB5_4
+; AVX-32-NEXT:    jae .LBB5_4
 ; AVX-32-NEXT:  # %bb.3:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm2
+; AVX-32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:  .LBB5_4:
-; AVX-32-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
@@ -1418,30 +1392,25 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
 ; AVX512F-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    xorl %eax, %eax
 ; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm1
-; AVX512F-32-NEXT:    setb %cl
-; AVX512F-32-NEXT:    kmovw %ecx, %k1
-; AVX512F-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovaps %xmm2, %xmm4
-; AVX512F-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT:    vsubss %xmm4, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %al
-; AVX512F-32-NEXT:    shll $31, %eax
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    xorl %ecx, %ecx
 ; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm0
-; AVX512F-32-NEXT:    setb %dl
-; AVX512F-32-NEXT:    kmovw %edx, %k1
-; AVX512F-32-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512F-32-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX512F-32-NEXT:    setae %cl
+; AVX512F-32-NEXT:    kmovw %ecx, %k1
+; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vmovss %xmm0, (%esp)
+; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
 ; AVX512F-32-NEXT:    wait
-; AVX512F-32-NEXT:    setae %cl
+; AVX512F-32-NEXT:    shll $31, %eax
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    shll $31, %ecx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1477,30 +1446,25 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) s
 ; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    xorl %eax, %eax
 ; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm1
-; AVX512VL-32-NEXT:    setb %cl
-; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovaps %xmm2, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT:    kmovw %eax, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    xorl %ecx, %ecx
 ; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm0
-; AVX512VL-32-NEXT:    setb %dl
-; AVX512VL-32-NEXT:    kmovw %edx, %k1
-; AVX512VL-32-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    kmovw %ecx, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vmovss %xmm0, (%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
 ; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    shll $31, %eax
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -2416,17 +2380,16 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT:    andl $-8, %esp
 ; SSE-32-NEXT:    subl $24, %esp
-; SSE-32-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-32-NEXT:    comisd %xmm2, %xmm0
-; SSE-32-NEXT:    xorpd %xmm1, %xmm1
-; SSE-32-NEXT:    xorpd %xmm3, %xmm3
-; SSE-32-NEXT:    jb .LBB19_2
+; SSE-32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-32-NEXT:    comisd %xmm1, %xmm0
+; SSE-32-NEXT:    movapd %xmm1, %xmm2
+; SSE-32-NEXT:    jae .LBB19_2
 ; SSE-32-NEXT:  # %bb.1:
-; SSE-32-NEXT:    movapd %xmm2, %xmm3
+; SSE-32-NEXT:    xorpd %xmm2, %xmm2
 ; SSE-32-NEXT:  .LBB19_2:
-; SSE-32-NEXT:    movapd %xmm0, %xmm4
-; SSE-32-NEXT:    subsd %xmm3, %xmm4
-; SSE-32-NEXT:    movsd %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    movapd %xmm0, %xmm3
+; SSE-32-NEXT:    subsd %xmm2, %xmm3
+; SSE-32-NEXT:    movsd %xmm3, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    wait
@@ -2438,10 +2401,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-32-NEXT:    comisd %xmm2, %xmm0
-; SSE-32-NEXT:    jb .LBB19_4
+; SSE-32-NEXT:    comisd %xmm1, %xmm0
+; SSE-32-NEXT:    jae .LBB19_4
 ; SSE-32-NEXT:  # %bb.3:
-; SSE-32-NEXT:    movapd %xmm2, %xmm1
+; SSE-32-NEXT:    xorpd %xmm1, %xmm1
 ; SSE-32-NEXT:  .LBB19_4:
 ; SSE-32-NEXT:    subsd %xmm1, %xmm0
 ; SSE-32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
@@ -2517,17 +2480,16 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $16, %esp
-; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT:    vcomisd %xmm1, %xmm3
-; AVX-32-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB19_2
+; AVX-32-NEXT:    vcomisd %xmm1, %xmm2
+; AVX-32-NEXT:    vmovapd %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB19_2
 ; AVX-32-NEXT:  # %bb.1:
-; AVX-32-NEXT:    vmovapd %xmm1, %xmm4
+; AVX-32-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB19_2:
-; AVX-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovsd %xmm3, (%esp)
+; AVX-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovsd %xmm2, (%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    wait
@@ -2536,11 +2498,11 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    shll $31, %eax
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vcomisd %xmm1, %xmm0
-; AVX-32-NEXT:    jb .LBB19_4
+; AVX-32-NEXT:    jae .LBB19_4
 ; AVX-32-NEXT:  # %bb.3:
-; AVX-32-NEXT:    vmovapd %xmm1, %xmm2
+; AVX-32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:  .LBB19_4:
-; AVX-32-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
@@ -2792,17 +2754,16 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT:    andl $-8, %esp
 ; SSE-32-NEXT:    subl $24, %esp
-; SSE-32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss %xmm2, %xmm0
-; SSE-32-NEXT:    xorps %xmm1, %xmm1
-; SSE-32-NEXT:    xorps %xmm3, %xmm3
-; SSE-32-NEXT:    jb .LBB21_2
+; SSE-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss %xmm1, %xmm0
+; SSE-32-NEXT:    movaps %xmm1, %xmm2
+; SSE-32-NEXT:    jae .LBB21_2
 ; SSE-32-NEXT:  # %bb.1:
-; SSE-32-NEXT:    movaps %xmm2, %xmm3
+; SSE-32-NEXT:    xorps %xmm2, %xmm2
 ; SSE-32-NEXT:  .LBB21_2:
-; SSE-32-NEXT:    movaps %xmm0, %xmm4
-; SSE-32-NEXT:    subss %xmm3, %xmm4
-; SSE-32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    movaps %xmm0, %xmm3
+; SSE-32-NEXT:    subss %xmm2, %xmm3
+; SSE-32-NEXT:    movss %xmm3, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    wait
@@ -2814,10 +2775,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-32-NEXT:    comiss %xmm2, %xmm0
-; SSE-32-NEXT:    jb .LBB21_4
+; SSE-32-NEXT:    comiss %xmm1, %xmm0
+; SSE-32-NEXT:    jae .LBB21_4
 ; SSE-32-NEXT:  # %bb.3:
-; SSE-32-NEXT:    movaps %xmm2, %xmm1
+; SSE-32-NEXT:    xorps %xmm1, %xmm1
 ; SSE-32-NEXT:  .LBB21_4:
 ; SSE-32-NEXT:    subss %xmm1, %xmm0
 ; SSE-32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
@@ -2893,17 +2854,16 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $16, %esp
-; AVX-32-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX-32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB21_2
+; AVX-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX-32-NEXT:    vmovaps %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB21_2
 ; AVX-32-NEXT:  # %bb.1:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm4
+; AVX-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB21_2:
-; AVX-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    wait
@@ -2912,11 +2872,11 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    shll $31, %eax
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vcomiss %xmm1, %xmm0
-; AVX-32-NEXT:    jb .LBB21_4
+; AVX-32-NEXT:    jae .LBB21_4
 ; AVX-32-NEXT:  # %bb.3:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm2
+; AVX-32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:  .LBB21_4:
-; AVX-32-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)

diff  --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index d4d285c36485..bcc14ec38e27 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -226,17 +226,16 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $32, %esp
-; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT:    vcomisd %xmm1, %xmm3
-; AVX-32-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB1_2
+; AVX-32-NEXT:    vcomisd %xmm1, %xmm2
+; AVX-32-NEXT:    vmovapd %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB1_2
 ; AVX-32-NEXT:  # %bb.1:
-; AVX-32-NEXT:    vmovapd %xmm1, %xmm4
+; AVX-32-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB1_2:
-; AVX-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    wait
@@ -244,16 +243,16 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    movzbl %al, %eax
 ; AVX-32-NEXT:    shll $31, %eax
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX-32-NEXT:    vcomisd %xmm1, %xmm4
-; AVX-32-NEXT:    vxorpd %xmm5, %xmm5, %xmm5
-; AVX-32-NEXT:    jb .LBB1_4
+; AVX-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX-32-NEXT:    vcomisd %xmm1, %xmm3
+; AVX-32-NEXT:    vmovapd %xmm1, %xmm4
+; AVX-32-NEXT:    jae .LBB1_4
 ; AVX-32-NEXT:  # %bb.3:
-; AVX-32-NEXT:    vmovapd %xmm1, %xmm5
+; AVX-32-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; AVX-32-NEXT:  .LBB1_4:
-; AVX-32-NEXT:    vsubsd %xmm5, %xmm4, %xmm4
-; AVX-32-NEXT:    vmovsd %xmm4, (%esp)
+; AVX-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
+; AVX-32-NEXT:    vmovsd %xmm3, (%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    wait
@@ -261,14 +260,14 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    movzbl %cl, %ecx
 ; AVX-32-NEXT:    shll $31, %ecx
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT:    vcomisd %xmm1, %xmm3
-; AVX-32-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB1_6
+; AVX-32-NEXT:    vcomisd %xmm1, %xmm2
+; AVX-32-NEXT:    vmovapd %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB1_6
 ; AVX-32-NEXT:  # %bb.5:
-; AVX-32-NEXT:    vmovapd %xmm1, %xmm4
+; AVX-32-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB1_6:
-; AVX-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    wait
@@ -277,11 +276,11 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    shll $31, %edx
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX-32-NEXT:    vcomisd %xmm1, %xmm0
-; AVX-32-NEXT:    jb .LBB1_8
+; AVX-32-NEXT:    jae .LBB1_8
 ; AVX-32-NEXT:  # %bb.7:
-; AVX-32-NEXT:    vmovapd %xmm1, %xmm2
+; AVX-32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:  .LBB1_8:
-; AVX-32-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
@@ -375,83 +374,68 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    movl %esp, %ebp
 ; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX512F-32-NEXT:    pushl %ebx
-; AVX512F-32-NEXT:    pushl %esi
 ; AVX512F-32-NEXT:    andl $-8, %esp
-; AVX512F-32-NEXT:    subl $32, %esp
-; AVX512F-32-NEXT:    .cfi_offset %esi, -16
+; AVX512F-32-NEXT:    subl $40, %esp
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm2
-; AVX512F-32-NEXT:    setb %cl
-; AVX512F-32-NEXT:    kmovw %ecx, %k1
-; AVX512F-32-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovapd %xmm1, %xmm4
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT:    vsubsd %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    wait
-; AVX512F-32-NEXT:    movl $0, %eax
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512F-32-NEXT:    xorl %eax, %eax
+; AVX512F-32-NEXT:    vcomisd %xmm3, %xmm2
 ; AVX512F-32-NEXT:    setae %al
-; AVX512F-32-NEXT:    shll $31, %eax
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    movl %eax, %esi
-; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX512F-32-NEXT:    xorl %ecx, %ecx
-; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm4
-; AVX512F-32-NEXT:    setb %dl
-; AVX512F-32-NEXT:    kmovw %edx, %k1
-; AVX512F-32-NEXT:    vmovapd %xmm1, %xmm5
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm5, %xmm5 {%k1}
-; AVX512F-32-NEXT:    vsubsd %xmm5, %xmm4, %xmm4
-; AVX512F-32-NEXT:    vmovsd %xmm4, (%esp)
-; AVX512F-32-NEXT:    fldl (%esp)
-; AVX512F-32-NEXT:    fisttpll (%esp)
-; AVX512F-32-NEXT:    wait
-; AVX512F-32-NEXT:    setae %cl
-; AVX512F-32-NEXT:    shll $31, %ecx
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    xorl %edx, %edx
-; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm2
-; AVX512F-32-NEXT:    setb %bl
-; AVX512F-32-NEXT:    kmovw %ebx, %k1
-; AVX512F-32-NEXT:    vmovapd %xmm1, %xmm4
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm4, %xmm4 {%k1}
+; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z}
 ; AVX512F-32-NEXT:    vsubsd %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vmovsd %xmm2, (%esp)
+; AVX512F-32-NEXT:    xorl %edx, %edx
+; AVX512F-32-NEXT:    vcomisd %xmm3, %xmm1
 ; AVX512F-32-NEXT:    setae %dl
-; AVX512F-32-NEXT:    shll $31, %edx
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    kmovw %edx, %k1
+; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
+; AVX512F-32-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F-32-NEXT:    xorl %ecx, %ecx
+; AVX512F-32-NEXT:    vcomisd %xmm3, %xmm1
+; AVX512F-32-NEXT:    setae %cl
+; AVX512F-32-NEXT:    kmovw %ecx, %k1
+; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
+; AVX512F-32-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    xorl %ebx, %ebx
-; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm0
-; AVX512F-32-NEXT:    setb %al
-; AVX512F-32-NEXT:    kmovw %eax, %k1
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm1, %xmm1 {%k1}
+; AVX512F-32-NEXT:    vcomisd %xmm3, %xmm0
+; AVX512F-32-NEXT:    setae %bl
+; AVX512F-32-NEXT:    kmovw %ebx, %k1
+; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z}
 ; AVX512F-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fldl (%esp)
+; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    shll $31, %eax
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    shll $31, %edx
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX512F-32-NEXT:    setae %bl
+; AVX512F-32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX512F-32-NEXT:    shll $31, %ecx
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    shll $31, %ebx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, %ebx, %xmm1, %xmm1
 ; AVX512F-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-32-NEXT:    leal -8(%ebp), %esp
-; AVX512F-32-NEXT:    popl %esi
+; AVX512F-32-NEXT:    leal -4(%ebp), %esp
 ; AVX512F-32-NEXT:    popl %ebx
 ; AVX512F-32-NEXT:    popl %ebp
 ; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
@@ -483,83 +467,68 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    movl %esp, %ebp
 ; AVX512VL-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX512VL-32-NEXT:    pushl %ebx
-; AVX512VL-32-NEXT:    pushl %esi
 ; AVX512VL-32-NEXT:    andl $-8, %esp
-; AVX512VL-32-NEXT:    subl $32, %esp
-; AVX512VL-32-NEXT:    .cfi_offset %esi, -16
+; AVX512VL-32-NEXT:    subl $40, %esp
 ; AVX512VL-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
-; AVX512VL-32-NEXT:    setb %cl
-; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512VL-32-NEXT:    xorl %eax, %eax
+; AVX512VL-32-NEXT:    vcomisd %xmm3, %xmm2
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, %esi
-; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX512VL-32-NEXT:    xorl %ecx, %ecx
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    setb %dl
-; AVX512VL-32-NEXT:    kmovw %edx, %k1
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm5
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT:    vsubsd %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm4, (%esp)
-; AVX512VL-32-NEXT:    fldl (%esp)
-; AVX512VL-32-NEXT:    fisttpll (%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    setae %cl
-; AVX512VL-32-NEXT:    shll $31, %ecx
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; AVX512VL-32-NEXT:    xorl %edx, %edx
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
-; AVX512VL-32-NEXT:    setb %bl
-; AVX512VL-32-NEXT:    kmovw %ebx, %k1
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT:    kmovw %eax, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vmovsd %xmm2, (%esp)
+; AVX512VL-32-NEXT:    xorl %edx, %edx
+; AVX512VL-32-NEXT:    vcomisd %xmm3, %xmm1
 ; AVX512VL-32-NEXT:    setae %dl
-; AVX512VL-32-NEXT:    shll $31, %edx
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; AVX512VL-32-NEXT:    kmovw %edx, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-32-NEXT:    xorl %ecx, %ecx
+; AVX512VL-32-NEXT:    vcomisd %xmm3, %xmm1
+; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    kmovw %ecx, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    xorl %ebx, %ebx
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm0
-; AVX512VL-32-NEXT:    setb %al
-; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm1, %xmm1 {%k1}
+; AVX512VL-32-NEXT:    vcomisd %xmm3, %xmm0
+; AVX512VL-32-NEXT:    setae %bl
+; AVX512VL-32-NEXT:    kmovw %ebx, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl (%esp)
+; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    shll $31, %eax
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT:    shll $31, %edx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512VL-32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX512VL-32-NEXT:    setae %bl
+; AVX512VL-32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    shll $31, %ecx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512VL-32-NEXT:    shll $31, %ebx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, %ebx, %xmm1, %xmm1
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
 ; AVX512VL-32-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-32-NEXT:    leal -8(%ebp), %esp
-; AVX512VL-32-NEXT:    popl %esi
+; AVX512VL-32-NEXT:    leal -4(%ebp), %esp
 ; AVX512VL-32-NEXT:    popl %ebx
 ; AVX512VL-32-NEXT:    popl %ebp
 ; AVX512VL-32-NEXT:    .cfi_def_cfa %esp, 4
@@ -788,17 +757,16 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $32, %esp
-; AVX-32-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX-32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-32-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB3_2
+; AVX-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX-32-NEXT:    vmovaps %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB3_2
 ; AVX-32-NEXT:  # %bb.1:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm4
+; AVX-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB3_2:
-; AVX-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    wait
@@ -806,15 +774,15 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    movzbl %al, %eax
 ; AVX-32-NEXT:    shll $31, %eax
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX-32-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB3_4
+; AVX-32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX-32-NEXT:    vmovaps %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB3_4
 ; AVX-32-NEXT:  # %bb.3:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm4
+; AVX-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB3_4:
-; AVX-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovss %xmm3, (%esp)
+; AVX-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovss %xmm2, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    wait
@@ -822,15 +790,15 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    movzbl %cl, %ecx
 ; AVX-32-NEXT:    shll $31, %ecx
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX-32-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; AVX-32-NEXT:    jb .LBB3_6
+; AVX-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX-32-NEXT:    vmovaps %xmm1, %xmm3
+; AVX-32-NEXT:    jae .LBB3_6
 ; AVX-32-NEXT:  # %bb.5:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm4
+; AVX-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; AVX-32-NEXT:  .LBB3_6:
-; AVX-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    wait
@@ -839,11 +807,11 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    shll $31, %edx
 ; AVX-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX-32-NEXT:    vcomiss %xmm1, %xmm0
-; AVX-32-NEXT:    jb .LBB3_8
+; AVX-32-NEXT:    jae .LBB3_8
 ; AVX-32-NEXT:  # %bb.7:
-; AVX-32-NEXT:    vmovaps %xmm1, %xmm2
+; AVX-32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:  .LBB3_8:
-; AVX-32-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; AVX-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
@@ -937,83 +905,68 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512F-32-NEXT:    movl %esp, %ebp
 ; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX512F-32-NEXT:    pushl %ebx
-; AVX512F-32-NEXT:    pushl %esi
 ; AVX512F-32-NEXT:    andl $-8, %esp
-; AVX512F-32-NEXT:    subl $32, %esp
-; AVX512F-32-NEXT:    .cfi_offset %esi, -16
+; AVX512F-32-NEXT:    subl $40, %esp
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512F-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT:    vcomiss %xmm1, %xmm2
-; AVX512F-32-NEXT:    setb %cl
-; AVX512F-32-NEXT:    kmovw %ecx, %k1
-; AVX512F-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512F-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT:    vsubss %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    wait
-; AVX512F-32-NEXT:    movl $0, %eax
+; AVX512F-32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512F-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT:    xorl %eax, %eax
+; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm1
 ; AVX512F-32-NEXT:    setae %al
-; AVX512F-32-NEXT:    shll $31, %eax
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    movl %eax, %esi
-; AVX512F-32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX512F-32-NEXT:    xorl %ecx, %ecx
-; AVX512F-32-NEXT:    vcomiss %xmm1, %xmm2
-; AVX512F-32-NEXT:    setb %dl
-; AVX512F-32-NEXT:    kmovw %edx, %k1
-; AVX512F-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512F-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT:    vsubss %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vmovss %xmm2, (%esp)
-; AVX512F-32-NEXT:    flds (%esp)
-; AVX512F-32-NEXT:    fisttpll (%esp)
-; AVX512F-32-NEXT:    wait
-; AVX512F-32-NEXT:    setae %cl
-; AVX512F-32-NEXT:    shll $31, %ecx
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vmovss %xmm1, (%esp)
+; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512F-32-NEXT:    xorl %edx, %edx
-; AVX512F-32-NEXT:    vcomiss %xmm1, %xmm2
-; AVX512F-32-NEXT:    setb %bl
-; AVX512F-32-NEXT:    kmovw %ebx, %k1
-; AVX512F-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512F-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512F-32-NEXT:    vsubss %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm1
 ; AVX512F-32-NEXT:    setae %dl
-; AVX512F-32-NEXT:    shll $31, %edx
-; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    kmovw %edx, %k1
+; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-32-NEXT:    xorl %ecx, %ecx
+; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm1
+; AVX512F-32-NEXT:    setae %cl
+; AVX512F-32-NEXT:    kmovw %ecx, %k1
+; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    xorl %ebx, %ebx
-; AVX512F-32-NEXT:    vcomiss %xmm1, %xmm0
-; AVX512F-32-NEXT:    setb %al
-; AVX512F-32-NEXT:    kmovw %eax, %k1
-; AVX512F-32-NEXT:    vmovss %xmm3, %xmm1, %xmm1 {%k1}
+; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm0
+; AVX512F-32-NEXT:    setae %bl
+; AVX512F-32-NEXT:    kmovw %ebx, %k1
+; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
 ; AVX512F-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    flds (%esp)
+; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    shll $31, %eax
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    shll $31, %edx
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX512F-32-NEXT:    setae %bl
+; AVX512F-32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX512F-32-NEXT:    shll $31, %ecx
+; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    shll $31, %ebx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, %ebx, %xmm1, %xmm1
 ; AVX512F-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1
+; AVX512F-32-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-32-NEXT:    leal -8(%ebp), %esp
-; AVX512F-32-NEXT:    popl %esi
+; AVX512F-32-NEXT:    leal -4(%ebp), %esp
 ; AVX512F-32-NEXT:    popl %ebx
 ; AVX512F-32-NEXT:    popl %ebp
 ; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
@@ -1045,83 +998,68 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    movl %esp, %ebp
 ; AVX512VL-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX512VL-32-NEXT:    pushl %ebx
-; AVX512VL-32-NEXT:    pushl %esi
 ; AVX512VL-32-NEXT:    andl $-8, %esp
-; AVX512VL-32-NEXT:    subl $32, %esp
-; AVX512VL-32-NEXT:    .cfi_offset %esi, -16
+; AVX512VL-32-NEXT:    subl $40, %esp
 ; AVX512VL-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
-; AVX512VL-32-NEXT:    setb %cl
-; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
+; AVX512VL-32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT:    xorl %eax, %eax
+; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm1
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, %esi
-; AVX512VL-32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX512VL-32-NEXT:    xorl %ecx, %ecx
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
-; AVX512VL-32-NEXT:    setb %dl
-; AVX512VL-32-NEXT:    kmovw %edx, %k1
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vmovss %xmm2, (%esp)
-; AVX512VL-32-NEXT:    flds (%esp)
-; AVX512VL-32-NEXT:    fisttpll (%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    setae %cl
-; AVX512VL-32-NEXT:    shll $31, %ecx
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-32-NEXT:    kmovw %eax, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vmovss %xmm1, (%esp)
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512VL-32-NEXT:    xorl %edx, %edx
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
-; AVX512VL-32-NEXT:    setb %bl
-; AVX512VL-32-NEXT:    kmovw %ebx, %k1
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm1
 ; AVX512VL-32-NEXT:    setae %dl
-; AVX512VL-32-NEXT:    shll $31, %edx
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; AVX512VL-32-NEXT:    kmovw %edx, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-32-NEXT:    xorl %ecx, %ecx
+; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm1
+; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    kmovw %ecx, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    xorl %ebx, %ebx
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm0
-; AVX512VL-32-NEXT:    setb %al
-; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovss %xmm3, %xmm1, %xmm1 {%k1}
+; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm0
+; AVX512VL-32-NEXT:    setae %bl
+; AVX512VL-32-NEXT:    kmovw %ebx, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    flds (%esp)
+; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    shll $31, %eax
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; AVX512VL-32-NEXT:    shll $31, %edx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512VL-32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX512VL-32-NEXT:    setae %bl
+; AVX512VL-32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    shll $31, %ecx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512VL-32-NEXT:    shll $31, %ebx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, %ebx, %xmm1, %xmm1
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
 ; AVX512VL-32-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-32-NEXT:    leal -8(%ebp), %esp
-; AVX512VL-32-NEXT:    popl %esi
+; AVX512VL-32-NEXT:    leal -4(%ebp), %esp
 ; AVX512VL-32-NEXT:    popl %ebx
 ; AVX512VL-32-NEXT:    popl %ebp
 ; AVX512VL-32-NEXT:    .cfi_def_cfa %esp, 4

diff  --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
index 6f4ab5faaa3b..af52e5fa98b6 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -149,147 +149,125 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    .cfi_offset %esi, -20
 ; AVX512VL-32-NEXT:    .cfi_offset %edi, -16
 ; AVX512VL-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512VL-32-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
 ; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-32-NEXT:    xorl %eax, %eax
 ; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %al
+; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT:    movl %eax, %edi
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
+; AVX512VL-32-NEXT:    vmovsd %xmm3, (%esp)
+; AVX512VL-32-NEXT:    xorl %eax, %eax
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    setb %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm5
-; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT:    vsubsd %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm4, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
-; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; AVX512VL-32-NEXT:    movl %eax, %esi
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT:    xorl %eax, %eax
 ; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %al
-; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    setb %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm5
-; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT:    vsubsd %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm4, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
-; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, %edi
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %al
-; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
 ; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
-; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, %esi
-; AVX512VL-32-NEXT:    vextractf32x4 $3, %zmm0, %xmm3
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
 ; AVX512VL-32-NEXT:    xorl %edx, %edx
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    setb %al
-; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm5
-; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT:    vsubsd %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm4, (%esp)
-; AVX512VL-32-NEXT:    fldl (%esp)
-; AVX512VL-32-NEXT:    fisttpll (%esp)
-; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %dl
-; AVX512VL-32-NEXT:    shll $31, %edx
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; AVX512VL-32-NEXT:    kmovw %edx, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
 ; AVX512VL-32-NEXT:    xorl %eax, %eax
 ; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %cl
-; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vmovapd %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT:    setae %al
+; AVX512VL-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; AVX512VL-32-NEXT:    kmovw %eax, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
 ; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    xorl %ecx, %ecx
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm0
-; AVX512VL-32-NEXT:    setb %bl
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
+; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    kmovw %ecx, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-32-NEXT:    xorl %ebx, %ebx
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
+; AVX512VL-32-NEXT:    setae %bl
 ; AVX512VL-32-NEXT:    kmovw %ebx, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    xorl %eax, %eax
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm0
+; AVX512VL-32-NEXT:    setae %al
+; AVX512VL-32-NEXT:    kmovw %eax, %k1
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl (%esp)
+; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    shll $31, %esi
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %esi
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    shll $31, %edi
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edi
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512VL-32-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    shll $31, %edx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vpinsrd $1, %edx, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; AVX512VL-32-NEXT:    shll $31, %edx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vpinsrd $3, %edi, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    shll $31, %ecx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vpinsrd $3, %ecx, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    shll $31, %eax
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
+; AVX512VL-32-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX512VL-32-NEXT:    shll $31, %ebx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
+; AVX512VL-32-NEXT:    vpinsrd $3, %ebx, %xmm3, %xmm3
 ; AVX512VL-32-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512VL-32-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm1
 ; AVX512VL-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -463,147 +441,125 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    .cfi_offset %esi, -20
 ; AVX512VL-32-NEXT:    .cfi_offset %edi, -16
 ; AVX512VL-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512VL-32-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[3,3,3,3]
 ; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT:    xorl %eax, %eax
 ; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %al
+; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT:    movl %eax, %edi
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm4 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
-; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT:    vmovss %xmm3, (%esp)
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT:    xorl %eax, %eax
 ; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %al
+; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT:    movl %eax, %esi
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm4 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
 ; AVX512VL-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
+; AVX512VL-32-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX512VL-32-NEXT:    xorl %eax, %eax
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm3
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm4, %xmm4 {%k1}
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm4 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
 ; AVX512VL-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
+; AVX512VL-32-NEXT:    xorl %edx, %edx
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX512VL-32-NEXT:    setae %dl
+; AVX512VL-32-NEXT:    kmovw %edx, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT:    xorl %eax, %eax
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX512VL-32-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm4
-; AVX512VL-32-NEXT:    setb %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm5
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm4, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
-; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, %edi
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %al
-; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm3, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm4, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    movl $0, %eax
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-32-NEXT:    xorl %ecx, %ecx
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    kmovw %ecx, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX512VL-32-NEXT:    xorl %ebx, %ebx
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
+; AVX512VL-32-NEXT:    setae %bl
+; AVX512VL-32-NEXT:    kmovw %ebx, %k1
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    xorl %eax, %eax
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm0
 ; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    movl %eax, %esi
-; AVX512VL-32-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,3,3,3]
-; AVX512VL-32-NEXT:    xorl %edx, %edx
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm4
-; AVX512VL-32-NEXT:    setb %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm5
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm5, %xmm5 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm5, %xmm4, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm4, (%esp)
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    setae %dl
-; AVX512VL-32-NEXT:    shll $31, %edx
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512VL-32-NEXT:    xorl %eax, %eax
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm3
-; AVX512VL-32-NEXT:    setb %cl
-; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vmovaps %xmm1, %xmm4
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm4, %xmm4 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    wait
-; AVX512VL-32-NEXT:    setae %al
-; AVX512VL-32-NEXT:    shll $31, %eax
-; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; AVX512VL-32-NEXT:    xorl %ecx, %ecx
-; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm0
-; AVX512VL-32-NEXT:    setb %bl
-; AVX512VL-32-NEXT:    kmovw %ebx, %k1
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512VL-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
-; AVX512VL-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    shll $31, %esi
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %esi
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    shll $31, %edi
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edi
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; AVX512VL-32-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
+; AVX512VL-32-NEXT:    shll $31, %edx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vpinsrd $1, %edx, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; AVX512VL-32-NEXT:    shll $31, %edx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vpinsrd $3, %edi, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm1
+; AVX512VL-32-NEXT:    shll $31, %ecx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512VL-32-NEXT:    setae %cl
+; AVX512VL-32-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vpinsrd $3, %ecx, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    shll $31, %eax
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
+; AVX512VL-32-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX512VL-32-NEXT:    shll $31, %ebx
+; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
+; AVX512VL-32-NEXT:    vpinsrd $3, %ebx, %xmm3, %xmm3
 ; AVX512VL-32-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512VL-32-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm1
 ; AVX512VL-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0


        


More information about the llvm-commits mailing list