[llvm] ee71c1b - [X86] Implement smarter instruction lowering for FP_TO_UINT from f32/f64 to i32/i64 and vXf32/vXf64 to vXi32 for SSE2 and AVX2 by using the exact semantics of the CVTTPS2SI instruction.

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 14 04:04:00 PDT 2021


Author: Simon Pilgrim
Date: 2021-07-14T12:03:49+01:00
New Revision: ee71c1bbccb19ed7a30b9aaf112a2c6ac2987193

URL: https://github.com/llvm/llvm-project/commit/ee71c1bbccb19ed7a30b9aaf112a2c6ac2987193
DIFF: https://github.com/llvm/llvm-project/commit/ee71c1bbccb19ed7a30b9aaf112a2c6ac2987193.diff

LOG: [X86] Implement smarter instruction lowering for FP_TO_UINT from f32/f64 to i32/i64 and vXf32/vXf64 to vXi32 for SSE2 and AVX2 by using the exact semantics of the CVTTPS2SI instruction.

We know that "CVTTPS2SI" returns 0x80000000 for out-of-range inputs (and for FP_TO_UINT, negative float values are undefined). We can use this to make unsigned conversions from vXf32 to vXi32 more efficient, particularly on targets without a blend instruction, using the following logic:

small := CVTTPS2SI(x);
fp_to_ui(x) := small | (CVTTPS2SI(x - 2^31) & ARITHMETIC_RIGHT_SHIFT(small, 31))
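
As a scalar model of that logic (a minimal sketch, not the patch itself: the
name fp_to_ui_sketch is made up here, and the out-of-range casts rely on
x86's cvttss2si behavior of producing 0x80000000, which ISO C++ leaves
undefined):

  #include <cstdint>

  // "Small" is the signed truncation, which comes back with the sign bit
  // set (0x80000000) exactly when x >= 2^31, so an arithmetic right shift
  // of Small yields an all-ones mask that selects "Big".
  uint32_t fp_to_ui_sketch(float x) {
    int32_t Small = (int32_t)x;                   // cvttss2si-style truncation
    int32_t Big   = (int32_t)(x - 2147483648.0f); // truncate x - 2^31
    int32_t Mask  = Small >> 31;                  // all ones iff Small overflowed
    return (uint32_t)(Small | (Big & Mask));
  }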

Even on targets where "BLENDVPS"/"PBLENDVB" exist, they are often latency-2, low-throughput instructions, so this logic is applied there too (in particular also for AVX2). It furthermore gets rid of one high-latency floating-point comparison in the previous lowering.
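
The patch still uses a blendv select for the AVX1 v8i32 case, where the
sign-splat masking is unavailable. A scalar model of that fallback (same
caveats as above; fp_to_ui_blendv is a made-up name):

  #include <cstdint>

  // BLENDV keys off the sign bit of its mask operand, so "Small" itself can
  // act as the mask: its sign bit is set exactly when the conversion
  // overflowed, and in that case Small == 0x80000000, making Small | Big
  // the desired "0x80000000 | Big" result.
  uint32_t fp_to_ui_blendv(float x) {
    int32_t Small    = (int32_t)x;
    int32_t Big      = (int32_t)(x - 2147483648.0f);
    int32_t Overflow = Small | Big;
    return (uint32_t)(Small < 0 ? Overflow : Small); // blendv on sign(Small)
  }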

@TomHender checked the correctness of this for all possible floats between -1 and 2^32 (both ends excluded).
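
That exhaustive check is cheap to reproduce. A hypothetical harness (again
assuming x86 truncation semantics for the out-of-range casts) iterates every
float bit pattern in that interval:

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  int main() {
    for (uint64_t Bits = 0; Bits <= 0xFFFFFFFFull; ++Bits) {
      uint32_t B = (uint32_t)Bits;
      float X;
      std::memcpy(&X, &B, sizeof(X));
      if (!(X > -1.0f && X < 4294967296.0f)) // both ends excluded; skips NaNs
        continue;
      uint32_t Expected = (uint32_t)(int64_t)X; // wide truncation as reference
      int32_t Small = (int32_t)X;
      int32_t Big = (int32_t)(X - 2147483648.0f);
      uint32_t Got = (uint32_t)(Small | (Big & (Small >> 31)));
      if (Got != Expected) {
        std::printf("mismatch for bits 0x%08x\n", B);
        return 1;
      }
    }
    std::puts("all inputs match");
    return 0;
  }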

Original patch by @TomHender (Tom Hender)

Differential Revision: https://reviews.llvm.org/D89697

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/fptoui.ll
    llvm/test/CodeGen/X86/concat-cast.ll
    llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
    llvm/test/CodeGen/X86/ftrunc.ll
    llvm/test/CodeGen/X86/half.ll
    llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
    llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
    llvm/test/CodeGen/X86/vec_cast3.ll
    llvm/test/CodeGen/X86/vec_fp_to_int.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8697588faa75e..a0f79ae73aa76 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1013,7 +1013,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Custom);
     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Legal);
     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);
 
@@ -1248,6 +1250,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
     setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,                MVT::v8i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Legal);
 
     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
@@ -1775,7 +1778,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        Subtarget.hasVLX() ? Legal : Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
                        Subtarget.hasVLX() ? Legal : Custom);
-    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
                        Subtarget.hasVLX() ? Legal : Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
@@ -21215,6 +21217,44 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   llvm_unreachable("All 256->128 cases should have been handled above!");
 }
 
+// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instructions
+// behave on out-of-range inputs to generate optimized conversions.
+static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
+                                    SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  MVT SrcVT = Src.getSimpleValueType();
+  unsigned DstBits = VT.getScalarSizeInBits();
+  assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
+
+  // Calculate the converted result for values in the range 0 to
+  // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
+  SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
+  SDValue Big =
+      DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
+                  DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
+                              DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
+
+  // The "CVTTP2SI" instruction conveniently sets the sign bit if
+  // and only if the value was out of range. So we can use that
+  // as our indicator that we should use "Big" instead of "Small".
+  //
+  // Use "Small" if "IsOverflown" has all bits cleared
+  // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
+
+  // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
+  // use the slightly slower blendv select instead.
+  if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
+    SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
+    return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
+  }
+
+  SDValue IsOverflown =
+      DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
+                  DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
+  return DAG.getNode(ISD::OR, dl, VT, Small,
+                     DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
+}
+
 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
@@ -21274,10 +21314,10 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
 
     // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
     if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
-        (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
+        (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
+        Subtarget.useAVX512Regs()) {
       assert(!IsSigned && "Expected unsigned conversion!");
-      assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
-             "Unexpected features!");
+      assert(!Subtarget.hasVLX() && "Unexpected features!");
       MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
       MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
       // Need to concat with zero vector for strict fp to avoid spurious
@@ -21307,9 +21347,9 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
 
     // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
     if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
-        (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
-      assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
-             !Subtarget.hasVLX() && "Unexpected features!");
+        (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
+        Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
+      assert(!Subtarget.hasVLX() && "Unexpected features!");
       MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
       // Need to concat with zero vector for strict fp to avoid spurious
       // exceptions.
@@ -21366,6 +21406,15 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
       return DAG.getNode(Opc, dl, VT, Tmp);
     }
 
+    // Generate optimized instructions for pre-AVX512 unsigned conversions from
+    // vXf32 to vXi32.
+    if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
+        (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
+        (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
+      assert(!IsSigned && "Expected unsigned conversion!");
+      return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
+    }
+
     return SDValue();
   }
 
@@ -21378,6 +21427,39 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
     if (Subtarget.hasAVX512())
       return Op;
 
+    // We can leverage the specific way the "cvttss2si/cvttsd2si" instructions
+    // behave on out-of-range inputs to generate optimized conversions.
+    if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
+                      (VT == MVT::i64 && Subtarget.is64Bit()))) {
+      unsigned DstBits = VT.getScalarSizeInBits();
+      APInt UIntLimit = APInt::getSignMask(DstBits);
+      SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
+                                        DAG.getConstant(UIntLimit, dl, VT));
+      MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
+
+      // Calculate the converted result for values in the range:
+      // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
+      // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
+      SDValue Small =
+          DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
+                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
+      SDValue Big = DAG.getNode(
+          X86ISD::CVTTS2SI, dl, VT,
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
+                      DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
+
+      // The "CVTTS2SI" instruction conveniently sets the sign bit if
+      // and only if the value was out of range. So we can use that
+      // as our indicator that we should use "Big" instead of "Small".
+      //
+      // Use "Small" if "IsOverflown" has all bits cleared
+      // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
+      SDValue IsOverflown = DAG.getNode(
+          ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
+      return DAG.getNode(ISD::OR, dl, VT, Small,
+                         DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
+    }
+
     // Use default expansion for i64.
     if (VT == MVT::i64)
       return SDValue();
@@ -30781,12 +30863,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
 
 
     if (VT == MVT::v2i32) {
-      assert((IsSigned || Subtarget.hasAVX512()) &&
-             "Can only handle signed conversion without AVX512");
+      assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
+             "Strict unsigned conversion requires AVX512");
       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
              "Unexpected type action!");
       if (Src.getValueType() == MVT::v2f64) {
+        if (!IsSigned && !Subtarget.hasAVX512()) {
+          SDValue Res =
+              expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
+          Results.push_back(Res);
+          return;
+        }
+
         unsigned Opc;
         if (IsStrict)
           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 42406d15226a2..1cadbd8bdcd26 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1914,12 +1914,14 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  1 },
     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  3 },
 
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    3 },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    3 },
     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  1 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  4 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  7 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  7 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  4 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  7 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  4 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  3 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  4 },
 
     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
@@ -2026,10 +2028,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  2 },
     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  2 },
     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  2 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  9 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  9 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  9 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  9 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  6 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  7 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  7 },
 
     { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
     { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
@@ -2097,15 +2100,15 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  1 },
 
     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    1 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    5 },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    1 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    5 },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    4 },
     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  2 },
     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  2 },
     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  1 },
     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  1 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  6 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  3 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  4 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
   };
 
   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {

diff --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll
index eac56601c171e..24e79531e8eda 100644
--- a/llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -19,18 +19,25 @@ define i32 @fptoui_double_i64(i32 %arg) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui double undef to i64
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'fptoui_double_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui double undef to i64
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'fptoui_double_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'fptoui_double_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = fptoui double undef to i64
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptoui_double_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64
@@ -47,10 +54,10 @@ define i32 @fptoui_double_i64(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'fptoui_double_i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui double undef to i64
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = fptoui double undef to i64
@@ -70,23 +77,23 @@ define i32 @fptoui_double_i32(i32 %arg) {
 ;
 ; SSE42-LABEL: 'fptoui_double_i32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'fptoui_double_i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'fptoui_double_i32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fptoui_double_i32'
@@ -98,9 +105,9 @@ define i32 @fptoui_double_i32(i32 %arg) {
 ;
 ; SLM-LABEL: 'fptoui_double_i32'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I32 = fptoui double undef to i32
@@ -206,20 +213,28 @@ define i32 @fptoui_float_i64(i32 %arg) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui float undef to i64
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'fptoui_float_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui float undef to i64
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'fptoui_float_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'fptoui_float_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = fptoui float undef to i64
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptoui_float_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64
@@ -238,11 +253,11 @@ define i32 @fptoui_float_i64(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'fptoui_float_i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui float undef to i64
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = fptoui float undef to i64
@@ -264,26 +279,26 @@ define i32 @fptoui_float_i32(i32 %arg) {
 ;
 ; SSE42-LABEL: 'fptoui_float_i32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'fptoui_float_i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'fptoui_float_i32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fptoui_float_i32'
@@ -296,10 +311,10 @@ define i32 @fptoui_float_i32(i32 %arg) {
 ;
 ; SLM-LABEL: 'fptoui_float_i32'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I32 = fptoui float undef to i32

diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll
index ccd61af7bef59..7f6311c57eb26 100644
--- a/llvm/test/CodeGen/X86/concat-cast.ll
+++ b/llvm/test/CodeGen/X86/concat-cast.ll
@@ -109,91 +109,39 @@ define <4 x i32> @fptosi_v4f32_v4i32(<2 x float> %x, <2 x float> %y) {
 }
 
 define <4 x i32> @fptoui_v4f32_v4i32(<2 x float> %x, <2 x float> %y) {
-; SSE2-LABEL: fptoui_v4f32_v4i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    cmpltps %xmm3, %xmm2
-; SSE2-NEXT:    cvttps2dq %xmm0, %xmm4
-; SSE2-NEXT:    subps %xmm3, %xmm0
-; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE2-NEXT:    movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    xorps %xmm5, %xmm0
-; SSE2-NEXT:    andps %xmm2, %xmm4
-; SSE2-NEXT:    andnps %xmm0, %xmm2
-; SSE2-NEXT:    orps %xmm4, %xmm2
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    cmpltps %xmm3, %xmm0
-; SSE2-NEXT:    cvttps2dq %xmm1, %xmm4
-; SSE2-NEXT:    subps %xmm3, %xmm1
-; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    xorps %xmm5, %xmm1
-; SSE2-NEXT:    andps %xmm0, %xmm4
-; SSE2-NEXT:    andnps %xmm1, %xmm0
-; SSE2-NEXT:    orps %xmm4, %xmm0
-; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: fptoui_v4f32_v4i32:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE4-NEXT:    movaps %xmm0, %xmm2
-; SSE4-NEXT:    cmpltps %xmm4, %xmm2
-; SSE4-NEXT:    cvttps2dq %xmm0, %xmm5
-; SSE4-NEXT:    subps %xmm4, %xmm0
-; SSE4-NEXT:    cvttps2dq %xmm0, %xmm3
-; SSE4-NEXT:    movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE4-NEXT:    xorps %xmm6, %xmm3
-; SSE4-NEXT:    movaps %xmm2, %xmm0
-; SSE4-NEXT:    blendvps %xmm0, %xmm5, %xmm3
-; SSE4-NEXT:    movaps %xmm1, %xmm0
-; SSE4-NEXT:    cmpltps %xmm4, %xmm0
-; SSE4-NEXT:    cvttps2dq %xmm1, %xmm2
-; SSE4-NEXT:    subps %xmm4, %xmm1
-; SSE4-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE4-NEXT:    xorps %xmm6, %xmm1
-; SSE4-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE4-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE4-NEXT:    movaps %xmm3, %xmm0
-; SSE4-NEXT:    retq
+; SSE-LABEL: fptoui_v4f32_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_v4f32_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT:    vcmpltps %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vsubps %xmm2, %xmm0, %xmm4
-; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm3, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vcmpltps %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vsubps %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT:    vblendvps %xmm3, %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fptoui_v4f32_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT:    vcmpltps %xmm2, %xmm0, %xmm3
-; AVX2-NEXT:    vsubps %xmm2, %xmm0, %xmm4
-; AVX2-NEXT:    vcvttps2dq %xmm4, %xmm4
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorps %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm3, %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vcmpltps %xmm2, %xmm1, %xmm3
-; AVX2-NEXT:    vsubps %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vcvttps2dq %xmm2, %xmm2
-; AVX2-NEXT:    vxorps %xmm5, %xmm2, %xmm2
-; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX2-NEXT:    vblendvps %xmm3, %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT:    vsubps %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_v4f32_v4i32:
@@ -316,82 +264,50 @@ define <4 x i32> @fptosi_v4f64_v4i32(<2 x double> %x, <2 x double> %y) {
 }
 
 define <4 x i32> @fptoui_v4f64_v4i32(<2 x double> %x, <2 x double> %y) {
-; SSE2-LABEL: fptoui_v4f64_v4i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    cvttsd2si %xmm0, %rax
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE2-NEXT:    cvttsd2si %xmm1, %rdx
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT:    cvttsd2si %xmm1, %rsi
-; SSE2-NEXT:    movd %edx, %xmm1
-; SSE2-NEXT:    movd %esi, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: fptoui_v4f64_v4i32:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    cvttsd2si %xmm0, %rax
-; SSE4-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE4-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE4-NEXT:    cvttsd2si %xmm1, %rdx
-; SSE4-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE4-NEXT:    cvttsd2si %xmm1, %rsi
-; SSE4-NEXT:    movd %eax, %xmm0
-; SSE4-NEXT:    pinsrd $1, %ecx, %xmm0
-; SSE4-NEXT:    pinsrd $2, %edx, %xmm0
-; SSE4-NEXT:    pinsrd $3, %esi, %xmm0
-; SSE4-NEXT:    retq
+; SSE-LABEL: fptoui_v4f64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd {{.*#+}} xmm2 = [2.147483648E+9,2.147483648E+9]
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm3
+; SSE-NEXT:    subpd %xmm2, %xmm0
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm4
+; SSE-NEXT:    movapd %xmm3, %xmm0
+; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    por %xmm3, %xmm0
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm3
+; SSE-NEXT:    subpd %xmm2, %xmm1
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT:    movapd %xmm3, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    por %xmm3, %xmm2
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_v4f64_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm2, %ymm0, %ymm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vsubpd %ymm2, %ymm0, %ymm4
-; AVX1-NEXT:    vcvttpd2dq %ymm4, %xmm4
-; AVX1-NEXT:    vmovapd {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vxorpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm3, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vcmpltpd %ymm2, %ymm1, %ymm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vsubpd %ymm2, %ymm1, %ymm2
-; AVX1-NEXT:    vcvttpd2dq %ymm2, %xmm2
-; AVX1-NEXT:    vxorpd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT:    vblendvps %xmm3, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vorpd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fptoui_v4f64_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm4
-; AVX2-NEXT:    vcvttpd2dq %ymm4, %xmm4
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm5, %xmm4, %xmm4
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT:    vsubpd %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm3, %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vcmpltpd %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vsubpd %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vcvttpd2dq %ymm2, %xmm2
-; AVX2-NEXT:    vxorpd %xmm5, %xmm2, %xmm2
-; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vblendvps %xmm3, %xmm1, %xmm2, %xmm1
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vorpd %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;

diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
index 4bf6fe99c0717..afc7c71963b5b 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
@@ -369,21 +369,21 @@ define i32 @test_unsigned_i32_f32(float %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i32_f32:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE-NEXT:    subss %xmm1, %xmm2
-; X86-SSE-NEXT:    cvttss2si %xmm2, %eax
-; X86-SSE-NEXT:    xorl $-2147483648, %eax # imm = 0x80000000
-; X86-SSE-NEXT:    cvttss2si %xmm0, %ecx
-; X86-SSE-NEXT:    ucomiss %xmm0, %xmm1
-; X86-SSE-NEXT:    cmovbel %eax, %ecx
-; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-NEXT:    movl %eax, %ecx
+; X86-SSE-NEXT:    sarl $31, %ecx
+; X86-SSE-NEXT:    movaps %xmm0, %xmm1
+; X86-SSE-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %edx
+; X86-SSE-NEXT:    andl %ecx, %edx
+; X86-SSE-NEXT:    orl %eax, %edx
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovael %ecx, %edx
+; X86-SSE-NEXT:    cmovael %edx, %ecx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %eax
-; X86-SSE-NEXT:    cmovbel %edx, %eax
+; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-LABEL: test_unsigned_i32_f32:
@@ -636,19 +636,18 @@ define i64 @test_unsigned_i64_f32(float %f) nounwind {
 ;
 ; X64-LABEL: test_unsigned_i64_f32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT:    movaps %xmm0, %xmm2
-; X64-NEXT:    subss %xmm1, %xmm2
-; X64-NEXT:    cvttss2si %xmm2, %rax
-; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X64-NEXT:    xorq %rax, %rcx
 ; X64-NEXT:    cvttss2si %xmm0, %rax
-; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovaeq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    movaps %xmm0, %xmm1
+; X64-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    cvttss2si %xmm1, %rdx
+; X64-NEXT:    andq %rcx, %rdx
+; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovaeq %rax, %rcx
+; X64-NEXT:    cmovaeq %rdx, %rcx
 ; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movq $-1, %rax
 ; X64-NEXT:    cmovbeq %rcx, %rax
@@ -1309,13 +1308,12 @@ define i32 @test_unsigned_i32_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    maxsd %xmm1, %xmm0
 ; X86-SSE-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cvttsd2si %xmm0, %ecx
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    movapd %xmm0, %xmm2
-; X86-SSE-NEXT:    subsd %xmm1, %xmm2
-; X86-SSE-NEXT:    cvttsd2si %xmm2, %eax
-; X86-SSE-NEXT:    xorl $-2147483648, %eax # imm = 0x80000000
-; X86-SSE-NEXT:    ucomisd %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovbl %ecx, %eax
+; X86-SSE-NEXT:    movl %ecx, %edx
+; X86-SSE-NEXT:    sarl $31, %edx
+; X86-SSE-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cvttsd2si %xmm0, %eax
+; X86-SSE-NEXT:    andl %edx, %eax
+; X86-SSE-NEXT:    orl %ecx, %eax
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-LABEL: test_unsigned_i32_f64:
@@ -1562,19 +1560,18 @@ define i64 @test_unsigned_i64_f64(double %f) nounwind {
 ;
 ; X64-LABEL: test_unsigned_i64_f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    movapd %xmm0, %xmm2
-; X64-NEXT:    subsd %xmm1, %xmm2
-; X64-NEXT:    cvttsd2si %xmm2, %rax
-; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X64-NEXT:    xorq %rax, %rcx
 ; X64-NEXT:    cvttsd2si %xmm0, %rax
-; X64-NEXT:    ucomisd %xmm1, %xmm0
-; X64-NEXT:    cmovaeq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    movapd %xmm0, %xmm1
+; X64-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    cvttsd2si %xmm1, %rdx
+; X64-NEXT:    andq %rcx, %rdx
+; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    xorpd %xmm1, %xmm1
 ; X64-NEXT:    ucomisd %xmm1, %xmm0
-; X64-NEXT:    cmovaeq %rax, %rcx
+; X64-NEXT:    cmovaeq %rdx, %rcx
 ; X64-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movq $-1, %rax
 ; X64-NEXT:    cmovbeq %rcx, %rax
@@ -2298,21 +2295,21 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE-NEXT:    subss %xmm1, %xmm2
-; X86-SSE-NEXT:    cvttss2si %xmm2, %eax
-; X86-SSE-NEXT:    xorl $-2147483648, %eax # imm = 0x80000000
-; X86-SSE-NEXT:    cvttss2si %xmm0, %ecx
-; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-NEXT:    movl %eax, %ecx
+; X86-SSE-NEXT:    sarl $31, %ecx
+; X86-SSE-NEXT:    movaps %xmm0, %xmm1
+; X86-SSE-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %edx
+; X86-SSE-NEXT:    andl %ecx, %edx
+; X86-SSE-NEXT:    orl %eax, %edx
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovael %ecx, %edx
+; X86-SSE-NEXT:    cmovael %edx, %ecx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %eax
-; X86-SSE-NEXT:    cmovbel %edx, %eax
+; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
 ;
@@ -2589,19 +2586,18 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
 ; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movzwl %di, %edi
 ; X64-NEXT:    callq __gnu_h2f_ieee at PLT
-; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT:    movaps %xmm0, %xmm2
-; X64-NEXT:    subss %xmm1, %xmm2
-; X64-NEXT:    cvttss2si %xmm2, %rax
-; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X64-NEXT:    xorq %rax, %rcx
 ; X64-NEXT:    cvttss2si %xmm0, %rax
-; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovaeq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    movaps %xmm0, %xmm1
+; X64-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    cvttss2si %xmm1, %rdx
+; X64-NEXT:    andq %rcx, %rdx
+; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovaeq %rax, %rcx
+; X64-NEXT:    cmovaeq %rdx, %rcx
 ; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movq $-1, %rax
 ; X64-NEXT:    cmovbeq %rcx, %rax

diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index af3dab5a2dea2..14191f140470c 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -29,16 +29,14 @@ define float @trunc_unsigned_f32(float %x) #0 {
 define double @trunc_unsigned_f64(double %x) #0 {
 ; SSE2-LABEL: trunc_unsigned_f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    movapd %xmm0, %xmm2
-; SSE2-NEXT:    subsd %xmm1, %xmm2
-; SSE2-NEXT:    cvttsd2si %xmm2, %rax
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE2-NEXT:    xorq %rax, %rcx
 ; SSE2-NEXT:    cvttsd2si %xmm0, %rax
-; SSE2-NEXT:    ucomisd %xmm1, %xmm0
-; SSE2-NEXT:    cmovaeq %rcx, %rax
-; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
+; SSE2-NEXT:    andq %rcx, %rdx
+; SSE2-NEXT:    orq %rax, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; SSE2-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
@@ -63,24 +61,20 @@ define double @trunc_unsigned_f64(double %x) #0 {
 define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
 ; SSE2-LABEL: trunc_unsigned_v4f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:    cmpltps %xmm2, %xmm1
-; SSE2-NEXT:    cvttps2dq %xmm0, %xmm3
-; SSE2-NEXT:    subps %xmm2, %xmm0
+; SSE2-NEXT:    cvttps2dq %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE2-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    andps %xmm1, %xmm3
-; SSE2-NEXT:    andnps %xmm0, %xmm1
-; SSE2-NEXT:    orps %xmm3, %xmm1
-; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
-; SSE2-NEXT:    andps %xmm1, %xmm0
-; SSE2-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_unsigned_v4f32:
@@ -104,21 +98,21 @@ define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
 ; SSE2-NEXT:    movapd %xmm0, %xmm1
 ; SSE2-NEXT:    subsd %xmm2, %xmm1
 ; SSE2-NEXT:    cvttsd2si %xmm1, %rax
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE2-NEXT:    xorq %rcx, %rax
-; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
-; SSE2-NEXT:    ucomisd %xmm2, %xmm0
-; SSE2-NEXT:    cmovaeq %rax, %rdx
+; SSE2-NEXT:    cvttsd2si %xmm0, %rcx
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    andq %rax, %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
 ; SSE2-NEXT:    movq %rdx, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT:    movapd %xmm0, %xmm3
-; SSE2-NEXT:    subsd %xmm2, %xmm3
-; SSE2-NEXT:    cvttsd2si %xmm3, %rax
-; SSE2-NEXT:    xorq %rcx, %rax
+; SSE2-NEXT:    cvttsd2si %xmm0, %rax
+; SSE2-NEXT:    subsd %xmm2, %xmm0
 ; SSE2-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE2-NEXT:    ucomisd %xmm2, %xmm0
-; SSE2-NEXT:    cmovaeq %rax, %rcx
-; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    movq %rax, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    andq %rcx, %rdx
+; SSE2-NEXT:    orq %rax, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm0
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
 ; SSE2-NEXT:    pand %xmm1, %xmm0
@@ -150,40 +144,41 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 {
 ; SSE2-NEXT:    movapd %xmm1, %xmm2
 ; SSE2-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
 ; SSE2-NEXT:    subsd %xmm3, %xmm1
-; SSE2-NEXT:    cvttsd2si %xmm1, %rcx
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE2-NEXT:    xorq %rax, %rcx
-; SSE2-NEXT:    cvttsd2si %xmm2, %rdx
-; SSE2-NEXT:    ucomisd %xmm3, %xmm2
-; SSE2-NEXT:    cmovaeq %rcx, %rdx
+; SSE2-NEXT:    cvttsd2si %xmm1, %rax
+; SSE2-NEXT:    cvttsd2si %xmm2, %rcx
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    andq %rax, %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
 ; SSE2-NEXT:    movq %rdx, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE2-NEXT:    movapd %xmm2, %xmm4
-; SSE2-NEXT:    subsd %xmm3, %xmm4
-; SSE2-NEXT:    cvttsd2si %xmm4, %rcx
-; SSE2-NEXT:    xorq %rax, %rcx
-; SSE2-NEXT:    cvttsd2si %xmm2, %rdx
-; SSE2-NEXT:    ucomisd %xmm3, %xmm2
-; SSE2-NEXT:    cmovaeq %rcx, %rdx
+; SSE2-NEXT:    cvttsd2si %xmm2, %rax
+; SSE2-NEXT:    subsd %xmm3, %xmm2
+; SSE2-NEXT:    cvttsd2si %xmm2, %rcx
+; SSE2-NEXT:    movq %rax, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    andq %rcx, %rdx
+; SSE2-NEXT:    orq %rax, %rdx
 ; SSE2-NEXT:    movq %rdx, %xmm2
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT:    movapd %xmm0, %xmm2
 ; SSE2-NEXT:    subsd %xmm3, %xmm2
-; SSE2-NEXT:    cvttsd2si %xmm2, %rcx
-; SSE2-NEXT:    xorq %rax, %rcx
-; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
-; SSE2-NEXT:    ucomisd %xmm3, %xmm0
-; SSE2-NEXT:    cmovaeq %rcx, %rdx
+; SSE2-NEXT:    cvttsd2si %xmm2, %rax
+; SSE2-NEXT:    cvttsd2si %xmm0, %rcx
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    andq %rax, %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
 ; SSE2-NEXT:    movq %rdx, %xmm2
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT:    movapd %xmm0, %xmm4
-; SSE2-NEXT:    subsd %xmm3, %xmm4
-; SSE2-NEXT:    cvttsd2si %xmm4, %rcx
-; SSE2-NEXT:    xorq %rax, %rcx
 ; SSE2-NEXT:    cvttsd2si %xmm0, %rax
-; SSE2-NEXT:    ucomisd %xmm3, %xmm0
-; SSE2-NEXT:    cmovaeq %rcx, %rax
-; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    subsd %xmm3, %xmm0
+; SSE2-NEXT:    cvttsd2si %xmm0, %rcx
+; SSE2-NEXT:    movq %rax, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    andq %rcx, %rdx
+; SSE2-NEXT:    orq %rax, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm0
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
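
The new CHECK sequences in trunc_unsigned_v2f64/v4f64 above show the pattern
this patch emits everywhere: instead of materializing 0x8000000000000000 with
movabsq, converting x - 2^63 separately, and selecting a result via
ucomisd + cmovaeq, the select mask is now derived from the sign of the plain
cvttsd2si result (sarq $63) and applied with andq/orq. A minimal C++ sketch of
that scalar f64 -> u64 sequence, written with SSE intrinsics so the CVTTSD2SI
out-of-range result (0x8000000000000000) is well defined; the helper name is
illustrative, and an arithmetic right shift for signed types is assumed
(guaranteed since C++20):

  #include <immintrin.h>
  #include <cstdint>

  uint64_t d_to_u64(double x) {
    // cvttsd2si: truncating convert; returns 0x8000000000000000 whenever
    // x does not fit in a signed i64.
    int64_t small = _mm_cvttsd_si64(_mm_set_sd(x));          // cvttsd2si %xmm0, %rcx
    int64_t mask  = small >> 63;                             // sarq $63: all-ones iff x >= 2^63
    int64_t big   = _mm_cvttsd_si64(_mm_set_sd(x - 0x1p63)); // subsd; cvttsd2si
    return (uint64_t)(small | (big & mask));                 // andq; orq
  }

For x in [0, 2^63) the mask is zero and small already holds the answer; for x
in [2^63, 2^64) small is 0x8000000000000000, and OR-ing it onto
big = (int64_t)(x - 2^63) sets the missing top bit, giving exactly the
unsigned result. Negative inputs are undefined for fptoui, so they need no
handling.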

diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index bee7876809d45..34978bdfeb049 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -269,15 +269,13 @@ define i64 @test_fptoui_i64(half* %p) #0 {
 ; CHECK-LIBCALL-NEXT:    pushq %rax
 ; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
 ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
-; CHECK-LIBCALL-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, %xmm2
-; CHECK-LIBCALL-NEXT:    subss %xmm1, %xmm2
-; CHECK-LIBCALL-NEXT:    cvttss2si %xmm2, %rax
-; CHECK-LIBCALL-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; CHECK-LIBCALL-NEXT:    xorq %rax, %rcx
+; CHECK-LIBCALL-NEXT:    cvttss2si %xmm0, %rcx
+; CHECK-LIBCALL-NEXT:    movq %rcx, %rdx
+; CHECK-LIBCALL-NEXT:    sarq $63, %rdx
+; CHECK-LIBCALL-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-LIBCALL-NEXT:    cvttss2si %xmm0, %rax
-; CHECK-LIBCALL-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-LIBCALL-NEXT:    cmovaeq %rcx, %rax
+; CHECK-LIBCALL-NEXT:    andq %rdx, %rax
+; CHECK-LIBCALL-NEXT:    orq %rcx, %rax
 ; CHECK-LIBCALL-NEXT:    popq %rcx
 ; CHECK-LIBCALL-NEXT:    retq
 ;
@@ -286,14 +284,13 @@ define i64 @test_fptoui_i64(half* %p) #0 {
 ; BWON-F16C-NEXT:    movzwl (%rdi), %eax
 ; BWON-F16C-NEXT:    vmovd %eax, %xmm0
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; BWON-F16C-NEXT:    vsubss %xmm1, %xmm0, %xmm2
-; BWON-F16C-NEXT:    vcvttss2si %xmm2, %rax
-; BWON-F16C-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; BWON-F16C-NEXT:    xorq %rax, %rcx
+; BWON-F16C-NEXT:    vcvttss2si %xmm0, %rcx
+; BWON-F16C-NEXT:    movq %rcx, %rdx
+; BWON-F16C-NEXT:    sarq $63, %rdx
+; BWON-F16C-NEXT:    vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vcvttss2si %xmm0, %rax
-; BWON-F16C-NEXT:    vucomiss %xmm1, %xmm0
-; BWON-F16C-NEXT:    cmovaeq %rcx, %rax
+; BWON-F16C-NEXT:    andq %rdx, %rax
+; BWON-F16C-NEXT:    orq %rcx, %rax
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_fptoui_i64:

diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
index 99c158420dbef..469c05d44813f 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
@@ -38,31 +38,29 @@ define i32 @f_to_u32(float %a) nounwind {
 ; X64-AVX512-NEXT:    vcvttss2usi %xmm0, %eax
 ; X64-AVX512-NEXT:    retq
 ;
-; X86-SSE3-WIN-LABEL: f_to_u32:
-; X86-SSE3-WIN:       # %bb.0:
-; X86-SSE3-WIN-NEXT:    pushl %ebp
-; X86-SSE3-WIN-NEXT:    movl %esp, %ebp
-; X86-SSE3-WIN-NEXT:    andl $-8, %esp
-; X86-SSE3-WIN-NEXT:    subl $8, %esp
-; X86-SSE3-WIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE3-WIN-NEXT:    movss %xmm0, (%esp)
-; X86-SSE3-WIN-NEXT:    flds (%esp)
-; X86-SSE3-WIN-NEXT:    fisttpll (%esp)
-; X86-SSE3-WIN-NEXT:    movl (%esp), %eax
-; X86-SSE3-WIN-NEXT:    movl %ebp, %esp
-; X86-SSE3-WIN-NEXT:    popl %ebp
-; X86-SSE3-WIN-NEXT:    retl
+; X86-SSE-WIN-LABEL: f_to_u32:
+; X86-SSE-WIN:       # %bb.0:
+; X86-SSE-WIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-WIN-NEXT:    cvttss2si %xmm0, %ecx
+; X86-SSE-WIN-NEXT:    movl %ecx, %edx
+; X86-SSE-WIN-NEXT:    sarl $31, %edx
+; X86-SSE-WIN-NEXT:    subss __real@4f000000, %xmm0
+; X86-SSE-WIN-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-WIN-NEXT:    andl %edx, %eax
+; X86-SSE-WIN-NEXT:    orl %ecx, %eax
+; X86-SSE-WIN-NEXT:    retl
 ;
-; X86-SSE3-LIN-LABEL: f_to_u32:
-; X86-SSE3-LIN:       # %bb.0:
-; X86-SSE3-LIN-NEXT:    subl $12, %esp
-; X86-SSE3-LIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE3-LIN-NEXT:    movss %xmm0, (%esp)
-; X86-SSE3-LIN-NEXT:    flds (%esp)
-; X86-SSE3-LIN-NEXT:    fisttpll (%esp)
-; X86-SSE3-LIN-NEXT:    movl (%esp), %eax
-; X86-SSE3-LIN-NEXT:    addl $12, %esp
-; X86-SSE3-LIN-NEXT:    retl
+; X86-SSE-LIN-LABEL: f_to_u32:
+; X86-SSE-LIN:       # %bb.0:
+; X86-SSE-LIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-LIN-NEXT:    cvttss2si %xmm0, %ecx
+; X86-SSE-LIN-NEXT:    movl %ecx, %edx
+; X86-SSE-LIN-NEXT:    sarl $31, %edx
+; X86-SSE-LIN-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-LIN-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-LIN-NEXT:    andl %edx, %eax
+; X86-SSE-LIN-NEXT:    orl %ecx, %eax
+; X86-SSE-LIN-NEXT:    retl
 ;
 ; X64-SSE-LABEL: f_to_u32:
 ; X64-SSE:       # %bb.0:
@@ -70,32 +68,6 @@ define i32 @f_to_u32(float %a) nounwind {
 ; X64-SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-SSE-NEXT:    retq
 ;
-; X86-SSE2-LABEL: f_to_u32:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-NEXT:    subss %xmm1, %xmm2
-; X86-SSE2-NEXT:    cvttss2si %xmm2, %ecx
-; X86-SSE2-NEXT:    xorl $-2147483648, %ecx # imm = 0x80000000
-; X86-SSE2-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE2-NEXT:    ucomiss %xmm0, %xmm1
-; X86-SSE2-NEXT:    cmovbel %ecx, %eax
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE1-LABEL: f_to_u32:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE1-NEXT:    subss %xmm1, %xmm2
-; X86-SSE1-NEXT:    cvttss2si %xmm2, %ecx
-; X86-SSE1-NEXT:    xorl $-2147483648, %ecx # imm = 0x80000000
-; X86-SSE1-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE1-NEXT:    ucomiss %xmm0, %xmm1
-; X86-SSE1-NEXT:    cmovbel %ecx, %eax
-; X86-SSE1-NEXT:    retl
-;
 ; X87-WIN-LABEL: f_to_u32:
 ; X87-WIN:       # %bb.0:
 ; X87-WIN-NEXT:    pushl %ebp
@@ -185,28 +157,26 @@ define i32 @d_to_u32(double %a) nounwind {
 ;
 ; X86-SSE3-WIN-LABEL: d_to_u32:
 ; X86-SSE3-WIN:       # %bb.0:
-; X86-SSE3-WIN-NEXT:    pushl %ebp
-; X86-SSE3-WIN-NEXT:    movl %esp, %ebp
-; X86-SSE3-WIN-NEXT:    andl $-8, %esp
-; X86-SSE3-WIN-NEXT:    subl $8, %esp
 ; X86-SSE3-WIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE3-WIN-NEXT:    movsd %xmm0, (%esp)
-; X86-SSE3-WIN-NEXT:    fldl (%esp)
-; X86-SSE3-WIN-NEXT:    fisttpll (%esp)
-; X86-SSE3-WIN-NEXT:    movl (%esp), %eax
-; X86-SSE3-WIN-NEXT:    movl %ebp, %esp
-; X86-SSE3-WIN-NEXT:    popl %ebp
+; X86-SSE3-WIN-NEXT:    cvttsd2si %xmm0, %ecx
+; X86-SSE3-WIN-NEXT:    movl %ecx, %edx
+; X86-SSE3-WIN-NEXT:    sarl $31, %edx
+; X86-SSE3-WIN-NEXT:    subsd __real@41e0000000000000, %xmm0
+; X86-SSE3-WIN-NEXT:    cvttsd2si %xmm0, %eax
+; X86-SSE3-WIN-NEXT:    andl %edx, %eax
+; X86-SSE3-WIN-NEXT:    orl %ecx, %eax
 ; X86-SSE3-WIN-NEXT:    retl
 ;
 ; X86-SSE3-LIN-LABEL: d_to_u32:
 ; X86-SSE3-LIN:       # %bb.0:
-; X86-SSE3-LIN-NEXT:    subl $12, %esp
 ; X86-SSE3-LIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE3-LIN-NEXT:    movsd %xmm0, (%esp)
-; X86-SSE3-LIN-NEXT:    fldl (%esp)
-; X86-SSE3-LIN-NEXT:    fisttpll (%esp)
-; X86-SSE3-LIN-NEXT:    movl (%esp), %eax
-; X86-SSE3-LIN-NEXT:    addl $12, %esp
+; X86-SSE3-LIN-NEXT:    cvttsd2si %xmm0, %ecx
+; X86-SSE3-LIN-NEXT:    movl %ecx, %edx
+; X86-SSE3-LIN-NEXT:    sarl $31, %edx
+; X86-SSE3-LIN-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE3-LIN-NEXT:    cvttsd2si %xmm0, %eax
+; X86-SSE3-LIN-NEXT:    andl %edx, %eax
+; X86-SSE3-LIN-NEXT:    orl %ecx, %eax
 ; X86-SSE3-LIN-NEXT:    retl
 ;
 ; X64-SSE-LABEL: d_to_u32:
@@ -215,18 +185,29 @@ define i32 @d_to_u32(double %a) nounwind {
 ; X64-SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-SSE-NEXT:    retq
 ;
-; X86-SSE2-LABEL: d_to_u32:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE2-NEXT:    movapd %xmm0, %xmm2
-; X86-SSE2-NEXT:    subsd %xmm1, %xmm2
-; X86-SSE2-NEXT:    cvttsd2si %xmm2, %ecx
-; X86-SSE2-NEXT:    xorl $-2147483648, %ecx # imm = 0x80000000
-; X86-SSE2-NEXT:    cvttsd2si %xmm0, %eax
-; X86-SSE2-NEXT:    ucomisd %xmm0, %xmm1
-; X86-SSE2-NEXT:    cmovbel %ecx, %eax
-; X86-SSE2-NEXT:    retl
+; X86-SSE2-WIN-LABEL: d_to_u32:
+; X86-SSE2-WIN:       # %bb.0:
+; X86-SSE2-WIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-WIN-NEXT:    cvttsd2si %xmm0, %ecx
+; X86-SSE2-WIN-NEXT:    movl %ecx, %edx
+; X86-SSE2-WIN-NEXT:    sarl $31, %edx
+; X86-SSE2-WIN-NEXT:    subsd __real@41e0000000000000, %xmm0
+; X86-SSE2-WIN-NEXT:    cvttsd2si %xmm0, %eax
+; X86-SSE2-WIN-NEXT:    andl %edx, %eax
+; X86-SSE2-WIN-NEXT:    orl %ecx, %eax
+; X86-SSE2-WIN-NEXT:    retl
+;
+; X86-SSE2-LIN-LABEL: d_to_u32:
+; X86-SSE2-LIN:       # %bb.0:
+; X86-SSE2-LIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-LIN-NEXT:    cvttsd2si %xmm0, %ecx
+; X86-SSE2-LIN-NEXT:    movl %ecx, %edx
+; X86-SSE2-LIN-NEXT:    sarl $31, %edx
+; X86-SSE2-LIN-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-LIN-NEXT:    cvttsd2si %xmm0, %eax
+; X86-SSE2-LIN-NEXT:    andl %edx, %eax
+; X86-SSE2-LIN-NEXT:    orl %ecx, %eax
+; X86-SSE2-LIN-NEXT:    retl
 ;
 ; X86-SSE1-WIN-LABEL: d_to_u32:
 ; X86-SSE1-WIN:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
index 5bed83322fcb8..0ce9c87057467 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -153,18 +153,27 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE3-LIN-NEXT:    addl $12, %esp
 ; X86-SSE3-LIN-NEXT:    retl
 ;
-; X64-SSE-LABEL: f_to_u64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movaps %xmm0, %xmm2
-; X64-SSE-NEXT:    subss %xmm1, %xmm2
-; X64-SSE-NEXT:    cvttss2si %xmm2, %rax
-; X64-SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X64-SSE-NEXT:    xorq %rax, %rcx
-; X64-SSE-NEXT:    cvttss2si %xmm0, %rax
-; X64-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X64-SSE-NEXT:    cmovaeq %rcx, %rax
-; X64-SSE-NEXT:    retq
+; X64-SSE-WIN-LABEL: f_to_u64:
+; X64-SSE-WIN:       # %bb.0:
+; X64-SSE-WIN-NEXT:    cvttss2si %xmm0, %rcx
+; X64-SSE-WIN-NEXT:    movq %rcx, %rdx
+; X64-SSE-WIN-NEXT:    sarq $63, %rdx
+; X64-SSE-WIN-NEXT:    subss __real@5f000000(%rip), %xmm0
+; X64-SSE-WIN-NEXT:    cvttss2si %xmm0, %rax
+; X64-SSE-WIN-NEXT:    andq %rdx, %rax
+; X64-SSE-WIN-NEXT:    orq %rcx, %rax
+; X64-SSE-WIN-NEXT:    retq
+;
+; X64-SSE-LIN-LABEL: f_to_u64:
+; X64-SSE-LIN:       # %bb.0:
+; X64-SSE-LIN-NEXT:    cvttss2si %xmm0, %rcx
+; X64-SSE-LIN-NEXT:    movq %rcx, %rdx
+; X64-SSE-LIN-NEXT:    sarq $63, %rdx
+; X64-SSE-LIN-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-LIN-NEXT:    cvttss2si %xmm0, %rax
+; X64-SSE-LIN-NEXT:    andq %rdx, %rax
+; X64-SSE-LIN-NEXT:    orq %rcx, %rax
+; X64-SSE-LIN-NEXT:    retq
 ;
 ; X86-SSE2-WIN-LABEL: f_to_u64:
 ; X86-SSE2-WIN:       # %bb.0:
@@ -577,18 +586,27 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE3-LIN-NEXT:    addl $12, %esp
 ; X86-SSE3-LIN-NEXT:    retl
 ;
-; X64-SSE-LABEL: d_to_u64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE-NEXT:    movapd %xmm0, %xmm2
-; X64-SSE-NEXT:    subsd %xmm1, %xmm2
-; X64-SSE-NEXT:    cvttsd2si %xmm2, %rax
-; X64-SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X64-SSE-NEXT:    xorq %rax, %rcx
-; X64-SSE-NEXT:    cvttsd2si %xmm0, %rax
-; X64-SSE-NEXT:    ucomisd %xmm1, %xmm0
-; X64-SSE-NEXT:    cmovaeq %rcx, %rax
-; X64-SSE-NEXT:    retq
+; X64-SSE-WIN-LABEL: d_to_u64:
+; X64-SSE-WIN:       # %bb.0:
+; X64-SSE-WIN-NEXT:    cvttsd2si %xmm0, %rcx
+; X64-SSE-WIN-NEXT:    movq %rcx, %rdx
+; X64-SSE-WIN-NEXT:    sarq $63, %rdx
+; X64-SSE-WIN-NEXT:    subsd __real@43e0000000000000(%rip), %xmm0
+; X64-SSE-WIN-NEXT:    cvttsd2si %xmm0, %rax
+; X64-SSE-WIN-NEXT:    andq %rdx, %rax
+; X64-SSE-WIN-NEXT:    orq %rcx, %rax
+; X64-SSE-WIN-NEXT:    retq
+;
+; X64-SSE-LIN-LABEL: d_to_u64:
+; X64-SSE-LIN:       # %bb.0:
+; X64-SSE-LIN-NEXT:    cvttsd2si %xmm0, %rcx
+; X64-SSE-LIN-NEXT:    movq %rcx, %rdx
+; X64-SSE-LIN-NEXT:    sarq $63, %rdx
+; X64-SSE-LIN-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-LIN-NEXT:    cvttsd2si %xmm0, %rax
+; X64-SSE-LIN-NEXT:    andq %rdx, %rax
+; X64-SSE-LIN-NEXT:    orq %rcx, %rax
+; X64-SSE-LIN-NEXT:    retq
 ;
 ; X86-SSE2-WIN-LABEL: d_to_u64:
 ; X86-SSE2-WIN:       # %bb.0:
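
A note on the new memory operands: the Windows-triple checks reference
MSVC-style literal-pool labels while the Linux-triple checks load the same
constants from .LCPI pools. Decoding the __real@ hex suffixes as IEEE-754
values shows which bound each path subtracts before the second conversion:

  __real@4f000000         = 2^31 (float)
  __real@41e0000000000000 = 2^31 (double)
  __real@5f000000         = 2^63 (float)
  __real@43e0000000000000 = 2^63 (double)

i.e. the i32 lowerings subtract 2^31 and the i64 lowerings subtract 2^63.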

diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll
index 57911e1eb9674..1596316807439 100644
--- a/llvm/test/CodeGen/X86/vec_cast3.ll
+++ b/llvm/test/CodeGen/X86/vec_cast3.ll
@@ -117,13 +117,12 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) {
 define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2u32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; CHECK-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
-; CHECK-NEXT:    vsubps %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vcvttps2dq %xmm1, %xmm1
-; CHECK-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm1
+; CHECK-NEXT:    vpsrad $31, %xmm1, %xmm2
+; CHECK-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retl
   %res = fptoui <2 x float> %src to <2 x i32>
   ret <2 x i32> %res
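
The vector tests above apply the same trick lane-wise: cvttps2dq writes
0x80000000 into every lane whose value does not fit in a signed i32, so an
arithmetic shift (vpsrad $31) of the first conversion produces an all-ones
mask in exactly the overflowing lanes, replacing the old vcmpltps + vblendvps
pair. A minimal SSE2 intrinsics sketch of the v4f32 -> v4i32 path (helper
name illustrative; negative lanes are undefined for fptoui and not handled):

  #include <emmintrin.h>

  __m128i fptoui_v4f32_to_v4i32(__m128 x) {
    __m128i small = _mm_cvttps_epi32(x);                  // cvttps2dq
    __m128i mask  = _mm_srai_epi32(small, 31);            // psrad $31: all-ones in overflowed lanes
    __m128  adj   = _mm_sub_ps(x, _mm_set1_ps(0x1p31f));  // subps with a 2^31 splat
    __m128i big   = _mm_cvttps_epi32(adj);                // cvttps2dq
    return _mm_or_si128(small, _mm_and_si128(big, mask)); // pand; por
  }

Lanes in [0, 2^31) come straight from small; lanes in [2^31, 2^32) become
0x80000000 | (int)(x - 2^31), which is the unsigned result.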

diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index ff8e59c04c621..25b702db977aa 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -254,21 +254,21 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
 ; SSE-NEXT:    movapd %xmm0, %xmm1
 ; SSE-NEXT:    subsd %xmm2, %xmm1
 ; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttsd2si %xmm0, %rdx
-; SSE-NEXT:    ucomisd %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rdx
+; SSE-NEXT:    cvttsd2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm1
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    movapd %xmm0, %xmm3
-; SSE-NEXT:    subsd %xmm2, %xmm3
-; SSE-NEXT:    cvttsd2si %xmm3, %rax
-; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    subsd %xmm2, %xmm0
 ; SSE-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE-NEXT:    ucomisd %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rcx
-; SSE-NEXT:    movq %rcx, %xmm0
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
@@ -278,20 +278,21 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
 ; VEX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; VEX-NEXT:    vsubsd %xmm1, %xmm0, %xmm2
 ; VEX-NEXT:    vcvttsd2si %xmm2, %rax
-; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttsd2si %xmm0, %rdx
-; VEX-NEXT:    vucomisd %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rdx
+; VEX-NEXT:    vcvttsd2si %xmm0, %rcx
+; VEX-NEXT:    movq %rcx, %rdx
+; VEX-NEXT:    sarq $63, %rdx
+; VEX-NEXT:    andq %rax, %rdx
+; VEX-NEXT:    orq %rcx, %rdx
 ; VEX-NEXT:    vmovq %rdx, %xmm2
 ; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
-; VEX-NEXT:    vcvttsd2si %xmm3, %rax
-; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vsubsd %xmm1, %xmm0, %xmm1
+; VEX-NEXT:    vcvttsd2si %xmm1, %rax
 ; VEX-NEXT:    vcvttsd2si %xmm0, %rcx
-; VEX-NEXT:    vucomisd %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
+; VEX-NEXT:    movq %rcx, %rdx
+; VEX-NEXT:    sarq $63, %rdx
+; VEX-NEXT:    andq %rax, %rdx
+; VEX-NEXT:    orq %rcx, %rdx
+; VEX-NEXT:    vmovq %rdx, %xmm0
 ; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; VEX-NEXT:    retq
 ;
@@ -334,45 +335,24 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
 define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
 ; SSE-LABEL: fptoui_2f64_to_4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd %ecx, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    andpd %xmm2, %xmm0
+; SSE-NEXT:    orpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: fptoui_2f64_to_4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm3
-; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_2f64_to_4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; VEX-LABEL: fptoui_2f64_to_4i32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm1
+; VEX-NEXT:    vpsrad $31, %xmm1, %xmm2
+; VEX-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; VEX-NEXT:    vorpd %xmm0, %xmm1, %xmm0
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_2f64_to_4i32:
 ; AVX512F:       # %bb.0:
@@ -407,45 +387,24 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
 define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
 ; SSE-LABEL: fptoui_2f64_to_2i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    andpd %xmm2, %xmm0
+; SSE-NEXT:    orpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: fptoui_2f64_to_2i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm3
-; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_2f64_to_2i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; VEX-LABEL: fptoui_2f64_to_2i32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm1
+; VEX-NEXT:    vpsrad $31, %xmm1, %xmm2
+; VEX-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; VEX-NEXT:    vorpd %xmm0, %xmm1, %xmm0
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_2f64_to_2i32:
 ; AVX512F:       # %bb.0:
@@ -480,27 +439,24 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
 define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; SSE-LABEL: fptoui_4f64_to_2i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    andpd %xmm2, %xmm0
+; SSE-NEXT:    orpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_4f64_to_2i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovapd %xmm0, %xmm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vorpd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -508,15 +464,12 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovapd %xmm0, %xmm0
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vandpd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vorpd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -560,40 +513,41 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; SSE-NEXT:    movapd %xmm0, %xmm2
 ; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
 ; SSE-NEXT:    subsd %xmm3, %xmm0
-; SSE-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttsd2si %xmm2, %rdx
-; SSE-NEXT:    ucomisd %xmm3, %xmm2
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    cvttsd2si %xmm2, %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm0
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT:    movapd %xmm2, %xmm4
-; SSE-NEXT:    subsd %xmm3, %xmm4
-; SSE-NEXT:    cvttsd2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttsd2si %xmm2, %rdx
-; SSE-NEXT:    ucomisd %xmm3, %xmm2
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttsd2si %xmm2, %rax
+; SSE-NEXT:    subsd %xmm3, %xmm2
+; SSE-NEXT:    cvttsd2si %xmm2, %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm2
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE-NEXT:    movapd %xmm1, %xmm2
 ; SSE-NEXT:    subsd %xmm3, %xmm2
-; SSE-NEXT:    cvttsd2si %xmm2, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttsd2si %xmm1, %rdx
-; SSE-NEXT:    ucomisd %xmm3, %xmm1
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttsd2si %xmm2, %rax
+; SSE-NEXT:    cvttsd2si %xmm1, %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm2
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    movapd %xmm1, %xmm4
-; SSE-NEXT:    subsd %xmm3, %xmm4
-; SSE-NEXT:    cvttsd2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
 ; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    ucomisd %xmm3, %xmm1
-; SSE-NEXT:    cmovaeq %rcx, %rax
-; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    subsd %xmm3, %xmm1
+; SSE-NEXT:    cvttsd2si %xmm1, %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm1
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    retq
@@ -604,36 +558,39 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; AVX1-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-NEXT:    vsubsd %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vcvttsd2si %xmm3, %rax
-; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttsd2si %xmm2, %rdx
-; AVX1-NEXT:    vucomisd %xmm1, %xmm2
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttsd2si %xmm2, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm3
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
 ; AVX1-NEXT:    vsubsd %xmm1, %xmm2, %xmm4
 ; AVX1-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttsd2si %xmm2, %rdx
-; AVX1-NEXT:    vucomisd %xmm1, %xmm2
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttsd2si %xmm2, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm2
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vcvttsd2si %xmm3, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttsd2si %xmm0, %rdx
-; AVX1-NEXT:    vucomisd %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttsd2si %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm3
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm4
-; AVX1-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcvttsd2si %xmm1, %rax
 ; AVX1-NEXT:    vcvttsd2si %xmm0, %rcx
-; AVX1-NEXT:    vucomisd %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -644,36 +601,39 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vsubsd %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT:    vcvttsd2si %xmm3, %rax
-; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttsd2si %xmm2, %rdx
-; AVX2-NEXT:    vucomisd %xmm1, %xmm2
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttsd2si %xmm2, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm3
 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
 ; AVX2-NEXT:    vsubsd %xmm1, %xmm2, %xmm4
 ; AVX2-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttsd2si %xmm2, %rdx
-; AVX2-NEXT:    vucomisd %xmm1, %xmm2
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttsd2si %xmm2, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm2
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX2-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
 ; AVX2-NEXT:    vcvttsd2si %xmm3, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttsd2si %xmm0, %rdx
-; AVX2-NEXT:    vucomisd %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttsd2si %xmm0, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm3
 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT:    vsubsd %xmm1, %xmm0, %xmm4
-; AVX2-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vsubsd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vcvttsd2si %xmm1, %rax
 ; AVX2-NEXT:    vcvttsd2si %xmm0, %rcx
-; AVX2-NEXT:    vucomisd %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rcx
-; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -732,48 +692,44 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ; SSE-LABEL: fptoui_4f64_to_4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    movapd {{.*#+}} xmm2 = [2.147483648E+9,2.147483648E+9]
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm3
+; SSE-NEXT:    subpd %xmm2, %xmm1
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT:    movapd %xmm3, %xmm4
+; SSE-NEXT:    psrad $31, %xmm4
+; SSE-NEXT:    pand %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm1
+; SSE-NEXT:    subpd %xmm2, %xmm0
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm2
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_4f64_to_4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vorpd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fptoui_4f64_to_4i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vandpd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vorpd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -1199,40 +1155,34 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
 define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
 ; SSE-LABEL: fptoui_2f32_to_2i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    cmpltps %xmm2, %xmm1
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
-; SSE-NEXT:    subps %xmm2, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    andps %xmm1, %xmm3
-; SSE-NEXT:    andnps %xmm0, %xmm1
-; SSE-NEXT:    orps %xmm3, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_2f32_to_2i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fptoui_2f32_to_2i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorps %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_2f32_to_2i32:
@@ -1267,40 +1217,34 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
 define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
 ; SSE-LABEL: fptoui_4f32_to_4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    cmpltps %xmm2, %xmm1
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
-; SSE-NEXT:    subps %xmm2, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    andps %xmm1, %xmm3
-; SSE-NEXT:    andnps %xmm0, %xmm1
-; SSE-NEXT:    orps %xmm3, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_4f32_to_4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fptoui_4f32_to_4i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorps %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_4f32_to_4i32:
@@ -1339,21 +1283,21 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
 ; SSE-NEXT:    movaps %xmm0, %xmm1
 ; SSE-NEXT:    subss %xmm2, %xmm1
 ; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm0, %rdx
-; SSE-NEXT:    ucomiss %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rdx
+; SSE-NEXT:    cvttss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    subss %xmm2, %xmm3
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    subss %xmm2, %xmm0
 ; SSE-NEXT:    cvttss2si %xmm0, %rcx
-; SSE-NEXT:    ucomiss %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rcx
-; SSE-NEXT:    movq %rcx, %xmm0
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
@@ -1363,20 +1307,21 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
 ; VEX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
 ; VEX-NEXT:    vcvttss2si %xmm2, %rax
-; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rdx
-; VEX-NEXT:    vucomiss %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rdx
+; VEX-NEXT:    vcvttss2si %xmm0, %rcx
+; VEX-NEXT:    movq %rcx, %rdx
+; VEX-NEXT:    sarq $63, %rdx
+; VEX-NEXT:    andq %rax, %rdx
+; VEX-NEXT:    orq %rcx, %rdx
 ; VEX-NEXT:    vmovq %rdx, %xmm2
 ; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm3
-; VEX-NEXT:    vcvttss2si %xmm3, %rax
-; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
+; VEX-NEXT:    vcvttss2si %xmm1, %rax
 ; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vucomiss %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
+; VEX-NEXT:    movq %rcx, %rdx
+; VEX-NEXT:    sarq $63, %rdx
+; VEX-NEXT:    andq %rax, %rdx
+; VEX-NEXT:    orq %rcx, %rdx
+; VEX-NEXT:    vmovq %rdx, %xmm0
 ; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; VEX-NEXT:    retq
 ;
@@ -1424,21 +1369,21 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 ; SSE-NEXT:    movaps %xmm0, %xmm1
 ; SSE-NEXT:    subss %xmm2, %xmm1
 ; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm0, %rdx
-; SSE-NEXT:    ucomiss %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rdx
+; SSE-NEXT:    cvttss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    subss %xmm2, %xmm3
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    subss %xmm2, %xmm0
 ; SSE-NEXT:    cvttss2si %xmm0, %rcx
-; SSE-NEXT:    ucomiss %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rcx
-; SSE-NEXT:    movq %rcx, %xmm0
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
@@ -1449,18 +1394,19 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 ; VEX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; VEX-NEXT:    vsubss %xmm2, %xmm1, %xmm3
 ; VEX-NEXT:    vcvttss2si %xmm3, %rax
-; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm1, %rdx
-; VEX-NEXT:    vucomiss %xmm2, %xmm1
-; VEX-NEXT:    cmovaeq %rax, %rdx
+; VEX-NEXT:    vcvttss2si %xmm1, %rcx
+; VEX-NEXT:    movq %rcx, %rdx
+; VEX-NEXT:    sarq $63, %rdx
+; VEX-NEXT:    andq %rax, %rdx
+; VEX-NEXT:    orq %rcx, %rdx
 ; VEX-NEXT:    vsubss %xmm2, %xmm0, %xmm1
 ; VEX-NEXT:    vcvttss2si %xmm1, %rax
-; VEX-NEXT:    xorq %rcx, %rax
 ; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vucomiss %xmm2, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
+; VEX-NEXT:    movq %rcx, %rsi
+; VEX-NEXT:    sarq $63, %rsi
+; VEX-NEXT:    andq %rax, %rsi
+; VEX-NEXT:    orq %rcx, %rsi
+; VEX-NEXT:    vmovq %rsi, %xmm0
 ; VEX-NEXT:    vmovq %rdx, %xmm1
 ; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; VEX-NEXT:    retq
@@ -1507,51 +1453,41 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
 ; SSE-LABEL: fptoui_8f32_to_8i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    cmpltps %xmm4, %xmm2
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
-; SSE-NEXT:    subps %xmm4, %xmm0
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT:    xorps %xmm5, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm3
-; SSE-NEXT:    andnps %xmm0, %xmm2
-; SSE-NEXT:    orps %xmm3, %xmm2
-; SSE-NEXT:    movaps %xmm1, %xmm3
-; SSE-NEXT:    cmpltps %xmm4, %xmm3
-; SSE-NEXT:    cvttps2dq %xmm1, %xmm0
-; SSE-NEXT:    subps %xmm4, %xmm1
-; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE-NEXT:    xorps %xmm5, %xmm1
-; SSE-NEXT:    andps %xmm3, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm3
-; SSE-NEXT:    orps %xmm0, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    subps %xmm2, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm4
+; SSE-NEXT:    movdqa %xmm3, %xmm0
+; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    por %xmm3, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm3
+; SSE-NEXT:    subps %xmm2, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm3, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    por %xmm3, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_8f32_to_8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vsubps %ymm1, %ymm0, %ymm1
-; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
-; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm1
+; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendvps %ymm1, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fptoui_8f32_to_8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vsubps %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorps %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_8f32_to_8i32:
@@ -1587,43 +1523,43 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    subss %xmm1, %xmm2
-; SSE-NEXT:    cvttss2si %xmm2, %rcx
-; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm0, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttss2si %xmm2, %rax
+; SSE-NEXT:    cvttss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm3, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm3
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttss2si %xmm3, %rax
+; SSE-NEXT:    subss %xmm1, %xmm3
+; SSE-NEXT:    cvttss2si %xmm3, %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm3
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm3, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm3
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttss2si %xmm3, %rax
+; SSE-NEXT:    subss %xmm1, %xmm3
+; SSE-NEXT:    cvttss2si %xmm3, %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm3
 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    movaps %xmm0, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
 ; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
-; SSE-NEXT:    cmovaeq %rcx, %rax
-; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    cvttss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm1
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    retq
@@ -1634,36 +1570,39 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX1-NEXT:    vsubss %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm2, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm2
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttss2si %xmm2, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm2
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
 ; AVX1-NEXT:    vsubss %xmm1, %xmm3, %xmm4
 ; AVX1-NEXT:    vcvttss2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm3, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm3
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttss2si %xmm3, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm3
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm0, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm3
 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm4
-; AVX1-NEXT:    vcvttss2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcvttss2si %xmm1, %rax
 ; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -1674,36 +1613,39 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm2, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm2
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttss2si %xmm2, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm2
 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
 ; AVX2-NEXT:    vsubss %xmm1, %xmm3, %xmm4
 ; AVX2-NEXT:    vcvttss2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm3, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm3
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttss2si %xmm3, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm3
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm3
 ; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm0, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm3
 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm4
-; AVX2-NEXT:    vcvttss2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vcvttss2si %xmm1, %rax
 ; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rcx
-; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -1765,43 +1707,43 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    subss %xmm1, %xmm2
-; SSE-NEXT:    cvttss2si %xmm2, %rcx
-; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm0, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttss2si %xmm2, %rax
+; SSE-NEXT:    cvttss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm3, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm3
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttss2si %xmm3, %rax
+; SSE-NEXT:    subss %xmm1, %xmm3
+; SSE-NEXT:    cvttss2si %xmm3, %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm3
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm3, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm3
-; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    cvttss2si %xmm3, %rax
+; SSE-NEXT:    subss %xmm1, %xmm3
+; SSE-NEXT:    cvttss2si %xmm3, %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm3
 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    movaps %xmm0, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
 ; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
-; SSE-NEXT:    cmovaeq %rcx, %rax
-; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    cvttss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm1
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    retq
@@ -1812,36 +1754,39 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX1-NEXT:    vsubss %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm2, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm2
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttss2si %xmm2, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm2
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
 ; AVX1-NEXT:    vsubss %xmm1, %xmm3, %xmm4
 ; AVX1-NEXT:    vcvttss2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm3, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm3
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttss2si %xmm3, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm3
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm0, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm3
 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm4
-; AVX1-NEXT:    vcvttss2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcvttss2si %xmm1, %rax
 ; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    andq %rax, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -1852,36 +1797,39 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm2, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm2
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttss2si %xmm2, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm2
 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
 ; AVX2-NEXT:    vsubss %xmm1, %xmm3, %xmm4
 ; AVX2-NEXT:    vcvttss2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm3, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm3
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttss2si %xmm3, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm3
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm3
 ; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm0, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm3
 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm4
-; AVX2-NEXT:    vcvttss2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vcvttss2si %xmm1, %rax
 ; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rcx
-; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    andq %rax, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -2807,21 +2755,21 @@ define <2 x i64> @fptoui_2f32_to_2i64_load(<2 x float>* %x) {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    subss %xmm2, %xmm0
 ; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm1, %rdx
-; SSE-NEXT:    ucomiss %xmm2, %xmm1
-; SSE-NEXT:    cmovaeq %rax, %rdx
+; SSE-NEXT:    cvttss2si %xmm1, %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
 ; SSE-NEXT:    movq %rdx, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE-NEXT:    movaps %xmm1, %xmm3
-; SSE-NEXT:    subss %xmm2, %xmm3
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttss2si %xmm1, %rax
+; SSE-NEXT:    subss %xmm2, %xmm1
 ; SSE-NEXT:    cvttss2si %xmm1, %rcx
-; SSE-NEXT:    ucomiss %xmm2, %xmm1
-; SSE-NEXT:    cmovaeq %rax, %rcx
-; SSE-NEXT:    movq %rcx, %xmm1
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    sarq $63, %rdx
+; SSE-NEXT:    andq %rcx, %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm1
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
@@ -2831,20 +2779,21 @@ define <2 x i64> @fptoui_2f32_to_2i64_load(<2 x float>* %x) {
 ; VEX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
 ; VEX-NEXT:    vcvttss2si %xmm2, %rax
-; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rdx
-; VEX-NEXT:    vucomiss %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rdx
+; VEX-NEXT:    vcvttss2si %xmm0, %rcx
+; VEX-NEXT:    movq %rcx, %rdx
+; VEX-NEXT:    sarq $63, %rdx
+; VEX-NEXT:    andq %rax, %rdx
+; VEX-NEXT:    orq %rcx, %rdx
 ; VEX-NEXT:    vmovq %rdx, %xmm2
 ; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm3
-; VEX-NEXT:    vcvttss2si %xmm3, %rax
-; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
+; VEX-NEXT:    vcvttss2si %xmm1, %rax
 ; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vucomiss %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
+; VEX-NEXT:    movq %rcx, %rdx
+; VEX-NEXT:    sarq $63, %rdx
+; VEX-NEXT:    andq %rax, %rdx
+; VEX-NEXT:    orq %rcx, %rdx
+; VEX-NEXT:    vmovq %rdx, %xmm0
 ; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; VEX-NEXT:    retq
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
index 7612b1f746469..6842b8a0ef1b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,XOP
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX256DQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256DQ
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -126,50 +126,11 @@ define void @fptoui_8f64_8i32() #0 {
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX1-LABEL: @fptoui_8f64_8i32(
-; AVX1-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX1-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX1-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX1-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX1-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX1-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i32
-; AVX1-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i32
-; AVX1-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i32
-; AVX1-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i32
-; AVX1-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i32
-; AVX1-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i32
-; AVX1-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i32
-; AVX1-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i32
-; AVX1-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX1-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX1-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX1-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX1-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; AVX1-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; AVX1-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; AVX1-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
-; AVX1-NEXT:    ret void
-;
-; XOP-LABEL: @fptoui_8f64_8i32(
-; XOP-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; XOP-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
-; XOP-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
-; XOP-NEXT:    ret void
-;
-; AVX2-LABEL: @fptoui_8f64_8i32(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
-; AVX2-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
-; AVX2-NEXT:    ret void
-;
-; AVX512-LABEL: @fptoui_8f64_8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
-; AVX512-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
-; AVX512-NEXT:    ret void
+; AVX-LABEL: @fptoui_8f64_8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
+; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
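
For readers decoding the new check lines above: the recurring cvttss2si / sarq $63 / andq / orq sequence is a branch-free select. cvttss2si produces 0x8000000000000000 for out-of-range inputs, so the sign bit of the in-range conversion can mask in the biased conversion, with no compare and no cmov. Below is a minimal C sketch of the scalar f32 -> u64 case, as an illustration only, not the lowering code itself. The 2^63 bias is an assumption on my part (the constant behind the "movss ... mem[0],zero,zero,zero" loads is not visible in the checks), and _mm_cvttss_si64 is used so the out-of-range result matches the hardware instruction rather than invoking C's undefined float-to-integer cast.

    #include <immintrin.h>
    #include <stdint.h>

    /* Sketch of the pattern in the updated checks:
       result = small | (big & (small >> 63)), where
       small = cvttss2si(x) and big = cvttss2si(x - bias).
       Assumes an arithmetic right shift on int64_t (true for the
       compilers these tests target) and a 2^63 bias. */
    static uint64_t fptoui_f32_to_u64(float x) {
      const float bias = 9223372036854775808.0f;             /* 2^63, assumed */
      int64_t small = _mm_cvttss_si64(_mm_set_ss(x));        /* 0x8000... if x >= 2^63 */
      int64_t big   = _mm_cvttss_si64(_mm_set_ss(x - bias)); /* in range once x >= 2^63 */
      int64_t mask  = small >> 63;        /* sarq $63: all-ones iff small overflowed */
      return (uint64_t)(small | (big & mask));  /* andq/orq replace ucomiss+cmovaeq */
    }

The sar/and/or triple is three cheap single-uop ALU instructions on the conversion results, which is why the new output can drop both the movabsq immediate and the flag-setting ucomiss that previously fed cmovaeq.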


        

