[Mlir-commits] [mlir] 37455b1 - Revert "Reland "[X86][RFC] Enable `_Float16` type support on X86 following the psABI""

Benjamin Kramer llvmlistbot at llvm.org
Wed Jun 15 04:24:55 PDT 2022


Author: Thomas Joerg
Date: 2022-06-15T13:24:24+02:00
New Revision: 37455b1f71d32674406caa0609495d8d0db69e4e

URL: https://github.com/llvm/llvm-project/commit/37455b1f71d32674406caa0609495d8d0db69e4e
DIFF: https://github.com/llvm/llvm-project/commit/37455b1f71d32674406caa0609495d8d0db69e4e.diff

LOG: Revert "Reland "[X86][RFC] Enable `_Float16` type support on X86 following the psABI""

This reverts commit 6e02e27536b9de25a651cfc9c2966ce471169355.

This introduces a crash in the backend. A reproducer in MLIR's LLVM
dialect follows; a sketch of one way to run it appears after the module.
Let me know if you have trouble reproducing this.

module {
  llvm.func @malloc(i64) -> !llvm.ptr<i8>
  llvm.func @_mlir_ciface_tf_report_error(!llvm.ptr<i8>, i32, !llvm.ptr<i8>)
  llvm.mlir.global internal constant @error_message_2208944672953921889("failed to allocate memory at loc(\22-\22:3:8)\00")
  llvm.func @_mlir_ciface_tf_alloc(!llvm.ptr<i8>, i64, i64, i32, i32, !llvm.ptr<i32>) -> !llvm.ptr<i8>
  llvm.func @Rsqrt_CPU_DT_HALF_DT_HALF(%arg0: !llvm.ptr<i8>, %arg1: i64, %arg2: !llvm.ptr<i8>) -> !llvm.struct<(i64, ptr<i8>)> attributes {llvm.emit_c_interface, tf_entry} {
    %0 = llvm.mlir.constant(8 : i32) : i32
    %1 = llvm.mlir.constant(8 : index) : i64
    %2 = llvm.mlir.constant(2 : index) : i64
    %3 = llvm.mlir.constant(dense<0.000000e+00> : vector<4xf16>) : vector<4xf16>
    %4 = llvm.mlir.constant(dense<[0, 1, 2, 3]> : vector<4xi32>) : vector<4xi32>
    %5 = llvm.mlir.constant(dense<1.000000e+00> : vector<4xf16>) : vector<4xf16>
    %6 = llvm.mlir.constant(false) : i1
    %7 = llvm.mlir.constant(1 : i32) : i32
    %8 = llvm.mlir.constant(0 : i32) : i32
    %9 = llvm.mlir.constant(4 : index) : i64
    %10 = llvm.mlir.constant(0 : index) : i64
    %11 = llvm.mlir.constant(1 : index) : i64
    %12 = llvm.mlir.constant(-1 : index) : i64
    %13 = llvm.mlir.null : !llvm.ptr<f16>
    %14 = llvm.getelementptr %13[%9] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %15 = llvm.ptrtoint %14 : !llvm.ptr<f16> to i64
    %16 = llvm.alloca %15 x f16 {alignment = 32 : i64} : (i64) -> !llvm.ptr<f16>
    %17 = llvm.alloca %15 x f16 {alignment = 32 : i64} : (i64) -> !llvm.ptr<f16>
    %18 = llvm.mlir.null : !llvm.ptr<i64>
    %19 = llvm.getelementptr %18[%arg1] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %20 = llvm.ptrtoint %19 : !llvm.ptr<i64> to i64
    %21 = llvm.alloca %20 x i64 : (i64) -> !llvm.ptr<i64>
    llvm.br ^bb1(%10 : i64)
  ^bb1(%22: i64):  // 2 preds: ^bb0, ^bb2
    %23 = llvm.icmp "slt" %22, %arg1 : i64
    llvm.cond_br %23, ^bb2, ^bb3
  ^bb2:  // pred: ^bb1
    %24 = llvm.bitcast %arg2 : !llvm.ptr<i8> to !llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64)>>
    %25 = llvm.getelementptr %24[%10, 2] : (!llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64)>>, i64) -> !llvm.ptr<i64>
    %26 = llvm.add %22, %11  : i64
    %27 = llvm.getelementptr %25[%26] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %28 = llvm.load %27 : !llvm.ptr<i64>
    %29 = llvm.getelementptr %21[%22] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    llvm.store %28, %29 : !llvm.ptr<i64>
    llvm.br ^bb1(%26 : i64)
  ^bb3:  // pred: ^bb1
    llvm.br ^bb4(%10, %11 : i64, i64)
  ^bb4(%30: i64, %31: i64):  // 2 preds: ^bb3, ^bb5
    %32 = llvm.icmp "slt" %30, %arg1 : i64
    llvm.cond_br %32, ^bb5, ^bb6
  ^bb5:  // pred: ^bb4
    %33 = llvm.bitcast %arg2 : !llvm.ptr<i8> to !llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64)>>
    %34 = llvm.getelementptr %33[%10, 2] : (!llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64)>>, i64) -> !llvm.ptr<i64>
    %35 = llvm.add %30, %11  : i64
    %36 = llvm.getelementptr %34[%35] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %37 = llvm.load %36 : !llvm.ptr<i64>
    %38 = llvm.mul %37, %31  : i64
    llvm.br ^bb4(%35, %38 : i64, i64)
  ^bb6:  // pred: ^bb4
    %39 = llvm.bitcast %arg2 : !llvm.ptr<i8> to !llvm.ptr<ptr<f16>>
    %40 = llvm.getelementptr %39[%11] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    %41 = llvm.load %40 : !llvm.ptr<ptr<f16>>
    %42 = llvm.getelementptr %13[%11] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %43 = llvm.ptrtoint %42 : !llvm.ptr<f16> to i64
    %44 = llvm.alloca %7 x i32 : (i32) -> !llvm.ptr<i32>
    llvm.store %8, %44 : !llvm.ptr<i32>
    %45 = llvm.call @_mlir_ciface_tf_alloc(%arg0, %31, %43, %8, %7, %44) : (!llvm.ptr<i8>, i64, i64, i32, i32, !llvm.ptr<i32>) -> !llvm.ptr<i8>
    %46 = llvm.bitcast %45 : !llvm.ptr<i8> to !llvm.ptr<f16>
    %47 = llvm.icmp "eq" %31, %10 : i64
    %48 = llvm.or %6, %47  : i1
    %49 = llvm.mlir.null : !llvm.ptr<i8>
    %50 = llvm.icmp "ne" %45, %49 : !llvm.ptr<i8>
    %51 = llvm.or %50, %48  : i1
    llvm.cond_br %51, ^bb7, ^bb13
  ^bb7:  // pred: ^bb6
    %52 = llvm.urem %31, %9  : i64
    %53 = llvm.sub %31, %52  : i64
    llvm.br ^bb8(%10 : i64)
  ^bb8(%54: i64):  // 2 preds: ^bb7, ^bb9
    %55 = llvm.icmp "slt" %54, %53 : i64
    llvm.cond_br %55, ^bb9, ^bb10
  ^bb9:  // pred: ^bb8
    %56 = llvm.mul %54, %11  : i64
    %57 = llvm.add %56, %10  : i64
    %58 = llvm.add %57, %10  : i64
    %59 = llvm.getelementptr %41[%58] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %60 = llvm.bitcast %59 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    %61 = llvm.load %60 {alignment = 2 : i64} : !llvm.ptr<vector<4xf16>>
    %62 = "llvm.intr.sqrt"(%61) : (vector<4xf16>) -> vector<4xf16>
    %63 = llvm.fdiv %5, %62  : vector<4xf16>
    %64 = llvm.getelementptr %46[%58] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %65 = llvm.bitcast %64 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    llvm.store %63, %65 {alignment = 2 : i64} : !llvm.ptr<vector<4xf16>>
    %66 = llvm.add %54, %9  : i64
    llvm.br ^bb8(%66 : i64)
  ^bb10:  // pred: ^bb8
    %67 = llvm.icmp "ult" %53, %31 : i64
    llvm.cond_br %67, ^bb11, ^bb12
  ^bb11:  // pred: ^bb10
    %68 = llvm.mul %53, %12  : i64
    %69 = llvm.add %31, %68  : i64
    %70 = llvm.mul %53, %11  : i64
    %71 = llvm.add %70, %10  : i64
    %72 = llvm.trunc %69 : i64 to i32
    %73 = llvm.mlir.undef : vector<4xi32>
    %74 = llvm.insertelement %72, %73[%8 : i32] : vector<4xi32>
    %75 = llvm.shufflevector %74, %73 [0 : i32, 0 : i32, 0 : i32, 0 : i32] : vector<4xi32>, vector<4xi32>
    %76 = llvm.icmp "slt" %4, %75 : vector<4xi32>
    %77 = llvm.add %71, %10  : i64
    %78 = llvm.getelementptr %41[%77] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %79 = llvm.bitcast %78 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    %80 = llvm.intr.masked.load %79, %76, %3 {alignment = 2 : i32} : (!llvm.ptr<vector<4xf16>>, vector<4xi1>, vector<4xf16>) -> vector<4xf16>
    %81 = llvm.bitcast %16 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    llvm.store %80, %81 : !llvm.ptr<vector<4xf16>>
    %82 = llvm.load %81 {alignment = 2 : i64} : !llvm.ptr<vector<4xf16>>
    %83 = "llvm.intr.sqrt"(%82) : (vector<4xf16>) -> vector<4xf16>
    %84 = llvm.fdiv %5, %83  : vector<4xf16>
    %85 = llvm.bitcast %17 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    llvm.store %84, %85 {alignment = 2 : i64} : !llvm.ptr<vector<4xf16>>
    %86 = llvm.load %85 : !llvm.ptr<vector<4xf16>>
    %87 = llvm.getelementptr %46[%77] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %88 = llvm.bitcast %87 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    llvm.intr.masked.store %86, %88, %76 {alignment = 2 : i32} : vector<4xf16>, vector<4xi1> into !llvm.ptr<vector<4xf16>>
    llvm.br ^bb12
  ^bb12:  // 2 preds: ^bb10, ^bb11
    %89 = llvm.mul %2, %1  : i64
    %90 = llvm.mul %arg1, %2  : i64
    %91 = llvm.add %90, %11  : i64
    %92 = llvm.mul %91, %1  : i64
    %93 = llvm.add %89, %92  : i64
    %94 = llvm.alloca %93 x i8 : (i64) -> !llvm.ptr<i8>
    %95 = llvm.bitcast %94 : !llvm.ptr<i8> to !llvm.ptr<ptr<f16>>
    llvm.store %46, %95 : !llvm.ptr<ptr<f16>>
    %96 = llvm.getelementptr %95[%11] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    llvm.store %46, %96 : !llvm.ptr<ptr<f16>>
    %97 = llvm.getelementptr %95[%2] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    %98 = llvm.bitcast %97 : !llvm.ptr<ptr<f16>> to !llvm.ptr<i64>
    llvm.store %10, %98 : !llvm.ptr<i64>
    %99 = llvm.bitcast %94 : !llvm.ptr<i8> to !llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64, i64)>>
    %100 = llvm.getelementptr %99[%10, 3] : (!llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64, i64)>>, i64) -> !llvm.ptr<i64>
    %101 = llvm.getelementptr %100[%arg1] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %102 = llvm.sub %arg1, %11  : i64
    llvm.br ^bb14(%102, %11 : i64, i64)
  ^bb13:  // pred: ^bb6
    %103 = llvm.mlir.addressof @error_message_2208944672953921889 : !llvm.ptr<array<42 x i8>>
    %104 = llvm.getelementptr %103[%10, %10] : (!llvm.ptr<array<42 x i8>>, i64, i64) -> !llvm.ptr<i8>
    llvm.call @_mlir_ciface_tf_report_error(%arg0, %0, %104) : (!llvm.ptr<i8>, i32, !llvm.ptr<i8>) -> ()
    %105 = llvm.mul %2, %1  : i64
    %106 = llvm.mul %2, %10  : i64
    %107 = llvm.add %106, %11  : i64
    %108 = llvm.mul %107, %1  : i64
    %109 = llvm.add %105, %108  : i64
    %110 = llvm.alloca %109 x i8 : (i64) -> !llvm.ptr<i8>
    %111 = llvm.bitcast %110 : !llvm.ptr<i8> to !llvm.ptr<ptr<f16>>
    llvm.store %13, %111 : !llvm.ptr<ptr<f16>>
    %112 = llvm.getelementptr %111[%11] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    llvm.store %13, %112 : !llvm.ptr<ptr<f16>>
    %113 = llvm.getelementptr %111[%2] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    %114 = llvm.bitcast %113 : !llvm.ptr<ptr<f16>> to !llvm.ptr<i64>
    llvm.store %10, %114 : !llvm.ptr<i64>
    %115 = llvm.call @malloc(%109) : (i64) -> !llvm.ptr<i8>
    "llvm.intr.memcpy"(%115, %110, %109, %6) : (!llvm.ptr<i8>, !llvm.ptr<i8>, i64, i1) -> ()
    %116 = llvm.mlir.undef : !llvm.struct<(i64, ptr<i8>)>
    %117 = llvm.insertvalue %10, %116[0] : !llvm.struct<(i64, ptr<i8>)>
    %118 = llvm.insertvalue %115, %117[1] : !llvm.struct<(i64, ptr<i8>)>
    llvm.return %118 : !llvm.struct<(i64, ptr<i8>)>
  ^bb14(%119: i64, %120: i64):  // 2 preds: ^bb12, ^bb15
    %121 = llvm.icmp "sge" %119, %10 : i64
    llvm.cond_br %121, ^bb15, ^bb16
  ^bb15:  // pred: ^bb14
    %122 = llvm.getelementptr %21[%119] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %123 = llvm.load %122 : !llvm.ptr<i64>
    %124 = llvm.getelementptr %100[%119] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    llvm.store %123, %124 : !llvm.ptr<i64>
    %125 = llvm.getelementptr %101[%119] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    llvm.store %120, %125 : !llvm.ptr<i64>
    %126 = llvm.mul %120, %123  : i64
    %127 = llvm.sub %119, %11  : i64
    llvm.br ^bb14(%127, %126 : i64, i64)
  ^bb16:  // pred: ^bb14
    %128 = llvm.call @malloc(%93) : (i64) -> !llvm.ptr<i8>
    "llvm.intr.memcpy"(%128, %94, %93, %6) : (!llvm.ptr<i8>, !llvm.ptr<i8>, i64, i1) -> ()
    %129 = llvm.mlir.undef : !llvm.struct<(i64, ptr<i8>)>
    %130 = llvm.insertvalue %arg1, %129[0] : !llvm.struct<(i64, ptr<i8>)>
    %131 = llvm.insertvalue %128, %130[1] : !llvm.struct<(i64, ptr<i8>)>
    llvm.return %131 : !llvm.struct<(i64, ptr<i8>)>
  }
  llvm.func @_mlir_ciface_Rsqrt_CPU_DT_HALF_DT_HALF(%arg0: !llvm.ptr<struct<(i64, ptr<i8>)>>, %arg1: !llvm.ptr<i8>, %arg2: !llvm.ptr<struct<(i64, ptr<i8>)>>) attributes {llvm.emit_c_interface, tf_entry} {
    %0 = llvm.load %arg2 : !llvm.ptr<struct<(i64, ptr<i8>)>>
    %1 = llvm.extractvalue %0[0] : !llvm.struct<(i64, ptr<i8>)>
    %2 = llvm.extractvalue %0[1] : !llvm.struct<(i64, ptr<i8>)>
    %3 = llvm.call @Rsqrt_CPU_DT_HALF_DT_HALF(%arg1, %1, %2) : (!llvm.ptr<i8>, i64, !llvm.ptr<i8>) -> !llvm.struct<(i64, ptr<i8>)>
    llvm.store %3, %arg0 : !llvm.ptr<struct<(i64, ptr<i8>)>>
    llvm.return
  }
}
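
A minimal sketch of one way to exercise the backend with this
reproducer, assuming the module above is saved as repro.mlir (the file
name and the exact flags are assumptions): translate it to LLVM IR and
pipe it into llc, e.g.
`mlir-translate --mlir-to-llvmir repro.mlir | llc -mtriple=x86_64-unknown-linux-gnu -o /dev/null`.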

Added: 
    

Modified: 
    llvm/docs/ReleaseNotes.rst
    llvm/lib/Target/X86/X86FastISel.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86InstrAVX512.td
    llvm/lib/Target/X86/X86InstrCompiler.td
    llvm/lib/Target/X86/X86InstrInfo.cpp
    llvm/lib/Target/X86/X86InstrSSE.td
    llvm/lib/Target/X86/X86InstrVecCompiler.td
    llvm/lib/Target/X86/X86InstructionSelector.cpp
    llvm/lib/Target/X86/X86RegisterInfo.td
    llvm/test/Analysis/CostModel/X86/fptoi_sat.ll
    llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir
    llvm/test/CodeGen/X86/atomic-non-integer.ll
    llvm/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll
    llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
    llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll
    llvm/test/CodeGen/X86/cvt16-2.ll
    llvm/test/CodeGen/X86/cvt16.ll
    llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
    llvm/test/CodeGen/X86/fmf-flags.ll
    llvm/test/CodeGen/X86/fp-round.ll
    llvm/test/CodeGen/X86/fp-roundeven.ll
    llvm/test/CodeGen/X86/fp128-cast-strict.ll
    llvm/test/CodeGen/X86/fpclamptosat.ll
    llvm/test/CodeGen/X86/fpclamptosat_vec.ll
    llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
    llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
    llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
    llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
    llvm/test/CodeGen/X86/freeze.ll
    llvm/test/CodeGen/X86/frem.ll
    llvm/test/CodeGen/X86/half-constrained.ll
    llvm/test/CodeGen/X86/half.ll
    llvm/test/CodeGen/X86/pr31088.ll
    llvm/test/CodeGen/X86/pr38533.ll
    llvm/test/CodeGen/X86/pr47000.ll
    llvm/test/CodeGen/X86/scheduler-asm-moves.mir
    llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
    llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
    llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
    llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
    llvm/test/CodeGen/X86/vec_fp_to_int.ll
    llvm/test/CodeGen/X86/vector-half-conversions.ll
    llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
    llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
    llvm/test/MC/X86/x86_64-asm-match.s
    mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir
    mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir

Removed: 
    


################################################################################
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 392395a17b966..d2813bb869736 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -138,7 +138,7 @@ Changes to the WebAssembly Backend
 Changes to the X86 Backend
 --------------------------
 
-* Support ``half`` type on SSE2 and above targets.
+* ...
 
 Changes to the OCaml bindings
 -----------------------------
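
The release-note line removed above summarized the user-visible
feature: scalar ``half``/`_Float16` codegen on any SSE2 target. A
minimal sketch of what that enabled, assuming a plain-SSE2 target
without AVX512-FP16 (the C snippet is illustrative; the lowering
description is inferred from the libcall setup removed in the
X86ISelLowering.cpp hunk below):

  /* Illustrative sketch: with the reverted patch, scalar _Float16
   * arithmetic on plain SSE2 was promoted to float and lowered via the
   * __extendhfsf2/__truncsfhf2 libcalls configured below; after this
   * revert, native scalar f16 codegen again requires AVX512-FP16. */
  _Float16 halve(_Float16 x) {
    return x / (_Float16)2.0f; /* extend to float, divide, truncate back */
  }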

diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index f2c362eeaa485..8698cd9c4eb03 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -148,7 +148,8 @@ class X86FastISel final : public FastISel {
   /// computed in an SSE register, not on the X87 floating point stack.
   bool isScalarFPTypeInSSEReg(EVT VT) const {
     return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
-           (VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16;
+           (VT == MVT::f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::f16 && Subtarget->hasFP16());
   }
 
   bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
@@ -2280,13 +2281,12 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
   default: return false;
   case MVT::i8:  Opc = X86::CMOV_GR8;   break;
   case MVT::i16: Opc = X86::CMOV_GR16;  break;
+  case MVT::f16: Opc = X86::CMOV_FR16X; break;
   case MVT::i32: Opc = X86::CMOV_GR32;  break;
-  case MVT::f16:
-    Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break;
-  case MVT::f32:
-    Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break;
-  case MVT::f64:
-    Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break;
+  case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X
+                                              : X86::CMOV_FR32; break;
+  case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X
+                                              : X86::CMOV_FR64; break;
   }
 
   const Value *Cond = I->getOperand(0);
@@ -3903,9 +3903,6 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
   unsigned Opc = 0;
   switch (VT.SimpleTy) {
   default: return 0;
-  case MVT::f16:
-    Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH;
-    break;
   case MVT::f32:
     Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
           : HasSSE1 ? X86::FsFLD0SS

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ca2fb6d92975d..5233d4bb874bf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -553,13 +553,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
 
-  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
-
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
-    // f16, f32 and f64 use SSE.
+    // f32 and f64 use SSE.
     // Set up the FP register classes.
-    addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
-                                                     : &X86::FR16RegClass);
     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                      : &X86::FR32RegClass);
     addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
@@ -591,37 +587,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FSINCOS, VT, Expand);
     }
 
-    // Half type will be promoted by default.
-    setOperationAction(ISD::FABS, MVT::f16, Promote);
-    setOperationAction(ISD::FNEG, MVT::f16, Promote);
-    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
-    setOperationAction(ISD::FADD, MVT::f16, Promote);
-    setOperationAction(ISD::FSUB, MVT::f16, Promote);
-    setOperationAction(ISD::FMUL, MVT::f16, Promote);
-    setOperationAction(ISD::FDIV, MVT::f16, Promote);
-    setOperationAction(ISD::FREM, MVT::f16, Promote);
-    setOperationAction(ISD::FMA, MVT::f16, Promote);
-    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
-    setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
-    setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
-    setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
-    setOperationAction(ISD::FSIN, MVT::f16, Promote);
-    setOperationAction(ISD::FCOS, MVT::f16, Promote);
-    setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
-    setOperationAction(ISD::BR_CC, MVT::f16, Promote);
-    setOperationAction(ISD::SETCC, MVT::f16, Promote);
-    setOperationAction(ISD::SELECT, MVT::f16, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
-    setOperationAction(ISD::FROUND, MVT::f16, Promote);
-    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
-    setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
-    setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
-    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
-    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
-
-    setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
-    setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
-
     // Lower this to MOVMSK plus an AND.
     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
@@ -696,10 +661,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     } else // SSE immediates.
       addLegalFPImmediate(APFloat(+0.0)); // xorpd
   }
-  // Support fp16 0 immediate.
-  if (isTypeLegal(MVT::f16))
-    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
-
   // Handle constrained floating-point operations of scalar.
   setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
   setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
@@ -709,6 +670,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
   setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
   setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
   setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
@@ -760,12 +722,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
     setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
     setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
-    if (isTypeLegal(MVT::f16)) {
-      setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
-    } else {
-      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
-    }
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
     // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
     // as Custom.
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
@@ -1488,13 +1445,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
   }
 
-  if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) {
-    setOperationAction(ISD::FP_ROUND,             MVT::f16,    Custom);
-    setOperationAction(ISD::STRICT_FP_ROUND,      MVT::f16,    Custom);
-    setOperationAction(ISD::FP_EXTEND,            MVT::f32,    Custom);
-    setOperationAction(ISD::STRICT_FP_EXTEND,     MVT::f32,    Custom);
-  }
-
   // This block controls legalization of the mask vector sizes that are
   // available with AVX512. 512-bit vectors are in a separate block controlled
   // by useAVX512Regs.
@@ -2023,6 +1973,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_ROUND,             MVT::f16, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND,      MVT::f16, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND,     MVT::f32, Legal);
+    if (isTypeLegal(MVT::f80)) {
+      setOperationAction(ISD::FP_EXTEND,          MVT::f80, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::f80, Custom);
+    }
 
     setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
     setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
@@ -2108,6 +2062,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::LOAD,  MVT::v4f16, Custom);
       setOperationAction(ISD::STORE, MVT::v4f16, Custom);
     }
+
+    // Support fp16 0 immediate
+    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
@@ -3957,7 +3914,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
         else if (Is64Bit && RegVT == MVT::i64)
           RC = &X86::GR64RegClass;
         else if (RegVT == MVT::f16)
-          RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
+          RC = &X86::FR16XRegClass;
         else if (RegVT == MVT::f32)
           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
         else if (RegVT == MVT::f64)
@@ -5712,7 +5669,8 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
 }
 
 bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
-  return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
+  return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
+         (VT == MVT::f16 && Subtarget.hasFP16());
 }
 
 bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
@@ -5724,7 +5682,8 @@ bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
 
 bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
   return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
-         (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
+         (VT == MVT::f32 && Subtarget.hasSSE1()) ||
+         (VT == MVT::f16 && Subtarget.hasFP16());
 }
 
 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
@@ -20781,16 +20740,6 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
   return Cvt;
 }
 
-template<typename T>
-static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
-  return VT == MVT::f16 && !Subtarget.hasFP16();
-}
-
-template<typename T>
-bool X86TargetLowering::isSoftFP16(T VT) const {
-  return ::isSoftFP16(VT, Subtarget);
-}
-
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
@@ -20832,10 +20781,6 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
          "Unknown SINT_TO_FP to lower!");
 
-  // Bail out when we don't have native conversion instructions.
-  if (isSoftFP16(VT))
-    return SDValue();
-
   bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
 
   // These are really Legal; return the operand so the caller accepts it as
@@ -21301,8 +21246,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   MVT DstVT = Op->getSimpleValueType(0);
   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
 
-  // Bail out when we don't have native conversion instructions.
-  if (DstVT == MVT::f128 || isSoftFP16(DstVT))
+  if (DstVT == MVT::f128)
     return SDValue();
 
   if (DstVT.isVector())
@@ -22125,16 +22069,6 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   SDValue Res;
-  if (isSoftFP16(SrcVT)) {
-    if (IsStrict)
-      return DAG.getNode(
-          Op.getOpcode(), dl, {VT, MVT::Other},
-          {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
-                              {Chain, Src})});
-    return DAG.getNode(Op.getOpcode(), dl, VT,
-                       DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Src));
-  }
-
   if (VT.isVector()) {
     if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
       MVT ResVT = MVT::v4i32;
@@ -22472,9 +22406,6 @@ SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
   SDValue Src = Op.getOperand(0);
   MVT SrcVT = Src.getSimpleValueType();
 
-  if (SrcVT == MVT::f16)
-    return SDValue();
-
   // If the source is in an SSE register, the node is Legal.
   if (isScalarFPTypeInSSEReg(SrcVT))
     return Op;
@@ -22546,7 +22477,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
 
   // This code is only for floats and doubles. Fall back to generic code for
   // anything else.
-  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
+  if (!isScalarFPTypeInSSEReg(SrcVT))
     return SDValue();
 
   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
@@ -22681,52 +22612,27 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
 
   SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
-  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
   MVT SVT = In.getSimpleValueType();
 
-  if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80))
+  if (VT == MVT::f128)
     return SDValue();
 
-  if (SVT == MVT::f16) {
-    if (Subtarget.hasFP16())
-      return Op;
-    if (!Subtarget.hasF16C())
-      return SDValue();
-
-    if (VT != MVT::f32) {
+  if (VT == MVT::f80) {
+    if (SVT == MVT::f16) {
+      assert(Subtarget.hasFP16() && "Unexpected features!");
+      RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
+      MakeLibCallOptions CallOptions;
+      std::pair<SDValue, SDValue> Tmp =
+          makeLibCall(DAG, LC, VT, In, CallOptions, DL,
+                      IsStrict ? Op.getOperand(0) : SDValue());
       if (IsStrict)
-        return DAG.getNode(
-            ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
-            {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
-                                {MVT::f32, MVT::Other}, {Chain, In})});
-
-      return DAG.getNode(ISD::FP_EXTEND, DL, VT,
-                         DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
-    }
-
-    In = DAG.getBitcast(MVT::i16, In);
-    In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
-                     getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
-                     DAG.getIntPtrConstant(0, DL));
-    SDValue Res;
-    if (IsStrict) {
-      Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
-                        {Chain, In});
-      Chain = Res.getValue(1);
-    } else {
-      Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
-                        DAG.getTargetConstant(4, DL, MVT::i32));
+        return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
+      else
+        return Tmp.first;
     }
-    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
-                      DAG.getIntPtrConstant(0, DL));
-    if (IsStrict)
-      return DAG.getMergeValues({Res, Chain}, DL);
-    return Res;
-  }
-
-  if (!SVT.isVector())
     return Op;
+  }
 
   if (SVT.getVectorElementType() == MVT::f16) {
     assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
@@ -22753,64 +22659,15 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
-
-  SDLoc DL(Op);
-  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
-  SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1);
   MVT VT = Op.getSimpleValueType();
   MVT SVT = In.getSimpleValueType();
 
-  if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
-    return SDValue();
-
-  if (VT == MVT::f16) {
-    if (Subtarget.hasFP16())
-      return Op;
-    if (!Subtarget.hasF16C())
-      return SDValue();
-
-    if (SVT != MVT::f32) {
-      if (IsStrict)
-        return DAG.getNode(
-            ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
-            {Chain,
-             DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other},
-                         {Chain, In, Op2}),
-             Op2});
-
-      return DAG.getNode(ISD::FP_ROUND, DL, VT,
-                         DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2),
-                         Op2);
-    }
-
-    SDValue Res;
-    SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
-                                        MVT::i32);
-    if (IsStrict) {
-      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
-                        DAG.getConstantFP(0, DL, MVT::v4f32), In,
-                        DAG.getIntPtrConstant(0, DL));
-      Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
-                        {Chain, Res, Rnd});
-      Chain = Res.getValue(1);
-    } else {
-      // FIXME: Should we use zeros for upper elements for non-strict?
-      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
-      Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
-    }
-
-    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
-                      DAG.getIntPtrConstant(0, DL));
-    Res = DAG.getBitcast(MVT::f16, Res);
-
-    if (IsStrict)
-      return DAG.getMergeValues({Res, Chain}, DL);
-
-    return Res;
-  }
+  // It's legal except when f128 is involved or we're converting f80->f16.
+  if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
+    return Op;
 
-  return Op;
+  return SDValue();
 }
 
 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -24833,11 +24690,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op1.getSimpleValueType();
   SDValue CC;
 
-  if (isSoftFP16(VT))
-    return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond,
-                                                DAG.getBitcast(MVT::i16, Op1),
-                                                DAG.getBitcast(MVT::i16, Op2)));
-
   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
   // are available or VBLENDV if AVX is available.
   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
@@ -25577,10 +25429,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   SDValue Dest  = Op.getOperand(2);
   SDLoc dl(Op);
 
-  // Bail out when we don't have native compare instructions.
   if (Cond.getOpcode() == ISD::SETCC &&
-      Cond.getOperand(0).getValueType() != MVT::f128 &&
-      !isSoftFP16(Cond.getOperand(0).getValueType())) {
+      Cond.getOperand(0).getValueType() != MVT::f128) {
     SDValue LHS = Cond.getOperand(0);
     SDValue RHS = Cond.getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -34302,7 +34152,6 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
 // conditional jump around it.
 static bool isCMOVPseudo(MachineInstr &MI) {
   switch (MI.getOpcode()) {
-  case X86::CMOV_FR16:
   case X86::CMOV_FR16X:
   case X86::CMOV_FR32:
   case X86::CMOV_FR32X:
@@ -35978,8 +35827,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::TLSCall_32:
   case X86::TLSCall_64:
     return EmitLoweredTLSCall(MI, BB);
-  case X86::CMOV_FR16:
-  case X86::CMOV_FR16X:
   case X86::CMOV_FR32:
   case X86::CMOV_FR32X:
   case X86::CMOV_FR64:
@@ -44254,7 +44101,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   // ignored in unsafe-math mode).
   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
+      VT != MVT::f80 && VT != MVT::f128 &&
       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
       (Subtarget.hasSSE2() ||
        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {

diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 7f2f2a4c76aef..cd0e4a348f48b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1640,8 +1640,6 @@ namespace llvm {
 
     bool needsCmpXchgNb(Type *MemType) const;
 
-    template<typename T> bool isSoftFP16(T VT) const;
-
     void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                                 MachineBasicBlock *DispatchBB, int FI) const;
 

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 48da7b3ac8827..918d11008d20c 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -476,7 +476,6 @@ let Predicates = [HasAVX512] in {
 def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>;
 def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>;
 def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
-def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>;
 def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
 def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
 }
@@ -509,23 +508,25 @@ let Predicates = [HasAVX512] in {
 def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>;
 def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>;
 def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>;
-def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>;
 def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>;
 def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>;
 def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>;
 def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>;
 def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>;
-def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>;
 def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>;
 def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>;
 }
 
+let Predicates = [HasFP16] in {
+def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>;
+}
+
 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
 // This is expanded by ExpandPostRAPseudos.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
-  def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "",
-                          [(set FR16X:$dst, fp16imm0)]>;
   def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
                           [(set FR32X:$dst, fp32imm0)]>;
   def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
@@ -534,6 +535,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
                             [(set VR128X:$dst, fp128imm0)]>;
 }
 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in {
+  def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "",
+                          [(set FR16X:$dst, fp16imm0)]>;
+}
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - VECTOR INSERT
 //
@@ -671,21 +678,21 @@ defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
 defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
 defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info,
-              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>;
 // Codegen pattern with the alternative types insert VEC128 into VEC512
 defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
 defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
                vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
 defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info,
-              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>;
 // Codegen pattern with the alternative types insert VEC256 into VEC512
 defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
               vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
 defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
               vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
 defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info,
-              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>;
 
 
 multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From,
@@ -972,7 +979,7 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info
 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
 defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info,
-          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>;
 
 // Codegen pattern with the alternative types extract VEC128 from VEC512
 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
@@ -980,14 +987,14 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
                  vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
 defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info,
-                 vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+                 vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>;
 // Codegen pattern with the alternative types extract VEC256 from VEC512
 defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
                  vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
 defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
                  vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
 defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info,
-                 vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+                 vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>;
 
 
 // A 128-bit extract from bits [255:128] of a 512-bit vector should use a
@@ -1013,10 +1020,6 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
           (v8i16 (VEXTRACTI128rr
                   (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
-def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
-          (v8f16 (VEXTRACTF128rr
-                  (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
-                  (iPTR 1)))>;
 def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
           (v16i8 (VEXTRACTI128rr
                   (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
@@ -1046,16 +1049,18 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
           (v8i16 (VEXTRACTI32x4Z256rr
                   (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
-def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
-          (v8f16 (VEXTRACTF32x4Z256rr
-                  (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
-                  (iPTR 1)))>;
 def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
           (v16i8 (VEXTRACTI32x4Z256rr
                   (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
 }
 
+let Predicates = [HasFP16, HasVLX] in
+def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
+          (v8f16 (VEXTRACTF32x4Z256rr
+                  (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
+                  (iPTR 1)))>;
+
 
 // Additional patterns for handling a bitcast between the vselect and the
 // extract_subvector.
@@ -1473,7 +1478,7 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
-let Predicates = [HasBWI] in {
+let Predicates = [HasFP16] in {
   def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)),
             (VPBROADCASTWZrm addr:$src)>;
 
@@ -1482,7 +1487,7 @@ let Predicates = [HasBWI] in {
   def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))),
             (VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>;
 }
-let Predicates = [HasVLX, HasBWI] in {
+let Predicates = [HasVLX, HasFP16] in {
   def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
             (VPBROADCASTWZ128rm addr:$src)>;
   def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
@@ -3758,9 +3763,6 @@ let Predicates = [HasBWI, NoVLX] in {
 
   defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>;
   defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>;
-
-  defm : mask_move_lowering<"VMOVDQU16Z", v8f16x_info, v32f16_info>;
-  defm : mask_move_lowering<"VMOVDQU16Z", v16f16x_info, v32f16_info>;
 }
 
 let Predicates = [HasAVX512] in {
@@ -3850,7 +3852,7 @@ let Predicates = [HasVLX] in {
   def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
             (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
 }
-let Predicates = [HasBWI] in {
+let Predicates = [HasFP16] in {
   def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))),
             (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>;
   def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)),
@@ -3885,7 +3887,7 @@ let Predicates = [HasBWI] in {
   def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask),
             (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>;
 }
-let Predicates = [HasBWI, HasVLX] in {
+let Predicates = [HasFP16, HasVLX] in {
   def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))),
             (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>;
   def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)),
@@ -4097,14 +4099,14 @@ def : Pat<(f64 (bitconvert VK64:$src)),
 //===----------------------------------------------------------------------===//
 
 multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
-                              X86VectorVTInfo _, Predicate prd = HasAVX512> {
-  let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in
+                              X86VectorVTInfo _,
+                              list<Predicate> prd = [HasAVX512, OptForSize]> {
+  let Predicates = prd in
   def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
              (ins _.RC:$src1, _.RC:$src2),
              !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
              _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
-  let Predicates = [prd] in {
   def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
               (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
               !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -4157,7 +4159,6 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
               !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
               [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
               NotMemoryFoldable;
-  }
 }
 
 defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
@@ -4167,7 +4168,7 @@ defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
                                   VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
 
 defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info,
-                                  HasFP16>,
+                                  [HasFP16]>,
                                   VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
 
 multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
@@ -4337,9 +4338,14 @@ def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
                       addr:$srcAddr)>;
 }
 
+defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>;
 defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
 defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
 
+defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
+                   (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
+                   (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
 defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
                    (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
 defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -4347,12 +4353,6 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
 defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
                    (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
 
-let Predicates = [HasFP16] in {
-defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>;
-defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
-                   (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
-defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
-                   (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
 defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
                    (v32i1 (insert_subvector
                            (v32i1 immAllZerosV),
@@ -4360,30 +4360,6 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
                            (iPTR 0))),
                    (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
                    GR8, sub_8bit>;
-
-defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
-                   (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
-defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
-                   (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
-defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
-                   (v32i1 (insert_subvector
-                           (v32i1 immAllZerosV),
-                           (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
-                           (iPTR 0))),
-                   (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
-                   GR8, sub_8bit>;
-
-def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))),
-          (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk
-           (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)),
-           VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
-           (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
-
-def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)),
-          (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
-           (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
-}
-
 defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
                    (v16i1 (insert_subvector
                            (v16i1 immAllZerosV),
@@ -4409,6 +4385,10 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
                           (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
                           (iPTR 0))), GR8, sub_8bit>;
 
+defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
+                   (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
+                   (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
 defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
                    (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
 defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -4416,6 +4396,13 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
 defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
                    (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
 
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
+                   (v32i1 (insert_subvector
+                           (v32i1 immAllZerosV),
+                           (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+                           (iPTR 0))),
+                   (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+                   GR8, sub_8bit>;
 defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
                    (v16i1 (insert_subvector
                            (v16i1 immAllZerosV),
@@ -4441,6 +4428,16 @@ defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
                           (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
                           (iPTR 0))), GR8, sub_8bit>;
 
+def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))),
+          (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk
+           (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)),
+           VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
+           (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
+
+def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)),
+          (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
+           (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
+
 def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
           (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk
            (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)),
@@ -11654,14 +11651,6 @@ defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
 defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
 defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
 
-// Always select FP16 instructions if available.
-let Predicates = [HasBWI], AddedComplexity = -10 in {
-  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWZrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16X)>;
-  def : Pat<(store f16:$src, addr:$dst), (VPEXTRWZmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
-  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWZrr (v8i16 (COPY_TO_REGCLASS FR16X:$src, VR128X)), 0), sub_16bit)>;
-  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWZrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16X)>;
-}
-
 //===----------------------------------------------------------------------===//
 // VSHUFPS - VSHUFPD Operations
 //===----------------------------------------------------------------------===//
@@ -12999,6 +12988,7 @@ def : Pat<(i16 (bitconvert FR16X:$src)),
                 sub_16bit))>;
 def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))),
           (i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>;
+}
 
 // Allow "vmovw" to use GR64
 let hasSideEffects = 0 in {
@@ -13007,7 +12997,6 @@ let hasSideEffects = 0 in {
   def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
                      "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>;
 }
-}
 
 // Convert 16-bit float to i16/u16
 multiclass avx512_cvtph2w<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,

diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index a55b95960aa6e..39f27312c8ce0 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -562,14 +562,12 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
   let Predicates = [HasMMX] in
     defm _VR64   : CMOVrr_PSEUDO<VR64, x86mmx>;
 
+  defm _FR16X    : CMOVrr_PSEUDO<FR16X, f16>;
   let Predicates = [HasSSE1,NoAVX512] in
     defm _FR32   : CMOVrr_PSEUDO<FR32, f32>;
-  let Predicates = [HasSSE2,NoAVX512] in {
-    defm _FR16   : CMOVrr_PSEUDO<FR16, f16>;
+  let Predicates = [HasSSE2,NoAVX512] in
     defm _FR64   : CMOVrr_PSEUDO<FR64, f64>;
-  }
   let Predicates = [HasAVX512] in {
-    defm _FR16X  : CMOVrr_PSEUDO<FR16X, f16>;
     defm _FR32X  : CMOVrr_PSEUDO<FR32X, f32>;
     defm _FR64X  : CMOVrr_PSEUDO<FR64X, f64>;
   }

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 34c30074e338b..e20c738ba10c9 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -765,7 +765,6 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
   case X86::AVX_SET0:
   case X86::FsFLD0SD:
   case X86::FsFLD0SS:
-  case X86::FsFLD0SH:
   case X86::FsFLD0F128:
   case X86::KSET0D:
   case X86::KSET0Q:
@@ -3581,6 +3580,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
   case 2:
     if (X86::VK16RegClass.hasSubClassEq(RC))
       return load ? X86::KMOVWkm : X86::KMOVWmk;
+    if (X86::FR16XRegClass.hasSubClassEq(RC)) {
+      assert(STI.hasFP16());
+      return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
+    }
     assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
     return load ? X86::MOV16rm : X86::MOV16mr;
   case 4:
@@ -3608,10 +3611,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
         X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
         X86::VK16PAIRRegClass.hasSubClassEq(RC))
       return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
-    if ((X86::FR16RegClass.hasSubClassEq(RC) ||
-         X86::FR16XRegClass.hasSubClassEq(RC)) &&
-        STI.hasFP16())
-      return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
     llvm_unreachable("Unknown 4-byte regclass");
   case 8:
     if (X86::GR64RegClass.hasSubClassEq(RC))
@@ -3851,12 +3850,12 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterInfo *TRI) const {
   const MachineFunction &MF = *MBB.getParent();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
   assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
          "Stack slot too small for store");
   if (RC->getID() == X86::TILERegClassID) {
     unsigned Opc = X86::TILESTORED;
     // tilestored %tmm, (%sp, %idx)
+    MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
     Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
     BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
     MachineInstr *NewMI =
@@ -3865,14 +3864,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     MachineOperand &MO = NewMI->getOperand(2);
     MO.setReg(VirtReg);
     MO.setIsKill(true);
-  } else if ((RC->getID() == X86::FR16RegClassID ||
-              RC->getID() == X86::FR16XRegClassID) &&
-             !Subtarget.hasFP16()) {
-    unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZmr
-                   : Subtarget.hasAVX()  ? X86::VMOVSSmr
-                                         : X86::MOVSSmr;
-    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
-        .addReg(SrcReg, getKillRegState(isKill));
   } else {
     unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
     bool isAligned =
@@ -3901,14 +3892,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     MachineOperand &MO = NewMI->getOperand(3);
     MO.setReg(VirtReg);
     MO.setIsKill(true);
-  } else if ((RC->getID() == X86::FR16RegClassID ||
-              RC->getID() == X86::FR16XRegClassID) &&
-             !Subtarget.hasFP16()) {
-    unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZrm
-                   : Subtarget.hasAVX()  ? X86::VMOVSSrm
-                                         : X86::MOVSSrm;
-    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
-                      FrameIdx);
   } else {
     const MachineFunction &MF = *MBB.getParent();
     const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -4884,7 +4867,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case X86::V_SET0:
   case X86::FsFLD0SS:
   case X86::FsFLD0SD:
-  case X86::FsFLD0SH:
   case X86::FsFLD0F128:
     return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
   case X86::AVX_SET0: {
@@ -6620,7 +6602,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     case X86::AVX512_FsFLD0SS:
       Alignment = Align(4);
       break;
-    case X86::FsFLD0SH:
     case X86::AVX512_FsFLD0SH:
       Alignment = Align(2);
       break;
@@ -6659,7 +6640,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   case X86::AVX512_256_SET0:
   case X86::AVX512_512_SET0:
   case X86::AVX512_512_SETALLONES:
-  case X86::FsFLD0SH:
   case X86::AVX512_FsFLD0SH:
   case X86::FsFLD0SD:
   case X86::AVX512_FsFLD0SD:
@@ -6699,7 +6679,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
       Ty = Type::getDoubleTy(MF.getFunction().getContext());
     else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
       Ty = Type::getFP128Ty(MF.getFunction().getContext());
-    else if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH)
+    else if (Opc == X86::AVX512_FsFLD0SH)
       Ty = Type::getHalfTy(MF.getFunction().getContext());
     else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
       Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 06cb280e860a7..69181c44bc488 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -112,8 +112,6 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
 // This is expanded by ExpandPostRAPseudos.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isPseudo = 1, SchedRW = [WriteZero] in {
-  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
-                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
   def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                    [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
   def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
@@ -3967,20 +3965,6 @@ defm PINSRW : sse2_pinsrw, PD;
 
 } // ExeDomain = SSEPackedInt
 
-// Always select FP16 instructions if available.
-let Predicates = [UseSSE2], AddedComplexity = -10 in {
-  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
-  def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>;
-  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
-  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
-}
-
-let Predicates = [HasAVX, NoBWI] in {
-  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
-  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
-  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
-}
-
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Mask Creation
 //===---------------------------------------------------------------------===//
@@ -5209,12 +5193,6 @@ let Predicates = [HasAVX, NoBWI] in
 
 defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
 
-let Predicates = [UseSSE41] in
-  def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
-
-let Predicates = [HasAVX, NoBWI] in
-  def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
-
 
 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
@@ -7597,21 +7575,6 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
         (VPBROADCASTWYrr (VMOVDI2PDIrr
                           (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                               GR16:$src, sub_16bit))))>;
-
-  def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
-            (VPBROADCASTWrm addr:$src)>;
-  def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
-            (VPBROADCASTWYrm addr:$src)>;
-
-  def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
-            (VPBROADCASTWrr VR128:$src)>;
-  def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
-            (VPBROADCASTWYrr VR128:$src)>;
-
-  def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
-            (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
-  def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
-            (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
 }
 let Predicates = [HasAVX2, NoVLX] in {
   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),

diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td
index e6ecbb6521003..2429aa113fb19 100644
--- a/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -17,8 +17,6 @@
 
 let Predicates = [NoAVX512] in {
   // A vector extract of the first f32/f64 position is a subregister copy
-  def : Pat<(f16 (extractelt (v8f16 VR128:$src), (iPTR 0))),
-            (COPY_TO_REGCLASS (v8f16 VR128:$src), FR16)>;
   def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
             (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
   def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
@@ -36,8 +34,8 @@ let Predicates = [HasAVX512] in {
 }
 
 let Predicates = [NoVLX] in {
-  def : Pat<(v8f16 (scalar_to_vector FR16:$src)),
-            (COPY_TO_REGCLASS FR16:$src, VR128)>;
+  def : Pat<(v8f16 (scalar_to_vector FR16X:$src)),
+            (COPY_TO_REGCLASS FR16X:$src, VR128)>;
   // Implicitly promote a 32-bit scalar to a vector.
   def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
             (COPY_TO_REGCLASS FR32:$src, VR128)>;

diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp
index ff701159b95ea..fb13d93dd1d44 100644
--- a/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -179,8 +179,6 @@ X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const {
       return &X86::GR64RegClass;
   }
   if (RB.getID() == X86::VECRRegBankID) {
-    if (Ty.getSizeInBits() == 16)
-      return STI.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
     if (Ty.getSizeInBits() == 32)
       return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
     if (Ty.getSizeInBits() == 64)

diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 6dc51e37d3c2f..6362386c8a494 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -537,8 +537,6 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
 
 def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
 
-def FR16 : RegisterClass<"X86", [f16], 16, (add FR32)> {let Size = 32;}
-
 
 // FIXME: This sets up the floating point register files as though they are f64
 // values, though they really are f80 values.  This will cause us to spill
@@ -601,7 +599,7 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
 
 def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
 
-def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;}
+def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>;
 
 // Extended VR128 and VR256 for AVX-512 instructions
 def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128],

diff --git a/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll b/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll
index 7f81abaabc566..c83d1fb430d2a 100644
--- a/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll
@@ -847,228 +847,228 @@ define void @casts() {
 
 define void @fp16() {
 ; SSE2-LABEL: 'fp16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 229 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 232 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'fp16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 181 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 180 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'fp16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 119 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'fp16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'fp16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
@@ -1112,56 +1112,56 @@ define void @fp16() {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SLM-LABEL: 'fp16'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 181 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 180 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)

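For reference, the intrinsics whose costs change above are the saturating
float-to-int conversions. A minimal standalone IR sketch (illustrative, not
part of the reverted commit) exercising the same scalar and vector forms:

declare i32 @llvm.fptosi.sat.i32.f16(half)
declare <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half>)

define i32 @scalar_sat(half %x) {
  ; Clamps out-of-range values to [INT32_MIN, INT32_MAX] instead of
  ; producing poison; NaN maps to 0.
  %r = call i32 @llvm.fptosi.sat.i32.f16(half %x)
  ret i32 %r
}

define <4 x i32> @vector_sat(<4 x half> %v) {
  ; Element-wise unsigned saturation; negative inputs clamp to 0.
  %r = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> %v)
  ret <4 x i32> %r
}

The costs rise across the board because, with the psABI lowering reverted,
half is legalized through conversion libcalls and GPRs again; the CHECK
lines themselves are regenerated mechanically (update_analyze_test_checks.py
over something like opt -passes='print<cost-model>').
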
diff  --git a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir
index 0202103c8ff4d..44705cbcfac48 100644
--- a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir
+++ b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir
@@ -28,8 +28,8 @@ body: |
     liveins: $rdi, $rsi
 
   ; CHECK-LABEL: name: test
-  ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4456458 /* regdef:GR64 */, def $rsi, 4456458 /* regdef:GR64 */, def dead $rdi,
-    INLINEASM &foo, 0, 4456458, def $rsi, 4456458, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags
+  ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi,
+    INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags
     $rax = MOV64rr killed $rsi
     RET64 killed $rax
 ...
@@ -45,8 +45,8 @@ body: |
 
   ; Verify that the register ties are preserved.
   ; CHECK-LABEL: name: test2
-  ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4456458 /* regdef:GR64 */, def $rsi, 4456458 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags
-    INLINEASM &foo, 0, 4456458, def $rsi, 4456458, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags
+  ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags
+    INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags
     $rax = MOV64rr killed $rsi
     RET64 killed $rax
 ...

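Only the flag-word constants change in this MIR test. Decoding them (my
reading of the INLINEASM operand encoding, not spelled out in the commit)
shows why the printed form stays regdef:GR64:

  4456458 = 0x44000A    old: operand kind and register count in the low bits
  4390922 = 0x43000A    new: same single regdef, register class id one lower

The low half (0x000A, one regdef register operand) is untouched; only the
register class id in the upper half shifts down, because reverting _Float16
support removes the FP16 register classes and renumbers the remaining ones.
The inlineasm_br constant change in callbr-asm-bb-exports.ll further down
(2359305 = 0x240009 -> 2293769 = 0x230009) is the same renumbering.
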
diff  --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index fa0f65c4a8ae0..a320a551ffa27 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -4,9 +4,9 @@
 ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X86,X86-AVX
 ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X86,X86-AVX
 ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs | FileCheck %s --check-prefixes=X86,X86-NOSSE
-; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefixes=X64-SSE
-; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefixes=X64,X64-SSE
+; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX
 
 ; Note: This test is testing that the lowering for atomics matches what we
 ; currently emit for non-atomics + the atomic restriction.  The presence of
@@ -16,45 +16,17 @@
 ;  and their calling convention which remain unresolved.)
 
 define void @store_half(half* %fptr, half %v) {
-; X86-SSE1-LABEL: store_half:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movw %ax, (%ecx)
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: store_half:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movw %cx, (%eax)
-; X86-SSE2-NEXT:    retl
-;
-; X86-AVX-LABEL: store_half:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movw %cx, (%eax)
-; X86-AVX-NEXT:    retl
-;
-; X86-NOSSE-LABEL: store_half:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movw %ax, (%ecx)
-; X86-NOSSE-NEXT:    retl
-;
-; X64-SSE-LABEL: store_half:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X64-SSE-NEXT:    movw %ax, (%rdi)
-; X64-SSE-NEXT:    retq
+; X86-LABEL: store_half:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    retl
 ;
-; X64-AVX-LABEL: store_half:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; X64-AVX-NEXT:    movw %ax, (%rdi)
-; X64-AVX-NEXT:    retq
+; X64-LABEL: store_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movw %si, (%rdi)
+; X64-NEXT:    retq
   store atomic half %v, half* %fptr unordered, align 2
   ret void
 }
@@ -221,43 +193,16 @@ define void @store_fp128(fp128* %fptr, fp128 %v) {
 }
 
 define half @load_half(half* %fptr) {
-; X86-SSE1-LABEL: load_half:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movzwl (%eax), %eax
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: load_half:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzwl (%eax), %eax
-; X86-SSE2-NEXT:    pinsrw $0, %eax, %xmm0
-; X86-SSE2-NEXT:    retl
-;
-; X86-AVX-LABEL: load_half:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movzwl (%eax), %eax
-; X86-AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X86-AVX-NEXT:    retl
-;
-; X86-NOSSE-LABEL: load_half:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movzwl (%eax), %eax
-; X86-NOSSE-NEXT:    retl
-;
-; X64-SSE-LABEL: load_half:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movzwl (%rdi), %eax
-; X64-SSE-NEXT:    pinsrw $0, %eax, %xmm0
-; X64-SSE-NEXT:    retq
+; X86-LABEL: load_half:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    retl
 ;
-; X64-AVX-LABEL: load_half:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    movzwl (%rdi), %eax
-; X64-AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X64-AVX-NEXT:    retq
+; X64-LABEL: load_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    retq
   %v = load atomic half, half* %fptr unordered, align 2
   ret half %v
 }

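The store_half/load_half IR is unchanged here; only the expected x86
lowering moves. The reduced pattern, copied from the test:

define void @store_half(half* %fptr, half %v) {
  ; Naturally aligned 2-byte unordered atomic store; lowers to a plain
  ; movw on x86.
  store atomic half %v, half* %fptr unordered, align 2
  ret void
}

define half @load_half(half* %fptr) {
  %v = load atomic half, half* %fptr unordered, align 2
  ret half %v
}

With the revert, x86-64 carries the half value in a GPR again (movw %si,
(%rdi)) instead of extracting it from %xmm0, so the per-subtarget SSE/AVX
check blocks collapse into shared X86/X64 blocks.
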
diff  --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 0323af4aca34f..d307be240a29c 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -2254,19 +2254,22 @@ define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %
 ; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k1
-; KNL-NEXT:    kmovw %k1, %edi
-; KNL-NEXT:    movzwl 2(%rsi), %eax
-; KNL-NEXT:    xorl %ecx, %ecx
-; KNL-NEXT:    testb $1, %dil
-; KNL-NEXT:    cmovel %ecx, %eax
-; KNL-NEXT:    kmovw %k0, %edi
-; KNL-NEXT:    testb $1, %dil
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    testb $1, %cl
+; KNL-NEXT:    movl $0, %ecx
 ; KNL-NEXT:    je LBB85_2
 ; KNL-NEXT:  ## %bb.1:
-; KNL-NEXT:    movl (%rsi), %ecx
+; KNL-NEXT:    movzwl 2(%rsi), %ecx
 ; KNL-NEXT:  LBB85_2:
-; KNL-NEXT:    movw %cx, (%rdx)
-; KNL-NEXT:    movw %ax, 2(%rdx)
+; KNL-NEXT:    kmovw %k0, %edi
+; KNL-NEXT:    testb $1, %dil
+; KNL-NEXT:    je LBB85_4
+; KNL-NEXT:  ## %bb.3:
+; KNL-NEXT:    movzwl (%rsi), %eax
+; KNL-NEXT:  LBB85_4:
+; KNL-NEXT:    movw %ax, (%rdx)
+; KNL-NEXT:    movw %cx, 2(%rdx)
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_concat_v2i1:
@@ -2301,19 +2304,22 @@ define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %
 ; SKX-NEXT:    korw %k1, %k2, %k1
 ; SKX-NEXT:    kandw %k1, %k0, %k0
 ; SKX-NEXT:    kshiftrb $1, %k0, %k1
-; SKX-NEXT:    kmovd %k1, %edi
-; SKX-NEXT:    movzwl 2(%rsi), %eax
-; SKX-NEXT:    xorl %ecx, %ecx
-; SKX-NEXT:    testb $1, %dil
-; SKX-NEXT:    cmovel %ecx, %eax
-; SKX-NEXT:    kmovd %k0, %edi
-; SKX-NEXT:    testb $1, %dil
+; SKX-NEXT:    kmovd %k1, %ecx
+; SKX-NEXT:    xorl %eax, %eax
+; SKX-NEXT:    testb $1, %cl
+; SKX-NEXT:    movl $0, %ecx
 ; SKX-NEXT:    je LBB85_2
 ; SKX-NEXT:  ## %bb.1:
-; SKX-NEXT:    movl (%rsi), %ecx
+; SKX-NEXT:    movzwl 2(%rsi), %ecx
 ; SKX-NEXT:  LBB85_2:
-; SKX-NEXT:    movw %cx, (%rdx)
-; SKX-NEXT:    movw %ax, 2(%rdx)
+; SKX-NEXT:    kmovd %k0, %edi
+; SKX-NEXT:    testb $1, %dil
+; SKX-NEXT:    je LBB85_4
+; SKX-NEXT:  ## %bb.3:
+; SKX-NEXT:    movzwl (%rsi), %eax
+; SKX-NEXT:  LBB85_4:
+; SKX-NEXT:    movw %ax, (%rdx)
+; SKX-NEXT:    movw %cx, 2(%rdx)
 ; SKX-NEXT:    retq
   %tmp = load <2 x half>, <2 x half>* %arg, align 8
   %tmp3 = fcmp fast olt <2 x half> %tmp, <half 0xH4600, half 0xH4600>

diff  --git a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll
index fbaff4aa34f33..0a7c4e0aa36aa 100644
--- a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll
@@ -153,156 +153,206 @@ define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i1
 declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
 
 ; Make sure we scalarize masked loads of f16.
-define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr) {
+define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) {
 ; CHECK-LABEL: test_mask_load_16xf16:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
-; CHECK-NEXT:    vpmovmskb %xmm0, %ecx
-; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    vpmovmskb %xmm0, %r11d
+; CHECK-NEXT:    testb $1, %r11b
 ; CHECK-NEXT:    je LBB12_1
 ; CHECK-NEXT:  ## %bb.2: ## %cond.load
-; CHECK-NEXT:    vpinsrw $0, (%rsi), %xmm0, %xmm8
+; CHECK-NEXT:    movzwl (%rsi), %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    jmp LBB12_3
 ; CHECK-NEXT:  LBB12_1:
-; CHECK-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:  LBB12_3: ## %else
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm9, %xmm9, %xmm9
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm10
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm4
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm5
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm6
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm7
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm1
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm3
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm11
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm12
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm13
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm14
-; CHECK-NEXT:    testb $2, %cl
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    testb $2, %r11b
 ; CHECK-NEXT:    je LBB12_4
 ; CHECK-NEXT:  ## %bb.5: ## %cond.load1
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm15
-; CHECK-NEXT:    vpinsrw $0, 2(%rsi), %xmm0, %xmm2
-; CHECK-NEXT:    testb $4, %cl
+; CHECK-NEXT:    movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movl %edi, %r12d
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl %edi, %r13d
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    movl %edi, %r8d
+; CHECK-NEXT:    movl %edi, %r9d
+; CHECK-NEXT:    movl %edi, %r10d
+; CHECK-NEXT:    movl %edi, %r15d
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movzwl 2(%rsi), %edi
+; CHECK-NEXT:    ## kill: def $di killed $di def $edi
+; CHECK-NEXT:    testb $4, %r11b
 ; CHECK-NEXT:    jne LBB12_7
 ; CHECK-NEXT:    jmp LBB12_8
 ; CHECK-NEXT:  LBB12_4:
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm15
-; CHECK-NEXT:    testb $4, %cl
+; CHECK-NEXT:    movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movl %edi, %r12d
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl %edi, %r13d
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    movl %edi, %r8d
+; CHECK-NEXT:    movl %edi, %r9d
+; CHECK-NEXT:    movl %edi, %r10d
+; CHECK-NEXT:    movl %edi, %r15d
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    testb $4, %r11b
 ; CHECK-NEXT:    je LBB12_8
 ; CHECK-NEXT:  LBB12_7: ## %cond.load4
-; CHECK-NEXT:    vpinsrw $0, 4(%rsi), %xmm0, %xmm10
+; CHECK-NEXT:    movzwl 4(%rsi), %ecx
+; CHECK-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; CHECK-NEXT:  LBB12_8: ## %else5
-; CHECK-NEXT:    testb $8, %cl
+; CHECK-NEXT:    testb $8, %r11b
 ; CHECK-NEXT:    jne LBB12_9
 ; CHECK-NEXT:  ## %bb.10: ## %else8
-; CHECK-NEXT:    testb $16, %cl
+; CHECK-NEXT:    testb $16, %r11b
 ; CHECK-NEXT:    jne LBB12_11
 ; CHECK-NEXT:  LBB12_12: ## %else11
-; CHECK-NEXT:    testb $32, %cl
+; CHECK-NEXT:    testb $32, %r11b
 ; CHECK-NEXT:    jne LBB12_13
 ; CHECK-NEXT:  LBB12_14: ## %else14
-; CHECK-NEXT:    testb $64, %cl
+; CHECK-NEXT:    testb $64, %r11b
 ; CHECK-NEXT:    jne LBB12_15
 ; CHECK-NEXT:  LBB12_16: ## %else17
-; CHECK-NEXT:    testb $-128, %cl
+; CHECK-NEXT:    testb $-128, %r11b
 ; CHECK-NEXT:    jne LBB12_17
 ; CHECK-NEXT:  LBB12_18: ## %else20
-; CHECK-NEXT:    testl $256, %ecx ## imm = 0x100
+; CHECK-NEXT:    testl $256, %r11d ## imm = 0x100
 ; CHECK-NEXT:    jne LBB12_19
 ; CHECK-NEXT:  LBB12_20: ## %else23
-; CHECK-NEXT:    testl $512, %ecx ## imm = 0x200
+; CHECK-NEXT:    testl $512, %r11d ## imm = 0x200
 ; CHECK-NEXT:    jne LBB12_21
 ; CHECK-NEXT:  LBB12_22: ## %else26
-; CHECK-NEXT:    testl $1024, %ecx ## imm = 0x400
+; CHECK-NEXT:    testl $1024, %r11d ## imm = 0x400
 ; CHECK-NEXT:    jne LBB12_23
 ; CHECK-NEXT:  LBB12_24: ## %else29
-; CHECK-NEXT:    testl $2048, %ecx ## imm = 0x800
+; CHECK-NEXT:    testl $2048, %r11d ## imm = 0x800
 ; CHECK-NEXT:    jne LBB12_25
 ; CHECK-NEXT:  LBB12_26: ## %else32
-; CHECK-NEXT:    testl $4096, %ecx ## imm = 0x1000
-; CHECK-NEXT:    jne LBB12_27
+; CHECK-NEXT:    testl $4096, %r11d ## imm = 0x1000
+; CHECK-NEXT:    je LBB12_28
+; CHECK-NEXT:  LBB12_27: ## %cond.load34
+; CHECK-NEXT:    movzwl 24(%rsi), %edx
 ; CHECK-NEXT:  LBB12_28: ## %else35
-; CHECK-NEXT:    testl $8192, %ecx ## imm = 0x2000
+; CHECK-NEXT:    movw %dx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    testl $8192, %r11d ## imm = 0x2000
 ; CHECK-NEXT:    jne LBB12_29
-; CHECK-NEXT:  LBB12_30: ## %else38
-; CHECK-NEXT:    testl $16384, %ecx ## imm = 0x4000
+; CHECK-NEXT:  ## %bb.30: ## %else38
+; CHECK-NEXT:    testl $16384, %r11d ## imm = 0x4000
 ; CHECK-NEXT:    jne LBB12_31
 ; CHECK-NEXT:  LBB12_32: ## %else41
-; CHECK-NEXT:    testl $32768, %ecx ## imm = 0x8000
-; CHECK-NEXT:    je LBB12_34
-; CHECK-NEXT:  LBB12_33: ## %cond.load43
-; CHECK-NEXT:    vpinsrw $0, 30(%rsi), %xmm0, %xmm9
-; CHECK-NEXT:  LBB12_34: ## %else44
-; CHECK-NEXT:    vpextrw $0, %xmm8, (%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm2, 2(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm10, 4(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm4, 6(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm5, 8(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm6, 10(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm7, 12(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm1, 14(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm0, 16(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm3, 18(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm11, 20(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm12, 22(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm13, 24(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm14, 26(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm15, 28(%rax)
-; CHECK-NEXT:    vpextrw $0, %xmm9, 30(%rax)
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    testl $32768, %r11d ## imm = 0x8000
+; CHECK-NEXT:    je LBB12_33
+; CHECK-NEXT:  LBB12_34: ## %cond.load43
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-NEXT:    jmp LBB12_35
 ; CHECK-NEXT:  LBB12_9: ## %cond.load7
-; CHECK-NEXT:    vpinsrw $0, 6(%rsi), %xmm0, %xmm4
-; CHECK-NEXT:    testb $16, %cl
+; CHECK-NEXT:    movzwl 6(%rsi), %r12d
+; CHECK-NEXT:    testb $16, %r11b
 ; CHECK-NEXT:    je LBB12_12
 ; CHECK-NEXT:  LBB12_11: ## %cond.load10
-; CHECK-NEXT:    vpinsrw $0, 8(%rsi), %xmm0, %xmm5
-; CHECK-NEXT:    testb $32, %cl
+; CHECK-NEXT:    movzwl 8(%rsi), %ebx
+; CHECK-NEXT:    testb $32, %r11b
 ; CHECK-NEXT:    je LBB12_14
 ; CHECK-NEXT:  LBB12_13: ## %cond.load13
-; CHECK-NEXT:    vpinsrw $0, 10(%rsi), %xmm0, %xmm6
-; CHECK-NEXT:    testb $64, %cl
+; CHECK-NEXT:    movzwl 10(%rsi), %ebp
+; CHECK-NEXT:    testb $64, %r11b
 ; CHECK-NEXT:    je LBB12_16
 ; CHECK-NEXT:  LBB12_15: ## %cond.load16
-; CHECK-NEXT:    vpinsrw $0, 12(%rsi), %xmm0, %xmm7
-; CHECK-NEXT:    testb $-128, %cl
+; CHECK-NEXT:    movzwl 12(%rsi), %r13d
+; CHECK-NEXT:    testb $-128, %r11b
 ; CHECK-NEXT:    je LBB12_18
 ; CHECK-NEXT:  LBB12_17: ## %cond.load19
-; CHECK-NEXT:    vpinsrw $0, 14(%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    testl $256, %ecx ## imm = 0x100
+; CHECK-NEXT:    movzwl 14(%rsi), %r14d
+; CHECK-NEXT:    testl $256, %r11d ## imm = 0x100
 ; CHECK-NEXT:    je LBB12_20
 ; CHECK-NEXT:  LBB12_19: ## %cond.load22
-; CHECK-NEXT:    vpinsrw $0, 16(%rsi), %xmm0, %xmm0
-; CHECK-NEXT:    testl $512, %ecx ## imm = 0x200
+; CHECK-NEXT:    movzwl 16(%rsi), %r8d
+; CHECK-NEXT:    testl $512, %r11d ## imm = 0x200
 ; CHECK-NEXT:    je LBB12_22
 ; CHECK-NEXT:  LBB12_21: ## %cond.load25
-; CHECK-NEXT:    vpinsrw $0, 18(%rsi), %xmm0, %xmm3
-; CHECK-NEXT:    testl $1024, %ecx ## imm = 0x400
+; CHECK-NEXT:    movzwl 18(%rsi), %r9d
+; CHECK-NEXT:    testl $1024, %r11d ## imm = 0x400
 ; CHECK-NEXT:    je LBB12_24
 ; CHECK-NEXT:  LBB12_23: ## %cond.load28
-; CHECK-NEXT:    vpinsrw $0, 20(%rsi), %xmm0, %xmm11
-; CHECK-NEXT:    testl $2048, %ecx ## imm = 0x800
+; CHECK-NEXT:    movzwl 20(%rsi), %r10d
+; CHECK-NEXT:    testl $2048, %r11d ## imm = 0x800
 ; CHECK-NEXT:    je LBB12_26
 ; CHECK-NEXT:  LBB12_25: ## %cond.load31
-; CHECK-NEXT:    vpinsrw $0, 22(%rsi), %xmm0, %xmm12
-; CHECK-NEXT:    testl $4096, %ecx ## imm = 0x1000
-; CHECK-NEXT:    je LBB12_28
-; CHECK-NEXT:  LBB12_27: ## %cond.load34
-; CHECK-NEXT:    vpinsrw $0, 24(%rsi), %xmm0, %xmm13
-; CHECK-NEXT:    testl $8192, %ecx ## imm = 0x2000
-; CHECK-NEXT:    je LBB12_30
+; CHECK-NEXT:    movzwl 22(%rsi), %r15d
+; CHECK-NEXT:    testl $4096, %r11d ## imm = 0x1000
+; CHECK-NEXT:    jne LBB12_27
+; CHECK-NEXT:    jmp LBB12_28
 ; CHECK-NEXT:  LBB12_29: ## %cond.load37
-; CHECK-NEXT:    vpinsrw $0, 26(%rsi), %xmm0, %xmm14
-; CHECK-NEXT:    testl $16384, %ecx ## imm = 0x4000
+; CHECK-NEXT:    movzwl 26(%rsi), %ecx
+; CHECK-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    testl $16384, %r11d ## imm = 0x4000
 ; CHECK-NEXT:    je LBB12_32
 ; CHECK-NEXT:  LBB12_31: ## %cond.load40
-; CHECK-NEXT:    vpinsrw $0, 28(%rsi), %xmm0, %xmm15
-; CHECK-NEXT:    testl $32768, %ecx ## imm = 0x8000
-; CHECK-NEXT:    jne LBB12_33
-; CHECK-NEXT:    jmp LBB12_34
+; CHECK-NEXT:    movzwl 28(%rsi), %ecx
+; CHECK-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    testl $32768, %r11d ## imm = 0x8000
+; CHECK-NEXT:    jne LBB12_34
+; CHECK-NEXT:  LBB12_33:
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT:  LBB12_35: ## %else44
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    movw %dx, (%rax)
+; CHECK-NEXT:    movw %di, 2(%rax)
+; CHECK-NEXT:    movw %cx, 4(%rax)
+; CHECK-NEXT:    movw %r12w, 6(%rax)
+; CHECK-NEXT:    movw %bx, 8(%rax)
+; CHECK-NEXT:    movw %bp, 10(%rax)
+; CHECK-NEXT:    movw %r13w, 12(%rax)
+; CHECK-NEXT:    movw %r14w, 14(%rax)
+; CHECK-NEXT:    movw %r8w, 16(%rax)
+; CHECK-NEXT:    movw %r9w, 18(%rax)
+; CHECK-NEXT:    movw %r10w, 20(%rax)
+; CHECK-NEXT:    movw %r15w, 22(%rax)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT:    movw %cx, 24(%rax)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT:    movw %cx, 26(%rax)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT:    movw %cx, 28(%rax)
+; CHECK-NEXT:    movw %si, 30(%rax)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
   %res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer)
   ret <16 x half> %res
 }
@@ -364,76 +414,78 @@ define void @test_mask_store_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x h
 ; CHECK-NEXT:  LBB13_32: ## %else30
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  LBB13_1: ## %cond.store
-; CHECK-NEXT:    vpextrw $0, %xmm1, (%rdi)
+; CHECK-NEXT:    movw %si, (%rdi)
 ; CHECK-NEXT:    testb $2, %al
 ; CHECK-NEXT:    je LBB13_4
 ; CHECK-NEXT:  LBB13_3: ## %cond.store1
-; CHECK-NEXT:    vpextrw $0, %xmm2, 2(%rdi)
+; CHECK-NEXT:    movw %dx, 2(%rdi)
 ; CHECK-NEXT:    testb $4, %al
 ; CHECK-NEXT:    je LBB13_6
 ; CHECK-NEXT:  LBB13_5: ## %cond.store3
-; CHECK-NEXT:    vpextrw $0, %xmm3, 4(%rdi)
+; CHECK-NEXT:    movw %cx, 4(%rdi)
 ; CHECK-NEXT:    testb $8, %al
 ; CHECK-NEXT:    je LBB13_8
 ; CHECK-NEXT:  LBB13_7: ## %cond.store5
-; CHECK-NEXT:    vpextrw $0, %xmm4, 6(%rdi)
+; CHECK-NEXT:    movw %r8w, 6(%rdi)
 ; CHECK-NEXT:    testb $16, %al
 ; CHECK-NEXT:    je LBB13_10
 ; CHECK-NEXT:  LBB13_9: ## %cond.store7
-; CHECK-NEXT:    vpextrw $0, %xmm5, 8(%rdi)
+; CHECK-NEXT:    movw %r9w, 8(%rdi)
 ; CHECK-NEXT:    testb $32, %al
 ; CHECK-NEXT:    je LBB13_12
 ; CHECK-NEXT:  LBB13_11: ## %cond.store9
-; CHECK-NEXT:    vpextrw $0, %xmm6, 10(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 10(%rdi)
 ; CHECK-NEXT:    testb $64, %al
 ; CHECK-NEXT:    je LBB13_14
 ; CHECK-NEXT:  LBB13_13: ## %cond.store11
-; CHECK-NEXT:    vpextrw $0, %xmm7, 12(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 12(%rdi)
 ; CHECK-NEXT:    testb $-128, %al
 ; CHECK-NEXT:    je LBB13_16
 ; CHECK-NEXT:  LBB13_15: ## %cond.store13
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 14(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 14(%rdi)
 ; CHECK-NEXT:    testl $256, %eax ## imm = 0x100
 ; CHECK-NEXT:    je LBB13_18
 ; CHECK-NEXT:  LBB13_17: ## %cond.store15
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 16(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 16(%rdi)
 ; CHECK-NEXT:    testl $512, %eax ## imm = 0x200
 ; CHECK-NEXT:    je LBB13_20
 ; CHECK-NEXT:  LBB13_19: ## %cond.store17
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 18(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 18(%rdi)
 ; CHECK-NEXT:    testl $1024, %eax ## imm = 0x400
 ; CHECK-NEXT:    je LBB13_22
 ; CHECK-NEXT:  LBB13_21: ## %cond.store19
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 20(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 20(%rdi)
 ; CHECK-NEXT:    testl $2048, %eax ## imm = 0x800
 ; CHECK-NEXT:    je LBB13_24
 ; CHECK-NEXT:  LBB13_23: ## %cond.store21
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 22(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 22(%rdi)
 ; CHECK-NEXT:    testl $4096, %eax ## imm = 0x1000
 ; CHECK-NEXT:    je LBB13_26
 ; CHECK-NEXT:  LBB13_25: ## %cond.store23
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 24(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 24(%rdi)
 ; CHECK-NEXT:    testl $8192, %eax ## imm = 0x2000
 ; CHECK-NEXT:    je LBB13_28
 ; CHECK-NEXT:  LBB13_27: ## %cond.store25
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 26(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 26(%rdi)
 ; CHECK-NEXT:    testl $16384, %eax ## imm = 0x4000
 ; CHECK-NEXT:    je LBB13_30
 ; CHECK-NEXT:  LBB13_29: ## %cond.store27
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 28(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movw %cx, 28(%rdi)
 ; CHECK-NEXT:    testl $32768, %eax ## imm = 0x8000
 ; CHECK-NEXT:    je LBB13_32
 ; CHECK-NEXT:  LBB13_31: ## %cond.store29
-; CHECK-NEXT:    vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT:    vpextrw $0, %xmm0, 30(%rdi)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movw %ax, 30(%rdi)
 ; CHECK-NEXT:    retq
   call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask)
   ret void

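As the comment in the test says, masked f16 loads and stores are
scalarized. The IR driving the large expansion above is just the generic
masked-load intrinsic; reduced from the test:

declare <16 x half> @llvm.masked.load.v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>)

define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) {
  ; One mask bit guards each lane, which is why the expansion tests bits
  ; 0x1 through 0x8000 of the vpmovmskb result in turn; disabled lanes
  ; take the zeroinitializer passthru.
  %res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
  ret <16 x half> %res
}

Without the psABI lowering, each element now round-trips through a GPR
(movzwl/movw) and stack spills rather than a vpinsrw/vpextrw pair, hence
the extra callee-saved register pushes at the top of the new CHECK block.
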
diff  --git a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
index 3b8d6edf04d33..e2ea8974f6551 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
@@ -211,8 +211,8 @@ define half @movmsk(half %x) {
 define half @bitcast_fabs(half %x) {
 ; CHECK-LABEL: bitcast_fabs:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %bc1 = bitcast half %x to i16
   %and = and i16 %bc1, 32767
@@ -223,8 +223,8 @@ define half @bitcast_fabs(half %x) {
 define half @bitcast_fneg(half %x) {
 ; CHECK-LABEL: bitcast_fneg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %bc1 = bitcast half %x to i16
   %xor = xor i16 %bc1, 32768
@@ -285,8 +285,8 @@ define half @fsub_bitcast_fneg(half %x, half %y) {
 define half @nabs(half %a) {
 ; CHECK-LABEL: nabs:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %conv = bitcast half %a to i16
   %and = or i16 %conv, -32768

diff --git a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll
index dd58c7bde3c55..4f703d42fdf5f 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %2
 ; CHECK-NEXT: t8: i32 = add t2, Constant:i32<4>
 ; CHECK-NEXT: t22: ch,glue = CopyToReg t17, Register:i32 %5, t8
-; CHECK-NEXT: t30: ch,glue = inlineasm_br t22, TargetExternalSymbol:i64'xorl $0, $0; jmp ${1:l}', MDNode:ch<null>, TargetConstant:i64<8>, TargetConstant:i32<2359305>, Register:i32 %5, TargetConstant:i64<13>, TargetBlockAddress:i64<@test, %fail> 0, TargetConstant:i32<12>, Register:i32 $df, TargetConstant:i32<12>, Register:i16 $fpsw, TargetConstant:i32<12>, Register:i32 $eflags, t22:1
+; CHECK-NEXT: t30: ch,glue = inlineasm_br t22, TargetExternalSymbol:i64'xorl $0, $0; jmp ${1:l}', MDNode:ch<null>, TargetConstant:i64<8>, TargetConstant:i32<2293769>, Register:i32 %5, TargetConstant:i64<13>, TargetBlockAddress:i64<@test, %fail> 0, TargetConstant:i32<12>, Register:i32 $df, TargetConstant:i32<12>, Register:i16 $fpsw, TargetConstant:i32<12>, Register:i32 $eflags, t22:1
 
 define i32 @test(i32 %a, i32 %b, i32 %c) {
 entry:

diff --git a/llvm/test/CodeGen/X86/cvt16-2.ll b/llvm/test/CodeGen/X86/cvt16-2.ll
index c0fbb5ab6871f..67111e838cab8 100644
--- a/llvm/test/CodeGen/X86/cvt16-2.ll
+++ b/llvm/test/CodeGen/X86/cvt16-2.ll
@@ -9,8 +9,7 @@ define void @test1(float %src, i16* %dest) {
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
 ; LIBCALL-NEXT:    .cfi_offset %rbx, -16
 ; LIBCALL-NEXT:    movq %rdi, %rbx
-; LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
+; LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
 ; LIBCALL-NEXT:    movw %ax, (%rbx)
 ; LIBCALL-NEXT:    popq %rbx
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
@@ -29,8 +28,8 @@ define void @test1(float %src, i16* %dest) {
 define float @test2(i16* nocapture %src) {
 ; LIBCALL-LABEL: test2:
 ; LIBCALL:       # %bb.0:
-; LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; LIBCALL-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
+; LIBCALL-NEXT:    movzwl (%rdi), %edi
+; LIBCALL-NEXT:    jmp __gnu_h2f_ieee@PLT # TAILCALL
 ;
 ; FP16-LABEL: test2:
 ; FP16:       # %bb.0:
@@ -47,10 +46,11 @@ define float @test3(float %src) nounwind uwtable readnone {
 ; LIBCALL:       # %bb.0:
 ; LIBCALL-NEXT:    pushq %rax
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
-; LIBCALL-NEXT:    callq __truncsfhf2@PLT
+; LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
+; LIBCALL-NEXT:    movzwl %ax, %edi
 ; LIBCALL-NEXT:    popq %rax
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
-; LIBCALL-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
+; LIBCALL-NEXT:    jmp __gnu_h2f_ieee@PLT # TAILCALL
 ;
 ; FP16-LABEL: test3:
 ; FP16:       # %bb.0:
@@ -66,8 +66,14 @@ define float @test3(float %src) nounwind uwtable readnone {
 define double @test4(i16* nocapture %src) {
 ; LIBCALL-LABEL: test4:
 ; LIBCALL:       # %bb.0:
-; LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; LIBCALL-NEXT:    jmp __extendhfdf2@PLT # TAILCALL
+; LIBCALL-NEXT:    pushq %rax
+; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
+; LIBCALL-NEXT:    movzwl (%rdi), %edi
+; LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
+; LIBCALL-NEXT:    popq %rax
+; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
+; LIBCALL-NEXT:    retq
 ;
 ; FP16-LABEL: test4:
 ; FP16:       # %bb.0:
@@ -82,14 +88,7 @@ define double @test4(i16* nocapture %src) {
 define i16 @test5(double %src) {
 ; LIBCALL-LABEL: test5:
 ; LIBCALL:       # %bb.0:
-; LIBCALL-NEXT:    pushq %rax
-; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
-; LIBCALL-NEXT:    callq __truncdfhf2@PLT
-; LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; LIBCALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; LIBCALL-NEXT:    popq %rcx
-; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
-; LIBCALL-NEXT:    retq
+; LIBCALL-NEXT:    jmp __truncdfhf2@PLT # TAILCALL
 ;
 ; FP16-LABEL: test5:
 ; FP16:       # %bb.0:
@@ -107,8 +106,10 @@ define x86_fp80 @test6(i16* nocapture %src) {
 ; LIBCALL:       # %bb.0:
 ; LIBCALL-NEXT:    pushq %rax
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
-; LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; LIBCALL-NEXT:    callq __extendhfxf2@PLT
+; LIBCALL-NEXT:    movzwl (%rdi), %edi
+; LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp)
+; LIBCALL-NEXT:    flds {{[0-9]+}}(%rsp)
 ; LIBCALL-NEXT:    popq %rax
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
 ; LIBCALL-NEXT:    retq
@@ -130,16 +131,7 @@ define x86_fp80 @test6(i16* nocapture %src) {
 define i16 @test7(x86_fp80 %src) {
 ; LIBCALL-LABEL: test7:
 ; LIBCALL:       # %bb.0:
-; LIBCALL-NEXT:    subq $24, %rsp
-; LIBCALL-NEXT:    .cfi_def_cfa_offset 32
-; LIBCALL-NEXT:    fldt {{[0-9]+}}(%rsp)
-; LIBCALL-NEXT:    fstpt (%rsp)
-; LIBCALL-NEXT:    callq __truncxfhf2@PLT
-; LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; LIBCALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; LIBCALL-NEXT:    addq $24, %rsp
-; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
-; LIBCALL-NEXT:    retq
+; LIBCALL-NEXT:    jmp __truncxfhf2@PLT # TAILCALL
 ;
 ; FP16-LABEL: test7:
 ; FP16:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
index 43b1988d1b16f..035e3323478f2 100644
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -28,8 +28,8 @@ define void @test1(float %src, i16* %dest) {
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
 ; LIBCALL-NEXT:    .cfi_offset %rbx, -16
 ; LIBCALL-NEXT:    movq %rdi, %rbx
-; LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; LIBCALL-NEXT:    pextrw $0, %xmm0, (%rbx)
+; LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
+; LIBCALL-NEXT:    movw %ax, (%rbx)
 ; LIBCALL-NEXT:    popq %rbx
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
 ; LIBCALL-NEXT:    retq
@@ -37,8 +37,7 @@ define void @test1(float %src, i16* %dest) {
 ; F16C-LABEL: test1:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; SOFTFLOAT-LABEL: test1:
@@ -60,8 +59,8 @@ define void @test1(float %src, i16* %dest) {
 define float @test2(i16* nocapture %src) {
 ; LIBCALL-LABEL: test2:
 ; LIBCALL:       # %bb.0:
-; LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; LIBCALL-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
+; LIBCALL-NEXT:    movzwl (%rdi), %edi
+; LIBCALL-NEXT:    jmp __gnu_h2f_ieee@PLT # TAILCALL
 ;
 ; F16C-LABEL: test2:
 ; F16C:       # %bb.0:
@@ -89,17 +88,15 @@ define float @test3(float %src) nounwind uwtable readnone {
 ; LIBCALL:       # %bb.0:
 ; LIBCALL-NEXT:    pushq %rax
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
-; LIBCALL-NEXT:    callq __truncsfhf2@PLT
+; LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
+; LIBCALL-NEXT:    movzwl %ax, %edi
 ; LIBCALL-NEXT:    popq %rax
 ; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
-; LIBCALL-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
+; LIBCALL-NEXT:    jmp __gnu_h2f_ieee@PLT # TAILCALL
 ;
 ; F16C-LABEL: test3:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movzwl %ax, %eax
-; F16C-NEXT:    vmovd %eax, %xmm0
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
@@ -121,8 +118,14 @@ define float @test3(float %src) nounwind uwtable readnone {
 define double @test4(i16* nocapture %src) {
 ; LIBCALL-LABEL: test4:
 ; LIBCALL:       # %bb.0:
-; LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; LIBCALL-NEXT:    jmp __extendhfdf2@PLT # TAILCALL
+; LIBCALL-NEXT:    pushq %rax
+; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
+; LIBCALL-NEXT:    movzwl (%rdi), %edi
+; LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
+; LIBCALL-NEXT:    popq %rax
+; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
+; LIBCALL-NEXT:    retq
 ;
 ; F16C-LABEL: test4:
 ; F16C:       # %bb.0:
@@ -151,22 +154,11 @@ define double @test4(i16* nocapture %src) {
 define i16 @test5(double %src) {
 ; LIBCALL-LABEL: test5:
 ; LIBCALL:       # %bb.0:
-; LIBCALL-NEXT:    pushq %rax
-; LIBCALL-NEXT:    .cfi_def_cfa_offset 16
-; LIBCALL-NEXT:    callq __truncdfhf2@PLT
-; LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; LIBCALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; LIBCALL-NEXT:    popq %rcx
-; LIBCALL-NEXT:    .cfi_def_cfa_offset 8
-; LIBCALL-NEXT:    retq
+; LIBCALL-NEXT:    jmp __truncdfhf2@PLT # TAILCALL
 ;
 ; F16C-LABEL: test5:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    # kill: def $ax killed $ax killed $eax
-; F16C-NEXT:    retq
+; F16C-NEXT:    jmp __truncdfhf2@PLT # TAILCALL
 ;
 ; SOFTFLOAT-LABEL: test5:
 ; SOFTFLOAT:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
index 2b3891a6fee8f..362fe73073f02 100644
--- a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
+++ b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -16,8 +16,6 @@ define zeroext i16 @test1_fast(double %d) #0 {
 ; AVX-NEXT:    pushq %rax
 ; AVX-NEXT:    .cfi_def_cfa_offset 16
 ; AVX-NEXT:    callq __truncdfhf2@PLT
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    popq %rcx
 ; AVX-NEXT:    .cfi_def_cfa_offset 8
 ; AVX-NEXT:    retq
@@ -27,42 +25,40 @@ entry:
 }
 
 define zeroext i16 @test2_fast(x86_fp80 %d) #0 {
-; ALL-LABEL: test2_fast:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    subq $24, %rsp
-; ALL-NEXT:    .cfi_def_cfa_offset 32
-; ALL-NEXT:    fldt {{[0-9]+}}(%rsp)
-; ALL-NEXT:    fstpt (%rsp)
-; ALL-NEXT:    callq __truncxfhf2@PLT
-; ALL-NEXT:    vpextrw $0, %xmm0, %eax
-; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; ALL-NEXT:    addq $24, %rsp
-; ALL-NEXT:    .cfi_def_cfa_offset 8
-; ALL-NEXT:    retq
-entry:
-  %0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d)
-  ret i16 %0
-}
-
-define zeroext i16 @test1(double %d) #1 {
-; F16C-LABEL: test1:
+; F16C-LABEL: test2_fast:
 ; F16C:       # %bb.0: # %entry
-; F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; F16C-NEXT:    fldt {{[0-9]+}}(%rsp)
+; F16C-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    vmovd %xmm0, %eax
 ; F16C-NEXT:    # kill: def $ax killed $ax killed $eax
 ; F16C-NEXT:    retq
 ;
-; AVX-LABEL: test1:
+; AVX-LABEL: test2_fast:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    pushq %rax
-; AVX-NEXT:    .cfi_def_cfa_offset 16
-; AVX-NEXT:    callq __truncdfhf2@PLT
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX-NEXT:    popq %rcx
+; AVX-NEXT:    subq $24, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fstpt (%rsp)
+; AVX-NEXT:    callq __truncxfhf2@PLT
+; AVX-NEXT:    addq $24, %rsp
 ; AVX-NEXT:    .cfi_def_cfa_offset 8
 ; AVX-NEXT:    retq
+entry:
+  %0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d)
+  ret i16 %0
+}
+
+define zeroext i16 @test1(double %d) #1 {
+; ALL-LABEL: test1:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    pushq %rax
+; ALL-NEXT:    .cfi_def_cfa_offset 16
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    popq %rcx
+; ALL-NEXT:    .cfi_def_cfa_offset 8
+; ALL-NEXT:    retq
 entry:
   %0 = tail call i16 @llvm.convert.to.fp16.f64(double %d)
   ret i16 %0
@@ -76,8 +72,6 @@ define zeroext i16 @test2(x86_fp80 %d) #1 {
 ; ALL-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; ALL-NEXT:    fstpt (%rsp)
 ; ALL-NEXT:    callq __truncxfhf2@PLT
-; ALL-NEXT:    vpextrw $0, %xmm0, %eax
-; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; ALL-NEXT:    addq $24, %rsp
 ; ALL-NEXT:    .cfi_def_cfa_offset 8
 ; ALL-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll
index e4195d6bd4fb9..102a162b44824 100644
--- a/llvm/test/CodeGen/X86/fmf-flags.ll
+++ b/llvm/test/CodeGen/X86/fmf-flags.ll
@@ -111,12 +111,14 @@ define dso_local float @div_arcp_by_const(half %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
 ; X64-NEXT:    .cfi_def_cfa_offset 16
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    callq __truncsfhf2@PLT
+; X64-NEXT:    callq __gnu_f2h_ieee@PLT
+; X64-NEXT:    movzwl %ax, %edi
 ; X64-NEXT:    popq %rax
 ; X64-NEXT:    .cfi_def_cfa_offset 8
-; X64-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
+; X64-NEXT:    jmp __gnu_h2f_ieee@PLT # TAILCALL
 ;
 ; X86-LABEL: div_arcp_by_const:
 ; X86:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index 9218996071ba2..7bcbdb2f40ea5 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -10,16 +10,18 @@ define half @round_f16(half %h) {
 ; SSE2:       ## %bb.0: ## %entry
 ; SSE2-NEXT:    pushq %rax
 ; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    movzwl %di, %edi
 ; SSE2-NEXT:    callq ___extendhfsf2
 ; SSE2-NEXT:    callq _roundf
 ; SSE2-NEXT:    callq ___truncsfhf2
-; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    popq %rcx
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: round_f16:
 ; SSE41:       ## %bb.0: ## %entry
 ; SSE41-NEXT:    pushq %rax
 ; SSE41-NEXT:    .cfi_def_cfa_offset 16
+; SSE41-NEXT:    movzwl %di, %edi
 ; SSE41-NEXT:    callq ___extendhfsf2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; SSE41-NEXT:    andps %xmm0, %xmm1
@@ -28,13 +30,14 @@ define half @round_f16(half %h) {
 ; SSE41-NEXT:    xorps %xmm0, %xmm0
 ; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
 ; SSE41-NEXT:    callq ___truncsfhf2
-; SSE41-NEXT:    popq %rax
+; SSE41-NEXT:    popq %rcx
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: round_f16:
 ; AVX1:       ## %bb.0: ## %entry
 ; AVX1-NEXT:    pushq %rax
 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    movzwl %di, %edi
 ; AVX1-NEXT:    callq ___extendhfsf2
 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
@@ -42,13 +45,12 @@ define half @round_f16(half %h) {
 ; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    callq ___truncsfhf2
-; AVX1-NEXT:    popq %rax
+; AVX1-NEXT:    popq %rcx
 ; AVX1-NEXT:    retq
 ;
 ; AVX512F-LABEL: round_f16:
 ; AVX512F:       ## %bb.0: ## %entry
-; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT:    movzwl %ax, %eax
+; AVX512F-NEXT:    movzwl %di, %eax
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
@@ -57,7 +59,7 @@ define half @round_f16(half %h) {
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
-; AVX512F-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512F-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512FP16-LABEL: round_f16:

diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll
index 6e087a383b1d1..6757f3825ba58 100644
--- a/llvm/test/CodeGen/X86/fp-roundeven.ll
+++ b/llvm/test/CodeGen/X86/fp-roundeven.ll
@@ -10,42 +10,44 @@ define half @roundeven_f16(half %h) {
 ; SSE2:       ## %bb.0: ## %entry
 ; SSE2-NEXT:    pushq %rax
 ; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    movzwl %di, %edi
 ; SSE2-NEXT:    callq ___extendhfsf2
 ; SSE2-NEXT:    callq _roundevenf
 ; SSE2-NEXT:    callq ___truncsfhf2
-; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    popq %rcx
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: roundeven_f16:
 ; SSE41:       ## %bb.0: ## %entry
 ; SSE41-NEXT:    pushq %rax
 ; SSE41-NEXT:    .cfi_def_cfa_offset 16
+; SSE41-NEXT:    movzwl %di, %edi
 ; SSE41-NEXT:    callq ___extendhfsf2
 ; SSE41-NEXT:    roundss $8, %xmm0, %xmm0
 ; SSE41-NEXT:    callq ___truncsfhf2
-; SSE41-NEXT:    popq %rax
+; SSE41-NEXT:    popq %rcx
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: roundeven_f16:
 ; AVX1:       ## %bb.0: ## %entry
 ; AVX1-NEXT:    pushq %rax
 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    movzwl %di, %edi
 ; AVX1-NEXT:    callq ___extendhfsf2
 ; AVX1-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    callq ___truncsfhf2
-; AVX1-NEXT:    popq %rax
+; AVX1-NEXT:    popq %rcx
 ; AVX1-NEXT:    retq
 ;
 ; AVX512F-LABEL: roundeven_f16:
 ; AVX512F:       ## %bb.0: ## %entry
-; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT:    movzwl %ax, %eax
+; AVX512F-NEXT:    movzwl %di, %eax
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
-; AVX512F-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512F-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512FP16-LABEL: roundeven_f16:

diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
index 37ce1cc567576..dd05e643f8df0 100644
--- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
@@ -19,9 +19,10 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp {
 ; X64-SSE-LABEL: TestFPExtF16_F128:
 ; X64-SSE:       # %bb.0: # %entry
 ; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    pinsrw $0, vf16(%rip), %xmm0
-; X64-SSE-NEXT:    callq __extendhftf2@PLT
-; X64-SSE-NEXT:    movdqa %xmm0, vf128(%rip)
+; X64-SSE-NEXT:    movzwl vf16(%rip), %edi
+; X64-SSE-NEXT:    callq __gnu_h2f_ieee@PLT
+; X64-SSE-NEXT:    callq __extendsftf2@PLT
+; X64-SSE-NEXT:    movaps %xmm0, vf128(%rip)
 ; X64-SSE-NEXT:    popq %rax
 ; X64-SSE-NEXT:    retq
 ;
@@ -217,9 +218,8 @@ define dso_local void @TestFPTruncF128_F16() nounwind strictfp {
 ; X64-SSE-LABEL: TestFPTruncF128_F16:
 ; X64-SSE:       # %bb.0: # %entry
 ; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movdqa vf128(%rip), %xmm0
+; X64-SSE-NEXT:    movaps vf128(%rip), %xmm0
 ; X64-SSE-NEXT:    callq __trunctfhf2@PLT
-; X64-SSE-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-SSE-NEXT:    movw %ax, vf16(%rip)
 ; X64-SSE-NEXT:    popq %rax
 ; X64-SSE-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll
index 6237790763124..111611b34b06a 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat.ll
@@ -139,17 +139,15 @@ define i32 @stest_f16i32(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; CHECK-NEXT:    cmovael %eax, %ecx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
-; CHECK-NEXT:    cmovbel %ecx, %edx
+; CHECK-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; CHECK-NEXT:    cmovbel %eax, %ecx
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovnpl %edx, %eax
+; CHECK-NEXT:    cmovnpl %ecx, %eax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -168,7 +166,8 @@ define i32 @utesth_f16i32(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    sarq $63, %rcx
@@ -196,7 +195,8 @@ define i32 @ustest_f16i32(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
 ; CHECK-NEXT:    movl $4294967295, %eax # imm = 0xFFFFFFFF
 ; CHECK-NEXT:    cmpq %rax, %rcx
@@ -343,17 +343,13 @@ define i16 @stest_f16i16(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $32768, %ecx # imm = 0x8000
-; CHECK-NEXT:    cmovael %eax, %ecx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $32767, %edx # imm = 0x7FFF
-; CHECK-NEXT:    cmovbel %ecx, %edx
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovnpl %edx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
@@ -373,7 +369,8 @@ define i16 @utesth_f16i16(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
 ; CHECK-NEXT:    cmpl $65535, %ecx # imm = 0xFFFF
 ; CHECK-NEXT:    movl $65535, %eax # imm = 0xFFFF
@@ -395,7 +392,8 @@ define i16 @ustest_f16i16(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; CHECK-NEXT:    movl $65535, %ecx # imm = 0xFFFF
@@ -564,17 +562,15 @@ define i64 @stest_f16i64(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; CHECK-NEXT:    cmovaeq %rax, %rcx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmovbeq %rcx, %rdx
+; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmovbeq %rax, %rcx
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovnpq %rdx, %rax
+; CHECK-NEXT:    cmovnpq %rcx, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -593,7 +589,9 @@ define i64 @utesth_f16i64(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __fixunshfti@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testq %rdx, %rdx
 ; CHECK-NEXT:    cmovneq %rcx, %rax
@@ -613,7 +611,9 @@ define i64 @ustest_f16i64(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testq %rdx, %rdx
 ; CHECK-NEXT:    movl $1, %esi
@@ -768,17 +768,15 @@ define i32 @stest_f16i32_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; CHECK-NEXT:    cmovael %eax, %ecx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
-; CHECK-NEXT:    cmovbel %ecx, %edx
+; CHECK-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; CHECK-NEXT:    cmovbel %eax, %ecx
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovnpl %edx, %eax
+; CHECK-NEXT:    cmovnpl %ecx, %eax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -795,7 +793,8 @@ define i32 @utesth_f16i32_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
 ; CHECK-NEXT:    movq %rcx, %rdx
 ; CHECK-NEXT:    sarq $63, %rdx
@@ -822,7 +821,8 @@ define i32 @ustest_f16i32_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
 ; CHECK-NEXT:    cmpq %rcx, %rax
@@ -957,17 +957,13 @@ define i16 @stest_f16i16_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $32768, %ecx # imm = 0x8000
-; CHECK-NEXT:    cmovael %eax, %ecx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $32767, %edx # imm = 0x7FFF
-; CHECK-NEXT:    cmovbel %ecx, %edx
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovnpl %edx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
@@ -985,7 +981,8 @@ define i16 @utesth_f16i16_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
 ; CHECK-NEXT:    cmpl $65535, %ecx # imm = 0xFFFF
 ; CHECK-NEXT:    movl $65535, %eax # imm = 0xFFFF
@@ -1006,7 +1003,8 @@ define i16 @ustest_f16i16_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; CHECK-NEXT:    movl $65535, %ecx # imm = 0xFFFF
@@ -1165,17 +1163,15 @@ define i64 @stest_f16i64_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; CHECK-NEXT:    cmovaeq %rax, %rcx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmovbeq %rcx, %rdx
+; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmovbeq %rax, %rcx
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovnpq %rdx, %rax
+; CHECK-NEXT:    cmovnpq %rcx, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -1192,7 +1188,9 @@ define i64 @utesth_f16i64_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __fixunshfti@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testq %rdx, %rdx
 ; CHECK-NEXT:    cmovneq %rcx, %rax
@@ -1213,7 +1211,9 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testq %rdx, %rdx
 ; CHECK-NEXT:    movl $1, %esi

diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 54e4fc8af6749..312fad5302ef1 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -395,33 +395,39 @@ entry:
 define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-LABEL: stest_f16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $72, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm3
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
@@ -442,7 +448,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    pand %xmm1, %xmm3
 ; CHECK-NEXT:    pandn %xmm2, %xmm1
 ; CHECK-NEXT:    por %xmm3, %xmm1
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; CHECK-NEXT:    movdqa (%rsp), %xmm7 # 16-byte Reload
 ; CHECK-NEXT:    movdqa %xmm7, %xmm3
 ; CHECK-NEXT:    pxor %xmm0, %xmm3
 ; CHECK-NEXT:    movdqa %xmm4, %xmm5
@@ -485,7 +491,13 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
-; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -501,13 +513,22 @@ entry:
 define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-LABEL: utesth_f16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $72, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %ecx, %ebp
+; CHECK-NEXT:    movl %edx, %r14d
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movzwl %si, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -516,10 +537,9 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    andq %rcx, %rdx
 ; CHECK-NEXT:    orq %rax, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -528,12 +548,11 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    andq %rcx, %rdx
 ; CHECK-NEXT:    orq %rax, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
-; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -543,9 +562,8 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    orq %rax, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -572,7 +590,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    pand %xmm5, %xmm0
 ; CHECK-NEXT:    pandn %xmm1, %xmm5
 ; CHECK-NEXT:    por %xmm0, %xmm5
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; CHECK-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
 ; CHECK-NEXT:    pxor %xmm6, %xmm2
 ; CHECK-NEXT:    movdqa %xmm4, %xmm0
 ; CHECK-NEXT:    pcmpgtd %xmm2, %xmm0
@@ -586,7 +604,13 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    pandn %xmm1, %xmm0
 ; CHECK-NEXT:    por %xmm6, %xmm0
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -600,33 +624,39 @@ entry:
 define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-LABEL: ustest_f16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $72, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm3
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
@@ -647,7 +677,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    pand %xmm1, %xmm3
 ; CHECK-NEXT:    pandn %xmm2, %xmm1
 ; CHECK-NEXT:    por %xmm3, %xmm1
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; CHECK-NEXT:    movdqa (%rsp), %xmm7 # 16-byte Reload
 ; CHECK-NEXT:    movdqa %xmm7, %xmm3
 ; CHECK-NEXT:    pxor %xmm0, %xmm3
 ; CHECK-NEXT:    movdqa %xmm4, %xmm5
@@ -683,7 +713,13 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    por %xmm2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -855,37 +891,52 @@ entry:
 define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-LABEL: stest_f16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 144
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %r9d, %ebp
+; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -893,29 +944,25 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -923,7 +970,19 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -939,37 +998,52 @@ entry:
 define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-LABEL: utesth_f16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 144
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %r9d, %ebp
+; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -977,29 +1051,25 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -1027,7 +1097,19 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    pslld $16, %xmm0
 ; CHECK-NEXT:    psrad $16, %xmm0
 ; CHECK-NEXT:    packssdw %xmm4, %xmm0
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -1041,37 +1123,52 @@ entry:
 define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-LABEL: ustest_f16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 144
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %r9d, %r15d
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    movl %esi, %r14d
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -1079,29 +1176,25 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r13d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -1132,7 +1225,19 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    pslld $16, %xmm0
 ; CHECK-NEXT:    psrad $16, %xmm0
 ; CHECK-NEXT:    packssdw %xmm3, %xmm0
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -1462,21 +1567,24 @@ entry:
 define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-LABEL: stest_f16i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    callq __fixhfti@PLT
-; CHECK-NEXT:    movq %rax, %r14
-; CHECK-NEXT:    movq %rdx, %rbx
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %esi, %r14d
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
+; CHECK-NEXT:    movq %rax, %rbx
+; CHECK-NEXT:    movq %rdx, %rbp
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
 ; CHECK-NEXT:    cmpq %rsi, %rax
@@ -1484,28 +1592,28 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-NEXT:    sbbq $0, %rdi
 ; CHECK-NEXT:    cmovgeq %rcx, %rdx
 ; CHECK-NEXT:    cmovgeq %rsi, %rax
-; CHECK-NEXT:    cmpq %rsi, %r14
-; CHECK-NEXT:    movq %rbx, %rdi
+; CHECK-NEXT:    cmpq %rsi, %rbx
+; CHECK-NEXT:    movq %rbp, %rdi
 ; CHECK-NEXT:    sbbq $0, %rdi
-; CHECK-NEXT:    cmovlq %rbx, %rcx
-; CHECK-NEXT:    cmovlq %r14, %rsi
-; CHECK-NEXT:    movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000
-; CHECK-NEXT:    cmpq %rsi, %r8
+; CHECK-NEXT:    cmovlq %rbp, %rcx
+; CHECK-NEXT:    cmovlq %rbx, %rsi
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    movq $-1, %rbp
 ; CHECK-NEXT:    movq $-1, %rbx
-; CHECK-NEXT:    movq $-1, %rdi
-; CHECK-NEXT:    sbbq %rcx, %rdi
-; CHECK-NEXT:    cmovgeq %r8, %rsi
-; CHECK-NEXT:    cmpq %rax, %r8
-; CHECK-NEXT:    sbbq %rdx, %rbx
-; CHECK-NEXT:    cmovgeq %r8, %rax
+; CHECK-NEXT:    sbbq %rcx, %rbx
+; CHECK-NEXT:    cmovgeq %rdi, %rsi
+; CHECK-NEXT:    cmpq %rax, %rdi
+; CHECK-NEXT:    sbbq %rdx, %rbp
+; CHECK-NEXT:    cmovgeq %rdi, %rax
 ; CHECK-NEXT:    movq %rax, %xmm1
 ; CHECK-NEXT:    movq %rsi, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    addq $8, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -1521,22 +1629,24 @@ entry:
 define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-LABEL: utesth_f16i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    callq __fixunshfti@PLT
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movzwl %si, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    movq %rax, %rbx
 ; CHECK-NEXT:    movq %rdx, %r14
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __fixunshfti@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testq %rdx, %rdx
 ; CHECK-NEXT:    cmovneq %rcx, %rax
@@ -1545,11 +1655,11 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-NEXT:    movq %rbx, %xmm1
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    addq $8, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -1563,28 +1673,31 @@ entry:
 define <2 x i64> @ustest_f16i64(<2 x half> %x) {
 ; CHECK-LABEL: ustest_f16i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %esi, %r14d
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movq %rax, %rbx
-; CHECK-NEXT:    movq %rdx, %r14
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    movq %rdx, %rbp
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testq %rdx, %rdx
 ; CHECK-NEXT:    movl $1, %esi
 ; CHECK-NEXT:    cmovgq %rsi, %rdx
 ; CHECK-NEXT:    cmovgq %rcx, %rax
-; CHECK-NEXT:    testq %r14, %r14
-; CHECK-NEXT:    cmovleq %r14, %rsi
+; CHECK-NEXT:    testq %rbp, %rbp
+; CHECK-NEXT:    cmovleq %rbp, %rsi
 ; CHECK-NEXT:    cmovgq %rcx, %rbx
 ; CHECK-NEXT:    movq %rbx, %rdi
 ; CHECK-NEXT:    negq %rdi
@@ -1599,11 +1712,11 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
 ; CHECK-NEXT:    movq %rax, %xmm1
 ; CHECK-NEXT:    movq %rbx, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    addq $8, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -2002,33 +2115,39 @@ entry:
 define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-LABEL: stest_f16i32_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $72, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm2
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
@@ -2049,7 +2168,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    pand %xmm1, %xmm2
 ; CHECK-NEXT:    pandn %xmm4, %xmm1
 ; CHECK-NEXT:    por %xmm2, %xmm1
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; CHECK-NEXT:    movdqa (%rsp), %xmm7 # 16-byte Reload
 ; CHECK-NEXT:    movdqa %xmm7, %xmm2
 ; CHECK-NEXT:    pxor %xmm0, %xmm2
 ; CHECK-NEXT:    movdqa %xmm3, %xmm5
@@ -2092,7 +2211,13 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -2106,13 +2231,22 @@ entry:
 define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-LABEL: utesth_f16i32_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $72, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %ecx, %ebp
+; CHECK-NEXT:    movl %edx, %r14d
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movzwl %si, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -2121,10 +2255,9 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    andq %rcx, %rdx
 ; CHECK-NEXT:    orq %rax, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -2133,12 +2266,11 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    andq %rcx, %rdx
 ; CHECK-NEXT:    orq %rax, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
-; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -2148,9 +2280,8 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    orq %rax, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -2177,7 +2308,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    pand %xmm4, %xmm0
 ; CHECK-NEXT:    pandn %xmm2, %xmm4
 ; CHECK-NEXT:    por %xmm0, %xmm4
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; CHECK-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
 ; CHECK-NEXT:    pxor %xmm6, %xmm1
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
 ; CHECK-NEXT:    pcmpgtd %xmm1, %xmm0
@@ -2191,7 +2322,13 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-NEXT:    por %xmm6, %xmm0
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
-; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -2204,33 +2341,39 @@ entry:
 define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-LABEL: ustest_f16i32_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $72, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movq %rax, %xmm2
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
@@ -2251,7 +2394,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    pand %xmm1, %xmm2
 ; CHECK-NEXT:    pandn %xmm4, %xmm1
 ; CHECK-NEXT:    por %xmm2, %xmm1
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; CHECK-NEXT:    movdqa (%rsp), %xmm7 # 16-byte Reload
 ; CHECK-NEXT:    movdqa %xmm7, %xmm2
 ; CHECK-NEXT:    pxor %xmm0, %xmm2
 ; CHECK-NEXT:    movdqa %xmm3, %xmm5
@@ -2287,7 +2430,13 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    por %xmm2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
-; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -2447,37 +2596,52 @@ entry:
 define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-LABEL: stest_f16i16_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 144
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %r9d, %ebp
+; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -2485,29 +2649,25 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -2515,7 +2675,19 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -2529,37 +2701,52 @@ entry:
 define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-LABEL: utesth_f16i16_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 144
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %r9d, %ebp
+; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -2567,29 +2754,25 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -2617,7 +2800,19 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    pslld $16, %xmm0
 ; CHECK-NEXT:    psrad $16, %xmm0
 ; CHECK-NEXT:    packssdw %xmm4, %xmm0
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -2630,37 +2825,52 @@ entry:
 define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-LABEL: ustest_f16i16_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 144
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %r9d, %r15d
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    movl %esi, %r14d
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -2668,29 +2878,25 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r13d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -2721,7 +2927,19 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    pslld $16, %xmm0
 ; CHECK-NEXT:    psrad $16, %xmm0
 ; CHECK-NEXT:    packssdw %xmm3, %xmm0
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -3065,21 +3283,24 @@ entry:
 define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-LABEL: stest_f16i64_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movq %rax, %rbx
 ; CHECK-NEXT:    movq %rdx, %r14
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
 ; CHECK-NEXT:    cmpq %rcx, %rax
 ; CHECK-NEXT:    movq %rcx, %rsi
@@ -3097,28 +3318,28 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NEXT:    cmoveq %rsi, %rcx
 ; CHECK-NEXT:    cmovsq %r14, %rdi
 ; CHECK-NEXT:    testq %rdi, %rdi
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000
-; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rbp # imm = 0x8000000000000000
+; CHECK-NEXT:    movq %rbp, %rsi
 ; CHECK-NEXT:    cmovnsq %rcx, %rsi
-; CHECK-NEXT:    cmpq %rbx, %rcx
-; CHECK-NEXT:    cmovbeq %rbx, %rcx
+; CHECK-NEXT:    cmpq %rbp, %rcx
+; CHECK-NEXT:    cmovbeq %rbp, %rcx
 ; CHECK-NEXT:    cmpq $-1, %rdi
 ; CHECK-NEXT:    cmovneq %rsi, %rcx
 ; CHECK-NEXT:    testq %rdx, %rdx
-; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movq %rbp, %rsi
 ; CHECK-NEXT:    cmovnsq %rax, %rsi
-; CHECK-NEXT:    cmpq %rbx, %rax
-; CHECK-NEXT:    cmovbeq %rbx, %rax
+; CHECK-NEXT:    cmpq %rbp, %rax
+; CHECK-NEXT:    cmovbeq %rbp, %rax
 ; CHECK-NEXT:    cmpq $-1, %rdx
 ; CHECK-NEXT:    cmovneq %rsi, %rax
 ; CHECK-NEXT:    movq %rax, %xmm1
 ; CHECK-NEXT:    movq %rcx, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    addq $8, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -3132,39 +3353,41 @@ entry:
 define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) {
 ; CHECK-LABEL: utesth_f16i64_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    callq __fixunshfti@PLT
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    movzwl %si, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    movq %rax, %rbx
-; CHECK-NEXT:    movq %rdx, %r14
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __fixunshfti@PLT
+; CHECK-NEXT:    movq %rdx, %rbp
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testq %rdx, %rdx
 ; CHECK-NEXT:    cmovneq %rcx, %rax
 ; CHECK-NEXT:    cmpq $1, %rdx
 ; CHECK-NEXT:    cmoveq %rcx, %rax
-; CHECK-NEXT:    testq %r14, %r14
+; CHECK-NEXT:    testq %rbp, %rbp
 ; CHECK-NEXT:    cmovneq %rcx, %rbx
-; CHECK-NEXT:    cmpq $1, %r14
+; CHECK-NEXT:    cmpq $1, %rbp
 ; CHECK-NEXT:    cmoveq %rcx, %rbx
 ; CHECK-NEXT:    movq %rbx, %xmm1
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    addq $8, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
@@ -3177,21 +3400,24 @@ entry:
 define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-LABEL: ustest_f16i64_mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %esi, %r14d
+; CHECK-NEXT:    movzwl %di, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movq %rax, %rbx
-; CHECK-NEXT:    movq %rdx, %r14
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __fixhfti@PLT
+; CHECK-NEXT:    movq %rdx, %rbp
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testq %rdx, %rdx
 ; CHECK-NEXT:    movl $1, %esi
@@ -3200,10 +3426,10 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NEXT:    cmovgq %rcx, %rax
 ; CHECK-NEXT:    cmpq $1, %rdx
 ; CHECK-NEXT:    cmoveq %rcx, %rax
-; CHECK-NEXT:    testq %r14, %r14
-; CHECK-NEXT:    cmovleq %r14, %rsi
+; CHECK-NEXT:    testq %rbp, %rbp
+; CHECK-NEXT:    cmovleq %rbp, %rsi
 ; CHECK-NEXT:    cmovgq %rcx, %rbx
-; CHECK-NEXT:    cmpq $1, %r14
+; CHECK-NEXT:    cmpq $1, %rbp
 ; CHECK-NEXT:    cmoveq %rcx, %rbx
 ; CHECK-NEXT:    testq %rsi, %rsi
 ; CHECK-NEXT:    cmovsq %rcx, %rbx
@@ -3212,11 +3438,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NEXT:    movq %rax, %xmm1
 ; CHECK-NEXT:    movq %rbx, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    addq $8, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:

diff  --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
index 8ace836987319..bcfac18d6f4f4 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
@@ -2052,7 +2052,6 @@ declare  i64 @llvm.fptosi.sat.i64.f16 (half)
 declare i100 @llvm.fptosi.sat.i100.f16(half)
 declare i128 @llvm.fptosi.sat.i128.f16(half)
 
-; FIXME: Can be optimized with maxss + minss
 define i1 @test_signed_i1_f16(half %f) nounwind {
 ; X86-X87-LABEL: test_signed_i1_f16:
 ; X86-X87:       # %bb.0:
@@ -2110,22 +2109,15 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_signed_i1_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $255, %eax
-; X86-SSE-NEXT:    cmovael %ecx, %eax
-; X86-SSE-NEXT:    xorl %ecx, %ecx
+; X86-SSE-NEXT:    maxss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmoval %ecx, %eax
-; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovpl %ecx, %eax
+; X86-SSE-NEXT:    minss %xmm0, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2133,17 +2125,13 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
 ; X64-LABEL: test_signed_i1_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    cvttss2si %xmm0, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %ecx, %eax
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmoval %ecx, %eax
-; X64-NEXT:    ucomiss %xmm0, %xmm0
-; X64-NEXT:    cmovpl %ecx, %eax
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    maxss %xmm0, %xmm1
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    minss %xmm1, %xmm0
+; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
@@ -2151,7 +2139,6 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
     ret i1 %x
 }
 
-; FIXME: Can be optimized with maxss + minss
 define i8 @test_signed_i8_f16(half %f) nounwind {
 ; X86-X87-LABEL: test_signed_i8_f16:
 ; X86-X87:       # %bb.0:
@@ -2205,22 +2192,15 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_signed_i8_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $128, %ecx
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $127, %edx
-; X86-SSE-NEXT:    cmovbel %ecx, %edx
-; X86-SSE-NEXT:    xorl %eax, %eax
-; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovnpl %edx, %eax
+; X86-SSE-NEXT:    maxss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    minss %xmm0, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2228,17 +2208,13 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
 ; X64-LABEL: test_signed_i8_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    maxss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    minss %xmm1, %xmm0
 ; X64-NEXT:    cvttss2si %xmm0, %eax
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $128, %ecx
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    cmovbel %ecx, %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    ucomiss %xmm0, %xmm0
-; X64-NEXT:    cmovnpl %edx, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
@@ -2246,7 +2222,6 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
     ret i8 %x
 }
 
-; FIXME: Can be optimized with maxss + minss
 define i13 @test_signed_i13_f16(half %f) nounwind {
 ; X86-X87-LABEL: test_signed_i13_f16:
 ; X86-X87:       # %bb.0:
@@ -2301,22 +2276,15 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_signed_i13_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $61440, %ecx # imm = 0xF000
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $4095, %edx # imm = 0xFFF
-; X86-SSE-NEXT:    cmovbel %ecx, %edx
-; X86-SSE-NEXT:    xorl %eax, %eax
-; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovnpl %edx, %eax
+; X86-SSE-NEXT:    maxss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    minss %xmm0, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2324,17 +2292,13 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
 ; X64-LABEL: test_signed_i13_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    maxss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    minss %xmm1, %xmm0
 ; X64-NEXT:    cvttss2si %xmm0, %eax
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $61440, %ecx # imm = 0xF000
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $4095, %edx # imm = 0xFFF
-; X64-NEXT:    cmovbel %ecx, %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    ucomiss %xmm0, %xmm0
-; X64-NEXT:    cmovnpl %edx, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
@@ -2342,7 +2306,6 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
     ret i13 %x
 }
 
-; FIXME: Can be optimized with maxss + minss
 define i16 @test_signed_i16_f16(half %f) nounwind {
 ; X86-X87-LABEL: test_signed_i16_f16:
 ; X86-X87:       # %bb.0:
@@ -2397,22 +2360,15 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_signed_i16_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $32768, %ecx # imm = 0x8000
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $32767, %edx # imm = 0x7FFF
-; X86-SSE-NEXT:    cmovbel %ecx, %edx
-; X86-SSE-NEXT:    xorl %eax, %eax
-; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovnpl %edx, %eax
+; X86-SSE-NEXT:    maxss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    minss %xmm0, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2420,17 +2376,13 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
 ; X64-LABEL: test_signed_i16_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    maxss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    minss %xmm1, %xmm0
 ; X64-NEXT:    cvttss2si %xmm0, %eax
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $32768, %ecx # imm = 0x8000
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $32767, %edx # imm = 0x7FFF
-; X64-NEXT:    cmovbel %ecx, %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    ucomiss %xmm0, %xmm0
-; X64-NEXT:    cmovnpl %edx, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
@@ -2438,7 +2390,6 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
     ret i16 %x
 }
 
-; FIXME: Can be optimized with maxss + minss
 define i19 @test_signed_i19_f16(half %f) nounwind {
 ; X86-X87-LABEL: test_signed_i19_f16:
 ; X86-X87:       # %bb.0:
@@ -2493,39 +2444,31 @@ define i19 @test_signed_i19_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_signed_i19_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $262143, %edx # imm = 0x3FFFF
-; X86-SSE-NEXT:    cmovbel %ecx, %edx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovnpl %edx, %eax
+; X86-SSE-NEXT:    maxss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cvttss2si %xmm0, %ecx
+; X86-SSE-NEXT:    cmovnpl %ecx, %eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-LABEL: test_signed_i19_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    cvttss2si %xmm0, %eax
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $262143, %edx # imm = 0x3FFFF
-; X64-NEXT:    cmovbel %ecx, %edx
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    ucomiss %xmm0, %xmm0
-; X64-NEXT:    cmovnpl %edx, %eax
+; X64-NEXT:    maxss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    cvttss2si %xmm0, %ecx
+; X64-NEXT:    cmovnpl %ecx, %eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
     %x = call i19 @llvm.fptosi.sat.i19.f16(half %f)
@@ -2586,39 +2529,33 @@ define i32 @test_signed_i32_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_signed_i32_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ecx, %edx
+; X86-SSE-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    cmovbel %eax, %ecx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovnpl %edx, %eax
+; X86-SSE-NEXT:    cmovnpl %ecx, %eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-LABEL: test_signed_i32_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
-; X64-NEXT:    cmovbel %ecx, %edx
+; X64-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-NEXT:    cmovbel %eax, %ecx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    ucomiss %xmm0, %xmm0
-; X64-NEXT:    cmovnpl %edx, %eax
+; X64-NEXT:    cmovnpl %ecx, %eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
     %x = call i32 @llvm.fptosi.sat.i32.f16(half %f)
@@ -2697,10 +2634,9 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $24, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
@@ -2733,7 +2669,8 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ; X64-LABEL: test_signed_i50_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    cvttss2si %xmm0, %rax
 ; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movabsq $-562949953421312, %rcx # imm = 0xFFFE000000000000
@@ -2822,10 +2759,9 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $24, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
@@ -2858,17 +2794,15 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ; X64-LABEL: test_signed_i64_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    cvttss2si %xmm0, %rax
 ; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X64-NEXT:    cmovaeq %rax, %rcx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    cmovbeq %rcx, %rdx
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    cmovbeq %rax, %rcx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    ucomiss %xmm0, %xmm0
-; X64-NEXT:    cmovnpq %rdx, %rax
+; X64-NEXT:    cmovnpq %rcx, %rax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
     %x = call i64 @llvm.fptosi.sat.i64.f16(half %f)
@@ -2974,11 +2908,10 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $44, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -3030,7 +2963,8 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X64-LABEL: test_signed_i100_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    callq __fixsfti@PLT
 ; X64-NEXT:    xorl %ecx, %ecx
@@ -3159,11 +3093,10 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $44, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -3211,7 +3144,8 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; X64-LABEL: test_signed_i128_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    callq __fixsfti@PLT
 ; X64-NEXT:    xorl %ecx, %ecx

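For context on the expectation changes above: @llvm.fptosi.sat.* converts a
float to a signed integer with saturating semantics, so out-of-range inputs
clamp to the target type's minimum/maximum and NaN maps to 0. That is why the
checked x86 code is either a ucomiss-plus-cmov cascade or, where the bounds
are exactly representable in float, a maxss/minss clamp feeding cvttss2si.
A minimal scalar sketch of the pattern under test (illustrative only, not
part of this patch; @sat_example is a made-up name):

; Saturating conversion under test: inputs below -128.0 yield -128,
; inputs above 127.0 yield 127, and NaN yields 0.
declare i8 @llvm.fptosi.sat.i8.f16(half)

define i8 @sat_example(half %f) nounwind {
  %x = call i8 @llvm.fptosi.sat.i8.f16(half %f)
  ret i8 %x
}

Running this through llc for an x86-64 triple should reproduce one of the two
test_signed_i8_f16 sequences above, depending on which side of this revert is
checked out.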
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
index 2dfd226284832..a568e3673846f 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
@@ -542,119 +542,97 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-LABEL: test_signed_v8i1_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    movl %r9d, %ebp
+; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $65535, %ebp # imm = 0xFFFF
-; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-NEXT:    cmoval %ebx, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebx, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebx, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebx, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebx, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebx, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebx, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebx, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
@@ -662,8 +640,12 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %x = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> %f)
@@ -676,121 +658,99 @@ define <8 x i8> @test_signed_v8i8_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $48, %rsp
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    cvttss2si %xmm0, %ebp
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $128, %r14d
-; CHECK-NEXT:    cmovbl %r14d, %ebp
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $127, %r12d
-; CHECK-NEXT:    cmoval %r12d, %ebp
-; CHECK-NEXT:    xorl %r15d, %r15d
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %r15d, %ebp
-; CHECK-NEXT:    shll $8, %ebp
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movl %r9d, %r13d
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, %r14d
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    movl %edi, %r15d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
+; CHECK-NEXT:    cvttss2si %xmm0, %r12d
+; CHECK-NEXT:    shll $8, %r12d
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r12d, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %r15d, %eax
 ; CHECK-NEXT:    movzbl %al, %ebx
-; CHECK-NEXT:    orl %ebp, %ebx
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    orl %r12d, %ebx
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %ebp
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %ebp
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r12d, %ebp
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %r15d, %ebp
 ; CHECK-NEXT:    shll $8, %ebp
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r12d, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %r15d, %eax
 ; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    orl %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    pinsrw $1, %ebx, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %ebx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %ebx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r12d, %ebx
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %r15d, %ebx
 ; CHECK-NEXT:    shll $8, %ebx
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r12d, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %r15d, %eax
 ; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    orl %ebx, %eax
 ; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    pinsrw $2, %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %ebx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %ebx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r12d, %ebx
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %r15d, %ebx
 ; CHECK-NEXT:    shll $8, %ebx
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r12d, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %r15d, %eax
 ; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    orl %ebx, %eax
 ; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    pinsrw $3, %eax, %xmm0
-; CHECK-NEXT:    addq $48, %rsp
+; CHECK-NEXT:    addq $40, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
@@ -803,120 +763,97 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
 ; CHECK-LABEL: test_signed_v8i16_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $128, %rsp
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    movl %r9d, %ebp
+; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $32768, %r14d # imm = 0x8000
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $32767, %ebp # imm = 0x7FFF
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
@@ -924,9 +861,12 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    addq $128, %rsp
+; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %x = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> %f)
@@ -937,129 +877,112 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
 ; CHECK-LABEL: test_signed_v8i32_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $128, %rsp
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $88, %rsp
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, %r14d
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $-2147483648, %r14d # imm = 0x80000000
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $2147483647, %ebp # imm = 0x7FFFFFFF
-; CHECK-NEXT:    cmoval %ebp, %eax
-; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    movl $2147483647, %r15d # imm = 0x7FFFFFFF
+; CHECK-NEXT:    cmoval %r15d, %eax
+; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
+; CHECK-NEXT:    cmovpl %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
+; CHECK-NEXT:    cmovpl %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
+; CHECK-NEXT:    cmovpl %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
+; CHECK-NEXT:    cmovpl %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r13d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
+; CHECK-NEXT:    cmovpl %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
+; CHECK-NEXT:    cmovpl %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
+; CHECK-NEXT:    cmovpl %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpl %ebx, %eax
+; CHECK-NEXT:    cmovpl %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    addq $128, %rsp
+; CHECK-NEXT:    addq $88, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> %f)
@@ -1069,129 +992,111 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
 define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-LABEL: test_signed_v8i64_v8f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $128, %rsp
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $104, %rsp
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movabsq $-9223372036854775808, %r14 # imm = 0x8000000000000000
-; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movabsq $9223372036854775807, %rbx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmovaq %rbx, %rax
-; CHECK-NEXT:    xorl %r15d, %r15d
+; CHECK-NEXT:    movabsq $9223372036854775807, %r15 # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmovaq %r15, %rax
+; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r15, %rax
+; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rax
+; CHECK-NEXT:    cmovaq %r15, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r15, %rax
+; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rax
+; CHECK-NEXT:    cmovaq %r15, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r15, %rax
+; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rax
+; CHECK-NEXT:    cmovaq %r15, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r15, %rax
+; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rax
+; CHECK-NEXT:    cmovaq %r15, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r15, %rax
+; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rax
+; CHECK-NEXT:    cmovaq %r15, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r15, %rax
+; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rax
+; CHECK-NEXT:    cmovaq %r15, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r15, %rax
+; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rax
+; CHECK-NEXT:    cmovaq %r15, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r15, %rax
-; CHECK-NEXT:    movq %rax, %xmm3
-; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm3 = xmm3[0],mem[0]
+; CHECK-NEXT:    cmovpq %r12, %rax
+; CHECK-NEXT:    movq %rax, %xmm2
+; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm2 = xmm2[0],mem[0]
 ; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; CHECK-NEXT:    addq $128, %rsp
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; CHECK-NEXT:    addq $104, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> %f)
   ret <8 x i64> %x
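
For anyone reading the hunks rather than regenerating them: on both sides of
this diff, each lane of @llvm.fptosi.sat.v8i64.v8f16 is a cvttss2si truncation
plus ucomiss range checks, with cmovb/cmova/cmovp picking the saturation
values. A minimal C sketch of those per-lane semantics, my own illustration
rather than code from this patch:

#include <stdint.h>
#include <math.h>

/* fptosi.sat.i64 semantics for one float lane */
static int64_t fptosi_sat_i64(float x) {
  if (isnan(x))
    return 0;            /* the ucomiss/cmovpq pair maps NaN to 0 */
  if (x < -0x1p63f)
    return INT64_MIN;    /* cmovbq: below range saturates to INT64_MIN */
  if (x >= 0x1p63f)
    return INT64_MAX;    /* cmovaq: above range saturates to INT64_MAX */
  return (int64_t)x;     /* cvttss2si: plain truncation when in range */
}

The sides also differ in how the half value reaches float: __extendhfsf2 on an
xmm argument versus __gnu_h2f_ieee on a zero-extended i16.
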
@@ -1207,116 +1112,113 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    subq $88, %rsp
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %ecx, %r14d
+; CHECK-NEXT:    movl %edx, %r12d
 ; CHECK-NEXT:    movq %rdi, %rbx
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl %si, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixsfti@PLT
-; CHECK-NEXT:    xorl %r12d, %r12d
+; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rax
+; CHECK-NEXT:    cmovbq %rbp, %rax
 ; CHECK-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
 ; CHECK-NEXT:    cmovbq %rcx, %rdx
-; CHECK-NEXT:    movq %rcx, %r14
+; CHECK-NEXT:    movq %rcx, %r13
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmovaq %rcx, %rdx
-; CHECK-NEXT:    movq %rcx, %rbp
+; CHECK-NEXT:    movabsq $9223372036854775807, %r15 # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmovaq %r15, %rdx
 ; CHECK-NEXT:    movq $-1, %rcx
 ; CHECK-NEXT:    cmovaq %rcx, %rax
-; CHECK-NEXT:    movq $-1, %r15
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r12, %rax
+; CHECK-NEXT:    cmovpq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovpq %r12, %rdx
+; CHECK-NEXT:    cmovpq %rbp, %rdx
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rax
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %rbp, %rax
+; CHECK-NEXT:    cmovbq %r13, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbp, %rdx
-; CHECK-NEXT:    cmovaq %r15, %rax
-; CHECK-NEXT:    movq $-1, %r15
+; CHECK-NEXT:    cmovaq %r15, %rdx
+; CHECK-NEXT:    movq $-1, %rcx
+; CHECK-NEXT:    cmovaq %rcx, %rax
+; CHECK-NEXT:    movq $-1, %r12
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r12, %rax
+; CHECK-NEXT:    cmovpq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovpq %r12, %rdx
+; CHECK-NEXT:    cmovpq %rbp, %rdx
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rax
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %rbp, %rax
+; CHECK-NEXT:    cmovbq %r13, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbp, %rdx
-; CHECK-NEXT:    cmovaq %r15, %rax
-; CHECK-NEXT:    movq $-1, %r15
+; CHECK-NEXT:    cmovaq %r15, %rdx
+; CHECK-NEXT:    cmovaq %r12, %rax
+; CHECK-NEXT:    movq $-1, %r14
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r12, %rax
+; CHECK-NEXT:    cmovpq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovpq %r12, %rdx
+; CHECK-NEXT:    cmovpq %rbp, %rdx
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rax
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %rbp, %rax
+; CHECK-NEXT:    cmovbq %r13, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbp, %rdx
-; CHECK-NEXT:    movq %rbp, %r13
-; CHECK-NEXT:    cmovaq %r15, %rax
-; CHECK-NEXT:    movq $-1, %r15
+; CHECK-NEXT:    cmovaq %r15, %rdx
+; CHECK-NEXT:    cmovaq %r14, %rax
+; CHECK-NEXT:    movq $-1, %r14
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r12, %rax
+; CHECK-NEXT:    cmovpq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovpq %r12, %rdx
+; CHECK-NEXT:    cmovpq %rbp, %rdx
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixsfti@PLT
-; CHECK-NEXT:    movq %rdx, %rbp
+; CHECK-NEXT:    movq %rdx, %r12
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rax
-; CHECK-NEXT:    cmovbq %r14, %rbp
+; CHECK-NEXT:    cmovbq %rbp, %rax
+; CHECK-NEXT:    cmovbq %r13, %r12
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r13, %rbp
-; CHECK-NEXT:    cmovaq %r15, %rax
+; CHECK-NEXT:    cmovaq %r15, %r12
+; CHECK-NEXT:    cmovaq %r14, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r12, %rax
+; CHECK-NEXT:    cmovpq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovpq %r12, %rbp
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    cmovpq %rbp, %r12
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movq %rax, %r14
@@ -1324,41 +1226,39 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %r14
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    cmovbq %rax, %r15
+; CHECK-NEXT:    cmovbq %rbp, %r14
+; CHECK-NEXT:    cmovbq %r13, %r15
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r13, %r15
+; CHECK-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmovaq %rax, %r15
 ; CHECK-NEXT:    movq $-1, %rax
 ; CHECK-NEXT:    cmovaq %rax, %r14
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %r12, %r14
-; CHECK-NEXT:    cmovpq %r12, %r15
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    cmovpq %rbp, %r14
+; CHECK-NEXT:    cmovpq %rbp, %r15
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixsfti@PLT
-; CHECK-NEXT:    movq %rax, %r12
-; CHECK-NEXT:    movq %rdx, %r13
+; CHECK-NEXT:    movq %rax, %r13
+; CHECK-NEXT:    movq %rdx, %rbp
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    movl $0, %eax
-; CHECK-NEXT:    cmovbq %rax, %r12
+; CHECK-NEXT:    cmovbq %rax, %r13
 ; CHECK-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; CHECK-NEXT:    cmovbq %rcx, %r13
+; CHECK-NEXT:    cmovbq %rcx, %rbp
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmovaq %rcx, %r13
+; CHECK-NEXT:    cmovaq %rcx, %rbp
 ; CHECK-NEXT:    movq $-1, %rcx
-; CHECK-NEXT:    cmovaq %rcx, %r12
+; CHECK-NEXT:    cmovaq %rcx, %r13
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    cmovpq %rax, %r12
 ; CHECK-NEXT:    cmovpq %rax, %r13
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    cmovpq %rax, %rbp
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixsfti@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
@@ -1379,11 +1279,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cmovpq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, 120(%rbx)
 ; CHECK-NEXT:    movq %rax, 112(%rbx)
-; CHECK-NEXT:    movq %r13, 104(%rbx)
-; CHECK-NEXT:    movq %r12, 96(%rbx)
+; CHECK-NEXT:    movq %rbp, 104(%rbx)
+; CHECK-NEXT:    movq %r13, 96(%rbx)
 ; CHECK-NEXT:    movq %r15, 88(%rbx)
 ; CHECK-NEXT:    movq %r14, 80(%rbx)
-; CHECK-NEXT:    movq %rbp, 72(%rbx)
+; CHECK-NEXT:    movq %r12, 72(%rbx)
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT:    movq %rax, 64(%rbx)
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload

diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
index 5fbf26c9d166a..afc7c71963b5b 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
@@ -1922,20 +1922,15 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i1_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    xorl %ecx, %ecx
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $1, %eax
-; X86-SSE-NEXT:    cmovbel %ecx, %eax
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE-NEXT:    maxss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    minss %xmm0, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -1943,15 +1938,13 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i1_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    cvttss2si %xmm0, %eax
-; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $1, %eax
-; X64-NEXT:    cmovbel %ecx, %eax
+; X64-NEXT:    maxss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    minss %xmm1, %xmm0
+; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
@@ -2004,20 +1997,15 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i8_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    xorl %ecx, %ecx
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $255, %eax
-; X86-SSE-NEXT:    cmovbel %ecx, %eax
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE-NEXT:    maxss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    minss %xmm0, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2025,15 +2013,13 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i8_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    cvttss2si %xmm0, %eax
-; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovbel %ecx, %eax
+; X64-NEXT:    maxss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    minss %xmm1, %xmm0
+; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
@@ -2085,20 +2071,15 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i13_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    xorl %ecx, %ecx
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $8191, %eax # imm = 0x1FFF
-; X86-SSE-NEXT:    cmovbel %ecx, %eax
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE-NEXT:    maxss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    minss %xmm0, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2106,15 +2087,13 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i13_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    cvttss2si %xmm0, %eax
-; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $8191, %eax # imm = 0x1FFF
-; X64-NEXT:    cmovbel %ecx, %eax
+; X64-NEXT:    maxss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    minss %xmm1, %xmm0
+; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
@@ -2166,20 +2145,15 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i16_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    xorl %ecx, %ecx
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovael %eax, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X86-SSE-NEXT:    cmovbel %ecx, %eax
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE-NEXT:    maxss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    minss %xmm0, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2187,15 +2161,13 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i16_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    cvttss2si %xmm0, %eax
-; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X64-NEXT:    cmovbel %ecx, %eax
+; X64-NEXT:    maxss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    minss %xmm1, %xmm0
+; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
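
For the narrow unsigned widths here (i1/i8/i13/i16), the '+' side clamps with
maxss/minss and then does one signed cvttss2si, instead of comparing and
cmov-ing after the convert. The clamp pins the value into [0, 2^N - 1]; NaN
propagates through maxss/minss with these operand orders, and cvttss2si then
produces the 0x80000000 sentinel, whose low N bits are zero, so NaN still
comes out as 0. A C sketch of the i16 case, my own illustration of the
intrinsic's semantics rather than code from this patch:

#include <stdint.h>
#include <math.h>

/* fptoui.sat.i16 semantics, mirroring the maxss/minss clamp */
static uint16_t fptoui_sat_i16(float x) {
  if (isnan(x))
    return 0;           /* saturating conversions define NaN -> 0 */
  if (x < 0.0f)
    return 0;           /* maxss against 0.0 clamps the low end */
  if (x > 65535.0f)
    return 65535;       /* minss against 65535.0 clamps the high end */
  return (uint16_t)x;   /* cvttss2si, of which only %ax is kept */
}
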
@@ -2247,42 +2219,27 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i19_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    movl %eax, %ecx
-; X86-SSE-NEXT:    sarl $31, %ecx
-; X86-SSE-NEXT:    movaps %xmm0, %xmm1
-; X86-SSE-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT:    cvttss2si %xmm1, %edx
-; X86-SSE-NEXT:    andl %ecx, %edx
-; X86-SSE-NEXT:    orl %eax, %edx
-; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
-; X86-SSE-NEXT:    cmovael %edx, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $524287, %eax # imm = 0x7FFFF
-; X86-SSE-NEXT:    cmovbel %ecx, %eax
+; X86-SSE-NEXT:    maxss %xmm1, %xmm0
+; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-LABEL: test_unsigned_i19_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    cvttss2si %xmm0, %rax
-; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovael %eax, %ecx
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    movl $524287, %eax # imm = 0x7FFFF
-; X64-NEXT:    cmovbel %ecx, %eax
+; X64-NEXT:    maxss %xmm1, %xmm0
+; X64-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
     %x = call i19 @llvm.fptoui.sat.i19.f16(half %f)
@@ -2333,10 +2290,9 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i32_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
@@ -2360,7 +2316,8 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i32_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    cvttss2si %xmm0, %rax
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    xorps %xmm1, %xmm1
@@ -2449,10 +2406,9 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $24, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -2496,19 +2452,13 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i50_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    cvttss2si %xmm0, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    movaps %xmm0, %xmm1
-; X64-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    cvttss2si %xmm1, %rdx
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    ucomiss %xmm1, %xmm0
-; X64-NEXT:    cmovaeq %rdx, %rcx
+; X64-NEXT:    cmovaeq %rax, %rcx
 ; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movabsq $1125899906842623, %rax # imm = 0x3FFFFFFFFFFFF
 ; X64-NEXT:    cmovbeq %rcx, %rax
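
The sarq/subss/andq/orq sequence being deleted here is the standard SSE2
two-conversion trick for float to u64: cvttss2si is signed-only, so inputs at
or above 2^63 are converted as x - 2^63, and the overflow sentinel of the
first conversion selects which result survives. A sketch, mine rather than
patch code, assuming 0 <= x < 2^64, x not NaN (the surrounding ucomiss/cmov
lines handle those cases), and an arithmetic right shift as on x86:

#include <stdint.h>

static uint64_t fptoui_u64_trick(float x) {
  /* cvttss2si yields INT64_MIN, the invalid-op sentinel, on overflow */
  int64_t lo   = (x < 0x1p63f) ? (int64_t)x : INT64_MIN;
  int64_t hi   = (int64_t)(x - 0x1p63f);  /* meaningful only when x >= 2^63 */
  int64_t mask = lo >> 63;                /* all ones iff lo overflowed */
  return (uint64_t)(lo | (mask & hi));    /* the orq puts the 2^63 bit back */
}

For i50 the saturation bound 0x3FFFFFFFFFFFF fits a signed 64-bit convert,
which is why the '+' side can keep a single cvttss2si.
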
@@ -2590,10 +2540,9 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i64_f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $28, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -2635,7 +2584,8 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i64_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    cvttss2si %xmm0, %rax
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    sarq $63, %rcx
@@ -2739,11 +2689,10 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $32, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -2790,7 +2739,8 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i100_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    callq __fixunssfti@PLT
 ; X64-NEXT:    xorl %ecx, %ecx
@@ -2890,11 +2840,10 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $32, %esp
-; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    calll __gnu_h2f_ieee
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -2939,7 +2888,8 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
 ; X64-LABEL: test_unsigned_i128_f16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %di, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    callq __fixunssfti@PLT
 ; X64-NEXT:    xorl %ecx, %ecx

diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
index 7d5f075b64f30..44b6ca0c55d64 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
@@ -541,103 +541,97 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-LABEL: test_unsigned_v8i1_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    movl %r9d, %ebp
+; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $1, %ebp
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
+; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
@@ -645,8 +639,12 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %x = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> %f)
@@ -659,103 +657,99 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $56, %rsp
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    cvttss2si %xmm0, %ebp
-; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movl %r9d, %r13d
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, %r14d
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    movl %edi, %r15d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %ebp
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $255, %r15d
-; CHECK-NEXT:    cmoval %r15d, %ebp
-; CHECK-NEXT:    shll $8, %ebp
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
+; CHECK-NEXT:    cvttss2si %xmm0, %r12d
+; CHECK-NEXT:    shll $8, %r12d
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    movzbl %al, %ebx
-; CHECK-NEXT:    orl %ebp, %ebx
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    orl %r12d, %ebx
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %ebp
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %ebp
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r15d, %ebp
 ; CHECK-NEXT:    shll $8, %ebp
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    orl %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    pinsrw $1, %ebx, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %ebx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %ebx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r15d, %ebx
 ; CHECK-NEXT:    shll $8, %ebx
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    orl %ebx, %eax
 ; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    pinsrw $2, %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %ebx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %ebx
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r15d, %ebx
 ; CHECK-NEXT:    shll $8, %ebx
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %r14d, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %r15d, %eax
 ; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    orl %ebx, %eax
 ; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    pinsrw $3, %eax, %xmm0
-; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    addq $40, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
@@ -768,103 +762,97 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind {
 ; CHECK-LABEL: test_unsigned_v8i16_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    movl %r9d, %ebp
+; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $65535, %ebp # imm = 0xFFFF
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
+; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    maxss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
@@ -872,8 +860,12 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %x = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %f)
@@ -884,111 +876,113 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind {
 ; CHECK-LABEL: test_unsigned_v8i32_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm3, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $88, %rsp
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, %r14d
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    movzwl %cx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
-; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    xorl %r15d, %r15d
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
+; CHECK-NEXT:    cmovbl %r15d, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movl $-1, %ebp
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    movl $-1, %r12d
+; CHECK-NEXT:    cmoval %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
+; CHECK-NEXT:    cmovbl %r15d, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
+; CHECK-NEXT:    cmovbl %r15d, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
+; CHECK-NEXT:    cmovbl %r15d, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl %r13d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
+; CHECK-NEXT:    cmovbl %r15d, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
+; CHECK-NEXT:    cmovbl %r15d, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
+; CHECK-NEXT:    cmovbl %r15d, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbl %ebx, %eax
+; CHECK-NEXT:    cmovbl %r15d, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmoval %ebp, %eax
+; CHECK-NEXT:    cmoval %r12d, %eax
 ; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    addq $88, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %x = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> %f)
@@ -998,18 +992,22 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind {
 define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-LABEL: test_unsigned_v8i64_v8f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $136, %rsp
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $104, %rsp
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT:    cvttss2si %xmm1, %rax
@@ -1018,19 +1016,18 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    sarq $63, %rdx
 ; CHECK-NEXT:    andq %rax, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
-; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r15d, %r15d
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %r15, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movq $-1, %rbx
-; CHECK-NEXT:    cmovaq %rbx, %rdx
+; CHECK-NEXT:    movq $-1, %r12
+; CHECK-NEXT:    cmovaq %r12, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT:    cvttss2si %xmm1, %rax
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -1039,17 +1036,16 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    andq %rax, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %r15, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rdx
+; CHECK-NEXT:    cmovaq %r12, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT:    cvttss2si %xmm1, %rax
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -1058,15 +1054,14 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    andq %rax, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %r15, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rdx
+; CHECK-NEXT:    cmovaq %r12, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT:    cvttss2si %xmm1, %rax
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -1075,17 +1070,16 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    andq %rax, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %r15, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rdx
+; CHECK-NEXT:    cmovaq %r12, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT:    cvttss2si %xmm1, %rax
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -1094,15 +1088,14 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    andq %rax, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %r15, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rdx
+; CHECK-NEXT:    cmovaq %r12, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT:    cvttss2si %xmm1, %rax
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -1111,17 +1104,16 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    andq %rax, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %r15, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rdx
+; CHECK-NEXT:    cmovaq %r12, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT:    cvttss2si %xmm1, %rax
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -1130,15 +1122,14 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    andq %rax, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %r15, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rdx
+; CHECK-NEXT:    cmovaq %r12, %rdx
 ; CHECK-NEXT:    movq %rdx, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT:    cvttss2si %xmm1, %rax
 ; CHECK-NEXT:    cvttss2si %xmm0, %rcx
@@ -1147,18 +1138,22 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    andq %rax, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    cmovbq %r15, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbx, %rdx
-; CHECK-NEXT:    movq %rdx, %xmm3
-; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm3 = xmm3[0],mem[0]
+; CHECK-NEXT:    cmovaq %r12, %rdx
+; CHECK-NEXT:    movq %rdx, %xmm2
+; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm2 = xmm2[0],mem[0]
 ; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; CHECK-NEXT:    addq $104, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> %f)
   ret <8 x i64> %x
@@ -1174,93 +1169,92 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    subq $88, %rsp
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, %r15d
+; CHECK-NEXT:    movl %ecx, %r14d
+; CHECK-NEXT:    movl %edx, %r12d
 ; CHECK-NEXT:    movq %rdi, %rbx
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl %si, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixunssfti@PLT
-; CHECK-NEXT:    xorl %r12d, %r12d
+; CHECK-NEXT:    xorl %r13d, %r13d
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm1
-; CHECK-NEXT:    cmovbq %r12, %rdx
-; CHECK-NEXT:    cmovbq %r12, %rax
+; CHECK-NEXT:    cmovbq %r13, %rdx
+; CHECK-NEXT:    cmovbq %r13, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    movq $-1, %r13
-; CHECK-NEXT:    cmovaq %r13, %rax
+; CHECK-NEXT:    movq $-1, %rbp
+; CHECK-NEXT:    cmovaq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovaq %r13, %rdx
+; CHECK-NEXT:    cmovaq %rbp, %rdx
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r12w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rdx
-; CHECK-NEXT:    cmovbq %r12, %rax
+; CHECK-NEXT:    cmovbq %r13, %rdx
+; CHECK-NEXT:    cmovbq %r13, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r13, %rax
+; CHECK-NEXT:    cmovaq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovaq %r13, %rdx
+; CHECK-NEXT:    cmovaq %rbp, %rdx
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rdx
-; CHECK-NEXT:    cmovbq %r12, %rax
+; CHECK-NEXT:    cmovbq %r13, %rdx
+; CHECK-NEXT:    cmovbq %r13, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r13, %rax
+; CHECK-NEXT:    cmovaq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovaq %r13, %rdx
+; CHECK-NEXT:    cmovaq %rbp, %rdx
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rdx
-; CHECK-NEXT:    cmovbq %r12, %rax
+; CHECK-NEXT:    cmovbq %r13, %rdx
+; CHECK-NEXT:    cmovbq %r13, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r13, %rax
+; CHECK-NEXT:    cmovaq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovaq %r13, %rdx
+; CHECK-NEXT:    cmovaq %rbp, %rdx
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixunssfti@PLT
-; CHECK-NEXT:    movq %rdx, %rbp
+; CHECK-NEXT:    movq %rdx, %r12
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %rbp
-; CHECK-NEXT:    cmovbq %r12, %rax
+; CHECK-NEXT:    cmovbq %r13, %r12
+; CHECK-NEXT:    cmovbq %r13, %rax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r13, %rax
+; CHECK-NEXT:    cmovaq %rbp, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    cmovaq %r13, %rbp
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    cmovaq %rbp, %r12
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    movq %rax, %r14
@@ -1268,31 +1262,29 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovbq %r12, %r15
-; CHECK-NEXT:    cmovbq %r12, %r14
+; CHECK-NEXT:    cmovbq %r13, %r15
+; CHECK-NEXT:    cmovbq %r13, %r14
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r13, %r14
-; CHECK-NEXT:    cmovaq %r13, %r15
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    cmovaq %rbp, %r14
+; CHECK-NEXT:    cmovaq %rbp, %r15
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixunssfti@PLT
-; CHECK-NEXT:    movq %rax, %r12
-; CHECK-NEXT:    movq %rdx, %r13
+; CHECK-NEXT:    movq %rax, %r13
+; CHECK-NEXT:    movq %rdx, %rbp
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    movl $0, %eax
+; CHECK-NEXT:    cmovbq %rax, %rbp
 ; CHECK-NEXT:    cmovbq %rax, %r13
-; CHECK-NEXT:    cmovbq %rax, %r12
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    cmovaq %rax, %r12
 ; CHECK-NEXT:    cmovaq %rax, %r13
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    cmovaq %rax, %rbp
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    callq __fixunssfti@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
@@ -1307,11 +1299,11 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cmovaq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, 120(%rbx)
 ; CHECK-NEXT:    movq %rax, 112(%rbx)
-; CHECK-NEXT:    movq %r13, 104(%rbx)
-; CHECK-NEXT:    movq %r12, 96(%rbx)
+; CHECK-NEXT:    movq %rbp, 104(%rbx)
+; CHECK-NEXT:    movq %r13, 96(%rbx)
 ; CHECK-NEXT:    movq %r15, 88(%rbx)
 ; CHECK-NEXT:    movq %r14, 80(%rbx)
-; CHECK-NEXT:    movq %rbp, 72(%rbx)
+; CHECK-NEXT:    movq %r12, 72(%rbx)
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT:    movq %rax, 64(%rbx)
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload

diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll
index 28b65aee746df..36156a77dfd35 100644
--- a/llvm/test/CodeGen/X86/freeze.ll
+++ b/llvm/test/CodeGen/X86/freeze.ll
@@ -38,10 +38,14 @@ define half @freeze_half() {
 ; X86ASM:       # %bb.0:
 ; X86ASM-NEXT:    pushq %rax
 ; X86ASM-NEXT:    .cfi_def_cfa_offset 16
-; X86ASM-NEXT:    callq __extendhfsf2@PLT
+; X86ASM-NEXT:    xorl %edi, %edi
+; X86ASM-NEXT:    callq __gnu_h2f_ieee@PLT
+; X86ASM-NEXT:    callq __gnu_f2h_ieee@PLT
+; X86ASM-NEXT:    movzwl %ax, %edi
+; X86ASM-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X86ASM-NEXT:    addss %xmm0, %xmm0
-; X86ASM-NEXT:    callq __truncsfhf2@PLT
-; X86ASM-NEXT:    popq %rax
+; X86ASM-NEXT:    callq __gnu_f2h_ieee@PLT
+; X86ASM-NEXT:    popq %rcx
 ; X86ASM-NEXT:    .cfi_def_cfa_offset 8
 ; X86ASM-NEXT:    retq
   %y1 = freeze half undef

diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll
index b1fcbb05b2fd7..50e7d62fb4a2e 100644
--- a/llvm/test/CodeGen/X86/frem.ll
+++ b/llvm/test/CodeGen/X86/frem.ll
@@ -6,24 +6,24 @@
 define void @frem_f16(half %a0, half %a1, half *%p3) nounwind {
 ; CHECK-LABEL: frem_f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $16, %rsp
-; CHECK-NEXT:    movq %rdi, %rbx
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movq %rdx, %rbx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movzwl %si, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movzwl %bp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
 ; CHECK-NEXT:    movw %ax, (%rbx)
-; CHECK-NEXT:    addq $16, %rsp
+; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %frem = frem half %a0, %a1
   store half %frem, half *%p3
@@ -501,564 +501,503 @@ define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> *%p3) nou
 ; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $1032, %rsp # imm = 0x408
-; CHECK-NEXT:    movq %rdi, %rbx
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $248, %rsp
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %esi, %r14d
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl %r13d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl %ebp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2 at PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee at PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2 at PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee at PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2 at PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee at PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2 at PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee at PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2 at PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee at PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2 at PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee at PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2 at PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee at PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2 at PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee at PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee at PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r12d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r13d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %ebx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %edi
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %ecx
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %edx
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r11d
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %ebp
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r14d
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r15d
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r12d
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r13d
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r8d
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r9d
-; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
-; CHECK-NEXT:    movw %r10w, 62(%rbx)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT:    movw %ax, 60(%rbx)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT:    movw %ax, 58(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r10d
-; CHECK-NEXT:    movw %si, 56(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-NEXT:    movw %di, 54(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %edi
-; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT:    movw %ax, 52(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %cx, 50(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %ecx
-; CHECK-NEXT:    movw %dx, 48(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %edx
-; CHECK-NEXT:    movw %r11w, 46(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r11d
-; CHECK-NEXT:    movw %bp, 44(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %ebp
-; CHECK-NEXT:    movw %r14w, 42(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r14d
-; CHECK-NEXT:    movw %r15w, 40(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r15d
-; CHECK-NEXT:    movw %r12w, 38(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r12d
-; CHECK-NEXT:    movw %r13w, 36(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r13d
-; CHECK-NEXT:    movw %r8w, 34(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r8d
-; CHECK-NEXT:    movw %r9w, 32(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r9d
-; CHECK-NEXT:    movw %r10w, 30(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %r10d
-; CHECK-NEXT:    movw %si, 28(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-NEXT:    movw %di, 26(%rbx)
-; CHECK-NEXT:    movw %ax, 24(%rbx)
-; CHECK-NEXT:    movw %cx, 22(%rbx)
-; CHECK-NEXT:    movw %dx, 20(%rbx)
-; CHECK-NEXT:    movw %r11w, 18(%rbx)
-; CHECK-NEXT:    movw %bp, 16(%rbx)
-; CHECK-NEXT:    movw %r14w, 14(%rbx)
-; CHECK-NEXT:    movw %r15w, 12(%rbx)
-; CHECK-NEXT:    movw %r12w, 10(%rbx)
-; CHECK-NEXT:    movw %r13w, 8(%rbx)
-; CHECK-NEXT:    movw %r8w, 6(%rbx)
-; CHECK-NEXT:    movw %r9w, 4(%rbx)
-; CHECK-NEXT:    movw %r10w, 2(%rbx)
-; CHECK-NEXT:    movw %si, (%rbx)
-; CHECK-NEXT:    addq $1032, %rsp # imm = 0x408
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, 62(%r15)
+; CHECK-NEXT:    movw %bx, 60(%r15)
+; CHECK-NEXT:    movw %r13w, 58(%r15)
+; CHECK-NEXT:    movw %r12w, 56(%r15)
+; CHECK-NEXT:    movw %r14w, 54(%r15)
+; CHECK-NEXT:    movw %bp, 52(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 50(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 48(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 46(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 44(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 42(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 40(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 38(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 36(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 34(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 32(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 30(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 28(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 26(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 24(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 22(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 20(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 18(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 16(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 14(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 12(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 10(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 8(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 6(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 4(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 2(%r15)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, (%r15)
+; CHECK-NEXT:    addq $248, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r13
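
The churn above is mechanical: with the reverted change, half values live in SSE registers and frem is lowered through __extendhfsf2/__truncsfhf2 around fmodf, while the restored baseline keeps halves as 16-bit integers and round-trips through __gnu_h2f_ieee/__gnu_f2h_ieee instead. A minimal scalar sketch (my own reduction, not part of the test file) that exercises the same lowering once per lane:

define half @frem_f16(half %a0, half %a1) nounwind {
  ; frem has no x86 instruction; it is promoted to float and lowered
  ; to a libcall, so this should reproduce the checked call sequence.
  %frem = frem half %a0, %a1
  ret half %frem
}

Fed to llc with a plain x86_64 triple and no +f16c, this should emit the __gnu_h2f_ieee / fmodf / __gnu_f2h_ieee calls the regenerated checks expect.
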
@@ -1074,286 +1013,259 @@ define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> *%p3) nou
 define void @frem_v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> *%p3) nounwind {
 ; CHECK-LABEL: frem_v16f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $512, %rsp # imm = 0x200
-; CHECK-NEXT:    movq %rdi, %rbx
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $120, %rsp
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl %r14w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl %ebx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl %r13d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r13d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r12d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %ebx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r15d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 30(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 28(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 26(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 24(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 22(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 20(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 18(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 16(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 14(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 12(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 10(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 8(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 6(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 4(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 2(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, (%rbx)
-; CHECK-NEXT:    addq $512, %rsp # imm = 0x200
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, 30(%rbp)
+; CHECK-NEXT:    movw %r15w, 28(%rbp)
+; CHECK-NEXT:    movw %r14w, 26(%rbp)
+; CHECK-NEXT:    movw %bx, 24(%rbp)
+; CHECK-NEXT:    movw %r12w, 22(%rbp)
+; CHECK-NEXT:    movw %r13w, 20(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 18(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 16(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 14(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 12(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 10(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 8(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 6(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 4(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, 2(%rbp)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, (%rbp)
+; CHECK-NEXT:    addq $120, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %frem = frem <16 x half> %a0, %a1
   store <16 x half> %frem, <16 x half> *%p3
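
The prologue diff here is the ABI half of the revert: the psABI lowering receives each incoming half element in its own SSE register (hence the movss %xmm2-%xmm7 spills and the pinsrw loads of the stack operands), while the restored baseline promotes every element to an integer register, so the checks now open with movl/movzwl of %edi, %esi, %edx, %ecx, %r8d, %r9d and the remaining stack words. A two-argument sketch (again hypothetical, same llc assumptions as above):

define half @half_args(half %a, half %b) nounwind {
  ; Only the argument passing matters here; the body just forces
  ; both arguments to be read.
  %r = fadd half %a, %b
  ret half %r
}

With the psABI change in tree, %a and %b arrive in %xmm0 and %xmm1; after this revert they arrive in the low 16 bits of %edi and %esi, which is the movl/movzwl shuffling visible above.
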
@@ -1363,150 +1275,130 @@ define void @frem_v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> *%p3) nou
 define void @frem_v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> *%p3) nounwind {
 ; CHECK-LABEL: frem_v8f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $240, %rsp
-; CHECK-NEXT:    movq %rdi, %rbx
-; CHECK-NEXT:    movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edx, %ebx
+; CHECK-NEXT:    movl %esi, %r13d
+; CHECK-NEXT:    movl %edi, %r15d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl %r15w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl %r13w, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r12d
+; CHECK-NEXT:    movl %ebp, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r13d
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %r15d
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movl %eax, %ebx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    callq __extendhfsf2@PLT
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq fmodf@PLT
-; CHECK-NEXT:    callq __truncsfhf2@PLT
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 14(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 12(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 10(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 8(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 6(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 4(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, 2(%rbx)
-; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, (%rbx)
-; CHECK-NEXT:    addq $240, %rsp
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf@PLT
+; CHECK-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movw %ax, 14(%rcx)
+; CHECK-NEXT:    movw %bx, 12(%rcx)
+; CHECK-NEXT:    movw %bp, 10(%rcx)
+; CHECK-NEXT:    movw %r15w, 8(%rcx)
+; CHECK-NEXT:    movw %r14w, 6(%rcx)
+; CHECK-NEXT:    movw %r13w, 4(%rcx)
+; CHECK-NEXT:    movw %r12w, 2(%rcx)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, (%rcx)
+; CHECK-NEXT:    addq $56, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %frem = frem <8 x half> %a0, %a1
   store <8 x half> %frem, <8 x half> *%p3

diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index 167f327492fe8..600673a8ca1f8 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -36,7 +36,7 @@ define float @half_to_float() strictfp {
 ; X64-NOF16C:       ## %bb.0:
 ; X64-NOF16C-NEXT:    pushq %rax
 ; X64-NOF16C-NEXT:    .cfi_def_cfa_offset 16
-; X64-NOF16C-NEXT:    pinsrw $0, _a(%rip), %xmm0
+; X64-NOF16C-NEXT:    movzwl _a(%rip), %edi
 ; X64-NOF16C-NEXT:    callq ___extendhfsf2
 ; X64-NOF16C-NEXT:    popq %rax
 ; X64-NOF16C-NEXT:    retq
@@ -81,8 +81,9 @@ define double @half_to_double() strictfp {
 ; X64-NOF16C:       ## %bb.0:
 ; X64-NOF16C-NEXT:    pushq %rax
 ; X64-NOF16C-NEXT:    .cfi_def_cfa_offset 16
-; X64-NOF16C-NEXT:    pinsrw $0, _a(%rip), %xmm0
-; X64-NOF16C-NEXT:    callq ___extendhfdf2
+; X64-NOF16C-NEXT:    movzwl _a(%rip), %edi
+; X64-NOF16C-NEXT:    callq ___extendhfsf2
+; X64-NOF16C-NEXT:    cvtss2sd %xmm0, %xmm0
 ; X64-NOF16C-NEXT:    popq %rax
 ; X64-NOF16C-NEXT:    retq
 ;
@@ -111,30 +112,37 @@ define x86_fp80 @half_to_fp80() strictfp {
 ;
 ; X32-F16C-LABEL: half_to_fp80:
 ; X32-F16C:       ## %bb.0:
-; X32-F16C-NEXT:    subl $12, %esp
-; X32-F16C-NEXT:    .cfi_def_cfa_offset 16
-; X32-F16C-NEXT:    vpinsrw $0, _a, %xmm0, %xmm0
-; X32-F16C-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X32-F16C-NEXT:    calll ___extendhfxf2
-; X32-F16C-NEXT:    addl $12, %esp
+; X32-F16C-NEXT:    pushl %eax
+; X32-F16C-NEXT:    .cfi_def_cfa_offset 8
+; X32-F16C-NEXT:    movzwl _a, %eax
+; X32-F16C-NEXT:    vmovd %eax, %xmm0
+; X32-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; X32-F16C-NEXT:    vmovss %xmm0, (%esp)
+; X32-F16C-NEXT:    flds (%esp)
+; X32-F16C-NEXT:    wait
+; X32-F16C-NEXT:    popl %eax
 ; X32-F16C-NEXT:    retl
 ;
 ; X64-NOF16C-LABEL: half_to_fp80:
 ; X64-NOF16C:       ## %bb.0:
 ; X64-NOF16C-NEXT:    pushq %rax
 ; X64-NOF16C-NEXT:    .cfi_def_cfa_offset 16
-; X64-NOF16C-NEXT:    pinsrw $0, _a(%rip), %xmm0
-; X64-NOF16C-NEXT:    callq ___extendhfxf2
+; X64-NOF16C-NEXT:    movzwl _a(%rip), %edi
+; X64-NOF16C-NEXT:    callq ___extendhfsf2
+; X64-NOF16C-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp)
+; X64-NOF16C-NEXT:    flds {{[0-9]+}}(%rsp)
+; X64-NOF16C-NEXT:    wait
 ; X64-NOF16C-NEXT:    popq %rax
 ; X64-NOF16C-NEXT:    retq
 ;
 ; X64-F16C-LABEL: half_to_fp80:
 ; X64-F16C:       ## %bb.0:
-; X64-F16C-NEXT:    pushq %rax
-; X64-F16C-NEXT:    .cfi_def_cfa_offset 16
-; X64-F16C-NEXT:    vpinsrw $0, _a(%rip), %xmm0, %xmm0
-; X64-F16C-NEXT:    callq ___extendhfxf2
-; X64-F16C-NEXT:    popq %rax
+; X64-F16C-NEXT:    movzwl _a(%rip), %eax
+; X64-F16C-NEXT:    vmovd %eax, %xmm0
+; X64-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; X64-F16C-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; X64-F16C-NEXT:    flds -{{[0-9]+}}(%rsp)
+; X64-F16C-NEXT:    wait
 ; X64-F16C-NEXT:    retq
   %1 = load half, half* @a, align 2
   %2 = tail call x86_fp80 @llvm.experimental.constrained.fpext.f80.f16(half %1, metadata !"fpexcept.strict") #0
@@ -158,8 +166,7 @@ define void @float_to_half(float %0) strictfp {
 ; X32-F16C:       ## %bb.0:
 ; X32-F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X32-F16C-NEXT:    vmovd %xmm0, %eax
-; X32-F16C-NEXT:    movw %ax, _a
+; X32-F16C-NEXT:    vpextrw $0, %xmm0, _a
 ; X32-F16C-NEXT:    retl
 ;
 ; X64-NOF16C-LABEL: float_to_half:
@@ -167,7 +174,6 @@ define void @float_to_half(float %0) strictfp {
 ; X64-NOF16C-NEXT:    pushq %rax
 ; X64-NOF16C-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NOF16C-NEXT:    callq ___truncsfhf2
-; X64-NOF16C-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NOF16C-NEXT:    movw %ax, _a(%rip)
 ; X64-NOF16C-NEXT:    popq %rax
 ; X64-NOF16C-NEXT:    retq
@@ -177,8 +183,7 @@ define void @float_to_half(float %0) strictfp {
 ; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-F16C-NEXT:    vmovd %xmm0, %eax
-; X64-F16C-NEXT:    movw %ax, _a(%rip)
+; X64-F16C-NEXT:    vpextrw $0, %xmm0, _a(%rip)
 ; X64-F16C-NEXT:    retq
   %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f32(float %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
   store half %2, half* @a, align 2
@@ -200,13 +205,13 @@ define void @double_to_half(double %0) strictfp {
 ;
 ; X32-F16C-LABEL: double_to_half:
 ; X32-F16C:       ## %bb.0:
+; X32-F16C-NEXT:    subl $12, %esp
+; X32-F16C-NEXT:    .cfi_def_cfa_offset 16
 ; X32-F16C-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; X32-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X32-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X32-F16C-NEXT:    vmovd %xmm0, %eax
+; X32-F16C-NEXT:    vmovsd %xmm0, (%esp)
+; X32-F16C-NEXT:    calll ___truncdfhf2
 ; X32-F16C-NEXT:    movw %ax, _a
+; X32-F16C-NEXT:    addl $12, %esp
 ; X32-F16C-NEXT:    retl
 ;
 ; X64-NOF16C-LABEL: double_to_half:
@@ -214,19 +219,17 @@ define void @double_to_half(double %0) strictfp {
 ; X64-NOF16C-NEXT:    pushq %rax
 ; X64-NOF16C-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NOF16C-NEXT:    callq ___truncdfhf2
-; X64-NOF16C-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NOF16C-NEXT:    movw %ax, _a(%rip)
 ; X64-NOF16C-NEXT:    popq %rax
 ; X64-NOF16C-NEXT:    retq
 ;
 ; X64-F16C-LABEL: double_to_half:
 ; X64-F16C:       ## %bb.0:
-; X64-F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-F16C-NEXT:    vmovd %xmm0, %eax
+; X64-F16C-NEXT:    pushq %rax
+; X64-F16C-NEXT:    .cfi_def_cfa_offset 16
+; X64-F16C-NEXT:    callq ___truncdfhf2
 ; X64-F16C-NEXT:    movw %ax, _a(%rip)
+; X64-F16C-NEXT:    popq %rax
 ; X64-F16C-NEXT:    retq
   %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f64(double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
   store half %2, half* @a, align 2
@@ -254,7 +257,7 @@ define void @fp80_to_half(x86_fp80 %0) strictfp {
 ; X32-F16C-NEXT:    fstpt (%esp)
 ; X32-F16C-NEXT:    wait
 ; X32-F16C-NEXT:    calll ___truncxfhf2
-; X32-F16C-NEXT:    vpextrw $0, %xmm0, _a
+; X32-F16C-NEXT:    movw %ax, _a
 ; X32-F16C-NEXT:    addl $28, %esp
 ; X32-F16C-NEXT:    retl
 ;
@@ -266,7 +269,6 @@ define void @fp80_to_half(x86_fp80 %0) strictfp {
 ; X64-NOF16C-NEXT:    fstpt (%rsp)
 ; X64-NOF16C-NEXT:    wait
 ; X64-NOF16C-NEXT:    callq ___truncxfhf2
-; X64-NOF16C-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NOF16C-NEXT:    movw %ax, _a(%rip)
 ; X64-NOF16C-NEXT:    addq $24, %rsp
 ; X64-NOF16C-NEXT:    retq
@@ -279,7 +281,7 @@ define void @fp80_to_half(x86_fp80 %0) strictfp {
 ; X64-F16C-NEXT:    fstpt (%rsp)
 ; X64-F16C-NEXT:    wait
 ; X64-F16C-NEXT:    callq ___truncxfhf2
-; X64-F16C-NEXT:    vpextrw $0, %xmm0, _a(%rip)
+; X64-F16C-NEXT:    movw %ax, _a(%rip)
 ; X64-F16C-NEXT:    addq $24, %rsp
 ; X64-F16C-NEXT:    retq
   %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f80(x86_fp80 %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
@@ -321,22 +323,20 @@ define void @add() strictfp {
 ; X32-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X32-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X32-F16C-NEXT:    vmovd %xmm0, %eax
-; X32-F16C-NEXT:    movw %ax, _c
+; X32-F16C-NEXT:    vpextrw $0, %xmm0, _c
 ; X32-F16C-NEXT:    retl
 ;
 ; X64-NOF16C-LABEL: add:
 ; X64-NOF16C:       ## %bb.0:
 ; X64-NOF16C-NEXT:    pushq %rax
 ; X64-NOF16C-NEXT:    .cfi_def_cfa_offset 16
-; X64-NOF16C-NEXT:    pinsrw $0, _a(%rip), %xmm0
+; X64-NOF16C-NEXT:    movzwl _a(%rip), %edi
 ; X64-NOF16C-NEXT:    callq ___extendhfsf2
-; X64-NOF16C-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
-; X64-NOF16C-NEXT:    pinsrw $0, _b(%rip), %xmm0
+; X64-NOF16C-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; X64-NOF16C-NEXT:    movzwl _b(%rip), %edi
 ; X64-NOF16C-NEXT:    callq ___extendhfsf2
 ; X64-NOF16C-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Folded Reload
 ; X64-NOF16C-NEXT:    callq ___truncsfhf2
-; X64-NOF16C-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NOF16C-NEXT:    movw %ax, _c(%rip)
 ; X64-NOF16C-NEXT:    popq %rax
 ; X64-NOF16C-NEXT:    retq
@@ -353,8 +353,7 @@ define void @add() strictfp {
 ; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-F16C-NEXT:    vmovd %xmm0, %eax
-; X64-F16C-NEXT:    movw %ax, _c(%rip)
+; X64-F16C-NEXT:    vpextrw $0, %xmm0, _c(%rip)
 ; X64-F16C-NEXT:    retq
   %1 = load half, half* @a, align 2
   %2 = tail call float @llvm.experimental.constrained.fpext.f32.f16(half %1, metadata !"fpexcept.strict") #0

diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 2b95e8beeca8b..46179e7d9113b 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=1 \
-; RUN:   | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON
+; RUN:   | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON,BWON-NOF16C
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c  -fixup-byte-word-insts=0 \
 ; RUN:   | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWOFF
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+f16c -fixup-byte-word-insts=1 \
@@ -9,25 +9,23 @@
 ; RUN:    | FileCheck %s -check-prefixes=CHECK-I686
 
 define void @test_load_store(half* %in, half* %out) #0 {
-; CHECK-LIBCALL-LABEL: test_load_store:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, (%rsi)
-; CHECK-LIBCALL-NEXT:    retq
+; BWON-LABEL: test_load_store:
+; BWON:       # %bb.0:
+; BWON-NEXT:    movzwl (%rdi), %eax
+; BWON-NEXT:    movw %ax, (%rsi)
+; BWON-NEXT:    retq
 ;
-; BWON-F16C-LABEL: test_load_store:
-; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
-; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rsi)
-; BWON-F16C-NEXT:    retq
+; BWOFF-LABEL: test_load_store:
+; BWOFF:       # %bb.0:
+; BWOFF-NEXT:    movw (%rdi), %ax
+; BWOFF-NEXT:    movw %ax, (%rsi)
+; BWOFF-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_load_store:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-I686-NEXT:    pinsrw $0, (%ecx), %xmm0
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %ecx
+; CHECK-I686-NEXT:    movw (%ecx), %cx
 ; CHECK-I686-NEXT:    movw %cx, (%eax)
 ; CHECK-I686-NEXT:    retl
   %val = load half, half* %in
@@ -76,8 +74,8 @@ define void @test_bitcast_to_half(half* %addr, i16 %in) #0 {
 define float @test_extend32(half* %addr) #0 {
 ; CHECK-LIBCALL-LABEL: test_extend32:
 ; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT:    jmp __gnu_h2f_ieee@PLT # TAILCALL
 ;
 ; BWON-F16C-LABEL: test_extend32:
 ; BWON-F16C:       # %bb.0:
@@ -90,10 +88,9 @@ define float @test_extend32(half* %addr) #0 {
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    subl $12, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    calll __extendhfsf2
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    addl $12, %esp
 ; CHECK-I686-NEXT:    retl
   %val16 = load half, half* %addr
@@ -104,8 +101,12 @@ define float @test_extend32(half* %addr) #0 {
 define double @test_extend64(half* %addr) #0 {
 ; CHECK-LIBCALL-LABEL: test_extend64:
 ; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    jmp __extendhfdf2@PLT # TAILCALL
+; CHECK-LIBCALL-NEXT:    pushq %rax
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT:    popq %rax
+; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: test_extend64:
 ; BWON-F16C:       # %bb.0:
@@ -119,10 +120,9 @@ define double @test_extend64(half* %addr) #0 {
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    subl $12, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    calll __extendhfdf2
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    addl $12, %esp
 ; CHECK-I686-NEXT:    retl
   %val16 = load half, half* %addr
@@ -135,8 +135,7 @@ define void @test_trunc32(float %in, half* %addr) #0 {
 ; CHECK-LIBCALL:       # %bb.0:
 ; CHECK-LIBCALL-NEXT:    pushq %rbx
 ; CHECK-LIBCALL-NEXT:    movq %rdi, %rbx
-; CHECK-LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
 ; CHECK-LIBCALL-NEXT:    popq %rbx
 ; CHECK-LIBCALL-NEXT:    retq
@@ -144,8 +143,7 @@ define void @test_trunc32(float %in, half* %addr) #0 {
 ; BWON-F16C-LABEL: test_trunc32:
 ; BWON-F16C:       # %bb.0:
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    movw %ax, (%rdi)
+; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_trunc32:
@@ -153,10 +151,9 @@ define void @test_trunc32(float %in, half* %addr) #0 {
 ; CHECK-I686-NEXT:    pushl %esi
 ; CHECK-I686-NEXT:    subl $8, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT:    movd %xmm0, (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
 ; CHECK-I686-NEXT:    movw %ax, (%esi)
 ; CHECK-I686-NEXT:    addl $8, %esp
 ; CHECK-I686-NEXT:    popl %esi
@@ -167,33 +164,23 @@ define void @test_trunc32(float %in, half* %addr) #0 {
 }
 
 define void @test_trunc64(double %in, half* %addr) #0 {
-; CHECK-LIBCALL-LABEL: test_trunc64:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    pushq %rbx
-; CHECK-LIBCALL-NEXT:    movq %rdi, %rbx
-; CHECK-LIBCALL-NEXT:    callq __truncdfhf2@PLT
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
-; CHECK-LIBCALL-NEXT:    popq %rbx
-; CHECK-LIBCALL-NEXT:    retq
-;
-; BWON-F16C-LABEL: test_trunc64:
-; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    movw %ax, (%rdi)
-; BWON-F16C-NEXT:    retq
+; CHECK-LABEL: test_trunc64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    callq __truncdfhf2@PLT
+; CHECK-NEXT:    movw %ax, (%rbx)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_trunc64:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    pushl %esi
 ; CHECK-I686-NEXT:    subl $8, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-I686-NEXT:    movq %xmm0, (%esp)
+; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT:    movsd %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esi)
 ; CHECK-I686-NEXT:    addl $8, %esp
 ; CHECK-I686-NEXT:    popl %esi
@@ -207,8 +194,8 @@ define i64 @test_fptosi_i64(half* %p) #0 {
 ; CHECK-LIBCALL-LABEL: test_fptosi_i64:
 ; CHECK-LIBCALL:       # %bb.0:
 ; CHECK-LIBCALL-NEXT:    pushq %rax
-; CHECK-LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    cvttss2si %xmm0, %rax
 ; CHECK-LIBCALL-NEXT:    popq %rcx
 ; CHECK-LIBCALL-NEXT:    retq
@@ -223,13 +210,23 @@ define i64 @test_fptosi_i64(half* %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_fptosi_i64:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    subl $12, %esp
+; CHECK-I686-NEXT:    subl $28, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    calll __fixhfdi
-; CHECK-I686-NEXT:    addl $12, %esp
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fnstcw {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    orl $3072, %eax # imm = 0xC00
+; CHECK-I686-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldcw {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fistpll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldcw {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-I686-NEXT:    addl $28, %esp
 ; CHECK-I686-NEXT:    retl
   %a = load half, half* %p, align 2
   %r = fptosi half %a to i64
@@ -241,34 +238,33 @@ define void @test_sitofp_i64(i64 %a, half* %p) #0 {
 ; CHECK-LIBCALL:       # %bb.0:
 ; CHECK-LIBCALL-NEXT:    pushq %rbx
 ; CHECK-LIBCALL-NEXT:    movq %rsi, %rbx
-; CHECK-LIBCALL-NEXT:    callq __floatdihf@PLT
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-LIBCALL-NEXT:    cvtsi2ss %rdi, %xmm0
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
 ; CHECK-LIBCALL-NEXT:    popq %rbx
 ; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: test_sitofp_i64:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    pushq %rbx
-; BWON-F16C-NEXT:    movq %rsi, %rbx
-; BWON-F16C-NEXT:    callq __floatdihf@PLT
-; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
-; BWON-F16C-NEXT:    popq %rbx
+; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_sitofp_i64:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $8, %esp
+; CHECK-I686-NEXT:    subl $24, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT:    subl $8, %esp
-; CHECK-I686-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    calll __floatdihf
-; CHECK-I686-NEXT:    addl $16, %esp
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
 ; CHECK-I686-NEXT:    movw %ax, (%esi)
-; CHECK-I686-NEXT:    addl $8, %esp
+; CHECK-I686-NEXT:    addl $24, %esp
 ; CHECK-I686-NEXT:    popl %esi
 ; CHECK-I686-NEXT:    retl
   %r = sitofp i64 %a to half
@@ -280,8 +276,8 @@ define i64 @test_fptoui_i64(half* %p) #0 {
 ; CHECK-LIBCALL-LABEL: test_fptoui_i64:
 ; CHECK-LIBCALL:       # %bb.0:
 ; CHECK-LIBCALL-NEXT:    pushq %rax
-; CHECK-LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    cvttss2si %xmm0, %rcx
 ; CHECK-LIBCALL-NEXT:    movq %rcx, %rdx
 ; CHECK-LIBCALL-NEXT:    sarq $63, %rdx
@@ -308,13 +304,35 @@ define i64 @test_fptoui_i64(half* %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_fptoui_i64:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    subl $12, %esp
+; CHECK-I686-NEXT:    subl $28, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    calll __fixunshfdi
-; CHECK-I686-NEXT:    addl $12, %esp
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    ucomiss %xmm1, %xmm0
+; CHECK-I686-NEXT:    jae .LBB9_2
+; CHECK-I686-NEXT:  # %bb.1:
+; CHECK-I686-NEXT:    xorps %xmm1, %xmm1
+; CHECK-I686-NEXT:  .LBB9_2:
+; CHECK-I686-NEXT:    subss %xmm1, %xmm0
+; CHECK-I686-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    setae %al
+; CHECK-I686-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fnstcw {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT:    orl $3072, %ecx # imm = 0xC00
+; CHECK-I686-NEXT:    movw %cx, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldcw {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fistpll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldcw {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movzbl %al, %edx
+; CHECK-I686-NEXT:    shll $31, %edx
+; CHECK-I686-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    addl $28, %esp
 ; CHECK-I686-NEXT:    retl
   %a = load half, half* %p, align 2
   %r = fptoui half %a to i64
@@ -326,34 +344,58 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 {
 ; CHECK-LIBCALL:       # %bb.0:
 ; CHECK-LIBCALL-NEXT:    pushq %rbx
 ; CHECK-LIBCALL-NEXT:    movq %rsi, %rbx
-; CHECK-LIBCALL-NEXT:    callq __floatundihf@PLT
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-LIBCALL-NEXT:    testq %rdi, %rdi
+; CHECK-LIBCALL-NEXT:    js .LBB10_1
+; CHECK-LIBCALL-NEXT:  # %bb.2:
+; CHECK-LIBCALL-NEXT:    cvtsi2ss %rdi, %xmm0
+; CHECK-LIBCALL-NEXT:    jmp .LBB10_3
+; CHECK-LIBCALL-NEXT:  .LBB10_1:
+; CHECK-LIBCALL-NEXT:    movq %rdi, %rax
+; CHECK-LIBCALL-NEXT:    shrq %rax
+; CHECK-LIBCALL-NEXT:    andl $1, %edi
+; CHECK-LIBCALL-NEXT:    orq %rax, %rdi
+; CHECK-LIBCALL-NEXT:    cvtsi2ss %rdi, %xmm0
+; CHECK-LIBCALL-NEXT:    addss %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT:  .LBB10_3:
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
 ; CHECK-LIBCALL-NEXT:    popq %rbx
 ; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: test_uitofp_i64:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    pushq %rbx
-; BWON-F16C-NEXT:    movq %rsi, %rbx
-; BWON-F16C-NEXT:    callq __floatundihf@PLT
-; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
-; BWON-F16C-NEXT:    popq %rbx
+; BWON-F16C-NEXT:    testq %rdi, %rdi
+; BWON-F16C-NEXT:    js .LBB10_1
+; BWON-F16C-NEXT:  # %bb.2:
+; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT:    jmp .LBB10_3
+; BWON-F16C-NEXT:  .LBB10_1:
+; BWON-F16C-NEXT:    movq %rdi, %rax
+; BWON-F16C-NEXT:    shrq %rax
+; BWON-F16C-NEXT:    andl $1, %edi
+; BWON-F16C-NEXT:    orq %rax, %rdi
+; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT:  .LBB10_3:
+; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_uitofp_i64:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $8, %esp
+; CHECK-I686-NEXT:    subl $24, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT:    subl $8, %esp
-; CHECK-I686-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    calll __floatundihf
-; CHECK-I686-NEXT:    addl $16, %esp
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    shrl $31, %eax
+; CHECK-I686-NEXT:    fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; CHECK-I686-NEXT:    fstps (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
 ; CHECK-I686-NEXT:    movw %ax, (%esi)
-; CHECK-I686-NEXT:    addl $8, %esp
+; CHECK-I686-NEXT:    addl $24, %esp
 ; CHECK-I686-NEXT:    popl %esi
 ; CHECK-I686-NEXT:    retl
   %r = uitofp i64 %a to half
@@ -364,31 +406,36 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 {
 define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
 ; CHECK-LIBCALL-LABEL: test_extend32_vec4:
 ; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    subq $72, %rsp
-; CHECK-LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    pinsrw $0, 2(%rdi), %xmm0
+; CHECK-LIBCALL-NEXT:    subq $88, %rsp
+; CHECK-LIBCALL-NEXT:    movl (%rdi), %eax
+; CHECK-LIBCALL-NEXT:    movl 4(%rdi), %ecx
+; CHECK-LIBCALL-NEXT:    movl %eax, (%rsp)
+; CHECK-LIBCALL-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
+; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm0
+; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    pinsrw $0, 4(%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    pinsrw $0, 6(%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
+; CHECK-LIBCALL-NEXT:    pextrw $1, %xmm0, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
-; CHECK-LIBCALL-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-LIBCALL-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-LIBCALL-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
-; CHECK-LIBCALL-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT:    pextrw $1, %xmm0, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-LIBCALL-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-LIBCALL-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-LIBCALL-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-LIBCALL-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-LIBCALL-NEXT:    addq $72, %rsp
+; CHECK-LIBCALL-NEXT:    addq $88, %rsp
 ; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: test_extend32_vec4:
@@ -398,36 +445,38 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_extend32_vec4:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $88, %esp
+; CHECK-I686-NEXT:    subl $124, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    pinsrw $0, 6(%eax), %xmm0
+; CHECK-I686-NEXT:    movl (%eax), %ecx
+; CHECK-I686-NEXT:    movl 4(%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    movdqa {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    pinsrw $0, 4(%eax), %xmm0
-; CHECK-I686-NEXT:    pinsrw $0, 2(%eax), %xmm1
-; CHECK-I686-NEXT:    pextrw $0, %xmm1, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-I686-NEXT:    calll __extendhfsf2
+; CHECK-I686-NEXT:    pextrw $1, %xmm0, %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movw %si, (%esp)
 ; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-I686-NEXT:    calll __extendhfsf2
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movw %si, (%esp)
 ; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-I686-NEXT:    calll __extendhfsf2
-; CHECK-I686-NEXT:    movw %si, (%esp)
+; CHECK-I686-NEXT:    pextrw $1, %xmm0, %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    calll __extendhfsf2
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -436,8 +485,7 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-I686-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-I686-NEXT:    addl $88, %esp
-; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    addl $124, %esp
 ; CHECK-I686-NEXT:    retl
   %a = load <4 x half>, <4 x half>* %p, align 8
   %b = fpext <4 x half> %a to <4 x float>
@@ -447,31 +495,37 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
 define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
 ; CHECK-LIBCALL-LABEL: test_extend64_vec4:
 ; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    subq $72, %rsp
-; CHECK-LIBCALL-NEXT:    pinsrw $0, 4(%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    pinsrw $0, 6(%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    pinsrw $0, (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    pinsrw $0, 2(%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    callq __extendhfdf2@PLT
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __extendhfdf2@PLT
-; CHECK-LIBCALL-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    pushq %rbp
+; CHECK-LIBCALL-NEXT:    pushq %r14
+; CHECK-LIBCALL-NEXT:    pushq %rbx
+; CHECK-LIBCALL-NEXT:    subq $32, %rsp
+; CHECK-LIBCALL-NEXT:    movzwl 4(%rdi), %r14d
+; CHECK-LIBCALL-NEXT:    movzwl 6(%rdi), %ebp
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %ebx
+; CHECK-LIBCALL-NEXT:    movzwl 2(%rdi), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movl %ebx, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-LIBCALL-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-LIBCALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __extendhfdf2@PLT
+; CHECK-LIBCALL-NEXT:    movl %ebp, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
 ; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __extendhfdf2@PLT
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, %xmm1
+; CHECK-LIBCALL-NEXT:    movl %r14d, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm1
 ; CHECK-LIBCALL-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-LIBCALL-NEXT:    # xmm1 = xmm1[0],mem[0]
 ; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    addq $72, %rsp
+; CHECK-LIBCALL-NEXT:    addq $32, %rsp
+; CHECK-LIBCALL-NEXT:    popq %rbx
+; CHECK-LIBCALL-NEXT:    popq %r14
+; CHECK-LIBCALL-NEXT:    popq %rbp
 ; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: test_extend64_vec4:
@@ -482,43 +536,39 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_extend64_vec4:
 ; CHECK-I686:       # %bb.0:
+; CHECK-I686-NEXT:    pushl %ebx
+; CHECK-I686-NEXT:    pushl %edi
 ; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $104, %esp
+; CHECK-I686-NEXT:    subl $64, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, 6(%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    pinsrw $0, 2(%eax), %xmm0
-; CHECK-I686-NEXT:    pinsrw $0, 4(%eax), %xmm1
-; CHECK-I686-NEXT:    pextrw $0, %xmm1, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-I686-NEXT:    calll __extendhfdf2
+; CHECK-I686-NEXT:    movzwl 6(%eax), %esi
+; CHECK-I686-NEXT:    movzwl (%eax), %edi
+; CHECK-I686-NEXT:    movzwl 2(%eax), %ebx
+; CHECK-I686-NEXT:    movzwl 4(%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movw %si, (%esp)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-I686-NEXT:    calll __extendhfdf2
+; CHECK-I686-NEXT:    movl %ebx, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movw %si, (%esp)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
-; CHECK-I686-NEXT:    calll __extendhfdf2
-; CHECK-I686-NEXT:    movw %si, (%esp)
+; CHECK-I686-NEXT:    movl %edi, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    movl %esi, (%esp)
 ; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    calll __extendhfdf2
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-I686-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-I686-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-I686-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; CHECK-I686-NEXT:    addl $104, %esp
+; CHECK-I686-NEXT:    addl $64, %esp
 ; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    popl %edi
+; CHECK-I686-NEXT:    popl %ebx
 ; CHECK-I686-NEXT:    retl
   %a = load <4 x half>, <4 x half>* %p, align 8
   %b = fpext <4 x half> %a to <4 x double>
@@ -526,39 +576,71 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
 }
 
 define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
-; CHECK-LIBCALL-LABEL: test_trunc32_vec4:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    pushq %rbx
-; CHECK-LIBCALL-NEXT:    subq $64, %rsp
-; CHECK-LIBCALL-NEXT:    movq %rdi, %rbx
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
-; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, 6(%rbx)
-; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, 4(%rbx)
-; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, 2(%rbx)
-; CHECK-LIBCALL-NEXT:    addq $64, %rsp
-; CHECK-LIBCALL-NEXT:    popq %rbx
-; CHECK-LIBCALL-NEXT:    retq
+; BWON-NOF16C-LABEL: test_trunc32_vec4:
+; BWON-NOF16C:       # %bb.0:
+; BWON-NOF16C-NEXT:    pushq %rbp
+; BWON-NOF16C-NEXT:    pushq %r15
+; BWON-NOF16C-NEXT:    pushq %r14
+; BWON-NOF16C-NEXT:    pushq %rbx
+; BWON-NOF16C-NEXT:    subq $24, %rsp
+; BWON-NOF16C-NEXT:    movq %rdi, %rbx
+; BWON-NOF16C-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; BWON-NOF16C-NEXT:    callq __gnu_f2h_ieee@PLT
+; BWON-NOF16C-NEXT:    movl %eax, %r14d
+; BWON-NOF16C-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT:    callq __gnu_f2h_ieee@PLT
+; BWON-NOF16C-NEXT:    movl %eax, %r15d
+; BWON-NOF16C-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; BWON-NOF16C-NEXT:    callq __gnu_f2h_ieee@PLT
+; BWON-NOF16C-NEXT:    movl %eax, %ebp
+; BWON-NOF16C-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT:    callq __gnu_f2h_ieee@PLT
+; BWON-NOF16C-NEXT:    movw %ax, (%rbx)
+; BWON-NOF16C-NEXT:    movw %bp, 6(%rbx)
+; BWON-NOF16C-NEXT:    movw %r15w, 4(%rbx)
+; BWON-NOF16C-NEXT:    movw %r14w, 2(%rbx)
+; BWON-NOF16C-NEXT:    addq $24, %rsp
+; BWON-NOF16C-NEXT:    popq %rbx
+; BWON-NOF16C-NEXT:    popq %r14
+; BWON-NOF16C-NEXT:    popq %r15
+; BWON-NOF16C-NEXT:    popq %rbp
+; BWON-NOF16C-NEXT:    retq
+;
+; BWOFF-LABEL: test_trunc32_vec4:
+; BWOFF:       # %bb.0:
+; BWOFF-NEXT:    pushq %rbp
+; BWOFF-NEXT:    pushq %r15
+; BWOFF-NEXT:    pushq %r14
+; BWOFF-NEXT:    pushq %rbx
+; BWOFF-NEXT:    subq $24, %rsp
+; BWOFF-NEXT:    movq %rdi, %rbx
+; BWOFF-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; BWOFF-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; BWOFF-NEXT:    callq __gnu_f2h_ieee@PLT
+; BWOFF-NEXT:    movw %ax, %r14w
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT:    callq __gnu_f2h_ieee@PLT
+; BWOFF-NEXT:    movw %ax, %r15w
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; BWOFF-NEXT:    callq __gnu_f2h_ieee@PLT
+; BWOFF-NEXT:    movw %ax, %bp
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    callq __gnu_f2h_ieee@PLT
+; BWOFF-NEXT:    movw %ax, (%rbx)
+; BWOFF-NEXT:    movw %bp, 6(%rbx)
+; BWOFF-NEXT:    movw %r15w, 4(%rbx)
+; BWOFF-NEXT:    movw %r14w, 2(%rbx)
+; BWOFF-NEXT:    addq $24, %rsp
+; BWOFF-NEXT:    popq %rbx
+; BWOFF-NEXT:    popq %r14
+; BWOFF-NEXT:    popq %r15
+; BWOFF-NEXT:    popq %rbp
+; BWOFF-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: test_trunc32_vec4:
 ; BWON-F16C:       # %bb.0:
@@ -567,41 +649,40 @@ define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_trunc32_vec4:
 ; CHECK-I686:       # %bb.0:
+; CHECK-I686-NEXT:    pushl %ebp
+; CHECK-I686-NEXT:    pushl %ebx
+; CHECK-I686-NEXT:    pushl %edi
 ; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $88, %esp
+; CHECK-I686-NEXT:    subl $44, %esp
 ; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-I686-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-I686-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; CHECK-I686-NEXT:    movss %xmm1, (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, %si
 ; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, %di
 ; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    movd %xmm0, (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esi)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, 6(%esi)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, 4(%esi)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, 2(%esi)
-; CHECK-I686-NEXT:    addl $88, %esp
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, %bx
+; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, (%ebp)
+; CHECK-I686-NEXT:    movw %bx, 6(%ebp)
+; CHECK-I686-NEXT:    movw %di, 4(%ebp)
+; CHECK-I686-NEXT:    movw %si, 2(%ebp)
+; CHECK-I686-NEXT:    addl $44, %esp
 ; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    popl %edi
+; CHECK-I686-NEXT:    popl %ebx
+; CHECK-I686-NEXT:    popl %ebp
 ; CHECK-I686-NEXT:    retl
   %v = fptrunc <4 x float> %a to <4 x half>
   store <4 x half> %v, <4 x half>* %p
@@ -609,98 +690,143 @@ define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
 }
 
 define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
-; CHECK-LIBCALL-LABEL: test_trunc64_vec4:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    pushq %rbx
-; CHECK-LIBCALL-NEXT:    subq $64, %rsp
-; CHECK-LIBCALL-NEXT:    movq %rdi, %rbx
-; CHECK-LIBCALL-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-LIBCALL-NEXT:    callq __truncdfhf2@PLT
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-LIBCALL-NEXT:    callq __truncdfhf2@PLT
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __truncdfhf2@PLT
-; CHECK-LIBCALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __truncdfhf2@PLT
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, 4(%rbx)
-; CHECK-LIBCALL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
-; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, 6(%rbx)
-; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-LIBCALL-NEXT:    movw %ax, 2(%rbx)
-; CHECK-LIBCALL-NEXT:    addq $64, %rsp
-; CHECK-LIBCALL-NEXT:    popq %rbx
-; CHECK-LIBCALL-NEXT:    retq
+; BWON-NOF16C-LABEL: test_trunc64_vec4:
+; BWON-NOF16C:       # %bb.0:
+; BWON-NOF16C-NEXT:    pushq %rbp
+; BWON-NOF16C-NEXT:    pushq %r15
+; BWON-NOF16C-NEXT:    pushq %r14
+; BWON-NOF16C-NEXT:    pushq %rbx
+; BWON-NOF16C-NEXT:    subq $40, %rsp
+; BWON-NOF16C-NEXT:    movq %rdi, %rbx
+; BWON-NOF16C-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BWON-NOF16C-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT:    callq __truncdfhf2@PLT
+; BWON-NOF16C-NEXT:    movl %eax, %r14d
+; BWON-NOF16C-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT:    callq __truncdfhf2@PLT
+; BWON-NOF16C-NEXT:    movl %eax, %r15d
+; BWON-NOF16C-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT:    callq __truncdfhf2@PLT
+; BWON-NOF16C-NEXT:    movl %eax, %ebp
+; BWON-NOF16C-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT:    callq __truncdfhf2@PLT
+; BWON-NOF16C-NEXT:    movw %ax, 4(%rbx)
+; BWON-NOF16C-NEXT:    movw %bp, (%rbx)
+; BWON-NOF16C-NEXT:    movw %r15w, 6(%rbx)
+; BWON-NOF16C-NEXT:    movw %r14w, 2(%rbx)
+; BWON-NOF16C-NEXT:    addq $40, %rsp
+; BWON-NOF16C-NEXT:    popq %rbx
+; BWON-NOF16C-NEXT:    popq %r14
+; BWON-NOF16C-NEXT:    popq %r15
+; BWON-NOF16C-NEXT:    popq %rbp
+; BWON-NOF16C-NEXT:    retq
+;
+; BWOFF-LABEL: test_trunc64_vec4:
+; BWOFF:       # %bb.0:
+; BWOFF-NEXT:    pushq %rbp
+; BWOFF-NEXT:    pushq %r15
+; BWOFF-NEXT:    pushq %r14
+; BWOFF-NEXT:    pushq %rbx
+; BWOFF-NEXT:    subq $40, %rsp
+; BWOFF-NEXT:    movq %rdi, %rbx
+; BWOFF-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; BWOFF-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BWOFF-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT:    callq __truncdfhf2@PLT
+; BWOFF-NEXT:    movw %ax, %r14w
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT:    callq __truncdfhf2@PLT
+; BWOFF-NEXT:    movw %ax, %r15w
+; BWOFF-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    callq __truncdfhf2@PLT
+; BWOFF-NEXT:    movw %ax, %bp
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    callq __truncdfhf2@PLT
+; BWOFF-NEXT:    movw %ax, 4(%rbx)
+; BWOFF-NEXT:    movw %bp, (%rbx)
+; BWOFF-NEXT:    movw %r15w, 6(%rbx)
+; BWOFF-NEXT:    movw %r14w, 2(%rbx)
+; BWOFF-NEXT:    addq $40, %rsp
+; BWOFF-NEXT:    popq %rbx
+; BWOFF-NEXT:    popq %r14
+; BWOFF-NEXT:    popq %r15
+; BWOFF-NEXT:    popq %rbp
+; BWOFF-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: test_trunc64_vec4:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; BWON-F16C-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; BWON-F16C-NEXT:    vmovd %xmm1, %eax
-; BWON-F16C-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; BWON-F16C-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; BWON-F16C-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; BWON-F16C-NEXT:    vmovd %xmm2, %ecx
-; BWON-F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %edx
-; BWON-F16C-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm0
-; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %esi
-; BWON-F16C-NEXT:    movw %si, 4(%rdi)
-; BWON-F16C-NEXT:    movw %dx, (%rdi)
-; BWON-F16C-NEXT:    movw %cx, 6(%rdi)
-; BWON-F16C-NEXT:    movw %ax, 2(%rdi)
+; BWON-F16C-NEXT:    pushq %rbp
+; BWON-F16C-NEXT:    pushq %r15
+; BWON-F16C-NEXT:    pushq %r14
+; BWON-F16C-NEXT:    pushq %rbx
+; BWON-F16C-NEXT:    subq $56, %rsp
+; BWON-F16C-NEXT:    movq %rdi, %rbx
+; BWON-F16C-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; BWON-F16C-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; BWON-F16C-NEXT:    vzeroupper
+; BWON-F16C-NEXT:    callq __truncdfhf2@PLT
+; BWON-F16C-NEXT:    movl %eax, %r14d
+; BWON-F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; BWON-F16C-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; BWON-F16C-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; BWON-F16C-NEXT:    vzeroupper
+; BWON-F16C-NEXT:    callq __truncdfhf2@PLT
+; BWON-F16C-NEXT:    movl %eax, %r15d
+; BWON-F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BWON-F16C-NEXT:    vzeroupper
+; BWON-F16C-NEXT:    callq __truncdfhf2@PLT
+; BWON-F16C-NEXT:    movl %eax, %ebp
+; BWON-F16C-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-F16C-NEXT:    callq __truncdfhf2@PLT
+; BWON-F16C-NEXT:    movw %ax, 4(%rbx)
+; BWON-F16C-NEXT:    movw %bp, (%rbx)
+; BWON-F16C-NEXT:    movw %r15w, 6(%rbx)
+; BWON-F16C-NEXT:    movw %r14w, 2(%rbx)
+; BWON-F16C-NEXT:    addq $56, %rsp
+; BWON-F16C-NEXT:    popq %rbx
+; BWON-F16C-NEXT:    popq %r14
+; BWON-F16C-NEXT:    popq %r15
+; BWON-F16C-NEXT:    popq %rbp
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_trunc64_vec4:
 ; CHECK-I686:       # %bb.0:
+; CHECK-I686-NEXT:    pushl %ebp
+; CHECK-I686-NEXT:    pushl %ebx
+; CHECK-I686-NEXT:    pushl %edi
 ; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $88, %esp
+; CHECK-I686-NEXT:    subl $60, %esp
 ; CHECK-I686-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-I686-NEXT:    movlps %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    movw %ax, %si
 ; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movhps %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    movw %ax, %di
 ; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movlps %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    movw %ax, %bx
 ; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movhps %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, 6(%esi)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, 4(%esi)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, 2(%esi)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esi)
-; CHECK-I686-NEXT:    addl $88, %esp
+; CHECK-I686-NEXT:    movw %ax, 6(%ebp)
+; CHECK-I686-NEXT:    movw %bx, 4(%ebp)
+; CHECK-I686-NEXT:    movw %di, 2(%ebp)
+; CHECK-I686-NEXT:    movw %si, (%ebp)
+; CHECK-I686-NEXT:    addl $60, %esp
 ; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    popl %edi
+; CHECK-I686-NEXT:    popl %ebx
+; CHECK-I686-NEXT:    popl %ebp
 ; CHECK-I686-NEXT:    retl
   %v = fptrunc <4 x double> %a to <4 x half>
   store <4 x half> %v, <4 x half>* %p
@@ -717,8 +843,8 @@ define half @test_f80trunc_nodagcombine() #0 {
 ; CHECK-LIBCALL:       # %bb.0:
 ; CHECK-LIBCALL-NEXT:    pushq %rax
 ; CHECK-LIBCALL-NEXT:    callq test_floatret@PLT
-; CHECK-LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; CHECK-LIBCALL-NEXT:    popq %rax
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-LIBCALL-NEXT:    popq %rcx
 ; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: test_f80trunc_nodagcombine:
@@ -727,8 +853,8 @@ define half @test_f80trunc_nodagcombine() #0 {
 ; BWON-F16C-NEXT:    callq test_floatret@PLT
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; BWON-F16C-NEXT:    popq %rax
+; BWON-F16C-NEXT:    # kill: def $ax killed $ax killed $eax
+; BWON-F16C-NEXT:    popq %rcx
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_f80trunc_nodagcombine:
@@ -736,7 +862,7 @@ define half @test_f80trunc_nodagcombine() #0 {
 ; CHECK-I686-NEXT:    subl $12, %esp
 ; CHECK-I686-NEXT:    calll test_floatret@PLT
 ; CHECK-I686-NEXT:    fstps (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
 ; CHECK-I686-NEXT:    addl $12, %esp
 ; CHECK-I686-NEXT:    retl
   %1 = call float @test_floatret()
@@ -750,84 +876,64 @@ define half @test_f80trunc_nodagcombine() #0 {
 define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
 ; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32:
 ; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    subq $40, %rsp
-; CHECK-LIBCALL-NEXT:    pinsrw $0, (%rsi), %xmm0
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    addl $-2147483648, %edi # imm = 0x80000000
-; CHECK-LIBCALL-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; CHECK-LIBCALL-NEXT:    movl $1127219200, {{[0-9]+}}(%rsp) # imm = 0x43300000
-; CHECK-LIBCALL-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-LIBCALL-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-LIBCALL-NEXT:    callq __truncdfhf2@PLT
-; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-LIBCALL-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
+; CHECK-LIBCALL-NEXT:    pushq %rbx
+; CHECK-LIBCALL-NEXT:    subq $16, %rsp
+; CHECK-LIBCALL-NEXT:    movzwl (%rsi), %ebx
+; CHECK-LIBCALL-NEXT:    cvtsi2ss %edi, %xmm0
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-LIBCALL-NEXT:    movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-LIBCALL-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-LIBCALL-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
+; CHECK-LIBCALL-NEXT:    movl %ebx, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    callq __truncsfhf2@PLT
-; CHECK-LIBCALL-NEXT:    addq $40, %rsp
-; CHECK-LIBCALL-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee@PLT
+; CHECK-LIBCALL-NEXT:    movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT:    addq $16, %rsp
+; CHECK-LIBCALL-NEXT:    popq %rbx
+; CHECK-LIBCALL-NEXT:    jmp __gnu_h2f_ieee@PLT # TAILCALL
 ;
 ; BWON-F16C-LABEL: test_sitofp_fadd_i32:
 ; BWON-F16C:       # %bb.0:
 ; BWON-F16C-NEXT:    movzwl (%rsi), %eax
-; BWON-F16C-NEXT:    addl $-2147483648, %edi # imm = 0x80000000
-; BWON-F16C-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
-; BWON-F16C-NEXT:    movl $1127219200, -{{[0-9]+}}(%rsp) # imm = 0x43300000
-; BWON-F16C-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; BWON-F16C-NEXT:    vsubsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; BWON-F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %ecx
-; BWON-F16C-NEXT:    movzwl %cx, %ecx
-; BWON-F16C-NEXT:    vmovd %ecx, %xmm0
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vmovd %eax, %xmm1
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; BWON-F16C-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    movzwl %ax, %eax
-; BWON-F16C-NEXT:    vmovd %eax, %xmm0
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_sitofp_fadd_i32:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    subl $76, %esp
+; CHECK-I686-NEXT:    pushl %edi
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $20, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
-; CHECK-I686-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    movl $1127219200, {{[0-9]+}}(%esp) # imm = 0x43300000
-; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-I686-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; CHECK-I686-NEXT:    movsd %xmm0, (%esp)
-; CHECK-I686-NEXT:    calll __truncdfhf2
-; CHECK-I686-NEXT:    movapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    calll __extendhfsf2
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
+; CHECK-I686-NEXT:    movzwl (%eax), %edi
+; CHECK-I686-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, %si
+; CHECK-I686-NEXT:    movl %edi, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    movzwl %si, %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    calll __extendhfsf2
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    calll __extendhfsf2
-; CHECK-I686-NEXT:    addl $76, %esp
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movzwl %ax, %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    addl $20, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    popl %edi
 ; CHECK-I686-NEXT:    retl
   %tmp0 = load half, half* %b
   %tmp1 = sitofp i32 %a to half
@@ -840,21 +946,21 @@ define half @PR40273(half) #0 {
 ; CHECK-LIBCALL-LABEL: PR40273:
 ; CHECK-LIBCALL:       # %bb.0:
 ; CHECK-LIBCALL-NEXT:    pushq %rax
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
+; CHECK-LIBCALL-NEXT:    movzwl %di, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-LIBCALL-NEXT:    xorl %eax, %eax
 ; CHECK-LIBCALL-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-LIBCALL-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-LIBCALL-NEXT:    movl $15360, %ecx # imm = 0x3C00
 ; CHECK-LIBCALL-NEXT:    cmovnel %ecx, %eax
 ; CHECK-LIBCALL-NEXT:    cmovpl %ecx, %eax
-; CHECK-LIBCALL-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-LIBCALL-NEXT:    popq %rax
+; CHECK-LIBCALL-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-LIBCALL-NEXT:    popq %rcx
 ; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: PR40273:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT:    movzwl %ax, %eax
+; BWON-F16C-NEXT:    movzwl %di, %eax
 ; BWON-F16C-NEXT:    vmovd %eax, %xmm0
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    xorl %eax, %eax
@@ -863,16 +969,15 @@ define half @PR40273(half) #0 {
 ; BWON-F16C-NEXT:    movl $15360, %ecx # imm = 0x3C00
 ; BWON-F16C-NEXT:    cmovnel %ecx, %eax
 ; BWON-F16C-NEXT:    cmovpl %ecx, %eax
-; BWON-F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; BWON-F16C-NEXT:    # kill: def $ax killed $ax killed $eax
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: PR40273:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    subl $12, %esp
-; CHECK-I686-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    calll __extendhfsf2
+; CHECK-I686-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    xorl %eax, %eax
@@ -881,7 +986,7 @@ define half @PR40273(half) #0 {
 ; CHECK-I686-NEXT:    movl $15360, %ecx # imm = 0x3C00
 ; CHECK-I686-NEXT:    cmovnel %ecx, %eax
 ; CHECK-I686-NEXT:    cmovpl %ecx, %eax
-; CHECK-I686-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-I686-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-I686-NEXT:    addl $12, %esp
 ; CHECK-I686-NEXT:    retl
   %2 = fcmp une half %0, 0xH0000
@@ -889,70 +994,4 @@ define half @PR40273(half) #0 {
   ret half %3
 }
 
-define dso_local void @brcond(half %0) {
-; CHECK-LIBCALL-LABEL: brcond:
-; CHECK-LIBCALL:       # %bb.0: # %entry
-; CHECK-LIBCALL-NEXT:    pushq %rax
-; CHECK-LIBCALL-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-LIBCALL-NEXT:    callq __extendhfsf2@PLT
-; CHECK-LIBCALL-NEXT:    xorps %xmm1, %xmm1
-; CHECK-LIBCALL-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-LIBCALL-NEXT:    setp %al
-; CHECK-LIBCALL-NEXT:    setne %cl
-; CHECK-LIBCALL-NEXT:    orb %al, %cl
-; CHECK-LIBCALL-NEXT:    jne .LBB18_2
-; CHECK-LIBCALL-NEXT:  # %bb.1: # %if.then
-; CHECK-LIBCALL-NEXT:    popq %rax
-; CHECK-LIBCALL-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-LIBCALL-NEXT:    retq
-; CHECK-LIBCALL-NEXT:  .LBB18_2: # %if.end
-;
-; BWON-F16C-LABEL: brcond:
-; BWON-F16C:       # %bb.0: # %entry
-; BWON-F16C-NEXT:    vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT:    movzwl %ax, %eax
-; BWON-F16C-NEXT:    vmovd %eax, %xmm0
-; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; BWON-F16C-NEXT:    vucomiss %xmm1, %xmm0
-; BWON-F16C-NEXT:    setp %al
-; BWON-F16C-NEXT:    setne %cl
-; BWON-F16C-NEXT:    orb %al, %cl
-; BWON-F16C-NEXT:    jne .LBB18_2
-; BWON-F16C-NEXT:  # %bb.1: # %if.then
-; BWON-F16C-NEXT:    retq
-; BWON-F16C-NEXT:  .LBB18_2: # %if.end
-;
-; CHECK-I686-LABEL: brcond:
-; CHECK-I686:       # %bb.0: # %entry
-; CHECK-I686-NEXT:    subl $12, %esp
-; CHECK-I686-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-I686-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    calll __extendhfsf2
-; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT:    xorps %xmm1, %xmm1
-; CHECK-I686-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-I686-NEXT:    setp %al
-; CHECK-I686-NEXT:    setne %cl
-; CHECK-I686-NEXT:    orb %al, %cl
-; CHECK-I686-NEXT:    jne .LBB18_2
-; CHECK-I686-NEXT:  # %bb.1: # %if.then
-; CHECK-I686-NEXT:    addl $12, %esp
-; CHECK-I686-NEXT:    .cfi_def_cfa_offset 4
-; CHECK-I686-NEXT:    retl
-; CHECK-I686-NEXT:  .LBB18_2: # %if.end
-entry:
-  %cmp = fcmp oeq half 0xH0000, %0
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  ret void
-
-if.end:                                           ; preds = %entry
-  unreachable
-}
-
 attributes #0 = { nounwind }

diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index d426cd764c15c..555f769b316fe 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -7,77 +7,68 @@
 define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
 ; X86-LABEL: ir_fadd_v1f16:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $28, %esp
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esp)
-; X86-NEXT:    calll __extendhfsf2
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll __gnu_h2f_ieee
+; X86-NEXT:    movl %esi, (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    calll __gnu_h2f_ieee
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    calll __truncsfhf2
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    calll __gnu_f2h_ieee
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ir_fadd_v1f16:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    subq $16, %rsp
+; X64-NEXT:    movl %edi, %ebx
+; X64-NEXT:    movzwl %si, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movaps %xmm1, %xmm0
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
-; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
-; X64-NEXT:    callq __truncsfhf2@PLT
-; X64-NEXT:    popq %rax
+; X64-NEXT:    movzwl %bx, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
+; X64-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; X64-NEXT:    callq __gnu_f2h_ieee@PLT
+; X64-NEXT:    addq $16, %rsp
+; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ;
 ; F16C-LABEL: ir_fadd_v1f16:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vpextrw $0, %xmm0, %eax
-; F16C-NEXT:    vpextrw $0, %xmm1, %ecx
-; F16C-NEXT:    movzwl %cx, %ecx
-; F16C-NEXT:    vmovd %ecx, %xmm0
+; F16C-NEXT:    movzwl %si, %eax
+; F16C-NEXT:    vmovd %eax, %xmm0
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    movzwl %ax, %eax
+; F16C-NEXT:    movzwl %di, %eax
 ; F16C-NEXT:    vmovd %eax, %xmm1
 ; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; F16C-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT:    # kill: def $ax killed $ax killed $eax
 ; F16C-NEXT:    retq
 ;
 ; F16C-O0-LABEL: ir_fadd_v1f16:
 ; F16C-O0:       # %bb.0:
-; F16C-O0-NEXT:    vpextrw $0, %xmm1, %eax
-; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
-; F16C-O0-NEXT:    movzwl %ax, %eax
-; F16C-O0-NEXT:    vmovd %eax, %xmm1
-; F16C-O0-NEXT:    vcvtph2ps %xmm1, %xmm1
-; F16C-O0-NEXT:    vpextrw $0, %xmm0, %eax
-; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; F16C-O0-NEXT:    movw %si, %cx
+; F16C-O0-NEXT:    movw %di, %ax
+; F16C-O0-NEXT:    movzwl %cx, %ecx
+; F16C-O0-NEXT:    vmovd %ecx, %xmm0
+; F16C-O0-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; F16C-O0-NEXT:    movzwl %ax, %eax
 ; F16C-O0-NEXT:    vmovd %eax, %xmm0
 ; F16C-O0-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-O0-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; F16C-O0-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-O0-NEXT:    vmovd %xmm0, %eax
-; F16C-O0-NEXT:    movw %ax, %cx
-; F16C-O0-NEXT:    # implicit-def: $eax
-; F16C-O0-NEXT:    movw %cx, %ax
-; F16C-O0-NEXT:    # implicit-def: $xmm0
-; F16C-O0-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
 ; F16C-O0-NEXT:    retq
   %retval = fadd <1 x half> %arg0, %arg1
   ret <1 x half> %retval
@@ -86,148 +77,148 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
 define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
 ; X86-LABEL: ir_fadd_v2f16:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $80, %esp
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esp)
-; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $64, %esp
+; X86-NEXT:    movzwl 8(%ebp), %esi
+; X86-NEXT:    movzwl 12(%ebp), %edi
+; X86-NEXT:    movzwl 20(%ebp), %ebx
+; X86-NEXT:    movzwl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll __gnu_h2f_ieee
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esp)
-; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    movl %ebx, (%esp)
+; X86-NEXT:    calll __gnu_h2f_ieee
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esp)
-; X86-NEXT:    calll __extendhfsf2
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esp)
+; X86-NEXT:    movl %edi, (%esp)
+; X86-NEXT:    calll __gnu_h2f_ieee
+; X86-NEXT:    movl %esi, (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    calll __gnu_h2f_ieee
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movss %xmm0, (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __truncsfhf2
-; X86-NEXT:    movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    calll __gnu_f2h_ieee
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    calll __truncsfhf2
-; X86-NEXT:    movaps %xmm0, %xmm1
-; X86-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    addl $80, %esp
+; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    calll __gnu_f2h_ieee
+; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movdqa {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    pextrw $1, %xmm0, %edx
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    # kill: def $dx killed $dx killed $edx
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ir_fadd_v2f16:
 ; X64:       # %bb.0:
-; X64-NEXT:    subq $24, %rsp
-; X64-NEXT:    movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movaps %xmm2, %xmm0
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    subq $32, %rsp
+; X64-NEXT:    movl %edx, %ebp
+; X64-NEXT:    movl %esi, %ebx
+; X64-NEXT:    movl %edi, %r14d
+; X64-NEXT:    movzwl %cx, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %bx, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; X64-NEXT:    callq __truncsfhf2@PLT
+; X64-NEXT:    callq __gnu_f2h_ieee@PLT
+; X64-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movzwl %bp, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    movzwl %r14w, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee@PLT
 ; X64-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; X64-NEXT:    callq __truncsfhf2@PLT
-; X64-NEXT:    movaps %xmm0, %xmm1
-; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    callq __gnu_f2h_ieee@PLT
+; X64-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    pextrw $1, %xmm0, %edx
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    # kill: def $dx killed $dx killed $edx
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
 ;
 ; F16C-LABEL: ir_fadd_v2f16:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vpextrw $0, %xmm1, %eax
-; F16C-NEXT:    vpextrw $0, %xmm3, %ecx
-; F16C-NEXT:    vpextrw $0, %xmm0, %edx
-; F16C-NEXT:    vpextrw $0, %xmm2, %esi
-; F16C-NEXT:    movzwl %si, %esi
-; F16C-NEXT:    vmovd %esi, %xmm0
+; F16C-NEXT:    movzwl %cx, %eax
+; F16C-NEXT:    vmovd %eax, %xmm0
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    movzwl %dx, %edx
-; F16C-NEXT:    vmovd %edx, %xmm1
+; F16C-NEXT:    movzwl %si, %eax
+; F16C-NEXT:    vmovd %eax, %xmm1
 ; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; F16C-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %edx
-; F16C-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
-; F16C-NEXT:    movzwl %cx, %ecx
-; F16C-NEXT:    vmovd %ecx, %xmm1
+; F16C-NEXT:    vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    movzwl %dx, %eax
+; F16C-NEXT:    vmovd %eax, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    movzwl %di, %eax
+; F16C-NEXT:    vmovd %eax, %xmm1
 ; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT:    movzwl %ax, %eax
-; F16C-NEXT:    vmovd %eax, %xmm2
-; F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
-; F16C-NEXT:    vaddss %xmm1, %xmm2, %xmm1
-; F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; F16C-NEXT:    vmovd %xmm1, %eax
-; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
+; F16C-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0
+; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    vpextrw $1, %xmm0, %edx
+; F16C-NEXT:    # kill: def $ax killed $ax killed $eax
+; F16C-NEXT:    # kill: def $dx killed $dx killed $edx
 ; F16C-NEXT:    retq
 ;
 ; F16C-O0-LABEL: ir_fadd_v2f16:
 ; F16C-O0:       # %bb.0:
-; F16C-O0-NEXT:    vpextrw $0, %xmm2, %eax
-; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
-; F16C-O0-NEXT:    movzwl %ax, %eax
-; F16C-O0-NEXT:    vmovd %eax, %xmm2
-; F16C-O0-NEXT:    vcvtph2ps %xmm2, %xmm2
-; F16C-O0-NEXT:    vpextrw $0, %xmm0, %eax
+; F16C-O0-NEXT:    movl %esi, %eax
+; F16C-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; F16C-O0-NEXT:    movw %dx, %si
 ; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; F16C-O0-NEXT:    movw %di, %dx
+; F16C-O0-NEXT:    movzwl %si, %esi
+; F16C-O0-NEXT:    vmovd %esi, %xmm0
+; F16C-O0-NEXT:    vcvtph2ps %xmm0, %xmm1
+; F16C-O0-NEXT:    movzwl %dx, %edx
+; F16C-O0-NEXT:    vmovd %edx, %xmm0
+; F16C-O0-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-O0-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; F16C-O0-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-O0-NEXT:    vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
+; F16C-O0-NEXT:    movzwl %cx, %ecx
+; F16C-O0-NEXT:    vmovd %ecx, %xmm0
+; F16C-O0-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; F16C-O0-NEXT:    movzwl %ax, %eax
 ; F16C-O0-NEXT:    vmovd %eax, %xmm0
 ; F16C-O0-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-O0-NEXT:    vaddss %xmm2, %xmm0, %xmm0
+; F16C-O0-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; F16C-O0-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-O0-NEXT:    vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
+; F16C-O0-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0
 ; F16C-O0-NEXT:    vmovd %xmm0, %eax
-; F16C-O0-NEXT:    movw %ax, %cx
-; F16C-O0-NEXT:    # implicit-def: $eax
-; F16C-O0-NEXT:    movw %cx, %ax
-; F16C-O0-NEXT:    # implicit-def: $xmm0
-; F16C-O0-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; F16C-O0-NEXT:    vpextrw $0, %xmm3, %eax
 ; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
-; F16C-O0-NEXT:    movzwl %ax, %eax
-; F16C-O0-NEXT:    vmovd %eax, %xmm2
-; F16C-O0-NEXT:    vcvtph2ps %xmm2, %xmm2
-; F16C-O0-NEXT:    vpextrw $0, %xmm1, %eax
-; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
-; F16C-O0-NEXT:    movzwl %ax, %eax
-; F16C-O0-NEXT:    vmovd %eax, %xmm1
-; F16C-O0-NEXT:    vcvtph2ps %xmm1, %xmm1
-; F16C-O0-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; F16C-O0-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; F16C-O0-NEXT:    vmovd %xmm1, %eax
-; F16C-O0-NEXT:    movw %ax, %cx
-; F16C-O0-NEXT:    # implicit-def: $eax
-; F16C-O0-NEXT:    movw %cx, %ax
-; F16C-O0-NEXT:    # implicit-def: $xmm1
-; F16C-O0-NEXT:    vpinsrw $0, %eax, %xmm1, %xmm1
+; F16C-O0-NEXT:    vpextrw $1, %xmm0, %ecx
+; F16C-O0-NEXT:    movw %cx, %dx
 ; F16C-O0-NEXT:    retq
   %retval = fadd <2 x half> %arg0, %arg1
   ret <2 x half> %retval

diff --git a/llvm/test/CodeGen/X86/pr38533.ll b/llvm/test/CodeGen/X86/pr38533.ll
index 0f2360a2368d6..53652e69e7c02 100644
--- a/llvm/test/CodeGen/X86/pr38533.ll
+++ b/llvm/test/CodeGen/X86/pr38533.ll
@@ -1,52 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512fp16 | FileCheck %s --check-prefix=AVX512FP16
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s
 
 ; This test makes sure that a vector that needs to be promoted that is bitcasted to fp16 is legalized correctly without causing a width mismatch.
 define void @constant_fold_vector_to_half() {
-; SSE2-LABEL: constant_fold_vector_to_half:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000
-; SSE2-NEXT:    pinsrw $0, -{{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rax)
-; SSE2-NEXT:    retq
-;
-; AVX512-LABEL: constant_fold_vector_to_half:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000
-; AVX512-NEXT:    vpinsrw $0, -{{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512-NEXT:    vpextrw $0, %xmm0, (%rax)
-; AVX512-NEXT:    retq
-;
-; AVX512FP16-LABEL: constant_fold_vector_to_half:
-; AVX512FP16:       # %bb.0:
-; AVX512FP16-NEXT:    movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000
-; AVX512FP16-NEXT:    vmovsh -{{[0-9]+}}(%rsp), %xmm0
-; AVX512FP16-NEXT:    vmovsh %xmm0, (%rax)
-; AVX512FP16-NEXT:    retq
+; CHECK-LABEL: constant_fold_vector_to_half:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movw $16384, (%rax) # imm = 0x4000
+; CHECK-NEXT:    retq
   store volatile half bitcast (<4 x i4> <i4 0, i4 0, i4 0, i4 4> to half), half* undef
   ret void
 }
 
 ; Similarly this makes sure that the opposite bitcast of the above is also legalized without crashing.
 define void @pr38533_2(half %x) {
-; SSE2-LABEL: pr38533_2:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rax)
-; SSE2-NEXT:    retq
-;
-; AVX512-LABEL: pr38533_2:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrw $0, %xmm0, (%rax)
-; AVX512-NEXT:    retq
-;
-; AVX512FP16-LABEL: pr38533_2:
-; AVX512FP16:       # %bb.0:
-; AVX512FP16-NEXT:    vmovsh %xmm0, (%rax)
-; AVX512FP16-NEXT:    retq
+; CHECK-LABEL: pr38533_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movw %di, (%rax)
+; CHECK-NEXT:    retq
   %a = bitcast half %x to <4 x i4>
   store volatile <4 x i4> %a, <4 x i4>* undef
   ret void
@@ -54,21 +25,10 @@ define void @pr38533_2(half %x) {
 
 ; This case is a bitcast from fp16 to a 16-bit wide legal vector type. In this case the result type is legal when the bitcast gets type legalized.
 define void @pr38533_3(half %x) {
-; SSE2-LABEL: pr38533_3:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rax)
-; SSE2-NEXT:    retq
-;
-; AVX512-LABEL: pr38533_3:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrw $0, %xmm0, (%rax)
-; AVX512-NEXT:    retq
-;
-; AVX512FP16-LABEL: pr38533_3:
-; AVX512FP16:       # %bb.0:
-; AVX512FP16-NEXT:    vmovsh %xmm0, (%rax)
-; AVX512FP16-NEXT:    retq
+; CHECK-LABEL: pr38533_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movw %di, (%rax)
+; CHECK-NEXT:    retq
   %a = bitcast half %x to <16 x i1>
   store volatile <16 x i1> %a, <16 x i1>* undef
   ret void

diff --git a/llvm/test/CodeGen/X86/pr47000.ll b/llvm/test/CodeGen/X86/pr47000.ll
index 9855d2f32819c..5c77c48b37ba7 100755
--- a/llvm/test/CodeGen/X86/pr47000.ll
+++ b/llvm/test/CodeGen/X86/pr47000.ll
@@ -7,86 +7,55 @@ target triple = "i386-unknown-linux-unknown"
 define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
 ; CHECK-LABEL: doTheTestMod:
 ; CHECK:       # %bb.0: # %Entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $124, %esp
-; CHECK-NEXT:    # implicit-def: $xmm3
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm3
-; CHECK-NEXT:    # implicit-def: $xmm2
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm2
-; CHECK-NEXT:    # implicit-def: $xmm1
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    # implicit-def: $xmm0
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT:    # implicit-def: $xmm4
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm4
-; CHECK-NEXT:    # implicit-def: $xmm5
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm5
-; CHECK-NEXT:    # implicit-def: $xmm6
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm6
-; CHECK-NEXT:    # implicit-def: $xmm7
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm7
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    pextrw $0, %xmm7, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %si
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %dx
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %cx
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %ax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %di
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %bx
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %bp
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %ax
 ; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pextrw $0, %xmm6, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload
+; CHECK-NEXT:    movw %bp, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movw %di, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movw %si, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movw %dx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pextrw $0, %xmm5, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pextrw $0, %xmm4, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pextrw $0, %xmm3, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pextrw $0, %xmm2, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pextrw $0, %xmm1, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    # implicit-def: $xmm0
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    # implicit-def: $xmm0
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    # implicit-def: $xmm0
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    # implicit-def: $xmm0
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT:    # implicit-def: $xmm1
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    # implicit-def: $xmm1
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    # implicit-def: $xmm1
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    # implicit-def: $xmm1
-; CHECK-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, %cx
-; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    movw %cx, (%eax)
-; CHECK-NEXT:    calll __extendhfsf2
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    calll __gnu_h2f_ieee
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, %cx
 ; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    movw %cx, (%eax)
-; CHECK-NEXT:    calll __extendhfsf2
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    fxch %st(1)
@@ -95,24 +64,17 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
 ; CHECK-NEXT:    calll fmodf
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    fstps (%eax)
-; CHECK-NEXT:    calll __truncsfhf2
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, %cx
-; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    movw %cx, (%eax)
-; CHECK-NEXT:    calll __extendhfsf2
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    calll __gnu_f2h_ieee
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    calll __gnu_h2f_ieee
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, %cx
 ; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    movw %cx, (%eax)
-; CHECK-NEXT:    calll __extendhfsf2
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    fxch %st(1)
@@ -121,24 +83,17 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
 ; CHECK-NEXT:    calll fmodf
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    fstps (%eax)
-; CHECK-NEXT:    calll __truncsfhf2
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, %cx
-; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    movw %cx, (%eax)
-; CHECK-NEXT:    calll __extendhfsf2
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    calll __gnu_f2h_ieee
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movw %ax, %si
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    calll __gnu_h2f_ieee
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, %cx
 ; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    movw %cx, (%eax)
-; CHECK-NEXT:    calll __extendhfsf2
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    fxch %st(1)
@@ -147,24 +102,17 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
 ; CHECK-NEXT:    calll fmodf
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    fstps (%eax)
-; CHECK-NEXT:    calll __truncsfhf2
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, %cx
-; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    movw %cx, (%eax)
-; CHECK-NEXT:    calll __extendhfsf2
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    calll __gnu_f2h_ieee
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movw %ax, %di
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    calll __gnu_h2f_ieee
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, %cx
 ; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    movw %cx, (%eax)
-; CHECK-NEXT:    calll __extendhfsf2
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    fxch %st(1)
@@ -173,29 +121,20 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
 ; CHECK-NEXT:    calll fmodf
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    fstps (%eax)
-; CHECK-NEXT:    calll __truncsfhf2
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Reload
-; CHECK-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
-; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    calll __gnu_f2h_ieee
+; CHECK-NEXT:    movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movw %ax, %bx
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT:    movaps %xmm0, %xmm3
-; CHECK-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    pextrw $0, %xmm3, %edx
-; CHECK-NEXT:    # kill: def $dx killed $dx killed $edx
-; CHECK-NEXT:    movw %dx, 6(%ecx)
-; CHECK-NEXT:    pextrw $0, %xmm2, %edx
-; CHECK-NEXT:    # kill: def $dx killed $dx killed $edx
-; CHECK-NEXT:    movw %dx, 4(%ecx)
-; CHECK-NEXT:    pextrw $0, %xmm1, %edx
-; CHECK-NEXT:    # kill: def $dx killed $dx killed $edx
-; CHECK-NEXT:    movw %dx, 2(%ecx)
-; CHECK-NEXT:    pextrw $0, %xmm0, %edx
-; CHECK-NEXT:    # kill: def $dx killed $dx killed $edx
+; CHECK-NEXT:    movw %bx, 6(%ecx)
+; CHECK-NEXT:    movw %di, 4(%ecx)
+; CHECK-NEXT:    movw %si, 2(%ecx)
 ; CHECK-NEXT:    movw %dx, (%ecx)
 ; CHECK-NEXT:    addl $124, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl $4
 Entry:
   %x = alloca <4 x half>, align 8

diff --git a/llvm/test/CodeGen/X86/scheduler-asm-moves.mir b/llvm/test/CodeGen/X86/scheduler-asm-moves.mir
index 7def77b74eb4a..4c515f2f4b788 100644
--- a/llvm/test/CodeGen/X86/scheduler-asm-moves.mir
+++ b/llvm/test/CodeGen/X86/scheduler-asm-moves.mir
@@ -128,7 +128,7 @@ body:             |
     ; CHECK-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_daddr, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_daddr, !tbaa !4)
     ; CHECK-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_proto, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_proto, !tbaa !4)
     ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags
-    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2359307 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, [[MOV8rm]], 2359305 /* reguse:GR32 */, [[MOV32rm]], 2359305 /* reguse:GR32 */, [[MOV32r0_]], 2359305 /* reguse:GR32 */, [[MOV32rm1]], 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !8
+    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2293771 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, [[MOV8rm]], 2293769 /* reguse:GR32 */, [[MOV32rm]], 2293769 /* reguse:GR32 */, [[MOV32r0_]], 2293769 /* reguse:GR32 */, [[MOV32rm1]], 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !8
     ; CHECK-NEXT: MOV32mr $noreg, 1, $noreg, @csum_ipv6_magic_sum, $noreg, %2 :: (store (s32) into @csum_ipv6_magic_sum, !tbaa !4)
     ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @synproxy_send_tcp_ipv6_nskb, $noreg :: (dereferenceable load (s32) from `i8** bitcast (%struct.sk_buff** @synproxy_send_tcp_ipv6_nskb to i8**)`, !tbaa !9)
     ; CHECK-NEXT: OR8mi [[MOV32rm2]], 1, $noreg, 0, $noreg, 3, implicit-def dead $eflags :: (store (s8) into %ir.4), (load (s8) from %ir.4)
@@ -143,7 +143,7 @@ body:             |
     %4:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_daddr, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_daddr, !tbaa !5)
     %6:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_proto, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_proto, !tbaa !5)
     %5:gr32 = MOV32r0 implicit-def dead $eflags
-    INLINEASM &"", 0 /* attdialect */, 2359307 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, %3, 2359305 /* reguse:GR32 */, %4, 2359305 /* reguse:GR32 */, %5, 2359305 /* reguse:GR32 */, %6, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !9
+    INLINEASM &"", 0 /* attdialect */, 2293771 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, %3, 2293769 /* reguse:GR32 */, %4, 2293769 /* reguse:GR32 */, %5, 2293769 /* reguse:GR32 */, %6, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !9
     MOV32mr $noreg, 1, $noreg, @csum_ipv6_magic_sum, $noreg, %2 :: (store (s32) into @csum_ipv6_magic_sum, !tbaa !5)
     %7:gr32 = MOV32rm $noreg, 1, $noreg, @synproxy_send_tcp_ipv6_nskb, $noreg :: (dereferenceable load (s32) from `i8** bitcast (%struct.sk_buff** @synproxy_send_tcp_ipv6_nskb to i8**)`, !tbaa !10)
     OR8mi %7, 1, $noreg, 0, $noreg, 3, implicit-def dead $eflags :: (store (s8) into %ir.4), (load (s8) from %ir.4)

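The only change in this MIR test is the packed integer operand flags on the INLINEASM: constants like 2359305 encode the register-class ID of the constraint in their upper bits, so 0x240009 becoming 0x230009 still decodes as reguse:GR32, just with the class ID shifted down by one, presumably because the revert removes the FP16 register class the psABI patch had added to the enumeration. A hedged sketch of IR-level inline asm that lowers to such regdef-ec/reguse operands (the constraint string is illustrative, not the one in the test):

define i32 @asm_gr32(i8 %a, i32 %b, i32 %c, i32 %d) {
  ; One early-clobber GR32 def ("=&r") plus one GR8 and three GR32 uses,
  ; mirroring the operand layout of the INLINEASM in the MIR above.
  %r = call i32 asm "", "=&r,r,r,r,r"(i8 %a, i32 %b, i32 %c, i32 %d)
  ret i32 %r
}
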
diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
index 7734a270a78d9..6921bc142f1dc 100644
--- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
+++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
@@ -4,30 +4,13 @@
 define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
 ; CHECK-LABEL: f:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pinsrw $0, (%rdi), %xmm0
-; CHECK-NEXT:    pinsrw $0, 2(%rdi), %xmm1
-; CHECK-NEXT:    pinsrw $0, 4(%rdi), %xmm2
-; CHECK-NEXT:    pinsrw $0, 6(%rdi), %xmm3
-; CHECK-NEXT:    pinsrw $0, (%rsi), %xmm4
-; CHECK-NEXT:    pinsrw $0, 2(%rsi), %xmm5
-; CHECK-NEXT:    pinsrw $0, 4(%rsi), %xmm6
-; CHECK-NEXT:    pinsrw $0, 6(%rsi), %xmm7
-; CHECK-NEXT:    pextrw $0, %xmm7, %eax
-; CHECK-NEXT:    movw %ax, 14(%rdx)
-; CHECK-NEXT:    pextrw $0, %xmm3, %eax
-; CHECK-NEXT:    movw %ax, 12(%rdx)
-; CHECK-NEXT:    pextrw $0, %xmm6, %eax
-; CHECK-NEXT:    movw %ax, 10(%rdx)
-; CHECK-NEXT:    pextrw $0, %xmm2, %eax
-; CHECK-NEXT:    movw %ax, 8(%rdx)
-; CHECK-NEXT:    pextrw $0, %xmm5, %eax
-; CHECK-NEXT:    movw %ax, 6(%rdx)
-; CHECK-NEXT:    pextrw $0, %xmm1, %eax
-; CHECK-NEXT:    movw %ax, 4(%rdx)
-; CHECK-NEXT:    pextrw $0, %xmm4, %eax
-; CHECK-NEXT:    movw %ax, 2(%rdx)
-; CHECK-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-NEXT:    movw %ax, (%rdx)
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    movq (%rsi), %rax
+; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    movdqa %xmm0, (%rdx)
 ; CHECK-NEXT:    retq
   %tmp4 = load <4 x half>, <4 x half>* %a
   %tmp5 = load <4 x half>, <4 x half>* %b

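After the revert, the two <4 x half> loads above are again handled as 8-byte scalars, bounced through the stack, and interleaved with a single punpcklwd instead of eight pinsrw loads and eight pextrw stores. A reconstruction of @f's likely body (the define line is taken from the diff context; the interleaving mask is inferred from the punpcklwd pattern and is an assumption):

define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
  %x = load <4 x half>, <4 x half>* %a
  %y = load <4 x half>, <4 x half>* %b
  ; Interleave the lanes of %x and %y, matching the
  ; xmm0[0],mem[0],xmm0[1],mem[1],... pattern in the CHECK lines.
  %z = shufflevector <4 x half> %x, <4 x half> %y, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x half> %z, <8 x half>* %c
  ret void
}
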
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
index 9cd1d0cf5fc59..a0cc87f87db8f 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
@@ -803,7 +803,7 @@ define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a
 
 define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmadd123sh:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
   ret half %2
@@ -812,7 +812,7 @@ declare half @llvm.fma.f16(half, half, half)
 
 define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmadd213sh:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
   ret half %2
@@ -820,7 +820,7 @@ define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmadd231sh:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
   ret half %2
@@ -828,7 +828,7 @@ define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmadd321sh:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
   ret half %2
@@ -836,7 +836,7 @@ define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmadd132sh:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
   ret half %2
@@ -844,7 +844,7 @@ define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmadd312sh:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
   ret half %2
@@ -852,7 +852,7 @@ define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmsub123sh:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2)
@@ -861,7 +861,7 @@ define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmsub213sh:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2)
@@ -870,7 +870,7 @@ define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmsub231sh:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2)
@@ -879,7 +879,7 @@ define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmsub321sh:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2)
@@ -888,7 +888,7 @@ define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmsub132sh:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2)
@@ -897,7 +897,7 @@ define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fmsub312sh:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2)
@@ -906,7 +906,7 @@ define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmadd123sh:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2)
@@ -915,7 +915,7 @@ define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmadd213sh:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2)
@@ -924,7 +924,7 @@ define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmadd231sh:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0)
@@ -933,7 +933,7 @@ define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmadd321sh:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0)
@@ -942,7 +942,7 @@ define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmadd132sh:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1)
@@ -951,7 +951,7 @@ define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmadd312sh:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1)
@@ -960,7 +960,7 @@ define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmsub123sh:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = fneg half %a2
@@ -970,7 +970,7 @@ define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmsub213sh:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = fneg half %a2
@@ -980,7 +980,7 @@ define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmsub231sh:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = fneg half %a0
@@ -990,7 +990,7 @@ define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmsub321sh:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = fneg half %a0
@@ -1000,7 +1000,7 @@ define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmsub132sh:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = fneg half %a1
@@ -1010,7 +1010,7 @@ define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
 
 define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
   ;CHECK-LABEL: stack_fold_fnmsub312sh:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = fneg half %a1

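Every hunk in this file changes only the size annotation on the folded reload: after the revert a spilled half occupies a 2-byte (f16-sized) stack slot again, where the reverted patch had widened the spill to a 4-byte slot. The tests force the spill with an inline asm that clobbers essentially every XMM register; a trimmed sketch of that pattern follows (the clobber list is abbreviated here, so an actual run may not spill exactly as the full test does):

declare half @llvm.fma.f16(half, half, half)

define half @fold_fmadd_sh(half %a0, half %a1, half %a2) {
  ; Clobbering the XMM registers forces one operand to be spilled, so the
  ; vfmadd213sh must fold a (now 2-byte) reload from the stack slot.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  ret half %2
}
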
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index f57a1caf3e4f7..0be20ab6d5f41 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -50,7 +50,7 @@ define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
 
 define half @stack_fold_addsh(half %a0, half %a1) {
   ;CHECK-LABEL: stack_fold_addsh
-  ;CHECK:       vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fadd half %a0, %a1
   ret half %2
@@ -107,7 +107,7 @@ define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half>
 
 define half @stack_fold_divsh(half %a0, half %a1) {
   ;CHECK-LABEL: stack_fold_divsh
-  ;CHECK:       vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fdiv half %a0, %a1
   ret half %2
@@ -390,7 +390,7 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0,
 
 define half @stack_fold_maxsh(half %a0, half %a1) #0 {
   ;CHECK-LABEL: stack_fold_maxsh:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp ogt half %a0, %a1
   %3 = select i1 %2, half %a0, half %a1
@@ -399,7 +399,7 @@ define half @stack_fold_maxsh(half %a0, half %a1) #0 {
 
 define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
   ;CHECK-LABEL: stack_fold_maxsh_commuted:
-  ;CHECK-NOT:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK-NOT:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp ogt half %a1, %a0
   %3 = select i1 %2, half %a1, half %a0
@@ -408,7 +408,7 @@ define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
 
 define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 {
   ;CHECK-LABEL: stack_fold_maxsh_commutable:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp ogt half %a0, %a1
   %3 = select i1 %2, half %a0, half %a1
@@ -417,7 +417,7 @@ define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 {
 
 define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
   ;CHECK-LABEL: stack_fold_maxsh_commutable_commuted:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp ogt half %a1, %a0
   %3 = select i1 %2, half %a1, half %a0
@@ -569,7 +569,7 @@ define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0,
 
 define half @stack_fold_minsh(half %a0, half %a1) #0 {
   ;CHECK-LABEL: stack_fold_minsh:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp olt half %a0, %a1
   %3 = select i1 %2, half %a0, half %a1
@@ -578,7 +578,7 @@ define half @stack_fold_minsh(half %a0, half %a1) #0 {
 
 define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
   ;CHECK-LABEL: stack_fold_minsh_commuted:
-  ;CHECK-NOT:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK-NOT:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp olt half %a1, %a0
   %3 = select i1 %2, half %a1, half %a0
@@ -587,7 +587,7 @@ define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
 
 define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 {
   ;CHECK-LABEL: stack_fold_minsh_commutable:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp olt half %a0, %a1
   %3 = select i1 %2, half %a0, half %a1
@@ -596,7 +596,7 @@ define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 {
 
 define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
   ;CHECK-LABEL: stack_fold_minsh_commutable_commuted:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp olt half %a1, %a0
   %3 = select i1 %2, half %a1, half %a0
@@ -671,7 +671,7 @@ define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
 
 define half @stack_fold_mulsh(half %a0, half %a1) {
   ;CHECK-LABEL: stack_fold_mulsh
-  ;CHECK-NOT:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK-NOT:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fmul half %a0, %a1
   ret half %2
@@ -972,7 +972,7 @@ define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) {
 
 define half @stack_fold_subsh(half %a0, half %a1) {
   ;CHECK-LABEL: stack_fold_subsh
-  ;CHECK:       vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  ;CHECK:       vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fsub half %a0, %a1
   ret half %2

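Same 4-byte-to-2-byte reload change for the scalar arithmetic folds. Note that the commuted min/max variants use CHECK-NOT: like minss/maxss, vminsh/vmaxsh return the second source operand for NaN and signed-zero inputs, so the backend may only commute (and thus fold the spilled operand) when fast-math function attributes permit it. A sketch of the non-commutable variant (attribute contents are assumed, following the test's #0/#1 convention; clobber list abbreviated):

; Not foldable: commuting vminsh would change NaN/signed-zero behavior.
define half @minsh_strict(half %a0, half %a1) #0 {
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{flags}"()
  %2 = fcmp olt half %a1, %a0
  %3 = select i1 %2, half %a1, half %a0
  ret half %3
}

attributes #0 = { "unsafe-fp-math"="false" }
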
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
index a7829ab938eec..e55b618bc9965 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
@@ -340,7 +340,7 @@ body:             |
   ; CHECK:   CMP64rr [[NOT64r2]], [[COPY6]], implicit-def $eflags
   ; CHECK:   undef %102.sub_32bit:gr64_with_sub_8bit = MOV32ri 0
   ; CHECK:   [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %102, 4, implicit killed $eflags
-  ; CHECK:   INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %102, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+  ; CHECK:   INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %102, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
   ; CHECK:   LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, [[COPY5]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1)
   ; CHECK:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK:   $rdi = COPY [[COPY4]]
@@ -456,7 +456,7 @@ body:             |
     %63:gr64 = NOT64r %63
     CMP64rr %63, %31, implicit-def $eflags
     %63:gr64 = CMOV64rr %63, %53, 4, implicit killed $eflags
-    INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %53, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+    INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %53, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
     LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, %65, implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1)
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
     $rdi = COPY %64

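As in scheduler-asm-moves.mir, only the packed INLINEASM operand flags change here: 4456457 (0x440009) becomes 4390921 (0x430009), both still decoding as reguse:GR64, again consistent with one register class dropping out of the enumeration. The underlying asm is an atomic bit-set; at the IR level it looks roughly like this (the asm string is taken from the test, the constraint string is an assumption):

define void @set_bit(i64 %bit, i64* %addr) {
  ; Atomically set bit %bit in the quadword at %addr; the clobbers
  ; correspond to the $df/$fpsw/$eflags implicit-defs in the MIR.
  call void asm sideeffect "lock btsq $0,($1)", "r,r,~{dirflag},~{fpsr},~{flags}"(i64 %bit, i64* %addr)
  ret void
}
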
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index db8d04704df4f..25b702db977aa 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2101,56 +2101,58 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
 define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
 ; SSE-LABEL: fptosi_2f16_to_4i32:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    subq $16, %rsp
-; SSE-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    cvttss2si %xmm0, %ebx
-; SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    movl %esi, %ebx
+; SSE-NEXT:    movzwl %di, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee@PLT
+; SSE-NEXT:    cvttss2si %xmm0, %ebp
+; SSE-NEXT:    movzwl %bx, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee@PLT
 ; SSE-NEXT:    cvttss2si %xmm0, %eax
 ; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd %ebx, %xmm1
+; SSE-NEXT:    movd %ebp, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT:    addq $16, %rsp
+; SSE-NEXT:    addq $8, %rsp
 ; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; VEX-LABEL: fptosi_2f16_to_4i32:
 ; VEX:       # %bb.0:
+; VEX-NEXT:    pushq %rbp
 ; VEX-NEXT:    pushq %rbx
-; VEX-NEXT:    subq $16, %rsp
-; VEX-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; VEX-NEXT:    callq __extendhfsf2@PLT
-; VEX-NEXT:    vcvttss2si %xmm0, %ebx
-; VEX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; VEX-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; VEX-NEXT:    callq __extendhfsf2@PLT
+; VEX-NEXT:    pushq %rax
+; VEX-NEXT:    movl %esi, %ebx
+; VEX-NEXT:    movzwl %di, %edi
+; VEX-NEXT:    callq __gnu_h2f_ieee@PLT
+; VEX-NEXT:    vcvttss2si %xmm0, %ebp
+; VEX-NEXT:    movzwl %bx, %edi
+; VEX-NEXT:    callq __gnu_h2f_ieee@PLT
 ; VEX-NEXT:    vcvttss2si %xmm0, %eax
 ; VEX-NEXT:    vmovd %eax, %xmm0
-; VEX-NEXT:    vmovd %ebx, %xmm1
+; VEX-NEXT:    vmovd %ebp, %xmm1
 ; VEX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; VEX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; VEX-NEXT:    addq $16, %rsp
+; VEX-NEXT:    addq $8, %rsp
 ; VEX-NEXT:    popq %rbx
+; VEX-NEXT:    popq %rbp
 ; VEX-NEXT:    retq
 ;
 ; AVX512-LABEL: fptosi_2f16_to_4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrw $0, %xmm1, %eax
-; AVX512-NEXT:    vpextrw $0, %xmm0, %ecx
-; AVX512-NEXT:    movzwl %cx, %ecx
-; AVX512-NEXT:    vmovd %ecx, %xmm0
-; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT:    vcvttss2si %xmm0, %ecx
-; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    movzwl %di, %eax
 ; AVX512-NEXT:    vmovd %eax, %xmm0
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vcvttss2si %xmm0, %eax
-; AVX512-NEXT:    vmovd %eax, %xmm0
-; AVX512-NEXT:    vmovd %ecx, %xmm1
+; AVX512-NEXT:    movzwl %si, %ecx
+; AVX512-NEXT:    vmovd %ecx, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vcvttss2si %xmm0, %ecx
+; AVX512-NEXT:    vmovd %ecx, %xmm0
+; AVX512-NEXT:    vmovd %eax, %xmm1
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX512-NEXT:    retq

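This test makes the ABI consequence of the revert directly visible: under the psABI change the two half elements arrived in %xmm0/%xmm1 and were widened via __extendhfsf2, while after the revert they arrive as i16 in %edi/%esi and go through __gnu_h2f_ieee. A sketch of the pattern being exercised (the zero-extending shuffle is inferred from the movq/zero CHECK lines; the real test body may differ):

define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
  ; Each half is extended to float (libcall or vcvtph2ps), then cvttss2si.
  %cvt = fptosi <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}
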
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 3c25b31131372..c3b6edf00772a 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -557,8 +557,7 @@ define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
 ; ALL-LABEL: store_cvt_f32_to_i16:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    movw %ax, (%rdi)
+; ALL-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; ALL-NEXT:    retq
   %1 = fptrunc float %a0 to half
   %2 = bitcast half %1 to i16
@@ -648,11 +647,7 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 define i16 @cvt_f64_to_i16(double %a0) nounwind {
 ; ALL-LABEL: cvt_f64_to_i16:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; ALL-NEXT:    retq
+; ALL-NEXT:    jmp __truncdfhf2@PLT # TAILCALL
   %1 = fptrunc double %a0 to half
   %2 = bitcast half %1 to i16
   ret i16 %2
@@ -661,16 +656,16 @@ define i16 @cvt_f64_to_i16(double %a0) nounwind {
 define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
 ; ALL-LABEL: cvt_2f64_to_2i16:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm1
-; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT:    vmovd %xmm1, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; ALL-NEXT:    subq $40, %rsp
+; ALL-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, (%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovaps (%rsp), %xmm0
+; ALL-NEXT:    addq $40, %rsp
 ; ALL-NEXT:    retq
   %1 = fptrunc <2 x double> %a0 to <2 x half>
   %2 = bitcast <2 x half> %1 to <2 x i16>
@@ -680,27 +675,28 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
 define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; ALL-LABEL: cvt_4f64_to_4i16:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT:    vmovd %xmm1, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; ALL-NEXT:    subq $72, %rsp
+; ALL-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, (%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovaps (%rsp), %xmm0
+; ALL-NEXT:    addq $72, %rsp
 ; ALL-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>
   %2 = bitcast <4 x half> %1 to <4 x i16>
@@ -710,27 +706,28 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; ALL-LABEL: cvt_4f64_to_8i16_undef:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT:    vmovd %xmm1, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; ALL-NEXT:    subq $72, %rsp
+; ALL-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, (%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovaps (%rsp), %xmm0
+; ALL-NEXT:    addq $72, %rsp
 ; ALL-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>
   %2 = bitcast <4 x half> %1 to <4 x i16>
@@ -741,27 +738,28 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; ALL-LABEL: cvt_4f64_to_8i16_zero:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT:    vmovd %xmm1, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    subq $72, %rsp
+; ALL-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, (%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    addq $72, %rsp
 ; ALL-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>
   %2 = bitcast <4 x half> %1 to <4 x i16>
@@ -772,165 +770,205 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX1-LABEL: cvt_8f64_to_8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT:    vmovd %xmm2, %eax
-; AVX1-NEXT:    shll $16, %eax
-; AVX1-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT:    vmovd %xmm2, %ecx
-; AVX1-NEXT:    movzwl %cx, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    subq $64, %rsp
+; AVX1-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %ebx
+; AVX1-NEXT:    shll $16, %ebx
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movzwl %ax, %r15d
+; AVX1-NEXT:    orl %ebx, %r15d
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT:    vmovd %xmm2, %edx
-; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %ebx
+; AVX1-NEXT:    shll $16, %ebx
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movzwl %ax, %r14d
+; AVX1-NEXT:    orl %ebx, %r14d
+; AVX1-NEXT:    shlq $32, %r14
+; AVX1-NEXT:    orq %r15, %r14
+; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %ebx
+; AVX1-NEXT:    shll $16, %ebx
+; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movzwl %ax, %r15d
+; AVX1-NEXT:    orl %ebx, %r15d
+; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %ebx
+; AVX1-NEXT:    shll $16, %ebx
+; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __truncdfhf2@PLT
 ; AVX1-NEXT:    movzwl %ax, %eax
-; AVX1-NEXT:    orl %edx, %eax
+; AVX1-NEXT:    orl %ebx, %eax
 ; AVX1-NEXT:    shlq $32, %rax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX1-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm0
-; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %edx
-; AVX1-NEXT:    movzwl %dx, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT:    vmovd %xmm1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %esi
-; AVX1-NEXT:    movzwl %si, %esi
-; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    shlq $32, %rsi
-; AVX1-NEXT:    orq %rdx, %rsi
-; AVX1-NEXT:    vmovq %rsi, %xmm0
-; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    orq %r15, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vmovq %r14, %xmm1
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    addq $64, %rsp
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    popq %r15
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: cvt_8f64_to_8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX2-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %xmm2, %eax
-; AVX2-NEXT:    shll $16, %eax
-; AVX2-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm2
-; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %xmm2, %ecx
-; AVX2-NEXT:    movzwl %cx, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    subq $64, %rsp
+; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %ebx
+; AVX2-NEXT:    shll $16, %ebx
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movzwl %ax, %r15d
+; AVX2-NEXT:    orl %ebx, %r15d
+; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX2-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %xmm2, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %ebx
+; AVX2-NEXT:    shll $16, %ebx
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movzwl %ax, %r14d
+; AVX2-NEXT:    orl %ebx, %r14d
+; AVX2-NEXT:    shlq $32, %r14
+; AVX2-NEXT:    orq %r15, %r14
+; AVX2-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %ebx
+; AVX2-NEXT:    shll $16, %ebx
+; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movzwl %ax, %r15d
+; AVX2-NEXT:    orl %ebx, %r15d
+; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %ebx
+; AVX2-NEXT:    shll $16, %ebx
+; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    callq __truncdfhf2@PLT
 ; AVX2-NEXT:    movzwl %ax, %eax
-; AVX2-NEXT:    orl %edx, %eax
+; AVX2-NEXT:    orl %ebx, %eax
 ; AVX2-NEXT:    shlq $32, %rax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX2-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm0
-; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %edx
-; AVX2-NEXT:    movzwl %dx, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT:    vmovd %xmm1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %esi
-; AVX2-NEXT:    movzwl %si, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    shlq $32, %rsi
-; AVX2-NEXT:    orq %rdx, %rsi
-; AVX2-NEXT:    vmovq %rsi, %xmm0
-; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    orq %r15, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    vmovq %r14, %xmm1
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    addq $64, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: cvt_8f64_to_8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vmovd %xmm1, %eax
-; AVX512-NEXT:    shll $16, %eax
-; AVX512-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vmovd %xmm1, %ecx
-; AVX512-NEXT:    movzwl %cx, %ecx
-; AVX512-NEXT:    orl %eax, %ecx
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT:    vmovd %xmm2, %edx
-; AVX512-NEXT:    shll $16, %edx
-; AVX512-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vmovd %xmm1, %eax
-; AVX512-NEXT:    movzwl %ax, %eax
-; AVX512-NEXT:    orl %edx, %eax
-; AVX512-NEXT:    shlq $32, %rax
-; AVX512-NEXT:    orq %rcx, %rax
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    subq $80, %rsp
+; AVX512-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %ebx
+; AVX512-NEXT:    shll $16, %ebx
+; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movzwl %ax, %r15d
+; AVX512-NEXT:    orl %ebx, %r15d
+; AVX512-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %ebx
+; AVX512-NEXT:    shll $16, %ebx
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movzwl %ax, %r14d
+; AVX512-NEXT:    orl %ebx, %r14d
+; AVX512-NEXT:    shlq $32, %r14
+; AVX512-NEXT:    orq %r15, %r14
+; AVX512-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vmovd %xmm1, %ecx
-; AVX512-NEXT:    shll $16, %ecx
-; AVX512-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vmovd %xmm1, %edx
-; AVX512-NEXT:    movzwl %dx, %edx
-; AVX512-NEXT:    orl %ecx, %edx
+; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %ebx
+; AVX512-NEXT:    shll $16, %ebx
+; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movzwl %ax, %r15d
+; AVX512-NEXT:    orl %ebx, %r15d
+; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vmovd %xmm1, %ecx
-; AVX512-NEXT:    shll $16, %ecx
-; AVX512-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %esi
-; AVX512-NEXT:    movzwl %si, %esi
-; AVX512-NEXT:    orl %ecx, %esi
-; AVX512-NEXT:    shlq $32, %rsi
-; AVX512-NEXT:    orq %rdx, %rsi
-; AVX512-NEXT:    vmovq %rsi, %xmm0
-; AVX512-NEXT:    vmovq %rax, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %ebx
+; AVX512-NEXT:    shll $16, %ebx
+; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    orl %ebx, %eax
+; AVX512-NEXT:    shlq $32, %rax
+; AVX512-NEXT:    orq %r15, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vmovq %r14, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    addq $80, %rsp
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %r15
 ; AVX512-NEXT:    retq
   %1 = fptrunc <8 x double> %a0 to <8 x half>
   %2 = bitcast <8 x half> %1 to <8 x i16>
@@ -944,10 +982,11 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
 ; ALL-LABEL: store_cvt_f64_to_i16:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    movw %ax, (%rdi)
+; ALL-NEXT:    pushq %rbx
+; ALL-NEXT:    movq %rdi, %rbx
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, (%rbx)
+; ALL-NEXT:    popq %rbx
 ; ALL-NEXT:    retq
   %1 = fptrunc double %a0 to half
   %2 = bitcast half %1 to i16
@@ -958,15 +997,21 @@ define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
 define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
 ; ALL-LABEL: store_cvt_2f64_to_2i16:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT:    vmovd %xmm1, %eax
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %ecx
-; ALL-NEXT:    movw %cx, (%rdi)
-; ALL-NEXT:    movw %ax, 2(%rdi)
+; ALL-NEXT:    pushq %rbp
+; ALL-NEXT:    pushq %rbx
+; ALL-NEXT:    subq $24, %rsp
+; ALL-NEXT:    movq %rdi, %rbx
+; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movl %eax, %ebp
+; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, (%rbx)
+; ALL-NEXT:    movw %bp, 2(%rbx)
+; ALL-NEXT:    addq $24, %rsp
+; ALL-NEXT:    popq %rbx
+; ALL-NEXT:    popq %rbp
 ; ALL-NEXT:    retq
   %1 = fptrunc <2 x double> %a0 to <2 x half>
   %2 = bitcast <2 x half> %1 to <2 x i16>
@@ -975,29 +1020,119 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
 }
 
 define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
-; ALL-LABEL: store_cvt_4f64_to_4i16:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT:    vmovd %xmm1, %eax
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %ecx
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %edx
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %esi
-; ALL-NEXT:    movw %si, 4(%rdi)
-; ALL-NEXT:    movw %dx, (%rdi)
-; ALL-NEXT:    movw %cx, 6(%rdi)
-; ALL-NEXT:    movw %ax, 2(%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
+; AVX1-LABEL: store_cvt_4f64_to_4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    subq $56, %rsp
+; AVX1-NEXT:    movq %rdi, %rbx
+; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %r14d
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %r15d
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %ebp
+; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movw %ax, 4(%rbx)
+; AVX1-NEXT:    movw %bp, (%rbx)
+; AVX1-NEXT:    movw %r15w, 6(%rbx)
+; AVX1-NEXT:    movw %r14w, 2(%rbx)
+; AVX1-NEXT:    addq $56, %rsp
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: store_cvt_4f64_to_4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    subq $56, %rsp
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %r14d
+; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %r15d
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %ebp
+; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movw %ax, 4(%rbx)
+; AVX2-NEXT:    movw %bp, (%rbx)
+; AVX2-NEXT:    movw %r15w, 6(%rbx)
+; AVX2-NEXT:    movw %r14w, 2(%rbx)
+; AVX2-NEXT:    addq $56, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: store_cvt_4f64_to_4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    subq $56, %rsp
+; AVX512-NEXT:    movq %rdi, %rbx
+; AVX512-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %r14d
+; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %r15d
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %ebp
+; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movw %ax, 4(%rbx)
+; AVX512-NEXT:    movw %bp, (%rbx)
+; AVX512-NEXT:    movw %r15w, 6(%rbx)
+; AVX512-NEXT:    movw %r14w, 2(%rbx)
+; AVX512-NEXT:    addq $56, %rsp
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>
   %2 = bitcast <4 x half> %1 to <4 x i16>
   store <4 x i16> %2, <4 x i16>* %a1
@@ -1007,28 +1142,32 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
 define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
 ; ALL-LABEL: store_cvt_4f64_to_8i16_undef:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT:    vmovd %xmm1, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
-; ALL-NEXT:    vmovaps %xmm0, (%rdi)
+; ALL-NEXT:    pushq %rbx
+; ALL-NEXT:    subq $64, %rsp
+; ALL-NEXT:    movq %rdi, %rbx
+; ALL-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, (%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovaps (%rsp), %xmm0
+; ALL-NEXT:    vmovaps %xmm0, (%rbx)
+; ALL-NEXT:    addq $64, %rsp
+; ALL-NEXT:    popq %rbx
 ; ALL-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>
   %2 = bitcast <4 x half> %1 to <4 x i16>
@@ -1040,28 +1179,32 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
 ; ALL-LABEL: store_cvt_4f64_to_8i16_zero:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm2
-; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT:    vmovd %xmm2, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT:    vmovd %xmm1, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vmovaps %xmm0, (%rdi)
+; ALL-NEXT:    pushq %rbx
+; ALL-NEXT:    subq $64, %rsp
+; ALL-NEXT:    movq %rdi, %rbx
+; ALL-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, (%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT:    # xmm0 = mem[1,0]
+; ALL-NEXT:    callq __truncdfhf2@PLT
+; ALL-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovaps %xmm0, (%rbx)
+; ALL-NEXT:    addq $64, %rsp
+; ALL-NEXT:    popq %rbx
 ; ALL-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>
   %2 = bitcast <4 x half> %1 to <4 x i16>
@@ -1073,132 +1216,208 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX1-LABEL: store_cvt_8f64_to_8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT:    vmovd %xmm2, %r8d
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX1-NEXT:    vcvtsd2ss %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT:    vmovd %xmm3, %r9d
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX1-NEXT:    vcvtsd2ss %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT:    vmovd %xmm3, %r10d
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX1-NEXT:    vcvtsd2ss %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; AVX1-NEXT:    vmovd %xmm4, %r11d
-; AVX1-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm0
-; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %ecx
-; AVX1-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm0
-; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %edx
-; AVX1-NEXT:    vcvtsd2ss %xmm3, %xmm3, %xmm0
-; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %esi
-; AVX1-NEXT:    movw %si, 12(%rdi)
-; AVX1-NEXT:    movw %dx, 8(%rdi)
-; AVX1-NEXT:    movw %cx, 4(%rdi)
-; AVX1-NEXT:    movw %ax, (%rdi)
-; AVX1-NEXT:    movw %r11w, 14(%rdi)
-; AVX1-NEXT:    movw %r10w, 10(%rdi)
-; AVX1-NEXT:    movw %r9w, 6(%rdi)
-; AVX1-NEXT:    movw %r8w, 2(%rdi)
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    subq $120, %rsp
+; AVX1-NEXT:    movq %rdi, %rbx
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %r12d
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %r13d
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %ebp
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %r14d
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movl %eax, %r15d
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __truncdfhf2@PLT
+; AVX1-NEXT:    movw %ax, 12(%rbx)
+; AVX1-NEXT:    movw %r15w, 8(%rbx)
+; AVX1-NEXT:    movw %r14w, 4(%rbx)
+; AVX1-NEXT:    movw %bp, (%rbx)
+; AVX1-NEXT:    movw %r13w, 14(%rbx)
+; AVX1-NEXT:    movw %r12w, 10(%rbx)
+; AVX1-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX1-NEXT:    movw %ax, 6(%rbx)
+; AVX1-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX1-NEXT:    movw %ax, 2(%rbx)
+; AVX1-NEXT:    addq $120, %rsp
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    popq %rbp
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: store_cvt_8f64_to_8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX2-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %xmm2, %r8d
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX2-NEXT:    vcvtsd2ss %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT:    vmovd %xmm3, %r9d
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX2-NEXT:    vcvtsd2ss %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT:    vmovd %xmm3, %r10d
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX2-NEXT:    vcvtsd2ss %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; AVX2-NEXT:    vmovd %xmm4, %r11d
-; AVX2-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm0
-; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %ecx
-; AVX2-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm0
-; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %edx
-; AVX2-NEXT:    vcvtsd2ss %xmm3, %xmm3, %xmm0
-; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %esi
-; AVX2-NEXT:    movw %si, 12(%rdi)
-; AVX2-NEXT:    movw %dx, 8(%rdi)
-; AVX2-NEXT:    movw %cx, 4(%rdi)
-; AVX2-NEXT:    movw %ax, (%rdi)
-; AVX2-NEXT:    movw %r11w, 14(%rdi)
-; AVX2-NEXT:    movw %r10w, 10(%rdi)
-; AVX2-NEXT:    movw %r9w, 6(%rdi)
-; AVX2-NEXT:    movw %r8w, 2(%rdi)
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    subq $120, %rsp
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %r12d
+; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %r13d
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %ebp
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %r14d
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movl %eax, %r15d
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    callq __truncdfhf2@PLT
+; AVX2-NEXT:    movw %ax, 12(%rbx)
+; AVX2-NEXT:    movw %r15w, 8(%rbx)
+; AVX2-NEXT:    movw %r14w, 4(%rbx)
+; AVX2-NEXT:    movw %bp, (%rbx)
+; AVX2-NEXT:    movw %r13w, 14(%rbx)
+; AVX2-NEXT:    movw %r12w, 10(%rbx)
+; AVX2-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX2-NEXT:    movw %ax, 6(%rbx)
+; AVX2-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX2-NEXT:    movw %ax, 2(%rbx)
+; AVX2-NEXT:    addq $120, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: store_cvt_8f64_to_8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vmovd %xmm1, %r8d
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT:    vmovd %xmm2, %r9d
-; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512-NEXT:    vcvtsd2ss %xmm3, %xmm3, %xmm3
-; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT:    vmovd %xmm3, %r10d
-; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512-NEXT:    vcvtsd2ss %xmm4, %xmm4, %xmm4
-; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; AVX512-NEXT:    vmovd %xmm4, %r11d
-; AVX512-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %ecx
-; AVX512-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm0
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %edx
-; AVX512-NEXT:    vcvtsd2ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %esi
-; AVX512-NEXT:    movw %si, 12(%rdi)
-; AVX512-NEXT:    movw %dx, 8(%rdi)
-; AVX512-NEXT:    movw %cx, 4(%rdi)
-; AVX512-NEXT:    movw %ax, (%rdi)
-; AVX512-NEXT:    movw %r11w, 14(%rdi)
-; AVX512-NEXT:    movw %r10w, 10(%rdi)
-; AVX512-NEXT:    movw %r9w, 6(%rdi)
-; AVX512-NEXT:    movw %r8w, 2(%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    subq $152, %rsp
+; AVX512-NEXT:    movq %rdi, %rbx
+; AVX512-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %r12d
+; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %r13d
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %ebp
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %r14d
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movl %eax, %r15d
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    movw %ax, 12(%rbx)
+; AVX512-NEXT:    movw %r15w, 8(%rbx)
+; AVX512-NEXT:    movw %r14w, 4(%rbx)
+; AVX512-NEXT:    movw %bp, (%rbx)
+; AVX512-NEXT:    movw %r13w, 14(%rbx)
+; AVX512-NEXT:    movw %r12w, 10(%rbx)
+; AVX512-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX512-NEXT:    movw %ax, 6(%rbx)
+; AVX512-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX512-NEXT:    movw %ax, 2(%rbx)
+; AVX512-NEXT:    addq $152, %rsp
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    retq
   %1 = fptrunc <8 x double> %a0 to <8 x half>
   %2 = bitcast <8 x half> %1 to <8 x i16>

diff  --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index 00af4c60ef0c7..b1d30090cc6d8 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -368,58 +368,69 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; SSE-LABEL: test_v2f16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    subq $40, %rsp
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pextrw $0, %xmm1, %ebx
-; SSE-NEXT:    pextrw $0, %xmm0, %ebp
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    subq $16, %rsp
+; SSE-NEXT:    movl %esi, %ebx
+; SSE-NEXT:    movl %edi, %r14d
+; SSE-NEXT:    movzwl %bx, %ebp
+; SSE-NEXT:    movl %ebp, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee@PLT
+; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl %r14w, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee@PLT
 ; SSE-NEXT:    ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; SSE-NEXT:    cmoval %ebp, %ebx
-; SSE-NEXT:    pinsrw $0, %ebx, %xmm0
-; SSE-NEXT:    addq $40, %rsp
+; SSE-NEXT:    movw %bp, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    cmoval %r14d, %ebx
+; SSE-NEXT:    movw %bx, (%rsp)
+; SSE-NEXT:    movl (%rsp), %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    addq $16, %rsp
 ; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
 ; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    pushq %r14
 ; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    subq $40, %rsp
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpextrw $0, %xmm1, %ebx
-; AVX-NEXT:    vpextrw $0, %xmm0, %ebp
-; AVX-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    callq __extendhfsf2@PLT
+; AVX-NEXT:    subq $16, %rsp
+; AVX-NEXT:    movl %esi, %ebx
+; AVX-NEXT:    movl %edi, %r14d
+; AVX-NEXT:    movzwl %bx, %ebp
+; AVX-NEXT:    movl %ebp, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee@PLT
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    movzwl %r14w, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee@PLT
 ; AVX-NEXT:    vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; AVX-NEXT:    cmoval %ebp, %ebx
-; AVX-NEXT:    vpinsrw $0, %ebx, %xmm0, %xmm0
-; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    movw %bp, {{[0-9]+}}(%rsp)
+; AVX-NEXT:    cmoval %r14d, %ebx
+; AVX-NEXT:    movw %bx, (%rsp)
+; AVX-NEXT:    movl (%rsp), %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    addq $16, %rsp
 ; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    popq %r14
 ; AVX-NEXT:    popq %rbp
 ; AVX-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_v2f16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrw $0, %xmm1, %ecx
-; AVX512BW-NEXT:    movzwl %cx, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm0
+; AVX512BW-NEXT:    movzwl %si, %eax
+; AVX512BW-NEXT:    vmovd %eax, %xmm0
 ; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512BW-NEXT:    movzwl %ax, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm1
+; AVX512BW-NEXT:    movzwl %di, %ecx
+; AVX512BW-NEXT:    vmovd %ecx, %xmm1
 ; AVX512BW-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512BW-NEXT:    vucomiss %xmm0, %xmm1
-; AVX512BW-NEXT:    cmoval %eax, %ecx
-; AVX512BW-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    cmoval %edi, %esi
+; AVX512BW-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512FP16-LABEL: test_v2f16:

diff  --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
index d06b34a6e17ee..50c805d37ddb7 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -367,58 +367,69 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; SSE-LABEL: test_v2f16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    subq $40, %rsp
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pextrw $0, %xmm1, %ebx
-; SSE-NEXT:    pextrw $0, %xmm0, %ebp
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    subq $16, %rsp
+; SSE-NEXT:    movl %esi, %ebx
+; SSE-NEXT:    movl %edi, %r14d
+; SSE-NEXT:    movzwl %bx, %ebp
+; SSE-NEXT:    movl %ebp, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee@PLT
+; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl %r14w, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee@PLT
 ; SSE-NEXT:    ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; SSE-NEXT:    cmovbl %ebp, %ebx
-; SSE-NEXT:    pinsrw $0, %ebx, %xmm0
-; SSE-NEXT:    addq $40, %rsp
+; SSE-NEXT:    movw %bp, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    cmovbl %r14d, %ebx
+; SSE-NEXT:    movw %bx, (%rsp)
+; SSE-NEXT:    movl (%rsp), %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    addq $16, %rsp
 ; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
 ; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    pushq %r14
 ; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    subq $40, %rsp
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpextrw $0, %xmm1, %ebx
-; AVX-NEXT:    vpextrw $0, %xmm0, %ebp
-; AVX-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    callq __extendhfsf2@PLT
+; AVX-NEXT:    subq $16, %rsp
+; AVX-NEXT:    movl %esi, %ebx
+; AVX-NEXT:    movl %edi, %r14d
+; AVX-NEXT:    movzwl %bx, %ebp
+; AVX-NEXT:    movl %ebp, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee@PLT
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    movzwl %r14w, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee@PLT
 ; AVX-NEXT:    vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; AVX-NEXT:    cmovbl %ebp, %ebx
-; AVX-NEXT:    vpinsrw $0, %ebx, %xmm0, %xmm0
-; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    movw %bp, {{[0-9]+}}(%rsp)
+; AVX-NEXT:    cmovbl %r14d, %ebx
+; AVX-NEXT:    movw %bx, (%rsp)
+; AVX-NEXT:    movl (%rsp), %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    addq $16, %rsp
 ; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    popq %r14
 ; AVX-NEXT:    popq %rbp
 ; AVX-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_v2f16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrw $0, %xmm1, %ecx
-; AVX512BW-NEXT:    movzwl %cx, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm0
+; AVX512BW-NEXT:    movzwl %si, %eax
+; AVX512BW-NEXT:    vmovd %eax, %xmm0
 ; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512BW-NEXT:    movzwl %ax, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm1
+; AVX512BW-NEXT:    movzwl %di, %ecx
+; AVX512BW-NEXT:    vmovd %ecx, %xmm1
 ; AVX512BW-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512BW-NEXT:    vucomiss %xmm0, %xmm1
-; AVX512BW-NEXT:    cmovbl %eax, %ecx
-; AVX512BW-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    cmovbl %edi, %esi
+; AVX512BW-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512FP16-LABEL: test_v2f16:

diff  --git a/llvm/test/MC/X86/x86_64-asm-match.s b/llvm/test/MC/X86/x86_64-asm-match.s
index 74f92118f3c9e..7e1c2b42007e3 100644
--- a/llvm/test/MC/X86/x86_64-asm-match.s
+++ b/llvm/test/MC/X86/x86_64-asm-match.s
@@ -5,16 +5,16 @@
 // CHECK: Trying to match opcode MMX_PSHUFBrr
 // CHECK:   Matching formal operand class MCK_VR64 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode
 // CHECK: Trying to match opcode PSHUFBrr
-// CHECK:   Matching formal operand class MCK_FR16 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode
+// CHECK:   Matching formal operand class MCK_FR32 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode
 // CHECK: Trying to match opcode PSHUFBrm
 // CHECK:   Matching formal operand class MCK_Mem128 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): match success using generic matcher
-// CHECK:   Matching formal operand class MCK_FR16 against actual operand at index 2 (Reg:xmm1): match success using generic matcher
+// CHECK:   Matching formal operand class MCK_FR32 against actual operand at index 2 (Reg:xmm1): match success using generic matcher
 // CHECK:   Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
 // CHECK: AsmMatcher: found 2 encodings with mnemonic 'sha1rnds4'
 // CHECK: Trying to match opcode SHA1RNDS4rri
 // CHECK:   Matching formal operand class MCK_ImmUnsignedi8 against actual operand at index 1 (Imm:1): match success using generic matcher
-// CHECK:   Matching formal operand class MCK_FR16 against actual operand at index 2 (Reg:xmm1): match success using generic matcher
-// CHECK:   Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm2): match success using generic matcher
+// CHECK:   Matching formal operand class MCK_FR32 against actual operand at index 2 (Reg:xmm1): match success using generic matcher
+// CHECK:   Matching formal operand class MCK_FR32 against actual operand at index 3 (Reg:xmm2): match success using generic matcher
 // CHECK:   Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
 // CHECK: AsmMatcher: found 4 encodings with mnemonic 'pinsrw'
 // CHECK: Trying to match opcode MMX_PINSRWrr
@@ -24,7 +24,7 @@
 // CHECK: Trying to match opcode PINSRWrr
 // CHECK:   Matching formal operand class MCK_ImmUnsignedi8 against actual operand at index 1 (Imm:3): match success using generic matcher
 // CHECK:   Matching formal operand class MCK_GR32orGR64 against actual operand at index 2 (Reg:ecx): match success using generic matcher
-// CHECK:   Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm5): match success using generic matcher
+// CHECK:   Matching formal operand class MCK_FR32 against actual operand at index 3 (Reg:xmm5): match success using generic matcher
 // CHECK:   Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
 // CHECK: AsmMatcher: found 2 encodings with mnemonic 'crc32l'
 // CHECK: Trying to match opcode CRC32r32r32

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir
index 9fd8701ca701e..ec15477bb5800 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir
@@ -1,5 +1,3 @@
-// FIXME: The runtime needs support for _Float16 on X86, see PR55992
-// UNSUPPORTED: i386, x86_64
 // RUN: mlir-opt %s --sparse-compiler | \
 // RUN: mlir-cpu-runner \
 // RUN:  -e entry -entry-point-result=void  \

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir
index 3ab7d6d7e657f..9f107cfeeb72a 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir
@@ -1,6 +1,4 @@
-// FIXME: The runtime needs support for _Float16 on X86, see PR55992
-// UNSUPPORTED: i386, x86_64
-// RUN: mlir-opt %s --sparse-compiler | \
+// RUN: mlir-opt %s --sparse-compiler | \
 // RUN: mlir-cpu-runner \
 // RUN:  -e entry -entry-point-result=void  \
 // RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
