[llvm] [X86][FP16] Widen 128/256-bit CVTTP2xI to 512-bit when VLX not enabled (PR #142763)
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 4 03:17:51 PDT 2025
https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/142763
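No description was given on the PR, so, summarizing from the patch itself: when AVX512FP16 is available but AVX512VL is not, 128/256-bit FP16 truncating conversions (CVTTP2SI/CVTTP2UI) are now widened to their 512-bit forms instead of being scalarized. For strict FP, the source is inserted into a zeroed 512-bit vector rather than undef, so the padding lanes cannot raise spurious exceptions (see the comment added in LowerFP_TO_INT below). A minimal IR sketch distilled from the added tests; the function name is illustrative and the RUN line mirrors the new NOVL configuration:

  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3
  declare <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f16(<8 x half>, metadata)

  define <8 x i16> @widen_v8f16_to_v8i16(<8 x half> %a) strictfp {
    ; Expected lowering per the NOVL checks below: vxorps + vinsertf32x4
    ; pad the v8f16 source into a zeroed zmm, then a single
    ; vcvttph2w %zmm0, %zmm0 does the conversion.
    %r = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f16(
             <8 x half> %a, metadata !"fpexcept.strict") strictfp
    ret <8 x i16> %r
  }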
From a797bd8705b5ddc58312cad407163144b909b616 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Wed, 4 Jun 2025 18:10:05 +0800
Subject: [PATCH] [X86][FP16] Widen 128/256-bit CVTTP2xI to 512-bit when VLX
not enabled
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 55 +-
.../X86/vec-strict-fptoint-128-fp16.ll | 310 +++
.../X86/vec-strict-fptoint-256-fp16.ll | 104 +++
3 files changed, 454 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b1a3e3c006bb3..fb76846297eb9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2371,6 +2371,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
}
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
+
if (Subtarget.hasVLX()) {
setGroup(MVT::v8f16);
setGroup(MVT::v16f16);
@@ -2386,10 +2391,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
@@ -20010,10 +20011,12 @@ static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
const X86Subtarget &Subtarget) {
- if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
- return true;
- if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
- return true;
+ if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
+ if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
+ return true;
+ if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
+ return true;
+ }
if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
return true;
if (Subtarget.useAVX512Regs()) {
@@ -21552,6 +21555,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+ bool HasVLX = Subtarget.hasVLX();
MVT VT = Op->getSimpleValueType(0);
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
@@ -21582,7 +21586,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
else
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
- if (!IsSigned && !Subtarget.hasVLX()) {
+ if (!IsSigned && !HasVLX) {
assert(Subtarget.useAVX512Regs() && "Unexpected features!");
// Widen to 512-bits.
ResVT = MVT::v8i32;
@@ -21612,7 +21616,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
- if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
+ if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
+ VT == MVT::v32i16)
return Op;
MVT ResVT = VT;
@@ -21620,7 +21625,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
if (EleVT != MVT::i64)
ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
- if (SrcVT != MVT::v8f16) {
+ if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
SDValue Tmp =
IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
@@ -21628,6 +21633,22 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
}
+ if (!HasVLX) {
+ assert(Subtarget.useAVX512Regs() && "Unexpected features!");
+ // Widen to 512-bits.
+ unsigned IntSize = EleVT.getSizeInBits();
+ unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
+ MVT TmpVT = MVT::getVectorVT(MVT::f16, Num);
+ ResVT = MVT::getVectorVT(EleVT, Num);
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, TmpVT)
+ : DAG.getUNDEF(TmpVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, TmpVT, Tmp, Src,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+
if (IsStrict) {
Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
: X86ISD::STRICT_CVTTP2UI,
@@ -21640,7 +21661,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
// TODO: Need to add exception check code for strict FP.
if (EleVT.getSizeInBits() < 16) {
- ResVT = MVT::getVectorVT(EleVT, 8);
+ if (HasVLX)
+ ResVT = MVT::getVectorVT(EleVT, 8);
Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
}
@@ -34123,12 +34145,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
if (IsStrict) {
- Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
Res =
DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
- Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
Res = DAG.getNode(Opc, dl, ResVT, Src);
}
@@ -44126,7 +44146,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Conversions.
// TODO: Add more CVT opcodes when we have test coverage.
case X86ISD::CVTTP2SI:
- case X86ISD::CVTTP2UI:
+ case X86ISD::CVTTP2UI: {
+ if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
+ !Subtarget.hasVLX())
+ break;
+ [[fallthrough]];
+ }
case X86ISD::CVTPH2PS: {
SDLoc DL(Op);
unsigned Scale = SizeInBits / ExtSizeInBits;
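To make the widening arithmetic above concrete: for i32/i64 results, Num = 512 / IntSize (16 or 8 lanes), while for i16 and narrower results the f16 source keeps 32 lanes so it fills a zmm register. A sketch of the v8i32 case, which the 256-bit test below exercises (the function name is illustrative):

  declare <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(<8 x half>, metadata)

  define <8 x i32> @widen_v8f16_to_v8i32(<8 x half> %a) strictfp {
    ; EleVT is i32, so Num = 512/32 = 16: the v8f16 source is zero-padded
    ; to v16f16 (a ymm) and converted with vcvttph2dq %ymm0, %zmm0; the
    ; low v8i32 of the zmm result is then extracted.
    %r = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(
             <8 x half> %a, metadata !"fpexcept.strict") strictfp
    ret <8 x i32> %r
  }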
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
index 0a9dd78afb8cc..0126685f2bb32 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=NOVL
declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f16(<2 x half>, metadata)
declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f16(<2 x half>, metadata)
@@ -34,6 +35,16 @@ define <2 x i64> @strict_vector_fptosi_v2f16_to_v2i64(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2qq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm1
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm0
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; NOVL-NEXT: retq
%ret = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i64> %ret
@@ -46,6 +57,16 @@ define <2 x i64> @strict_vector_fptoui_v2f16_to_v2i64(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2uqq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2usi %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm1
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2usi %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm0
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; NOVL-NEXT: retq
%ret = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i64> %ret
@@ -58,6 +79,17 @@ define <2 x i32> @strict_vector_fptosi_v2f16_to_v2i32(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i32> %ret
@@ -70,6 +102,17 @@ define <2 x i32> @strict_vector_fptoui_v2f16_to_v2i32(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2udq %ymm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i32> %ret
@@ -82,6 +125,17 @@ define <2 x i16> @strict_vector_fptosi_v2f16_to_v2i16(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i16> %ret
@@ -94,6 +148,17 @@ define <2 x i16> @strict_vector_fptoui_v2f16_to_v2i16(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i16> %ret
@@ -107,6 +172,17 @@ define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 {
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i8> %ret
@@ -120,6 +196,17 @@ define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 {
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i8> %ret
@@ -136,6 +223,21 @@ define <2 x i1> @strict_vector_fptosi_v2f16_to_v2i1(<2 x half> %a) #0 {
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: kmovw %eax, %k0
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $1, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k1
+; NOVL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i1> %ret
@@ -152,6 +254,21 @@ define <2 x i1> @strict_vector_fptoui_v2f16_to_v2i1(<2 x half> %a) #0 {
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: kmovw %eax, %k0
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $1, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k1
+; NOVL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i1> %ret
@@ -163,6 +280,21 @@ define <4 x i32> @strict_vector_fptosi_v4f16_to_v4i32(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT: vmovd %ecx, %xmm1
+; NOVL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2si %xmm2, %eax
+; NOVL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; NOVL-NEXT: retq
%ret = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i32> %ret
@@ -174,6 +306,21 @@ define <4 x i32> @strict_vector_fptoui_v4f16_to_v4i32(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2usi %xmm1, %eax
+; NOVL-NEXT: vcvttsh2usi %xmm0, %ecx
+; NOVL-NEXT: vmovd %ecx, %xmm1
+; NOVL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2usi %xmm2, %eax
+; NOVL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2usi %xmm0, %eax
+; NOVL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; NOVL-NEXT: retq
%ret = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i32> %ret
@@ -185,6 +332,16 @@ define <4 x i16> @strict_vector_fptosi_v4f16_to_v4i16(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i16> %ret
@@ -196,6 +353,16 @@ define <4 x i16> @strict_vector_fptoui_v4f16_to_v4i16(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i16> %ret
@@ -208,6 +375,16 @@ define <4 x i8> @strict_vector_fptosi_v4f16_to_v4i8(<4 x half> %a) #0 {
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i8> %ret
@@ -220,6 +397,16 @@ define <4 x i8> @strict_vector_fptoui_v4f16_to_v4i8(<4 x half> %a) #0 {
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i8> %ret
@@ -235,6 +422,37 @@ define <4 x i1> @strict_vector_fptosi_v4f16_to_v4i1(<4 x half> %a) #0 {
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: kmovw %eax, %k0
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $15, %k1, %k1
+; NOVL-NEXT: kshiftrw $14, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k0
+; NOVL-NEXT: movw $-5, %ax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kandw %k1, %k0, %k0
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $2, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k0
+; NOVL-NEXT: kshiftlw $13, %k0, %k0
+; NOVL-NEXT: kshiftrw $13, %k0, %k0
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $3, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k1
+; NOVL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i1> %ret
@@ -250,6 +468,37 @@ define <4 x i1> @strict_vector_fptoui_v4f16_to_v4i1(<4 x half> %a) #0 {
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: kmovw %eax, %k0
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $15, %k1, %k1
+; NOVL-NEXT: kshiftrw $14, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k0
+; NOVL-NEXT: movw $-5, %ax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kandw %k1, %k0, %k0
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $2, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k0
+; NOVL-NEXT: kshiftlw $13, %k0, %k0
+; NOVL-NEXT: kshiftrw $13, %k0, %k0
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $3, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k1
+; NOVL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i1> %ret
@@ -260,6 +509,15 @@ define <8 x i16> @strict_vector_fptosi_v8f16_to_v8i16(<8 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v8f16_to_v8i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i16> %ret
@@ -270,6 +528,15 @@ define <8 x i16> @strict_vector_fptoui_v8f16_to_v8i16(<8 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v8f16_to_v8i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i16> %ret
@@ -281,6 +548,15 @@ define <8 x i8> @strict_vector_fptosi_v8f16_to_v8i8(<8 x half> %a) #0 {
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v8f16_to_v8i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i8> %ret
@@ -292,6 +568,15 @@ define <8 x i8> @strict_vector_fptoui_v8f16_to_v8i8(<8 x half> %a) #0 {
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v8f16_to_v8i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i8> %ret
@@ -305,6 +590,18 @@ define <8 x i1> @strict_vector_fptosi_v8f16_to_v8i1(<8 x half> %a) #0 {
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v8f16_to_v8i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NOVL-NEXT: vpmovm2w %k0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i1> %ret
@@ -319,6 +616,19 @@ define <8 x i1> @strict_vector_fptoui_v8f16_to_v8i1(<8 x half> %a) #0 {
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v8f16_to_v8i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NOVL-NEXT: vpmovm2w %k0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i1> %ret
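One more worked case from the 128-bit tests above, covering sub-128-bit sources: a v2f16 input is first padded out to v8f16 (with zeros under strict FP, per the concat code in LowerFP_TO_INT), and that v8f16 is then inserted into a zeroed v32f16 for the single 512-bit convert. A sketch under the same assumptions (function name illustrative):

  declare <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(<2 x half>, metadata)

  define <2 x i16> @widen_v2f16_to_v2i16(<2 x half> %a) strictfp {
    ; Expected NOVL lowering per the checks above: vblendps keeps only
    ; the two input halves, vinsertf32x4 places them in a zeroed zmm,
    ; then vcvttph2w %zmm0, %zmm0 converts all 32 lanes at once.
    %r = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(
             <2 x half> %a, metadata !"fpexcept.strict") strictfp
    ret <2 x i16> %r
  }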
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll
index 7bdb6a45bebcc..a232122e9c707 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=NOVL
declare <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f16(<4 x half>, metadata)
@@ -20,6 +21,24 @@ define <4 x i64> @strict_vector_fptosi_v4f16_to_v4i64(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2qq %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2si %xmm1, %rax
+; NOVL-NEXT: vmovq %rax, %xmm1
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2si %xmm2, %rax
+; NOVL-NEXT: vmovq %rax, %xmm2
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NOVL-NEXT: vcvttsh2si %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm2
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm0
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
%ret = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i64> %ret
@@ -31,6 +50,24 @@ define <4 x i64> @strict_vector_fptoui_v4f16_to_v4i64(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2uqq %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2usi %xmm1, %rax
+; NOVL-NEXT: vmovq %rax, %xmm1
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2usi %xmm2, %rax
+; NOVL-NEXT: vmovq %rax, %xmm2
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NOVL-NEXT: vcvttsh2usi %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm2
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2usi %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm0
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
%ret = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i64> %ret
@@ -41,6 +78,15 @@ define <8 x i32> @strict_vector_fptosi_v8f16_to_v8i32(<8 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v8f16_to_v8i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT: retq
%ret = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i32> %ret
@@ -51,6 +97,15 @@ define <8 x i32> @strict_vector_fptoui_v8f16_to_v8i32(<8 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2udq %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v8f16_to_v8i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2udq %ymm0, %zmm0
+; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT: retq
%ret = call <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i32> %ret
@@ -61,6 +116,14 @@ define <16 x i16> @strict_vector_fptosi_v16f16_to_v16i16(<16 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2w %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v16f16_to_v16i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT: retq
%ret = call <16 x i16> @llvm.experimental.constrained.fptosi.v16i16.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i16> %ret
@@ -71,6 +134,14 @@ define <16 x i16> @strict_vector_fptoui_v16f16_to_v16i16(<16 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2uw %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v16f16_to_v16i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT: retq
%ret = call <16 x i16> @llvm.experimental.constrained.fptoui.v16i16.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i16> %ret
@@ -83,6 +154,13 @@ define <16 x i8> @strict_vector_fptosi_v16f16_to_v16i8(<16 x half> %a) #0 {
; CHECK-NEXT: vpmovdb %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v16f16_to_v16i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vpmovdb %zmm0, %xmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <16 x i8> @llvm.experimental.constrained.fptosi.v16i8.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i8> %ret
@@ -95,6 +173,13 @@ define <16 x i8> @strict_vector_fptoui_v16f16_to_v16i8(<16 x half> %a) #0 {
; CHECK-NEXT: vpmovdb %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v16f16_to_v16i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vpmovdb %zmm0, %xmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <16 x i8> @llvm.experimental.constrained.fptoui.v16i8.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i8> %ret
@@ -108,6 +193,15 @@ define <16 x i1> @strict_vector_fptosi_v16f16_to_v16i1(<16 x half> %a) #0 {
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v16f16_to_v16i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NOVL-NEXT: vpmovm2b %k0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <16 x i1> @llvm.experimental.constrained.fptosi.v16i1.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i1> %ret
@@ -122,6 +216,16 @@ define <16 x i1> @strict_vector_fptoui_v16f16_to_v16i1(<16 x half> %a) #0 {
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v16f16_to_v16i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vpslld $31, %zmm0, %zmm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NOVL-NEXT: vpmovm2b %k0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <16 x i1> @llvm.experimental.constrained.fptoui.v16i1.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i1> %ret