[llvm] [X86][FP16] Widen 128/256-bit CVTTP2xI to 512-bit when VLX not enabled (PR #142763)
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 4 03:17:51 PDT 2025
https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/142763
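No description was given on the PR, so, summarizing from the patch itself: when AVX512FP16 is available but AVX512VL is not, 128/256-bit FP16 truncating conversions (CVTTP2SI/CVTTP2UI) are now widened to their 512-bit forms instead of being scalarized. For strict FP, the source is inserted into a zeroed 512-bit vector rather than undef, so the padding lanes cannot raise spurious exceptions (see the comment added in LowerFP_TO_INT below). A minimal IR sketch distilled from the added tests; the function name is illustrative and the RUN line mirrors the new NOVL configuration:

  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3
  declare <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f16(<8 x half>, metadata)

  define <8 x i16> @widen_v8f16_to_v8i16(<8 x half> %a) strictfp {
    ; Expected lowering per the NOVL checks below: vxorps + vinsertf32x4
    ; pad the v8f16 source into a zeroed zmm, then a single
    ; vcvttph2w %zmm0, %zmm0 does the conversion.
    %r = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f16(
             <8 x half> %a, metadata !"fpexcept.strict") strictfp
    ret <8 x i16> %r
  }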
From a797bd8705b5ddc58312cad407163144b909b616 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Wed, 4 Jun 2025 18:10:05 +0800
Subject: [PATCH] [X86][FP16] Widen 128/256-bit CVTTP2xI to 512-bit when VLX
not enabled
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 55 +-
.../X86/vec-strict-fptoint-128-fp16.ll | 310 +++
.../X86/vec-strict-fptoint-256-fp16.ll | 104 +++
3 files changed, 454 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b1a3e3c006bb3..fb76846297eb9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2371,6 +2371,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
}
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
+
if (Subtarget.hasVLX()) {
setGroup(MVT::v8f16);
setGroup(MVT::v16f16);
@@ -2386,10 +2391,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
@@ -20010,10 +20011,12 @@ static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
const X86Subtarget &Subtarget) {
- if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
- return true;
- if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
- return true;
+ if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
+ if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
+ return true;
+ if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
+ return true;
+ }
if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
return true;
if (Subtarget.useAVX512Regs()) {
@@ -21552,6 +21555,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+ bool HasVLX = Subtarget.hasVLX();
MVT VT = Op->getSimpleValueType(0);
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
@@ -21582,7 +21586,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
else
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
- if (!IsSigned && !Subtarget.hasVLX()) {
+ if (!IsSigned && !HasVLX) {
assert(Subtarget.useAVX512Regs() && "Unexpected features!");
// Widen to 512-bits.
ResVT = MVT::v8i32;
@@ -21612,7 +21616,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
- if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
+ if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
+ VT == MVT::v32i16)
return Op;
MVT ResVT = VT;
@@ -21620,7 +21625,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
if (EleVT != MVT::i64)
ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
- if (SrcVT != MVT::v8f16) {
+ if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
SDValue Tmp =
IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
@@ -21628,6 +21633,22 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
}
+ if (!HasVLX) {
+ assert(Subtarget.useAVX512Regs() && "Unexpected features!");
+ // Widen to 512-bits.
+ unsigned IntSize = EleVT.getSizeInBits();
+ unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
+ MVT TmpVT = MVT::getVectorVT(MVT::f16, Num);
+ ResVT = MVT::getVectorVT(EleVT, Num);
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, TmpVT)
+ : DAG.getUNDEF(TmpVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, TmpVT, Tmp, Src,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+
if (IsStrict) {
Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
: X86ISD::STRICT_CVTTP2UI,
@@ -21640,7 +21661,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
// TODO: Need to add exception check code for strict FP.
if (EleVT.getSizeInBits() < 16) {
- ResVT = MVT::getVectorVT(EleVT, 8);
+ if (HasVLX)
+ ResVT = MVT::getVectorVT(EleVT, 8);
Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
}
@@ -34123,12 +34145,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
if (IsStrict) {
- Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
Res =
DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
- Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
Res = DAG.getNode(Opc, dl, ResVT, Src);
}
@@ -44126,7 +44146,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Conversions.
// TODO: Add more CVT opcodes when we have test coverage.
case X86ISD::CVTTP2SI:
- case X86ISD::CVTTP2UI:
+ case X86ISD::CVTTP2UI: {
+ if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
+ !Subtarget.hasVLX())
+ break;
+ [[fallthrough]];
+ }
case X86ISD::CVTPH2PS: {
SDLoc DL(Op);
unsigned Scale = SizeInBits / ExtSizeInBits;
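To make the widening arithmetic above concrete: for i32/i64 results, Num = 512 / IntSize (16 or 8 lanes), while for i16 and narrower results the f16 source keeps 32 lanes so it fills a zmm register. A sketch of the v8i32 case, which the 256-bit test below exercises (the function name is illustrative):

  declare <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(<8 x half>, metadata)

  define <8 x i32> @widen_v8f16_to_v8i32(<8 x half> %a) strictfp {
    ; EleVT is i32, so Num = 512/32 = 16: the v8f16 source is zero-padded
    ; to v16f16 (a ymm) and converted with vcvttph2dq %ymm0, %zmm0; the
    ; low v8i32 of the zmm result is then extracted.
    %r = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(
             <8 x half> %a, metadata !"fpexcept.strict") strictfp
    ret <8 x i32> %r
  }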
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
index 0a9dd78afb8cc..0126685f2bb32 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=NOVL
declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f16(<2 x half>, metadata)
declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f16(<2 x half>, metadata)
@@ -34,6 +35,16 @@ define <2 x i64> @strict_vector_fptosi_v2f16_to_v2i64(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2qq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm1
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm0
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; NOVL-NEXT: retq
%ret = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i64> %ret
@@ -46,6 +57,16 @@ define <2 x i64> @strict_vector_fptoui_v2f16_to_v2i64(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2uqq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2usi %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm1
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2usi %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm0
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; NOVL-NEXT: retq
%ret = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i64> %ret
@@ -58,6 +79,17 @@ define <2 x i32> @strict_vector_fptosi_v2f16_to_v2i32(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i32> %ret
@@ -70,6 +102,17 @@ define <2 x i32> @strict_vector_fptoui_v2f16_to_v2i32(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2udq %ymm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i32> %ret
@@ -82,6 +125,17 @@ define <2 x i16> @strict_vector_fptosi_v2f16_to_v2i16(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i16> %ret
@@ -94,6 +148,17 @@ define <2 x i16> @strict_vector_fptoui_v2f16_to_v2i16(<2 x half> %a) #0 {
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i16> %ret
@@ -107,6 +172,17 @@ define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 {
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i8> %ret
@@ -120,6 +196,17 @@ define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 {
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i8> %ret
@@ -136,6 +223,21 @@ define <2 x i1> @strict_vector_fptosi_v2f16_to_v2i1(<2 x half> %a) #0 {
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: kmovw %eax, %k0
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $1, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k1
+; NOVL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i1> %ret
@@ -152,6 +254,21 @@ define <2 x i1> @strict_vector_fptoui_v2f16_to_v2i1(<2 x half> %a) #0 {
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: kmovw %eax, %k0
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $1, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k1
+; NOVL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f16(<2 x half> %a,
metadata !"fpexcept.strict") #0
ret <2 x i1> %ret
@@ -163,6 +280,21 @@ define <4 x i32> @strict_vector_fptosi_v4f16_to_v4i32(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT: vmovd %ecx, %xmm1
+; NOVL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2si %xmm2, %eax
+; NOVL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; NOVL-NEXT: retq
%ret = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i32> %ret
@@ -174,6 +306,21 @@ define <4 x i32> @strict_vector_fptoui_v4f16_to_v4i32(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2usi %xmm1, %eax
+; NOVL-NEXT: vcvttsh2usi %xmm0, %ecx
+; NOVL-NEXT: vmovd %ecx, %xmm1
+; NOVL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2usi %xmm2, %eax
+; NOVL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2usi %xmm0, %eax
+; NOVL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; NOVL-NEXT: retq
%ret = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i32> %ret
@@ -185,6 +332,16 @@ define <4 x i16> @strict_vector_fptosi_v4f16_to_v4i16(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i16> %ret
@@ -196,6 +353,16 @@ define <4 x i16> @strict_vector_fptoui_v4f16_to_v4i16(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i16> %ret
@@ -208,6 +375,16 @@ define <4 x i8> @strict_vector_fptosi_v4f16_to_v4i8(<4 x half> %a) #0 {
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i8> %ret
@@ -220,6 +397,16 @@ define <4 x i8> @strict_vector_fptoui_v4f16_to_v4i8(<4 x half> %a) #0 {
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i8> %ret
@@ -235,6 +422,37 @@ define <4 x i1> @strict_vector_fptosi_v4f16_to_v4i1(<4 x half> %a) #0 {
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: kmovw %eax, %k0
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $15, %k1, %k1
+; NOVL-NEXT: kshiftrw $14, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k0
+; NOVL-NEXT: movw $-5, %ax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kandw %k1, %k0, %k0
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $2, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k0
+; NOVL-NEXT: kshiftlw $13, %k0, %k0
+; NOVL-NEXT: kshiftrw $13, %k0, %k0
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $3, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k1
+; NOVL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i1> %ret
@@ -250,6 +468,37 @@ define <4 x i1> @strict_vector_fptoui_v4f16_to_v4i1(<4 x half> %a) #0 {
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: kmovw %eax, %k0
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $15, %k1, %k1
+; NOVL-NEXT: kshiftrw $14, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k0
+; NOVL-NEXT: movw $-5, %ax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kandw %k1, %k0, %k0
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2si %xmm1, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $2, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k0
+; NOVL-NEXT: kshiftlw $13, %k0, %k0
+; NOVL-NEXT: kshiftrw $13, %k0, %k0
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %eax
+; NOVL-NEXT: kmovd %eax, %k1
+; NOVL-NEXT: kshiftlw $3, %k1, %k1
+; NOVL-NEXT: korw %k1, %k0, %k1
+; NOVL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i1> %ret
@@ -260,6 +509,15 @@ define <8 x i16> @strict_vector_fptosi_v8f16_to_v8i16(<8 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v8f16_to_v8i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i16> %ret
@@ -270,6 +528,15 @@ define <8 x i16> @strict_vector_fptoui_v8f16_to_v8i16(<8 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v8f16_to_v8i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i16> %ret
@@ -281,6 +548,15 @@ define <8 x i8> @strict_vector_fptosi_v8f16_to_v8i8(<8 x half> %a) #0 {
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v8f16_to_v8i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i8> %ret
@@ -292,6 +568,15 @@ define <8 x i8> @strict_vector_fptoui_v8f16_to_v8i8(<8 x half> %a) #0 {
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v8f16_to_v8i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i8> %ret
@@ -305,6 +590,18 @@ define <8 x i1> @strict_vector_fptosi_v8f16_to_v8i1(<8 x half> %a) #0 {
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v8f16_to_v8i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NOVL-NEXT: vpmovm2w %k0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i1> %ret
@@ -319,6 +616,19 @@ define <8 x i1> @strict_vector_fptoui_v8f16_to_v8i1(<8 x half> %a) #0 {
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v8f16_to_v8i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NOVL-NEXT: vpmovm2w %k0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i1> %ret
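One more worked case from the 128-bit tests above, covering sub-128-bit sources: a v2f16 input is first padded out to v8f16 (with zeros under strict FP, per the concat code in LowerFP_TO_INT), and that v8f16 is then inserted into a zeroed v32f16 for the single 512-bit convert. A sketch under the same assumptions (function name illustrative):

  declare <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(<2 x half>, metadata)

  define <2 x i16> @widen_v2f16_to_v2i16(<2 x half> %a) strictfp {
    ; Expected NOVL lowering per the checks above: vblendps keeps only
    ; the two input halves, vinsertf32x4 places them in a zeroed zmm,
    ; then vcvttph2w %zmm0, %zmm0 converts all 32 lanes at once.
    %r = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(
             <2 x half> %a, metadata !"fpexcept.strict") strictfp
    ret <2 x i16> %r
  }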
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll
index 7bdb6a45bebcc..a232122e9c707 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=NOVL
declare <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f16(<4 x half>, metadata)
@@ -20,6 +21,24 @@ define <4 x i64> @strict_vector_fptosi_v4f16_to_v4i64(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2qq %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2si %xmm1, %rax
+; NOVL-NEXT: vmovq %rax, %xmm1
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2si %xmm2, %rax
+; NOVL-NEXT: vmovq %rax, %xmm2
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NOVL-NEXT: vcvttsh2si %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm2
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2si %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm0
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
%ret = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i64> %ret
@@ -31,6 +50,24 @@ define <4 x i64> @strict_vector_fptoui_v4f16_to_v4i64(<4 x half> %a) #0 {
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vcvttph2uqq %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm1
+; NOVL-NEXT: vcvttsh2usi %xmm1, %rax
+; NOVL-NEXT: vmovq %rax, %xmm1
+; NOVL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcvttsh2usi %xmm2, %rax
+; NOVL-NEXT: vmovq %rax, %xmm2
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NOVL-NEXT: vcvttsh2usi %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm2
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; NOVL-NEXT: vcvttsh2usi %xmm0, %rax
+; NOVL-NEXT: vmovq %rax, %xmm0
+; NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
%ret = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f16(<4 x half> %a,
metadata !"fpexcept.strict") #0
ret <4 x i64> %ret
@@ -41,6 +78,15 @@ define <8 x i32> @strict_vector_fptosi_v8f16_to_v8i32(<8 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v8f16_to_v8i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT: retq
%ret = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i32> %ret
@@ -51,6 +97,15 @@ define <8 x i32> @strict_vector_fptoui_v8f16_to_v8i32(<8 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2udq %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v8f16_to_v8i32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; NOVL-NEXT: vcvttph2udq %ymm0, %zmm0
+; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT: retq
%ret = call <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f16(<8 x half> %a,
metadata !"fpexcept.strict") #0
ret <8 x i32> %ret
@@ -61,6 +116,14 @@ define <16 x i16> @strict_vector_fptosi_v16f16_to_v16i16(<16 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2w %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v16f16_to_v16i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT: retq
%ret = call <16 x i16> @llvm.experimental.constrained.fptosi.v16i16.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i16> %ret
@@ -71,6 +134,14 @@ define <16 x i16> @strict_vector_fptoui_v16f16_to_v16i16(<16 x half> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttph2uw %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v16f16_to_v16i16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0
+; NOVL-NEXT: vcvttph2uw %zmm0, %zmm0
+; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT: retq
%ret = call <16 x i16> @llvm.experimental.constrained.fptoui.v16i16.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i16> %ret
@@ -83,6 +154,13 @@ define <16 x i8> @strict_vector_fptosi_v16f16_to_v16i8(<16 x half> %a) #0 {
; CHECK-NEXT: vpmovdb %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v16f16_to_v16i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vpmovdb %zmm0, %xmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <16 x i8> @llvm.experimental.constrained.fptosi.v16i8.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i8> %ret
@@ -95,6 +173,13 @@ define <16 x i8> @strict_vector_fptoui_v16f16_to_v16i8(<16 x half> %a) #0 {
; CHECK-NEXT: vpmovdb %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v16f16_to_v16i8:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vpmovdb %zmm0, %xmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <16 x i8> @llvm.experimental.constrained.fptoui.v16i8.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i8> %ret
@@ -108,6 +193,15 @@ define <16 x i1> @strict_vector_fptosi_v16f16_to_v16i1(<16 x half> %a) #0 {
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptosi_v16f16_to_v16i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NOVL-NEXT: vpmovm2b %k0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <16 x i1> @llvm.experimental.constrained.fptosi.v16i1.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i1> %ret
@@ -122,6 +216,16 @@ define <16 x i1> @strict_vector_fptoui_v16f16_to_v16i1(<16 x half> %a) #0 {
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
+;
+; NOVL-LABEL: strict_vector_fptoui_v16f16_to_v16i1:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttph2dq %ymm0, %zmm0
+; NOVL-NEXT: vpslld $31, %zmm0, %zmm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NOVL-NEXT: vpmovm2b %k0, %zmm0
+; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
%ret = call <16 x i1> @llvm.experimental.constrained.fptoui.v16i1.v16f16(<16 x half> %a,
metadata !"fpexcept.strict") #0
ret <16 x i1> %ret