[llvm] [X86][FP16][BF16] Improve vectorization of fcmp (PR #116153)
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 25 03:25:33 PST 2024
https://github.com/phoebewang updated https://github.com/llvm/llvm-project/pull/116153
From 007fc2341bcf8d920360a3b73671c05735d7da20 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Thu, 14 Nov 2024 10:52:42 +0800
Subject: [PATCH 1/2] [X86][FP16][BF16] Improve vectorization of fcmp
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 52 +++-
llvm/test/Analysis/CostModel/X86/fptoi_sat.ll | 80 +++---
.../test/CodeGen/X86/avx512-insert-extract.ll | 61 +----
llvm/test/CodeGen/X86/avx512-vec-cmp.ll | 84 ++-----
llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 202 ++-------------
llvm/test/CodeGen/X86/half.ll | 16 +-
llvm/test/CodeGen/X86/pr114520.ll | 79 +-----
llvm/test/CodeGen/X86/pr57340.ll | 235 +-----------------
.../CodeGen/X86/vector-reduce-fmax-nnan.ll | 19 +-
.../CodeGen/X86/vector-reduce-fmin-nnan.ll | 19 +-
10 files changed, 161 insertions(+), 686 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index db04f3a48d4d03..cd107220fdf233 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1711,6 +1711,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
}
+ setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
}
// This block controls legalization of the mask vector sizes that are
@@ -2046,6 +2048,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom);
for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
+ setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
@@ -2401,6 +2404,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
}
+ setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
}
@@ -2411,6 +2416,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setF16Action(MVT::v32bf16, Expand);
for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
+ setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
@@ -23333,12 +23339,8 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
/// Break a 256-bit vector VSETCC into two new 128-bit ones and then
/// concatenate the result back.
-static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
- ISD::CondCode Cond, SelectionDAG &DAG,
- const SDLoc &dl) {
- assert(VT.isInteger() && VT == LHS.getValueType() &&
- VT == RHS.getValueType() && "Unsupported VTs!");
-
+static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
+ SelectionDAG &DAG, const SDLoc &dl) {
SDValue CC = DAG.getCondCode(Cond);
// Extract the LHS Lo/Hi vectors
@@ -23483,14 +23485,40 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
if (isFP) {
- MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
+ MVT SVT = Op0.getSimpleValueType();
+ MVT EltVT = SVT.getVectorElementType();
assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
EltVT == MVT::f64);
- if (isSoftF16(EltVT, Subtarget))
- return SDValue();
- bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ if (isSoftF16(EltVT, Subtarget)) {
+ // Break 256-bit FP vector compare into smaller ones.
+ if (SVT.is256BitVector() && !Subtarget.useAVX512Regs())
+ return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
+
+ // Break 512-bit FP vector compare into smaller ones.
+ if (SVT.is512BitVector())
+ return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
+
+ MVT NVT = SVT.changeVectorElementType(MVT::f32);
+ if (IsStrict) {
+ Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Chain, Op0});
+ Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Chain, Op1});
+ return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, CC});
+ }
+ MVT DVT = VT.getVectorElementType() == MVT::i16
+ ? VT.changeVectorElementType(MVT::i32)
+ : VT;
+ SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
+ DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
+ DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
+ return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
+ }
+
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
// If we have a strict compare with a vXi1 result and the input is 128/256
// bits we can't use a masked compare unless we have VLX. If we use a wider
@@ -23701,12 +23729,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
+ return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
// Break 512-bit integer vector compare into smaller ones.
// TODO: Try harder to use VPCMPx + VPMOV2x?
if (VT.is512BitVector())
- return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
+ return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
// If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
// not-of-PCMPEQ:
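
A minimal IR sketch (hypothetical, not taken from the patch's tests) of the compares the new custom SETCC lowering targets. On AVX512 targets without AVX512FP16, an fcmp on v8f16 was previously scalarized into per-element vucomiss sequences; it is now extended to v8f32 and done with one packed compare, and the strict variant chains both extends through STRICT_FP_EXTEND to keep signaling semantics:

  ; Hypothetical reductions; try llc -mtriple=x86_64 -mattr=+avx512f,+avx512vl
  define <8 x i1> @cmp_olt_v8f16(<8 x half> %a, <8 x half> %b) {
    ; One FP_EXTEND per operand plus one packed compare replaces
    ; eight scalar vucomiss/setcc sequences.
    %c = fcmp olt <8 x half> %a, %b
    ret <8 x i1> %c
  }

  define <8 x i1> @cmp_olt_v8f16_strict(<8 x half> %a, <8 x half> %b) #0 {
    ; The strict path routes through STRICT_FP_EXTEND so exception
    ; behavior is preserved.
    %c = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16(
             <8 x half> %a, <8 x half> %b,
             metadata !"olt", metadata !"fpexcept.strict")
    ret <8 x i1> %c
  }

  declare <8 x i1> @llvm.experimental.constrained.fcmps.v8f16(<8 x half>, <8 x half>, metadata, metadata)

  attributes #0 = { strictfp }
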
diff --git a/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll b/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll
index 55b80350f595ea..41bf88b1ec316b 100644
--- a/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll
@@ -1016,45 +1016,45 @@ define void @fp16() {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 210 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 210 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 183 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
@@ -1069,45 +1069,45 @@ define void @fp16() {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 210 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 210 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 183 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index f2a197cca8ae5b..1c4bfa8422d810 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -2159,30 +2159,11 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
; KNL-LABEL: test_concat_v2i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT: vcvtph2ps %xmm0, %xmm1
-; KNL-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; KNL-NEXT: vucomiss %xmm2, %xmm1
-; KNL-NEXT: setb %al
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k0
-; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
-; KNL-NEXT: vucomiss %xmm2, %xmm0
-; KNL-NEXT: setb %al
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kshiftlw $1, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vucomiss %xmm2, %xmm1
-; KNL-NEXT: seta %al
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vucomiss %xmm2, %xmm0
-; KNL-NEXT: seta %al
-; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: kshiftlw $1, %k2, %k2
-; KNL-NEXT: korw %k2, %k1, %k1
+; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT: vcvtph2ps %xmm0, %ymm0
+; KNL-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0
+; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1
; KNL-NEXT: kandw %k1, %k0, %k1
; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -2194,36 +2175,16 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
;
; SKX-LABEL: test_concat_v2i1:
; SKX: ## %bb.0:
-; SKX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,1,1,4,5,6,7]
-; SKX-NEXT: vcvtph2ps %xmm1, %xmm1
-; SKX-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; SKX-NEXT: vucomiss %xmm2, %xmm1
-; SKX-NEXT: setb %al
-; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: kshiftlb $1, %k0, %k0
-; SKX-NEXT: vcvtph2ps %xmm0, %xmm0
-; SKX-NEXT: vucomiss %xmm2, %xmm0
-; SKX-NEXT: setb %al
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: kshiftlb $7, %k1, %k1
-; SKX-NEXT: kshiftrb $7, %k1, %k1
-; SKX-NEXT: korw %k0, %k1, %k0
-; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vucomiss %xmm2, %xmm1
-; SKX-NEXT: seta %al
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: kshiftlb $1, %k1, %k1
-; SKX-NEXT: vucomiss %xmm2, %xmm0
-; SKX-NEXT: seta %al
-; SKX-NEXT: kmovd %eax, %k2
-; SKX-NEXT: kshiftlb $7, %k2, %k2
-; SKX-NEXT: kshiftrb $7, %k2, %k2
-; SKX-NEXT: korw %k1, %k2, %k1
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT: vcvtph2ps %xmm0, %ymm0
+; SKX-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k0
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k1
; SKX-NEXT: kandw %k1, %k0, %k1
; SKX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; SKX-NEXT: vmovd %xmm0, (%rdx)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%tmp = load <2 x half>, ptr %arg, align 8
%tmp3 = fcmp fast olt <2 x half> %tmp, <half 0xH4600, half 0xH4600>
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 832e55a8355252..24eb9b3715ed63 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1441,88 +1441,44 @@ define <4 x i32> @zext_bool_logic(<4 x i64> %cond1, <4 x i64> %cond2, <4 x i32>
define void @half_vec_compare(ptr %x, ptr %y) {
; KNL-LABEL: half_vec_compare:
; KNL: ## %bb.0: ## %entry
-; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; KNL-NEXT: vpshuflw $85, %xmm0, %xmm1 ## encoding: [0xc5,0xfb,0x70,0xc8,0x55]
-; KNL-NEXT: ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
-; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
-; KNL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; KNL-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
-; KNL-NEXT: movl $65535, %ecx ## encoding: [0xb9,0xff,0xff,0x00,0x00]
-; KNL-NEXT: ## imm = 0xFFFF
-; KNL-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00]
-; KNL-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1]
-; KNL-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1]
-; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
-; KNL-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
-; KNL-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1]
-; KNL-NEXT: cmovpl %ecx, %eax ## encoding: [0x0f,0x4a,0xc1]
-; KNL-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; KNL-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc2,0x01]
-; KNL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
+; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
+; KNL-NEXT: vcvtph2ps %xmm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; KNL-NEXT: vcmpneqps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x04]
+; KNL-NEXT: vpmovdb %zmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0]
; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
; KNL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; KNL-NEXT: vpextrw $0, %xmm0, (%rsi) ## encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
+; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; KNL-NEXT: retq ## encoding: [0xc3]
;
; AVX512BW-LABEL: half_vec_compare:
; AVX512BW: ## %bb.0: ## %entry
-; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512BW-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; AVX512BW-NEXT: vpshuflw $85, %xmm0, %xmm1 ## encoding: [0xc5,0xfb,0x70,0xc8,0x55]
-; AVX512BW-NEXT: ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
-; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
-; AVX512BW-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX512BW-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
-; AVX512BW-NEXT: movl $65535, %ecx ## encoding: [0xb9,0xff,0xff,0x00,0x00]
-; AVX512BW-NEXT: ## imm = 0xFFFF
-; AVX512BW-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00]
-; AVX512BW-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1]
-; AVX512BW-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1]
-; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
-; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
-; AVX512BW-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1]
-; AVX512BW-NEXT: cmovpl %ecx, %eax ## encoding: [0x0f,0x4a,0xc1]
-; AVX512BW-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x01]
-; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
+; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; AVX512BW-NEXT: vcmpneqps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x04]
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
; AVX512BW-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
+; AVX512BW-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512BW-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: half_vec_compare:
; SKX: ## %bb.0: ## %entry
-; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; SKX-NEXT: vpshuflw $85, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xc8,0x55]
-; SKX-NEXT: ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
-; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc9]
-; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
-; SKX-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
-; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
-; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
-; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
-; SKX-NEXT: testb %cl, %cl ## encoding: [0x84,0xc9]
-; SKX-NEXT: setne %al ## encoding: [0x0f,0x95,0xc0]
-; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
-; SKX-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
-; SKX-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1]
-; SKX-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2]
-; SKX-NEXT: orb %cl, %dl ## encoding: [0x08,0xca]
-; SKX-NEXT: testb %dl, %dl ## encoding: [0x84,0xd2]
-; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
-; SKX-NEXT: andl $1, %ecx ## encoding: [0x83,0xe1,0x01]
-; SKX-NEXT: kmovw %ecx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc1]
-; SKX-NEXT: kmovd %eax, %k1 ## encoding: [0xc5,0xfb,0x92,0xc8]
-; SKX-NEXT: kshiftlw $1, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x01]
-; SKX-NEXT: korw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x45,0xc9]
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
+; SKX-NEXT: vcvtph2ps %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; SKX-NEXT: vcmpneqps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x04]
; SKX-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SKX-NEXT: ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x05,A,A,A,A]
; SKX-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; SKX-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
+; SKX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; SKX-NEXT: retq ## encoding: [0xc3]
entry:
%0 = load <2 x half>, ptr %x
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index 41d9a867c0a960..07701f082b0e22 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -1641,188 +1641,26 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
;
; AVX512-LABEL: test_fmaximum_v4f16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
-; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovpl %ecx, %edx
-; AVX512-NEXT: movl $0, %edi
-; AVX512-NEXT: cmoval %ecx, %edi
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovpl %ecx, %esi
-; AVX512-NEXT: movl $0, %r9d
-; AVX512-NEXT: cmoval %ecx, %r9d
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
-; AVX512-NEXT: movl $0, %r8d
-; AVX512-NEXT: cmovpl %ecx, %r8d
-; AVX512-NEXT: movl $0, %r11d
-; AVX512-NEXT: cmoval %ecx, %r11d
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
-; AVX512-NEXT: movl $0, %r10d
-; AVX512-NEXT: cmovpl %ecx, %r10d
-; AVX512-NEXT: movl $0, %ebp
-; AVX512-NEXT: cmoval %ecx, %ebp
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
-; AVX512-NEXT: movl $0, %ebx
-; AVX512-NEXT: cmovpl %ecx, %ebx
-; AVX512-NEXT: movl $0, %r14d
-; AVX512-NEXT: cmoval %ecx, %r14d
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,1,1,1,4,5,6,7]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
-; AVX512-NEXT: movl $0, %r15d
-; AVX512-NEXT: cmovpl %ecx, %r15d
-; AVX512-NEXT: movl $0, %r12d
-; AVX512-NEXT: cmoval %ecx, %r12d
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
-; AVX512-NEXT: movl $0, %r13d
-; AVX512-NEXT: cmoval %ecx, %r13d
-; AVX512-NEXT: vmovd %r13d, %xmm2
-; AVX512-NEXT: vpinsrw $1, %r12d, %xmm2, %xmm2
-; AVX512-NEXT: vpinsrw $2, %r14d, %xmm2, %xmm2
-; AVX512-NEXT: vpinsrw $3, %ebp, %xmm2, %xmm2
-; AVX512-NEXT: vpinsrw $4, %r11d, %xmm2, %xmm2
-; AVX512-NEXT: vpinsrw $5, %r9d, %xmm2, %xmm2
-; AVX512-NEXT: vpinsrw $6, %edi, %xmm2, %xmm2
-; AVX512-NEXT: movl $0, %edi
-; AVX512-NEXT: cmovpl %ecx, %edi
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: vucomiss %xmm3, %xmm4
-; AVX512-NEXT: movl $0, %r9d
-; AVX512-NEXT: cmoval %ecx, %r9d
-; AVX512-NEXT: vpinsrw $7, %r9d, %xmm2, %xmm2
-; AVX512-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm2
-; AVX512-NEXT: vmovd %edi, %xmm3
-; AVX512-NEXT: vpinsrw $1, %r15d, %xmm3, %xmm3
-; AVX512-NEXT: vpinsrw $2, %ebx, %xmm3, %xmm3
-; AVX512-NEXT: vpinsrw $3, %r10d, %xmm3, %xmm3
-; AVX512-NEXT: vpinsrw $4, %r8d, %xmm3, %xmm3
-; AVX512-NEXT: vpinsrw $5, %esi, %xmm3, %xmm3
-; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovpl %ecx, %edx
-; AVX512-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3
-; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; AVX512-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,1,1,4,5,6,7]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT: vucomiss %xmm4, %xmm3
-; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
-; AVX512-NEXT: cmovnel %eax, %edx
-; AVX512-NEXT: cmovpl %eax, %edx
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm3
-; AVX512-NEXT: vucomiss %xmm4, %xmm3
-; AVX512-NEXT: movl $65535, %esi # imm = 0xFFFF
-; AVX512-NEXT: cmovnel %eax, %esi
-; AVX512-NEXT: cmovpl %eax, %esi
-; AVX512-NEXT: vmovd %esi, %xmm3
-; AVX512-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
-; AVX512-NEXT: cmovnel %eax, %edx
-; AVX512-NEXT: cmovpl %eax, %edx
-; AVX512-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
-; AVX512-NEXT: cmovnel %eax, %edx
-; AVX512-NEXT: cmovpl %eax, %edx
-; AVX512-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
-; AVX512-NEXT: cmovnel %eax, %edx
-; AVX512-NEXT: cmovpl %eax, %edx
-; AVX512-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
-; AVX512-NEXT: cmovnel %eax, %edx
-; AVX512-NEXT: cmovpl %eax, %edx
-; AVX512-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
-; AVX512-NEXT: cmovnel %eax, %edx
-; AVX512-NEXT: cmovpl %eax, %edx
-; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: cmovnel %eax, %ecx
-; AVX512-NEXT: cmovpl %eax, %ecx
-; AVX512-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3
-; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm5
-; AVX512-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm4
-; AVX512-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpblendvb %xmm3, %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
+; AVX512-NEXT: vcvtph2ps %xmm0, %ymm2
+; AVX512-NEXT: vcvtph2ps %xmm1, %ymm3
+; AVX512-NEXT: vcmpltps %ymm2, %ymm3, %ymm4
+; AVX512-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm4
+; AVX512-NEXT: vcmpunordps %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpblendvb %xmm2, %xmm3, %xmm4, %xmm2
+; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4
+; AVX512-NEXT: vpblendvb %xmm4, %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm3
+; AVX512-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm2, %ymm1
+; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcmpeqps %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; X86-LABEL: test_fmaximum_v4f16:
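
The test_fmaximum_v4f16 diff above is the same change seen through the llvm.maximum expansion: every compare the expansion emits (lt, unord, eq-to-zero) is now a packed ymm compare instead of a chain of scalar vucomiss/cmov steps. A hypothetical reduction, assuming only what the check lines show:

  define <4 x half> @max_v4f16(<4 x half> %x, <4 x half> %y) {
    ; NaN-propagating, signed-zero-aware maximum; its expansion
    ; needs olt, unord, and eq-to-zero compares on the inputs.
    %r = call <4 x half> @llvm.maximum.v4f16(<4 x half> %x, <4 x half> %y)
    ret <4 x half> %r
  }

  declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
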
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 9f01d07e6a6705..033cadae6a1e70 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1166,15 +1166,15 @@ define void @main.45() #0 {
;
; BWON-F16C-LABEL: main.45:
; BWON-F16C: # %bb.0: # %entry
-; BWON-F16C-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT: xorl %eax, %eax
-; BWON-F16C-NEXT: vucomiss %xmm0, %xmm0
-; BWON-F16C-NEXT: movl $65535, %ecx # imm = 0xFFFF
-; BWON-F16C-NEXT: cmovnpl %eax, %ecx
-; BWON-F16C-NEXT: vmovd %ecx, %xmm0
+; BWON-F16C-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0
+; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax
; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; BWON-F16C-NEXT: vmovd %eax, %xmm1
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; BWON-F16C-NEXT: vcmpunordps %xmm2, %xmm0, %xmm0
+; BWON-F16C-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; BWON-F16C-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; BWON-F16C-NEXT: vmovq %xmm0, (%rax)
; BWON-F16C-NEXT: retq
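
The main.45 diff shows the unordered-compare (NaN test) flavor: fcmp uno now folds into vcmpunordps after the vcvtph2ps extension. A hypothetical standalone shape:

  define <4 x i1> @isnan_v4f16(<4 x half> %x) {
    ; uno is true iff an operand is NaN; zero never is, so this
    ; tests %x for NaN and lowers to vcvtph2ps + vcmpunordps.
    %c = fcmp uno <4 x half> %x, zeroinitializer
    ret <4 x i1> %c
  }
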
diff --git a/llvm/test/CodeGen/X86/pr114520.ll b/llvm/test/CodeGen/X86/pr114520.ll
index 660b169e302d83..c557da6b3ab8cb 100644
--- a/llvm/test/CodeGen/X86/pr114520.ll
+++ b/llvm/test/CodeGen/X86/pr114520.ll
@@ -21,83 +21,8 @@ entry:
define <8 x half> @test2(<8 x half> %x) {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm2
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = [-Inf,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[1,1,1,1,4,5,6,7]
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $14, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-5, %ax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $13, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-9, %ax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,3,3,3,4,5,6,7]
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $12, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-17, %ax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $11, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-33, %ax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $10, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-65, %ax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kshiftlw $6, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: kshiftlw $9, %k0, %k0
-; CHECK-NEXT: kshiftrw $9, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kshiftlw $7, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k1
+; CHECK-NEXT: vcvtph2ps %xmm0, %ymm1
+; CHECK-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
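
test2 above also picks up the embedded-broadcast form: a compare against a splat constant becomes vcmpgtps with a {1to8} memory operand instead of per-element vucomiss plus kshift/kor mask assembly. A hypothetical compare of that shape (0xHFC00 is half -Inf):

  define <8 x i1> @gt_neginf_v8f16(<8 x half> %x) {
    ; The splat RHS folds into the packed compare as an embedded
    ; broadcast after both sides are extended to <8 x float>.
    %c = fcmp ogt <8 x half> %x, <half 0xHFC00, half 0xHFC00, half 0xHFC00, half 0xHFC00, half 0xHFC00, half 0xHFC00, half 0xHFC00, half 0xHFC00>
    ret <8 x i1> %c
  }
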
diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll
index 00a52c639e43c6..6bebbe3cee1f92 100644
--- a/llvm/test/CodeGen/X86/pr57340.ll
+++ b/llvm/test/CodeGen/X86/pr57340.ll
@@ -4,236 +4,13 @@
define void @main.41() local_unnamed_addr #1 {
; CHECK-LABEL: main.41:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm0
-; CHECK-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; CHECK-NEXT: vmovdqu (%rax), %ymm3
+; CHECK-NEXT: vpbroadcastw (%rax), %ymm0
+; CHECK-NEXT: vmovdqu (%rax), %ymm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; CHECK-NEXT: vpermi2w %ymm1, %ymm3, %ymm2
-; CHECK-NEXT: vprold $16, %xmm2, %xmm1
-; CHECK-NEXT: vcvtph2ps %xmm1, %xmm3
-; CHECK-NEXT: vmovdqu (%rax), %xmm5
-; CHECK-NEXT: vprold $16, %xmm5, %xmm1
-; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: vucomiss %xmm3, %xmm1
-; CHECK-NEXT: setnp %cl
-; CHECK-NEXT: sete %dl
-; CHECK-NEXT: testb %cl, %dl
-; CHECK-NEXT: setne %cl
-; CHECK-NEXT: kmovd %ecx, %k0
-; CHECK-NEXT: kshiftlw $15, %k0, %k0
-; CHECK-NEXT: vmovd %eax, %xmm3
-; CHECK-NEXT: vcvtph2ps %xmm3, %xmm3
-; CHECK-NEXT: vcvtph2ps %xmm5, %xmm6
-; CHECK-NEXT: kshiftrw $14, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm3, %xmm6
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: korw %k0, %k1, %k0
-; CHECK-NEXT: movw $-5, %ax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-NEXT: vcvtph2ps %xmm3, %xmm3
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NEXT: vucomiss %xmm3, %xmm0
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $13, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-9, %ax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vprolq $16, %xmm2, %xmm3
-; CHECK-NEXT: vcvtph2ps %xmm3, %xmm4
-; CHECK-NEXT: vprolq $16, %xmm5, %xmm3
-; CHECK-NEXT: vcvtph2ps %xmm3, %xmm3
-; CHECK-NEXT: vucomiss %xmm4, %xmm3
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $12, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-17, %ax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4
-; CHECK-NEXT: vucomiss %xmm4, %xmm0
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $11, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-33, %ax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm7
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm7, %xmm4
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $10, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-65, %ax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
-; CHECK-NEXT: vucomiss %xmm7, %xmm0
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $9, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-129, %ax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5
-; CHECK-NEXT: vucomiss %xmm7, %xmm5
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $8, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-257, %ax # imm = 0xFEFF
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm7
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm7, %xmm6
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $7, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-513, %ax # imm = 0xFDFF
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vprold $16, %xmm2, %xmm6
-; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6
-; CHECK-NEXT: vucomiss %xmm6, %xmm1
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $6, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-1025, %ax # imm = 0xFBFF
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $5, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-2049, %ax # imm = 0xF7FF
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vprolq $16, %xmm2, %xmm1
-; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: vucomiss %xmm1, %xmm3
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $4, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-4097, %ax # imm = 0xEFFF
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $3, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-8193, %ax # imm = 0xDFFF
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: vucomiss %xmm1, %xmm4
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $2, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: movw $-16385, %ax # imm = 0xBFFF
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $14, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: kshiftlw $1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NEXT: kshiftrw $1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm0, %xmm5
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k1
+; CHECK-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; CHECK-NEXT: vcvtph2ps %ymm2, %zmm0
+; CHECK-NEXT: vcvtph2ps %ymm1, %zmm1
+; CHECK-NEXT: vcmpeqps %zmm0, %zmm1, %k1
; CHECK-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: vmovdqa %xmm0, (%rax)
; CHECK-NEXT: vzeroupper
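The payoff in pr57340.ll is the clearest: per-lane vucomiss/kshiftlw/korw round-trips through the GPR flags collapse into one vpermi2w to materialize the shuffled operand, two vcvtph2ps widenings, and a single vcmpeqps that writes the whole mask. For readers unfamiliar with the scalar idiom being deleted, here is a minimal C++ model (mine, not code from the patch) of what each removed lane computed; vucomiss raises PF on an unordered compare, which is why a bare sete was not enough:

  static bool lanewiseOEQ(float A, float B) {
    bool PF = (A != A) || (B != B); // vucomiss: PF=1 iff unordered
    bool ZF = PF || (A == B);       // vucomiss: ZF=1 iff equal or unordered
    return !PF && ZF;               // setnp; sete; testb; setne
  }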
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index c71a96f704ac38..b214bf082f2357 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -413,12 +413,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm3
-; AVX512F-NEXT: vucomiss %xmm3, %xmm2
-; AVX512F-NEXT: seta %al
-; AVX512F-NEXT: negb %al
-; AVX512F-NEXT: kmovd %eax, %k1
+; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm2
+; AVX512F-NEXT: vcvtph2ps %xmm1, %ymm3
+; AVX512F-NEXT: vcmpltps %zmm2, %zmm3, %k1
; AVX512F-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -427,14 +424,12 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512VL-LABEL: test_v2f16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm3
-; AVX512VL-NEXT: vucomiss %xmm3, %xmm2
-; AVX512VL-NEXT: seta %al
-; AVX512VL-NEXT: negb %al
-; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm2
+; AVX512VL-NEXT: vcvtph2ps %xmm1, %ymm3
+; AVX512VL-NEXT: vcmpltps %ymm2, %ymm3, %k1
; AVX512VL-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512FP16-LABEL: test_v2f16:
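Here the nnan fmax select previously went through scalar flags: seta captured the OGT result of vucomiss, and negb splatted it into a mask bit. The new form widens both halves with vcvtph2ps and lets one vcmpltps with commuted operands (X1 < X0 is X0 > X1) set %k1 directly. A scalar sketch of the selection, assuming _Float16 support in the host compiler (illustrative only, not the patch's code):

  static _Float16 fmax2(_Float16 A0, _Float16 A1) {
    float X0 = (float)A0, X1 = (float)A1; // vcvtph2ps widening
    bool K1 = X1 < X0;                    // vcmpltps, commuted => OGT
    return K1 ? A0 : A1;                  // masked vmovdqu16 select
  }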
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
index 2dffe2bf0dfa1f..9f37df716b6cd5 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -412,12 +412,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm3
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: vucomiss %xmm3, %xmm2
-; AVX512F-NEXT: sbbl %eax, %eax
-; AVX512F-NEXT: kmovd %eax, %k1
+; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm2
+; AVX512F-NEXT: vcvtph2ps %xmm1, %ymm3
+; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -426,14 +423,12 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512VL-LABEL: test_v2f16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm3
-; AVX512VL-NEXT: xorl %eax, %eax
-; AVX512VL-NEXT: vucomiss %xmm3, %xmm2
-; AVX512VL-NEXT: sbbl %eax, %eax
-; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm2
+; AVX512VL-NEXT: vcvtph2ps %xmm1, %ymm3
+; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
; AVX512VL-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512FP16-LABEL: test_v2f16:
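The fmin twin used the xor/ucomiss/sbb trick instead: vucomiss sets CF when the first operand is below the second (or unordered), and sbbl %eax, %eax turns that carry into an all-zeros/all-ones mask. Under nnan the unordered case is impossible, so a single vcmpltps computes the same OLT mask straight into a k-register. A small model of the old idiom (mine, for illustration):

  static int oltMask(float A, float B) {
    // xorl %eax,%eax; vucomiss %xmm3,%xmm2  =>  CF = (A < B) || unordered
    // sbbl %eax,%eax                        =>  eax = eax - eax - CF = -CF
    bool CF = !(A >= B); // with nnan this is simply A < B
    return -(int)CF;     // 0 or 0xFFFFFFFF
  }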
>From 6848d44d803220aa32b0164c145561d14420831b Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Mon, 25 Nov 2024 19:24:23 +0800
Subject: [PATCH 2/2] Address review comments
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd107220fdf233..90c63059f30d7c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23337,10 +23337,12 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
return SSECC;
}
-/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
+/// Break a 256/512-bit vector VSETCC into two new 128/256-bit ones and then
/// concatenate the result back.
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
SelectionDAG &DAG, const SDLoc &dl) {
+ assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
+ "Unsupported VTs!");
SDValue CC = DAG.getCondCode(Cond);
// Extract the LHS Lo/Hi vectors
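Since the assertion no longer pins the operand type to integer, the renamed splitVSETCC serves the new 256/512-bit FP splits as well. A sketch of the splitting shape in terms of generic SelectionDAG helpers (the in-tree function uses X86-local extract helpers, so treat this as illustrative, not verbatim):

  SDValue LHSLo, LHSHi, RHSLo, RHSHi;
  std::tie(LHSLo, LHSHi) = DAG.SplitVector(LHS, dl);
  std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, dl);
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  SDValue Lo = DAG.getNode(ISD::SETCC, dl, LoVT, LHSLo, RHSLo, CC);
  SDValue Hi = DAG.getNode(ISD::SETCC, dl, HiVT, LHSHi, RHSHi, CC);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);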
@@ -23481,26 +23483,25 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
MVT VT = Op->getSimpleValueType(0);
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
- bool isFP = Op1.getSimpleValueType().isFloatingPoint();
+ MVT OpVT = Op0.getSimpleValueType();
SDLoc dl(Op);
- if (isFP) {
- MVT SVT = Op0.getSimpleValueType();
- MVT EltVT = SVT.getVectorElementType();
+ if (OpVT.isFloatingPoint()) {
+ MVT EltVT = OpVT.getVectorElementType();
assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
EltVT == MVT::f64);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
if (isSoftF16(EltVT, Subtarget)) {
// Break 256-bit FP vector compare into smaller ones.
- if (SVT.is256BitVector() && !Subtarget.useAVX512Regs())
+ if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
// Break 512-bit FP vector compare into smaller ones.
- if (SVT.is512BitVector())
+ if (OpVT.is512BitVector())
return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
- MVT NVT = SVT.changeVectorElementType(MVT::f32);
+ MVT NVT = OpVT.changeVectorElementType(MVT::f32);
if (IsStrict) {
Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
{Chain, Op0});
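The diff ends mid-hunk; the non-strict path presumably mirrors the strict one just shown, extending both operands to NVT and redoing the compare at f32. A hedged sketch of that assumed tail (not the verbatim patch):

  Op0 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0);
  Op1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1);
  return DAG.getSetCC(dl, VT, Op0, Op1, Cond);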