[llvm] [Draft] Widen X86::FMIN/MAX for FP16 (PR #143298)

Phoebe Wang via llvm-commits llvm-commits at lists.llvm.org
Sat Jun 7 21:31:36 PDT 2025


https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/143298

None

>From f222bbe8f1c4b7929e3aa2921a9825e21089574a Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Sun, 8 Jun 2025 12:28:09 +0800
Subject: [PATCH] [Draft] Widen X86::FMIN/MAX for FP16

---
 llvm/lib/Target/X86/X86ISelLowering.cpp     |  50 ++++--
 llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll | 141 ++++++++++-----
 llvm/test/CodeGen/X86/avx512fp16-fmaxnum.s  | 179 ++++++++++++++++++++
 3 files changed, 320 insertions(+), 50 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/avx512fp16-fmaxnum.s

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e929dab429de5..93f2f503a85d2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35424,10 +35424,11 @@ bool X86TargetLowering::isBinOp(unsigned Opcode) const {
   switch (Opcode) {
   // These are non-commutative binops.
   // TODO: Add more X86ISD opcodes once we have test coverage.
-  case X86ISD::ANDNP:
-  case X86ISD::PCMPGT:
   case X86ISD::FMAX:
   case X86ISD::FMIN:
+    return Subtarget.hasVLX();
+  case X86ISD::ANDNP:
+  case X86ISD::PCMPGT:
   case X86ISD::FANDN:
   case X86ISD::VPSHA:
   case X86ISD::VPSHL:
@@ -44211,6 +44212,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
       SDValue Insert =
           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
       return TLO.CombineTo(Op, Insert);
+    }
+    case X86ISD::FMAX:
+    case X86ISD::FMIN: {
+      if (VT.getVectorElementType() == MVT::f16 && !Subtarget.hasVLX())
+        break;
+      [[fallthrough]];
     }
       // Zero upper elements.
     case X86ISD::VZEXT_MOVL:
@@ -44241,8 +44248,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     case X86ISD::VSRLV:
     case X86ISD::VSRAV:
       // Float ops.
-    case X86ISD::FMAX:
-    case X86ISD::FMIN:
     case X86ISD::FMAXC:
     case X86ISD::FMINC:
     case X86ISD::FRSQRT:
@@ -55368,25 +55373,46 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   SDLoc DL(N);
   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
 
+  auto GetNodeOrWiden = [&](SDValue Op0, SDValue Op1) {
+    if ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX())
+      return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+    Op0 = widenSubVector(MVT::v32f16, Op0, /*ZeroNewElements=*/false, Subtarget,
+                         DAG, DL);
+    Op1 = widenSubVector(MVT::v32f16, Op1, /*ZeroNewElements=*/false, Subtarget,
+                         DAG, DL);
+    SDValue Res =
+        DAG.getNode(MinMaxOp, DL, MVT::v32f16, Op0, Op1, N->getFlags());
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                       DAG.getVectorIdxConstant(0, DL));
+  };
+
   // If we don't have to respect NaN inputs, this is a direct translation to x86
   // min/max instructions.
   if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
-    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+    return GetNodeOrWiden(Op0, Op1);
 
   // If one of the operands is known non-NaN use the native min/max instructions
   // with the non-NaN input as second operand.
   if (DAG.isKnownNeverNaN(Op1))
-    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+    return GetNodeOrWiden(Op0, Op1);
   if (DAG.isKnownNeverNaN(Op0))
-    return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
+    return GetNodeOrWiden(Op1, Op0);
 
   // If we have to respect NaN inputs, this takes at least 3 instructions.
   // Favor a library call when operating on a scalar and minimizing code size.
   if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
     return SDValue();
 
+  EVT WidenVT = VT;
+  if ((VT == MVT::v8f16 || VT == MVT::v16f16) && !Subtarget.hasVLX()) {
+    WidenVT = MVT::v32f16;
+    Op0 = widenSubVector(MVT::v32f16, Op0, /*ZeroNewElements=*/false, Subtarget,
+                         DAG, DL);
+    Op1 = widenSubVector(MVT::v32f16, Op1, /*ZeroNewElements=*/false, Subtarget,
+                         DAG, DL);
+  }
   EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
-                                         VT);
+                                         WidenVT);
 
   // There are 4 possibilities involving NaN inputs, and these are the required
   // outputs:
@@ -55407,12 +55433,16 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   // use those instructions for fmaxnum by selecting away a NaN input.
 
   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
-  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
+  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, WidenVT, Op1, Op0);
   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
 
   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
   // are NaN, the NaN value of Op1 is the result.
-  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
+  SDValue Res = DAG.getSelect(DL, WidenVT, IsOp0Nan, Op1, MinOrMax);
+  if (VT != WidenVT)
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                      DAG.getVectorIdxConstant(0, DL));
+  return Res;
 }
 
 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
index 1d535f93bc867..9a709ff985f94 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,avx512vl    | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,avx512vl    | FileCheck %s --check-prefixes=CHECK,HasVL
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16    | FileCheck %s --check-prefixes=CHECK,NOVL
 
 declare half @llvm.maxnum.f16(half, half)
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
@@ -9,61 +10,112 @@ declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
 declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
 
 define half @test_intrinsic_fmaxh(half %x, half %y) {
-; CHECK-LABEL: test_intrinsic_fmaxh:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmaxh:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; HasVL-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmaxh:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; NOVL-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call half @llvm.maxnum.f16(half %x, half %y) readnone
   ret half %z
 }
 
 define <2 x half> @test_intrinsic_fmax_v2f16(<2 x half> %x, <2 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v2f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v2f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v2f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; NOVL-NEXT:    vmaxph %zmm0, %zmm1, %zmm2 # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+; NOVL-NEXT:    vcmpunordph %zmm0, %zmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovdqu16 %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+; NOVL-NEXT:    vmovdqa %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x6f,0xc2]
+; NOVL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> %y) readnone
   ret <2 x half> %z
 }
 
 define <4 x half> @test_intrinsic_fmax_v4f16(<4 x half> %x, <4 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v4f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v4f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v4f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; NOVL-NEXT:    vmaxph %zmm0, %zmm1, %zmm2 # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+; NOVL-NEXT:    vcmpunordph %zmm0, %zmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovdqu16 %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+; NOVL-NEXT:    vmovdqa %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x6f,0xc2]
+; NOVL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %x, <4 x half> %y) readnone
   ret <4 x half> %z
 }
 
 define <8 x half> @test_intrinsic_fmax_v8f16(<8 x half> %x, <8 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v8f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v8f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v8f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; NOVL-NEXT:    vmaxph %zmm0, %zmm1, %zmm2 # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+; NOVL-NEXT:    vcmpunordph %zmm0, %zmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovdqu16 %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+; NOVL-NEXT:    vmovdqa %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x6f,0xc2]
+; NOVL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %x, <8 x half> %y) readnone
   ret <8 x half> %z
 }
 
 define <16 x half> @test_intrinsic_fmax_v16f16(<16 x half> %x, <16 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v16f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v16f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v16f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; NOVL-NEXT:    vmaxph %zmm0, %zmm1, %zmm2 # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+; NOVL-NEXT:    vcmpunordph %zmm0, %zmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovdqu16 %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+; NOVL-NEXT:    vmovdqa %ymm2, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc2]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %x, <16 x half> %y) readnone
   ret <16 x half> %z
 }
@@ -81,10 +133,19 @@ define <32 x half> @test_intrinsic_fmax_v32f16(<32 x half> %x, <32 x half> %y) {
 }
 
 define <4 x half> @maxnum_intrinsic_nnan_fmf_f432(<4 x half> %a, <4 x half> %b) {
-; CHECK-LABEL: maxnum_intrinsic_nnan_fmf_f432:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5f,0xc1]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: maxnum_intrinsic_nnan_fmf_f432:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5f,0xc1]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: maxnum_intrinsic_nnan_fmf_f432:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; NOVL-NEXT:    vmaxph %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x48,0x5f,0xc1]
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %r = tail call nnan <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
   ret <4 x half> %r
 }
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.s b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.s
new file mode 100644
index 0000000000000..503a60b1d867d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.s
@@ -0,0 +1,179 @@
+	.file	"avx512fp16-fmaxnum.ll"
+	.text
+	.globl	test_intrinsic_fmaxh            # -- Begin function test_intrinsic_fmaxh
+	.p2align	4
+	.type	test_intrinsic_fmaxh, at function
+test_intrinsic_fmaxh:                   # @test_intrinsic_fmaxh
+	.cfi_startproc
+# %bb.0:
+	vmaxsh	%xmm0, %xmm1, %xmm2             # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+	vcmpunordsh	%xmm0, %xmm0, %k1       # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+	vmovsh	%xmm1, %xmm0, %xmm2 {%k1}       # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+	vmovaps	%xmm2, %xmm0                    # encoding: [0xc5,0xf8,0x28,0xc2]
+	retq                                    # encoding: [0xc3]
+.Lfunc_end0:
+	.size	test_intrinsic_fmaxh, .Lfunc_end0-test_intrinsic_fmaxh
+	.cfi_endproc
+                                        # -- End function
+	.globl	test_intrinsic_fmax_v2f16       # -- Begin function test_intrinsic_fmax_v2f16
+	.p2align	4
+	.type	test_intrinsic_fmax_v2f16, at function
+test_intrinsic_fmax_v2f16:              # @test_intrinsic_fmax_v2f16
+	.cfi_startproc
+# %bb.0:
+                                        # kill: def $xmm1 killed $xmm1 def $zmm1
+                                        # kill: def $xmm0 killed $xmm0 def $zmm0
+	vmaxph	%zmm0, %zmm1, %zmm2             # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+	vcmpunordph	%zmm0, %zmm0, %k1       # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+	vmovdqu16	%zmm1, %zmm2 {%k1}      # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+	vmovdqa	%xmm2, %xmm0                    # encoding: [0xc5,0xf9,0x6f,0xc2]
+	vzeroupper                              # encoding: [0xc5,0xf8,0x77]
+	retq                                    # encoding: [0xc3]
+.Lfunc_end1:
+	.size	test_intrinsic_fmax_v2f16, .Lfunc_end1-test_intrinsic_fmax_v2f16
+	.cfi_endproc
+                                        # -- End function
+	.globl	test_intrinsic_fmax_v4f16       # -- Begin function test_intrinsic_fmax_v4f16
+	.p2align	4
+	.type	test_intrinsic_fmax_v4f16, at function
+test_intrinsic_fmax_v4f16:              # @test_intrinsic_fmax_v4f16
+	.cfi_startproc
+# %bb.0:
+                                        # kill: def $xmm1 killed $xmm1 def $zmm1
+                                        # kill: def $xmm0 killed $xmm0 def $zmm0
+	vmaxph	%zmm0, %zmm1, %zmm2             # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+	vcmpunordph	%zmm0, %zmm0, %k1       # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+	vmovdqu16	%zmm1, %zmm2 {%k1}      # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+	vmovdqa	%xmm2, %xmm0                    # encoding: [0xc5,0xf9,0x6f,0xc2]
+	vzeroupper                              # encoding: [0xc5,0xf8,0x77]
+	retq                                    # encoding: [0xc3]
+.Lfunc_end2:
+	.size	test_intrinsic_fmax_v4f16, .Lfunc_end2-test_intrinsic_fmax_v4f16
+	.cfi_endproc
+                                        # -- End function
+	.globl	test_intrinsic_fmax_v8f16       # -- Begin function test_intrinsic_fmax_v8f16
+	.p2align	4
+	.type	test_intrinsic_fmax_v8f16, at function
+test_intrinsic_fmax_v8f16:              # @test_intrinsic_fmax_v8f16
+	.cfi_startproc
+# %bb.0:
+                                        # kill: def $xmm1 killed $xmm1 def $zmm1
+                                        # kill: def $xmm0 killed $xmm0 def $zmm0
+	vmaxph	%zmm0, %zmm1, %zmm2             # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+	vcmpunordph	%zmm0, %zmm0, %k1       # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+	vmovdqu16	%zmm1, %zmm2 {%k1}      # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+	vmovdqa	%xmm2, %xmm0                    # encoding: [0xc5,0xf9,0x6f,0xc2]
+	vzeroupper                              # encoding: [0xc5,0xf8,0x77]
+	retq                                    # encoding: [0xc3]
+.Lfunc_end3:
+	.size	test_intrinsic_fmax_v8f16, .Lfunc_end3-test_intrinsic_fmax_v8f16
+	.cfi_endproc
+                                        # -- End function
+	.globl	test_intrinsic_fmax_v16f16      # -- Begin function test_intrinsic_fmax_v16f16
+	.p2align	4
+	.type	test_intrinsic_fmax_v16f16, at function
+test_intrinsic_fmax_v16f16:             # @test_intrinsic_fmax_v16f16
+	.cfi_startproc
+# %bb.0:
+                                        # kill: def $ymm1 killed $ymm1 def $zmm1
+                                        # kill: def $ymm0 killed $ymm0 def $zmm0
+	vmaxph	%zmm0, %zmm1, %zmm2             # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+	vcmpunordph	%zmm0, %zmm0, %k1       # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+	vmovdqu16	%zmm1, %zmm2 {%k1}      # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+	vmovdqa	%ymm2, %ymm0                    # encoding: [0xc5,0xfd,0x6f,0xc2]
+	retq                                    # encoding: [0xc3]
+.Lfunc_end4:
+	.size	test_intrinsic_fmax_v16f16, .Lfunc_end4-test_intrinsic_fmax_v16f16
+	.cfi_endproc
+                                        # -- End function
+	.globl	test_intrinsic_fmax_v32f16      # -- Begin function test_intrinsic_fmax_v32f16
+	.p2align	4
+	.type	test_intrinsic_fmax_v32f16, at function
+test_intrinsic_fmax_v32f16:             # @test_intrinsic_fmax_v32f16
+	.cfi_startproc
+# %bb.0:
+	vmaxph	%zmm0, %zmm1, %zmm2             # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0]
+	vcmpunordph	%zmm0, %zmm0, %k1       # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03]
+	vmovdqu16	%zmm1, %zmm2 {%k1}      # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1]
+	vmovdqa64	%zmm2, %zmm0            # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+	retq                                    # encoding: [0xc3]
+.Lfunc_end5:
+	.size	test_intrinsic_fmax_v32f16, .Lfunc_end5-test_intrinsic_fmax_v32f16
+	.cfi_endproc
+                                        # -- End function
+	.globl	maxnum_intrinsic_nnan_fmf_f432  # -- Begin function maxnum_intrinsic_nnan_fmf_f432
+	.p2align	4
+	.type	maxnum_intrinsic_nnan_fmf_f432, at function
+maxnum_intrinsic_nnan_fmf_f432:         # @maxnum_intrinsic_nnan_fmf_f432
+	.cfi_startproc
+# %bb.0:
+                                        # kill: def $xmm1 killed $xmm1 def $zmm1
+                                        # kill: def $xmm0 killed $xmm0 def $zmm0
+	vmaxph	%zmm1, %zmm0, %zmm0             # encoding: [0x62,0xf5,0x7c,0x48,0x5f,0xc1]
+                                        # kill: def $xmm0 killed $xmm0 killed $zmm0
+	vzeroupper                              # encoding: [0xc5,0xf8,0x77]
+	retq                                    # encoding: [0xc3]
+.Lfunc_end6:
+	.size	maxnum_intrinsic_nnan_fmf_f432, .Lfunc_end6-maxnum_intrinsic_nnan_fmf_f432
+	.cfi_endproc
+                                        # -- End function
+	.globl	maxnum_intrinsic_nnan_attr_f16  # -- Begin function maxnum_intrinsic_nnan_attr_f16
+	.p2align	4
+	.type	maxnum_intrinsic_nnan_attr_f16, at function
+maxnum_intrinsic_nnan_attr_f16:         # @maxnum_intrinsic_nnan_attr_f16
+	.cfi_startproc
+# %bb.0:
+	vmaxsh	%xmm1, %xmm0, %xmm0             # encoding: [0x62,0xf5,0x7e,0x08,0x5f,0xc1]
+	retq                                    # encoding: [0xc3]
+.Lfunc_end7:
+	.size	maxnum_intrinsic_nnan_attr_f16, .Lfunc_end7-maxnum_intrinsic_nnan_attr_f16
+	.cfi_endproc
+                                        # -- End function
+	.section	.rodata,"a", at progbits
+	.p2align	1, 0x0                          # -- Begin function test_maxnum_const_op1
+.LCPI8_0:
+	.short	0x3c00                          # half 1
+	.text
+	.globl	test_maxnum_const_op1
+	.p2align	4
+	.type	test_maxnum_const_op1, at function
+test_maxnum_const_op1:                  # @test_maxnum_const_op1
+	.cfi_startproc
+# %bb.0:
+	vmaxsh	.LCPI8_0(%rip), %xmm0, %xmm0    # encoding: [0x62,0xf5,0x7e,0x08,0x5f,0x05,A,A,A,A]
+                                        #   fixup A - offset: 6, value: .LCPI8_0-4, kind: reloc_riprel_4byte
+	retq                                    # encoding: [0xc3]
+.Lfunc_end8:
+	.size	test_maxnum_const_op1, .Lfunc_end8-test_maxnum_const_op1
+	.cfi_endproc
+                                        # -- End function
+	.section	.rodata,"a", at progbits
+	.p2align	1, 0x0                          # -- Begin function test_maxnum_const_op2
+.LCPI9_0:
+	.short	0x3c00                          # half 1
+	.text
+	.globl	test_maxnum_const_op2
+	.p2align	4
+	.type	test_maxnum_const_op2, at function
+test_maxnum_const_op2:                  # @test_maxnum_const_op2
+	.cfi_startproc
+# %bb.0:
+	vmaxsh	.LCPI9_0(%rip), %xmm0, %xmm0    # encoding: [0x62,0xf5,0x7e,0x08,0x5f,0x05,A,A,A,A]
+                                        #   fixup A - offset: 6, value: .LCPI9_0-4, kind: reloc_riprel_4byte
+	retq                                    # encoding: [0xc3]
+.Lfunc_end9:
+	.size	test_maxnum_const_op2, .Lfunc_end9-test_maxnum_const_op2
+	.cfi_endproc
+                                        # -- End function
+	.globl	test_maxnum_const_nan           # -- Begin function test_maxnum_const_nan
+	.p2align	4
+	.type	test_maxnum_const_nan, at function
+test_maxnum_const_nan:                  # @test_maxnum_const_nan
+	.cfi_startproc
+# %bb.0:
+	retq                                    # encoding: [0xc3]
+.Lfunc_end10:
+	.size	test_maxnum_const_nan, .Lfunc_end10-test_maxnum_const_nan
+	.cfi_endproc
+                                        # -- End function
+	.section	".note.GNU-stack","", at progbits



More information about the llvm-commits mailing list