[llvm] r344914 - DAG: Change behavior of fminnum/fmaxnum nodes

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 22 09:27:28 PDT 2018


Author: arsenm
Date: Mon Oct 22 09:27:27 2018
New Revision: 344914

URL: http://llvm.org/viewvc/llvm-project?rev=344914&view=rev
Log:
DAG: Change behavior of fminnum/fmaxnum nodes

Introduce new versions that follow the IEEE semantics
to help with legalization that may need quieted inputs.

There are some regressions from inserting unnecessary
canonicalizes when these are matched from fast math
fcmp + select which should be fixed in a future commit.

Modified:
    llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
    llvm/trunk/include/llvm/CodeGen/TargetLowering.h
    llvm/trunk/include/llvm/Target/TargetSelectionDAG.td
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
    llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
    llvm/trunk/lib/Target/AMDGPU/VOP3PInstructions.td
    llvm/trunk/test/CodeGen/AMDGPU/clamp.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll
    llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll
    llvm/trunk/test/CodeGen/AMDGPU/known-never-snan.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/reduction.ll

Modified: llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h (original)
+++ llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h Mon Oct 22 09:27:27 2018
@@ -564,10 +564,19 @@ namespace ISD {
     FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR,
     /// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two
     /// values.
-    /// In the case where a single input is NaN, the non-NaN input is returned.
+    ///
+    /// In the case where a single input is a NaN (either signaling or quiet),
+    /// the non-NaN input is returned.
     ///
     /// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0.
     FMINNUM, FMAXNUM,
+
+    /// FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on
+    /// two values, following the IEEE-754 2008 definition. This differs from
+    /// FMINNUM/FMAXNUM in the handling of signaling NaNs. If one input is a
+    /// signaling NaN, returns a quiet NaN.
+    FMINNUM_IEEE, FMAXNUM_IEEE,
+
     /// FMINNAN/FMAXNAN - NaN-propagating minimum/maximum that also treat -0.0
     /// as less than 0.0. While FMINNUM/FMAXNUM follow IEEE 754-2008 semantics,
     /// FMINNAN/FMAXNAN follow IEEE 754-2018 draft semantics.

Modified: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetLowering.h?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h Mon Oct 22 09:27:27 2018
@@ -3644,6 +3644,9 @@ public:
   /// \returns True, if the expansion was successful, false otherwise
   bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
 
+  /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
+  SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
+
   /// Turn load of vector type into a load of the individual elements.
   /// \param LD load to expand
   /// \returns MERGE_VALUEs of the scalar loads with their chains.

Modified: llvm/trunk/include/llvm/Target/TargetSelectionDAG.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetSelectionDAG.td?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetSelectionDAG.td (original)
+++ llvm/trunk/include/llvm/Target/TargetSelectionDAG.td Mon Oct 22 09:27:27 2018
@@ -408,6 +408,11 @@ def fminnum    : SDNode<"ISD::FMINNUM"
                                   [SDNPCommutative, SDNPAssociative]>;
 def fmaxnum    : SDNode<"ISD::FMAXNUM"    , SDTFPBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
+def fminnum_ieee : SDNode<"ISD::FMINNUM_IEEE", SDTFPBinOp,
+                          [SDNPCommutative]>;
+def fmaxnum_ieee  : SDNode<"ISD::FMAXNUM_IEEE", SDTFPBinOp,
+                           [SDNPCommutative]>;
+
 def fminnan    : SDNode<"ISD::FMINNAN"    , SDTFPBinOp>;
 def fmaxnan    : SDNode<"ISD::FMAXNAN"    , SDTFPBinOp>;
 def fgetsign   : SDNode<"ISD::FGETSIGN"   , SDTFPToIntOp>;

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Mon Oct 22 09:27:27 2018
@@ -7097,6 +7097,13 @@ static SDValue combineMinNumMaxNum(const
   case ISD::SETLE:
   case ISD::SETULT:
   case ISD::SETULE: {
+    // Since it's known never nan to get here already, either fminnum or
+    // fminnum_ieee is OK. Try the ieee version first, since fminnum is
+    // expanded in terms of it.
+    unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
     unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
     if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
       return DAG.getNode(Opcode, DL, VT, LHS, RHS);
@@ -7108,6 +7115,10 @@ static SDValue combineMinNumMaxNum(const
   case ISD::SETGE:
   case ISD::SETUGT:
   case ISD::SETUGE: {
+    unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
     unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
     if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
       return DAG.getNode(Opcode, DL, VT, LHS, RHS);

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp Mon Oct 22 09:27:27 2018
@@ -3247,7 +3247,12 @@ bool SelectionDAGLegalize::ExpandNode(SD
     Results.push_back(Tmp1);
     break;
   }
-
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM: {
+    if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG))
+      Results.push_back(Expanded);
+    break;
+  }
   case ISD::FSIN:
   case ISD::FCOS: {
     EVT VT = Node->getValueType(0);

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp Mon Oct 22 09:27:27 2018
@@ -130,6 +130,7 @@ class VectorLegalizer {
   SDValue ExpandBITREVERSE(SDValue Op);
   SDValue ExpandCTLZ(SDValue Op);
   SDValue ExpandCTTZ(SDValue Op);
+  SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
   SDValue ExpandStrictFPOp(SDValue Op);
 
   /// Implements vector promotion.
@@ -353,6 +354,8 @@ SDValue VectorLegalizer::LegalizeOp(SDVa
   case ISD::FABS:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case ISD::FMINNAN:
   case ISD::FMAXNAN:
   case ISD::FCOPYSIGN:
@@ -721,6 +724,9 @@ SDValue VectorLegalizer::Expand(SDValue
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
     return ExpandCTTZ(Op);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+    return ExpandFMINNUM_FMAXNUM(Op);
   case ISD::STRICT_FADD:
   case ISD::STRICT_FSUB:
   case ISD::STRICT_FMUL:
@@ -1120,6 +1126,12 @@ SDValue VectorLegalizer::ExpandCTTZ(SDVa
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
+SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
+  if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
+    return Expanded;
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
   EVT VT = Op.getValueType();
   EVT EltVT = VT.getVectorElementType();

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Mon Oct 22 09:27:27 2018
@@ -113,6 +113,8 @@ void DAGTypeLegalizer::ScalarizeVectorRe
   case ISD::FMUL:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case ISD::FMINNAN:
   case ISD::FMAXNAN:
   case ISD::SMIN:

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp Mon Oct 22 09:27:27 2018
@@ -3712,9 +3712,31 @@ bool SelectionDAG::isKnownNeverNaN(SDVal
     // TODO: Refine on operand
     return false;
   }
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM: {
+    // Only one needs to be known not-nan, since it will be returned if the
+    // other ends up being one.
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) ||
+           isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+  }
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE: {
+    if (SNaN)
+      return true;
+    // This can return a NaN if either operand is an sNaN, or if both operands
+    // are NaN.
+    return (isKnownNeverNaN(Op.getOperand(0), false, Depth + 1) &&
+            isKnownNeverSNaN(Op.getOperand(1), Depth + 1)) ||
+           (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) &&
+            isKnownNeverSNaN(Op.getOperand(0), Depth + 1));
+  }
+  case ISD::FMINNAN:
+  case ISD::FMAXNAN: {
+    // TODO: Does this quiet or return the original NaN as-is?
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
 
-  // TODO: Handle FMINNUM/FMAXNUM/FMINNAN/FMAXNAN when there is an agreement on
-  // what they should do.
+  }
   case ISD::EXTRACT_VECTOR_ELT: {
     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
   }

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp Mon Oct 22 09:27:27 2018
@@ -176,6 +176,9 @@ std::string SDNode::getOperationName(con
   case ISD::FABS:                       return "fabs";
   case ISD::FMINNUM:                    return "fminnum";
   case ISD::FMAXNUM:                    return "fmaxnum";
+  case ISD::FMINNUM_IEEE:               return "fminnum_ieee";
+  case ISD::FMAXNUM_IEEE:               return "fmaxnum_ieee";
+
   case ISD::FMINNAN:                    return "fminnan";
   case ISD::FMAXNAN:                    return "fmaxnan";
   case ISD::FNEG:                       return "fneg";

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Mon Oct 22 09:27:27 2018
@@ -4113,6 +4113,35 @@ bool TargetLowering::expandFP_TO_SINT(SD
   return true;
 }
 
+SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
+                                              SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ?
+    ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+  EVT VT = Node->getValueType(0);
+  if (isOperationLegalOrCustom(NewOp, VT)) {
+    SDValue Quiet0 = Node->getOperand(0);
+    SDValue Quiet1 = Node->getOperand(1);
+
+    if (!Node->getFlags().hasNoNaNs()) {
+      // Insert canonicalizes if it's possible we need to quiet to get correct
+      // sNaN behavior.
+      if (!DAG.isKnownNeverSNaN(Quiet0)) {
+        Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0,
+                             Node->getFlags());
+      }
+      if (!DAG.isKnownNeverSNaN(Quiet1)) {
+        Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1,
+                             Node->getFlags());
+      }
+    }
+
+    return DAG.getNode(NewOp, dl, VT, Quiet0, Quiet1, Node->getFlags());
+  }
+
+  return SDValue();
+}
+
 SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
                                             SelectionDAG &DAG) const {
   SDLoc SL(LD);

Modified: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp (original)
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp Mon Oct 22 09:27:27 2018
@@ -600,6 +600,8 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
     setOperationAction(ISD::FMINNUM, VT, Expand);
     setOperationAction(ISD::FMAXNUM, VT, Expand);
+    setOperationAction(ISD::FMINNUM_IEEE, VT, Expand);
+    setOperationAction(ISD::FMAXNUM_IEEE, VT, Expand);
     setOperationAction(ISD::FMINNAN, VT, Expand);
     setOperationAction(ISD::FMAXNAN, VT, Expand);
     setOperationAction(ISD::FMAD, VT, Expand);

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Mon Oct 22 09:27:27 2018
@@ -552,6 +552,8 @@ static bool fnegFoldsIntoOp(unsigned Opc
   case ISD::FMAD:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case ISD::FSIN:
   case ISD::FTRUNC:
   case ISD::FRINT:
@@ -3512,6 +3514,10 @@ static unsigned inverseMinMax(unsigned O
     return ISD::FMINNUM;
   case ISD::FMINNUM:
     return ISD::FMAXNUM;
+  case ISD::FMAXNUM_IEEE:
+    return ISD::FMINNUM_IEEE;
+  case ISD::FMINNUM_IEEE:
+    return ISD::FMAXNUM_IEEE;
   case AMDGPUISD::FMAX_LEGACY:
     return AMDGPUISD::FMIN_LEGACY;
   case AMDGPUISD::FMIN_LEGACY:
@@ -3617,6 +3623,8 @@ SDValue AMDGPUTargetLowering::performFNe
   }
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINNUM_IEEE:
   case AMDGPUISD::FMAX_LEGACY:
   case AMDGPUISD::FMIN_LEGACY: {
     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Mon Oct 22 09:27:27 2018
@@ -360,6 +360,7 @@ enum NodeType : unsigned {
   SIN_HW,
   FMAX_LEGACY,
   FMIN_LEGACY,
+
   FMAX3,
   SMAX3,
   UMAX3,

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td Mon Oct 22 09:27:27 2018
@@ -152,8 +152,14 @@ def smax_oneuse : HasOneUseBinOp<smax>;
 def smin_oneuse : HasOneUseBinOp<smin>;
 def umax_oneuse : HasOneUseBinOp<umax>;
 def umin_oneuse : HasOneUseBinOp<umin>;
+
 def fminnum_oneuse : HasOneUseBinOp<fminnum>;
 def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+
+def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
+def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
+
+
 def and_oneuse : HasOneUseBinOp<and>;
 def or_oneuse : HasOneUseBinOp<or>;
 def xor_oneuse : HasOneUseBinOp<xor>;
@@ -837,3 +843,25 @@ class RsqPat<Instruction RsqInst, ValueT
   (AMDGPUrcp (fsqrt vt:$src)),
   (RsqInst $src)
 >;
+
+// Instructions which select to the same v_min_f*
+def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
+  [(fminnum_ieee node:$src0, node:$src1),
+   (fminnum node:$src0, node:$src1)]
+>;
+
+// Instructions which select to the same v_max_f*
+def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1),
+  [(fmaxnum_ieee node:$src0, node:$src1),
+   (fmaxnum node:$src0, node:$src1)]
+>;
+
+def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+  [(fminnum_ieee_oneuse node:$src0, node:$src1),
+   (fminnum_oneuse node:$src0, node:$src1)]
+>;
+
+def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+  [(fmaxnum_ieee_oneuse node:$src0, node:$src1),
+   (fmaxnum_oneuse node:$src0, node:$src1)]
+>;

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Mon Oct 22 09:27:27 2018
@@ -384,8 +384,20 @@ SITargetLowering::SITargetLowering(const
   if (Subtarget->hasBFE())
     setHasExtractBitsInsn(true);
 
-  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
+  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
+  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
+
+
+  // These are really only legal for ieee_mode functions. We should be avoiding
+  // them for functions that don't have ieee_mode enabled, so just say they are
+  // legal.
+  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
+  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
+  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+
 
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
@@ -474,8 +486,7 @@ SITargetLowering::SITargetLowering(const
     // F16 - VOP2 Actions.
     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
-    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+
     setOperationAction(ISD::FDIV, MVT::f16, Custom);
 
     // F16 - VOP3 Actions.
@@ -558,6 +569,17 @@ SITargetLowering::SITargetLowering(const
     // This isn't really legal, but this avoids the legalizer unrolling it (and
     // allows matching fneg (fabs x) patterns)
     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
+
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
   }
 
   if (Subtarget->hasVOP3PInsts()) {
@@ -575,8 +597,10 @@ SITargetLowering::SITargetLowering(const
     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
-    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
+
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
 
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
@@ -596,6 +620,10 @@ SITargetLowering::SITargetLowering(const
 
     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
+
     setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
@@ -634,6 +662,8 @@ SITargetLowering::SITargetLowering(const
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
   setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::FMINNUM_IEEE);
+  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
   setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SMIN);
   setTargetDAGCombine(ISD::SMAX);
@@ -3580,6 +3610,9 @@ SDValue SITargetLowering::LowerOperation
   case ISD::FNEG:
   case ISD::FCANONICALIZE:
     return splitUnaryVectorOp(Op, DAG);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+    return lowerFMINNUM_FMAXNUM(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -3590,10 +3623,10 @@ SDValue SITargetLowering::LowerOperation
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
-  case ISD::FMINNUM:
-  case ISD::FMAXNUM:
   case ISD::FADD:
   case ISD::FMUL:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
     return splitBinaryVectorOp(Op, DAG);
   }
   return SDValue();
@@ -4048,6 +4081,23 @@ SDValue SITargetLowering::lowerFP_ROUND(
   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
 }
 
+SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+  // FIXME: Assert during selection that this is only selected for
+  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
+  // mode functions, but this happens to be OK since it's only done in cases
+  // where there is known no sNaN.
+  if (IsIEEEMode)
+    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
+
+  if (VT == MVT::v4f16)
+    return splitBinaryVectorOp(Op, DAG);
+  return Op;
+}
+
 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
@@ -7521,37 +7571,32 @@ bool SITargetLowering::isCanonicalized(S
 
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case AMDGPUISD::CLAMP:
   case AMDGPUISD::FMED3:
   case AMDGPUISD::FMAX3:
   case AMDGPUISD::FMIN3: {
     // FIXME: Shouldn't treat the generic operations different based these.
-    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
-    if (IsIEEEMode) {
-      // snans will be quieted, so we only need to worry about denormals.
-      if (Subtarget->supportsMinMaxDenormModes() ||
-          denormalsEnabledForType(Op.getValueType()))
-        return true;
-
-      // Flushing may be required.
-      // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
-      // targets need to check their input recursively.
-      return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
-             isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
-    }
+    // However, we aren't really required to flush the result from
+    // minnum/maxnum.
 
+    // snans will be quieted, so we only need to worry about denormals.
     if (Subtarget->supportsMinMaxDenormModes() ||
-        denormalsEnabledForType(Op.getValueType())) {
-      // Only quieting may be necessary.
-      return DAG.isKnownNeverSNaN(Op.getOperand(0)) &&
-             DAG.isKnownNeverSNaN(Op.getOperand(1));
+        denormalsEnabledForType(Op.getValueType()))
+      return true;
+
+    // Flushing may be required.
+    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
+    // targets need to check their input recursively.
+
+    // FIXME: Does this apply with clamp? It's implemented with max.
+    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
+        return false;
     }
 
-    // Flushing and quieting may be necessary
-    // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
-    // needs to be quieted.
-    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
-           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+    return true;
   }
   case ISD::SELECT: {
     return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
@@ -7578,6 +7623,21 @@ bool SITargetLowering::isCanonicalized(S
     // Could be anything.
     return false;
 
+  case ISD::BITCAST: {
+    // Hack round the mess we make when legalizing extract_vector_elt
+    SDValue Src = Op.getOperand(0);
+    if (Src.getValueType() == MVT::i16 &&
+        Src.getOpcode() == ISD::TRUNCATE) {
+      SDValue TruncSrc = Src.getOperand(0);
+      if (TruncSrc.getValueType() == MVT::i32 &&
+          TruncSrc.getOpcode() == ISD::BITCAST &&
+          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
+        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
+      }
+    }
+
+    return false;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntrinsicID
       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -7603,7 +7663,6 @@ bool SITargetLowering::isCanonicalized(S
 }
 
 // Constant fold canonicalize.
-
 SDValue SITargetLowering::getCanonicalConstantFP(
   SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
   // Flush denormals to 0 if not enabled.
@@ -7699,18 +7758,40 @@ SDValue SITargetLowering::performFCanoni
     }
   }
 
+  unsigned SrcOpc = N0.getOpcode();
+
+  // If it's free to do so, push canonicalizes further up the source, which may
+  // find a canonical source.
+  //
+  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
+  // sNaNs.
+  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
+    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+    if (CRHS && N0.hasOneUse()) {
+      SDLoc SL(N);
+      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
+                                   N0.getOperand(0));
+      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
+      DCI.AddToWorklist(Canon0.getNode());
+
+      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
+    }
+  }
+
   return isCanonicalized(DAG, N0) ? N0 : SDValue();
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
   switch (Opc) {
   case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
     return AMDGPUISD::FMAX3;
   case ISD::SMAX:
     return AMDGPUISD::SMAX3;
   case ISD::UMAX:
     return AMDGPUISD::UMAX3;
   case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
     return AMDGPUISD::FMIN3;
   case ISD::SMIN:
     return AMDGPUISD::SMIN3;
@@ -7877,6 +7958,7 @@ SDValue SITargetLowering::performMinMaxC
 
   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
       (VT == MVT::f32 || VT == MVT::f64 ||
@@ -7995,7 +8077,9 @@ SDValue SITargetLowering::performExtract
     case ISD::SMIN:
     case ISD::SMAX:
     case ISD::FMAXNUM:
-    case ISD::FMINNUM: {
+    case ISD::FMINNUM:
+    case ISD::FMAXNUM_IEEE:
+    case ISD::FMINNUM_IEEE: {
       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                  Vec.getOperand(0), Idx);
       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
@@ -8595,13 +8679,15 @@ SDValue SITargetLowering::PerformDAGComb
     return performSetCCCombine(N, DCI);
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINNUM_IEEE:
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::UMAX:
   case ISD::UMIN:
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY: {
-    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+    if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
       return performMinMaxCombine(N, DCI);
     break;
@@ -9320,3 +9406,17 @@ bool SITargetLowering::denormalsEnabledF
     return false;
   }
 }
+
+bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+                                                    const SelectionDAG &DAG,
+                                                    bool SNaN,
+                                                    unsigned Depth) const {
+  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
+    if (Subtarget->enableDX10Clamp())
+      return true; // Clamped to 0.
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+
+  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
+                                                            SNaN, Depth);
+}

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Mon Oct 22 09:27:27 2018
@@ -110,6 +110,7 @@ private:
 
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
                              SelectionDAG &DAG) const;
@@ -346,6 +347,11 @@ public:
   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                        unsigned MaxDepth = 5) const;
   bool denormalsEnabledForType(EVT VT) const;
+
+  bool isKnownNeverNaNForTargetNode(SDValue Op,
+                                    const SelectionDAG &DAG,
+                                    bool SNaN = false,
+                                    unsigned Depth = 0) const override;
 };
 
 } // End namespace llvm

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Mon Oct 22 09:27:27 2018
@@ -1645,10 +1645,11 @@ def : IntMed3Pat<V_MED3_U32, umax, umax_
 // This matches 16 permutations of
 // max(min(x, y), min(max(x, y), z))
 class FPMed3Pat<ValueType vt,
+                //SDPatternOperator max, SDPatternOperator min,
                 Instruction med3Inst> : GCNPat<
-  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
-           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
@@ -1656,10 +1657,10 @@ class FPMed3Pat<ValueType vt,
 
 class FP16Med3Pat<ValueType vt,
                 Instruction med3Inst> : GCNPat<
-  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
-                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
-           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
-                                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
 >;

Modified: llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td Mon Oct 22 09:27:27 2018
@@ -393,8 +393,8 @@ defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i3
 defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
 defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>;
 defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
-defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
-defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
 defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
 defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
 defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
@@ -556,8 +556,8 @@ defm V_ADD_U16 : VOP2Inst <"v_add_u16",
 defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
 defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
 defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
-defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
-defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
 defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
 defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
 defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;

Modified: llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td Mon Oct 22 09:27:27 2018
@@ -295,8 +295,8 @@ let SchedRW = [WriteDoubleAdd] in {
 def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
 def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
 def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
-def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
-def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
 } // End SchedRW = [WriteDoubleAdd]
 
 let SchedRW = [WriteQuarterRate32] in {

Modified: llvm/trunk/lib/Target/AMDGPU/VOP3PInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP3PInstructions.td?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP3PInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP3PInstructions.td Mon Oct 22 09:27:27 2018
@@ -48,8 +48,8 @@ def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u
 
 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
 
 def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
 def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;

Modified: llvm/trunk/test/CodeGen/AMDGPU/clamp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/clamp.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/clamp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/clamp.ll Mon Oct 22 09:27:27 2018
@@ -74,7 +74,8 @@ define amdgpu_kernel void @v_clamp_negze
 
 ; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
 define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,8 +91,17 @@ define amdgpu_kernel void @v_clamp_negze
 
 ; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
+; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[QUIET_A]]
+; GCN-NOT: [[MAX]]
+; GCN-NOT: [[MED]]
+
+; SI: buffer_store_dword [[MED]]
+; SI: buffer_store_dword [[MAX]]
+
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]]
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]]
 define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@@ -406,8 +416,8 @@ define amdgpu_kernel void @v_clamp_f32_s
 
 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll Mon Oct 22 09:27:27 2018
@@ -455,14 +455,13 @@ define amdgpu_kernel void @test_fold_can
 }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
-; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
-; GFX9-NOT: v_max
-; GFX9-NOT: v_mul
-
-; VI-DENORM-NOT: v_max_f32
-; VI-DENORM-NOT: v_mul_f32
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]
 
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-NOT: v_max
+; GCN-NOT: v_mul
 
 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
@@ -476,15 +475,13 @@ define amdgpu_kernel void @test_fold_can
 }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
-; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-
-; GFX9-NOT: v_max
-; GFX9-NOT: v_mul
-
-
-; VI-DENORM-NOT: v_max
-; VI-DENORM-NOT: v_mul
 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
+
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
 
 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
@@ -530,13 +527,19 @@ define amdgpu_kernel void @test_fold_can
 }
 
 ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GFX9:  v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
+; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
+
+; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 
-; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
-; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
 
-; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
+; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
 
+; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
 
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
@@ -552,11 +555,14 @@ define amdgpu_kernel void @test_fold_can
 }
 
 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
-; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
-; VI-FLUSH:    v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
-; VI-FLUSH:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
+
+; VI-FLUSH:    v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; VI-FLUSH:    v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 
-; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
+; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
 
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
@@ -707,16 +713,21 @@ define amdgpu_kernel void @test_fold_can
 
 ; Need to quiet the nan with a separate instruction since it will be
 ; passed through the minnum.
+; FIXME: canonicalize doesn't work correctly without ieee_mode
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
+; GFX9-NOT: v0
+; GFX9-NOT: v1
 ; GFX9: v_min_f32_e32 v0, v0, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT: ; return to shader
 
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0
-; VI-DENORM: v_max_f32_e32 v0, v0, v0
+; VI-FLUSH: v_min_f32_e32 v0, v0, v1
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-NEXT: ; return
+
+; VI-DENORM-NOT: v0
+; VI-DENORM: v_min_f32_e32 v0, v0, v1
+; VI-DENORM-NEXT: ; return
 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
@@ -727,8 +738,14 @@ define amdgpu_ps float @test_fold_canoni
 ; GFX9: v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64
 
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1
+; VI-FLUSH: v_min_f32_e32 v0, v0, v1
+
+; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0
+; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1
+; VI-DENORM: v_min_f32_e32 v0, v0, v1
+
 ; VI-NEXT: s_setpc_b64
 define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll Mon Oct 22 09:27:27 2018
@@ -4,11 +4,14 @@
 declare double @llvm.maxnum.f64(double, double) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmax3_f64:
-; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
+; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
 ; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
+; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]]
+; SI: v_max_f64 [[QUIET_B:v\[[0-9]+:[0-9]+\]]], [[REGB]], [[REGB]]
+; SI: v_max_f64 [[MAX0:v\[[0-9]+:[0-9]+\]]], [[QUIET_A]], [[QUIET_B]]
+; SI: v_max_f64 [[QUIET_C:v\[[0-9]+:[0-9]+\]]], [[REGC]], [[REGC]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[MAX0]], [[QUIET_C]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
 define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll Mon Oct 22 09:27:27 2018
@@ -48,8 +48,11 @@ define amdgpu_kernel void @test_fmax3_ol
 ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
 
-; VI: v_max_f16_e32
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
+; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
+; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
+; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]]
 
 ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
 ; GCN: buffer_store_short [[RESULT]],
@@ -75,8 +78,11 @@ define amdgpu_kernel void @test_fmax3_ol
 ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
 
-; VI: v_max_f16_e32
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
+; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
+; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
+; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]]
 
 ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
 ; GCN: buffer_store_short [[RESULT]],
@@ -100,22 +106,25 @@ define amdgpu_kernel void @test_fmax3_ol
 ; SI-NEXT: v_max3_f32
 ; SI-NEXT: v_max3_f32
 
-; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_max_f16_e32 v0, v0, v1
-; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_max_f16_e32 v0, v2, v0
-; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_max_f16_e32 v0, v0, v3
-; VI: v_or_b32_e32 v0, v0, v1
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v0, v2, v0
+; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
 
-; GFX9: v_pk_max_f16
+; GFX9: s_waitcnt
 ; GFX9-NEXT: v_pk_max_f16
 ; GFX9-NEXT: v_pk_max_f16
-define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
+; GFX9-NEXT: v_pk_max_f16
+define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
 entry:
-  %max = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
-  %max1 = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
-  %res = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
+  %res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
   ret <2 x half> %res
 }
 
@@ -126,3 +135,4 @@ declare <2 x half> @llvm.maxnum.v2f16(<2
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll Mon Oct 22 09:27:27 2018
@@ -97,7 +97,7 @@ define <2 x half> @test_fmax_legacy_ugt_
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v1
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
@@ -178,7 +178,7 @@ define <3 x half> @test_fmax_legacy_ugt_
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
@@ -283,8 +283,8 @@ define <4 x half> @test_fmax_legacy_ugt_
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
@@ -437,10 +437,10 @@ define <8 x half> @test_fmax_legacy_ugt_
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v4
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v11
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10
-; VI-NNAN-NEXT:    v_or_b32_e32 v2, v2, v9
-; VI-NNAN-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll Mon Oct 22 09:27:27 2018
@@ -1,13 +1,22 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 
 ; EG: MAX
@@ -26,12 +35,16 @@ define amdgpu_kernel void @test_fmax_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]]
 ; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]]
 
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
 
 ; EG: MAX
@@ -52,9 +65,14 @@ define amdgpu_kernel void @test_fmax_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -72,9 +90,15 @@ define amdgpu_kernel void @test_fmax_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -92,9 +116,14 @@ define amdgpu_kernel void @test_fmax_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -112,9 +141,15 @@ define amdgpu_kernel void @test_fmax_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
@@ -132,12 +167,24 @@ define amdgpu_kernel void @test_fmax_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32:
-; GCN-SAFE: v_max_legacy_f32_e32
-; GCN-SAFE: v_max_legacy_f32_e32
-; GCN-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE-NOT: v_cmp
+; VI-SAFE-NOT: v_cndmask
+
 ; GCN-NONAN: v_max_f32_e32
 ; GCN-NONAN: v_max_f32_e32
 ; GCN-NONAN: v_max_f32_e32
+
+; GCN-NOT: v_max
 define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
@@ -153,8 +200,8 @@ define amdgpu_kernel void @test_fmax_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-NOT: v_max_
 ; GCN: v_cmp_gt_f32
 ; GCN-NEXT: v_cndmask_b32

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll Mon Oct 22 09:27:27 2018
@@ -1,14 +1,26 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}test_fmax_f32:
-; GCN: v_max_f32_e32
-define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float %b)
+; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_on:
+; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_fmax_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float %b) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
 }
 
+; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_off:
+; GCN: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmax_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
 ; GCN-LABEL: {{^}}test_fmax_v2f32:
 ; GCN: v_max_f32_e32
 ; GCN: v_max_f32_e32
@@ -158,38 +170,34 @@ define amdgpu_kernel void @constant_fold
   ret void
 }
 
-; GCN-LABEL: {{^}}fmax_var_immediate_f32:
+; GCN-LABEL: {{^}}fmax_var_immediate_f32_no_ieee:
 ; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float 2.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_var_immediate_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_immediate_var_f32:
+; GCN-LABEL: {{^}}fmax_immediate_var_f32_no_ieee:
 ; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float 2.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_immediate_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_var_literal_f32:
+; GCN-LABEL: {{^}}fmax_var_literal_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float 99.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_var_literal_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_literal_var_f32:
+; GCN-LABEL: {{^}}fmax_literal_var_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float 99.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_literal_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
+  ret float %val
 }
 
 ; GCN-LABEL: {{^}}test_func_fmax_v3f32:

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll Mon Oct 22 09:27:27 2018
@@ -95,22 +95,26 @@ define amdgpu_kernel void @test_fmin3_ol
 ; SI-NEXT: v_min3_f32
 ; SI-NEXT: v_min3_f32
 
-; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_min_f16_e32 v0, v0, v1
-; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_min_f16_e32 v0, v2, v0
-; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_min_f16_e32 v0, v0, v3
-; VI: v_or_b32_e32 v0, v0, v1
+; VI: s_waitcnt
+; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v1
+; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_min_f16_e32 v0, v2, v0
+; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
 
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
-define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT: v_pk_min_f16 v0, v2, v0
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
 entry:
-  %min = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
-  %min1 = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
-  %res = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
+  %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
+  %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
+  %res = call <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
   ret <2 x half> %res
 }
 
@@ -121,3 +125,4 @@ declare <2 x half> @llvm.minnum.v2f16(<2
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll Mon Oct 22 09:27:27 2018
@@ -1,9 +1,19 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,SI %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,SI %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,VI %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,VI %s
 
 ; GCN-LABEL: {{^}}min_fneg_select_regression_0:
-; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
-; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
+; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0
 define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ult float %a, 1.0
@@ -12,7 +22,14 @@ define amdgpu_ps float @min_fneg_select_
 }
 
 ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
-; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
 define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
   %fneg.a = fsub float -0.0, %a
@@ -22,9 +39,16 @@ define amdgpu_ps float @min_fneg_select_
 }
 
 ; GCN-LABEL: {{^}}max_fneg_select_regression_0:
-; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
-define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
+define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ugt float %a, 1.0
   %min.a = select i1 %cmp.a, float %fneg.a, float -1.0
@@ -32,9 +56,16 @@ define amdgpu_ps float @max_fneg_select_
 }
 
 ; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0:
-; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
-define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 {
+define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ugt float %a, -1.0
   %min.a = select i1 %cmp.a, float %fneg.a, float 1.0

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll Mon Oct 22 09:27:27 2018
@@ -98,7 +98,7 @@ define <2 x half> @test_fmin_legacy_ule_
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v1
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
@@ -179,7 +179,7 @@ define <3 x half> @test_fmin_legacy_ule_
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
@@ -284,8 +284,8 @@ define <4 x half> @test_fmin_legacy_ule_
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
@@ -438,10 +438,10 @@ define <8 x half> @test_fmin_legacy_ule_
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v4
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v11
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10
-; VI-NNAN-NEXT:    v_or_b32_e32 v2, v2, v9
-; VI-NNAN-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16:

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll Mon Oct 22 09:27:27 2018
@@ -1,5 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
@@ -10,8 +14,13 @@ declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32:
 ; EG: MIN *
-; GCN-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(1)* %out, <4 x float> %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = extractelement <4 x float> %reg0, i32 1
@@ -22,13 +31,17 @@ define amdgpu_kernel void @s_test_fmin_l
 }
 
 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 
 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
 
-; GCN-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
-; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
+; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
+
+; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]]
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]]
 
+; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -36,13 +49,19 @@ define amdgpu_kernel void @s_test_fmin_l
   ret void
 }
 
+; Nsz also needed
+; FIXME: Should separate tests
 ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src:
-; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 
 ; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[A]], 1.0
 ; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[B]], 2.0
 
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
+; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1)* %out, float %a, float %b) #0 {
   %a.nnan = fadd nnan float %a, 1.0
@@ -54,9 +73,14 @@ define amdgpu_kernel void @s_test_fmin_l
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -73,9 +97,14 @@ define amdgpu_kernel void @test_fmin_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_le_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -92,9 +121,14 @@ define amdgpu_kernel void @test_fmin_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -111,9 +145,14 @@ define amdgpu_kernel void @test_fmin_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -130,9 +169,14 @@ define amdgpu_kernel void @test_fmin_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -149,10 +193,15 @@ define amdgpu_kernel void @test_fmin_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32:
-; GCN: buffer_load_dwordx2
-; GCN: buffer_load_dwordx2
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
 
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
@@ -171,13 +220,24 @@ define amdgpu_kernel void @test_fmin_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32:
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE-NOT: v_min_
+
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE-NOT: v_cmp
+; VI-SAFE-NOT: v_cndmask
 
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
+; GCN-NONAN-NOT: v_min_
 define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
@@ -193,8 +253,8 @@ define amdgpu_kernel void @test_fmin_leg
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-NOT: v_min
 ; GCN: v_cmp_le_f32
 ; GCN-NEXT: v_cndmask_b32

Modified: llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll Mon Oct 22 09:27:27 2018
@@ -7,15 +7,35 @@ declare <4 x double> @llvm.minnum.v4f64(
 declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
 declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
 
-; FUNC-LABEL: @test_fmin_f64
-; SI: v_min_f64
-define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+; FUNC-LABEL: {{^}}test_fmin_f64_ieee:
+; SI: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]]
+; SI: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]]
+; SI-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]]
+; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]]
+define amdgpu_kernel void @test_fmin_f64_ieee([8 x i32], double %a, [8 x i32], double %b) nounwind {
+  %val = call double @llvm.minnum.f64(double %a, double %b) #0
+  store double %val, double addrspace(1)* undef, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_f64_no_ieee:
+; SI: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]]
+; SI: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]]
+; SI-NOT: [[VAL0]]
+; SI-NOT: [[VAL1]]
+; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]]
+; SI-NOT: [[RESULT]]
+; SI: ds_write_b64 v{{[0-9]+}}, [[RESULT]]
+define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind {
+  %a = load volatile double, double addrspace(3)* undef
+  %b = load volatile double, double addrspace(3)* undef
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
-  store double %val, double addrspace(1)* %out, align 8
+  store volatile double %val, double addrspace(3)* undef
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v2f64
+; FUNC-LABEL: {{^}}test_fmin_v2f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
@@ -24,7 +44,7 @@ define amdgpu_kernel void @test_fmin_v2f
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v4f64
+; FUNC-LABEL: {{^}}test_fmin_v4f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
@@ -35,7 +55,7 @@ define amdgpu_kernel void @test_fmin_v4f
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v8f64
+; FUNC-LABEL: {{^}}test_fmin_v8f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
@@ -50,7 +70,7 @@ define amdgpu_kernel void @test_fmin_v8f
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v16f64
+; FUNC-LABEL: {{^}}test_fmin_v16f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64

Modified: llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll Mon Oct 22 09:27:27 2018
@@ -1,14 +1,45 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}test_fmin_f32:
-; GCN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float %b)
+; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_on:
+; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_fmin_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float %b) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
 }
 
+; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_on:
+; GCN: s_waitcnt
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: s_setpc_b64
+define float @test_fmin_nnan_f32_ieee_mode_on(float %a, float %b) #0 {
+  %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
+; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_off:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmin_nnan_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
+; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_off:
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmin_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
 ; GCN-LABEL: {{^}}test_fmin_v2f32:
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
@@ -147,38 +178,34 @@ define amdgpu_kernel void @constant_fold
   ret void
 }
 
-; GCN-LABEL: {{^}}fmin_var_immediate_f32:
-; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float 2.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+; GCN-LABEL: {{^}}fmin_var_immediate_f32_no_ieee:
+; GCN: v_min_f32_e32 v0, 2.0, v0
+define amdgpu_ps float @fmin_var_immediate_f32_no_ieee(float %a) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float 2.0) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_immediate_var_f32:
+; GCN-LABEL: {{^}}fmin_immediate_var_f32_no_ieee:
 ; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float 2.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_immediate_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float 2.0, float %a) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_var_literal_f32:
+; GCN-LABEL: {{^}}fmin_var_literal_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float 99.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_var_literal_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float 99.0) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_literal_var_f32:
+; GCN-LABEL: {{^}}fmin_literal_var_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float 99.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_literal_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float 99.0, float %a) #1
+  ret float %val
 }
 
 ; GCN-LABEL: {{^}}test_func_fmin_v3f32:

Modified: llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll Mon Oct 22 09:27:27 2018
@@ -396,12 +396,14 @@ define amdgpu_kernel void @v_fneg_mul_mu
 ; fminnum tests
 ; --------------------------------------------------------------------------------
 
-; GCN-LABEL: {{^}}v_fneg_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -415,11 +417,23 @@ define amdgpu_kernel void @v_fneg_minnum
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -431,11 +445,22 @@ define amdgpu_kernel void @v_fneg_self_m
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, -v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %a)
+  %min.fneg = fsub float -0.0, %min
+  ret float %min.fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -447,11 +472,22 @@ define amdgpu_kernel void @v_fneg_posk_m
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, -4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -463,6 +499,16 @@ define amdgpu_kernel void @v_fneg_negk_m
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, 4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
 ; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
@@ -479,11 +525,12 @@ define amdgpu_kernel void @v_fneg_0_minn
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -498,10 +545,11 @@ define amdgpu_kernel void @v_fneg_neg0_m
 ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
 ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
-; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
+; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
+; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
 
-; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -520,10 +568,11 @@ define amdgpu_kernel void @v_fneg_inv2pi
 ; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
 ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e22f983
-; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
+; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
+; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]
 
-; VI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
+; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
@@ -545,7 +594,8 @@ define amdgpu_kernel void @v_fneg_neg_in
 ; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
 
-; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
+; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]
 
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -568,7 +618,8 @@ define amdgpu_kernel void @v_fneg_inv2pi
 ; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
 
-; VI: v_max_f16_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
+; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
 
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
@@ -588,7 +639,8 @@ define amdgpu_kernel void @v_fneg_neg_in
 
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
-; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; VI: v_min_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[A]], 0.15915494
 ; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]
@@ -611,9 +663,11 @@ define amdgpu_kernel void @v_fneg_inv2pi
 
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
-; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], 0.15915494
+; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494
 
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
@@ -638,13 +692,14 @@ define amdgpu_ps float @v_fneg_neg0_minn
   ret float %fneg
 }
 
-; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -660,15 +715,16 @@ define amdgpu_kernel void @v_fneg_0_minn
 }
 
 ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 
+; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
 
-; SI: v_max_f32_e64 [[MIN:v[0-9]+]], -[[A]], [[K]]
+; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
 ; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]
 
-; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -687,14 +743,29 @@ define amdgpu_kernel void @v_fneg_inv2pi
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float 0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  %mul = fmul float %fneg, %b
+  ret float %mul
+}
+
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
-define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -710,16 +781,34 @@ define amdgpu_kernel void @v_fneg_minnum
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %min
+  %use1 = fmul float %min, 4.0
+  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
+  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
+  ret <2 x float> %ins1
+}
+
 ; --------------------------------------------------------------------------------
 ; fmaxnum tests
 ; --------------------------------------------------------------------------------
 
-; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -727,60 +816,104 @@ define amdgpu_kernel void @v_fneg_maxnum
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
   %b = load volatile float, float addrspace(1)* %b.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %a)
-  %min.fneg = fsub float -0.0, %min
-  store float %min.fneg, float addrspace(1)* %out.gep
+  %max = call float @llvm.maxnum.f32(float %a, float %a)
+  %max.fneg = fsub float -0.0, %max
+  store float %max.fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, -v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %a)
+  %max.fneg = fsub float -0.0, %max
+  ret float %max.fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float 4.0, float %a)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, -4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float -4.0, float %a)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, 4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
 ; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
@@ -797,11 +930,12 @@ define amdgpu_kernel void @v_fneg_0_maxn
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -813,13 +947,24 @@ define amdgpu_kernel void @v_fneg_neg0_m
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -834,14 +979,29 @@ define amdgpu_kernel void @v_fneg_0_maxn
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  %mul = fmul float %fneg, %b
+  ret float %mul
+}
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
-define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -849,14 +1009,29 @@ define amdgpu_kernel void @v_fneg_maxnum
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
   %b = load volatile float, float addrspace(1)* %b.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %min
-  %use1 = fmul float %min, 4.0
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  %use1 = fmul float %max, 4.0
   store volatile float %fneg, float addrspace(1)* %out
   store volatile float %use1, float addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  %use1 = fmul float %max, 4.0
+  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
+  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
+  ret <2 x float> %ins1
+}
+
 ; --------------------------------------------------------------------------------
 ; fma tests
 ; --------------------------------------------------------------------------------

Modified: llvm/trunk/test/CodeGen/AMDGPU/known-never-snan.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/known-never-snan.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/known-never-snan.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/known-never-snan.ll Mon Oct 22 09:27:27 2018
@@ -99,8 +99,7 @@ define float @v_test_known_not_snan_minn
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -110,14 +109,46 @@ define float @v_test_known_not_snan_minn
   ret float %med
 }
 
+define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
+; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %b.nsnan = fadd float %b, 1.0
+  %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nsnan)
+  %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+  ret float %med
+}
+
+define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
+; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %a.nsnan = fadd float %a, 1.0
+  %known.not.snan = call float @llvm.minnum.f32(float %a.nsnan, float %b)
+  %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+  ret float %med
+}
+
 define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nnan.add)
@@ -131,9 +162,9 @@ define float @v_minnum_possible_nan_rhs_
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b)
@@ -148,8 +179,8 @@ define float @v_test_known_not_snan_maxn
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -164,8 +195,9 @@ define float @v_maxnum_possible_nan_lhs_
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.maxnum.f32(float %a, float %b.nnan.add)
@@ -179,8 +211,9 @@ define float @v_maxnum_possible_nan_rhs_
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b)
@@ -215,8 +248,8 @@ define float @v_select_possible_nan_lhs_
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %cmp = icmp eq i32 %c, 0
@@ -233,8 +266,8 @@ define float @v_select_possible_nan_rhs_
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %cmp = icmp eq i32 %c, 0
@@ -494,6 +527,7 @@ define float @v_test_known_not_snan_fmed
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_med3_f32 v0, v0, v1, v2
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c)
@@ -507,8 +541,7 @@ define float @v_test_known_not_snan_fmin
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min0 = call float @llvm.minnum.f32(float %a, float %b)
   %known.not.snan = call float @llvm.minnum.f32(float %min0, float %c)

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll Mon Oct 22 09:27:27 2018
@@ -1,23 +1,91 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
 declare half @llvm.maxnum.f16(half %a, half %b)
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
 declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
 declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
 
-; GCN-LABEL: {{^}}maxnum_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16(
+; SI-LABEL: maxnum_f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s8, s4
+; SI-NEXT:    s_mov_b32 s9, s5
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_max_f16_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -29,15 +97,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_f16_imm_a:
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16_imm_a(
+; SI-LABEL: maxnum_f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -47,15 +165,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_f16_imm_b:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16_imm_b(
+; SI-LABEL: maxnum_f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -65,34 +233,79 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI:    v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
-
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_v2f16(
+; SI-LABEL: maxnum_v2f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT:    s_lshr_b32 s0, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v2, v3, v2
+; SI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s5, s5
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -104,29 +317,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
-; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI-NOT: and
-; SIVI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @maxnum_v2f16_imm_a(
+; SI-LABEL: maxnum_v2f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -136,31 +384,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
-; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @maxnum_v2f16_imm_b(
+; SI-LABEL: maxnum_v2f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -171,10 +452,94 @@ entry:
 }
 
 ; FIXME: Scalarize with undef half
-; GCN-LABEL: {{^}}maxnum_v3f16:
-; GFX9: v_pk_max_f16
-; GFX9: v_pk_max_f16
 define amdgpu_kernel void @maxnum_v3f16(
+; SI-LABEL: maxnum_v3f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    s_lshr_b32 s4, s8, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v2, v3, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, v0, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v3f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s6, s6, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s6, s6
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e32 v1, v2, v1
+; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v3f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <3 x half> addrspace(1)* %r,
     <3 x half> addrspace(1)* %a,
     <3 x half> addrspace(1)* %b) {
@@ -186,13 +551,107 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
 define amdgpu_kernel void @maxnum_v4f16(
+; SI-LABEL: maxnum_v4f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    s_lshr_b32 s4, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    s_lshr_b32 s4, s7, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s4, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v3, v3, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, v1, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_max_f32_e32 v2, v2, v5
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_max_f32_e32 v0, v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v4f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v0, s7, s7
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s7, s7, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s6, 16
+; VI-NEXT:    v_max_f16_e32 v0, v2, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v3, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v4f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %a,
     <4 x half> addrspace(1)* %b) {
@@ -204,28 +663,87 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmax_v4f16_imm_a:
-; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
-; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800
-
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], [[K0]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], [[K1]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
-
-; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
-; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400
-
-; VI-DAG: v_max_f16_sdwa v[[MAX_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[MAX_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
-; VI-DAG: v_max_f16_sdwa v[[MAX_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[MAX_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]
-
-; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MAX_LO_LO]], v[[MAX_LO_HI]]
-; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MAX_HI_LO]], v[[MAX_HI_HI]]
-
-; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
 define amdgpu_kernel void @fmax_v4f16_imm_a(
+; SI-LABEL: fmax_v4f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s5, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: fmax_v4f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e64 v3, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fmax_v4f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
+; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %b) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.minnum.f16.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.minnum.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.minnum.f16.ll Mon Oct 22 09:27:27 2018
@@ -1,23 +1,91 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
 declare half @llvm.minnum.f16(half %a, half %b)
 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
 declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
 declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
 
-; GCN-LABEL: {{^}}minnum_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
-define amdgpu_kernel void @minnum_f16(
+define amdgpu_kernel void @minnum_f16_ieee(
+; SI-LABEL: minnum_f16_ieee:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s8, s4
+; SI-NEXT:    s_mov_b32 s9, s5
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_ieee:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_ieee:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -29,15 +97,88 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_f16_imm_a:
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
+define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) {
+; SI-LABEL: minnum_f16_no_ieee:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: minnum_f16_no_ieee:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: minnum_f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %r.val = call half @llvm.minnum.f16(half %a, half %b)
+  ret half %r.val
+}
+
 define amdgpu_kernel void @minnum_f16_imm_a(
+; SI-LABEL: minnum_f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -47,15 +188,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_f16_imm_b:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @minnum_f16_imm_b(
+; SI-LABEL: minnum_f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -65,33 +256,79 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI:    v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
-
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-define amdgpu_kernel void @minnum_v2f16(
+define amdgpu_kernel void @minnum_v2f16_ieee(
+; SI-LABEL: minnum_v2f16_ieee:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT:    s_lshr_b32 s0, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v2, v3, v2
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_ieee:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s5, s5
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_ieee:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -103,29 +340,94 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
-; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI-NOT: and
-; SIVI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
+define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) {
+; SI-LABEL: minnum_v2f16_no_ieee:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_min_f32_e32 v0, v0, v2
+; SI-NEXT:    v_min_f32_e32 v1, v1, v3
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: minnum_v2f16_no_ieee:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: minnum_v2f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %r.val
+}
 
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @minnum_v2f16_imm_a(
+; SI-LABEL: minnum_v2f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -135,31 +437,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
-; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @minnum_v2f16_imm_b(
+; SI-LABEL: minnum_v2f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -170,10 +505,94 @@ entry:
 }
 
 ; FIXME: Scalarize with undef half
-; GCN-LABEL: {{^}}minnum_v3f16:
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
 define amdgpu_kernel void @minnum_v3f16(
+; SI-LABEL: minnum_v3f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    s_lshr_b32 s4, s8, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v2, v3, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, v0, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v3f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s6, s6, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s6, s6
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_min_f16_e32 v1, v2, v1
+; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v3f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
+; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <3 x half> addrspace(1)* %r,
     <3 x half> addrspace(1)* %a,
     <3 x half> addrspace(1)* %b) {
@@ -185,13 +604,107 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
 define amdgpu_kernel void @minnum_v4f16(
+; SI-LABEL: minnum_v4f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    s_lshr_b32 s4, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    s_lshr_b32 s4, s7, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s4, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v3, v3, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, v1, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_min_f32_e32 v2, v2, v5
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_min_f32_e32 v0, v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v4f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v0, s7, s7
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s7, s7, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s6, 16
+; VI-NEXT:    v_min_f16_e32 v0, v2, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v3, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v4f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %a,
     <4 x half> addrspace(1)* %b) {
@@ -203,28 +716,87 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmin_v4f16_imm_a:
-; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
-; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800
-
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], [[K0]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], [[K1]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
-
-; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
-; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400
-
-; VI-DAG: v_min_f16_sdwa v[[MIN_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[MIN_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
-; VI-DAG: v_min_f16_sdwa v[[MIN_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[MIN_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]
-
-; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MIN_LO_LO]], v[[MIN_LO_HI]]
-; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MIN_HI_LO]], v[[MIN_HI_HI]]
-
-; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
 define amdgpu_kernel void @fmin_v4f16_imm_a(
+; SI-LABEL: fmin_v4f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s5, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: fmin_v4f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e64 v3, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fmin_v4f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
+; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %b) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/reduction.ll?rev=344914&r1=344913&r2=344914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/reduction.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/reduction.ll Mon Oct 22 09:27:27 2018
@@ -434,12 +434,23 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
-; GFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-
-; VI:      v_max_f16_sdwa
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -451,12 +462,24 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}reduction_minnum_v4f16:
-; GFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-
-; VI:      v_min_f16_sdwa
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
+
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_minnum_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -467,13 +490,36 @@ entry:
   ret half %res
 }
 
+; FIXME: Need to preserve fast math flags when fmaxnum is matched
+; directly from the IR to avoid unnecessary quieting.
+
 ; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
-; GFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; XGFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
-; VI:      v_max_f16_sdwa
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; XVI: s_waitcnt
+; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; XVI-NEXT: v_max_f16_e32 v0, v0, v1
+; XVI-NEXT: v_max_f16_e32 v0, v0, v2
+; XVI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -486,13 +532,37 @@ entry:
   ret half %res
 }
 
+; FIXME: Need to preserve fast math flags when fminnum is matched
+; directly from the IR to avoid unnecessary quieting.
+
 ; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
-; GFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; XGFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
-; VI:      v_min_f16_sdwa
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+; XVI: s_waitcnt
+; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; XVI-NEXT: v_min_f16_e32 v0, v0, v1
+; XVI-NEXT: v_min_f16_e32 v0, v0, v2
+; XVI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
+
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>




More information about the llvm-commits mailing list