[llvm] Revert add GenericFloatingPointPredicateUtils #140254 (PR #141257)
Tim Gymnich via llvm-commits
llvm-commits at lists.llvm.org
Fri May 23 10:02:25 PDT 2025
https://github.com/tgymnich created https://github.com/llvm/llvm-project/pull/141257
(No pull request description was provided.)
>From 9247a2c96393b1cca3fcf3d5c48b48e6890c1e5c Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Fri, 23 May 2025 14:47:18 +0000
Subject: [PATCH 1/5] guard against non-virtual registers
---
llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 67b1a449f8483..f05a291defff6 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -693,6 +693,9 @@ static bool outputDenormalIsIEEEOrPosZero(const MachineFunction &MF, LLT Ty) {
void GISelValueTracking::computeKnownFPClass(Register R, KnownFPClass &Known,
FPClassTest InterestedClasses,
unsigned Depth) {
+ if (!R.isVirtual())
+ return;
+
LLT Ty = MRI.getType(R);
APInt DemandedElts =
Ty.isFixedVector() ? APInt::getAllOnes(Ty.getNumElements()) : APInt(1, 1);
@@ -736,6 +739,9 @@ void GISelValueTracking::computeKnownFPClass(Register R,
assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
+ if (!R.isVirtual())
+ return;
+
MachineInstr &MI = *MRI.getVRegDef(R);
unsigned Opcode = MI.getOpcode();
LLT DstTy = MRI.getType(R);
>From 3669f1f8d920311d2dab86e2e89da0bbb841a4b3 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Fri, 23 May 2025 16:33:06 +0000
Subject: [PATCH 2/5] replace isKnownNeverNaN impl
---
llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 6 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 4 +-
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 4 +-
llvm/lib/CodeGen/GlobalISel/Utils.cpp | 88 ++-----------------
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 2 +-
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 10 +--
6 files changed, 20 insertions(+), 94 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index 684a9bf554fb1..503f61216d9e6 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -336,12 +336,12 @@ bool isKnownToBeAPowerOfTwo(Register Val, const MachineRegisterInfo &MRI,
/// Returns true if \p Val can be assumed to never be a NaN. If \p SNaN is true,
/// this returns if \p Val can be assumed to never be a signaling NaN.
-bool isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
+bool isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, GISelValueTracking *ValueTracking,
bool SNaN = false);
/// Returns true if \p Val can be assumed to never be a signaling NaN.
-inline bool isKnownNeverSNaN(Register Val, const MachineRegisterInfo &MRI) {
- return isKnownNeverNaN(Val, MRI, true);
+inline bool isKnownNeverSNaN(Register Val, const MachineRegisterInfo &MRI, GISelValueTracking *ValueTracking) {
+ return isKnownNeverNaN(Val, MRI, ValueTracking, true);
}
Align inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b1e851183de0d..8952226ae7f1e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6519,8 +6519,8 @@ unsigned CombinerHelper::getFPMinMaxOpcForSelect(
CombinerHelper::SelectPatternNaNBehaviour
CombinerHelper::computeRetValAgainstNaN(Register LHS, Register RHS,
bool IsOrderedComparison) const {
- bool LHSSafe = isKnownNeverNaN(LHS, MRI);
- bool RHSSafe = isKnownNeverNaN(RHS, MRI);
+ bool LHSSafe = isKnownNeverNaN(LHS, MRI, VT);
+ bool RHSSafe = isKnownNeverNaN(RHS, MRI, VT);
// Completely unsafe.
if (!LHSSafe && !RHSSafe)
return SelectPatternNaNBehaviour::NOT_APPLICABLE;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 7b18a98d7f3ca..e242df04a5d80 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8179,10 +8179,10 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
// Note this must be done here, and not as an optimization combine in the
// absence of a dedicate quiet-snan instruction as we're using an
// omni-purpose G_FCANONICALIZE.
- if (!isKnownNeverSNaN(Src0, MRI))
+ if (!isKnownNeverSNaN(Src0, MRI, VT))
Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
- if (!isKnownNeverSNaN(Src1, MRI))
+ if (!isKnownNeverSNaN(Src1, MRI, VT))
Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 64af7a57e8d12..227fac4007463 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CodeGenCommonISel.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
@@ -806,88 +807,13 @@ llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
return FoldedElements;
}
-bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
+bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, GISelValueTracking *VT,
bool SNaN) {
- const MachineInstr *DefMI = MRI.getVRegDef(Val);
- if (!DefMI)
- return false;
-
- const TargetMachine& TM = DefMI->getMF()->getTarget();
- if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath)
- return true;
-
- // If the value is a constant, we can obviously see if it is a NaN or not.
- if (const ConstantFP *FPVal = getConstantFPVRegVal(Val, MRI)) {
- return !FPVal->getValueAPF().isNaN() ||
- (SNaN && !FPVal->getValueAPF().isSignaling());
- }
-
- if (DefMI->getOpcode() == TargetOpcode::G_BUILD_VECTOR) {
- for (const auto &Op : DefMI->uses())
- if (!isKnownNeverNaN(Op.getReg(), MRI, SNaN))
- return false;
- return true;
- }
-
- switch (DefMI->getOpcode()) {
- default:
- break;
- case TargetOpcode::G_FADD:
- case TargetOpcode::G_FSUB:
- case TargetOpcode::G_FMUL:
- case TargetOpcode::G_FDIV:
- case TargetOpcode::G_FREM:
- case TargetOpcode::G_FSIN:
- case TargetOpcode::G_FCOS:
- case TargetOpcode::G_FTAN:
- case TargetOpcode::G_FACOS:
- case TargetOpcode::G_FASIN:
- case TargetOpcode::G_FATAN:
- case TargetOpcode::G_FATAN2:
- case TargetOpcode::G_FCOSH:
- case TargetOpcode::G_FSINH:
- case TargetOpcode::G_FTANH:
- case TargetOpcode::G_FMA:
- case TargetOpcode::G_FMAD:
- if (SNaN)
- return true;
-
- // TODO: Need isKnownNeverInfinity
- return false;
- case TargetOpcode::G_FMINNUM_IEEE:
- case TargetOpcode::G_FMAXNUM_IEEE: {
- if (SNaN)
- return true;
- // This can return a NaN if either operand is an sNaN, or if both operands
- // are NaN.
- return (isKnownNeverNaN(DefMI->getOperand(1).getReg(), MRI) &&
- isKnownNeverSNaN(DefMI->getOperand(2).getReg(), MRI)) ||
- (isKnownNeverSNaN(DefMI->getOperand(1).getReg(), MRI) &&
- isKnownNeverNaN(DefMI->getOperand(2).getReg(), MRI));
- }
- case TargetOpcode::G_FMINNUM:
- case TargetOpcode::G_FMAXNUM: {
- // Only one needs to be known not-nan, since it will be returned if the
- // other ends up being one.
- return isKnownNeverNaN(DefMI->getOperand(1).getReg(), MRI, SNaN) ||
- isKnownNeverNaN(DefMI->getOperand(2).getReg(), MRI, SNaN);
- }
- }
-
- if (SNaN) {
- // FP operations quiet. For now, just handle the ones inserted during
- // legalization.
- switch (DefMI->getOpcode()) {
- case TargetOpcode::G_FPEXT:
- case TargetOpcode::G_FPTRUNC:
- case TargetOpcode::G_FCANONICALIZE:
- return true;
- default:
- return false;
- }
- }
-
- return false;
+ KnownFPClass FPClass = VT->computeKnownFPClass(Val, fcNan);
+ if (SNaN)
+ return FPClass.isKnownNever(fcSNan);
+
+ return FPClass.isKnownNeverNaN();
}
Align llvm::inferAlignFromPtrInfo(MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 18a948d68e97b..2a6073c20c73b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -859,7 +859,7 @@ class NeverNaNPats<dag ops, list<dag> frags> : PatFrags<ops, frags> {
return CurDAG->isKnownNeverNaN(SDValue(N,0));
}];
let GISelPredicateCode = [{
- return isKnownNeverNaN(MI.getOperand(0).getReg(), MRI);
+ return isKnownNeverNaN(MI.getOperand(0).getReg(), MRI, VT);
}];
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index f08502fb3d928..344b580773c9d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -261,7 +261,7 @@ bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
// nodes(max/min) have same behavior when one input is NaN and other isn't.
// Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
// also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
- if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) {
+ if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI, VT)) {
// Don't fold single use constant that can't be inlined.
if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) &&
(!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) {
@@ -291,8 +291,8 @@ bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI,
// For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
// to 0.0 requires dx10_clamp = true.
if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) &&
- isKnownNeverSNaN(Val, MRI)) ||
- isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) {
+ isKnownNeverSNaN(Val, MRI, VT)) ||
+ isKnownNeverNaN(MI.getOperand(0).getReg(), MRI, VT)) {
Reg = Val;
return true;
}
@@ -338,9 +338,9 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
// no NaN inputs. Most often MI is marked with nnan fast math flag.
// For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
// when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
- if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) ||
+ if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI, VT) ||
(getIEEE() && getDX10Clamp() &&
- (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) {
+ (isKnownNeverSNaN(Val, MRI, VT) || isOp3Zero()))) {
Reg = Val;
return true;
}
>From fe2c76ce3527b621a7d6184996311e7645a112a9 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Fri, 23 May 2025 16:35:11 +0000
Subject: [PATCH 3/5] fix bug in matchFPMed3ToClamp
---
llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 344b580773c9d..7279fbe474212 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -329,6 +329,8 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
Register Val = Src0->getOperand(0).getReg();
auto isOp3Zero = [&]() {
+ if (MI.getNumOperands() < 5)
+ return false;
MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT)
return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0);
>From db9e8f3be3e3e28ac337f311e97fc8660b2742db Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Fri, 23 May 2025 16:35:41 +0000
Subject: [PATCH 4/5] fix fp semantics lookup for vectors
---
.../CodeGen/GlobalISel/GISelValueTracking.cpp | 36 +++++++++----------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index f05a291defff6..f1e77d813f0df 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -1030,7 +1030,7 @@ void GISelValueTracking::computeKnownFPClass(Register R,
//
if ((Known.KnownFPClasses & fcZero) != fcNone &&
!Known.isKnownNeverSubnormal()) {
- DenormalMode Mode = MF->getDenormalMode(getFltSemanticForLLT(DstTy));
+ DenormalMode Mode = MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType()));
if (Mode != DenormalMode::getIEEE())
Known.KnownFPClasses |= fcZero;
}
@@ -1092,8 +1092,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
// If the parent function flushes denormals, the canonical output cannot
// be a denormal.
- LLT Ty = MRI.getType(Val);
- const fltSemantics &FPType = getFltSemanticForLLT(Ty.getScalarType());
+ LLT Ty = MRI.getType(Val).getScalarType();
+ const fltSemantics &FPType = getFltSemanticForLLT(Ty);
DenormalMode DenormMode = MF->getDenormalMode(FPType);
if (DenormMode == DenormalMode::getIEEE()) {
if (KnownSrc.isKnownNever(fcPosZero))
@@ -1203,8 +1203,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
if (KnownSrc.isKnownNeverNaN() && KnownSrc.cannotBeOrderedLessThanZero())
Known.knownNot(fcNan);
- LLT Ty = MRI.getType(Val);
- const fltSemantics &FltSem = getFltSemanticForLLT(Ty.getScalarType());
+ LLT Ty = MRI.getType(Val).getScalarType();
+ const fltSemantics &FltSem = getFltSemanticForLLT(Ty);
DenormalMode Mode = MF->getDenormalMode(FltSem);
if (KnownSrc.isKnownNeverLogicalZero(Mode))
@@ -1323,18 +1323,18 @@ void GISelValueTracking::computeKnownFPClass(Register R,
// (fadd x, 0.0) is guaranteed to return +0.0, not -0.0.
if ((KnownLHS.isKnownNeverLogicalNegZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy))) ||
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType()))) ||
KnownRHS.isKnownNeverLogicalNegZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) &&
// Make sure output negative denormal can't flush to -0
outputDenormalIsIEEEOrPosZero(*MF, DstTy))
Known.knownNot(fcNegZero);
} else {
// Only fsub -0, +0 can return -0
if ((KnownLHS.isKnownNeverLogicalNegZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy))) ||
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType()))) ||
KnownRHS.isKnownNeverLogicalPosZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) &&
// Make sure output negative denormal can't flush to -0
outputDenormalIsIEEEOrPosZero(*MF, DstTy))
Known.knownNot(fcNegZero);
@@ -1381,10 +1381,10 @@ void GISelValueTracking::computeKnownFPClass(Register R,
if ((KnownRHS.isKnownNeverInfinity() ||
KnownLHS.isKnownNeverLogicalZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) &&
(KnownLHS.isKnownNeverInfinity() ||
KnownRHS.isKnownNeverLogicalZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy)))))
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))))
Known.knownNot(fcNan);
break;
@@ -1437,9 +1437,9 @@ void GISelValueTracking::computeKnownFPClass(Register R,
(KnownLHS.isKnownNeverInfinity() ||
KnownRHS.isKnownNeverInfinity()) &&
((KnownLHS.isKnownNeverLogicalZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) ||
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) ||
(KnownRHS.isKnownNeverLogicalZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy)))))) {
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))))) {
Known.knownNot(fcNan);
}
@@ -1453,7 +1453,7 @@ void GISelValueTracking::computeKnownFPClass(Register R,
if (KnownLHS.isKnownNeverNaN() && KnownRHS.isKnownNeverNaN() &&
KnownLHS.isKnownNeverInfinity() &&
KnownRHS.isKnownNeverLogicalZero(
- MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) {
+ MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) {
Known.knownNot(fcNan);
}
@@ -1478,10 +1478,10 @@ void GISelValueTracking::computeKnownFPClass(Register R,
// Infinity, nan and zero propagate from source.
computeKnownFPClass(R, DemandedElts, InterestedClasses, Known, Depth + 1);
- LLT DstTy = MRI.getType(Dst);
- const fltSemantics &DstSem = getFltSemanticForLLT(DstTy.getScalarType());
- LLT SrcTy = MRI.getType(Src);
- const fltSemantics &SrcSem = getFltSemanticForLLT(SrcTy.getScalarType());
+ LLT DstTy = MRI.getType(Dst).getScalarType();
+ const fltSemantics &DstSem = getFltSemanticForLLT(DstTy);
+ LLT SrcTy = MRI.getType(Src).getScalarType();
+ const fltSemantics &SrcSem = getFltSemanticForLLT(SrcTy);
// All subnormal inputs should be in the normal range in the result type.
if (APFloat::isRepresentableAsNormalIn(SrcSem, DstSem)) {
>From 0e0b1fb40e1f55e04074a0d49965b1921e78998b Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Fri, 23 May 2025 16:35:57 +0000
Subject: [PATCH 5/5] update tests
---
.../GlobalISel/clamp-fmed3-const-combine.ll | 25 +-
.../GlobalISel/clamp-minmax-const-combine.ll | 40 +-
.../GlobalISel/fmed3-min-max-const-combine.ll | 28 +-
.../AMDGPU/GlobalISel/legalize-fmaxnum.mir | 48 +-
.../AMDGPU/GlobalISel/legalize-fminnum.mir | 48 +-
.../GlobalISel/legalize-vector-args-gfx7.mir | 20 +-
.../regbankcombiner-clamp-fmed3-const.mir | 40 +-
.../regbankcombiner-clamp-minmax-const.mir | 18 +-
.../regbankcombiner-fmed3-minmax-const.mir | 10 +-
llvm/test/CodeGen/AMDGPU/fmed3.ll | 2926 ++++++++++++-----
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 216 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 635 ++--
llvm/test/CodeGen/AMDGPU/mad-mix.ll | 107 +-
llvm/test/CodeGen/AMDGPU/minmax.ll | 140 +-
14 files changed, 3039 insertions(+), 1262 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
index c7676e9da6f49..0ca26b1b7d0df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
@@ -74,7 +74,8 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp
; GFX10-LABEL: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX10-NEXT: v_med3_f32 v0, v0, 1.0, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true:
@@ -84,7 +85,9 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_med3_num_f32 v0, v0, 1.0, 0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0)
@@ -97,7 +100,8 @@ define float @test_fmed3_global_nnan(float %a) #3 {
; GFX10-LABEL: test_fmed3_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmed3_global_nnan:
@@ -107,7 +111,9 @@ define float @test_fmed3_global_nnan(float %a) #3 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_med3_num_f32 v0, v0, 0, 1.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
@@ -134,7 +140,9 @@ define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_med3_num_f32 v0, v0, 1.0, 0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0)
@@ -172,7 +180,8 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2
; GFX10-LABEL: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true:
@@ -182,7 +191,9 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_med3_num_f32 v0, v0, 0, 1.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index e2e1c1147eeee..70276bd670715 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -51,7 +51,8 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 {
; GFX10-LABEL: test_min_K1max_ValK0_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT: v_mul_f16_e32 v0, 2.0, v0
+; GFX10-NEXT: v_med3_f16 v0, v0, 0, 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_min_K1max_ValK0_f16:
@@ -61,7 +62,9 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: v_mul_f16_e32 v0, 2.0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_med3_num_f16 v0, v0, 0, 1.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul half %a, 2.0
%maxnum = call half @llvm.maxnum.f16(half %fmul, half 0.0)
@@ -95,7 +98,9 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 {
; GFX10-LABEL: test_min_max_splat_padded_with_undef:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_max_f16 v0, v0, 0
+; GFX10-NEXT: v_pk_min_f16 v0, v0, 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_min_max_splat_padded_with_undef:
@@ -105,7 +110,10 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
+; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, 0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, 1.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
%maxnum = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 0.0, half poison>, <2 x half> %fmul)
@@ -231,7 +239,9 @@ define float @test_max_min_global_nnan(float %a) #3 {
; GFX10-LABEL: test_max_min_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e64 v0, v0, v0 clamp
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GFX10-NEXT: v_max_f32_e32 v0, 0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_max_min_global_nnan:
@@ -241,7 +251,9 @@ define float @test_max_min_global_nnan(float %a) #3 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minmax_num_f32 v0, v0, 1.0, 0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 1.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0)
@@ -305,9 +317,7 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_false:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0
-; GFX10-NEXT: v_max_f32_e32 v0, 0, v0
-; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_false:
@@ -317,7 +327,9 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_med3_num_f32 v0, v0, 0, 1.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -341,7 +353,9 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_med3_num_f32 v0, v0, 0, 1.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -381,9 +395,7 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_false:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0
-; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0
-; GFX10-NEXT: v_max_f32_e32 v0, 0, v0
+; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_false:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index 75c4cd53e3bfc..97c86b9582784 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -236,12 +236,14 @@ define float @test_min_max_global_nnan(float %a) #2 {
; GFX10-LABEL: test_min_max_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_min_max_global_nnan:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -252,6 +254,8 @@ define float @test_min_max_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
@@ -263,13 +267,17 @@ define float @test_max_min_global_nnan(float %a) #2 {
; GFX10-LABEL: test_max_min_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_max_min_global_nnan:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_max_min_global_nnan:
@@ -279,7 +287,9 @@ define float @test_max_min_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -456,15 +466,13 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_false:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0
-; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_min_max_maybe_NaN_input_ieee_false:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0
-; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_false:
@@ -489,15 +497,13 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_false:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0
+; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_max_min_maybe_NaN_input_ieee_false:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0
+; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_false:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir
index d977049de26f4..eb1f0096c113a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir
@@ -291,7 +291,9 @@ body: |
; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32)
; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
@@ -411,11 +413,15 @@ body: |
; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32)
; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32)
; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16)
; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
@@ -493,15 +499,21 @@ body: |
; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32)
; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32)
; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT4]], [[FPEXT5]]
+ ; SI-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]]
+ ; SI-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]]
+ ; SI-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]]
; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE2]](s32)
; SI-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
; SI-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
@@ -661,19 +673,27 @@ body: |
; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32)
; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32)
; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT4]], [[FPEXT5]]
+ ; SI-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]]
+ ; SI-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]]
+ ; SI-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]]
; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE2]](s32)
; SI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
; SI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT6]], [[FPEXT7]]
+ ; SI-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT6]]
+ ; SI-NEXT: [[FCANONICALIZE7:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT7]]
+ ; SI-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]]
; SI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE3]](s32)
; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16)
; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
@@ -1040,11 +1060,15 @@ body: |
; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32)
; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
- ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32)
; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16)
; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir
index 32c353d2c579c..4f99e6f8ea6a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir
@@ -291,7 +291,9 @@ body: |
; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32)
; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
@@ -411,11 +413,15 @@ body: |
; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32)
; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE1]](s32)
; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16)
; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
@@ -493,15 +499,21 @@ body: |
; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32)
; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE1]](s32)
; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT4]], [[FPEXT5]]
+ ; SI-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]]
+ ; SI-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]]
+ ; SI-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]]
; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE2]](s32)
; SI-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
; SI-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
@@ -661,19 +673,27 @@ body: |
; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32)
; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE1]](s32)
; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT4]], [[FPEXT5]]
+ ; SI-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]]
+ ; SI-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]]
+ ; SI-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]]
; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE2]](s32)
; SI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
; SI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT6]], [[FPEXT7]]
+ ; SI-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT6]]
+ ; SI-NEXT: [[FCANONICALIZE7:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT7]]
+ ; SI-NEXT: [[FMINNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]]
; SI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE3]](s32)
; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16)
; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
@@ -1040,11 +1060,15 @@ body: |
; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32)
; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
- ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE1]](s32)
; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16)
; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx7.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx7.mir
index 4328d47969a1e..29266b42227e1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx7.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx7.mir
@@ -290,23 +290,33 @@ body: |
; GFX7-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX7-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX7-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; GFX7-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]]
+ ; GFX7-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]]
+ ; GFX7-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]]
+ ; GFX7-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
; GFX7-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32)
; GFX7-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX7-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
- ; GFX7-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]]
+ ; GFX7-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]]
+ ; GFX7-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]]
+ ; GFX7-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]]
; GFX7-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32)
; GFX7-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX7-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
- ; GFX7-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT4]], [[FPEXT5]]
+ ; GFX7-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]]
+ ; GFX7-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]]
+ ; GFX7-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]]
; GFX7-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE2]](s32)
; GFX7-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
; GFX7-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC8]](s16)
- ; GFX7-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT6]], [[FPEXT7]]
+ ; GFX7-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT6]]
+ ; GFX7-NEXT: [[FCANONICALIZE7:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT7]]
+ ; GFX7-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]]
; GFX7-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE3]](s32)
; GFX7-NEXT: [[FPEXT8:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
; GFX7-NEXT: [[FPEXT9:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC9]](s16)
- ; GFX7-NEXT: [[FMAXNUM_IEEE4:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT8]], [[FPEXT9]]
+ ; GFX7-NEXT: [[FCANONICALIZE8:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT8]]
+ ; GFX7-NEXT: [[FCANONICALIZE9:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT9]]
+ ; GFX7-NEXT: [[FMAXNUM_IEEE4:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE8]], [[FCANONICALIZE9]]
; GFX7-NEXT: [[FPTRUNC4:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE4]](s32)
; GFX7-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; GFX7-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC1]](s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
index a97d905f2a978..129cbcfca6fa5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
@@ -162,8 +162,12 @@ body: |
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
;
; GFX12-LABEL: name: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true
; GFX12: liveins: $vgpr0
@@ -172,8 +176,12 @@ body: |
; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GFX12-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; GFX12-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
- ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; GFX12-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
%8:vgpr(s32) = COPY %2(s32)
@@ -222,8 +230,12 @@ body: |
; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GFX12-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; GFX12-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
- ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; GFX12-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
%8:vgpr(s32) = COPY %2(s32)
@@ -307,8 +319,12 @@ body: |
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
;
; GFX12-LABEL: name: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true
; GFX12: liveins: $vgpr0
@@ -317,8 +333,12 @@ body: |
; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GFX12-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; GFX12-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
- ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; GFX12-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
%8:vgpr(s32) = COPY %2(s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir
index 70fd67363648d..7e5555b68daad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir
@@ -441,13 +441,8 @@ body: |
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[FMUL]], [[COPY2]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
- ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[FMAXNUM]], [[COPY3]]
- ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM]](s32)
+ ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
%9:vgpr(s32) = COPY %2(s32)
@@ -564,13 +559,8 @@ body: |
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[FMUL]], [[COPY2]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
- ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[FMINNUM]], [[COPY3]]
- ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM]](s32)
+ ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
%9:vgpr(s32) = COPY %2(s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir
index 2f41d86100040..f329d126e66db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir
@@ -469,11 +469,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[COPY]], [[COPY1]]
; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[FMAXNUM]], [[COPY2]]
- ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
%7:vgpr(s32) = COPY %2(s32)
@@ -502,11 +501,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[COPY]], [[COPY1]]
; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[FMINNUM]], [[COPY2]]
- ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 4.000000e+00
%7:vgpr(s32) = COPY %2(s32)
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index f9a1472b4596f..60aabda10533d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -886,6 +886,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -923,33 +924,60 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_med3_f32 v2, v3, 2.0, 4.0
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v3
+; VI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1158,7 +1186,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -1205,20 +1238,25 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
-; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -1249,7 +1287,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -1284,8 +1327,11 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1347,8 +1393,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -1395,20 +1446,25 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -1438,8 +1494,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
-; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -1473,9 +1534,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1537,8 +1601,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v4
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -1597,8 +1666,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -1628,8 +1702,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3
-; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -1663,9 +1742,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1728,8 +1810,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e64 v3, 1.0, |v3|
; SI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v4|
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, |v3|, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -1789,8 +1875,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e64 v2, 1.0, |v2|
; VI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
-; VI-GISEL-NEXT: v_med3_f32 v2, v4, |v2|, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -1821,8 +1911,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX9-GISEL-NEXT: v_max_f32_e64 v2, |v2|, |v2|
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
-; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -1857,9 +1951,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX11-GISEL-NEXT: v_max_f32_e64 v2, |v2|, |v2|
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1930,7 +2027,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; SI-GISEL-NEXT: v_mul_f32_e64 v2, -1.0, |v2|
; SI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
; SI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v4|
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -1992,7 +2092,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; VI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v7|
; VI-GISEL-NEXT: v_mul_f32_e64 v2, -1.0, |v2|
; VI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
-; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -2025,7 +2128,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
-; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -2062,8 +2168,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2776,7 +2884,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -2823,70 +2937,118 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
- %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
- %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
- %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
- %a = load volatile float, ptr addrspace(1) %gep0
- %b = load volatile float, ptr addrspace(1) %gep1
- %c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
- store float %med3, ptr addrspace(1) %outgep
- ret void
-}
-
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+ %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+ %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+ %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+ %a = load volatile float, ptr addrspace(1) %gep0
+ %b = load volatile float, ptr addrspace(1) %gep1
+ %c = load volatile float, ptr addrspace(1) %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, ptr addrspace(1) %outgep
+ ret void
+}
+
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2928,7 +3090,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -2975,53 +3143,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_min_f32 v4, v1, v2 :: v_dual_max_f32 v1, v2, v1
+; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -3081,7 +3297,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -3128,20 +3349,25 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
-; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -3172,7 +3398,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v4, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -3207,8 +3438,11 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, v2, v4
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3270,7 +3504,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -3317,54 +3557,102 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
%gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
@@ -3422,7 +3710,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -3469,53 +3763,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_min_f32 v4, v1, v2 :: v_dual_max_f32 v1, v2, v1
+; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -3574,7 +3916,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -3621,53 +3969,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -3726,10 +4122,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
-; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
-; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-GISEL-NEXT: s_endpgm
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
; VI-SDAG: ; %bb.0:
@@ -3773,53 +4175,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -3878,7 +4328,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -3925,53 +4381,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_min_f32 v4, v2, v1 :: v_dual_max_f32 v1, v1, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -4030,7 +4534,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -4077,53 +4587,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -4182,7 +4740,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -4229,53 +4793,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v1, v2, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -4334,7 +4946,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -4381,53 +4999,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -4486,9 +5152,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
-; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
-; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
+; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
@@ -4533,53 +5205,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v1, v2, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -4638,7 +5358,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -4685,53 +5411,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -4790,7 +5564,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -4837,53 +5617,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -4942,7 +5770,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -4989,53 +5823,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -5094,7 +5976,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -5141,53 +6029,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v1, v2, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -5246,7 +6182,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -5293,53 +6235,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -5401,7 +6391,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -5448,53 +6444,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, v2, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -6400,7 +7444,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3
; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -6465,47 +7512,92 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3
-; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
-; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
-; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
+; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_add_f32 v3, 4.0, v3 :: v_dual_min_f32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -6575,7 +7667,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3
; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -6640,47 +7735,92 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3
-; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
-; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
-; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
+; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_add_f32 v3, 4.0, v3 :: v_dual_min_f32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -6750,7 +7890,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3
; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4
-; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -6815,47 +7958,92 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3
-; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
-; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
-; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
+; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_add_f32 v3, 4.0, v3 :: v_dual_min_f32 v4, v1, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
@@ -7112,9 +8300,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_mul_f32_e32 v5, -1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-GISEL-NEXT: v_min_f32_e32 v5, v5, v3
; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -7178,10 +8369,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
-; VI-GISEL-NEXT: v_max_f32_e32 v5, v7, v2
-; VI-GISEL-NEXT: v_min_f32_e32 v2, v4, v2
-; VI-GISEL-NEXT: v_min_f32_e32 v3, v5, v3
-; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; VI-GISEL-NEXT: v_min_f32_e32 v4, v4, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -7215,10 +8409,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v4, v2
; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v4, v2
-; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -7255,10 +8452,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1
-; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -7322,8 +8521,11 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-GISEL-NEXT: s_endpgm
@@ -7371,55 +8573,97 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
-; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
+; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6
+; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_max_f32_e32 v2, v7, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3
; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-LABEL: v_test_global_nnans_min_max_f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX9-NEXT: s_endpgm
+; GFX9-SDAG-LABEL: v_test_global_nnans_min_max_f32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_global_nnans_min_max_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX9-GISEL-LABEL: v_test_global_nnans_min_max_f32:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_global_nnans_min_max_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, v2, v3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_global_nnans_min_max_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
%gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index cbd824e171976..850aeb60335e8 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -350,22 +350,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
+; SDAG-GFX9: ; %bb.0:
+; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
-; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-NEXT: s_setpc_b64 s[30:31]
+; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
+; SDAG-VI: ; %bb.0:
+; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
+; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
; SDAG-CI: ; %bb.0:
@@ -378,19 +378,41 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_med3_f32 v0, v0, 0, 1.0
; GISEL-GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
+; GISEL-GFX9: ; %bb.0:
+; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX9-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
+; GISEL-VI: ; %bb.0:
+; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-VI-NEXT: v_med3_f32 v0, v2, 0, 1.0
+; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
+; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-CI-NEXT: v_med3_f32 v0, v2, 0, 1.0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -405,27 +427,27 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
+; SDAG-GFX9: ; %bb.0:
+; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; VI-NEXT: v_mac_f32_e32 v2, v0, v1
-; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-NEXT: s_setpc_b64 s[30:31]
+; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
+; SDAG-VI: ; %bb.0:
+; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
; SDAG-CI: ; %bb.0:
@@ -435,6 +457,36 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_med3_f16 v0, v0, 0, 1.0
+; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
+; GISEL-GFX9: ; %bb.0:
+; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX9-NEXT: v_med3_f16 v0, v0, 0, 1.0
+; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
+; GISEL-VI: ; %bb.0:
+; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; GISEL-VI-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,36 +516,36 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
+; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX9-NEXT: global_store_short v[0:1], v3, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX9: ; %bb.0:
+; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX9-NEXT: global_store_short v[0:1], v3, off
+; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0)
+; SDAG-GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; VI-NEXT: v_mac_f32_e32 v2, v0, v1
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; VI-NEXT: flat_store_short v[0:1], v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_setpc_b64 s[30:31]
+; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-VI: ; %bb.0:
+; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; SDAG-VI-NEXT: flat_store_short v[0:1], v0
+; SDAG-VI-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; SDAG-CI: ; %bb.0:
@@ -507,6 +559,42 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; SDAG-CI-NEXT: s_waitcnt vmcnt(0)
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_med3_f16 v0, v1, 0, 1.0
+; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
+; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GISEL-GFX9: ; %bb.0:
+; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX9-NEXT: global_store_short v[0:1], v0, off
+; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0)
+; GISEL-GFX9-NEXT: v_med3_f16 v0, v0, 0, 1.0
+; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GISEL-VI: ; %bb.0:
+; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GISEL-VI-NEXT: flat_store_short v[0:1], v0
+; GISEL-VI-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GISEL-VI-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 32e0d393a1001..a9d07877b3887 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -271,32 +271,38 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
-; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; VI-NEXT: v_mac_f32_e32 v2, v0, v1
-; VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp
-; VI-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; SDAG-VI: ; %bb.0:
+; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp
+; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; SDAG-CI: ; %bb.0:
@@ -306,6 +312,39 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_med3_f16 v0, v0, 0, 1.0
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT: v_med3_f16 v0, v0, 0, 1.0
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT: v_med3_f16 v0, v0, 0, 1.0
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GISEL-VI: ; %bb.0:
+; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; GISEL-VI-NEXT: v_min_f16_e32 v0, 1.0, v0
+; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -348,28 +387,28 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src
; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; SDAG-VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
+; SDAG-VI: ; %bb.0:
+; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
; SDAG-CI: ; %bb.0:
@@ -382,17 +421,45 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src
; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0
; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
+; GISEL-VI: ; %bb.0:
+; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-VI-NEXT: v_med3_f32 v0, v2, 0, 1.0
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
+; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-CI-NEXT: v_med3_f32 v0, v2, 0, 1.0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -914,30 +981,39 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; FIXME (DAG): Fold clamp
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_mov_b32_e32 v0, v3
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_mov_b32_e32 v0, v3
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
@@ -978,6 +1054,35 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_pk_max_f16 v0, v3, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_pk_max_f16 v0, v3, 0
+; GISEL-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_pk_max_f16 v0, v3, 0
+; GISEL-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -989,8 +1094,13 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
-; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v5 clamp
-; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v5
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; GISEL-VI-NEXT: v_max_f16_e32 v1, 0, v1
+; GISEL-VI-NEXT: v_min_f16_e32 v0, 1.0, v0
+; GISEL-VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1147,33 +1257,36 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0
+; GISEL-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_pk_max_f16 v0, v6, 0
; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GISEL-GFX906: ; %bb.0:
; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_pk_max_f16 v0, v6, 0
; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
@@ -1190,11 +1303,18 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
-; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v8 clamp
-; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v2, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v8
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3
-; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp
-; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v5
+; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GISEL-VI-NEXT: v_max_f16_e32 v1, 0, v1
+; GISEL-VI-NEXT: v_max_f16_e32 v2, 0, v2
+; GISEL-VI-NEXT: v_min_f16_e32 v0, 1.0, v0
+; GISEL-VI-NEXT: v_min_f16_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-VI-NEXT: v_min_f16_e32 v1, 1.0, v2
+; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v3
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
@@ -1247,39 +1367,51 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
}
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mov_b32_e32 v0, v6
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_mov_b32_e32 v0, v6
-; GFX906-NEXT: v_mov_b32_e32 v1, v2
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v6
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v6
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
@@ -1358,6 +1490,48 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0
+; GISEL-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_pk_max_f16 v0, v6, 0
+; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v7, 0
+; GISEL-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_pk_max_f16 v0, v6, 0
+; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v7, 0
+; GISEL-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1377,10 +1551,19 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3
-; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v10 clamp
-; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v2, v11 clamp
-; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v3, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v10
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v11
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v5
+; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; GISEL-VI-NEXT: v_max_f16_e32 v1, 0, v1
+; GISEL-VI-NEXT: v_max_f16_e32 v2, 0, v2
+; GISEL-VI-NEXT: v_max_f16_e32 v3, 0, v3
+; GISEL-VI-NEXT: v_mov_b32_e32 v4, 0x3c00
+; GISEL-VI-NEXT: v_min_f16_e32 v0, 1.0, v0
+; GISEL-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-VI-NEXT: v_min_f16_e32 v2, 1.0, v2
+; GISEL-VI-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v3
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
@@ -1528,7 +1711,7 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, v3
-; GISEL-GFX1100-NEXT: v_max_f16_e64 v3, v3, v3 clamp
+; GISEL-GFX1100-NEXT: v_med3_f16 v3, v3, 0, 1.0
; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v3
@@ -1539,20 +1722,22 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GISEL-GFX900-NEXT: v_max_f16_e64 v4, v3, v3 clamp
+; GISEL-GFX900-NEXT: v_med3_f16 v4, v3, 0, 1.0
; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, 0xffff0000
-; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v0, v4
+; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff0000
+; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0
; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
; GISEL-GFX906: ; %bb.0:
; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GISEL-GFX906-NEXT: v_max_f16_e64 v4, v3, v3 clamp
+; GISEL-GFX906-NEXT: v_med3_f16 v4, v3, 0, 1.0
; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, 0xffff0000
-; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v0, v4
+; GISEL-GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff0000
+; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
@@ -1690,13 +1875,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GISEL-GFX1100-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GISEL-GFX1100-NEXT: v_med3_f16 v3, v3, 0, 1.0
; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GISEL-GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-GFX1100-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
@@ -1704,9 +1891,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GISEL-GFX900-NEXT: v_med3_f16 v0, v4, 0, 1.0
+; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff
; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0
@@ -1716,9 +1904,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-GFX906: ; %bb.0:
; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GISEL-GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GISEL-GFX906-NEXT: v_med3_f16 v0, v4, 0, 1.0
+; GISEL-GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff
; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0
@@ -1872,10 +2061,13 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v3
+; GISEL-GFX1100-NEXT: v_med3_f32 v1, v3, 0, 1.0
+; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v1, v0
@@ -1884,9 +2076,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v3
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_med3_f32 v1, v3, 0, 1.0
+; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -1894,9 +2088,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
; GISEL-GFX906: ; %bb.0:
; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v3
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_med3_f32 v1, v3, 0, 1.0
+; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v1, v0
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
@@ -1910,11 +2106,13 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v2
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GISEL-VI-NEXT: v_mad_f32 v3, v3, v4, v5 clamp
-; GISEL-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
-; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v3
-; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GISEL-VI-NEXT: v_or_b32_e32 v0, v1, v0
+; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4
+; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-VI-NEXT: v_med3_f32 v0, v5, 0, 1.0
+; GISEL-VI-NEXT: v_med3_f32 v1, v2, 0, 1.0
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt:
@@ -1926,8 +2124,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp
-; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp
+; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
+; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
+; GISEL-CI-NEXT: v_med3_f32 v0, v4, 0, 1.0
+; GISEL-CI-NEXT: v_med3_f32 v1, v5, 0, 1.0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
@@ -2052,25 +2252,33 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GISEL-GFX1100-NEXT: v_med3_f32 v2, v6, 0, 1.0
+; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GISEL-GFX1100-NEXT: v_med3_f32 v1, v1, 0, 1.0
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6
; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v2, v0
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_med3_f32 v2, v6, 0, 1.0
+; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -2078,11 +2286,14 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
; GISEL-GFX906: ; %bb.0:
; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_med3_f32 v2, v6, 0, 1.0
+; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v2, v0
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
@@ -2099,13 +2310,16 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GISEL-VI-NEXT: v_mad_f32 v6, v6, v7, v8 clamp
-; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp
-; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v6
-; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp
+; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7
+; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
+; GISEL-VI-NEXT: v_med3_f32 v0, v8, 0, 1.0
+; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3
+; GISEL-VI-NEXT: v_med3_f32 v1, v4, 0, 1.0
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-VI-NEXT: v_med3_f32 v1, v5, 0, 1.0
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-VI-NEXT: v_or_b32_e32 v0, v2, v0
+; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_precvt:
@@ -2120,9 +2334,12 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GISEL-CI-NEXT: v_mad_f32 v0, v0, v3, v6 clamp
-; GISEL-CI-NEXT: v_mad_f32 v1, v1, v4, v7 clamp
-; GISEL-CI-NEXT: v_mad_f32 v2, v2, v5, v8 clamp
+; GISEL-CI-NEXT: v_mac_f32_e32 v6, v0, v3
+; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4
+; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5
+; GISEL-CI-NEXT: v_med3_f32 v0, v6, 0, 1.0
+; GISEL-CI-NEXT: v_med3_f32 v1, v7, 0, 1.0
+; GISEL-CI-NEXT: v_med3_f32 v2, v8, 0, 1.0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -2275,12 +2492,18 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GISEL-GFX1100-NEXT: v_med3_f32 v3, v6, 0, 1.0
+; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GISEL-GFX1100-NEXT: v_med3_f32 v2, v2, 0, 1.0
+; GISEL-GFX1100-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v6
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -2293,11 +2516,15 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v3, v6
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_med3_f32 v3, v6, 0, 1.0
+; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX900-NEXT: v_med3_f32 v2, v2, 0, 1.0
+; GISEL-GFX900-NEXT: v_med3_f32 v1, v1, 0, 1.0
+; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -2308,11 +2535,15 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt:
; GISEL-GFX906: ; %bb.0:
; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v3, v6
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_med3_f32 v3, v6, 0, 1.0
+; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX906-NEXT: v_med3_f32 v2, v2, 0, 1.0
+; GISEL-GFX906-NEXT: v_med3_f32 v1, v1, 0, 1.0
+; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -2335,16 +2566,20 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v11, v5
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GISEL-VI-NEXT: v_mad_f32 v6, v6, v8, v10 clamp
-; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp
-; GISEL-VI-NEXT: v_mad_f32 v2, v7, v9, v11 clamp
-; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp
-; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v6
-; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-VI-NEXT: v_mac_f32_e32 v10, v6, v8
+; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
+; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9
+; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3
+; GISEL-VI-NEXT: v_med3_f32 v0, v10, 0, 1.0
+; GISEL-VI-NEXT: v_med3_f32 v1, v4, 0, 1.0
+; GISEL-VI-NEXT: v_med3_f32 v2, v11, 0, 1.0
+; GISEL-VI-NEXT: v_med3_f32 v3, v5, 0, 1.0
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GISEL-VI-NEXT: v_or_b32_e32 v0, v3, v0
-; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v1
+; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v3
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt:
@@ -2362,10 +2597,14 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GISEL-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp
-; GISEL-CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp
-; GISEL-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp
-; GISEL-CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp
+; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4
+; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5
+; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6
+; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7
+; GISEL-CI-NEXT: v_med3_f32 v0, v8, 0, 1.0
+; GISEL-CI-NEXT: v_med3_f32 v1, v9, 0, 1.0
+; GISEL-CI-NEXT: v_med3_f32 v2, v10, 0, 1.0
+; GISEL-CI-NEXT: v_med3_f32 v3, v11, 0, 1.0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index e2170fa406da4..53db04e21af6e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -1599,41 +1599,41 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
}
define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
-; GFX9GEN: ; %bb.0:
-; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
-; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
-; VI-NEXT: s_setpc_b64 s[30:31]
+; SDAG-VI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-VI: ; %bb.0:
+; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
+; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
; SDAG-CI: ; %bb.0:
@@ -1641,13 +1641,56 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h
; SDAG-CI-NEXT: v_mad_f32 v0, v1, v3, v5 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-GFX9GEN-NEXT: v_med3_f32 v0, v2, 0, 1.0
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-VI: ; %bb.0:
+; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-VI-NEXT: v_med3_f32 v0, v2, 0, 1.0
+; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
+; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
+; GISEL-CI-NEXT: v_med3_f32 v0, v2, 0, 1.0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.hi = extractelement <2 x half> %src0, i32 1
%src1.hi = extractelement <2 x half> %src1, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index bdd8935d0df5e..2958ca7122cb7 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -445,23 +445,47 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
}
define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) #0 {
-; GFX11-LABEL: test_med3_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4
-; GFX11-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_med3_f32:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_med3_f32 v2, v2, v3, v4
+; SDAG-GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_med3_f32:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4
-; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_med3_f32:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
+; GISEL-GFX11-NEXT: v_min_f32_e32 v5, v2, v3
+; GISEL-GFX11-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4
+; GISEL-GFX11-NEXT: v_minmax_f32 v2, v2, v3, v5
+; GISEL-GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_med3_f32:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4
+; SDAG-GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_med3_f32:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
+; GISEL-GFX12-NEXT: v_min_num_f32_e32 v5, v2, v3
+; GISEL-GFX12-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v3, v4, v4
+; GISEL-GFX12-NEXT: v_minmax_num_f32 v2, v2, v3, v5
+; GISEL-GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call float @llvm.minnum.f32(float %x, float %y)
%tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
@@ -471,23 +495,47 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
}
define void @test_med3_minimumnum_maximumnum_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) #0 {
-; GFX11-LABEL: test_med3_minimumnum_maximumnum_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4
-; GFX11-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_med3_minimumnum_maximumnum_f32:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_med3_f32 v2, v2, v3, v4
+; SDAG-GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_med3_minimumnum_maximumnum_f32:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4
-; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_med3_minimumnum_maximumnum_f32:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
+; GISEL-GFX11-NEXT: v_min_f32_e32 v5, v2, v3
+; GISEL-GFX11-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4
+; GISEL-GFX11-NEXT: v_minmax_f32 v2, v2, v3, v5
+; GISEL-GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_med3_minimumnum_maximumnum_f32:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4
+; SDAG-GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_med3_minimumnum_maximumnum_f32:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
+; GISEL-GFX12-NEXT: v_min_num_f32_e32 v5, v2, v3
+; GISEL-GFX12-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v3, v4, v4
+; GISEL-GFX12-NEXT: v_minmax_num_f32 v2, v2, v3, v5
+; GISEL-GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call float @llvm.minimumnum.f32(float %x, float %y)
%tmp1 = call float @llvm.maximumnum.f32(float %x, float %y)
%tmp2 = call float @llvm.minimumnum.f32(float %tmp1, float %z)
@@ -875,14 +923,24 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; GISEL-GFX11-TRUE16-LABEL: test_med3_f16:
; GISEL-GFX11-TRUE16: ; %bb.0:
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v3.l, v4.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
+; GISEL-GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.l, v2.h
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.h
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GISEL-GFX11-TRUE16-NEXT: v_minmax_f16 v2.l, v2.l, v2.h, v3.l
; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX11-FAKE16-LABEL: test_med3_f16:
; GISEL-GFX11-FAKE16: ; %bb.0:
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT: v_med3_f16 v2, v2, v3, v4
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GISEL-GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v2, v3
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v3
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4
+; GISEL-GFX11-FAKE16-NEXT: v_minmax_f16 v2, v2, v3, v5
; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -915,7 +973,12 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GISEL-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
+; GISEL-GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.l, v2.h
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.h
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v2.l, v2.l, v2.h, v3.l
; GISEL-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -926,7 +989,12 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GISEL-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GISEL-GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v2, v3
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v3
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v2, v2, v3, v5
; GISEL-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call half @llvm.minnum.f16(half %x, half %y)
More information about the llvm-commits
mailing list