[llvm] [X86] Remove LowerFCanonicalize and use generic expansion (PR #147877)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 11 01:19:45 PDT 2025
https://github.com/woruyu updated https://github.com/llvm/llvm-project/pull/147877
>From 24a7803b9cf46ac36b8818d304b2bfd9a1ac850f Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Thu, 10 Jul 2025 11:37:58 +0800
Subject: [PATCH 1/5] [X86] Remove LowerFCanonicalize and use generic expansion
---
llvm/include/llvm/CodeGen/TargetLowering.h | 14 +++++
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 25 +--------
.../SelectionDAG/LegalizeVectorOps.cpp | 9 ++++
.../CodeGen/SelectionDAG/TargetLowering.cpp | 20 +++++++
llvm/lib/Target/X86/X86ISelLowering.cpp | 53 +++++++------------
llvm/lib/Target/X86/X86ISelLowering.h | 2 +
6 files changed, 65 insertions(+), 58 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a248eb7444b20..a5d8d58038986 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5019,6 +5019,10 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
return DL.isLittleEndian();
}
+ virtual bool shouldExpandVectorFCANONICALIZEInVectorLegalizer() const {
+ return false;
+ }
+
/// Returns a 0 terminated array of registers that can be safely used as
/// scratch registers.
virtual const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const {
@@ -5681,6 +5685,16 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// only the first Count elements of the vector are used.
SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const;
+ /// This implements llvm.canonicalize.f* by multiplication with 1.0, as
+ /// suggested in
+ /// https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
+ /// It uses strict_fp operations even outside a strict_fp context in order
+ /// to guarantee that the canonicalization is not optimized away by later
+ /// passes. The result chain introduced by that is intentionally ignored
+ /// since no ordering requirement is intended here.
+ SDValue expandFCanonicalizeWithStrictFmul(SDNode *Node, SDLoc DL,
+ SelectionDAG &DAG) const;
+
/// Expand a VECREDUCE_SEQ_* into an explicit ordered calculation.
SDValue expandVecReduceSeq(SDNode *Node, SelectionDAG &DAG) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 528136a55f14a..9877015c96269 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3356,29 +3356,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
}
case ISD::FCANONICALIZE: {
- // This implements llvm.canonicalize.f* by multiplication with 1.0, as
- // suggested in
- // https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
- // It uses strict_fp operations even outside a strict_fp context in order
- // to guarantee that the canonicalization is not optimized away by later
- // passes. The result chain introduced by that is intentionally ignored
- // since no ordering requirement is intended here.
-
- // Create strict multiplication by 1.0.
- SDValue Operand = Node->getOperand(0);
- EVT VT = Operand.getValueType();
- SDValue One = DAG.getConstantFP(1.0, dl, VT);
- SDValue Chain = DAG.getEntryNode();
- SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
- {Chain, Operand, One});
-
- // Propagate existing flags on canonicalize, and additionally set
- // NoFPExcept.
- SDNodeFlags CanonicalizeFlags = Node->getFlags();
- CanonicalizeFlags.setNoFPExcept(true);
- Mul->setFlags(CanonicalizeFlags);
-
- Results.push_back(Mul);
+ SDValue Result = TLI.expandFCanonicalizeWithStrictFmul(Node, dl, DAG);
+ Results.push_back(Result);
break;
}
case ISD::SIGN_EXTEND_INREG: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index f908a66128ec8..fccd1cb61b7d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1309,6 +1309,15 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
}
break;
+ case ISD::FCANONICALIZE: {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.shouldExpandVectorFCANONICALIZEInVectorLegalizer()) {
+ SDLoc dl(Node);
+ SDValue Result = TLI.expandFCanonicalizeWithStrictFmul(Node, dl, DAG);
+ Results.push_back(Result);
+ return;
+ }
+ }
}
SDValue Unrolled = DAG.UnrollVectorOp(Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e0597988e8907..028e4c305c97a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11581,6 +11581,26 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
return Res;
}
+SDValue
+TargetLowering::expandFCanonicalizeWithStrictFmul(SDNode *Node, SDLoc DL,
+ SelectionDAG &DAG) const {
+ // Create strict multiplication by 1.0.
+ SDValue Operand = Node->getOperand(0);
+ EVT VT = Operand.getValueType();
+ SDValue One = DAG.getConstantFP(1.0, DL, VT);
+ SDValue Chain = DAG.getEntryNode();
+
+ SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, DL, {VT, MVT::Other},
+ {Chain, Operand, One});
+
+ // Propagate existing flags on canonicalize, and additionally set NoFPExcept.
+ SDNodeFlags Flags = Node->getFlags();
+ Flags.setNoFPExcept(true);
+ Mul->setFlags(Flags);
+
+ return Mul;
+}
+
SDValue TargetLowering::expandVecReduceSeq(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
SDValue AccOp = Node->getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5e35d5630d667..e2abe98b19f81 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -316,8 +316,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
}
- setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
@@ -348,9 +346,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!Subtarget.hasSSE2()) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
- setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
- setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i32, Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
@@ -716,7 +712,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
setOperationAction(ISD::LRINT, MVT::f16, Expand);
@@ -871,7 +866,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
- setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom);
+ setOperationAction(ISD::FCANONICALIZE, MVT::f80, Expand);
if (isTypeLegal(MVT::f16)) {
setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
@@ -934,7 +929,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (isTypeLegal(MVT::f80)) {
setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom);
+ setOperationAction(ISD::FCANONICALIZE, MVT::f80, Expand);
}
setOperationAction(ISD::SETCC, MVT::f128, Custom);
@@ -1070,11 +1065,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Expand);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Expand);
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
@@ -1137,7 +1132,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Expand);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
@@ -1473,7 +1468,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
- setOperationAction(ISD::FCANONICALIZE, VT, Custom);
+ setOperationAction(ISD::FCANONICALIZE, VT, Expand);
}
setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
@@ -1741,9 +1736,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Expand);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Expand);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
@@ -1825,7 +1820,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
- setOperationAction(ISD::FCANONICALIZE, VT, Custom);
+ setOperationAction(ISD::FCANONICALIZE, VT, Expand);
}
setOperationAction(ISD::LRINT, MVT::v16f32,
Subtarget.hasDQI() ? Legal : Custom);
@@ -3318,6 +3313,13 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
+// X86 prefers to defer vector FCANONICALIZE to DAG legalization
+// to avoid scalarization during vector legalization.
+bool X86TargetLowering::shouldExpandVectorFCANONICALIZEInVectorLegalizer()
+ const {
+ return true;
+}
+
bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
// If we are using XMM registers in the ABI and the condition of the select is
// a floating-point compare and we have blendv or conditional move, then it is
@@ -33436,24 +33438,6 @@ static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
return Op;
}
-static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
- SDNode *N = Op.getNode();
- SDValue Operand = N->getOperand(0);
- EVT VT = Operand.getValueType();
- SDLoc dl(N);
-
- SDValue One = DAG.getConstantFP(1.0, dl, VT);
-
- // TODO: Fix Crash for bf16 when generating strict_fmul as it
- // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
- // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
- // promote this operator's result!
- SDValue Chain = DAG.getEntryNode();
- SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
- {Chain, Operand, One});
- return StrictFmul;
-}
-
static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
unsigned OpNo) {
const APInt Operand(32, OpNo);
@@ -33593,7 +33577,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
- case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
case ISD::STRICT_SINT_TO_FP:
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::STRICT_UINT_TO_FP:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6bcb7a36e91b5..05c109511561c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1524,6 +1524,8 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool shouldExpandVectorFCANONICALIZEInVectorLegalizer() const override;
+
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
bool convertSelectOfConstantsToMath(EVT VT) const override;
>From b2c46c78a3276abffa3a4c674fc417539efd2b0e Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Thu, 10 Jul 2025 15:59:00 +0800
Subject: [PATCH 2/5] fix: remove target hook
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 ----
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 10 ++++------
llvm/lib/Target/X86/X86ISelLowering.cpp | 7 -------
llvm/lib/Target/X86/X86ISelLowering.h | 2 --
4 files changed, 4 insertions(+), 19 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a5d8d58038986..e455e31e54d28 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5019,10 +5019,6 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
return DL.isLittleEndian();
}
- virtual bool shouldExpandVectorFCANONICALIZEInVectorLegalizer() const {
- return false;
- }
-
/// Returns a 0 terminated array of registers that can be safely used as
/// scratch registers.
virtual const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index fccd1cb61b7d5..ce88fa1066d3c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1311,12 +1311,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
break;
case ISD::FCANONICALIZE: {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.shouldExpandVectorFCANONICALIZEInVectorLegalizer()) {
- SDLoc dl(Node);
- SDValue Result = TLI.expandFCanonicalizeWithStrictFmul(Node, dl, DAG);
- Results.push_back(Result);
- return;
- }
+ SDLoc dl(Node);
+ SDValue Result = TLI.expandFCanonicalizeWithStrictFmul(Node, dl, DAG);
+ Results.push_back(Result);
+ return;
}
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e2abe98b19f81..cc24ba995e31a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3313,13 +3313,6 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
-// X86 prefers to defer vector FCANONICALIZE to DAG legalization
-// to avoid scalarization during vector legalization.
-bool X86TargetLowering::shouldExpandVectorFCANONICALIZEInVectorLegalizer()
- const {
- return true;
-}
-
bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
// If we are using XMM registers in the ABI and the condition of the select is
// a floating-point compare and we have blendv or conditional move, then it is
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 05c109511561c..6bcb7a36e91b5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1524,8 +1524,6 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
- bool shouldExpandVectorFCANONICALIZEInVectorLegalizer() const override;
-
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
bool convertSelectOfConstantsToMath(EVT VT) const override;
>From cc350699ae32d2629c6c70703cd8d08afacaf7ad Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Fri, 11 Jul 2025 14:38:26 +0800
Subject: [PATCH 3/5] fix: review
---
llvm/include/llvm/CodeGen/TargetLowering.h | 7 -------
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 1 -
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 8 ++++++++
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e455e31e54d28..e99e4a0b3e482 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5681,13 +5681,6 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// only the first Count elements of the vector are used.
SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const;
- /// This implements llvm.canonicalize.f* by multiplication with 1.0, as
- /// suggested in
- /// https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
- /// It uses strict_fp operations even outside a strict_fp context in order
- /// to guarantee that the canonicalization is not optimized away by later
- /// passes. The result chain introduced by that is intentionally ignored
- /// since no ordering requirement is intended here.
SDValue expandFCanonicalizeWithStrictFmul(SDNode *Node, SDLoc DL,
SelectionDAG &DAG) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index ce88fa1066d3c..a8feb22a4f191 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1310,7 +1310,6 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
}
break;
case ISD::FCANONICALIZE: {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(Node);
SDValue Result = TLI.expandFCanonicalizeWithStrictFmul(Node, dl, DAG);
Results.push_back(Result);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 028e4c305c97a..1ec07917075f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11584,6 +11584,14 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
SDValue
TargetLowering::expandFCanonicalizeWithStrictFmul(SDNode *Node, SDLoc DL,
SelectionDAG &DAG) const {
+ // This implements llvm.canonicalize.f* by multiplication with 1.0, as
+ // suggested in
+ // https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
+ // It uses strict_fp operations even outside a strict_fp context in order
+ // to guarantee that the canonicalization is not optimized away by later
+ // passes. The result chain introduced by that is intentionally ignored
+ // since no ordering requirement is intended here.
+
// Create strict multiplication by 1.0.
SDValue Operand = Node->getOperand(0);
EVT VT = Operand.getValueType();
>From a32e690c61b7c7983b402fc926efff8661c26b5b Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Fri, 11 Jul 2025 16:08:11 +0800
Subject: [PATCH 4/5] test: regenerate old test update patch
---
.../AMDGPU/fcanonicalize-elimination.ll | 1709 ++++++++++++++---
1 file changed, 1394 insertions(+), 315 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index ab476dd96c707..6ad021a9355e3 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -1,12 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
-; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
-; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GFX9: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_no_fold_canonicalize_loaded_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_fold_canonicalize_loaded_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%v = load float, ptr addrspace(1) %gep, align 4
@@ -15,11 +37,31 @@ define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
-; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_fmul_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, 0x41700000, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fmul_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x41700000, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -29,12 +71,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(ptr addrspace(1
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32:
-; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_fmul_legacy_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_legacy_f32_e32 v2, 0x41700000, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fmul_legacy_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mul_legacy_f32_e32 v1, 0x41700000, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -44,12 +105,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(ptr addr
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
-; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_sub_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_f32_e32 v2, 0x41700000, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_sub_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_f32_e32 v1, 0x41700000, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -59,12 +139,31 @@ define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(ptr addrspace(1)
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
-; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_add_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v2, 0x41700000, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_add_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v1, 0x41700000, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -74,12 +173,31 @@ define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(ptr addrspace(1)
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
-; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_sqrt_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sqrt_f32_e32 v2, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_sqrt_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -89,12 +207,31 @@ define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(ptr addrspace(1
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
-; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_fceil_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ceil_f32_e32 v2, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fceil_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_ceil_f32_e32 v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -104,12 +241,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(ptr addrspace(
ret void
}
-; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
-; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_floor_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_floor_f32_e32 v2, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_floor_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_floor_f32_e32 v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -119,13 +275,33 @@ define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(ptr addrspace(
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
-; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
-; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_fma_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_mov_b32 s0, 0x41700000
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_fma_f32 v2, v2, s0, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fma_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s2, 0x41700000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_fma_f32 v1, v1, s2, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -135,12 +311,33 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(ptr addrspace(1)
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
-; GCN: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+$}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v3, 0x41700000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mac_f32_e32 v3, 0x41700000, v2
+; VI-NEXT: flat_store_dword v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x41700000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mac_f32_e32 v2, 0x41700000, v1
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -150,15 +347,60 @@ define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(ptr addrspa
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
-; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
-; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
-; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(ptr addrspace(1) %arg) {
+; VI-FLUSH-LABEL: test_fold_canonicalize_fmuladd_value_f32:
+; VI-FLUSH: ; %bb.0:
+; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, 0x41700000
+; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT: flat_load_dword v2, v[0:1]
+; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT: v_mac_f32_e32 v3, 0x41700000, v2
+; VI-FLUSH-NEXT: flat_store_dword v[0:1], v3
+; VI-FLUSH-NEXT: s_endpgm
+;
+; VI-DENORM-LABEL: test_fold_canonicalize_fmuladd_value_f32:
+; VI-DENORM: ; %bb.0:
+; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-NEXT: flat_load_dword v2, v[0:1]
+; VI-DENORM-NEXT: s_mov_b32 s0, 0x41700000
+; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT: v_fma_f32 v2, v2, s0, s0
+; VI-DENORM-NEXT: flat_store_dword v[0:1], v2
+; VI-DENORM-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: test_fold_canonicalize_fmuladd_value_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_mov_b32 s2, 0x41700000
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f32 v1, v1, s2, s2
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: test_fold_canonicalize_fmuladd_value_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, 0x41700000
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, 0x41700000, v1
+; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -168,15 +410,32 @@ define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(ptr addrspac
ret void
}
-; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
-; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
-; VI: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
-; GFX9: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_canonicalize_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_canonicalize_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -186,12 +445,36 @@ define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(ptr add
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
-; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
+; VI-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_dword v1, v[1:2]
+; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v1, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -202,12 +485,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(ptr add
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
-; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
+; VI-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_ushort v1, v[1:2]
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dword v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v1, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
%load = load half, ptr addrspace(1) %gep, align 2
@@ -218,12 +525,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(ptr add
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16:
-; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
+; VI-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_ushort v1, v[1:2]
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dword v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v1, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
%load = load half, ptr addrspace(1) %gep, align 2
@@ -234,12 +565,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf1
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
-; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
+; VI-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f32_f64_e32 v2, v[1:2]
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
%load = load double, ptr addrspace(1) %gep, align 8
@@ -250,12 +605,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(ptr addr
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
-; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_max
-; GCN-NOT: v_mul
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
+; VI-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_dword v1, v[1:2]
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f16_f32_e32 v3, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v1, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -266,12 +645,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(ptr addr
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16:
-; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_max
-; GCN-NOT: v_mul
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
+; VI-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_dword v1, v[1:2]
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f16_f32_e32 v3, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v1, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -282,16 +685,40 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
-; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
-; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
-; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
-; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
-; GFX9: v_pack_b32_f16 [[V:v[0-9]+]], [[V1]], [[V0]]
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
+; VI-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT: v_cvt_f16_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v1, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -302,10 +729,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr
ret void
}
-; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
-; VI: v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}}
-; GFX9: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_no_fold_canonicalize_fneg_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_fold_canonicalize_fneg_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -315,12 +763,33 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(ptr addrspac
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
-; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_fneg_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v2, 0, v2
+; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fneg_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v1, 0, v1
+; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -331,10 +800,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(ptr addrspace(1
ret void
}
-; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
-; VI: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
-; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_no_fold_canonicalize_fabs_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e64 v2, 1.0, |v2|
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_fold_canonicalize_fabs_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1|
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -344,13 +834,32 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(ptr addrspac
ret void
}
-; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32:
-; VI: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
-; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
-; GCN-NOT: v_mul_
-; GCN-NOT: v_max_
define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(ptr addrspace(1) %arg, float %sign) {
+; VI-LABEL: test_no_fold_canonicalize_fcopysign_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e64 v2, 1.0, |v2|
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_fold_canonicalize_fcopysign_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1|
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -362,12 +871,33 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(ptr add
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
-; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_fabs_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v2, 0, v2
+; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_fabs_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v1, 0, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -378,12 +908,34 @@ define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(ptr addrspace(1
ret void
}
-; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
-; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_sin_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, 0.15915494, v2
+; VI-NEXT: v_fract_f32_e32 v2, v2
+; VI-NEXT: v_sin_f32_e32 v2, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_sin_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mul_f32_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_sin_f32_e32 v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -393,12 +945,34 @@ define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(ptr addrspace(1)
ret void
}
-; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
-; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_cos_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, 0.15915494, v2
+; VI-NEXT: v_fract_f32_e32 v2, v2
+; VI-NEXT: v_cos_f32_e32 v2, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_cos_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mul_f32_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_cos_f32_e32 v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -408,12 +982,34 @@ define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(ptr addrspace(1)
ret void
}
-; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
-; GCN: v_sin_f16_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]]
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_sin_value_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; VI-NEXT: v_fract_f16_e32 v2, v2
+; VI-NEXT: v_sin_f16_e32 v2, v2
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_sin_value_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_sin_f16_e32 v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
%load = load half, ptr addrspace(1) %gep, align 2
@@ -423,12 +1019,34 @@ define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(ptr addrspace(1)
ret void
}
-; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
-; GCN: v_cos_f16_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]]
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_cos_value_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; VI-NEXT: v_fract_f16_e32 v2, v2
+; VI-NEXT: v_cos_f16_e32 v2, v2
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_cos_value_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_cos_f16_e32 v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
%load = load half, ptr addrspace(1) %gep, align 2
@@ -438,12 +1056,27 @@ define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(ptr addrspace(1)
ret void
}
-; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
-; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_qNaN_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qNaN_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
@@ -451,17 +1084,35 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(ptr addrspace(1
ret void
}
-; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
-; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
-; GFX9: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]
-; GCN-NOT: v_max
-; GCN-NOT: v_mul
-; GFX9: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_min_f32_e32 v2, 0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_min_f32_e32 v1, 0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -481,12 +1132,33 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
; ret void
; }
-; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
-; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_minnum_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v2, 0, v2
+; VI-NEXT: v_min_f32_e32 v2, 0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_minnum_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v1, 0, v1
+; GFX9-NEXT: v_min_f32_e32 v1, 0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -499,11 +1171,31 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(ptr addrspace
; FIXME: Should there be more checks here? minnum with NaN operand is simplified away.
-; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
-; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
-; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
-; GFX9: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_sNaN_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_sNaN_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -523,20 +1215,37 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1
; ret void
; }
-; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
-; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
-; VI-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
-; VI-FLUSH: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
-; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_max_f32_e32 v2, 0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -546,12 +1255,33 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_iee
ret void
}
-; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
-; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
-; GCN-NOT: v_max
-; GCN-NOT: v_mul
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_maxnum_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v2, 0, v2
+; VI-NEXT: v_max_f32_e32 v2, 0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_maxnum_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v1, 0, v1
+; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
@@ -562,12 +1292,33 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(ptr addrspace
ret void
}
-; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
-; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_maxnum_value_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 0
+; VI-NEXT: v_max_f64 v[2:3], v[2:3], 0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_maxnum_value_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 0
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
%load = load double, ptr addrspace(1) %gep, align 8
@@ -578,50 +1329,130 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(ptr addrspace
ret void
}
-; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee:
-; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
+; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mul_f32_e32 v0, 0x41700000, v0
+; GCN-NEXT: ; return to shader part epilog
entry:
%v = fmul float %arg, 15.0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
-; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
-; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
+; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mul_f32_e32 v0, 0x41700000, v0
+; GCN-NEXT: ; return to shader part epilog
entry:
%v = fmul nnan float %arg, 15.0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_fdiv_value_f32_no_ieee:
-; GCN: v_div_fixup_f32
-; GCN-NOT: v_max
-; GCN-NOT: v_mul
-; GCN: ; return
define amdgpu_ps float @test_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) {
+; VI-FLUSH-LABEL: test_fold_canonicalize_fdiv_value_f32_no_ieee:
+; VI-FLUSH: ; %bb.0: ; %entry
+; VI-FLUSH-NEXT: s_mov_b32 s2, 0x41700000
+; VI-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
+; VI-FLUSH-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; VI-FLUSH-NEXT: v_rcp_f32_e32 v3, v1
+; VI-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; VI-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3
+; VI-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3
+; VI-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2
+; VI-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4
+; VI-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2
+; VI-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; VI-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, s2
+; VI-FLUSH-NEXT: ; return to shader part epilog
+;
+; VI-DENORM-LABEL: test_fold_canonicalize_fdiv_value_f32_no_ieee:
+; VI-DENORM: ; %bb.0: ; %entry
+; VI-DENORM-NEXT: s_mov_b32 s2, 0x41700000
+; VI-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
+; VI-DENORM-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; VI-DENORM-NEXT: v_rcp_f32_e32 v3, v1
+; VI-DENORM-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; VI-DENORM-NEXT: v_fma_f32 v3, v4, v3, v3
+; VI-DENORM-NEXT: v_mul_f32_e32 v4, v2, v3
+; VI-DENORM-NEXT: v_fma_f32 v5, -v1, v4, v2
+; VI-DENORM-NEXT: v_fma_f32 v4, v5, v3, v4
+; VI-DENORM-NEXT: v_fma_f32 v1, -v1, v4, v2
+; VI-DENORM-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; VI-DENORM-NEXT: v_div_fixup_f32 v0, v1, v0, s2
+; VI-DENORM-NEXT: ; return to shader part epilog
+;
+; GFX9-DENORM-LABEL: test_fold_canonicalize_fdiv_value_f32_no_ieee:
+; GFX9-DENORM: ; %bb.0: ; %entry
+; GFX9-DENORM-NEXT: s_mov_b32 s2, 0x41700000
+; GFX9-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
+; GFX9-DENORM-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; GFX9-DENORM-NEXT: v_rcp_f32_e32 v3, v1
+; GFX9-DENORM-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX9-DENORM-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX9-DENORM-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX9-DENORM-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX9-DENORM-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX9-DENORM-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX9-DENORM-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX9-DENORM-NEXT: v_div_fixup_f32 v0, v1, v0, s2
+; GFX9-DENORM-NEXT: ; return to shader part epilog
+;
+; GFX9-FLUSH-LABEL: test_fold_canonicalize_fdiv_value_f32_no_ieee:
+; GFX9-FLUSH: ; %bb.0: ; %entry
+; GFX9-FLUSH-NEXT: s_mov_b32 s2, 0x41700000
+; GFX9-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
+; GFX9-FLUSH-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v1
+; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX9-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX9-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX9-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX9-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX9-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, s2
+; GFX9-FLUSH-NEXT: ; return to shader part epilog
entry:
%v = fdiv float 15.0, %arg0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
-; GFX9-DENORM: global_load_dword [[V:v[0-9]+]],
-; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[V]], s{{\[[0-9]+:[0-9]+\]}}
-; GFX9-DENORM-NOT: 1.0
-; GFX9-DENORM-NOT: v_max
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GFX9-FLUSH: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
+; VI-LABEL: test_fold_canonicalize_load_nnan_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_store_dword v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_load_nnan_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%v = load float, ptr addrspace(1) %gep, align 4
@@ -631,12 +1462,34 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
-; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
-; GCN-NOT: v_mul_
-; GCN-NOT: v_max_
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
+; VI-LABEL: test_fold_canonicalize_load_nnan_value_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_load_nnan_value_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
%v = load double, ptr addrspace(1) %gep, align 8
@@ -646,11 +1499,34 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
-; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]],
-; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]]
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]]
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
+; VI-LABEL: test_fold_canonicalize_load_nnan_value_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_f16_e32 v3, v0, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_load_nnan_value_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
%v = load half, ptr addrspace(1) %gep, align 2
@@ -660,13 +1536,46 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32:
-; GCN: v_add_f32
-; GCN: v_add_f32
-; GCN: v_cndmask_b32
-; GCN-NOT: v_mul_
-; GCN-NOT: v_max_
define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(ptr addrspace(1) %arg) {
+; VI-LABEL: test_fold_canonicalize_select_value_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_load_dword v3, v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_load_dword v4, v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v2, 0x41700000, v2
+; VI-NEXT: v_add_f32_e32 v3, 0x42000000, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_select_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; kill: killed $vgpr0_vgpr1
+; GFX9-NEXT: global_load_dword v3, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v1, 0x41700000, v1
+; GFX9-NEXT: v_add_f32_e32 v2, 0x42000000, v2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load0 = load volatile float, ptr addrspace(1) %gep, align 4
@@ -685,57 +1594,91 @@ define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(ptr addrspace
; passed through the minnum.
; FIXME: canonicalize doesn't work correctly without ieee_mode
-; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
-; GFX9-NOT: v0
-; GFX9-NOT: v1
-; GFX9: v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT: ; return to shader
-; VI-FLUSH: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; VI-FLUSH-NEXT: ; return
-; VI-DENORM-NOT: v0
-; VI-DENORM: v_min_f32_e32 v0, v0, v1
-; VI-DENORM-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
+; VI-FLUSH-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode:
+; VI-FLUSH: ; %bb.0:
+; VI-FLUSH-NEXT: v_min_f32_e32 v0, v0, v1
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-NEXT: ; return to shader part epilog
+;
+; VI-DENORM-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode:
+; VI-DENORM: ; %bb.0:
+; VI-DENORM-NEXT: v_min_f32_e32 v0, v0, v1
+; VI-DENORM-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode:
-; GFX9: v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64
-; VI-DAG: v_mul_f32_e32 v0, 1.0, v0
-; VI-DAG: v_mul_f32_e32 v1, 1.0, v1
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-NEXT: s_setpc_b64
define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
+; VI-LABEL: test_fold_canonicalize_minnum_value_ieee_mode:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-NEXT: v_min_f32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_fold_canonicalize_minnum_value_ieee_mode:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
; Canonicalizing flush necessary pre-gfx9
-; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
-; GCN: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 {
+; VI-FLUSH-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
+; VI-FLUSH: ; %bb.0:
+; VI-FLUSH-NEXT: v_min_f32_e32 v0, v0, v1
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-NEXT: ; return to shader part epilog
+;
+; VI-DENORM-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
+; VI-DENORM: ; %bb.0:
+; VI-DENORM-NEXT: v_min_f32_e32 v0, v0, v1
+; VI-DENORM-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
-; GFX9-DAG: v_add_f16_e32
-; GFX9-DAG: v_mul_f16_e32
-; GFX9-NOT: v_max
-; GFX9-NOT: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) {
+; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-NEXT: v_add_f16_e32 v1, 1.0, v0
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v0
+; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%lo = extractelement <2 x half> %vec, i32 0
%hi = extractelement <2 x half> %vec, i32 1
%lo.op = fadd half %lo, 1.0
@@ -746,10 +1689,23 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) {
ret <2 x half> %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon1_v2f16:
-; GFX9: v_add_f16_e32
-; GFX9: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> %vec) {
+; VI-LABEL: v_test_canonicalize_build_vector_noncanon1_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_f16_e32 v1, 1.0, v0
+; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_build_vector_noncanon1_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v0
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%lo = extractelement <2 x half> %vec, i32 0
%lo.op = fadd half %lo, 1.0
%ins = insertelement <2 x half> %vec, half %lo.op, i32 0
@@ -757,10 +1713,25 @@ define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> %
ret <2 x half> %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon0_v2f16:
-; GFX9: v_add_f16_sdwa
-; GFX9: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> %vec) {
+; VI-LABEL: v_test_canonicalize_build_vector_noncanon0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_build_vector_noncanon0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX9-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%hi = extractelement <2 x half> %vec, i32 1
%hi.op = fadd half %hi, 1.0
%ins = insertelement <2 x half> %vec, half %hi.op, i32 1
@@ -768,11 +1739,12 @@ define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> %
ret <2 x half> %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_extract_element_v2f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX9-NEXT: s_setpc_b64
define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
+; GCN-LABEL: v_test_canonicalize_extract_element_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f16_e32 v0, 4.0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
%elt = extractelement <2 x half> %vec.op, i32 0
%canonicalized = call half @llvm.canonicalize.f16(half %elt)
@@ -787,22 +1759,71 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
; ret <2 x half> %canonicalized
; }
-; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
-; GFX9: v_mul_f16
-; GFX9: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_insertelement_noncanon_vec_v2f16(<2 x half> %vec, half %val, i32 %idx) {
+; VI-LABEL: v_test_canonicalize_insertelement_noncanon_vec_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mul_f16_e32 v1, 0x4800, v1
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; VI-NEXT: s_mov_b32 s4, 0xffff
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; VI-NEXT: v_bfi_b32 v0, v2, v1, v0
+; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_insertelement_noncanon_vec_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v1, 0x4800, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_pack_b32_f16 v1, v1, v1
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; GFX9-NEXT: v_bfi_b32 v0, v2, v1, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%ins.op = fmul half %val, 8.0
%ins = insertelement <2 x half> %vec, half %ins.op, i32 %idx
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
ret <2 x half> %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_insval_v2f16:
-; GFX9: v_pk_mul_f16
-; GFX9: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x half> %vec, half %val, i32 %idx) {
+; VI-LABEL: v_test_canonicalize_insertelement_noncanon_insval_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, 0x4400
+; VI-NEXT: v_mul_f16_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v3
+; VI-NEXT: v_mov_b32_e32 v3, 16
+; VI-NEXT: s_mov_b32 s4, 0xffff
+; VI-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; VI-NEXT: v_bfi_b32 v0, v2, v1, v0
+; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_insertelement_noncanon_insval_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; GFX9-NEXT: v_bfi_b32 v0, v2, v1, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
%ins = insertelement <2 x half> %vec.op, half %val, i32 %idx
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
@@ -815,101 +1836,157 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x
; ret <2 x half> %canonicalized
; }
-; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
-; GCN: s_waitcnt
-; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2
-; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_cubeid(float %a, float %b, float %c) {
+; GCN-LABEL: v_test_canonicalize_cubeid:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
%cvt = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
%canonicalized = call float @llvm.canonicalize.f32(float %cvt)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_frexp_mant:
-; GCN: s_waitcnt
-; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_frexp_mant(float %a) {
+; GCN-LABEL: v_test_canonicalize_frexp_mant:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%cvt = call float @llvm.amdgcn.frexp.mant.f32(float %a)
%canonicalized = call float @llvm.canonicalize.f32(float %cvt)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_amdgcn_log:
-; GCN: s_waitcnt
-; GCN-NEXT: v_log_f32
-; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_amdgcn_log(float %a) {
+; GCN-LABEL: v_test_canonicalize_amdgcn_log:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_log_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%log = call float @llvm.amdgcn.log.f32(float %a)
%canonicalized = call float @llvm.canonicalize.f32(float %log)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_amdgcn_exp2:
-; GCN: s_waitcnt
-; GCN-NEXT: v_exp_f32
-; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_amdgcn_exp2(float %a) {
+; GCN-LABEL: v_test_canonicalize_amdgcn_exp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_exp_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%log = call float @llvm.amdgcn.exp2.f32(float %a)
%canonicalized = call float @llvm.canonicalize.f32(float %log)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_minimum:
-; GCN: s_waitcnt
-; GCN-NEXT: v_min_f32_e32 [[MIN:v[0-9]+]], v0, v1
-; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 0x7fc00000
-; GCN-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, [[K]], [[MIN]], vcc
-; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_minimum(float %a, float %b) {
+; VI-FLUSH-LABEL: v_test_canonicalize_minimum:
+; VI-FLUSH: ; %bb.0:
+; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-FLUSH-NEXT: v_min_f32_e32 v2, v0, v1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; VI-FLUSH-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; VI-FLUSH-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-DENORM-LABEL: v_test_canonicalize_minimum:
+; VI-DENORM: ; %bb.0:
+; VI-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-DENORM-NEXT: v_min_f32_e32 v2, v0, v1
+; VI-DENORM-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; VI-DENORM-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; VI-DENORM-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_minimum:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minimum.f32(float %a, float %b)
%canonicalized = call float @llvm.canonicalize.f32(float %min)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_maximum:
-; GCN: s_waitcnt
-; GCN-NEXT: v_max_f32_e32 [[MIN:v[0-9]+]], v0, v1
-; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 0x7fc00000
-; GCN-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, [[K]], [[MIN]], vcc
-; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_maximum(float %a, float %b) {
+; VI-FLUSH-LABEL: v_test_canonicalize_maximum:
+; VI-FLUSH: ; %bb.0:
+; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-FLUSH-NEXT: v_max_f32_e32 v2, v0, v1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; VI-FLUSH-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; VI-FLUSH-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-DENORM-LABEL: v_test_canonicalize_maximum:
+; VI-DENORM: ; %bb.0:
+; VI-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-DENORM-NEXT: v_max_f32_e32 v2, v0, v1
+; VI-DENORM-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; VI-DENORM-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; VI-DENORM-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_maximum:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.maximum.f32(float %a, float %b)
%canonicalized = call float @llvm.canonicalize.f32(float %min)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_minimumnum:
-; GCN: s_waitcnt
-; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_minimumnum(float %a, float %b) {
+; VI-LABEL: v_test_canonicalize_minimumnum:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-NEXT: v_min_f32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_minimumnum:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minimumnum.f32(float %a, float %b)
%canonicalized = call float @llvm.canonicalize.f32(float %min)
ret float %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_maximumnum:
-; GCN: s_waitcnt
-; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GCN-NEXT: v_max_f32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_maximumnum(float %a, float %b) {
+; VI-LABEL: v_test_canonicalize_maximumnum:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-NEXT: v_max_f32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_maximumnum:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.maximumnum.f32(float %a, float %b)
%canonicalized = call float @llvm.canonicalize.f32(float %min)
ret float %canonicalized
@@ -917,7 +1994,6 @@ define float @v_test_canonicalize_maximumnum(float %a, float %b) {
; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
-; GCN: .amd_amdgpu_isa
declare float @llvm.canonicalize.f32(float) #0
declare float @llvm.copysign.f32(float, float) #0
@@ -949,3 +2025,6 @@ declare float @llvm.amdgcn.exp2.f32(float) #0
attributes #0 = { nounwind readnone }
attributes #1 = { "no-nans-fp-math"="true" }
attributes #2 = { "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-DENORM: {{.*}}
+; GCN-FLUSH: {{.*}}
>From 5963de7cf02146b52606bf990760997f616f2d68 Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Fri, 11 Jul 2025 16:10:27 +0800
Subject: [PATCH 5/5] update testcase change for amd and systemz
---
.../AMDGPU/fcanonicalize-elimination.ll | 36 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 490 +++++++++---------
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 181 +++----
llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll | 77 +--
llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll | 77 +--
llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 36 +-
llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 36 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 35 +-
llvm/test/CodeGen/AMDGPU/maximumnum.ll | 444 ++++++++--------
llvm/test/CodeGen/AMDGPU/minimumnum.ll | 444 ++++++++--------
llvm/test/CodeGen/AMDGPU/reduction.ll | 44 +-
.../test/CodeGen/SystemZ/canonicalize-vars.ll | 48 +-
12 files changed, 974 insertions(+), 974 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 6ad021a9355e3..475e6201dd391 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -690,19 +690,22 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_cvt_f16_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1666,8 +1669,10 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) {
; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: v_add_f16_e32 v1, 1.0, v0
+; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -1694,7 +1699,9 @@ define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> %
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_f16_e32 v1, 1.0, v0
-; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1718,8 +1725,9 @@ define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> %
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
-; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_mul_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1770,9 +1778,10 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_vec_v2f16(<2 x hal
; VI-NEXT: v_or_b32_e32 v1, v1, v3
; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4
; VI-NEXT: v_bfi_b32 v0, v2, v1, v0
-; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v0
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_insertelement_noncanon_vec_v2f16:
@@ -1807,9 +1816,10 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4
; VI-NEXT: v_bfi_b32 v0, v2, v1, v0
-; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v0
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_insertelement_noncanon_insval_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 9ef48588a51ae..4abfd045bd878 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -199,8 +199,9 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1394,10 +1395,11 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v2, v0, v1
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v0
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -1468,10 +1470,11 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e64 v0, |v0|, |v0|
-; VI-NEXT: v_or_b32_e32 v2, v0, v1
+; VI-NEXT: v_mul_f16_e64 v2, |v0|, 1.0
+; VI-NEXT: v_mul_f16_sdwa v0, |v0|, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -1545,10 +1548,11 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e64 v0, -|v0|, -|v0|
-; VI-NEXT: v_or_b32_e32 v2, v0, v1
+; VI-NEXT: v_mul_f16_e64 v2, -|v0|, 1.0
+; VI-NEXT: v_mul_f16_sdwa v0, -|v0|, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -1624,10 +1628,11 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
-; VI-NEXT: v_or_b32_e32 v2, v0, v1
+; VI-NEXT: v_mul_f16_e64 v2, -v0, 1.0
+; VI-NEXT: v_mul_f16_sdwa v0, -v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -1695,12 +1700,13 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v0, 0x3c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_max_f16_e64 v0, s2, s2
-; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v0, v1
+; VI-NEXT: v_mul_f16_e64 v1, s2, 1.0
+; VI-NEXT: s_lshr_b32 s2, s2, 16
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -2425,9 +2431,10 @@ define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2463,12 +2470,13 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_var_v4f16:
@@ -2883,9 +2891,11 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
; VI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
-; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; VI-NEXT: v_mul_f16_e64 v1, s4, 1.0
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
@@ -2932,10 +2942,13 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; VI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
+; VI-NEXT: v_mul_f16_e64 v2, s4, 1.0
+; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
@@ -2985,10 +2998,13 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; VI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
+; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mul_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v3
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3044,15 +3060,16 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v6f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v2, v2, v2
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v5
-; VI-NEXT: v_or_b32_e32 v1, v1, v4
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; VI-NEXT: v_mul_f16_sdwa v5, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f16_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_e32 v1, v1, v5
+; VI-NEXT: v_or_b32_e32 v2, v2, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_var_v6f16:
@@ -3095,18 +3112,19 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v3, v3, v3
-; VI-NEXT: v_max_f16_e32 v2, v2, v2
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v7
-; VI-NEXT: v_or_b32_e32 v1, v1, v6
-; VI-NEXT: v_or_b32_e32 v2, v2, v5
-; VI-NEXT: v_or_b32_e32 v3, v3, v4
+; VI-NEXT: v_mov_b32_e32 v4, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; VI-NEXT: v_mul_f16_sdwa v6, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; VI-NEXT: v_mul_f16_sdwa v7, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v4
+; VI-NEXT: v_or_b32_e32 v1, v1, v7
+; VI-NEXT: v_or_b32_e32 v2, v2, v6
+; VI-NEXT: v_or_b32_e32 v3, v3, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_var_v8f16:
@@ -3155,24 +3173,25 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v12f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v9, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v10, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v11, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v5, v5, v5
-; VI-NEXT: v_max_f16_e32 v4, v4, v4
-; VI-NEXT: v_max_f16_e32 v3, v3, v3
-; VI-NEXT: v_max_f16_e32 v2, v2, v2
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v11
-; VI-NEXT: v_or_b32_e32 v1, v1, v10
-; VI-NEXT: v_or_b32_e32 v2, v2, v9
-; VI-NEXT: v_or_b32_e32 v3, v3, v8
-; VI-NEXT: v_or_b32_e32 v4, v4, v7
-; VI-NEXT: v_or_b32_e32 v5, v5, v6
+; VI-NEXT: v_mov_b32_e32 v6, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; VI-NEXT: v_mul_f16_sdwa v8, v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; VI-NEXT: v_mul_f16_sdwa v9, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; VI-NEXT: v_mul_f16_sdwa v10, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; VI-NEXT: v_mul_f16_sdwa v11, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f16_sdwa v6, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v6
+; VI-NEXT: v_or_b32_e32 v1, v1, v11
+; VI-NEXT: v_or_b32_e32 v2, v2, v10
+; VI-NEXT: v_or_b32_e32 v3, v3, v9
+; VI-NEXT: v_or_b32_e32 v4, v4, v8
+; VI-NEXT: v_or_b32_e32 v5, v5, v7
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_var_v12f16:
@@ -3233,30 +3252,31 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v12, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v13, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v14, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v15, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v7, v7, v7
-; VI-NEXT: v_max_f16_e32 v6, v6, v6
-; VI-NEXT: v_max_f16_e32 v5, v5, v5
-; VI-NEXT: v_max_f16_e32 v4, v4, v4
-; VI-NEXT: v_max_f16_e32 v3, v3, v3
-; VI-NEXT: v_max_f16_e32 v2, v2, v2
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v15
-; VI-NEXT: v_or_b32_e32 v1, v1, v14
-; VI-NEXT: v_or_b32_e32 v2, v2, v13
-; VI-NEXT: v_or_b32_e32 v3, v3, v12
-; VI-NEXT: v_or_b32_e32 v4, v4, v11
-; VI-NEXT: v_or_b32_e32 v5, v5, v10
-; VI-NEXT: v_or_b32_e32 v6, v6, v9
-; VI-NEXT: v_or_b32_e32 v7, v7, v8
+; VI-NEXT: v_mov_b32_e32 v8, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v7, 1.0, v7
+; VI-NEXT: v_mul_f16_sdwa v10, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v6, 1.0, v6
+; VI-NEXT: v_mul_f16_sdwa v11, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; VI-NEXT: v_mul_f16_sdwa v12, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; VI-NEXT: v_mul_f16_sdwa v13, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; VI-NEXT: v_mul_f16_sdwa v14, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; VI-NEXT: v_mul_f16_sdwa v15, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_mul_f16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v8
+; VI-NEXT: v_or_b32_e32 v1, v1, v15
+; VI-NEXT: v_or_b32_e32 v2, v2, v14
+; VI-NEXT: v_or_b32_e32 v3, v3, v13
+; VI-NEXT: v_or_b32_e32 v4, v4, v12
+; VI-NEXT: v_or_b32_e32 v5, v5, v11
+; VI-NEXT: v_or_b32_e32 v6, v6, v10
+; VI-NEXT: v_or_b32_e32 v7, v7, v9
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_var_v16f16:
@@ -3329,54 +3349,55 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v32f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_mov_b32_e32 v16, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v19, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v19
-; VI-NEXT: v_max_f16_sdwa v19, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
+; VI-NEXT: v_mul_f16_sdwa v19, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
; VI-NEXT: v_or_b32_e32 v1, v1, v19
-; VI-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v2, v2, v2
+; VI-NEXT: v_mul_f16_sdwa v19, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
; VI-NEXT: v_or_b32_e32 v2, v2, v19
-; VI-NEXT: v_max_f16_sdwa v19, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v3, v3, v3
+; VI-NEXT: v_mul_f16_sdwa v19, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3
; VI-NEXT: v_or_b32_e32 v3, v3, v19
-; VI-NEXT: v_max_f16_sdwa v19, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v4, v4, v4
+; VI-NEXT: v_mul_f16_sdwa v19, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v4, 1.0, v4
; VI-NEXT: v_or_b32_e32 v4, v4, v19
-; VI-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v5, v5, v5
+; VI-NEXT: v_mul_f16_sdwa v19, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v5, 1.0, v5
; VI-NEXT: v_or_b32_e32 v5, v5, v19
-; VI-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v6, v6, v6
+; VI-NEXT: v_mul_f16_sdwa v19, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v6, 1.0, v6
; VI-NEXT: v_or_b32_e32 v6, v6, v19
-; VI-NEXT: v_max_f16_sdwa v19, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v7, v7, v7
+; VI-NEXT: v_mul_f16_sdwa v19, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v7, 1.0, v7
; VI-NEXT: v_or_b32_e32 v7, v7, v19
-; VI-NEXT: v_max_f16_sdwa v19, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v8, v8, v8
+; VI-NEXT: v_mul_f16_sdwa v19, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v8, 1.0, v8
; VI-NEXT: v_or_b32_e32 v8, v8, v19
-; VI-NEXT: v_max_f16_sdwa v19, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v9, v9, v9
+; VI-NEXT: v_mul_f16_sdwa v19, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v9, 1.0, v9
; VI-NEXT: v_or_b32_e32 v9, v9, v19
-; VI-NEXT: v_max_f16_sdwa v19, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v10, v10, v10
+; VI-NEXT: v_mul_f16_sdwa v19, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v10, 1.0, v10
; VI-NEXT: v_or_b32_e32 v10, v10, v19
-; VI-NEXT: v_max_f16_sdwa v19, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v11, v11, v11
-; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_mul_f16_sdwa v19, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v11, 1.0, v11
+; VI-NEXT: v_mul_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v15, 1.0, v15
+; VI-NEXT: v_mul_f16_sdwa v18, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v14, 1.0, v14
; VI-NEXT: v_or_b32_e32 v11, v11, v19
-; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v15, v15, v15
-; VI-NEXT: v_max_f16_e32 v14, v14, v14
-; VI-NEXT: v_max_f16_e32 v13, v13, v13
-; VI-NEXT: v_max_f16_e32 v12, v12, v12
-; VI-NEXT: v_or_b32_e32 v12, v12, v19
-; VI-NEXT: v_or_b32_e32 v13, v13, v18
-; VI-NEXT: v_or_b32_e32 v14, v14, v17
-; VI-NEXT: v_or_b32_e32 v15, v15, v16
+; VI-NEXT: v_mul_f16_sdwa v19, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v13, 1.0, v13
+; VI-NEXT: v_mul_f16_sdwa v16, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v12, 1.0, v12
+; VI-NEXT: v_or_b32_e32 v12, v12, v16
+; VI-NEXT: v_or_b32_e32 v13, v13, v19
+; VI-NEXT: v_or_b32_e32 v14, v14, v18
+; VI-NEXT: v_or_b32_e32 v15, v15, v17
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_var_v32f16:
@@ -3499,104 +3520,105 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v64f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v31, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v31
-; VI-NEXT: v_max_f16_sdwa v31, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_or_b32_e32 v1, v1, v31
-; VI-NEXT: v_max_f16_sdwa v31, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v2, v2, v2
-; VI-NEXT: v_or_b32_e32 v2, v2, v31
-; VI-NEXT: v_max_f16_sdwa v31, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v3, v3, v3
-; VI-NEXT: v_or_b32_e32 v3, v3, v31
-; VI-NEXT: v_max_f16_sdwa v31, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v4, v4, v4
-; VI-NEXT: v_or_b32_e32 v4, v4, v31
-; VI-NEXT: v_max_f16_sdwa v31, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v5, v5, v5
-; VI-NEXT: v_or_b32_e32 v5, v5, v31
-; VI-NEXT: v_max_f16_sdwa v31, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v6, v6, v6
-; VI-NEXT: v_or_b32_e32 v6, v6, v31
-; VI-NEXT: v_max_f16_sdwa v31, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v7, v7, v7
-; VI-NEXT: v_or_b32_e32 v7, v7, v31
-; VI-NEXT: v_max_f16_sdwa v31, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v8, v8, v8
-; VI-NEXT: v_or_b32_e32 v8, v8, v31
-; VI-NEXT: v_max_f16_sdwa v31, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v9, v9, v9
-; VI-NEXT: v_or_b32_e32 v9, v9, v31
-; VI-NEXT: v_max_f16_sdwa v31, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v10, v10, v10
-; VI-NEXT: v_or_b32_e32 v10, v10, v31
-; VI-NEXT: v_max_f16_sdwa v31, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v11, v11, v11
-; VI-NEXT: v_or_b32_e32 v11, v11, v31
-; VI-NEXT: v_max_f16_sdwa v31, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v12, v12, v12
-; VI-NEXT: v_or_b32_e32 v12, v12, v31
-; VI-NEXT: v_max_f16_sdwa v31, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v13, v13, v13
-; VI-NEXT: v_or_b32_e32 v13, v13, v31
-; VI-NEXT: v_max_f16_sdwa v31, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v14, v14, v14
-; VI-NEXT: v_or_b32_e32 v14, v14, v31
-; VI-NEXT: v_max_f16_sdwa v31, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v15, v15, v15
-; VI-NEXT: v_or_b32_e32 v15, v15, v31
-; VI-NEXT: v_max_f16_sdwa v31, v16, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v16, v16, v16
-; VI-NEXT: v_or_b32_e32 v16, v16, v31
-; VI-NEXT: v_max_f16_sdwa v31, v17, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v17, v17, v17
-; VI-NEXT: v_or_b32_e32 v17, v17, v31
-; VI-NEXT: v_max_f16_sdwa v31, v18, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v18, v18, v18
-; VI-NEXT: v_or_b32_e32 v18, v18, v31
-; VI-NEXT: v_max_f16_sdwa v31, v19, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v19, v19, v19
-; VI-NEXT: v_or_b32_e32 v19, v19, v31
-; VI-NEXT: v_max_f16_sdwa v31, v20, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v20, v20, v20
-; VI-NEXT: v_or_b32_e32 v20, v20, v31
-; VI-NEXT: v_max_f16_sdwa v31, v21, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v21, v21, v21
-; VI-NEXT: v_or_b32_e32 v21, v21, v31
-; VI-NEXT: v_max_f16_sdwa v31, v22, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v22, v22, v22
-; VI-NEXT: v_or_b32_e32 v22, v22, v31
-; VI-NEXT: v_max_f16_sdwa v31, v23, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v23, v23, v23
-; VI-NEXT: v_or_b32_e32 v23, v23, v31
-; VI-NEXT: v_max_f16_sdwa v31, v24, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v24, v24, v24
-; VI-NEXT: v_or_b32_e32 v24, v24, v31
-; VI-NEXT: v_max_f16_sdwa v31, v25, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v25, v25, v25
-; VI-NEXT: v_or_b32_e32 v25, v25, v31
-; VI-NEXT: v_max_f16_sdwa v31, v26, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v26, v26, v26
-; VI-NEXT: v_or_b32_e32 v26, v26, v31
-; VI-NEXT: v_max_f16_sdwa v31, v27, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v27, v27, v27
-; VI-NEXT: v_or_b32_e32 v27, v27, v31
-; VI-NEXT: v_max_f16_sdwa v31, v28, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v28, v28, v28
-; VI-NEXT: v_or_b32_e32 v28, v28, v31
-; VI-NEXT: v_max_f16_sdwa v31, v29, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v29, v29, v29
-; VI-NEXT: v_or_b32_e32 v29, v29, v31
-; VI-NEXT: v_max_f16_sdwa v31, v30, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v30, v30, v30
-; VI-NEXT: v_or_b32_e32 v30, v30, v31
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: v_mov_b32_e32 v31, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v32, v0, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; VI-NEXT: v_or_b32_e32 v1, v1, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; VI-NEXT: v_or_b32_e32 v2, v2, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; VI-NEXT: v_or_b32_e32 v4, v4, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v6, 1.0, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v7, 1.0, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v8, 1.0, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v9, 1.0, v9
+; VI-NEXT: v_or_b32_e32 v9, v9, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v10, 1.0, v10
+; VI-NEXT: v_or_b32_e32 v10, v10, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v11, 1.0, v11
+; VI-NEXT: v_or_b32_e32 v11, v11, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v12, 1.0, v12
+; VI-NEXT: v_or_b32_e32 v12, v12, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v13, 1.0, v13
+; VI-NEXT: v_or_b32_e32 v13, v13, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v14, 1.0, v14
+; VI-NEXT: v_or_b32_e32 v14, v14, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v15, 1.0, v15
+; VI-NEXT: v_or_b32_e32 v15, v15, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v16, 1.0, v16
+; VI-NEXT: v_or_b32_e32 v16, v16, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v17, 1.0, v17
+; VI-NEXT: v_or_b32_e32 v17, v17, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v18, 1.0, v18
+; VI-NEXT: v_or_b32_e32 v18, v18, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v19, 1.0, v19
+; VI-NEXT: v_or_b32_e32 v19, v19, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v20, 1.0, v20
+; VI-NEXT: v_or_b32_e32 v20, v20, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v21, 1.0, v21
+; VI-NEXT: v_or_b32_e32 v21, v21, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v22, 1.0, v22
+; VI-NEXT: v_or_b32_e32 v22, v22, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v23, 1.0, v23
+; VI-NEXT: v_or_b32_e32 v23, v23, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v24, 1.0, v24
+; VI-NEXT: v_or_b32_e32 v24, v24, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v25, 1.0, v25
+; VI-NEXT: v_or_b32_e32 v25, v25, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v26, 1.0, v26
+; VI-NEXT: v_or_b32_e32 v26, v26, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v27, 1.0, v27
+; VI-NEXT: v_or_b32_e32 v27, v27, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v28, 1.0, v28
+; VI-NEXT: v_or_b32_e32 v28, v28, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v29, 1.0, v29
+; VI-NEXT: v_or_b32_e32 v29, v29, v32
+; VI-NEXT: v_mul_f16_sdwa v32, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v30, 1.0, v30
+; VI-NEXT: v_or_b32_e32 v30, v30, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v32, v31, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v31, v31, v31
-; VI-NEXT: v_or_b32_e32 v31, v31, v32
+; VI-NEXT: v_mul_f16_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_e32 v32, 1.0, v32
+; VI-NEXT: v_or_b32_e32 v31, v32, v31
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_var_v64f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index bc541043f1fab..900f81811af15 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2506,9 +2506,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0
-; GFX8-NEXT: v_or_b32_e32 v4, v0, v1
+; GFX8-NEXT: v_mul_f16_e32 v4, 1.0, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v4
@@ -2886,14 +2886,15 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v3, v0, v3
+; GFX8-NEXT: v_mul_f16_e32 v4, 1.0, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_store_dword v[0:1], v3
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_store_dword v[0:1], v4
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: test_canonicalize_value_v2f16_denorm:
@@ -2957,8 +2958,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX6-NEXT: v_mov_b32_e32 v5, s1
; GFX6-NEXT: v_mov_b32_e32 v4, s0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX6-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX6-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
; GFX6-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX6-NEXT: s_endpgm
;
@@ -2977,8 +2978,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX8-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX8-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -2990,8 +2991,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -3005,8 +3006,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
@@ -3020,8 +3021,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b128 v[0:3], v0, s[0:1]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT: v_mul_f64_e32 v[2:3], 1.0, v[2:3]
+; GFX12-NEXT: v_mul_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3044,14 +3045,14 @@ define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
; GFX9-LABEL: v_test_canonicalize_v2f32_flush:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v2f32_flush:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v2f32_flush:
@@ -3061,7 +3062,7 @@ define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GFX12-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg)
ret <2 x float> %canon
@@ -3080,16 +3081,16 @@ define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
; GFX9-LABEL: v_test_canonicalize_v3f32_flush:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v3f32_flush:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1
+; GFX11-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v3f32_flush:
@@ -3099,8 +3100,8 @@ define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1
+; GFX12-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg)
ret <3 x float> %canon
@@ -3120,17 +3121,17 @@ define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
; GFX9-LABEL: v_test_canonicalize_v4f32_flush:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v4f32_flush:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1
+; GFX11-NEXT: v_dual_mul_f32 v2, 1.0, v2 :: v_dual_mul_f32 v3, 1.0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v4f32_flush:
@@ -3140,8 +3141,8 @@ define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
+; GFX12-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1
+; GFX12-NEXT: v_dual_mul_f32 v2, 1.0, v2 :: v_dual_mul_f32 v3, 1.0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg)
ret <4 x float> %canon
@@ -3165,23 +3166,23 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
; GFX9-LABEL: v_test_canonicalize_v8f32_flush:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
-; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX9-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX9-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX9-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX9-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v8f32_flush:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
-; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
-; GFX11-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
+; GFX11-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1
+; GFX11-NEXT: v_dual_mul_f32 v2, 1.0, v2 :: v_dual_mul_f32 v3, 1.0, v3
+; GFX11-NEXT: v_dual_mul_f32 v4, 1.0, v4 :: v_dual_mul_f32 v5, 1.0, v5
+; GFX11-NEXT: v_dual_mul_f32 v6, 1.0, v6 :: v_dual_mul_f32 v7, 1.0, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v8f32_flush:
@@ -3191,10 +3192,10 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
-; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
-; GFX12-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
+; GFX12-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1
+; GFX12-NEXT: v_dual_mul_f32 v2, 1.0, v2 :: v_dual_mul_f32 v3, 1.0, v3
+; GFX12-NEXT: v_dual_mul_f32 v4, 1.0, v4 :: v_dual_mul_f32 v5, 1.0, v5
+; GFX12-NEXT: v_dual_mul_f32 v6, 1.0, v6 :: v_dual_mul_f32 v7, 1.0, v7
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg)
ret <8 x float> %canon
@@ -3204,22 +3205,22 @@ define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v2f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX678-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX678-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v2f64:
@@ -3229,8 +3230,8 @@ define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT: v_mul_f64_e32 v[0:1], 1.0, v[0:1]
+; GFX12-NEXT: v_mul_f64_e32 v[2:3], 1.0, v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg)
ret <2 x double> %canon
@@ -3240,25 +3241,25 @@ define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v3f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX678-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX678-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX678-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v3f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v3f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v3f64:
@@ -3268,9 +3269,9 @@ define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
+; GFX12-NEXT: v_mul_f64_e32 v[0:1], 1.0, v[0:1]
+; GFX12-NEXT: v_mul_f64_e32 v[2:3], 1.0, v[2:3]
+; GFX12-NEXT: v_mul_f64_e32 v[4:5], 1.0, v[4:5]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg)
ret <3 x double> %canon
@@ -3280,28 +3281,28 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v4f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX678-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX678-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX678-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX678-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0
+; GFX678-NEXT: v_mul_f64 v[6:7], v[6:7], 1.0
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v4f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_mul_f64 v[6:7], v[6:7], 1.0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v4f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_mul_f64 v[6:7], v[6:7], 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v4f64:
@@ -3311,10 +3312,10 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7]
+; GFX12-NEXT: v_mul_f64_e32 v[0:1], 1.0, v[0:1]
+; GFX12-NEXT: v_mul_f64_e32 v[2:3], 1.0, v[2:3]
+; GFX12-NEXT: v_mul_f64_e32 v[4:5], 1.0, v[4:5]
+; GFX12-NEXT: v_mul_f64_e32 v[6:7], 1.0, v[6:7]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg)
ret <4 x double> %canon
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index 2f08931f2287e..78ee94e8c372c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -1291,22 +1291,26 @@ define <3 x half> @v_max3_v3f16_maximumnum_maximumnum__v_v_v_0(<3 x half> %a, <3
; GFX8-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v6, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v6, 1.0, v2
+; GFX8-NEXT: v_mul_f16_e32 v7, 1.0, v0
+; GFX8-NEXT: v_max_f16_e32 v6, v7, v6
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x3c00
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v3
+; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-NEXT: v_max_f16_e32 v1, v1, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v6, v7, v6
-; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v5
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v2
+; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-NEXT: v_mul_f16_sdwa v2, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v6
+; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v4
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0:
@@ -1467,28 +1471,33 @@ define <4 x half> @v_max3_v4f16_maximumnum_maximumnum__v_v_v_0(<4 x half> %a, <4
; GFX8-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v6, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v6, 1.0, v2
+; GFX8-NEXT: v_mul_f16_e32 v7, 1.0, v0
; GFX8-NEXT: v_max_f16_e32 v6, v7, v6
-; GFX8-NEXT: v_max_f16_sdwa v7, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x3c00
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v7, v8, v7
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v2
-; GFX8-NEXT: v_max_f16_sdwa v2, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v5
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v3
+; GFX8-NEXT: v_mul_f16_e32 v8, 1.0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v8, v2
+; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v3, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v5
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v3
+; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-NEXT: v_mul_f16_sdwa v3, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v6
+; GFX8-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0:
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
index 969c6c3980fc3..0a7f7810051c2 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
@@ -1291,22 +1291,26 @@ define <3 x half> @v_min3_v3f16_minimumnum_minimumnum__v_v_v_0(<3 x half> %a, <3
; GFX8-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v6, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v6, 1.0, v2
+; GFX8-NEXT: v_mul_f16_e32 v7, 1.0, v0
+; GFX8-NEXT: v_min_f16_e32 v6, v7, v6
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x3c00
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v3
+; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-NEXT: v_min_f16_e32 v1, v1, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX8-NEXT: v_min_f16_e32 v6, v7, v6
-; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v5
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v2
+; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-NEXT: v_mul_f16_sdwa v2, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v6
+; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v4
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0:
@@ -1467,28 +1471,33 @@ define <4 x half> @v_min3_v4f16_minimumnum_minimumnum__v_v_v_0(<4 x half> %a, <4
; GFX8-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v6, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v6, 1.0, v2
+; GFX8-NEXT: v_mul_f16_e32 v7, 1.0, v0
; GFX8-NEXT: v_min_f16_e32 v6, v7, v6
-; GFX8-NEXT: v_max_f16_sdwa v7, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x3c00
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_min_f16_e32 v7, v8, v7
-; GFX8-NEXT: v_min_f16_e32 v1, v1, v2
-; GFX8-NEXT: v_max_f16_sdwa v2, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v1, v1, v5
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v3
+; GFX8-NEXT: v_mul_f16_e32 v8, 1.0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_min_f16_e32 v2, v8, v2
+; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v3, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v5
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v3
+; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-NEXT: v_mul_f16_sdwa v3, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v6
+; GFX8-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-NEXT: v_min_f16_e32 v3, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 863240cc591c3..62f081c38b0ce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -763,17 +763,17 @@ define amdgpu_kernel void @maxnum_v3f16(
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
+; VI-NEXT: v_mul_f16_e64 v0, s8, 1.0
+; VI-NEXT: v_mul_f16_e64 v1, s2, 1.0
; VI-NEXT: s_lshr_b32 s0, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
+; VI-NEXT: v_mul_f16_e64 v1, s0, 1.0
; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_mul_f16_e64 v1, s9, 1.0
+; VI-NEXT: v_mul_f16_e64 v2, s3, 1.0
; VI-NEXT: v_max_f16_e32 v1, v2, v1
; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -907,22 +907,22 @@ define amdgpu_kernel void @maxnum_v4f16(
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
+; VI-NEXT: v_mul_f16_e64 v0, s9, 1.0
+; VI-NEXT: v_mul_f16_e64 v1, s3, 1.0
; VI-NEXT: s_lshr_b32 s0, s9, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
+; VI-NEXT: v_mul_f16_e64 v1, s0, 1.0
; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: v_mul_f16_e64 v0, s8, 1.0
+; VI-NEXT: v_mul_f16_e64 v2, s2, 1.0
; VI-NEXT: s_lshr_b32 s0, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0
; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_mul_f16_e64 v3, s0, 1.0
; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1042,15 +1042,15 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: v_mul_f16_e64 v1, s3, 1.0
+; VI-NEXT: v_mul_f16_e64 v3, s0, 1.0
+; VI-NEXT: v_mul_f16_e64 v2, s2, 1.0
; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 7e8c30161c1c8..2fa7c4d3e45cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -827,17 +827,17 @@ define amdgpu_kernel void @minnum_v3f16(
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
+; VI-NEXT: v_mul_f16_e64 v0, s8, 1.0
+; VI-NEXT: v_mul_f16_e64 v1, s2, 1.0
; VI-NEXT: s_lshr_b32 s0, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
+; VI-NEXT: v_mul_f16_e64 v1, s0, 1.0
; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_mul_f16_e64 v1, s9, 1.0
+; VI-NEXT: v_mul_f16_e64 v2, s3, 1.0
; VI-NEXT: v_min_f16_e32 v1, v2, v1
; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -970,22 +970,22 @@ define amdgpu_kernel void @minnum_v4f16(
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
+; VI-NEXT: v_mul_f16_e64 v0, s9, 1.0
+; VI-NEXT: v_mul_f16_e64 v1, s3, 1.0
; VI-NEXT: s_lshr_b32 s0, s9, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
+; VI-NEXT: v_mul_f16_e64 v1, s0, 1.0
; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: v_mul_f16_e64 v0, s8, 1.0
+; VI-NEXT: v_mul_f16_e64 v2, s2, 1.0
; VI-NEXT: s_lshr_b32 s0, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0
; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_mul_f16_e64 v3, s0, 1.0
; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1104,15 +1104,15 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: v_mul_f16_e64 v1, s3, 1.0
+; VI-NEXT: v_mul_f16_e64 v3, s0, 1.0
+; VI-NEXT: v_mul_f16_e64 v2, s2, 1.0
; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 32e0d393a1001..8696e61891e13 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1088,27 +1088,34 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7
; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8
-; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5
-; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0
-; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1
-; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00
-; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v3
-; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1
+; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v5
+; SDAG-VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; SDAG-VI-NEXT: v_mul_f16_e64 v3, 0, 1.0
+; SDAG-VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; SDAG-VI-NEXT: v_max_f16_e32 v0, v0, v3
+; SDAG-VI-NEXT: v_max_f16_e32 v2, v2, v3
+; SDAG-VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; SDAG-VI-NEXT: v_max_f16_e32 v1, v1, v3
+; SDAG-VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; SDAG-VI-NEXT: v_mul_f16_e64 v3, 1.0, 1.0
+; SDAG-VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; SDAG-VI-NEXT: v_min_f16_e32 v2, v2, v3
+; SDAG-VI-NEXT: v_min_f16_e32 v1, v1, v3
; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1308,7 +1315,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1
; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2
; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3
+; SDAG-VI-NEXT: v_mul_f16_e32 v1, 1.0, v1
; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00
+; SDAG-VI-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; SDAG-VI-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; SDAG-VI-NEXT: v_mul_f16_e32 v2, 1.0, v2
; SDAG-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 4f73e8e9c1883..ba7b1fc2b9a49 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -3636,16 +3636,17 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
; GFX8-SDAG-LABEL: v_maximumnum_v3f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_maximumnum_v3f16:
@@ -3926,20 +3927,21 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
; GFX8-SDAG-LABEL: v_maximumnum_v4f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_maximumnum_v4f16:
@@ -4260,27 +4262,28 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
; GFX8-SDAG-LABEL: v_maximumnum_v6f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v6, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v7, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v5
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v4
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v7
-; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_maximumnum_v6f16:
@@ -4552,34 +4555,35 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
; GFX8-SDAG-LABEL: v_maximumnum_v8f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v8, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v9, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v7, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v5
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v4
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v11
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v10
-; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v9
-; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v8
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v7
+; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v9
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_maximumnum_v8f16:
@@ -4964,62 +4968,63 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
; GFX8-SDAG-LABEL: v_maximumnum_v16f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v16, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v17, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v15, 1.0, v15
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7
; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15
-; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14
-; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v13
-; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v12
-; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v11
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v10
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v9
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v15
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v15, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v14, 1.0, v14
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v14
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v14, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v13, 1.0, v13
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v14, v18, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v13
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v13, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v12, 1.0, v12
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v13, v18, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v12
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v12, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v11, 1.0, v11
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v12, v18, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v11
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v11, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v10, 1.0, v10
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v10
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v9, 1.0, v9
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v9
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v9, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v8, 1.0, v8
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v18, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v8
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v23
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v22
-; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v21
-; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20
-; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v19
-; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v18
-; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v17
-; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v16
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v10
+; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v11
+; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v12
+; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v13
+; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v14
+; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v15
+; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v17
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_maximumnum_v16f16:
@@ -5820,144 +5825,121 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
; GFX8-SDAG-LABEL: v_maximumnum_v32f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v38, v27, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v39, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v48, v26, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v49, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v50, v25, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v51, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v40, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v41, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v58, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v59, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v17, v17, v17
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v52, v24, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v53, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v54, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v55, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v42, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v43, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v44, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v45, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v46, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v47, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v56, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v57, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v39, v49, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v48, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v51, v41, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v40, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v17
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v49, v53, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v50, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v52, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v53, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v54, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v55, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX8-SDAG-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v32, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v34, v29, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v36, v28, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v37, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v31, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v32, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v30, 1.0, v30
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v14, 1.0, v14
; GFX8-SDAG-NEXT: v_max_f16_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v36, v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v37, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v35, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_e32 v30, v30, v30
-; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14
-; GFX8-SDAG-NEXT: v_max_f16_e32 v29, v29, v29
-; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v13
-; GFX8-SDAG-NEXT: v_max_f16_e32 v28, v28, v28
-; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v12
-; GFX8-SDAG-NEXT: v_max_f16_e32 v27, v27, v27
-; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v11
-; GFX8-SDAG-NEXT: v_max_f16_e32 v26, v26, v26
-; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v10
-; GFX8-SDAG-NEXT: v_max_f16_e32 v25, v25, v25
-; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v9
-; GFX8-SDAG-NEXT: v_max_f16_e32 v24, v24, v24
-; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8
-; GFX8-SDAG-NEXT: v_max_f16_e32 v23, v23, v23
-; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-SDAG-NEXT: v_max_f16_e32 v22, v22, v22
-; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX8-SDAG-NEXT: v_max_f16_e32 v21, v21, v21
-; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-SDAG-NEXT: v_max_f16_e32 v20, v20, v20
-; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-SDAG-NEXT: v_max_f16_e32 v19, v19, v19
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v18, v18, v18
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v16, v16, v16
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v30
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v30, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v29, 1.0, v29
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v13, 1.0, v13
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v30, v33, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v29
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v29, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v28, 1.0, v28
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v12, 1.0, v12
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v29, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v28
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v28, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v27, 1.0, v27
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v11, 1.0, v11
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v28, v33, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v27
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v27, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v26, 1.0, v26
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v10, 1.0, v10
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v27, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v26
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v26, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v25, 1.0, v25
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v9, 1.0, v9
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v26, v33, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v25
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v25, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v24, 1.0, v24
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v8, 1.0, v8
; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v24
+; GFX8-SDAG-NEXT: buffer_load_dword v24, off, s[0:3], s32
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v25, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v15, 1.0, v15
+; GFX8-SDAG-NEXT: v_or_b32_e32 v8, v8, v25
+; GFX8-SDAG-NEXT: v_or_b32_e32 v9, v9, v26
+; GFX8-SDAG-NEXT: v_or_b32_e32 v10, v10, v27
+; GFX8-SDAG-NEXT: v_or_b32_e32 v11, v11, v28
+; GFX8-SDAG-NEXT: v_or_b32_e32 v12, v12, v29
+; GFX8-SDAG-NEXT: v_or_b32_e32 v13, v13, v30
+; GFX8-SDAG-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v24, 1.0, v24
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v33, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v24
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v24, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v23, 1.0, v23
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v23
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v23, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v22, 1.0, v22
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v22
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v22, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v21, 1.0, v21
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v34, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v21
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v21, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v20, 1.0, v20
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v20
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v20, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v19, 1.0, v19
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v34, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v19
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v19, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v18, 1.0, v18
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v18
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v17, 1.0, v17
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v17
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v17, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v31, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v16, 1.0, v16
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v31, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v16
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v33
-; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v55
-; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v54
-; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v53
-; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v52
-; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v51
-; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v50
-; GFX8-SDAG-NEXT: v_or_b32_e32 v8, v8, v49
-; GFX8-SDAG-NEXT: v_or_b32_e32 v9, v9, v48
-; GFX8-SDAG-NEXT: v_or_b32_e32 v10, v10, v39
-; GFX8-SDAG-NEXT: v_or_b32_e32 v11, v11, v38
-; GFX8-SDAG-NEXT: v_or_b32_e32 v12, v12, v36
-; GFX8-SDAG-NEXT: v_or_b32_e32 v13, v13, v34
-; GFX8-SDAG-NEXT: v_or_b32_e32 v14, v14, v32
-; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v31, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v31, v31, v31
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v37, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v31
-; GFX8-SDAG-NEXT: v_or_b32_e32 v15, v15, v35
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v17
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v18
+; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20
+; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v22
+; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v23
+; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v24
+; GFX8-SDAG-NEXT: v_or_b32_e32 v15, v15, v33
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_maximumnum_v32f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 558006d2b6957..f50e83bc4a616 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -3461,16 +3461,17 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
; GFX8-SDAG-LABEL: v_minimumnum_v3f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_minimumnum_v3f16:
@@ -3751,20 +3752,21 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
; GFX8-SDAG-LABEL: v_minimumnum_v4f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v3
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_minimumnum_v4f16:
@@ -4085,27 +4087,28 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
; GFX8-SDAG-LABEL: v_minimumnum_v6f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v6, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v7, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
; GFX8-SDAG-NEXT: v_min_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v5
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v4
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v7
-; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_minimumnum_v6f16:
@@ -4377,34 +4380,35 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
; GFX8-SDAG-LABEL: v_minimumnum_v8f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v8, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v9, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
; GFX8-SDAG-NEXT: v_min_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v7, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v5
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v6, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v4
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v11
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v10
-; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v9
-; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v8
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v7
+; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v9
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_minimumnum_v8f16:
@@ -4789,62 +4793,63 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
; GFX8-SDAG-LABEL: v_minimumnum_v16f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v16, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v17, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v15, 1.0, v15
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7
; GFX8-SDAG-NEXT: v_min_f16_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15
-; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14
-; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v13
-; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v12
-; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v11
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v10
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v9
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v7, v7, v15
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v15, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v14, 1.0, v14
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v6, v6, v14
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v14, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v13, 1.0, v13
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v14, v18, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v5, v5, v13
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v13, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v12, 1.0, v12
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v13, v18, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v4, v4, v12
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v12, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v11, 1.0, v11
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v12, v18, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v3, v3, v11
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v11, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v10, 1.0, v10
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v10
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v9, 1.0, v9
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v9
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v9, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v8, 1.0, v8
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v10, v18, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v9, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v8
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v23
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v22
-; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v21
-; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20
-; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v19
-; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v18
-; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v17
-; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v16
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v10
+; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v11
+; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v12
+; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v13
+; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v14
+; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v15
+; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v17
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_minimumnum_v16f16:
@@ -5645,144 +5650,121 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
; GFX8-SDAG-LABEL: v_minimumnum_v32f16:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v38, v27, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v39, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v48, v26, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v49, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v50, v25, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v51, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v40, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v41, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v58, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v59, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v17, v17, v17
-; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v52, v24, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v53, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v54, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v55, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v42, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v43, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v44, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v45, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v46, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v47, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v56, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v57, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v39, v49, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v48, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v51, v41, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v40, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v17
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v49, v53, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v50, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v52, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v53, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v54, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v55, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX8-SDAG-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v32, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v34, v29, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v36, v28, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v37, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v31, 0x3c00
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v32, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v30, 1.0, v30
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v14, 1.0, v14
; GFX8-SDAG-NEXT: v_min_f16_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v36, v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v37, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v33, v35, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_max_f16_e32 v30, v30, v30
-; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14
-; GFX8-SDAG-NEXT: v_max_f16_e32 v29, v29, v29
-; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v13
-; GFX8-SDAG-NEXT: v_max_f16_e32 v28, v28, v28
-; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v12
-; GFX8-SDAG-NEXT: v_max_f16_e32 v27, v27, v27
-; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v11
-; GFX8-SDAG-NEXT: v_max_f16_e32 v26, v26, v26
-; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v10
-; GFX8-SDAG-NEXT: v_max_f16_e32 v25, v25, v25
-; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v9
-; GFX8-SDAG-NEXT: v_max_f16_e32 v24, v24, v24
-; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8
-; GFX8-SDAG-NEXT: v_max_f16_e32 v23, v23, v23
-; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-SDAG-NEXT: v_max_f16_e32 v22, v22, v22
-; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX8-SDAG-NEXT: v_max_f16_e32 v21, v21, v21
-; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX8-SDAG-NEXT: v_max_f16_e32 v20, v20, v20
-; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-SDAG-NEXT: v_max_f16_e32 v19, v19, v19
-; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-SDAG-NEXT: v_max_f16_e32 v18, v18, v18
-; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-SDAG-NEXT: v_max_f16_e32 v16, v16, v16
-; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-SDAG-NEXT: v_min_f16_e32 v14, v14, v30
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v30, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v29, 1.0, v29
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v13, 1.0, v13
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v30, v33, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v13, v13, v29
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v29, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v28, 1.0, v28
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v12, 1.0, v12
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v29, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v12, v12, v28
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v28, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v27, 1.0, v27
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v11, 1.0, v11
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v28, v33, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v11, v11, v27
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v27, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v26, 1.0, v26
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v10, 1.0, v10
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v27, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v10, v10, v26
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v26, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v25, 1.0, v25
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v9, 1.0, v9
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v26, v33, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v9, v9, v25
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v25, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v24, 1.0, v24
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v8, 1.0, v8
; GFX8-SDAG-NEXT: v_min_f16_e32 v8, v8, v24
+; GFX8-SDAG-NEXT: buffer_load_dword v24, off, s[0:3], s32
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v25, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v15, 1.0, v15
+; GFX8-SDAG-NEXT: v_or_b32_e32 v8, v8, v25
+; GFX8-SDAG-NEXT: v_or_b32_e32 v9, v9, v26
+; GFX8-SDAG-NEXT: v_or_b32_e32 v10, v10, v27
+; GFX8-SDAG-NEXT: v_or_b32_e32 v11, v11, v28
+; GFX8-SDAG-NEXT: v_or_b32_e32 v12, v12, v29
+; GFX8-SDAG-NEXT: v_or_b32_e32 v13, v13, v30
+; GFX8-SDAG-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v24, 1.0, v24
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v33, v33, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_min_f16_e32 v15, v15, v24
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v24, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v23, 1.0, v23
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v24, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v7, v7, v23
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v23, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v22, 1.0, v22
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v23, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v6, v6, v22
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v22, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v21, 1.0, v21
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v22, v34, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v5, v5, v21
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v21, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v20, 1.0, v20
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v21, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v4, v4, v20
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v20, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v19, 1.0, v19
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v20, v34, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v3, v3, v19
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v19, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v18, 1.0, v18
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v19, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v18
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v17, 1.0, v17
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v17
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v17, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_sdwa v31, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v16, 1.0, v16
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v18, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_min_f16_sdwa v17, v31, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v16
-; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v33
-; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v55
-; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v54
-; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v53
-; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v52
-; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v51
-; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v50
-; GFX8-SDAG-NEXT: v_or_b32_e32 v8, v8, v49
-; GFX8-SDAG-NEXT: v_or_b32_e32 v9, v9, v48
-; GFX8-SDAG-NEXT: v_or_b32_e32 v10, v10, v39
-; GFX8-SDAG-NEXT: v_or_b32_e32 v11, v11, v38
-; GFX8-SDAG-NEXT: v_or_b32_e32 v12, v12, v36
-; GFX8-SDAG-NEXT: v_or_b32_e32 v13, v13, v34
-; GFX8-SDAG-NEXT: v_or_b32_e32 v14, v14, v32
-; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v31, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT: v_max_f16_e32 v31, v31, v31
-; GFX8-SDAG-NEXT: v_min_f16_sdwa v35, v37, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-SDAG-NEXT: v_min_f16_e32 v15, v15, v31
-; GFX8-SDAG-NEXT: v_or_b32_e32 v15, v15, v35
+; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v17
+; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v18
+; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20
+; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v22
+; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v23
+; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v24
+; GFX8-SDAG-NEXT: v_or_b32_e32 v15, v15, v33
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_minimumnum_v32f16:
diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll
index 7f9044ae164d5..a55081981408c 100644
--- a/llvm/test/CodeGen/AMDGPU/reduction.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduction.ll
@@ -566,13 +566,14 @@ define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
; VI-LABEL: reduction_maxnum_v4f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v1
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v0
; VI-NEXT: v_max_f16_e32 v2, v3, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_max_f16_e32 v0, v0, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v2
+; VI-NEXT: v_max_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
@@ -596,13 +597,14 @@ define half @reduction_minnum_v4f16(<4 x half> %vec4) {
; VI-LABEL: reduction_minnum_v4f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v1
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v0
; VI-NEXT: v_min_f16_e32 v2, v3, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_min_f16_e32 v0, v0, v1
-; VI-NEXT: v_min_f16_e32 v0, v0, v2
+; VI-NEXT: v_min_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
@@ -628,13 +630,14 @@ define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
; VI-LABEL: reduction_fast_max_pattern_v4f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v1
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v0
; VI-NEXT: v_max_f16_e32 v2, v3, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_max_f16_e32 v0, v0, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v2
+; VI-NEXT: v_max_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
@@ -662,13 +665,14 @@ define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
; VI-LABEL: reduction_fast_min_pattern_v4f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_mul_f16_e32 v2, 1.0, v1
+; VI-NEXT: v_mul_f16_e32 v3, 1.0, v0
; VI-NEXT: v_min_f16_e32 v2, v3, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
+; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_min_f16_e32 v0, v0, v1
-; VI-NEXT: v_min_f16_e32 v0, v0, v2
+; VI-NEXT: v_min_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
index e02f931c4d31e..8b4640a1e33b1 100644
--- a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
+++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
@@ -199,17 +199,8 @@ define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind {
define <4 x float> @canonicalize_v4f32(<4 x float> %a) {
; Z16-LABEL: canonicalize_v4f32:
; Z16: # %bb.0:
-; Z16-NEXT: vrepf %v0, %v24, 3
-; Z16-NEXT: vgmf %v1, 2, 8
-; Z16-NEXT: vrepf %v2, %v24, 2
-; Z16-NEXT: meebr %f0, %f1
-; Z16-NEXT: meebr %f2, %f1
-; Z16-NEXT: vrepf %v3, %v24, 1
-; Z16-NEXT: vmrhf %v0, %v2, %v0
-; Z16-NEXT: wfmsb %f2, %v24, %f1
-; Z16-NEXT: wfmsb %f1, %f3, %f1
-; Z16-NEXT: vmrhf %v1, %v2, %v1
-; Z16-NEXT: vmrhg %v24, %v1, %v0
+; Z16-NEXT: vgmf %v0, 2, 8
+; Z16-NEXT: vfmsb %v24, %v24, %v0
; Z16-NEXT: br %r14
%canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a)
ret <4 x float> %canonicalized
@@ -219,14 +210,8 @@ define <4 x double> @canonicalize_v4f64(<4 x double> %a) {
; Z16-LABEL: canonicalize_v4f64:
; Z16: # %bb.0:
; Z16-NEXT: vgmg %v0, 2, 11
-; Z16-NEXT: vrepg %v2, %v24, 1
-; Z16-NEXT: wfmdb %f1, %v24, %f0
-; Z16-NEXT: mdbr %f2, %f0
-; Z16-NEXT: vmrhg %v24, %v1, %v2
-; Z16-NEXT: vrepg %v2, %v26, 1
-; Z16-NEXT: wfmdb %f1, %v26, %f0
-; Z16-NEXT: wfmdb %f0, %f2, %f0
-; Z16-NEXT: vmrhg %v26, %v1, %v0
+; Z16-NEXT: vfmdb %v24, %v24, %v0
+; Z16-NEXT: vfmdb %v26, %v26, %v0
; Z16-NEXT: br %r14
%canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %a)
ret <4 x double> %canonicalized
@@ -344,17 +329,8 @@ define void @canonicalize_ptr_v4f32(ptr %out) {
; Z16-LABEL: canonicalize_ptr_v4f32:
; Z16: # %bb.0:
; Z16-NEXT: vl %v0, 0(%r2), 3
-; Z16-NEXT: vrepf %v1, %v0, 3
-; Z16-NEXT: vgmf %v2, 2, 8
-; Z16-NEXT: vrepf %v3, %v0, 2
-; Z16-NEXT: meebr %f1, %f2
-; Z16-NEXT: meebr %f3, %f2
-; Z16-NEXT: vmrhf %v1, %v3, %v1
-; Z16-NEXT: wfmsb %f3, %f0, %f2
-; Z16-NEXT: vrepf %v0, %v0, 1
-; Z16-NEXT: meebr %f0, %f2
-; Z16-NEXT: vmrhf %v0, %v3, %v0
-; Z16-NEXT: vmrhg %v0, %v0, %v1
+; Z16-NEXT: vgmf %v1, 2, 8
+; Z16-NEXT: vfmsb %v0, %v0, %v1
; Z16-NEXT: vst %v0, 0(%r2), 3
; Z16-NEXT: br %r14
%val = load <4 x float>, ptr %out
@@ -366,17 +342,11 @@ define void @canonicalize_ptr_v4f32(ptr %out) {
define void @canonicalize_ptr_v4f64(ptr %out) {
; Z16-LABEL: canonicalize_ptr_v4f64:
; Z16: # %bb.0:
+; Z16-NEXT: vl %v0, 0(%r2), 4
; Z16-NEXT: vl %v1, 16(%r2), 4
; Z16-NEXT: vgmg %v2, 2, 11
-; Z16-NEXT: wfmdb %f3, %f1, %f2
-; Z16-NEXT: vrepg %v1, %v1, 1
-; Z16-NEXT: mdbr %f1, %f2
-; Z16-NEXT: vl %v0, 0(%r2), 4
-; Z16-NEXT: vmrhg %v1, %v3, %v1
-; Z16-NEXT: wfmdb %f3, %f0, %f2
-; Z16-NEXT: vrepg %v0, %v0, 1
-; Z16-NEXT: mdbr %f0, %f2
-; Z16-NEXT: vmrhg %v0, %v3, %v0
+; Z16-NEXT: vfmdb %v1, %v1, %v2
+; Z16-NEXT: vfmdb %v0, %v0, %v2
; Z16-NEXT: vst %v0, 0(%r2), 4
; Z16-NEXT: vst %v1, 16(%r2), 4
; Z16-NEXT: br %r14
More information about the llvm-commits
mailing list