[llvm-branch-commits] [llvm] [AMDGPU][DAGCombiner][GlobalISel] Extend allMulUsesCanBeContracted with FPEXT pattern (PR #188116)
Adel Ejjeh via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Mar 23 15:51:12 PDT 2026
https://github.com/adelejjeh updated https://github.com/llvm/llvm-project/pull/188116
>From 56494b429f85b7398df6c1dda56c4a1229316107 Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <adel.ejjeh at amd.com>
Date: Thu, 12 Mar 2026 10:09:35 -0500
Subject: [PATCH] [AMDGPU][DAGCombiner][GlobalISel] Extend
allMulUsesCanBeContracted with FPEXT pattern
Extend the allMulUsesCanBeContracted analysis to recognize FPEXT patterns
where the multiply result flows through fpext before being used in
contractable operations (fadd, fsub). This covers:
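- fmul --> fpext --> {fadd, fsub}: contractable when isFPExtFoldable holds
- fmul --> fpext --> fneg --> fsub: contractable when isFPExtFoldable holds
  and every user of the fneg is an fsub
- fmul --> fneg --> fpext --> fsub: contractable when every user of the fpext
  is an fsub and isFPExtFoldable holds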
Also adds allMulUsesCanBeContracted guards to all FPEXT fold sites in
both SDAG (visitFADDForFMACombine, visitFSUBForFMACombine) and GISel
(matchCombineFAddFpExtFMulToFMadOrFMA, matchCombineFSubFpExtFMulToFMadOrFMA,
matchCombineFSubFpExtFNegFMulToFMadOrFMA).
Fixes a missing isFPExtFoldable check in GISel's
matchCombineFSubFpExtFMulToFMadOrFMA, which previously could fold without
verifying that the extension is actually foldable.
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 107 +-
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 91 +-
.../AMDGPU/fma-multiple-uses-contraction.ll | 1941 ++++++++++++++++-
4 files changed, 2103 insertions(+), 39 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 09c827f71a34d..8440fdcbbd08b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -805,7 +805,8 @@ class CombinerHelper {
/// Check if all uses of a multiply can be contracted into fma/fmad
/// operations, so that duplicating the multiply is acceptable.
- bool allMulUsesCanBeContracted(const MachineInstr &MI) const;
+ bool allMulUsesCanBeContracted(const MachineInstr &MI,
+ unsigned PreferredFusedOpcode) const;
bool canCombineFMadOrFMA(MachineInstr &MI, bool &AllowFusionGlobally,
bool &HasFMAD, bool &Aggressive,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index f495e3e0dae4c..0941e6da0f40f 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6316,10 +6316,15 @@ static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1,
/// would duplicate the multiply without reducing the total number of
/// operations.
///
-/// Currently checks for the following patterns:
+/// This uses a simple, non-recursive check for the following patterns:
/// - fmul --> fadd/fsub: Direct contraction
/// - fmul --> fneg --> fsub: Contraction through fneg
-bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const {
+/// - fmul --> fneg --> fpext --> fsub: FNEG then FPEXT folds if foldable
+/// - fmul --> fpext --> {fadd, fsub}: FPEXT folds if foldable
+/// - fmul --> fpext --> fneg --> fsub: FPEXT then FNEG to FSUB
+bool CombinerHelper::allMulUsesCanBeContracted(
+ const MachineInstr &MI, unsigned PreferredFusedOpcode) const {
+ const auto &TLI = getTargetLowering();
Register MulReg = MI.getOperand(0).getReg();
for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(MulReg)) {
@@ -6329,13 +6334,66 @@ bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const {
if (Opcode == TargetOpcode::G_FADD || Opcode == TargetOpcode::G_FSUB)
continue;
- // G_FNEG use - contractable if all users of the fneg are G_FSUB.
+ // FNEG --> FSUB pattern
+ // Also handles FNEG --> FPEXT --> FSUB
if (Opcode == TargetOpcode::G_FNEG) {
Register FNegReg = UseMI.getOperand(0).getReg();
- for (const MachineInstr &FNegUser : MRI.use_nodbg_instructions(FNegReg)) {
- unsigned FNegUserOp = FNegUser.getOpcode();
- if (FNegUserOp != TargetOpcode::G_FSUB)
+ // ALL users of the FNEG must be contractable FSUBs or FPEXTs leading to
+ // FSUBs
+ for (const MachineInstr &FNegUseMI :
+ MRI.use_nodbg_instructions(FNegReg)) {
+ unsigned FNegUseOpcode = FNegUseMI.getOpcode();
+
+ if (FNegUseOpcode == TargetOpcode::G_FSUB)
+ continue;
+ if (FNegUseOpcode == TargetOpcode::G_FPEXT) {
+ // FNEG --> FPEXT --> FSUB
+ Register FNegFPExtReg = FNegUseMI.getOperand(0).getReg();
+ for (const MachineInstr &FNegFPExtUseMI :
+ MRI.use_nodbg_instructions(FNegFPExtReg)) {
+ if (FNegFPExtUseMI.getOpcode() != TargetOpcode::G_FSUB)
+ return false;
+ // FPEXT use is FSUB, check if can be folded in
+ if (!TLI.isFPExtFoldable(
+ FNegFPExtUseMI, PreferredFusedOpcode,
+ MRI.getType(FNegFPExtUseMI.getOperand(0).getReg()),
+ MRI.getType(FNegReg)))
+ return false;
+ }
+ continue;
+ }
+ return false;
+ }
+ continue;
+ }
+
+ // FP_EXTEND - check if ALL users are FADD, FSUB, or FNEG --> FSUB
+ if (Opcode == TargetOpcode::G_FPEXT) {
+ Register FPExtReg = UseMI.getOperand(0).getReg();
+
+ // ALL users of the FP_EXTEND must be contractable operations or FNEGs
+ for (const MachineInstr &FPExtUseMI :
+ MRI.use_nodbg_instructions(FPExtReg)) {
+ if (!TLI.isFPExtFoldable(FPExtUseMI, PreferredFusedOpcode,
+ MRI.getType(FPExtUseMI.getOperand(0).getReg()),
+ MRI.getType(MulReg)))
return false;
+ unsigned ExtUseOpcode = FPExtUseMI.getOpcode();
+ if (ExtUseOpcode == TargetOpcode::G_FADD ||
+ ExtUseOpcode == TargetOpcode::G_FSUB) {
+ continue;
+ }
+ if (ExtUseOpcode == TargetOpcode::G_FNEG) {
+ // FP_EXTEND --> FNEG --> FSUB
+ Register FPExtFNegReg = FPExtUseMI.getOperand(0).getReg();
+ for (const MachineInstr &FPExtFNegUseMI :
+ MRI.use_nodbg_instructions(FPExtFNegReg)) {
+ if (FPExtFNegUseMI.getOpcode() != TargetOpcode::G_FSUB)
+ return false;
+ }
+ continue;
+ }
+ return false;
}
continue;
}
@@ -6407,7 +6465,8 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
// avoiding duplication of the multiply without reducing total operations.
if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
(MRI.hasOneNonDBGUse(LHS.Reg) ||
- (Aggressive && allMulUsesCanBeContracted(*LHS.MI)))) {
+ (Aggressive &&
+ allMulUsesCanBeContracted(*LHS.MI, PreferredFusedOpcode)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
{LHS.MI->getOperand(1).getReg(),
@@ -6421,7 +6480,8 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
// avoiding duplication of the multiply without reducing total operations.
if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
(MRI.hasOneNonDBGUse(RHS.Reg) ||
- (Aggressive && allMulUsesCanBeContracted(*RHS.MI)))) {
+ (Aggressive &&
+ allMulUsesCanBeContracted(*RHS.MI, PreferredFusedOpcode)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
{RHS.MI->getOperand(1).getReg(),
@@ -6464,6 +6524,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
MachineInstr *FpExtSrc;
if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
+ allMulUsesCanBeContracted(*FpExtSrc, PreferredFusedOpcode) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
@@ -6479,6 +6540,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
// Note: Commutes FADD operands.
if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
+ allMulUsesCanBeContracted(*FpExtSrc, PreferredFusedOpcode) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
@@ -6718,7 +6780,8 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
if (FirstMulHasFewerUses &&
(isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
(MRI.hasOneNonDBGUse(LHS.Reg) ||
- (Aggressive && allMulUsesCanBeContracted(*LHS.MI))))) {
+ (Aggressive &&
+ allMulUsesCanBeContracted(*LHS.MI, PreferredFusedOpcode))))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0);
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
@@ -6732,7 +6795,8 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
// avoiding duplication of the multiply without reducing total operations.
if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
(MRI.hasOneNonDBGUse(RHS.Reg) ||
- (Aggressive && allMulUsesCanBeContracted(*RHS.MI)))) {
+ (Aggressive &&
+ allMulUsesCanBeContracted(*RHS.MI, PreferredFusedOpcode)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register NegY =
B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0);
@@ -6771,7 +6835,8 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA(
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
((MRI.hasOneNonDBGUse(LHSReg) &&
MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
- (Aggressive && allMulUsesCanBeContracted(*FMulMI)))) {
+ (Aggressive &&
+ allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register NegX =
B.buildFNeg(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
@@ -6795,7 +6860,8 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA(
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
((MRI.hasOneNonDBGUse(RHSReg) &&
MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
- (Aggressive && allMulUsesCanBeContracted(*FMulMI)))) {
+ (Aggressive &&
+ allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
{FMulMI->getOperand(1).getReg(),
@@ -6819,6 +6885,7 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
Register LHSReg = MI.getOperand(1).getReg();
Register RHSReg = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
unsigned PreferredFusedOpcode =
HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
@@ -6827,7 +6894,12 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
// fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z))
if (mi_match(LHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(LHSReg))) {
+ ((MRI.hasOneNonDBGUse(LHSReg) &&
+ MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
+ (Aggressive &&
+ allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode))) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
+ MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register FpExtX =
B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
@@ -6843,7 +6915,12 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
// fold (fsub x, (fpext (fmul y, z))) -> (fma (fneg (fpext y)), (fpext z), x)
if (mi_match(RHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(RHSReg))) {
+ ((MRI.hasOneNonDBGUse(RHSReg) &&
+ MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
+ (Aggressive &&
+ allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode))) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
+ MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register FpExtY =
B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
@@ -6891,6 +6968,7 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
if ((mi_match(LHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) ||
mi_match(LHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
@@ -6907,6 +6985,7 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
if ((mi_match(RHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) ||
mi_match(RHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4c67eec0e5e8a..1b9acb3ab6b9d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17673,25 +17673,75 @@ static bool isFusedOp(const MatcherClass &Matcher, SDValue N) {
/// would duplicate the multiply without reducing the total number of
/// operations.
///
-/// Currently checks for the following patterns:
+/// This uses a simple, non-recursive check for the following patterns:
/// - fmul --> fadd/fsub: Direct contraction
/// - fmul --> fneg --> fsub: Contraction through fneg
-static bool allMulUsesCanBeContracted(SDValue Mul) {
+/// - fmul --> fneg --> fpext --> fsub: FNEG then FPEXT folds if foldable
+/// - fmul --> fpext --> {fadd, fsub}: FPEXT folds if foldable
+/// - fmul --> fpext --> fneg --> fsub: FPEXT then FNEG to FSUB
+static bool allMulUsesCanBeContracted(SDValue Mul,
+ const unsigned PreferredFusedOpcode,
+ const TargetLowering &TLI,
+ SelectionDAG &DAG) {
for (const auto *User : Mul->users()) {
- unsigned Opcode = User->getOpcode();
+ SDNode *UserNode = const_cast<SDNode *>(User);
+ unsigned Opcode = UserNode->getOpcode();
// Direct FADD/FSUB - contractable.
if (Opcode == ISD::FADD || Opcode == ISD::FSUB)
continue;
- // FNEG use - contractable if all users of the fneg are FSUB.
+ // FNEG - check if ALL users are FSUB or foldable FPEXT --> FSUB
if (Opcode == ISD::FNEG) {
- for (const auto *FNegUser : User->users()) {
+ for (const auto *FNegUser : UserNode->users()) {
unsigned FNegUserOp = FNegUser->getOpcode();
- if (FNegUserOp != ISD::FSUB)
- return false;
+
+ if (FNegUserOp == ISD::FSUB) {
+ // FNEG --> FSUB
+ continue;
+ }
+ if (FNegUserOp == ISD::FP_EXTEND) {
+ // FNEG --> FPEXT --> FSUB
+ EVT SrcVT = UserNode->getValueType(0); // Src of FPEXT is the FNEG
+ for (const auto *FNegFPExtUser : FNegUser->users()) {
+ if (FNegFPExtUser->getOpcode() != ISD::FSUB)
+ return false;
+ if (!TLI.isFPExtFoldable(DAG, PreferredFusedOpcode,
+ FNegFPExtUser->getValueType(0), SrcVT))
+ return false;
+ }
+ continue;
+ }
+ return false;
}
- continue;
+ continue; // All FNEG uses are contractable
+ }
+
+ // FP_EXTEND - check if ALL users are FADD, FSUB, or FNEG --> FSUB
+ if (Opcode == ISD::FP_EXTEND) {
+ EVT SrcVT = Mul.getValueType();
+
+ for (const auto *FPExtUser : UserNode->users()) {
+ unsigned ExtUserOp = FPExtUser->getOpcode();
+ EVT DstVT = FPExtUser->getValueType(0);
+ if (!TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, DstVT, SrcVT))
+ return false; // this FPEXT cannot be folded
+
+ if (ExtUserOp == ISD::FADD || ExtUserOp == ISD::FSUB) {
+ continue; // FPEXT --> {FADD, FSUB} is contractable
+ }
+ if (ExtUserOp == ISD::FNEG) {
+ // FP_EXTEND --> FNEG --> FSUB
+ for (const auto *FPExtFNegUser : FPExtUser->users()) {
+ if (FPExtFNegUser->getOpcode() != ISD::FSUB) {
+ return false;
+ }
+ }
+ continue;
+ }
+ return false;
+ }
+ continue; // All FPEXT uses are contractable
}
// Any other use type is not currently recognized as contractable.
@@ -17765,7 +17815,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// Only contract if the multiply has one use or all uses are contractable,
// avoiding duplication of the multiply without reducing total operations.
if (isContractableFMUL(N0) &&
- (N0->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(N0)))) {
+ (N0->hasOneUse() ||
+ (Aggressive &&
+ allMulUsesCanBeContracted(N0, PreferredFusedOpcode, TLI, DAG)))) {
return matcher.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
N0.getOperand(1), N1);
}
@@ -17775,7 +17827,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// Only contract if the multiply has one use or all uses are contractable,
// avoiding duplication of the multiply without reducing total operations.
if (isContractableFMUL(N1) &&
- (N1->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(N1)))) {
+ (N1->hasOneUse() ||
+ (Aggressive &&
+ allMulUsesCanBeContracted(N1, PreferredFusedOpcode, TLI, DAG)))) {
return matcher.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0),
N1.getOperand(1), N0);
}
@@ -17822,6 +17876,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
if (matcher.match(N0, ISD::FP_EXTEND)) {
SDValue N00 = N0.getOperand(0);
if (isContractableFMUL(N00) &&
+ allMulUsesCanBeContracted(N00, PreferredFusedOpcode, TLI, DAG) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return matcher.getNode(
@@ -17836,6 +17891,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
if (matcher.match(N1, ISD::FP_EXTEND)) {
SDValue N10 = N1.getOperand(0);
if (isContractableFMUL(N10) &&
+ allMulUsesCanBeContracted(N10, PreferredFusedOpcode, TLI, DAG) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N10.getValueType())) {
return matcher.getNode(
@@ -17994,7 +18050,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// avoiding duplication of the multiply without reducing total operations.
auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
if (isContractableFMUL(XY) &&
- (XY->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(XY)))) {
+ (XY->hasOneUse() ||
+ (Aggressive &&
+ allMulUsesCanBeContracted(XY, PreferredFusedOpcode, TLI, DAG)))) {
return matcher.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
XY.getOperand(1),
matcher.getNode(ISD::FNEG, SL, VT, Z));
@@ -18008,7 +18066,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// avoiding duplication of the multiply without reducing total operations.
auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
if (isContractableFMUL(YZ) &&
- (YZ->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(YZ)))) {
+ (YZ->hasOneUse() ||
+ (Aggressive &&
+ allMulUsesCanBeContracted(YZ, PreferredFusedOpcode, TLI, DAG)))) {
return matcher.getNode(
PreferredFusedOpcode, SL, VT,
matcher.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
@@ -18047,7 +18107,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// multiply without reducing total operations.
if (matcher.match(N0, ISD::FNEG) && isContractableFMUL(N0.getOperand(0)) &&
((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
- (Aggressive && allMulUsesCanBeContracted(N0.getOperand(0))))) {
+ (Aggressive && allMulUsesCanBeContracted(
+ N0.getOperand(0), PreferredFusedOpcode, TLI, DAG)))) {
SDValue N00 = N0.getOperand(0).getOperand(0);
SDValue N01 = N0.getOperand(0).getOperand(1);
return matcher.getNode(PreferredFusedOpcode, SL, VT,
@@ -18062,6 +18123,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (matcher.match(N0, ISD::FP_EXTEND)) {
SDValue N00 = N0.getOperand(0);
if (isContractableFMUL(N00) &&
+ allMulUsesCanBeContracted(N00, PreferredFusedOpcode, TLI, DAG) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return matcher.getNode(
@@ -18078,6 +18140,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (matcher.match(N1, ISD::FP_EXTEND)) {
SDValue N10 = N1.getOperand(0);
if (isContractableFMUL(N10) &&
+ allMulUsesCanBeContracted(N10, PreferredFusedOpcode, TLI, DAG) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N10.getValueType())) {
return matcher.getNode(
@@ -18100,6 +18163,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (matcher.match(N00, ISD::FNEG)) {
SDValue N000 = N00.getOperand(0);
if (isContractableFMUL(N000) &&
+ allMulUsesCanBeContracted(N000, PreferredFusedOpcode, TLI, DAG) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return matcher.getNode(
@@ -18124,6 +18188,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (matcher.match(N00, ISD::FP_EXTEND)) {
SDValue N000 = N00.getOperand(0);
if (isContractableFMUL(N000) &&
+ allMulUsesCanBeContracted(N000, PreferredFusedOpcode, TLI, DAG) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N000.getValueType())) {
return matcher.getNode(
diff --git a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
index 456951b74729b..b7dcfef9eaa83 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
@@ -1488,14 +1488,1933 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b,
ret { float, float } %ret1
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10-GISEL-F32DENORM: {{.*}}
-; GFX10-GISEL-F32FLUSH: {{.*}}
-; GFX11-GISEL-F32DENORM: {{.*}}
-; GFX11-GISEL-F32FLUSH: {{.*}}
-; GFX11-SDAG-F32DENORM: {{.*}}
-; GFX11-SDAG-F32FLUSH: {{.*}}
-; GFX12-GISEL-F32DENORM: {{.*}}
-; GFX12-GISEL-F32FLUSH: {{.*}}
-; GFX12-SDAG-F32DENORM: {{.*}}
-; GFX12-SDAG-F32FLUSH: {{.*}}
+; Test case: extended multiply used by two adds; second add uses result of first
+; Note: contraction only happens with preserve-sign (flush).
+; Corner case: on some architectures only one add gets contracted.
+define float @fpext_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_contractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_contractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32FLUSH-LABEL: fpext_contractable:
+; GFX10-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32FLUSH-LABEL: fpext_contractable:
+; GFX10-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32FLUSH-LABEL: fpext_contractable:
+; GFX11-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v2, v1, v0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v1, v0, v2 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32FLUSH-LABEL: fpext_contractable:
+; GFX11-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v2, v1, v0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v1, v0, v2 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32FLUSH-LABEL: fpext_contractable:
+; GFX12-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32FLUSH-LABEL: fpext_contractable:
+; GFX12-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_contractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v1, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_contractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v1, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32DENORM-LABEL: fpext_contractable:
+; GFX10-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX10-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32DENORM-LABEL: fpext_contractable:
+; GFX10-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX10-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32DENORM-LABEL: fpext_contractable:
+; GFX11-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX11-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32DENORM-LABEL: fpext_contractable:
+; GFX11-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX11-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32DENORM-LABEL: fpext_contractable:
+; GFX12-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32DENORM-LABEL: fpext_contractable:
+; GFX12-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z
+ %add2 = fadd contract float %add, %mul.ext
+ ret float %add2
+}
+
+; Test case: extended multiply used by an add and used in the return directly
+; Should NOT contract the add, as the extended result is used elsewhere
+define { float, float } @fpext_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_noncontractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_noncontractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fpext_noncontractable:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fpext_noncontractable:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fpext_noncontractable:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fpext_noncontractable:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-GISEL-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: fpext_noncontractable:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: fpext_noncontractable:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_noncontractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_noncontractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v1, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %mul.ext, 1
+ ret { float, float } %ret1
+}
+
+; Test case: extended multiply used by an add, and the unextended multiply is also returned directly
+; Should NOT fuse the multiply into the add, as the mul is used elsewhere
+define { float, half } @fpext_noncontractable_2(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_noncontractable_2:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_noncontractable_2:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fpext_noncontractable_2:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fpext_noncontractable_2:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fpext_noncontractable_2:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fpext_noncontractable_2:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: fpext_noncontractable_2:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: fpext_noncontractable_2:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_noncontractable_2:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_noncontractable_2:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z
+ %ret0 = insertvalue { float, half } poison, float %add, 0
+ %ret1 = insertvalue { float, half } %ret0, half %mul, 1
+ ret { float, half } %ret1
+}
+
+; Test case: extended multiply used by a single add
+; Should contract. Note, contraction only happens with preserve-sign (flush)
+define float @fpext_contractable_2(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX10-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX10-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX11-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v1, v0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX11-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v1, v0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX12-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX12-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_contractable_2:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_contractable_2:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32DENORM-LABEL: fpext_contractable_2:
+; GFX10-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX10-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32DENORM-LABEL: fpext_contractable_2:
+; GFX10-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX10-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32DENORM-LABEL: fpext_contractable_2:
+; GFX11-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX11-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32DENORM-LABEL: fpext_contractable_2:
+; GFX11-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX11-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32DENORM-LABEL: fpext_contractable_2:
+; GFX12-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32DENORM-LABEL: fpext_contractable_2:
+; GFX12-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z
+ ret float %add
+}
+
+; Test case: extended multiply used by two adds that can be contracted
+; Should contract. Note, contraction only happens with preserve-sign (flush)
+define {float, float} @fpext_contractable_3(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX10-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX10-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX11-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v2, v3, v1, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v3, v1, v0 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX11-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v2, v3, v1, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v3, v1, v0 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX12-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX12-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_contractable_3:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v2, v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v1, v0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_contractable_3:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v2, v1, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v1, v0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32DENORM-LABEL: fpext_contractable_3:
+; GFX10-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX10-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32DENORM-LABEL: fpext_contractable_3:
+; GFX10-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX10-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32DENORM-LABEL: fpext_contractable_3:
+; GFX11-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX11-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32DENORM-LABEL: fpext_contractable_3:
+; GFX11-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX11-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32DENORM-LABEL: fpext_contractable_3:
+; GFX12-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32DENORM-LABEL: fpext_contractable_3:
+; GFX12-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z
+ %add2 = fadd contract float %x, %mul.ext
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %add2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: extended multiply used by two subs; second sub uses result of first
+; Note, contraction only happens with preserve-sign (flush)
+; Corner case: on some architectures only one sub gets contracted
+define float @fpext_contractable_sub(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX10-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX10-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX11-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v2, v1, v0, -v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v1, v0, v2 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX11-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v2, v1, v0, -v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v1, v0, v2 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX12-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX12-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v1, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v1, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX10-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX10-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX10-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX10-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX11-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX11-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX11-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX11-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX12-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX12-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ %sub2 = fsub contract float %sub, %mul.ext
+ ret float %sub2
+}
+
+; Test case: the extended multiply (%mul.ext) feeds one fsub and is also returned directly.
+; Should NOT contract the fsub: the fpext result has a second, non-contractable use.
+define { float, float } @fpext_noncontractable_sub(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fpext_noncontractable_sub:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX10-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fpext_noncontractable_sub:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX10-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fpext_noncontractable_sub:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX11-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fpext_noncontractable_sub:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: fpext_noncontractable_sub:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: fpext_noncontractable_sub:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v0, v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v1, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ %ret0 = insertvalue { float, float } poison, float %sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %mul.ext, 1
+ ret { float, float } %ret1
+}
+
+; Test case: the extended multiply feeds one fsub while the half-precision %mul is also returned directly.
+; Should NOT contract the fsub: the fmul itself has a second, non-contractable use.
+define { float, half } @fpext_noncontractable_sub_2(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fpext_noncontractable_sub_2:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fpext_noncontractable_sub_2:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fpext_noncontractable_sub_2:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fpext_noncontractable_sub_2:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: fpext_noncontractable_sub_2:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: fpext_noncontractable_sub_2:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub_2:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub_2:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ %ret0 = insertvalue { float, half } poison, float %sub, 0
+ %ret1 = insertvalue { float, half } %ret0, half %mul, 1
+ ret { float, half } %ret1
+}
+
+; Test case: the extended multiply (%mul.ext) is used by a single fsub and nothing else.
+; Should contract. Note: contraction only happens with preserve-sign (flush); F32DENORM checks keep the v_mul_f16.
+define float @fpext_contractable_sub_2(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX10-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX10-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX11-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v1, v0, -v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX11-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v1, v0, -v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX12-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX12-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX10-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX10-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX10-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX10-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX10-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX10-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX11-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX11-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX11-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX11-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX11-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX11-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX12-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX12-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ ret float %sub
+}
+
+; Test case: the extended multiply is used by two fsubs (as minuend in %sub, as subtrahend in %sub2).
+; Both uses can be contracted, so it should contract. Note: contraction only happens with preserve-sign (flush).
+define {float, float} @fpext_contractable_sub_3(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX10-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX10-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX11-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v2, v3, v1, -v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v3, v1, v0 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX11-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v2, v3, v1, -v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v3, v1, v0 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX12-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX12-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v2, v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v1, v0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v2, v1, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v1, v0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX10-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX10-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX10-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX10-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX11-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX11-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX11-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX11-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX12-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX12-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ %sub2 = fsub contract float %x, %mul.ext
+ %ret0 = insertvalue { float, float } poison, float %sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: multiply used by fpext->fadd AND fneg->fpext->fsub (both contractable)
+; Should contract. Tests the FNEG->FPEXT->FSUB sub-check in allMulUsesCanBeContracted.
+define {float, float} @fpext_fneg_fpext_fsub_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX10-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX10-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX10-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX11-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v3, v2, v1, v0 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v1, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX11-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v3, v2, v1, v0 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v1, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v1, v1, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v1, v1, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX10-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX10-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX10-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX10-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX10-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX10-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX11-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX11-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX11-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX11-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX11-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX11-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ ; Use 1: fpext -> fadd (contractable via FPEXT case)
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z
+ ; Use 2: fneg -> fpext -> fsub (contractable via FNEG->FPEXT sub-check)
+ %neg = fneg contract half %mul
+ %neg.ext = fpext contract half %neg to float
+ %sub = fsub contract float %neg.ext, %x
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub, 1
+ ret { float, float } %ret1
+}
+
+; Test case: half fmul used by fneg->fpext->fsub AND returned directly as a half
+; Should NOT contract. The direct use of the mul is non-contractable, so every check below keeps v_mul_f16.
+define {float, half} @fpext_fneg_fpext_fsub_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e64 v0, -v1
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e64 v0, -v1
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ ; Use 1 of %mul: fneg (half) -> fpext (to float) -> fsub
+ %neg = fneg contract half %mul
+ %neg.ext = fpext contract half %neg to float
+ %sub = fsub contract float %neg.ext, %z
+ ; Use 2 of %mul: returned directly as struct element 1 (non-contractable)
+ %ret0 = insertvalue { float, half } poison, float %sub, 0
+ %ret1 = insertvalue { float, half } %ret0, half %mul, 1
+ ret { float, half } %ret1
+}
+
+; Test case: half fmul used only by fpext, fpext used by fadd AND fneg->fsub (both contractable)
+; Should contract (see the F32FLUSH checks; the F32DENORM checks keep v_mul_f16). Tests the FPEXT->FNEG->FSUB sub-check in allMulUsesCanBeContracted.
+define {float, float} @fpext_fneg_fsub_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX10-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX10-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX10-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX10-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX11-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v3, v2, v1, v0 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v1, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX11-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX11-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32FLUSH-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v3, v2, v1, v0 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v1, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX12-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX12-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v2, v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e64 v1, -v1, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v2, v1, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e64 v1, -v1, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX10-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX10-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX10-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX10-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX10-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX10-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX10-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX11-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX11-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX11-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX11-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX11-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX11-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX11-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX12-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX12-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ ; Use 1 of fpext: fadd with %z (contractable)
+ %add = fadd contract float %mul.ext, %z
+ ; Use 2 of fpext: float fneg -> fsub with %x (contractable via FPEXT->FNEG sub-check)
+ %neg = fneg contract float %mul.ext
+ %sub = fsub contract float %neg, %x
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub, 1
+ ret { float, float } %ret1
+}
+
+; Test case: half fmul used only by fpext, fpext used by fneg->fsub AND returned directly
+; Should NOT contract. The direct use of fpext result is non-contractable, so every check below keeps v_mul_f16.
+define {float, float} @fpext_fneg_fsub_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX10-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX10-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX10-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX11-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_sub_f32_e64 v0, -v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e64 v0, -v1, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ ; Use 1 of fpext: float fneg -> fsub with %z
+ %neg = fneg contract float %mul.ext
+ %sub = fsub contract float %neg, %z
+ ; Use 2 of fpext: returned directly as struct element 1 (non-contractable)
+ %ret0 = insertvalue { float, float } poison, float %sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %mul.ext, 1
+ ret { float, float } %ret1
+}
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
More information about the llvm-branch-commits
mailing list