[llvm] [DAGCombiner][GlobalISel] NFC: Extract isFusedOp, add FMA contraction test, fix isFPExtFoldable (PR #189093)
Adel Ejjeh via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 30 08:54:03 PDT 2026
https://github.com/adelejjeh updated https://github.com/llvm/llvm-project/pull/189093
>From 9a942a856472321e688d5aa659c1c8ed9c97c2f2 Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <adel.ejjeh at amd.com>
Date: Tue, 24 Mar 2026 11:55:53 -0500
Subject: [PATCH] [DAGCombiner][GlobalISel] Extract isFusedOp lambda, add FMA
contraction test, fix missing isFPExtFoldable check
Extract the duplicated isFusedOp lambda in visitFADDForFMACombine and
visitFSUBForFMACombine into a shared static template function.
Add fma-multiple-uses-contraction.ll test file with baseline CHECK lines
for testing FMA contraction behavior when fmul has multiple uses.
This test will be updated in subsequent patches as contraction prevention
logic is added.
Fix a missing isFPExtFoldable check in GISel's
matchCombineFSubFpExtFMulToFMadOrFMA which could incorrectly fold
fsub(fpext(fmul)) into fma on targets where the fpext is not actually
foldable (e.g., gfx9-generic). This makes GISel consistent with SDAG,
which already checks isFPExtFoldable at all fpext fold sites.
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
Made-with: Cursor
---
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 9 +-
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 43 +-
.../AMDGPU/fma-multiple-uses-contraction.ll | 3746 +++++++++++++++++
3 files changed, 3774 insertions(+), 24 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4d92754ae9b48..f4d5bd5ee5745 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6755,6 +6755,7 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
Register LHSReg = MI.getOperand(1).getReg();
Register RHSReg = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
unsigned PreferredFusedOpcode =
HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
@@ -6763,7 +6764,9 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
// fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z))
if (mi_match(LHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(LHSReg))) {
+ (Aggressive || MRI.hasOneNonDBGUse(LHSReg)) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
+ MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register FpExtX =
B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
@@ -6779,7 +6782,9 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
// fold (fsub x, (fpext (fmul y, z))) -> (fma (fneg (fpext y)), (fpext z), x)
if (mi_match(RHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(RHSReg))) {
+ (Aggressive || MRI.hasOneNonDBGUse(RHSReg)) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
+ MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register FpExtY =
B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9134a940f217b..1e1c184b9d119 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17672,6 +17672,12 @@ static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
N->getFlags().hasAllowContract();
}
+/// Return true if \p Matcher matches \p N as ISD::FMA or ISD::FMAD.
+template <class MatchContextClass>
+static bool isFusedOp(const MatchContextClass &Matcher, SDValue N) {
+ return Matcher.match(N, ISD::FMA) || Matcher.match(N, ISD::FMAD);
+}
+
/// Try to perform FMA combining on a given FADD node.
template <class MatchContextClass>
SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
@@ -17718,10 +17724,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
- auto isFusedOp = [&](SDValue N) {
- return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD);
- };
-
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
auto isContractableFMUL = [AllowFusionGlobally, &matcher](SDValue N) {
@@ -17760,16 +17762,16 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
bool CanReassociate = N->getFlags().hasAllowReassociation();
if (CanReassociate) {
SDValue FMA, E;
- if (isFusedOp(N0) && N0.hasOneUse()) {
+ if (isFusedOp(matcher, N0) && N0.hasOneUse()) {
FMA = N0;
E = N1;
- } else if (isFusedOp(N1) && N1.hasOneUse()) {
+ } else if (isFusedOp(matcher, N1) && N1.hasOneUse()) {
FMA = N1;
E = N0;
}
SDValue TmpFMA = FMA;
- while (E && isFusedOp(TmpFMA) && TmpFMA.hasOneUse()) {
+ while (E && isFusedOp(matcher, TmpFMA) && TmpFMA.hasOneUse()) {
SDValue FMul = TmpFMA->getOperand(2);
if (matcher.match(FMul, ISD::FMUL) && FMul.hasOneUse()) {
SDValue C = FMul.getOperand(0);
@@ -17826,7 +17828,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
matcher.getNode(ISD::FP_EXTEND, SL, VT, U),
matcher.getNode(ISD::FP_EXTEND, SL, VT, V), Z));
};
- if (isFusedOp(N0)) {
+ if (isFusedOp(matcher, N0)) {
SDValue N02 = N0.getOperand(2);
if (matcher.match(N02, ISD::FP_EXTEND)) {
SDValue N020 = N02.getOperand(0);
@@ -17857,7 +17859,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
};
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
- if (isFusedOp(N00)) {
+ if (isFusedOp(matcher, N00)) {
SDValue N002 = N00.getOperand(2);
if (isContractableFMUL(N002) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
@@ -17871,7 +17873,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// fold (fadd x, (fma y, z, (fpext (fmul u, v)))
// -> (fma y, z, (fma (fpext u), (fpext v), x))
- if (isFusedOp(N1)) {
+ if (isFusedOp(matcher, N1)) {
SDValue N12 = N1.getOperand(2);
if (N12.getOpcode() == ISD::FP_EXTEND) {
SDValue N120 = N12.getOperand(0);
@@ -17892,7 +17894,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// interesting for all targets, especially GPUs.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
- if (isFusedOp(N10)) {
+ if (isFusedOp(matcher, N10)) {
SDValue N102 = N10.getOperand(2);
if (isContractableFMUL(N102) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
@@ -18095,16 +18097,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
return isContractableFMUL(N) && N->getFlags().hasAllowReassociation();
};
- auto isFusedOp = [&](SDValue N) {
- return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD);
- };
-
// More folding opportunities when target permits.
if (Aggressive && N->getFlags().hasAllowReassociation()) {
bool CanFuse = N->getFlags().hasAllowContract();
// fold (fsub (fma x, y, (fmul u, v)), z)
// -> (fma x, y (fma u, v, (fneg z)))
- if (CanFuse && isFusedOp(N0) &&
+ if (CanFuse && isFusedOp(matcher, N0) &&
isContractableAndReassociableFMUL(N0.getOperand(2)) &&
N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
return matcher.getNode(
@@ -18117,7 +18115,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub x, (fma y, z, (fmul u, v)))
// -> (fma (fneg y), z, (fma (fneg u), v, x))
- if (CanFuse && isFusedOp(N1) &&
+ if (CanFuse && isFusedOp(matcher, N1) &&
isContractableAndReassociableFMUL(N1.getOperand(2)) &&
N1->hasOneUse() && NoSignedZero) {
SDValue N20 = N1.getOperand(2).getOperand(0);
@@ -18132,7 +18130,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
- if (isFusedOp(N0) && N0->hasOneUse()) {
+ if (isFusedOp(matcher, N0) && N0->hasOneUse()) {
SDValue N02 = N0.getOperand(2);
if (matcher.match(N02, ISD::FP_EXTEND)) {
SDValue N020 = N02.getOperand(0);
@@ -18158,7 +18156,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// interesting for all targets, especially GPUs.
if (matcher.match(N0, ISD::FP_EXTEND)) {
SDValue N00 = N0.getOperand(0);
- if (isFusedOp(N00)) {
+ if (isFusedOp(matcher, N00)) {
SDValue N002 = N00.getOperand(2);
if (isContractableAndReassociableFMUL(N002) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
@@ -18178,8 +18176,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub x, (fma y, z, (fpext (fmul u, v))))
// -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
- if (isFusedOp(N1) && matcher.match(N1.getOperand(2), ISD::FP_EXTEND) &&
- N1->hasOneUse()) {
+ if (isFusedOp(matcher, N1) &&
+ matcher.match(N1.getOperand(2), ISD::FP_EXTEND) && N1->hasOneUse()) {
SDValue N120 = N1.getOperand(2).getOperand(0);
if (isContractableAndReassociableFMUL(N120) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
@@ -18204,7 +18202,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
- if (matcher.match(N1, ISD::FP_EXTEND) && isFusedOp(N1.getOperand(0))) {
+ if (matcher.match(N1, ISD::FP_EXTEND) &&
+ isFusedOp(matcher, N1.getOperand(0))) {
SDValue CvtSrc = N1.getOperand(0);
SDValue N100 = CvtSrc.getOperand(0);
SDValue N101 = CvtSrc.getOperand(1);
diff --git a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
new file mode 100644
index 0000000000000..78eeb5e807e58
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
@@ -0,0 +1,3746 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx9-generic -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX9-SDAG,GFX9-SDAG-F32FLUSH %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx9-generic -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX9-GISEL,GFX9-GISEL-F32FLUSH %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx9-4-generic -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX9_4-SDAG,GFX9_4-SDAG-F32FLUSH %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx9-4-generic -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX9_4-GISEL,GFX9_4-GISEL-F32FLUSH %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx12-5-generic -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX12_5-SDAG,GFX12_5-SDAG-F32FLUSH %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx12-5-generic -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX12_5-GISEL,GFX12_5-GISEL-F32FLUSH %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx9-generic -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GFX9-SDAG,GFX9-SDAG-F32DENORM %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx9-generic -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GFX9-GISEL,GFX9-GISEL-F32DENORM %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx9-4-generic -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GFX9_4-SDAG,GFX9_4-SDAG-F32DENORM %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx9-4-generic -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GFX9_4-GISEL,GFX9_4-GISEL-F32DENORM %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx12-5-generic -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GFX12_5-SDAG,GFX12_5-SDAG-F32DENORM %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx12-5-generic -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GFX12_5-GISEL,GFX12_5-GISEL-F32DENORM %s
+
+
+; ==========================================================================
+; Direct FADD/FSUB patterns
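+; Tests for allMulUsesCanBeContracted with direct fadd/fsub users of fmul. NOTE: allMulUsesCanBeContracted is not added by this patch; the assertions below are a baseline to be regenerated once the contraction-prevention logic lands.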
+; ==========================================================================
+
+; Test case: fmul -> {fmul, fadd} (one non-contractable use).
+; IR: %mul = fmul(a,b), %extrause = fmul(%mul, c), %add = fadd(%mul, d).
+; fmul has two users: another fmul (non-contractable) and fadd (contractable).
+; Should NOT contract -- one user (fmul) is not contractable.
+; Expected: v_mul shared by both paths, no fma contraction (the baseline assertions below still fuse the fadd into a mad/fma).
+define { float, float } @mul_has_noncontractable_use(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_has_noncontractable_use:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_has_noncontractable_use:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_has_noncontractable_use:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_has_noncontractable_use:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_has_noncontractable_use:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_mul_f32 v4, v0, v1 :: v_dual_fma_f32 v1, v0, v1, v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_has_noncontractable_use:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v4, v0, v1 :: v_dual_fma_f32 v1, v0, v1, v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_has_noncontractable_use:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_has_noncontractable_use:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %extrause = fmul contract float %mul, %c ; non-contractable
+ %fma1 = fadd contract float %mul, %d ; contractable
+ %ret0 = insertvalue { float, float } poison, float %extrause, 0
+ %ret1 = insertvalue { float, float } %ret0, float %fma1, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fadd, fadd} (two contractable uses).
+; IR: %mul = fmul(a,b), %add1 = fadd(%mul, c), %add2 = fadd(%mul, d).
+; fmul has two users, both fadd.
+; Should contract -- both uses are contractable.
+; Expected: two fma/mad instructions, no v_mul.
+define { float, float } @mul_two_contractable_fadd_uses(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_two_contractable_fadd_uses:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_two_contractable_fadd_uses:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_two_contractable_fadd_uses:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_two_contractable_fadd_uses:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_two_contractable_fadd_uses:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, v0, v1, v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_two_contractable_fadd_uses:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, v0, v1, v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_two_contractable_fadd_uses:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_two_contractable_fadd_uses:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %fma1 = fadd contract float %mul, %c ; contractable
+ %fma2 = fadd contract float %mul, %d ; contractable
+ %ret0 = insertvalue { float, float } poison, float %fma1, 0
+ %ret1 = insertvalue { float, float } %ret0, float %fma2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul(a, 2.0) -> {fadd, fadd} (constant multiply, two contractable uses).
+; IR: %mul = fmul(a, 2.0), %add1 = fadd(%mul, c), %add2 = fadd(%mul, d).
+; fmul has two users, both fadd.
+; Should contract -- both uses are contractable.
+; Expected: two fma/mad instructions, no v_mul.
+define { float, float } @mul_constant_two_contractable_uses(float %a, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_constant_two_contractable_uses:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v3, 2.0, v0, v1
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, 2.0, v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_constant_two_contractable_uses:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v3, 2.0, v0, v1
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, 2.0, v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_constant_two_contractable_uses:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v3, 2.0, v0, v1
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, 2.0, v0, v2
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_constant_two_contractable_uses:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v3, 2.0, v0, v1
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, 2.0, v0, v2
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_constant_two_contractable_uses:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v3, 2.0, v0, v1 :: v_dual_fma_f32 v1, 2.0, v0, v2
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_constant_two_contractable_uses:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v3, 2.0, v0, v1 :: v_dual_fma_f32 v1, 2.0, v0, v2
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_constant_two_contractable_uses:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v3, v0, 2.0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, 2.0, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_constant_two_contractable_uses:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v3, v0, 2.0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, 2.0, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, 2.0
+ %fma1 = fadd contract float %mul, %c
+ %fma2 = fadd contract float %mul, %d
+ %ret0 = insertvalue { float, float } poison, float %fma1, 0
+ %ret1 = insertvalue { float, float } %ret0, float %fma2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul(a, 2.0) -> {fadd, fadd, return} (constant multiply, non-contractable direct use).
+; IR: %mul = fmul(a, 2.0), %add1 = fadd(%mul, c), %add2 = fadd(%mul, d), return {%add1, %add2, %mul}.
+; fmul has three users: two fadds (contractable) and direct return (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
+; Expected: v_mul shared by all paths, no fma contraction (the baseline assertions below still fuse both fadds into mad/fma).
+define { float, float, float } @mul_constant_two_contractable_uses_one_noncontractable(float %a, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_add_f32_e32 v3, v0, v0
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v4, 2.0, v0, v1
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, 2.0, v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v3, 2.0, v0
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v4, 2.0, v0, v1
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, 2.0, v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v3, v0, v0
+; GFX9_4-SDAG-NEXT: v_fma_f32 v4, 2.0, v0, v1
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, 2.0, v0, v2
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v3
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v3, 2.0, v0
+; GFX9_4-GISEL-NEXT: v_fma_f32 v4, 2.0, v0, v1
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, 2.0, v0, v2
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v3
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v3, 2.0, v0, v1 :: v_dual_fma_f32 v1, 2.0, v0, v2
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_dual_add_f32 v2, v0, v0 :: v_dual_mov_b32 v0, v3
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v3, 2.0, v0, v1 :: v_dual_fma_f32 v1, 2.0, v0, v2
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v2, 2.0, v0 :: v_dual_mov_b32 v0, v3
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v3, v0, v0
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v4, v0, 2.0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, 2.0, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v3, 2.0, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v4, v0, 2.0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, 2.0, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, 2.0
+ %fma1 = fadd contract float %mul, %c
+ %fma2 = fadd contract float %mul, %d
+ %ret0 = insertvalue { float, float, float } poison, float %fma1, 0
+ %ret1 = insertvalue { float, float, float } %ret0, float %fma2, 1
+ %ret2 = insertvalue { float, float, float } %ret1, float %mul, 2
+ ret { float, float, float } %ret2
+}
+
+; Test case: fmul -> {fadd, fadd, return} (two contractable uses, non-contractable direct use).
+; IR: %mul = fmul(a,b), %add1 = fadd(%mul, c), %add2 = fadd(%mul, d), return {%add1, %add2, %mul}.
+; fmul has three users: two fadds (contractable) and direct return (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
+; Expected: v_mul shared by all paths, no fma contraction (the baseline assertions below still fuse both fadds into mad/fma).
+define { float, float, float } @mul_two_contractable_uses_plus_direct_use(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v4
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v4, v0, v1, v2 :: v_dual_fma_f32 v3, v0, v1, v3
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v4, v0, v1, v2 :: v_dual_fma_f32 v3, v0, v1, v3
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %fma1 = fadd contract float %mul, %c
+ %fma2 = fadd contract float %mul, %d
+ %ret0 = insertvalue { float, float, float } poison, float %fma1, 0
+ %ret1 = insertvalue { float, float, float } %ret0, float %fma2, 1
+ %ret2 = insertvalue { float, float, float } %ret1, float %mul, 2 ; non-contractable (direct use)
+ ret { float, float, float } %ret2
+}
+
+; Test case: fmul -> {fsub, fsub} (two contractable uses).
+; IR: %mul = fmul(a,b), %fma1 = fsub(%mul, c), %fma2 = fsub(%mul, d).
+; fmul has two users, both fsub.
+; Should contract -- both uses are contractable.
+; Expected: two fma/mad instructions, no v_mul.
+define { float, float } @mul_two_contractable_fsub_uses(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_two_contractable_fsub_uses:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_two_contractable_fsub_uses:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_two_contractable_fsub_uses:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_two_contractable_fsub_uses:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_two_contractable_fsub_uses:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, v0, v1, -v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_two_contractable_fsub_uses:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, v0, v1, -v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_two_contractable_fsub_uses:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_two_contractable_fsub_uses:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %fma1 = fsub contract float %mul, %c
+ %fma2 = fsub contract float %mul, %d
+ %ret0 = insertvalue { float, float } poison, float %fma1, 0
+ %ret1 = insertvalue { float, float } %ret0, float %fma2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fsub, fsub, return} (two contractable uses, non-contractable direct use).
+; IR: %mul = fmul(a,b), %fma1 = fsub(%mul, c), %fma2 = fsub(%mul, d), return {%fma1, %fma2, %mul}.
+; fmul has three users: two fsubs (contractable) and direct return (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
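+; Expected (once contraction prevention lands): v_mul shared by all paths, no fma contraction. The baseline CHECK lines below still show v_mul plus two contracted fma/mad.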
+define { float, float, float } @mul_two_contractable_fsub_uses_plus_direct_use(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v4
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v4, v0, v1, -v2 :: v_dual_fma_f32 v3, v0, v1, -v3
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v4, v0, v1, -v2 :: v_dual_fma_f32 v3, v0, v1, -v3
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %fma1 = fsub contract float %mul, %c
+ %fma2 = fsub contract float %mul, %d
+ %ret0 = insertvalue { float, float, float } poison, float %fma1, 0
+ %ret1 = insertvalue { float, float, float } %ret0, float %fma2, 1
+ %ret2 = insertvalue { float, float, float } %ret1, float %mul, 2 ; non-contractable (direct use)
+ ret { float, float, float } %ret2
+}
+
+; Test case: fmul -> {fadd, fsub} (mixed contractable uses).
+; IR: %mul = fmul(a,b), %add = fadd(%mul, c), %sub = fsub(%mul, d).
+; fmul has two users: fadd and fsub.
+; Should contract -- both uses are contractable.
+; Expected: two fma/mad instructions (one for the fadd, one for the fsub), no v_mul.
+define { float, float } @mul_mixed_fadd_fsub_uses(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_mixed_fadd_fsub_uses:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_mixed_fadd_fsub_uses:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_mixed_fadd_fsub_uses:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_mixed_fadd_fsub_uses:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_mixed_fadd_fsub_uses:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, v0, v1, -v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_mixed_fadd_fsub_uses:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, v0, v1, -v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_mixed_fadd_fsub_uses:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_mixed_fadd_fsub_uses:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %add = fadd contract float %mul, %c
+ %sub = fsub contract float %mul, %d
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fadd, fadd, fsub} (three contractable uses).
+; IR: %mul = fmul(a,b), %add1 = fadd(%mul, c), %add2 = fadd(%mul, d), %sub = fsub(%mul, e).
+; fmul has three users: two fadds and one fsub, all contractable.
+; Should contract -- all three uses are contractable.
+; Expected: three fma/mad instructions, no v_mul.
+define { float, float, float } @mul_three_contractable_uses(float %a, float %b, float %c, float %d, float %e) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_three_contractable_uses:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v5, v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v3, v0, v1, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_three_contractable_uses:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v5, v0, v1, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v3, v0, v1, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v4
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_three_contractable_uses:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v5, v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v3, v0, v1, v3
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, -v4
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v5
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v1, v3
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_three_contractable_uses:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v5, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v3, v0, v1, v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, -v4
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v1, v3
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_three_contractable_uses:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v5, v0, v1, v2 :: v_dual_fma_f32 v3, v0, v1, v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v2, v0, v1, -v4 :: v_dual_mov_b32 v0, v5
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v1, v3
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_three_contractable_uses:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v5, v0, v1, v2 :: v_dual_fma_f32 v3, v0, v1, v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, -v4 :: v_dual_mov_b32 v0, v5
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v1, v3
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_three_contractable_uses:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v5, v0, v1, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v3, v0, v1, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v4
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_three_contractable_uses:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v5, v0, v1, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v3, v0, v1, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v4
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %add1 = fadd contract float %mul, %c
+ %add2 = fadd contract float %mul, %d
+ %sub = fsub contract float %mul, %e
+ %ret0 = insertvalue { float, float, float } poison, float %add1, 0
+ %ret1 = insertvalue { float, float, float } %ret0, float %add2, 1
+ %ret2 = insertvalue { float, float, float } %ret1, float %sub, 2
+ ret { float, float, float } %ret2
+}
+
+; ==========================================================================
+; FNEG patterns
+; Tests for allMulUsesCanBeContracted recognizing fneg as a transparent user.
+; ==========================================================================
+
+; Test case: fmul -> fneg -> fsub (single use chain).
+; IR: %mul = fmul(a,b), %neg = fneg(%mul), %sub = fsub(%neg, c).
+; fmul has one user: fneg, which has one user: fsub.
+; Should contract -- single-use chain, fneg folds into fma.
+; Expected: single fma/mad, no v_mul.
+define float @mul_fneg_fsub_single_use(float %a, float %b, float %c) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_fsub_single_use:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, v0, -v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_fsub_single_use:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v0, v0, -v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_fneg_fsub_single_use:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, -v0, v1, -v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_fneg_fsub_single_use:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, -v1, -v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_fneg_fsub_single_use:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_fma_f32 v0, -v0, v1, -v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_fneg_fsub_single_use:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_fma_f32 v0, v0, -v1, -v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_fsub_single_use:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, -v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_fsub_single_use:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, -v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %neg = fneg contract float %mul
+ %sub = fsub contract float %neg, %c ; contractable
+ ret float %sub
+}
+
+; Test case: fmul -> fneg -> {fsub, fsub} (multiple fsub uses of fneg).
+; IR: %mul = fmul(a,b), %neg = fneg(%mul), %sub1 = fsub(%neg, c), %sub2 = fsub(%neg, d).
+; fmul has one user: fneg, which has two users: both fsub.
+; Should contract -- all fneg uses are contractable fsubs.
+; Expected: two fma/mad instructions, no v_mul.
+define { float, float } @mul_fneg_multiple_fsub_uses(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, -v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, -v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, -v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, -v0, v1, -v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, -v0, v1, -v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, -v1, -v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v2, -v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, -v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, -v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, -v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %neg = fneg contract float %mul
+ %sub1 = fsub contract float %neg, %c ; contractable
+ %sub2 = fsub contract float %neg, %d ; contractable
+ %ret0 = insertvalue { float, float } poison, float %sub1, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> fneg -> {fsub, fadd} (mixed uses of fneg).
+; IR: %mul = fmul(a,b), %neg = fneg(%mul), %sub = fsub(%neg, c), %add = fadd(%neg, d).
+; fmul has one user: fneg, which has two users: fsub and fadd.
+; Should contract -- both fneg uses (fsub, fadd) are contractable.
+; Expected: two fma/mad instructions, no v_mul.
+define { float, float } @mul_fneg_mixed_uses(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_mixed_uses:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, -v0, v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, -v0, v1, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_mixed_uses:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, -v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, -v1, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_fneg_mixed_uses:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, -v0, v1, -v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, -v0, v1, v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_fneg_mixed_uses:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, -v1, -v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, -v1, v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_fneg_mixed_uses:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v2, -v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_fneg_mixed_uses:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, -v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_mixed_uses:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, -v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, -v0, v1, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_mixed_uses:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, -v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, -v1, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %neg = fneg contract float %mul
+ %sub = fsub contract float %neg, %c ; contractable
+ %add = fadd contract float %neg, %d ; contractable
+ %ret0 = insertvalue { float, float } poison, float %sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %add, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> fneg -> {fsub, fmul} (non-contractable fneg use).
+; IR: %mul = fmul(a,b), %neg = fneg(%mul), %sub = fsub(%neg, c), %mul2 = fmul(%neg, d).
+; fmul has one user: fneg, which has two users: fsub (contractable) and fmul (non-contractable).
+; Should NOT contract -- one fneg user (fmul) is not contractable.
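+; Expected (once contraction prevention lands): v_mul + v_mul, no fma contraction. The baseline CHECK lines below still contract the fsub into fma/mad.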
+define { float, float } @mul_fneg_mixed_uses_2(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_mixed_uses_2:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, v0, -v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_mixed_uses_2:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v0, v0, -v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_fneg_mixed_uses_2:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, -v0, v1, -v2
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_fneg_mixed_uses_2:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, -v1, -v2
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_fneg_mixed_uses_2:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, -v0, v1, -v2
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_fneg_mixed_uses_2:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, -v1, -v2
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_mixed_uses_2:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, -v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_mixed_uses_2:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, -v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %neg = fneg contract float %mul
+ %sub = fsub contract float %neg, %c ; contractable
+ %mul2 = fmul contract float %neg, %d ; non-contractable
+ %ret0 = insertvalue { float, float } poison, float %sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %mul2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fadd, fneg -> fmul} (non-contractable fneg use).
+; IR: %mul = fmul(a,b), %neg = fneg(%mul), %add = fadd(%mul, c), %other = fmul(%neg, d).
+; fmul has two users: fadd (contractable) and fneg, whose user is fmul (non-contractable).
+; Should NOT contract -- one path (fneg -> fmul) is not contractable.
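+; Expected (once contraction prevention lands): v_mul shared by both paths, no fma contraction. The baseline CHECK lines below still contract the fadd into fma/mad.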
+define { float, float } @mul_fneg_nonfsub_noncontractable(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e64 v1, -v4, v3
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, v1, v2
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, v1, v2
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e64 v4, v0, -v1
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %neg = fneg float %mul
+ %add = fadd contract float %mul, %c ; contractable
+ %other = fmul contract float %neg, %d ; non-contractable
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %other, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fadd, fneg -> fsub} (all contractable).
+; IR: %mul = fmul(a,b), %add = fadd(%mul, c), %neg = fneg(%mul), %sub = fsub(%neg, d).
+; fmul has two users: fadd (contractable) and fneg, whose user is fsub (contractable).
+; Should contract -- all paths are contractable.
+; Expected: two fma/mad instructions, no v_mul (in the CHECK lines below SDAG negates the first multiplicand of the second fma/mad, GISel the second).
+define { float, float } @mul_direct_and_fneg_contractable_uses(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, -v0, v1, -v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %add = fadd contract float %mul, %c ; contractable
+ %neg = fneg contract float %mul ; only user of %neg is %sub below
+ %sub = fsub contract float %neg, %d ; contractable
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fsub, fneg -> fsub} (all contractable).
+; IR: %mul = fmul(a,b), %sub1 = fsub(%mul, c), %neg = fneg(%mul), %sub2 = fsub(%neg, d).
+; fmul has two users: fsub (contractable) and fneg, whose user is fsub (contractable).
+; Should contract -- all paths are contractable.
+; Expected: two fma/mad instructions, no v_mul (in the CHECK lines below SDAG negates the first multiplicand of the second fma/mad, GISel the second).
+define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b, float %c, float %d) {
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9_4-SDAG: ; %bb.0:
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, -v0, v1, -v3
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9_4-GISEL: ; %bb.0:
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX12_5-SDAG: ; %bb.0:
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX12_5-GISEL: ; %bb.0:
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul contract float %a, %b
+ %sub1 = fsub contract float %mul, %c ; contractable
+ %neg = fneg contract float %mul ; only user of %neg is %sub2 below
+ %sub2 = fsub contract float %neg, %d ; contractable
+ %ret0 = insertvalue { float, float } poison, float %sub1, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub2, 1
+ ret { float, float } %ret1
+}
+
+; ==========================================================================
+; FPEXT patterns
+; Tests for allMulUsesCanBeContracted with fpext(fmul) feeding into
+; fadd, fsub, and fneg combinations.
+; ==========================================================================
+
+; Test case: fpext(fmul) -> {fadd, fadd} (chained adds, second uses result of first).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %add = fadd(%ext, z), %add2 = fadd(%add, %ext).
+; fpext(%mul) has two users: both fadd.
+; Should contract -- both uses are contractable fadds.
+; Expected: fma_mix (or fma after cvt) for both adds, no v_mul_f16 (in the CHECK lines below this holds only for the F32FLUSH prefixes; the GFX9 and F32DENORM prefixes keep v_mul_f16).
+define float @fpext_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_contractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v0, v4
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_contractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v0, v4
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_nop 0
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_nop 0
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z ; first use of %mul.ext
+ %add2 = fadd contract float %add, %mul.ext ; second use of %mul.ext, chained on %add
+ ret float %add2
+}
+
+; Test case: fpext(fmul) -> {fadd, return} (non-contractable direct use of fpext result).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %add = fadd(%ext, z), return {%add, %ext}.
+; fpext(%mul) has two users: fadd (contractable) and direct return (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
+; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold (the baseline CHECK lines below do not show this yet: the F32FLUSH prefixes still fold the multiply into v_fma_mix_f32).
+define { float, float } @fpext_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_noncontractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v1, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_noncontractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v1, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z ; contractable use of %mul.ext
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %mul.ext, 1 ; %mul.ext is also returned directly
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fpext -> fadd, return} (non-contractable direct use of half-precision mul).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %add = fadd(%ext, z), return {%add, %mul}.
+; fmul has two users: fpext (feeding fadd) and direct return (non-contractable).
+; Should NOT contract -- one user (direct return of half mul) is not contractable.
+; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold (the baseline CHECK lines below do not show this yet: the F32FLUSH prefixes still fold the multiply into v_fma_mix_f32).
+define { float, half } @fpext_noncontractable_2(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_noncontractable_2:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_noncontractable_2:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_2:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_2:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_2:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_2:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_2:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_2:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_2:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_2:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z ; only use of %mul.ext
+ %ret0 = insertvalue { float, half } poison, float %add, 0
+ %ret1 = insertvalue { float, half } %ret0, half %mul, 1 ; the half-precision %mul is also returned directly
+ ret { float, half } %ret1
+}
+
+; Test case: fpext(fmul) -> fadd (single use).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %add = fadd(%ext, z).
+; fpext(%mul) has one user: fadd.
+; Should contract -- single use, trivially contractable.
+; Expected: fma_mix (or fma after cvt), no v_mul_f16 (in the CHECK lines below this holds only for the F32FLUSH prefixes; the GFX9 and F32DENORM prefixes keep v_mul_f16).
+define float @fpext_contractable_2(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_contractable_2:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_contractable_2:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_2:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_2:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_2:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_2:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z ; only use of %mul.ext
+ ret float %add
+}
+
+; Test case: fpext(fmul) -> {fadd, fadd} (two independent adds, both returned).
+; IR: %mul = fmul(u,v), %mul.ext = fpext(%mul), %add = fadd(%mul.ext, z), %add2 = fadd(x, %mul.ext).
+; %mul.ext has two users, both fadd (once as LHS, once as RHS); %mul's only user is the fpext.
+; Should contract -- both uses are contractable fadds.
+; Expected: fma_mix (or fma after cvt) for both adds, no v_mul_f16 (baseline: F32FLUSH prefixes only; GFX9 and F32DENORM keep v_mul_f16).
+define {float, float} @fpext_contractable_3(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_contractable_3:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v2, v1, v4
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_contractable_3:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v2, v1, v4
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_3:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_3:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_3:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_3:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z
+ %add2 = fadd contract float %x, %mul.ext
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %add2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fpext(fmul) -> {fsub, fsub} (chained subs, second uses result of first).
+; IR: %mul = fmul(u,v), %mul.ext = fpext(%mul), %sub = fsub(%mul.ext, z), %sub2 = fsub(%sub, %mul.ext).
+; %mul.ext has two users, both fsub: it is the minuend of %sub and the subtrahend of %sub2.
+; Should contract -- both uses are contractable fsubs.
+; Expected: fma_mix (or fma after cvt) for both subs, no v_mul_f16 (baseline: F32FLUSH prefixes only; GFX9 and F32DENORM keep v_mul_f16).
+define float @fpext_contractable_sub(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_contractable_sub:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v1, v0, v4
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_contractable_sub:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v1, v0, v4
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ %sub2 = fsub contract float %sub, %mul.ext
+ ret float %sub2
+}
+
+; Test case: fpext(fmul) -> {fsub, return} (non-contractable direct use of fpext result).
+; IR: %mul = fmul(u,v), %mul.ext = fpext(%mul), %sub = fsub(%mul.ext, z), return {%sub, %mul.ext}.
+; %mul.ext has two users: fsub (contractable) and the returned struct (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
+; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold of u*v (baseline: F32FLUSH prefixes still emit v_fma_mix_f32 on v2, v3 next to v_mul_f16).
+define { float, float } @fpext_noncontractable_sub(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_noncontractable_sub:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v0, v1, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_noncontractable_sub:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v1, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ %ret0 = insertvalue { float, float } poison, float %sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %mul.ext, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fpext -> fsub, return} (non-contractable direct use of half-precision mul).
+; IR: %mul = fmul(u,v), %mul.ext = fpext(%mul), %sub = fsub(%mul.ext, z), return {%sub, %mul}.
+; %mul has two users: fpext (feeding fsub) and the returned struct (non-contractable).
+; Should NOT contract -- one user (direct return of half mul) is not contractable.
+; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold of u*v (baseline: F32FLUSH prefixes still emit v_fma_mix_f32 on v2, v3 next to v_mul_f16).
+define { float, half } @fpext_noncontractable_sub_2(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_noncontractable_sub_2:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_noncontractable_sub_2:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub_2:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub_2:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub_2:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub_2:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ %ret0 = insertvalue { float, half } poison, float %sub, 0
+ %ret1 = insertvalue { float, half } %ret0, half %mul, 1
+ ret { float, half } %ret1
+}
+
+; Test case: fpext(fmul) -> fsub (single use).
+; IR: %mul = fmul(u,v), %mul.ext = fpext(%mul), %sub = fsub(%mul.ext, z).
+; %mul.ext has one user (fsub) and %mul has one user (the fpext).
+; Should contract -- single use, trivially contractable.
+; Expected: fma_mix (or fma after cvt), no v_mul_f16 (baseline: F32FLUSH prefixes only; GFX9 and F32DENORM keep v_mul_f16).
+define float @fpext_contractable_sub_2(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_contractable_sub_2:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_contractable_sub_2:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ ret float %sub
+}
+
+; Test case: fpext(fmul) -> {fsub, fsub} (two independent subs, both returned).
+; IR: %mul = fmul(u,v), %mul.ext = fpext(%mul), %sub = fsub(%mul.ext, z), %sub2 = fsub(x, %mul.ext).
+; %mul.ext has two users, both fsub: it is the minuend of %sub and the subtrahend of %sub2.
+; Should contract -- both uses are contractable fsubs.
+; Expected: fma_mix (or fma after cvt) for both subs, no v_mul_f16 (baseline: F32FLUSH prefixes only; GFX9 and F32DENORM keep v_mul_f16).
+define {float, float} @fpext_contractable_sub_3(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_contractable_sub_3:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v2, v1, v4
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v1, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_contractable_sub_3:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v2, v1, v4
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v1, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %sub = fsub contract float %mul.ext, %z
+ %sub2 = fsub contract float %x, %mul.ext
+ %ret0 = insertvalue { float, float } poison, float %sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fpext -> fadd, fneg -> fpext -> fsub} (both contractable).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %add = fadd(%ext, z), %neg = fneg(%mul), %neg.ext = fpext(%neg), %sub = fsub(%neg.ext, x).
+; fmul has two users: fpext (feeding fadd) and fneg (feeding fpext -> fsub).
+; Should contract -- all paths are contractable.
+; Expected: fma_mix (or fma after cvt) for both paths, no v_mul_f16.
+define {float, float} @fpext_fneg_fpext_fsub_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v1, v1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v1, v1, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %add = fadd contract float %mul.ext, %z
+ %neg = fneg contract half %mul
+ %neg.ext = fpext contract half %neg to float
+ %sub = fsub contract float %neg.ext, %x
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fneg -> fpext -> fsub, return} (non-contractable direct use of mul).
+; IR: %mul = fmul(u,v), %neg = fneg(%mul), %neg.ext = fpext(%neg), %sub = fsub(%neg.ext, z), return {%sub, %mul}.
+; fmul has two users: fneg (feeding fpext -> fsub) and direct return (non-contractable).
+; Should NOT contract -- one user (direct return of half mul) is not contractable.
+; Expected (target behavior; the checks below are the pre-change baseline): v_mul_f16 and no fold of u*v into fma_mix. Baseline: the F32FLUSH prefixes still emit v_fma_mix_f32 on u,v (v2,v3) next to v_mul_f16.
+define {float, half} @fpext_fneg_fpext_fsub_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v ; used twice: by %neg and by the returned struct (field 1)
+ %neg = fneg contract half %mul
+ %neg.ext = fpext contract half %neg to float ; the fneg sits between the fmul and this fpext
+ %sub = fsub contract float %neg.ext, %z
+ %ret0 = insertvalue { float, half } poison, float %sub, 0
+ %ret1 = insertvalue { float, half } %ret0, half %mul, 1 ; returns %mul itself, so the half product must stay available
+ ret { float, half } %ret1
+}
+
+; Test case: fmul -> fpext -> {fadd, fneg -> fsub} (both contractable).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %add = fadd(%ext, z), %neg = fneg(%ext), %sub = fsub(%neg, x).
+; fpext(%mul) has two users: fadd (contractable) and fneg (feeding fsub, contractable).
+; Should contract -- all paths are contractable.
+; Expected: fma_mix (or fma after cvt) for both paths, no v_mul_f16. Baseline checks below: only the F32FLUSH prefixes match this; the GFX9 and F32DENORM prefixes keep v_mul_f16.
+define {float, float} @fpext_fneg_fsub_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_fneg_fsub_contractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v2, v1, v4
+; GFX9-SDAG-NEXT: v_sub_f32_e64 v1, -v1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_fneg_fsub_contractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v2, v1, v4
+; GFX9-GISEL-NEXT: v_sub_f32_e64 v1, -v1, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float ; two users: %add and %neg
+ %add = fadd contract float %mul.ext, %z
+ %neg = fneg contract float %mul.ext ; fneg is applied after the fpext, on the float value
+ %sub = fsub contract float %neg, %x
+ %ret0 = insertvalue { float, float } poison, float %add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> fpext -> {fneg -> fsub, return} (non-contractable direct use of fpext result).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %neg = fneg(%ext), %sub = fsub(%neg, z), return {%sub, %ext}.
+; fpext(%mul) has two users: fneg (feeding fsub) and direct return (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
+; Expected (target behavior; the checks below are the pre-change baseline): v_mul_f16 + v_cvt_f32_f16, no fold of u*v into fma_mix. Baseline: the F32FLUSH prefixes still emit v_fma_mix_f32 on u,v (v2,v3) in addition to v_mul_f16 + v_cvt_f32_f16.
+define {float, float} @fpext_fneg_fsub_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e64 v0, -v1, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e64 v0, -v1, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float ; two users: %neg and the returned struct (field 1)
+ %neg = fneg contract float %mul.ext
+ %sub = fsub contract float %neg, %z
+ %ret0 = insertvalue { float, float } poison, float %sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %mul.ext, 1 ; returns the extended product itself, so it must stay available
+ ret { float, float } %ret1
+}
+
+; ==========================================================================
+; FMA/FMAD chain patterns
+; Tests for allMulUsesCanBeContracted recognizing FMA/FMAD as contractable.
+; ==========================================================================
+
+; Test case: fpext(fmul) -> {fma -> fadd, fadd} (chained fma with fpext).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %fma = fma(x, y, %ext), %add1 = fadd(%fma, z), %add2 = fadd(%ext, w). (The body also defines %mul.xy = fmul(x,y), which nothing uses.)
+; fpext(%mul) has two users: fma (feeding fadd) and fadd.
+; Should contract -- both paths are contractable (fpext folds aggressively).
+; Expected: fma_mix (or fma after cvt) for both paths, no v_mul_f16. Baseline checks below: only the F32FLUSH prefixes are free of v_mul_f16; the GFX9 and F32DENORM prefixes keep it.
+define {float, float} @fma_chain_fpext_contractable(float %x, float %y, half %u, half %v, float %z, float %w) #0 {
+; GFX9-SDAG-LABEL: fma_chain_fpext_contractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fma_chain_fpext_contractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fma_chain_fpext_contractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v5 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fma_chain_fpext_contractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v5 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fma_chain_fpext_contractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v5 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fma_chain_fpext_contractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v5 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fma_chain_fpext_contractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v2, 1.0, v5 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fma_chain_fpext_contractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v2, 1.0, v5 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fma_chain_fpext_contractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v2, 1.0, v5 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fma_chain_fpext_contractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v2, 1.0, v5 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float ; two users: %fma.res (as the addend) and %add2
+ %mul.xy = fmul contract float %x, %y ; NOTE(review): %mul.xy has no users in this function -- looks like a leftover; confirm before removing
+ %fma.res = call contract float @llvm.fma.f32(float %x, float %y, float %mul.ext)
+ %outer.add = fadd contract float %fma.res, %z
+ %add2 = fadd contract float %mul.ext, %w
+ %ret0 = insertvalue { float, float } poison, float %outer.add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %add2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fma -> fpext -> fadd, fadd} (half precision, outer fpext chain).
+; IR: %mul = fmul(u,v), %fma = fma(x, y, %mul), %fma.ext = fpext(%fma), %add1 = fadd(%fma.ext, z), %add2 = fadd(%mul, w), return {%add1, fpext(%add2)}.
+; fmul has two users: fma (feeding fpext -> fadd) and fadd.
+; Should contract -- both paths are contractable (fpext on fma result is foldable).
+; Expected: fma_mix or fma chain, no standalone v_mul_f16. Baseline checks below: only the F32FLUSH prefixes match this; the GFX9 and F32DENORM prefixes keep a v_mul_f16 feeding the fma chain, while fadd(%mul, w) becomes v_fma_f16/v_fmac_f16 on every prefix.
+define {float, float} @fma_chain_fpext_outer_contractable(half %x, half %y, half %u, half %v, float %z, half %w) #0 {
+; GFX9-SDAG-LABEL: fma_chain_fpext_outer_contractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX9-SDAG-NEXT: v_fma_f16 v0, v0, v1, v6
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT: v_fma_f16 v1, v2, v3, v5
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fma_chain_fpext_outer_contractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX9-GISEL-NEXT: v_fma_f16 v0, v0, v1, v6
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT: v_fma_f16 v1, v2, v3, v5
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fma_chain_fpext_outer_contractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_f16 v5, v2, v3, v5
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v1, v5
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fma_chain_fpext_outer_contractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_f16 v5, v2, v3, v5
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v1, v5
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fma_chain_fpext_outer_contractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fmac_f16_e32 v5, v2, v3
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v0, v1, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fma_chain_fpext_outer_contractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fmac_f16_e32 v5, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v0, v1, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fma_chain_fpext_outer_contractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_f16 v2, v2, v3, v5
+; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_f16 v0, v0, v1, v6
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v1, v2
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fma_chain_fpext_outer_contractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_f16 v2, v2, v3, v5
+; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_f16 v0, v0, v1, v6
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v1, v2
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fma_chain_fpext_outer_contractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fmac_f16_e32 v5, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fmac_f16_e32 v6, v0, v1
+; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v6, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fma_chain_fpext_outer_contractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fmac_f16_e32 v5, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fmac_f16_e32 v6, v0, v1
+; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v6, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v ; two users: %fma.res (as the addend) and %add2
+ %fma.res = call contract half @llvm.fma.f16(half %x, half %y, half %mul)
+ %fma.ext = fpext contract half %fma.res to float
+ %outer.add = fadd contract float %fma.ext, %z
+ %add2 = fadd contract half %mul, %w
+ %add2.ext = fpext half %add2 to float ; NOTE(review): no 'contract' flag here, unlike the fpext of %fma.res -- confirm this is intentional
+ %ret0 = insertvalue { float, float } poison, float %outer.add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %add2.ext, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fpext(fmul) -> {fma -> fadd, fadd, return} (non-contractable direct use).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %fma = fma(x, y, %ext), %add1 = fadd(%fma, z), %add2 = fadd(%ext, w), return {%add1, %add2, %ext}.
+; fpext(%mul) has three users: fma (feeding fadd), fadd, and direct return (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
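+; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold for the multiply. Current baseline (CHECK lines below): the F32FLUSH runs still fold the multiply into v_fma_mix_f32 alongside the v_mul_f16.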
+define {float, float, float} @fma_chain_fpext_noncontractable(float %x, float %y, half %u, half %v, float %z, float %w) #0 {
+; GFX9-SDAG-LABEL: fma_chain_fpext_noncontractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fma_chain_fpext_noncontractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fma_chain_fpext_noncontractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v5 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fma_chain_fpext_noncontractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v5 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fma_chain_fpext_noncontractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v3, v2, v3, v5 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v6
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v0, v4
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fma_chain_fpext_noncontractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v3, v2, v3, v5 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v6
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fma_chain_fpext_noncontractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v3, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v3, 1.0, v5 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fma_chain_fpext_noncontractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v3, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v3, 1.0, v5 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fma_chain_fpext_noncontractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v2, 1.0, v5 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX12_5-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fma_chain_fpext_noncontractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v2, 1.0, v5 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX12_5-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract half %u, %v
+ %mul.ext = fpext contract half %mul to float
+ %fma.res = call contract float @llvm.fma.f32(float %x, float %y, float %mul.ext)
+ %outer.add = fadd contract float %fma.res, %z
+ %add2 = fadd contract float %mul.ext, %w
+ %ret0 = insertvalue { float, float, float } poison, float %outer.add, 0
+ %ret1 = insertvalue { float, float, float } %ret0, float %add2, 1
+ %ret2 = insertvalue { float, float, float } %ret1, float %mul.ext, 2
+ ret { float, float, float } %ret2
+}
+
+; Test case: fpext(fmul) -> {fma -> fsub, fsub} (chained fma with fpext, fsub variant).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %fma = fma(x, y, %ext), %sub1 = fsub(%fma, z), %sub2 = fsub(%ext, w).
+; fpext(%mul) has two users: fma (feeding fsub) and fsub.
+; Should contract -- both paths are contractable (fpext folds aggressively).
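+; Expected: fma_mix (or fma after cvt) for both paths, no v_mul_f16. Current baseline (CHECK lines below): only the GFX9_4/GFX12_5 SDAG F32FLUSH runs drop v_mul_f16; every other run still emits it.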
+define {float, float} @fma_chain_fpext_fsub_contractable(float %x, float %y, half %u, half %v, float %z, float %w) #0 {
+; GFX9-SDAG-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v1, v2, v5
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v1, v2, v5
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-F32FLUSH-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, -v5 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v0, v1, v6 op_sel_hi:[0,0,1]
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, -v5 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, -v5 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v0, v1, v6 op_sel_hi:[0,0,1]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, -v5 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fma_chain_fpext_fsub_contractable:
+; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %mul = fmul contract reassoc half %u, %v
+ %mul.ext = fpext contract reassoc half %mul to float
+ %fma.res = call contract reassoc float @llvm.fma.f32(float %x, float %y, float %mul.ext)
+ %outer.sub = fsub contract reassoc float %fma.res, %z
+ %sub2 = fsub contract reassoc float %mul.ext, %w
+ %ret0 = insertvalue { float, float } poison, float %outer.sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %sub2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fma -> fadd, fadd} (no fpext, no reassoc).
+; IR: %mul = fmul(c,d), %fma = fma(a, b, %mul), %add1 = fadd(%fma, e), %add2 = fadd(%mul, f).
+; fmul has two users: fma (feeding fadd) and fadd.
+; Should NOT contract -- the fma user's consumer (fadd) lacks reassoc, so chain
+; reassociation can't fire to eliminate the multiply.
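+; Expected: v_mul + v_fma + v_add + v_add (no contraction, mul shared by both paths). Current baseline (CHECK lines below): %add2 is still contracted into v_mad_f32/v_fma_f32 on every run.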
+define {float, float} @fma_chain_fadd_no_reassoc(float %a, float %b, float %c, float %d, float %e, float %f) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fma_chain_fadd_no_reassoc:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-SDAG-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fma_chain_fadd_no_reassoc:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: fma_chain_fadd_no_reassoc:
+; GFX9_4-SDAG: ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9_4-SDAG-NEXT: v_fmac_f32_e32 v6, v0, v1
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v0, v6, v4
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: fma_chain_fadd_no_reassoc:
+; GFX9_4-GISEL: ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9_4-GISEL-NEXT: v_fmac_f32_e32 v6, v0, v1
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v6, v4
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: fma_chain_fadd_no_reassoc:
+; GFX12_5-SDAG: ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_dual_fmac_f32 v6, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-SDAG-NEXT: v_add_f32_e32 v0, v6, v4
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: fma_chain_fadd_no_reassoc:
+; GFX12_5-GISEL: ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_fmac_f32 v6, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-GISEL-NEXT: v_add_f32_e32 v0, v6, v4
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fma_chain_fadd_no_reassoc:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fma_chain_fadd_no_reassoc:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract float %c, %d
+ %fma.res = call contract float @llvm.fma.f32(float %a, float %b, float %mul)
+ %outer.add = fadd contract float %fma.res, %e
+ %add2 = fadd contract float %mul, %f
+ %ret0 = insertvalue { float, float } poison, float %outer.add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %add2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fma -> fadd, fadd, return} (no fpext, non-contractable direct use).
+; IR: %mul = fmul(c,d), %fma = fma(a, b, %mul), %add1 = fadd(%fma, e), %add2 = fadd(%mul, f), return {%add1, %add2, %mul}.
+; fmul has three users: fma (feeding fadd), fadd, and direct return (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
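+; Expected: v_mul shared by all paths, no fma contraction of the multiply. Current baseline (CHECK lines below): v_mul is kept for the direct return, but %add2 is still contracted into v_mad_f32/v_fma_f32 on every run.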
+define {float, float, float} @fma_direct_use_noncontractable(float %a, float %b, float %c, float %d, float %e, float %f) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fma_direct_use_noncontractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-SDAG-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fma_direct_use_noncontractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: fma_direct_use_noncontractable:
+; GFX9_4-SDAG: ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: fma_direct_use_noncontractable:
+; GFX9_4-GISEL: ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: fma_direct_use_noncontractable:
+; GFX12_5-SDAG: ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v0, v0, v1, v6 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-SDAG-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_add_f32 v0, v0, v4
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: fma_direct_use_noncontractable:
+; GFX12_5-GISEL: ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v0, v0, v1, v6 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-GISEL-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_add_f32 v0, v0, v4
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fma_direct_use_noncontractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fma_direct_use_noncontractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract float %c, %d
+ %fma.res = call contract float @llvm.fma.f32(float %a, float %b, float %mul)
+ %outer.add = fadd contract float %fma.res, %e
+ %add2 = fadd contract float %mul, %f
+ %ret0 = insertvalue { float, float, float } poison, float %outer.add, 0
+ %ret1 = insertvalue { float, float, float } %ret0, float %add2, 1
+ %ret2 = insertvalue { float, float, float } %ret1, float %mul, 2
+ ret { float, float, float } %ret2
+}
+
+; Test case: fmul -> {fma -> fsub, fadd} (no fpext, fsub variant).
+; IR: %mul = fmul(c,d), %fma = fma(a, b, %mul), %sub = fsub(e, %fma), %add = fadd(%mul, f).
+; fmul has two users: fma (feeding fsub) and fadd.
+; Should contract -- SDAG algebraically simplifies fsub(e, fma(a,b,mul)) into a
+; fma chain that eliminates the multiply. GISEL does not perform this transform
+; and retains v_mul.
+; Expected: SDAG: fma/mad chain (no v_mul). GISEL: v_mul + v_fma + v_sub + fma.
+define {float, float} @fma_chain_fsub_contractable(float %a, float %b, float %c, float %d, float %e, float %f) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fma_chain_fsub_contractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v4, -v2, v3, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fma_chain_fsub_contractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32FLUSH-NEXT: v_sub_f32_e32 v0, v4, v0
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: fma_chain_fsub_contractable:
+; GFX9_4-SDAG: ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v4, -v2, v3, v4
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, -v0, v1, v4
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: fma_chain_fsub_contractable:
+; GFX9_4-GISEL: ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9_4-GISEL-NEXT: v_fmac_f32_e32 v6, v0, v1
+; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v0, v4, v6
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: fma_chain_fsub_contractable:
+; GFX12_5-SDAG: ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_fma_f32 v4, -v2, v3, v4
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v0, -v0, v1, v4 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: fma_chain_fsub_contractable:
+; GFX12_5-GISEL: ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_fmac_f32 v6, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-GISEL-NEXT: v_sub_f32_e32 v0, v4, v6
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fma_chain_fsub_contractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v4, -v2, v3, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, -v0, v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fma_chain_fsub_contractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v4, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract reassoc nsz float %c, %d
+ %fma.res = call contract reassoc nsz float @llvm.fma.f32(float %a, float %b, float %mul)
+ %outer.sub = fsub contract reassoc nsz float %e, %fma.res
+ %add2 = fadd contract float %mul, %f
+ %ret0 = insertvalue { float, float } poison, float %outer.sub, 0
+ %ret1 = insertvalue { float, float } %ret0, float %add2, 1
+ ret { float, float } %ret1
+}
+
+; Test case: fmul -> {fma -> fsub, fadd, return} (no fpext, non-contractable direct use).
+; IR: %mul = fmul(c,d), %fma = fma(a, b, %mul), %sub = fsub(e, %fma), %add = fadd(%mul, f), return {%sub, %add, %mul}.
+; fmul has three users: fma (feeding fsub), fadd, and direct return (non-contractable).
+; Should NOT contract -- one user (direct return) is not contractable.
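+; Expected: v_mul shared by all paths, no fma contraction of the multiply. Current baseline (CHECK lines below): v_mul is kept for the direct return, but the fadd is still contracted into v_mad_f32/v_fma_f32, and SDAG also folds the multiply into the fsub chain.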
+define {float, float, float} @fma_chain_fsub_noncontractable(float %a, float %b, float %c, float %d, float %e, float %f) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fma_chain_fsub_noncontractable:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v4, -v2, v3, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fma_chain_fsub_noncontractable:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32FLUSH-NEXT: v_sub_f32_e32 v0, v4, v0
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: fma_chain_fsub_noncontractable:
+; GFX9_4-SDAG: ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9_4-SDAG-NEXT: v_fma_f32 v4, -v2, v3, v4
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, -v0, v1, v4
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: fma_chain_fsub_noncontractable:
+; GFX9_4-GISEL: ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v0, v4, v0
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: fma_chain_fsub_noncontractable:
+; GFX12_5-SDAG: ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v6, -v2, v3, v4 :: v_dual_fma_f32 v4, v2, v3, v5
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-NEXT: v_dual_mul_f32 v2, v2, v3 :: v_dual_fma_f32 v0, -v0, v1, v6
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v1, v4
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: fma_chain_fsub_noncontractable:
+; GFX12_5-GISEL: ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v0, v0, v1, v6 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-GISEL-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_sub_f32 v0, v4, v0
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fma_chain_fsub_noncontractable:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v4, -v2, v3, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, -v0, v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fma_chain_fsub_noncontractable:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v4, v0
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract reassoc nsz float %c, %d
+ %fma.res = call contract reassoc nsz float @llvm.fma.f32(float %a, float %b, float %mul)
+ %outer.sub = fsub contract reassoc nsz float %e, %fma.res
+ %add2 = fadd contract float %mul, %f
+ %ret0 = insertvalue { float, float, float } poison, float %outer.sub, 0
+ %ret1 = insertvalue { float, float, float } %ret0, float %add2, 1
+ %ret2 = insertvalue { float, float, float } %ret1, float %mul, 2
+ ret { float, float, float } %ret2
+}
+
+; Test case: fmul -> {fma -> fadd, fadd} (no fpext, WITH reassoc).
+; IR: %mul = fmul(c, d), %fma.res = fma(a, b, %mul), %outer.add = fadd(%fma.res, e), %add2 = fadd(%mul, f).
+; fmul has two users: fma (feeding %outer.add) and %add2. The fmul, fma and %outer.add have contract reassoc nsz; %add2 has only contract.
+; Should contract on targets where chain reassociation fires (F32DENORM, GFX9_4+).
+; reassoc allows: fadd(fma(a,b,mul), e) -> fma(a, b, fma(c, d, e)), eliminating
+; the multiply. On GFX9-F32FLUSH, fma is not used (mad instead), so chain
+; reassociation does not fire and the multiply remains.
+; Expected: fma chain (no v_mul) on denorm-capable targets; v_mul remains on gfx9-flush.
+define {float, float} @fma_chain_fadd_reassoc(float %a, float %b, float %c, float %d, float %e, float %f) #0 {
+; GFX9-SDAG-F32FLUSH-LABEL: fma_chain_fadd_reassoc:
+; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-SDAG-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32FLUSH-LABEL: fma_chain_fadd_reassoc:
+; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
+; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9-GISEL-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-SDAG-LABEL: fma_chain_fadd_reassoc:
+; GFX9_4-SDAG: ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT: v_fma_f32 v4, v2, v3, v4
+; GFX9_4-SDAG-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-LABEL: fma_chain_fadd_reassoc:
+; GFX9_4-GISEL: ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT: v_fma_f32 v4, v2, v3, v4
+; GFX9_4-GISEL-NEXT: v_fmac_f32_e32 v4, v0, v1
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-LABEL: fma_chain_fadd_reassoc:
+; GFX12_5-SDAG: ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT: v_fma_f32 v4, v2, v3, v4
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_dual_fmac_f32 v4, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-LABEL: fma_chain_fadd_reassoc:
+; GFX12_5-GISEL: ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT: v_fma_f32 v4, v2, v3, v4
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_fmac_f32 v4, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX9-SDAG-F32DENORM-LABEL: fma_chain_fadd_reassoc:
+; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
+; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v4, v2, v3, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-F32DENORM-LABEL: fma_chain_fadd_reassoc:
+; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
+; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v4, v2, v3, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v4
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %mul = fmul contract reassoc nsz float %c, %d
+ %fma.res = call contract reassoc nsz float @llvm.fma.f32(float %a, float %b, float %mul)
+ %outer.add = fadd contract reassoc nsz float %fma.res, %e
+ %add2 = fadd contract float %mul, %f
+ %ret0 = insertvalue { float, float } poison, float %outer.add, 0
+ %ret1 = insertvalue { float, float } %ret0, float %add2, 1
+ ret { float, float } %ret1
+}
+
+declare float @llvm.fma.f32(float, float, float)
+declare half @llvm.fma.f16(half, half, half)
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
More information about the llvm-commits
mailing list