[llvm-branch-commits] [llvm] [AMDGPU][DAGCombiner][GlobalISel] Extend allMulUsesCanBeContracted with FNEG pattern (PR #188115)

Adel Ejjeh via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Mar 30 09:54:26 PDT 2026


https://github.com/adelejjeh updated https://github.com/llvm/llvm-project/pull/188115

>From 0210ce33ebf52cc9b9d7499aa4d401eb0b1a7638 Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <adel.ejjeh at amd.com>
Date: Thu, 5 Mar 2026 12:18:16 -0600
Subject: [PATCH] [AMDGPU][DAGCombiner][GlobalISel] Extend
 allMulUsesCanBeContracted with FNEG pattern

Extend allMulUsesCanBeContracted() to recognize fmul -> fneg -> fsub
chains as contractable uses. This allows FMA contraction when a multiply
feeds an fneg that is only used by fsub operations.

Changes:
- DAGCombiner.cpp: Add ISD::FNEG case to allMulUsesCanBeContracted()
  checking that all FNEG users are ISD::FSUB. Update 1 fold site guard
  in visitFSUBForFMACombine (fsub(fneg(fmul))).
- CombinerHelper.cpp: Add G_FNEG case to allMulUsesCanBeContracted()
  checking that all FNEG users are G_FSUB. Update 2 fold site guards
  in matchCombineFSubFNegFMulToFMadOrFMA. Fix guard ordering to check
  isContractableFMul before allMulUsesCanBeContracted (cheap first).
- Add 7 new test functions to fma-multiple-uses-contraction.ll covering
  fneg single-use, multi-use, mixed contractable/non-contractable, and
  cross-pattern (P1 direct + P2 fneg) interactions.
- Update mad-combine.ll CHECK lines affected by the guard changes.

Note: FADD is intentionally not checked as an FNEG user because
fadd(fneg(x), y) is canonicalized to fsub(y, x) before FMA combine
runs in both SDAG (visitFADD) and GISel (redundant_neg_operands).
FPEXT inside FNEG chains is deferred to a follow-up patch.

Made-with: Cursor
---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  40 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  24 +-
 .../AMDGPU/fma-multiple-uses-contraction.ll   | 762 +++++++++---------
 llvm/test/CodeGen/AMDGPU/mad-combine.ll       |   9 +-
 4 files changed, 421 insertions(+), 414 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 297d6edac2e5c..d2bf2568df276 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6316,8 +6316,9 @@ static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1,
 /// would duplicate the multiply without reducing the total number of
 /// operations.
 ///
-/// Currently checks for the following pattern:
+/// Currently checks for the following patterns:
 ///   - fmul --> fadd/fsub: Direct contraction
+///   - fmul --> fneg --> fsub: Contraction through fneg
 bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const {
   Register MulReg = MI.getOperand(0).getReg();
 
@@ -6328,6 +6329,17 @@ bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const {
     if (Opcode == TargetOpcode::G_FADD || Opcode == TargetOpcode::G_FSUB)
       continue;
 
+    // G_FNEG use - contractable if all users of the fneg are G_FSUB.
+    if (Opcode == TargetOpcode::G_FNEG) {
+      Register FNegReg = UseMI.getOperand(0).getReg();
+      for (const MachineInstr &FNegUser : MRI.use_nodbg_instructions(FNegReg)) {
+        unsigned FNegUserOp = FNegUser.getOpcode();
+        if (FNegUserOp != TargetOpcode::G_FSUB)
+          return false;
+      }
+      continue;
+    }
+
     // Any other use type is not currently recognized as contractable.
     return false;
   }
@@ -6751,10 +6763,15 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA(
 
   MachineInstr *FMulMI;
   // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+  // Only contract if both fneg and fmul have one use (both eliminated), or
+  // under Aggressive mode if all uses of the multiply are contractable
+  // (including through fneg -> fsub chains), avoiding duplication of the
+  // multiply without reducing total operations.
   if (mi_match(LHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) &&
-      (Aggressive || (MRI.hasOneNonDBGUse(LHSReg) &&
-                      MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) &&
-      isContractableFMul(*FMulMI, AllowFusionGlobally)) {
+      isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+      ((MRI.hasOneNonDBGUse(LHSReg) &&
+        MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
+       (Aggressive && allMulUsesCanBeContracted(*FMulMI)))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       Register NegX =
           B.buildFNeg(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
@@ -6766,10 +6783,19 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA(
   }
 
   // fold (fsub x, (fneg (fmul, y, z))) -> (fma y, z, x)
+  // Note: In the standard combiner ordering, redundant_neg_operands
+  // canonicalizes fsub(x, fneg(y)) -> fadd(x, y) before fma_combines runs,
+  // so this fold may not fire in practice. It is kept as defensive code
+  // against combiner reordering.
+  // Only contract if both fneg and fmul have one use (both eliminated), or
+  // under Aggressive mode if all uses of the multiply are contractable
+  // (including through fneg -> fsub chains), avoiding duplication of the
+  // multiply without reducing total operations.
   if (mi_match(RHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) &&
-      (Aggressive || (MRI.hasOneNonDBGUse(RHSReg) &&
-                      MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) &&
-      isContractableFMul(*FMulMI, AllowFusionGlobally)) {
+      isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+      ((MRI.hasOneNonDBGUse(RHSReg) &&
+        MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
+       (Aggressive && allMulUsesCanBeContracted(*FMulMI)))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
                    {FMulMI->getOperand(1).getReg(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4e1fab5391e87..ffcdb9c40bb81 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17685,8 +17685,9 @@ static bool isFusedOp(const MatchContextClass &Matcher, SDValue N) {
 /// would duplicate the multiply without reducing the total number of
 /// operations.
 ///
-/// Currently checks for the following pattern:
+/// Currently checks for the following patterns:
 ///   - fmul --> fadd/fsub: Direct contraction
+///   - fmul --> fneg --> fsub: Contraction through fneg
 static bool allMulUsesCanBeContracted(SDValue Mul) {
   for (const auto *User : Mul->users()) {
     unsigned Opcode = User->getOpcode();
@@ -17695,6 +17696,16 @@ static bool allMulUsesCanBeContracted(SDValue Mul) {
     if (Opcode == ISD::FADD || Opcode == ISD::FSUB)
       continue;
 
+    // FNEG use - contractable if all users of the fneg are FSUB.
+    if (Opcode == ISD::FNEG) {
+      for (const auto *FNegUser : User->users()) {
+        unsigned FNegUserOp = FNegUser->getOpcode();
+        if (FNegUserOp != ISD::FSUB)
+          return false;
+      }
+      continue;
+    }
+
     // Any other use type is not currently recognized as contractable.
     return false;
   }
@@ -18038,8 +18049,17 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   }
 
   // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
+  // Note: SDAG does not need the symmetric fold (fsub x, (fneg (fmul y, z)))
+  // because visitFSUB canonicalizes fsub(A, fneg(B)) -> fadd(A, B) before
+  // calling visitFSUBForFMACombine, so that pattern is handled by
+  // visitFADDForFMACombine instead.
+  // Only contract if both fneg and fmul have one use (both eliminated), or
+  // under Aggressive mode if all uses of the multiply are contractable
+  // (including through fneg -> fsub chains), avoiding duplication of the
+  // multiply without reducing total operations.
   if (matcher.match(N0, ISD::FNEG) && isContractableFMUL(N0.getOperand(0)) &&
-      (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
+      ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
+       (Aggressive && allMulUsesCanBeContracted(N0.getOperand(0))))) {
     SDValue N00 = N0.getOperand(0).getOperand(0);
     SDValue N01 = N0.getOperand(0).getOperand(1);
     return matcher.getNode(PreferredFusedOpcode, SL, VT,
diff --git a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
index 803561b2b29a6..99dabe9961033 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
@@ -714,15 +714,9 @@ define { float, float, float } @mul_three_contractable_uses(float %a, float %b,
   ret { float, float, float } %ret2
 }
 
-
 ; ==========================================================================
 ; FNEG patterns
 ; Tests for allMulUsesCanBeContracted recognizing fneg as a transparent user.
-;
-; NOTE: The allMulUsesCanBeContracted guard does not yet recognize fneg as
-; transparent. That support is added by the next patch in the series. Until
-; then, the CHECK lines below reflect current (potentially over-conservative)
-; codegen and may not match the "Expected:" comments on individual tests.
 ; ==========================================================================
 
 ; Test case: fmul -> fneg -> fsub (single use chain).
@@ -731,55 +725,55 @@ define { float, float, float } @mul_three_contractable_uses(float %a, float %b,
 ; Should contract -- single-use chain, fneg folds into fma.
 ; Expected: single fma/mad, no v_mul.
 define float @mul_fneg_fsub_single_use(float %a, float %b, float %c) {
-; P0-GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_fsub_single_use:
-; P0-GFX9-SDAG-F32FLUSH:       ; %bb.0:
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v0, v0, -v1, -v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_fsub_single_use:
+; GFX9-SDAG-F32FLUSH:       ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v0, v0, -v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_fsub_single_use:
-; P0-GFX9-GISEL-F32FLUSH:       ; %bb.0:
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v0, v0, -v1, -v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_fsub_single_use:
+; GFX9-GISEL-F32FLUSH:       ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v0, v0, -v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-LABEL: mul_fneg_fsub_single_use:
-; P0-GFX9_4-SDAG:       ; %bb.0:
-; P0-GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v0, -v0, v1, -v2
-; P0-GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: mul_fneg_fsub_single_use:
+; GFX9_4-SDAG:       ; %bb.0:
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v0, -v0, v1, -v2
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-LABEL: mul_fneg_fsub_single_use:
-; P0-GFX9_4-GISEL:       ; %bb.0:
-; P0-GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v0, v0, -v1, -v2
-; P0-GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: mul_fneg_fsub_single_use:
+; GFX9_4-GISEL:       ; %bb.0:
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-LABEL: mul_fneg_fsub_single_use:
-; P0-GFX12_5-SDAG:       ; %bb.0:
-; P0-GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    v_fma_f32 v0, -v0, v1, -v2
-; P0-GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: mul_fneg_fsub_single_use:
+; GFX12_5-SDAG:       ; %bb.0:
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_fma_f32 v0, -v0, v1, -v2
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-LABEL: mul_fneg_fsub_single_use:
-; P0-GFX12_5-GISEL:       ; %bb.0:
-; P0-GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    v_fma_f32 v0, v0, -v1, -v2
-; P0-GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: mul_fneg_fsub_single_use:
+; GFX12_5-GISEL:       ; %bb.0:
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX9-SDAG-F32DENORM-LABEL: mul_fneg_fsub_single_use:
-; P0-GFX9-SDAG-F32DENORM:       ; %bb.0:
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v0, -v0, v1, -v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_fsub_single_use:
+; GFX9-SDAG-F32DENORM:       ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v0, -v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32DENORM-LABEL: mul_fneg_fsub_single_use:
-; P0-GFX9-GISEL-F32DENORM:       ; %bb.0:
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v0, v0, -v1, -v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_fsub_single_use:
+; GFX9-GISEL-F32DENORM:       ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract float %a, %b
   %neg = fneg contract float %mul
   %sub = fsub contract float %neg, %c       ; contractable
@@ -792,71 +786,71 @@ define float @mul_fneg_fsub_single_use(float %a, float %b, float %c) {
 ; Should contract -- all fneg uses are contractable fsubs.
 ; Expected: two fma/mad instructions, no v_mul.
 define { float, float } @mul_fneg_multiple_fsub_uses(float %a, float %b, float %c, float %d) {
-; P0-GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_multiple_fsub_uses:
-; P0-GFX9-SDAG-F32FLUSH:       ; %bb.0:
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v2, v0, -v1, -v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, -v3
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9-SDAG-F32FLUSH:       ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v2, v0, -v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_multiple_fsub_uses:
-; P0-GFX9-GISEL-F32FLUSH:       ; %bb.0:
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v2, v0, -v1, -v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, -v3
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9-GISEL-F32FLUSH:       ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v2, v0, -v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-LABEL: mul_fneg_multiple_fsub_uses:
-; P0-GFX9_4-SDAG:       ; %bb.0:
-; P0-GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v2, -v0, v1, -v2
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v1, -v0, v1, -v3
-; P0-GFX9_4-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9_4-SDAG:       ; %bb.0:
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v2, -v0, v1, -v2
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v1, -v0, v1, -v3
+; GFX9_4-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-LABEL: mul_fneg_multiple_fsub_uses:
-; P0-GFX9_4-GISEL:       ; %bb.0:
-; P0-GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v2, v0, -v1, -v2
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v1, v0, -v1, -v3
-; P0-GFX9_4-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9_4-GISEL:       ; %bb.0:
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v2, v0, -v1, -v2
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v1, v0, -v1, -v3
+; GFX9_4-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-LABEL: mul_fneg_multiple_fsub_uses:
-; P0-GFX12_5-SDAG:       ; %bb.0:
-; P0-GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    v_dual_fma_f32 v2, -v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
-; P0-GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX12_5-SDAG:       ; %bb.0:
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_dual_fma_f32 v2, -v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-LABEL: mul_fneg_multiple_fsub_uses:
-; P0-GFX12_5-GISEL:       ; %bb.0:
-; P0-GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    v_dual_fma_f32 v2, v0, -v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
-; P0-GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX12_5-GISEL:       ; %bb.0:
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_dual_fma_f32 v2, v0, -v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX9-SDAG-F32DENORM-LABEL: mul_fneg_multiple_fsub_uses:
-; P0-GFX9-SDAG-F32DENORM:       ; %bb.0:
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v2, -v0, v1, -v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v1, -v0, v1, -v3
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9-SDAG-F32DENORM:       ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v2, -v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32DENORM-LABEL: mul_fneg_multiple_fsub_uses:
-; P0-GFX9-GISEL-F32DENORM:       ; %bb.0:
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v2, v0, -v1, -v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v1, v0, -v1, -v3
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_multiple_fsub_uses:
+; GFX9-GISEL-F32DENORM:       ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v2, v0, -v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract float %a, %b
   %neg = fneg contract float %mul
   %sub1 = fsub contract float %neg, %c      ; contractable
@@ -872,71 +866,71 @@ define { float, float } @mul_fneg_multiple_fsub_uses(float %a, float %b, float %
 ; Should contract -- both fneg uses (fsub, fadd) are contractable.
 ; Expected: two fma/mad instructions, no v_mul.
 define { float, float } @mul_fneg_mixed_uses(float %a, float %b, float %c, float %d) {
-; P0-GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_mixed_uses:
-; P0-GFX9-SDAG-F32FLUSH:       ; %bb.0:
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v2, -v0, v1, -v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v1, -v0, v1, v3
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_mixed_uses:
+; GFX9-SDAG-F32FLUSH:       ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v2, -v0, v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v1, -v0, v1, v3
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_mixed_uses:
-; P0-GFX9-GISEL-F32FLUSH:       ; %bb.0:
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v2, v0, -v1, -v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, v3
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_mixed_uses:
+; GFX9-GISEL-F32FLUSH:       ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v2, v0, -v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, v3
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-LABEL: mul_fneg_mixed_uses:
-; P0-GFX9_4-SDAG:       ; %bb.0:
-; P0-GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v2, -v0, v1, -v2
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v1, -v0, v1, v3
-; P0-GFX9_4-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: mul_fneg_mixed_uses:
+; GFX9_4-SDAG:       ; %bb.0:
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v2, -v0, v1, -v2
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v1, -v0, v1, v3
+; GFX9_4-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-LABEL: mul_fneg_mixed_uses:
-; P0-GFX9_4-GISEL:       ; %bb.0:
-; P0-GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v2, v0, -v1, -v2
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v1, v0, -v1, v3
-; P0-GFX9_4-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: mul_fneg_mixed_uses:
+; GFX9_4-GISEL:       ; %bb.0:
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v2, v0, -v1, -v2
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v1, v0, -v1, v3
+; GFX9_4-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-LABEL: mul_fneg_mixed_uses:
-; P0-GFX12_5-SDAG:       ; %bb.0:
-; P0-GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    v_dual_fma_f32 v2, -v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, v3
-; P0-GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: mul_fneg_mixed_uses:
+; GFX12_5-SDAG:       ; %bb.0:
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_dual_fma_f32 v2, -v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-LABEL: mul_fneg_mixed_uses:
-; P0-GFX12_5-GISEL:       ; %bb.0:
-; P0-GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    v_dual_fma_f32 v2, v0, -v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, v3
-; P0-GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: mul_fneg_mixed_uses:
+; GFX12_5-GISEL:       ; %bb.0:
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_dual_fma_f32 v2, v0, -v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX9-SDAG-F32DENORM-LABEL: mul_fneg_mixed_uses:
-; P0-GFX9-SDAG-F32DENORM:       ; %bb.0:
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v2, -v0, v1, -v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v1, -v0, v1, v3
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_mixed_uses:
+; GFX9-SDAG-F32DENORM:       ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v2, -v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v1, -v0, v1, v3
+; GFX9-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32DENORM-LABEL: mul_fneg_mixed_uses:
-; P0-GFX9-GISEL-F32DENORM:       ; %bb.0:
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v2, v0, -v1, -v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v1, v0, -v1, v3
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_mixed_uses:
+; GFX9-GISEL-F32DENORM:       ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v2, v0, -v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v1, v0, -v1, v3
+; GFX9-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract float %a, %b
   %neg = fneg contract float %mul
   %sub = fsub contract float %neg, %c       ; contractable
@@ -952,71 +946,55 @@ define { float, float } @mul_fneg_mixed_uses(float %a, float %b, float %c, float
 ; Should NOT contract -- one fneg user (fmul) is not contractable.
 ; Expected: v_mul + v_mul, no fma contraction.
 define { float, float } @mul_fneg_mixed_uses_2(float %a, float %b, float %c, float %d) {
-; P0-GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_mixed_uses_2:
-; P0-GFX9-SDAG-F32FLUSH:       ; %bb.0:
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v0, v0, -v1, -v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_mixed_uses_2:
-; P0-GFX9-GISEL-F32FLUSH:       ; %bb.0:
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v0, v0, -v1, -v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-LABEL: mul_fneg_mixed_uses_2:
-; P0-GFX9_4-SDAG:       ; %bb.0:
-; P0-GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v0, -v0, v1, -v2
-; P0-GFX9_4-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: mul_fneg_mixed_uses_2:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, v0, -v1
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-LABEL: mul_fneg_mixed_uses_2:
-; P0-GFX9_4-GISEL:       ; %bb.0:
-; P0-GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v0, v0, -v1, -v2
-; P0-GFX9_4-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: mul_fneg_mixed_uses_2:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v1, v0, -v1
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-LABEL: mul_fneg_mixed_uses_2:
-; P0-GFX12_5-SDAG:       ; %bb.0:
-; P0-GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, -v0, v1, -v2
-; P0-GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-LABEL: mul_fneg_mixed_uses_2:
+; GFX9_4-SDAG:       ; %bb.0:
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_mul_f32_e64 v1, v0, -v1
+; GFX9_4-SDAG-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9_4-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-LABEL: mul_fneg_mixed_uses_2:
-; P0-GFX12_5-GISEL:       ; %bb.0:
-; P0-GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, -v1, -v2
-; P0-GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-GISEL-LABEL: mul_fneg_mixed_uses_2:
+; GFX9_4-GISEL:       ; %bb.0:
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_mul_f32_e64 v1, v0, -v1
+; GFX9_4-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9_4-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-SDAG-F32DENORM-LABEL: mul_fneg_mixed_uses_2:
-; P0-GFX9-SDAG-F32DENORM:       ; %bb.0:
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v0, -v0, v1, -v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX12_5-SDAG-LABEL: mul_fneg_mixed_uses_2:
+; GFX12_5-SDAG:       ; %bb.0:
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_mul_f32_e64 v1, v0, -v1
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32DENORM-LABEL: mul_fneg_mixed_uses_2:
-; P0-GFX9-GISEL-F32DENORM:       ; %bb.0:
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v0, v0, -v1, -v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX12_5-GISEL-LABEL: mul_fneg_mixed_uses_2:
+; GFX12_5-GISEL:       ; %bb.0:
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_mul_f32_e64 v1, v0, -v1
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
   %mul = fmul contract float %a, %b
   %neg = fneg contract float %mul
   %sub = fsub contract float %neg, %c       ; contractable
@@ -1032,71 +1010,55 @@ define { float, float } @mul_fneg_mixed_uses_2(float %a, float %b, float %c, flo
 ; Should NOT contract -- one path (fneg -> fmul) is not contractable.
 ; Expected: v_mul shared by both paths, no fma contraction.
 define { float, float } @mul_fneg_nonfsub_noncontractable(float %a, float %b, float %c, float %d) {
-; P0-GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_nonfsub_noncontractable:
-; P0-GFX9-SDAG-F32FLUSH:       ; %bb.0:
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mul_f32_e32 v4, v0, v1
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v0, v0, v1, v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mul_f32_e64 v1, -v4, v3
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_nonfsub_noncontractable:
-; P0-GFX9-GISEL-F32FLUSH:       ; %bb.0:
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v0, v0, v1, v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
-; P0-GFX9_4-SDAG:       ; %bb.0:
-; P0-GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v0, v0, v1, v2
-; P0-GFX9_4-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v0, v1
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
-; P0-GFX9_4-GISEL:       ; %bb.0:
-; P0-GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v0, v0, v1, v2
-; P0-GFX9_4-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v1, v0, v1
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
-; P0-GFX12_5-SDAG:       ; %bb.0:
-; P0-GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, v1, v2
-; P0-GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9_4-SDAG:       ; %bb.0:
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_mul_f32_e32 v1, v0, v1
+; GFX9_4-SDAG-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9_4-SDAG-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
-; P0-GFX12_5-GISEL:       ; %bb.0:
-; P0-GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, v1, v2
-; P0-GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9_4-GISEL:       ; %bb.0:
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_mul_f32_e32 v1, v0, v1
+; GFX9_4-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9_4-GISEL-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-SDAG-F32DENORM-LABEL: mul_fneg_nonfsub_noncontractable:
-; P0-GFX9-SDAG-F32DENORM:       ; %bb.0:
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX12_5-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX12_5-SDAG:       ; %bb.0:
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_mul_f32_e32 v1, v0, v1
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mul_f32 v1, -v1, v3
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32DENORM-LABEL: mul_fneg_nonfsub_noncontractable:
-; P0-GFX9-GISEL-F32DENORM:       ; %bb.0:
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_mul_f32_e64 v4, v0, -v1
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_mul_f32_e32 v1, v4, v3
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX12_5-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX12_5-GISEL:       ; %bb.0:
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_mul_f32_e32 v1, v0, v1
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mul_f32 v1, -v1, v3
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
   %mul = fmul contract float %a, %b
   %neg = fneg float %mul
   %add = fadd contract float %mul, %c       ; contractable
@@ -1112,71 +1074,71 @@ define { float, float } @mul_fneg_nonfsub_noncontractable(float %a, float %b, fl
 ; Should contract -- all paths are contractable.
 ; Expected: two fma/mad instructions, no v_mul.
 define { float, float } @mul_direct_and_fneg_contractable_uses(float %a, float %b, float %c, float %d) {
-; P0-GFX9-SDAG-F32FLUSH-LABEL: mul_direct_and_fneg_contractable_uses:
-; P0-GFX9-SDAG-F32FLUSH:       ; %bb.0:
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v2, v0, v1, v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v1, -v0, v1, -v3
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32FLUSH-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9-SDAG-F32FLUSH:       ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32FLUSH-LABEL: mul_direct_and_fneg_contractable_uses:
-; P0-GFX9-GISEL-F32FLUSH:       ; %bb.0:
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v2, v0, v1, v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, -v3
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32FLUSH-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9-GISEL-F32FLUSH:       ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-LABEL: mul_direct_and_fneg_contractable_uses:
-; P0-GFX9_4-SDAG:       ; %bb.0:
-; P0-GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v2, v0, v1, v2
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v1, -v0, v1, -v3
-; P0-GFX9_4-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9_4-SDAG:       ; %bb.0:
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v2, v0, v1, v2
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v1, -v0, v1, -v3
+; GFX9_4-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-LABEL: mul_direct_and_fneg_contractable_uses:
-; P0-GFX9_4-GISEL:       ; %bb.0:
-; P0-GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v2, v0, v1, v2
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v1, v0, -v1, -v3
-; P0-GFX9_4-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9_4-GISEL:       ; %bb.0:
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v2, v0, v1, v2
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v1, v0, -v1, -v3
+; GFX9_4-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-LABEL: mul_direct_and_fneg_contractable_uses:
-; P0-GFX12_5-SDAG:       ; %bb.0:
-; P0-GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
-; P0-GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX12_5-SDAG:       ; %bb.0:
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-LABEL: mul_direct_and_fneg_contractable_uses:
-; P0-GFX12_5-GISEL:       ; %bb.0:
-; P0-GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
-; P0-GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX12_5-GISEL:       ; %bb.0:
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX9-SDAG-F32DENORM-LABEL: mul_direct_and_fneg_contractable_uses:
-; P0-GFX9-SDAG-F32DENORM:       ; %bb.0:
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v2, v0, v1, v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v1, -v0, v1, -v3
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32DENORM-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9-SDAG-F32DENORM:       ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v2, v0, v1, v2
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32DENORM-LABEL: mul_direct_and_fneg_contractable_uses:
-; P0-GFX9-GISEL-F32DENORM:       ; %bb.0:
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v2, v0, v1, v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v1, v0, -v1, -v3
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32DENORM-LABEL: mul_direct_and_fneg_contractable_uses:
+; GFX9-GISEL-F32DENORM:       ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v2, v0, v1, v2
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract float %a, %b
   %add = fadd contract float %mul, %c       ; contractable
   %neg = fneg contract float %mul
@@ -1192,71 +1154,71 @@ define { float, float } @mul_direct_and_fneg_contractable_uses(float %a, float %
 ; Should contract -- all paths are contractable.
 ; Expected: two fma/mad instructions, no v_mul.
 define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b, float %c, float %d) {
-; P0-GFX9-SDAG-F32FLUSH-LABEL: mul_fsub_and_fneg_fsub_contractable:
-; P0-GFX9-SDAG-F32FLUSH:       ; %bb.0:
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v2, v0, v1, -v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v1, -v0, v1, -v3
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32FLUSH-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9-SDAG-F32FLUSH:       ; %bb.0:
+; GFX9-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v2, v0, v1, -v2
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mad_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32FLUSH-LABEL: mul_fsub_and_fneg_fsub_contractable:
-; P0-GFX9-GISEL-F32FLUSH:       ; %bb.0:
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v2, v0, v1, -v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, -v3
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32FLUSH-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9-GISEL-F32FLUSH:       ; %bb.0:
+; GFX9-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v2, v0, v1, -v2
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mad_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-LABEL: mul_fsub_and_fneg_fsub_contractable:
-; P0-GFX9_4-SDAG:       ; %bb.0:
-; P0-GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v2, v0, v1, -v2
-; P0-GFX9_4-SDAG-NEXT:    v_fma_f32 v1, -v0, v1, -v3
-; P0-GFX9_4-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9_4-SDAG:       ; %bb.0:
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v2, v0, v1, -v2
+; GFX9_4-SDAG-NEXT:    v_fma_f32 v1, -v0, v1, -v3
+; GFX9_4-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-LABEL: mul_fsub_and_fneg_fsub_contractable:
-; P0-GFX9_4-GISEL:       ; %bb.0:
-; P0-GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v2, v0, v1, -v2
-; P0-GFX9_4-GISEL-NEXT:    v_fma_f32 v1, v0, -v1, -v3
-; P0-GFX9_4-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9_4-GISEL:       ; %bb.0:
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v2, v0, v1, -v2
+; GFX9_4-GISEL-NEXT:    v_fma_f32 v1, v0, -v1, -v3
+; GFX9_4-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-LABEL: mul_fsub_and_fneg_fsub_contractable:
-; P0-GFX12_5-SDAG:       ; %bb.0:
-; P0-GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-NEXT:    v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
-; P0-GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX12_5-SDAG:       ; %bb.0:
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-LABEL: mul_fsub_and_fneg_fsub_contractable:
-; P0-GFX12_5-GISEL:       ; %bb.0:
-; P0-GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-NEXT:    v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
-; P0-GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX12_5-GISEL:       ; %bb.0:
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX9-SDAG-F32DENORM-LABEL: mul_fsub_and_fneg_fsub_contractable:
-; P0-GFX9-SDAG-F32DENORM:       ; %bb.0:
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v2, v0, v1, -v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v1, -v0, v1, -v3
-; P0-GFX9-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-F32DENORM-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9-SDAG-F32DENORM:       ; %bb.0:
+; GFX9-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v2, v0, v1, -v2
+; GFX9-SDAG-F32DENORM-NEXT:    v_fma_f32 v1, -v0, v1, -v3
+; GFX9-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-F32DENORM-LABEL: mul_fsub_and_fneg_fsub_contractable:
-; P0-GFX9-GISEL-F32DENORM:       ; %bb.0:
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v2, v0, v1, -v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v1, v0, -v1, -v3
-; P0-GFX9-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-F32DENORM-LABEL: mul_fsub_and_fneg_fsub_contractable:
+; GFX9-GISEL-F32DENORM:       ; %bb.0:
+; GFX9-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v2, v0, v1, -v2
+; GFX9-GISEL-F32DENORM-NEXT:    v_fma_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract float %a, %b
   %sub1 = fsub contract float %mul, %c      ; contractable
   %neg = fneg contract float %mul
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
index a1bbe00635ed9..cf6732d30d080 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -960,12 +960,11 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1)
 ; SI-STD-NEXT:    buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:12 glc
 ; SI-STD-NEXT:    s_waitcnt vmcnt(0)
 ; SI-STD-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-STD-NEXT:    v_mul_f32_e32 v6, v2, v3
-; SI-STD-NEXT:    v_mad_f32 v2, -v2, v3, -v4
-; SI-STD-NEXT:    v_sub_f32_e32 v3, v6, v5
-; SI-STD-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-STD-NEXT:    v_mad_f32 v4, -v2, v3, -v4
+; SI-STD-NEXT:    v_mad_f32 v2, v2, v3, -v5
+; SI-STD-NEXT:    buffer_store_dword v4, v[0:1], s[0:3], 0 addr64
 ; SI-STD-NEXT:    s_waitcnt vmcnt(0)
-; SI-STD-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-STD-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4
 ; SI-STD-NEXT:    s_waitcnt vmcnt(0)
 ; SI-STD-NEXT:    s_endpgm
 ;



More information about the llvm-branch-commits mailing list