[llvm-branch-commits] [llvm] [AMDGPU][DAGCombiner][GlobalISel] Extend allMulUsesCanBeContracted with FPEXT pattern (PR #188116)

Adel Ejjeh via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Mar 30 09:54:40 PDT 2026


https://github.com/adelejjeh updated https://github.com/llvm/llvm-project/pull/188116

>From 562894cd69fbc3015314746b6994129af1f73579 Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <adel.ejjeh at amd.com>
Date: Thu, 12 Mar 2026 10:09:35 -0500
Subject: [PATCH] [AMDGPU][DAGCombiner][GlobalISel] Extend
 allMulUsesCanBeContracted with FPEXT pattern

Extend the allMulUsesCanBeContracted analysis to recognize FPEXT patterns
where the multiply result flows through fpext before being used in
contractable operations (fadd, fsub). This covers:
  - fmul --> fpext --> {fadd, fsub}: contractable if isFPExtFoldable
  - fmul --> fpext --> fneg --> fsub: contractable if isFPExtFoldable and
    every user of the fneg is an fsub
  - fmul --> fneg --> fpext --> fsub: contractable if isFPExtFoldable and
    every user of the fpext is an fsub

Also adds allMulUsesCanBeContracted guards to all FPEXT fold sites in
both SDAG (visitFADDForFMACombine, visitFSUBForFMACombine) and GISel
(matchCombineFAddFpExtFMulToFMadOrFMA, matchCombineFSubFpExtFMulToFMadOrFMA,
matchCombineFSubFpExtFNegFMulToFMadOrFMA).

Also tightens the use guard in GISel's
matchCombineFSubFpExtFMulToFMadOrFMA, which previously folded whenever
Aggressive was set without verifying that all uses of the multiply are
contractable (the existing isFPExtFoldable check is unchanged).

Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
Made-with: Cursor
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |    3 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  102 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   91 +-
 .../AMDGPU/fma-multiple-uses-contraction.ll   | 2185 ++++++++---------
 4 files changed, 1152 insertions(+), 1229 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 09c827f71a34d..8440fdcbbd08b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -805,7 +805,8 @@ class CombinerHelper {
 
   /// Check if all uses of a multiply can be contracted into fma/fmad
   /// operations, so that duplicating the multiply is acceptable.
-  bool allMulUsesCanBeContracted(const MachineInstr &MI) const;
+  bool allMulUsesCanBeContracted(const MachineInstr &MI,
+                                 unsigned PreferredFusedOpcode) const;
 
   bool canCombineFMadOrFMA(MachineInstr &MI, bool &AllowFusionGlobally,
                            bool &HasFMAD, bool &Aggressive,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index d2bf2568df276..0941e6da0f40f 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6316,10 +6316,15 @@ static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1,
 /// would duplicate the multiply without reducing the total number of
 /// operations.
 ///
-/// Currently checks for the following patterns:
+/// This uses a simple, non-recursive check for the following patterns:
 ///   - fmul --> fadd/fsub: Direct contraction
 ///   - fmul --> fneg --> fsub: Contraction through fneg
-bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const {
+///   - fmul --> fneg --> fpext --> fsub: FNEG then FPEXT folds if foldable
+///   - fmul --> fpext --> {fadd, fsub}: FPEXT folds if foldable
+///   - fmul --> fpext --> fneg --> fsub: FPEXT then FNEG to FSUB
+bool CombinerHelper::allMulUsesCanBeContracted(
+    const MachineInstr &MI, unsigned PreferredFusedOpcode) const {
+  const auto &TLI = getTargetLowering();
   Register MulReg = MI.getOperand(0).getReg();
 
   for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(MulReg)) {
@@ -6329,13 +6334,66 @@ bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const {
     if (Opcode == TargetOpcode::G_FADD || Opcode == TargetOpcode::G_FSUB)
       continue;
 
-    // G_FNEG use - contractable if all users of the fneg are G_FSUB.
+    // FNEG --> FSUB pattern
+    // Also handles FNEG --> FPEXT --> FSUB
     if (Opcode == TargetOpcode::G_FNEG) {
       Register FNegReg = UseMI.getOperand(0).getReg();
-      for (const MachineInstr &FNegUser : MRI.use_nodbg_instructions(FNegReg)) {
-        unsigned FNegUserOp = FNegUser.getOpcode();
-        if (FNegUserOp != TargetOpcode::G_FSUB)
+      // ALL users of the FNEG must be contractable FSUBs or FPEXTs leading to
+      // FSUBs
+      for (const MachineInstr &FNegUseMI :
+           MRI.use_nodbg_instructions(FNegReg)) {
+        unsigned FNegUseOpcode = FNegUseMI.getOpcode();
+
+        if (FNegUseOpcode == TargetOpcode::G_FSUB)
+          continue;
+        if (FNegUseOpcode == TargetOpcode::G_FPEXT) {
+          // FNEG --> FPEXT --> FSUB
+          Register FNegFPExtReg = FNegUseMI.getOperand(0).getReg();
+          for (const MachineInstr &FNegFPExtUseMI :
+               MRI.use_nodbg_instructions(FNegFPExtReg)) {
+            if (FNegFPExtUseMI.getOpcode() != TargetOpcode::G_FSUB)
+              return false;
+            // FPEXT use is FSUB, check if can be folded in
+            if (!TLI.isFPExtFoldable(
+                    FNegFPExtUseMI, PreferredFusedOpcode,
+                    MRI.getType(FNegFPExtUseMI.getOperand(0).getReg()),
+                    MRI.getType(FNegReg)))
+              return false;
+          }
+          continue;
+        }
+        return false;
+      }
+      continue;
+    }
+
+    // FP_EXTEND - check if ALL users are FADD, FSUB, or FNEG --> FSUB
+    if (Opcode == TargetOpcode::G_FPEXT) {
+      Register FPExtReg = UseMI.getOperand(0).getReg();
+
+      // ALL users of the FP_EXTEND must be contractable operations or FNEGs
+      for (const MachineInstr &FPExtUseMI :
+           MRI.use_nodbg_instructions(FPExtReg)) {
+        if (!TLI.isFPExtFoldable(FPExtUseMI, PreferredFusedOpcode,
+                                 MRI.getType(FPExtUseMI.getOperand(0).getReg()),
+                                 MRI.getType(MulReg)))
           return false;
+        unsigned ExtUseOpcode = FPExtUseMI.getOpcode();
+        if (ExtUseOpcode == TargetOpcode::G_FADD ||
+            ExtUseOpcode == TargetOpcode::G_FSUB) {
+          continue;
+        }
+        if (ExtUseOpcode == TargetOpcode::G_FNEG) {
+          // FP_EXTEND --> FNEG --> FSUB
+          Register FPExtFNegReg = FPExtUseMI.getOperand(0).getReg();
+          for (const MachineInstr &FPExtFNegUseMI :
+               MRI.use_nodbg_instructions(FPExtFNegReg)) {
+            if (FPExtFNegUseMI.getOpcode() != TargetOpcode::G_FSUB)
+              return false;
+          }
+          continue;
+        }
+        return false;
       }
       continue;
     }
@@ -6407,7 +6465,8 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
   // avoiding duplication of the multiply without reducing total operations.
   if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
       (MRI.hasOneNonDBGUse(LHS.Reg) ||
-       (Aggressive && allMulUsesCanBeContracted(*LHS.MI)))) {
+       (Aggressive &&
+        allMulUsesCanBeContracted(*LHS.MI, PreferredFusedOpcode)))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
                    {LHS.MI->getOperand(1).getReg(),
@@ -6421,7 +6480,8 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
   // avoiding duplication of the multiply without reducing total operations.
   if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
       (MRI.hasOneNonDBGUse(RHS.Reg) ||
-       (Aggressive && allMulUsesCanBeContracted(*RHS.MI)))) {
+       (Aggressive &&
+        allMulUsesCanBeContracted(*RHS.MI, PreferredFusedOpcode)))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
                    {RHS.MI->getOperand(1).getReg(),
@@ -6464,6 +6524,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
   MachineInstr *FpExtSrc;
   if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
       isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
+      allMulUsesCanBeContracted(*FpExtSrc, PreferredFusedOpcode) &&
       TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
                           MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
@@ -6479,6 +6540,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
   // Note: Commutes FADD operands.
   if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
       isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
+      allMulUsesCanBeContracted(*FpExtSrc, PreferredFusedOpcode) &&
       TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
                           MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
@@ -6718,7 +6780,8 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
   if (FirstMulHasFewerUses &&
       (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
        (MRI.hasOneNonDBGUse(LHS.Reg) ||
-        (Aggressive && allMulUsesCanBeContracted(*LHS.MI))))) {
+        (Aggressive &&
+         allMulUsesCanBeContracted(*LHS.MI, PreferredFusedOpcode))))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0);
       B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
@@ -6732,7 +6795,8 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
   // avoiding duplication of the multiply without reducing total operations.
   if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
       (MRI.hasOneNonDBGUse(RHS.Reg) ||
-       (Aggressive && allMulUsesCanBeContracted(*RHS.MI)))) {
+       (Aggressive &&
+        allMulUsesCanBeContracted(*RHS.MI, PreferredFusedOpcode)))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       Register NegY =
           B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0);
@@ -6771,7 +6835,8 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA(
       isContractableFMul(*FMulMI, AllowFusionGlobally) &&
       ((MRI.hasOneNonDBGUse(LHSReg) &&
         MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
-       (Aggressive && allMulUsesCanBeContracted(*FMulMI)))) {
+       (Aggressive &&
+        allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode)))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       Register NegX =
           B.buildFNeg(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
@@ -6795,7 +6860,8 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA(
       isContractableFMul(*FMulMI, AllowFusionGlobally) &&
       ((MRI.hasOneNonDBGUse(RHSReg) &&
         MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
-       (Aggressive && allMulUsesCanBeContracted(*FMulMI)))) {
+       (Aggressive &&
+        allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode)))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
                    {FMulMI->getOperand(1).getReg(),
@@ -6828,7 +6894,10 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
   // fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z))
   if (mi_match(LHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) &&
       isContractableFMul(*FMulMI, AllowFusionGlobally) &&
-      (Aggressive || MRI.hasOneNonDBGUse(LHSReg)) &&
+      ((MRI.hasOneNonDBGUse(LHSReg) &&
+        MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
+       (Aggressive &&
+        allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode))) &&
       TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
                           MRI.getType(FMulMI->getOperand(0).getReg()))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
@@ -6846,7 +6915,10 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
   // fold (fsub x, (fpext (fmul y, z))) -> (fma (fneg (fpext y)), (fpext z), x)
   if (mi_match(RHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) &&
       isContractableFMul(*FMulMI, AllowFusionGlobally) &&
-      (Aggressive || MRI.hasOneNonDBGUse(RHSReg)) &&
+      ((MRI.hasOneNonDBGUse(RHSReg) &&
+        MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) ||
+       (Aggressive &&
+        allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode))) &&
       TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
                           MRI.getType(FMulMI->getOperand(0).getReg()))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
@@ -6896,6 +6968,7 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
   if ((mi_match(LHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) ||
        mi_match(LHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) &&
       isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+      allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode) &&
       TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
                           MRI.getType(FMulMI->getOperand(0).getReg()))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
@@ -6912,6 +6985,7 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
   if ((mi_match(RHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) ||
        mi_match(RHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) &&
       isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+      allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode) &&
       TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
                           MRI.getType(FMulMI->getOperand(0).getReg()))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ffcdb9c40bb81..e4f1c8adc8abe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17685,25 +17685,75 @@ static bool isFusedOp(const MatchContextClass &Matcher, SDValue N) {
 /// would duplicate the multiply without reducing the total number of
 /// operations.
 ///
-/// Currently checks for the following patterns:
+/// This uses a simple, non-recursive check for the following patterns:
 ///   - fmul --> fadd/fsub: Direct contraction
 ///   - fmul --> fneg --> fsub: Contraction through fneg
-static bool allMulUsesCanBeContracted(SDValue Mul) {
+///   - fmul --> fneg --> fpext --> fsub: FNEG then FPEXT folds if foldable
+///   - fmul --> fpext --> {fadd, fsub}: FPEXT folds if foldable
+///   - fmul --> fpext --> fneg --> fsub: FPEXT then FNEG to FSUB
+static bool allMulUsesCanBeContracted(SDValue Mul,
+                                      const unsigned PreferredFusedOpcode,
+                                      const TargetLowering &TLI,
+                                      SelectionDAG &DAG) {
   for (const auto *User : Mul->users()) {
-    unsigned Opcode = User->getOpcode();
+    SDNode *UserNode = const_cast<SDNode *>(User);
+    unsigned Opcode = UserNode->getOpcode();
 
     // Direct FADD/FSUB - contractable.
     if (Opcode == ISD::FADD || Opcode == ISD::FSUB)
       continue;
 
-    // FNEG use - contractable if all users of the fneg are FSUB.
+    // FNEG - check if ALL users are FSUB or foldable FPEXT --> FSUB
     if (Opcode == ISD::FNEG) {
-      for (const auto *FNegUser : User->users()) {
+      for (const auto *FNegUser : UserNode->users()) {
         unsigned FNegUserOp = FNegUser->getOpcode();
-        if (FNegUserOp != ISD::FSUB)
-          return false;
+
+        if (FNegUserOp == ISD::FSUB) {
+          // FNEG --> FSUB
+          continue;
+        }
+        if (FNegUserOp == ISD::FP_EXTEND) {
+          // FNEG --> FPEXT --> FSUB
+          EVT SrcVT = UserNode->getValueType(0); // Src of FPEXT is the FNEG
+          for (const auto *FNegFPExtUser : FNegUser->users()) {
+            if (FNegFPExtUser->getOpcode() != ISD::FSUB)
+              return false;
+            if (!TLI.isFPExtFoldable(DAG, PreferredFusedOpcode,
+                                     FNegFPExtUser->getValueType(0), SrcVT))
+              return false;
+          }
+          continue;
+        }
+        return false;
       }
-      continue;
+      continue; // All FNEG uses are contractable
+    }
+
+    // FP_EXTEND - check if ALL users are FADD, FSUB, or FNEG --> FSUB
+    if (Opcode == ISD::FP_EXTEND) {
+      EVT SrcVT = Mul.getValueType();
+
+      for (const auto *FPExtUser : UserNode->users()) {
+        unsigned ExtUserOp = FPExtUser->getOpcode();
+        EVT DstVT = FPExtUser->getValueType(0);
+        if (!TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, DstVT, SrcVT))
+          return false; // this FPEXT cannot be folded
+
+        if (ExtUserOp == ISD::FADD || ExtUserOp == ISD::FSUB) {
+          continue; // FPEXT --> {FADD, FSUB} is contractable
+        }
+        if (ExtUserOp == ISD::FNEG) {
+          // FP_EXTEND --> FNEG --> FSUB
+          for (const auto *FPExtFNegUser : FPExtUser->users()) {
+            if (FPExtFNegUser->getOpcode() != ISD::FSUB) {
+              return false;
+            }
+          }
+          continue;
+        }
+        return false;
+      }
+      continue; // All FPEXT uses are contractable
     }
 
     // Any other use type is not currently recognized as contractable.
@@ -17777,7 +17827,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   // Only contract if the multiply has one use or all uses are contractable,
   // avoiding duplication of the multiply without reducing total operations.
   if (isContractableFMUL(N0) &&
-      (N0->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(N0)))) {
+      (N0->hasOneUse() ||
+       (Aggressive &&
+        allMulUsesCanBeContracted(N0, PreferredFusedOpcode, TLI, DAG)))) {
     return matcher.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
                            N0.getOperand(1), N1);
   }
@@ -17787,7 +17839,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   // Only contract if the multiply has one use or all uses are contractable,
   // avoiding duplication of the multiply without reducing total operations.
   if (isContractableFMUL(N1) &&
-      (N1->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(N1)))) {
+      (N1->hasOneUse() ||
+       (Aggressive &&
+        allMulUsesCanBeContracted(N1, PreferredFusedOpcode, TLI, DAG)))) {
     return matcher.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0),
                            N1.getOperand(1), N0);
   }
@@ -17834,6 +17888,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   if (matcher.match(N0, ISD::FP_EXTEND)) {
     SDValue N00 = N0.getOperand(0);
     if (isContractableFMUL(N00) &&
+        allMulUsesCanBeContracted(N00, PreferredFusedOpcode, TLI, DAG) &&
         TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                             N00.getValueType())) {
       return matcher.getNode(
@@ -17848,6 +17903,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   if (matcher.match(N1, ISD::FP_EXTEND)) {
     SDValue N10 = N1.getOperand(0);
     if (isContractableFMUL(N10) &&
+        allMulUsesCanBeContracted(N10, PreferredFusedOpcode, TLI, DAG) &&
         TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                             N10.getValueType())) {
       return matcher.getNode(
@@ -18006,7 +18062,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   // avoiding duplication of the multiply without reducing total operations.
   auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
     if (isContractableFMUL(XY) &&
-        (XY->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(XY)))) {
+        (XY->hasOneUse() ||
+         (Aggressive &&
+          allMulUsesCanBeContracted(XY, PreferredFusedOpcode, TLI, DAG)))) {
       return matcher.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
                              XY.getOperand(1),
                              matcher.getNode(ISD::FNEG, SL, VT, Z));
@@ -18020,7 +18078,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   // avoiding duplication of the multiply without reducing total operations.
   auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
     if (isContractableFMUL(YZ) &&
-        (YZ->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(YZ)))) {
+        (YZ->hasOneUse() ||
+         (Aggressive &&
+          allMulUsesCanBeContracted(YZ, PreferredFusedOpcode, TLI, DAG)))) {
       return matcher.getNode(
           PreferredFusedOpcode, SL, VT,
           matcher.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
@@ -18059,7 +18119,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   // multiply without reducing total operations.
   if (matcher.match(N0, ISD::FNEG) && isContractableFMUL(N0.getOperand(0)) &&
       ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
-       (Aggressive && allMulUsesCanBeContracted(N0.getOperand(0))))) {
+       (Aggressive && allMulUsesCanBeContracted(
+                          N0.getOperand(0), PreferredFusedOpcode, TLI, DAG)))) {
     SDValue N00 = N0.getOperand(0).getOperand(0);
     SDValue N01 = N0.getOperand(0).getOperand(1);
     return matcher.getNode(PreferredFusedOpcode, SL, VT,
@@ -18074,6 +18135,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   if (matcher.match(N0, ISD::FP_EXTEND)) {
     SDValue N00 = N0.getOperand(0);
     if (isContractableFMUL(N00) &&
+        allMulUsesCanBeContracted(N00, PreferredFusedOpcode, TLI, DAG) &&
         TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                             N00.getValueType())) {
       return matcher.getNode(
@@ -18090,6 +18152,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   if (matcher.match(N1, ISD::FP_EXTEND)) {
     SDValue N10 = N1.getOperand(0);
     if (isContractableFMUL(N10) &&
+        allMulUsesCanBeContracted(N10, PreferredFusedOpcode, TLI, DAG) &&
         TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                             N10.getValueType())) {
       return matcher.getNode(
@@ -18112,6 +18175,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     if (matcher.match(N00, ISD::FNEG)) {
       SDValue N000 = N00.getOperand(0);
       if (isContractableFMUL(N000) &&
+          allMulUsesCanBeContracted(N000, PreferredFusedOpcode, TLI, DAG) &&
           TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                               N00.getValueType())) {
         return matcher.getNode(
@@ -18136,6 +18200,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     if (matcher.match(N00, ISD::FP_EXTEND)) {
       SDValue N000 = N00.getOperand(0);
       if (isContractableFMUL(N000) &&
+          allMulUsesCanBeContracted(N000, PreferredFusedOpcode, TLI, DAG) &&
           TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                               N000.getValueType())) {
         return matcher.getNode(
diff --git a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
index 99dabe9961033..701e93411899e 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
@@ -1228,17 +1228,10 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b,
   ret { float, float } %ret1
 }
 
-
 ; ==========================================================================
 ; FPEXT patterns
 ; Tests for allMulUsesCanBeContracted with fpext(fmul) feeding into
 ; fadd, fsub, and fneg combinations.
-;
-; NOTE: The allMulUsesCanBeContracted guard does not yet recognize fpext
-; users of the multiply. That support is added by later patches in the
-; series. Until then, the CHECK lines below reflect current (potentially
-; over-conservative) codegen and may not match the "Expected:" comments on
-; individual tests.
 ; ==========================================================================
 
 ; Test case: fpext(fmul) -> {fadd, fadd} (chained adds, second uses result of first).
@@ -1247,95 +1240,95 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b,
 ; Should contract -- both uses are contractable fadds.
 ; Expected: fma_mix (or fma after cvt) for both adds, no v_mul_f16.
 define float @fpext_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_contractable:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v1, v0, v4
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_contractable:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v1, v0, v4
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_nop 0
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_nop 0
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_nop 0
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_contractable:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, v0, v4
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_contractable:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, v0, v4
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable:
+; GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable:
+; GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_nop 0
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable:
+; GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable:
+; GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable:
+; GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_nop 0
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable:
+; GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_nop 0
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable:
+; GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable:
+; GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v1 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -1350,93 +1343,57 @@ entry:
 ; Should NOT contract -- one user (direct return) is not contractable.
 ; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold.
 define { float, float } @fpext_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_noncontractable:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v4
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_noncontractable:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v4
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_noncontractable:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_noncontractable:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: fpext_noncontractable:
+; GFX9_4-SDAG:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: fpext_noncontractable:
+; GFX9_4-GISEL:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: fpext_noncontractable:
+; GFX12_5-SDAG:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: fpext_noncontractable:
+; GFX12_5-GISEL:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -1452,83 +1409,53 @@ entry:
 ; Should NOT contract -- one user (direct return of half mul) is not contractable.
 ; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold.
 define { float, half } @fpext_noncontractable_2(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_noncontractable_2:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_noncontractable_2:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v4
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_2:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_2:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_2:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_noncontractable_2:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_2:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_noncontractable_2:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_2:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: fpext_noncontractable_2:
+; GFX9_4-SDAG:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_2:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: fpext_noncontractable_2:
+; GFX9_4-GISEL:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_2:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: fpext_noncontractable_2:
+; GFX12_5-SDAG:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_2:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: fpext_noncontractable_2:
+; GFX12_5-GISEL:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -1544,79 +1471,79 @@ entry:
 ; Should contract -- single use, trivially contractable.
 ; Expected: fma_mix (or fma after cvt), no v_mul_f16.
 define float @fpext_contractable_2(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_contractable_2:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_contractable_2:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v4
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_2:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_2:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_contractable_2:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_2:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_contractable_2:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_2:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_2:
+; GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_2:
+; GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_2:
+; GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_2:
+; GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_2:
+; GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -1630,204 +1557,204 @@ entry:
 ; Should contract -- both uses are contractable fadds.
 ; Expected: fma_mix (or fma after cvt) for both adds, no v_mul_f16.
 define {float, float} @fpext_contractable_3(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_contractable_3:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v2, v1, v4
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v1, v0, v1
-; P0-GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_contractable_3:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v2, v1, v4
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9-GISEL-LABEL: fpext_contractable_3:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v2, v1, v4
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v1, v0, v1
-; P0-GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_contractable_3:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v2, v1, v4
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
+; GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_3:
+; GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_3:
+; GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_3:
+; GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_3:
+; GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %mul = fmul contract half %u, %v
+  %mul.ext = fpext contract half %mul to float
+  %add = fadd contract float %mul.ext, %z
+  %add2 = fadd contract float %x, %mul.ext
+  %ret0 = insertvalue { float, float } poison, float %add, 0
+  %ret1 = insertvalue { float, float } %ret0, float %add2, 1
+  ret { float, float } %ret1
+}
+
+; Test case: fpext(fmul) -> {fsub, fsub} (chained subs, second uses result of first).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %sub = fsub(%ext, z), %sub2 = fsub(%sub, %ext).
+; fpext(%mul) has two users: both fsub.
+; Should contract -- both uses are contractable fsubs.
+; Expected: where the fold applies, fma_mix (or fma after cvt) for both subs and no v_mul_f16; the GFX9-SDAG checks keep v_mul_f16 + v_cvt_f32_f16 + v_sub_f32.
+define float @fpext_contractable_sub(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_contractable_sub:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v1, v0, v4
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_3:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_3:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_3:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_3:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_3:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_3:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, 1.0, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
-entry:
-  %mul = fmul contract half %u, %v
-  %mul.ext = fpext contract half %mul to float
-  %add = fadd contract float %mul.ext, %z
-  %add2 = fadd contract float %x, %mul.ext
-  %ret0 = insertvalue { float, float } poison, float %add, 0
-  %ret1 = insertvalue { float, float } %ret0, float %add2, 1
-  ret { float, float } %ret1
-}
-
-; Test case: fpext(fmul) -> {fsub, fsub} (chained subs, second uses result of first).
-; IR: %mul = fmul(u,v), %ext = fpext(%mul), %sub = fsub(%ext, z), %sub2 = fsub(%sub, %ext).
-; fpext(%mul) has two users: both fsub.
-; Should contract -- both uses are contractable fsubs.
-; Expected: fma_mix (or fma after cvt) for both subs, no v_mul_f16.
-define float @fpext_contractable_sub(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_contractable_sub:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v1, v0, v4
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v1, v0
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_contractable_sub:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v1, v0, v4
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_nop 0
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_contractable_sub:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v1, v0, v4
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_nop 0
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub:
+; GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub:
+; GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -1842,93 +1769,57 @@ entry:
 ; Should NOT contract -- one user (direct return) is not contractable.
 ; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold.
 define { float, float } @fpext_noncontractable_sub(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_noncontractable_sub:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v1, v4
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_noncontractable_sub:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v4
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_noncontractable_sub:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v1, v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_noncontractable_sub:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: fpext_noncontractable_sub:
+; GFX9_4-SDAG:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: fpext_noncontractable_sub:
+; GFX9_4-GISEL:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: fpext_noncontractable_sub:
+; GFX12_5-SDAG:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: fpext_noncontractable_sub:
+; GFX12_5-GISEL:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -1944,83 +1835,53 @@ entry:
 ; Should NOT contract -- one user (direct return of half mul) is not contractable.
 ; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold.
 define { float, half } @fpext_noncontractable_sub_2(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v4
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_noncontractable_sub_2:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_noncontractable_sub_2:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: fpext_noncontractable_sub_2:
+; GFX9_4-SDAG:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: fpext_noncontractable_sub_2:
+; GFX9_4-GISEL:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: fpext_noncontractable_sub_2:
+; GFX12_5-SDAG:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub_2:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: fpext_noncontractable_sub_2:
+; GFX12_5-GISEL:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -2036,187 +1897,187 @@ entry:
 ; Should contract -- single use, trivially contractable.
 ; Expected: fma_mix (or fma after cvt), no v_mul_f16.
 define float @fpext_contractable_sub_2(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_contractable_sub_2:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v4
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_contractable_sub_2:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_contractable_sub_2:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_contractable_sub_2:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_2:
+; GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub_2:
+; GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
 entry:
-  %mul = fmul contract half %u, %v
-  %mul.ext = fpext contract half %mul to float
-  %sub = fsub contract float %mul.ext, %z
-  ret float %sub
-}
-
-; Test case: fpext(fmul) -> {fsub, fsub} (two independent subs).
-; IR: %mul = fmul(u,v), %ext = fpext(%mul), %sub = fsub(%ext, z), %sub2 = fsub(x, %ext).
-; fpext(%mul) has two users: both fsub.
-; Should contract -- both uses are contractable fsubs.
-; Expected: fma_mix (or fma after cvt) for both subs, no v_mul_f16.
-define {float, float} @fpext_contractable_sub_3(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_contractable_sub_3:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v1, v4
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v1, v0, v1
-; P0-GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_contractable_sub_3:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v1, v4
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v1, v0, v1
-; P0-GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+  %mul = fmul contract half %u, %v
+  %mul.ext = fpext contract half %mul to float
+  %sub = fsub contract float %mul.ext, %z
+  ret float %sub
+}
+
+; Test case: fpext(fmul) -> {fsub, fsub} (two independent subs).
+; IR: %mul = fmul(u,v), %ext = fpext(%mul), %sub = fsub(%ext, z), %sub2 = fsub(x, %ext).
+; fpext(%mul) has two users: both fsub.
+; Should contract -- both uses are contractable fsubs.
+; Expected: fma_mix (or fma after cvt) for both subs, no v_mul_f16.
+define {float, float} @fpext_contractable_sub_3(float %x, float %y, half %u, half %v, float %z) #0 {
+; GFX9-SDAG-LABEL: fpext_contractable_sub_3:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v1, v4
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v1, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_contractable_sub_3:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v1, v4
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v1, v0, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_contractable_sub_3:
+; GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, -v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, -v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_contractable_sub_3:
+; GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v4, -1.0, v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v1, -1.0, v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -2233,105 +2094,105 @@ entry:
 ; Should contract -- all paths are contractable.
 ; Expected: fma_mix (or fma after cvt) for both paths, no v_mul_f16.
 define {float, float} @fpext_fneg_fpext_fsub_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, -v1
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v0
-; P0-GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e64 v1, -v1
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v0
-; P0-GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_contractable:
+; GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -2350,89 +2211,53 @@ entry:
 ; Should NOT contract -- one user (direct return of half mul) is not contractable.
 ; Expected: v_mul_f16, no fma_mix fold.
 define {float, half} @fpext_fneg_fpext_fsub_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -v1
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v4
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v1
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -v1
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v1
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9_4-SDAG:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX9_4-GISEL:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX12_5-SDAG:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable:
+; GFX12_5-GISEL:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %neg = fneg contract half %mul
@@ -2449,103 +2274,103 @@ entry:
 ; Should contract -- all paths are contractable.
 ; Expected: fma_mix (or fma after cvt) for both paths, no v_mul_f16.
 define {float, float} @fpext_fneg_fsub_contractable(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX9-SDAG-NEXT:    v_add_f32_e32 v2, v1, v4
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e64 v1, -v1, v0
-; P0-GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX9-GISEL-NEXT:    v_add_f32_e32 v2, v1, v4
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e64 v1, -v1, v0
-; P0-GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_fneg_fsub_contractable:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v2, v1, v4
+; GFX9-SDAG-NEXT:    v_sub_f32_e64 v1, -v1, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_fneg_fsub_contractable:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v2, v1, v4
+; GFX9-GISEL-NEXT:    v_sub_f32_e64 v1, -v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GFX9_4-SDAG-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, v3, v0 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_contractable:
+; GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v1, v2, -v3, -v0 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12_5-GISEL-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_contractable:
+; GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v2, v1, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v1, v0, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-F32DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float
@@ -2563,99 +2388,57 @@ entry:
 ; Should NOT contract -- one user (direct return) is not contractable.
 ; Expected: v_mul_f16 + v_cvt_f32_f16, no fma_mix fold.
 define {float, float} @fpext_fneg_fsub_noncontractable(float %x, float %y, half %u, half %v, float %z) #0 {
-; P0-GFX9-SDAG-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX9-SDAG:       ; %bb.0: ; %entry
-; P0-GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9-SDAG-NEXT:    v_sub_f32_e64 v0, -v1, v4
-; P0-GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9-GISEL-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX9-GISEL:       ; %bb.0: ; %entry
-; P0-GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9-GISEL-NEXT:    v_sub_f32_e64 v0, -v1, v4
-; P0-GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX9_4-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_nop 0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; P0-GFX9_4-SDAG-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX9_4-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_nop 0
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; P0-GFX9_4-GISEL-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
-;
-; P0-GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX12_5-SDAG-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-SDAG-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-SDAG-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e64 v0, -v1, v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX12_5-GISEL-F32FLUSH:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0]
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-GISEL-F32FLUSH-NEXT:    s_set_pc_i64 s[30:31]
+; GFX9-GISEL-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e64 v0, -v1, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX9_4-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
-; P0-GFX9_4-SDAG-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-SDAG-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9_4-SDAG:       ; %bb.0: ; %entry
+; GFX9_4-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-SDAG-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
+; GFX9_4-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX9_4-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v0, v2, v3
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
-; P0-GFX9_4-GISEL-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX9_4-GISEL-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX9_4-GISEL:       ; %bb.0: ; %entry
+; GFX9_4-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-GISEL-NEXT:    v_mul_f16_e32 v0, v2, v3
+; GFX9_4-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9_4-GISEL-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1]
+; GFX9_4-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; P0-GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX12_5-SDAG-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-SDAG-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-SDAG-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX12_5-SDAG:       ; %bb.0: ; %entry
+; GFX12_5-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
-; P0-GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable:
-; P0-GFX12_5-GISEL-F32DENORM:       ; %bb.0: ; %entry
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_loadcnt_dscnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_wait_kmcnt 0x0
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_mul_f16_e32 v1, v2, v3
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; P0-GFX12_5-GISEL-F32DENORM-NEXT:    s_set_pc_i64 s[30:31]
+; GFX12_5-GISEL-LABEL: fpext_fneg_fsub_noncontractable:
+; GFX12_5-GISEL:       ; %bb.0: ; %entry
+; GFX12_5-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12_5-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12_5-GISEL-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX12_5-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT:    v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1]
+; GFX12_5-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX12_5-GISEL-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
   %mul.ext = fpext contract half %mul to float



More information about the llvm-branch-commits mailing list