[llvm-branch-commits] [llvm] [DAGCombiner][GlobalISel] Prevent FMA contraction when fmul cannot be eliminated (FADD/FSUB pattern) (PR #188114)
Adel Ejjeh via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Mar 30 08:54:23 PDT 2026
https://github.com/adelejjeh updated https://github.com/llvm/llvm-project/pull/188114
>From 81c28b3684aba81ff89091a8b1d8179286595c65 Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <adel.ejjeh at amd.com>
Date: Fri, 27 Mar 2026 15:59:22 -0500
Subject: [PATCH] [DAGCombiner][GlobalISel] Prevent FMA contraction when fmul
cannot be eliminated (FADD/FSUB pattern)
Made-with: Cursor
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 4 +
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 48 +-
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 44 +-
.../amdgpu-simplify-libcall-pow-codegen.ll | 50 +-
.../AMDGPU/copysign-simplify-demanded-bits.ll | 10 +-
.../CodeGen/AMDGPU/dagcombine-fma-crash.ll | 22 +-
.../CodeGen/AMDGPU/dagcombine-fma-fmad.ll | 189 ++---
.../AMDGPU/fma-multiple-uses-contraction.ll | 793 +++++++-----------
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 263 +++---
.../AMDGPU/fmul-2-combine-multi-use.ll | 4 +-
llvm/test/CodeGen/AMDGPU/mad-combine.ll | 9 +-
llvm/test/CodeGen/PowerPC/fma-aggr-FMF.ll | 6 +-
llvm/test/CodeGen/PowerPC/fma-precision.ll | 25 +-
13 files changed, 673 insertions(+), 794 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 365bbaacfe055..09c827f71a34d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -803,6 +803,10 @@ class CombinerHelper {
bool matchFsubToFneg(MachineInstr &MI, Register &MatchInfo) const;
void applyFsubToFneg(MachineInstr &MI, Register &MatchInfo) const;
+ /// Check if all uses of a multiply can be contracted into fma/fmad
+ /// operations, so that duplicating the multiply is acceptable.
+ bool allMulUsesCanBeContracted(const MachineInstr &MI) const;
+
bool canCombineFMadOrFMA(MachineInstr &MI, bool &AllowFusionGlobally,
bool &HasFMAD, bool &Aggressive,
bool CanReassociate = false) const;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index f4d5bd5ee5745..297d6edac2e5c 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6309,6 +6309,32 @@ static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1,
MRI.use_instr_nodbg_end());
}
+/// Return true if every non-debug use of the multiply's result is a G_FADD or
+/// G_FSUB, i.e. a use that FMA/FMAD contraction could absorb, meaning the
+/// multiply can potentially be eliminated through FMA contraction.
+/// Return false if any other kind of instruction uses the result, since the
+/// multiply would then remain and contracting would duplicate it without
+/// reducing the total number of operations.
+///
+/// Only the opcode of each user is inspected. Currently handled pattern:
+/// - fmul --> fadd/fsub: Direct contraction
+bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const {
+ Register MulReg = MI.getOperand(0).getReg();
+
+ for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(MulReg)) {
+ unsigned Opcode = UseMI.getOpcode();
+
+ // A direct G_FADD/G_FSUB user is a contraction candidate; keep scanning.
+ if (Opcode == TargetOpcode::G_FADD || Opcode == TargetOpcode::G_FSUB)
+ continue;
+
+ // Any other use type is not currently recognized as contractable.
+ return false;
+ }
+
+ return true; // No unrecognized user was found.
+}
+
bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
bool &AllowFusionGlobally,
bool &HasFMAD, bool &Aggressive,
@@ -6365,8 +6391,11 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
}
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
+ // Only contract if the multiply has a single use or, in aggressive mode, if
+ // all of its uses are contractable; otherwise the multiply is duplicated.
if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg))) {
+ (MRI.hasOneNonDBGUse(LHS.Reg) ||
+ (Aggressive && allMulUsesCanBeContracted(*LHS.MI)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
{LHS.MI->getOperand(1).getReg(),
@@ -6376,8 +6405,11 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
}
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
+ // Only contract if the multiply has a single use or, in aggressive mode, if
+ // all of its uses are contractable; otherwise the multiply is duplicated.
if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg))) {
+ (MRI.hasOneNonDBGUse(RHS.Reg) ||
+ (Aggressive && allMulUsesCanBeContracted(*RHS.MI)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
{RHS.MI->getOperand(1).getReg(),
@@ -6669,9 +6701,12 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
// fold (fsub (fmul x, y), z) -> (fma x, y, -z)
+ // Only contract if the multiply has a single use or, in aggressive mode, if
+ // all of its uses are contractable; otherwise the multiply is duplicated.
if (FirstMulHasFewerUses &&
(isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg)))) {
+ (MRI.hasOneNonDBGUse(LHS.Reg) ||
+ (Aggressive && allMulUsesCanBeContracted(*LHS.MI))))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0);
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
@@ -6681,8 +6716,11 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
return true;
}
// fold (fsub x, (fmul y, z)) -> (fma -y, z, x)
- else if ((isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg)))) {
+ // Only contract if the multiply has a single use or, in aggressive mode, if
+ // all of its uses are contractable; otherwise the multiply is duplicated.
+ if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ (MRI.hasOneNonDBGUse(RHS.Reg) ||
+ (Aggressive && allMulUsesCanBeContracted(*RHS.MI)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
Register NegY =
B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1e1c184b9d119..4e1fab5391e87 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17678,6 +17678,30 @@ static bool isFusedOp(const MatchContextClass &Matcher, SDValue N) {
return Matcher.match(N, ISD::FMA) || Matcher.match(N, ISD::FMAD);
}
+/// Return true if every user of the multiply node is an ISD::FADD or
+/// ISD::FSUB, i.e. a use that FMA/FMAD contraction could absorb, meaning the
+/// multiply can potentially be eliminated through FMA contraction.
+/// Return false if any other kind of node uses the multiply, since the
+/// multiply would then remain and contracting would duplicate it without
+/// reducing the total number of operations.
+///
+/// Only the opcode of each user is inspected. Currently handled pattern:
+/// - fmul --> fadd/fsub: Direct contraction
+static bool allMulUsesCanBeContracted(SDValue Mul) {
+ for (const auto *User : Mul->users()) {
+ unsigned Opcode = User->getOpcode();
+
+ // A direct FADD/FSUB user is a contraction candidate; keep scanning.
+ if (Opcode == ISD::FADD || Opcode == ISD::FSUB)
+ continue;
+
+ // Any other use type is not currently recognized as contractable.
+ return false;
+ }
+
+ return true; // No unrecognized user was found.
+}
+
/// Try to perform FMA combining on a given FADD node.
template <class MatchContextClass>
SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
@@ -17739,14 +17763,20 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
}
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
- if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
+ // Only contract if the multiply has a single use or, in aggressive mode, if
+ // all of its uses are contractable; otherwise the multiply is duplicated.
+ if (isContractableFMUL(N0) &&
+ (N0->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(N0)))) {
return matcher.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
N0.getOperand(1), N1);
}
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
// Note: Commutes FADD operands.
- if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
+ // Only contract if the multiply has a single use or, in aggressive mode, if
+ // all of its uses are contractable; otherwise the multiply is duplicated.
+ if (isContractableFMUL(N1) &&
+ (N1->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(N1)))) {
return matcher.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0),
N1.getOperand(1), N0);
}
@@ -17961,8 +17991,11 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
};
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+ // Only contract if the multiply has a single use or, in aggressive mode, if
+ // all of its uses are contractable; otherwise the multiply is duplicated.
auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
- if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
+ if (isContractableFMUL(XY) &&
+ (XY->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(XY)))) {
return matcher.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
XY.getOperand(1),
matcher.getNode(ISD::FNEG, SL, VT, Z));
@@ -17972,8 +18005,11 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
// Note: Commutes FSUB operands.
+ // Only contract if the multiply has a single use or, in aggressive mode, if
+ // all of its uses are contractable; otherwise the multiply is duplicated.
auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
- if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
+ if (isContractableFMUL(YZ) &&
+ (YZ->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(YZ)))) {
return matcher.getNode(
PreferredFusedOpcode, SL, VT,
matcher.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index afe0971088bc1..056ec8215e6b4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -90,12 +90,12 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT: v_trunc_f32_e32 v3, v1
-; CHECK-NEXT: v_mul_f32_e32 v4, v2, v3
+; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000
-; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; CHECK-NEXT: v_fma_f32 v2, v2, v3, v4
+; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NEXT: v_add_f32_e32 v2, v2, v3
; CHECK-NEXT: v_exp_f32_e32 v2, v2
; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT: v_not_b32_e32 v3, 63
@@ -235,11 +235,11 @@ define float @test_powr_fast_f32(float %x, float %y) {
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT: v_sub_f32_e32 v0, v0, v2
-; CHECK-NEXT: v_mul_f32_e32 v2, v1, v0
-; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000
-; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
-; CHECK-NEXT: v_fma_f32 v0, v1, v0, v2
+; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x42800000
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NEXT: v_add_f32_e32 v0, v0, v1
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: v_not_b32_e32 v1, 63
; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -374,12 +374,12 @@ define float @test_pown_fast_f32(float %x, i32 %y) {
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
-; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4
+; CHECK-NEXT: v_mul_f32_e32 v2, v2, v4
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000
-; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3
+; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NEXT: v_add_f32_e32 v2, v2, v3
; CHECK-NEXT: v_exp_f32_e32 v2, v2
; CHECK-NEXT: v_not_b32_e32 v3, 63
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
@@ -517,12 +517,12 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v0, v0, v2
-; CHECK-NEXT: v_mul_f32_e32 v2, v0, v1
+; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000
-; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
-; CHECK-NEXT: v_fma_f32 v0, v0, v1, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x42800000
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NEXT: v_add_f32_e32 v0, v0, v1
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: v_not_b32_e32 v1, 63
; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -656,12 +656,12 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
-; CHECK-NEXT: v_mul_f32_e32 v3, v2, v1
+; CHECK-NEXT: v_mul_f32_e32 v1, v2, v1
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x42800000
-; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; CHECK-NEXT: v_fma_f32 v1, v2, v1, v3
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x42800000
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT: v_add_f32_e32 v1, v1, v2
; CHECK-NEXT: v_exp_f32_e32 v1, v1
; CHECK-NEXT: v_not_b32_e32 v2, 63
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index ef676ddc8070e..3c67255ebdc5b 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -339,12 +339,12 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v1
; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x42800000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v3
; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_not_b32_e32 v3, 63
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
index 142494a803755..57070e763e79b 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
@@ -20,22 +20,24 @@ define void @main(float %arg) {
; CHECK-NEXT: bb.1.bb2:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
- ; CHECK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_MUL_F32_e64 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, [[V_MUL_F32_e64_]], 0, [[S_MOV_B32_2]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, killed [[V_FMAC_F32_e64_1]], 0, [[S_MOV_B32_2]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.bb11:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_FMAC_F32_e64_1]], %bb.1
- ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_ADD_F32_e64_]], %bb.1
- ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_2]], %bb.1
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_FMAC_F32_e64_]], %bb.1
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_ADD_F32_e64_1]], %bb.1
+ ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_3]], %bb.1
; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI2]], implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
- ; CHECK-NEXT: S_CMP_LG_U32 killed [[COPY1]], killed [[S_MOV_B32_3]], implicit-def $scc
+ ; CHECK-NEXT: S_CMP_LG_U32 killed [[COPY1]], killed [[S_MOV_B32_4]], implicit-def $scc
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY2]], implicit-def dead $scc
; CHECK-NEXT: $vcc_lo = COPY [[S_AND_B32_1]]
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 28a18ec3845e0..e95f19a19d134 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -12,64 +12,64 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
-; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_buffer_load_dword s24, s[0:3], 0x5c
-; GFX10-NEXT: s_buffer_load_dword s28, s[0:3], 0x7c
-; GFX10-NEXT: s_buffer_load_dword s29, s[0:3], 0xc0
+; GFX10-NEXT: s_buffer_load_dword s25, s[0:3], 0x7c
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_nop 0
; GFX10-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_buffer_load_dword s26, s[0:3], 0xc0
; GFX10-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x50
-; GFX10-NEXT: s_nop 0
-; GFX10-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c
-; GFX10-NEXT: v_sub_f32_e64 v5, s24, s28
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x4
; GFX10-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60
-; GFX10-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x20
-; GFX10-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0
-; GFX10-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70
-; GFX10-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10
-; GFX10-NEXT: v_fma_f32 v1, v1, v5, s28
-; GFX10-NEXT: v_max_f32_e64 v6, s0, s0 clamp
-; GFX10-NEXT: v_add_f32_e64 v5, s29, -1.0
-; GFX10-NEXT: v_sub_f32_e32 v8, s0, v1
-; GFX10-NEXT: v_fma_f32 v7, -s2, v6, s6
-; GFX10-NEXT: v_fma_f32 v5, v6, v5, 1.0
-; GFX10-NEXT: v_mad_f32 v10, s2, v6, v2
-; GFX10-NEXT: s_mov_b32 s0, 0x3c23d70a
-; GFX10-NEXT: v_fmac_f32_e32 v1, v6, v8
-; GFX10-NEXT: v_fmac_f32_e32 v10, v7, v6
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mul_f32_e32 v9, s10, v0
-; GFX10-NEXT: v_fma_f32 v0, -v0, s10, s14
-; GFX10-NEXT: v_mul_f32_e32 v8, s18, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, s22, v3
-; GFX10-NEXT: v_fmac_f32_e32 v9, v0, v6
-; GFX10-NEXT: v_sub_f32_e32 v0, v1, v5
-; GFX10-NEXT: v_mul_f32_e32 v1, v8, v6
-; GFX10-NEXT: v_mul_f32_e32 v7, v6, v3
-; GFX10-NEXT: v_fma_f32 v3, -v6, v3, v9
-; GFX10-NEXT: v_fmac_f32_e32 v5, v0, v6
-; GFX10-NEXT: v_fma_f32 v0, v2, s26, -v1
-; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v6
-; GFX10-NEXT: v_fmac_f32_e32 v1, v0, v6
-; GFX10-NEXT: v_mul_f32_e32 v0, v2, v6
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: s_buffer_load_dword s4, s[0:3], 0x2c
+; GFX10-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x70
+; GFX10-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x20
+; GFX10-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x0
+; GFX10-NEXT: v_max_f32_e64 v5, s0, s0 clamp
+; GFX10-NEXT: v_sub_f32_e64 v6, s24, s25
+; GFX10-NEXT: v_mul_f32_e32 v7, s2, v5
+; GFX10-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x10
+; GFX10-NEXT: v_fma_f32 v1, v1, v6, s25
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, 0x3c23d70a
+; GFX10-NEXT: v_add_f32_e64 v6, s26, -1.0
+; GFX10-NEXT: v_sub_f32_e32 v8, s6, v7
+; GFX10-NEXT: v_mul_f32_e32 v0, s10, v0
+; GFX10-NEXT: v_sub_f32_e32 v9, s4, v1
+; GFX10-NEXT: v_mul_f32_e32 v2, s14, v2
+; GFX10-NEXT: v_fma_f32 v6, v5, v6, 1.0
+; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v5
+; GFX10-NEXT: v_sub_f32_e32 v8, s18, v0
+; GFX10-NEXT: v_fmac_f32_e32 v1, v5, v9
+; GFX10-NEXT: v_mul_f32_e32 v9, s22, v3
+; GFX10-NEXT: v_mul_f32_e32 v2, v5, v2
+; GFX10-NEXT: v_add_f32_e32 v7, v3, v7
+; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v5
+; GFX10-NEXT: v_sub_f32_e32 v1, v1, v6
+; GFX10-NEXT: v_mul_f32_e32 v8, v9, v5
+; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_fmac_f32_e32 v6, v1, v5
+; GFX10-NEXT: v_fma_f32 v1, v3, s2, -v8
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v5
+; GFX10-NEXT: v_fmaak_f32 v0, s0, v6, 0x3ca3d70a
+; GFX10-NEXT: v_fmac_f32_e32 v8, v1, v5
+; GFX10-NEXT: v_mul_f32_e32 v1, v3, v5
+; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX10-NEXT: v_mul_f32_e32 v3, v4, v6
-; GFX10-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
-; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX10-NEXT: v_mul_f32_e32 v2, v7, v4
-; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v0
-; GFX10-NEXT: v_max_f32_e32 v0, 0, v1
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v8
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_max_f32_e32 v0, 0, v2
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: _amdgpu_ps_main:
@@ -80,67 +80,68 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_buffer_load_b32 s24, s[0:3], 0x5c
-; GFX11-NEXT: s_buffer_load_b32 s28, s[0:3], 0x7c
-; GFX11-NEXT: s_buffer_load_b32 s29, s[0:3], 0xc0
+; GFX11-NEXT: s_buffer_load_b32 s25, s[0:3], 0x7c
; GFX11-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0x40
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_buffer_load_b32 s26, s[0:3], 0xc0
; GFX11-NEXT: s_buffer_load_b128 s[4:7], s[0:3], 0x50
-; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x2c
-; GFX11-NEXT: v_sub_f32_e64 v5, s24, s28
+; GFX11-NEXT: s_buffer_load_b128 s[8:11], s[0:3], 0x60
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: s_buffer_load_b128 s[8:11], s[0:3], 0x60
-; GFX11-NEXT: s_buffer_load_b128 s[12:15], s[0:3], 0x20
-; GFX11-NEXT: s_buffer_load_b128 s[16:19], s[0:3], 0x0
-; GFX11-NEXT: s_buffer_load_b128 s[20:23], s[0:3], 0x70
-; GFX11-NEXT: v_fma_f32 v1, v1, v5, s28
-; GFX11-NEXT: v_max_f32_e64 v6, s0, s0 clamp
-; GFX11-NEXT: s_buffer_load_b128 s[24:27], s[0:3], 0x10
-; GFX11-NEXT: v_add_f32_e64 v5, s29, -1.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_f32_e32 v8, s0, v1
-; GFX11-NEXT: v_fma_f32 v7, -s2, v6, s6
-; GFX11-NEXT: v_fma_f32 v10, s2, v6, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_fma_f32 v5, v6, v5, 1.0
-; GFX11-NEXT: s_mov_b32 s0, 0x3c23d70a
+; GFX11-NEXT: s_buffer_load_b32 s4, s[0:3], 0x2c
+; GFX11-NEXT: s_buffer_load_b128 s[12:15], s[0:3], 0x70
+; GFX11-NEXT: s_buffer_load_b128 s[16:19], s[0:3], 0x20
+; GFX11-NEXT: s_buffer_load_b128 s[20:23], s[0:3], 0x0
+; GFX11-NEXT: v_max_f32_e64 v5, s0, s0 clamp
+; GFX11-NEXT: v_sub_f32_e64 v6, s24, s25
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v7, s2, v5
+; GFX11-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0x10
+; GFX11-NEXT: v_fma_f32 v1, v1, v6, s25
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_f32_e32 v9, s10, v0
-; GFX11-NEXT: v_fma_f32 v0, -v0, s10, s14
-; GFX11-NEXT: v_mul_f32_e32 v3, s22, v3
-; GFX11-NEXT: v_dual_fmac_f32 v1, v6, v8 :: v_dual_mul_f32 v8, s18, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_fmac_f32_e32 v9, v0, v6
-; GFX11-NEXT: v_dual_fmac_f32 v10, v7, v6 :: v_dual_mul_f32 v7, v6, v3
+; GFX11-NEXT: s_mov_b32 s0, 0x3c23d70a
+; GFX11-NEXT: v_add_f32_e64 v6, s26, -1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_sub_f32 v8, s6, v7 :: v_dual_sub_f32 v9, s4, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, s10, v0
+; GFX11-NEXT: v_mul_f32_e32 v2, s14, v2
+; GFX11-NEXT: v_fma_f32 v6, v5, v6, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v5
+; GFX11-NEXT: v_dual_fmac_f32 v1, v5, v9 :: v_dual_sub_f32 v8, s18, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v9, s22, v3 :: v_dual_mul_f32 v2, v5, v2
+; GFX11-NEXT: v_add_f32_e32 v7, v3, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_f32_e32 v0, v1, v5
-; GFX11-NEXT: v_fma_f32 v3, -v6, v3, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v6
-; GFX11-NEXT: v_fmac_f32_e32 v5, v0, v6
-; GFX11-NEXT: v_mul_f32_e32 v1, v8, v6
+; GFX11-NEXT: v_dual_sub_f32 v1, v1, v6 :: v_dual_fmac_f32 v0, v8, v5
+; GFX11-NEXT: v_mul_f32_e32 v8, v9, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_fmac_f32_e32 v6, v1, v5
+; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fma_f32 v1, v3, s2, -v8
+; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_fmaak_f32 v0, s0, v6, 0x3ca3d70a
+; GFX11-NEXT: v_fmac_f32_e32 v8, v1, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v1, v3, v5 :: v_dual_mul_f32 v0, v2, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v3, v4, v6 :: v_dual_fmaak_f32 v4, s0, v5, 0x3ca3d70a
-; GFX11-NEXT: v_fma_f32 v0, v2, s26, -v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_fmac_f32_e32 v1, v0, v6
-; GFX11-NEXT: v_mul_f32_e32 v0, v2, v6
-; GFX11-NEXT: v_mul_f32_e32 v2, v7, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v8
+; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, 0, v1
+; GFX11-NEXT: v_max_f32_e32 v0, 0, v2
; GFX11-NEXT: ; return to shader part epilog
.entry:
%0 = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
index 78eeb5e807e58..8535cea11dc14 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll
@@ -25,79 +25,55 @@
; Should NOT contract -- one user (fmul) is not contractable.
; Expected: v_mul shared by both paths, no fma contraction.
define { float, float } @mul_has_noncontractable_use(float %a, float %b, float %c, float %d) {
-; GFX9-SDAG-F32FLUSH-LABEL: mul_has_noncontractable_use:
-; GFX9-SDAG-F32FLUSH: ; %bb.0:
-; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
-; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: mul_has_noncontractable_use:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v1, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: mul_has_noncontractable_use:
-; GFX9-GISEL-F32FLUSH: ; %bb.0:
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: mul_has_noncontractable_use:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: mul_has_noncontractable_use:
; GFX9_4-SDAG: ; %bb.0:
; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, v3
-; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-GISEL-LABEL: mul_has_noncontractable_use:
; GFX9_4-GISEL: ; %bb.0:
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, v3
-; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12_5-SDAG-LABEL: mul_has_noncontractable_use:
; GFX12_5-SDAG: ; %bb.0:
; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-NEXT: v_dual_mul_f32 v4, v0, v1 :: v_dual_fma_f32 v1, v0, v1, v3
-; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_dual_mul_f32 v0, v1, v2 :: v_dual_add_f32 v1, v1, v3
; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-LABEL: mul_has_noncontractable_use:
; GFX12_5-GISEL: ; %bb.0:
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v4, v0, v1 :: v_dual_fma_f32 v1, v0, v1, v3
-; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v0, v1, v2 :: v_dual_add_f32 v1, v1, v3
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX9-SDAG-F32DENORM-LABEL: mul_has_noncontractable_use:
-; GFX9-SDAG-F32DENORM: ; %bb.0:
-; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
-; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: mul_has_noncontractable_use:
-; GFX9-GISEL-F32DENORM: ; %bb.0:
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
%mul = fmul contract float %a, %b
%extrause = fmul contract float %mul, %c ; non-contractable
%fma1 = fadd contract float %mul, %d ; contractable
@@ -280,15 +256,14 @@ define { float, float, float } @mul_constant_two_contractable_uses_one_noncontra
; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v3
; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
-; GFX9-GISEL-F32FLUSH: ; %bb.0:
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v3, 2.0, v0
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v4, 2.0, v0, v1
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, 2.0, v0, v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v3, 2.0, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v3, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v3, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
; GFX9_4-SDAG: ; %bb.0:
@@ -304,9 +279,8 @@ define { float, float, float } @mul_constant_two_contractable_uses_one_noncontra
; GFX9_4-GISEL: ; %bb.0:
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v3, 2.0, v0
-; GFX9_4-GISEL-NEXT: v_fma_f32 v4, 2.0, v0, v1
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, 2.0, v0, v2
-; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v3, v1
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v1, v3, v2
; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v3
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -323,9 +297,10 @@ define { float, float, float } @mul_constant_two_contractable_uses_one_noncontra
; GFX12_5-GISEL: ; %bb.0:
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v3, 2.0, v0, v1 :: v_dual_fma_f32 v1, 2.0, v0, v2
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v3, 2.0, v0
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v2, 2.0, v0 :: v_dual_mov_b32 v0, v3
+; GFX12_5-GISEL-NEXT: v_dual_add_f32 v0, v3, v1 :: v_dual_add_f32 v1, v3, v2
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v2, v3
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-SDAG-F32DENORM-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
@@ -337,16 +312,6 @@ define { float, float, float } @mul_constant_two_contractable_uses_one_noncontra
; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v4
; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v3
; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: mul_constant_two_contractable_uses_one_noncontractable:
-; GFX9-GISEL-F32DENORM: ; %bb.0:
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v3, 2.0, v0
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v4, v0, 2.0, v1
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, 2.0, v2
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
%mul = fmul contract float %a, 2.0
%fma1 = fadd contract float %mul, %c
%fma2 = fadd contract float %mul, %d
@@ -362,33 +327,30 @@ define { float, float, float } @mul_constant_two_contractable_uses_one_noncontra
; Should NOT contract -- one user (direct return) is not contractable.
; Expected: v_mul shared by all paths, no fma contraction.
define { float, float, float } @mul_two_contractable_uses_plus_direct_use(float %a, float %b, float %c, float %d) {
-; GFX9-SDAG-F32FLUSH-LABEL: mul_two_contractable_uses_plus_direct_use:
-; GFX9-SDAG-F32FLUSH: ; %bb.0:
-; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
-; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v4, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v4, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: mul_two_contractable_uses_plus_direct_use:
-; GFX9-GISEL-F32FLUSH: ; %bb.0:
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, v3
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: mul_two_contractable_uses_plus_direct_use:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v4, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v4, v3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: mul_two_contractable_uses_plus_direct_use:
; GFX9_4-SDAG: ; %bb.0:
; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, v2
-; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, v3
-; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v0, v4, v2
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v1, v4, v3
; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v4
; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -396,9 +358,8 @@ define { float, float, float } @mul_two_contractable_uses_plus_direct_use(float
; GFX9_4-GISEL: ; %bb.0:
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, v2
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, v3
-; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v4, v2
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v1, v4, v3
; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v4
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -406,41 +367,21 @@ define { float, float, float } @mul_two_contractable_uses_plus_direct_use(float
; GFX12_5-SDAG: ; %bb.0:
; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v4, v0, v1, v2 :: v_dual_fma_f32 v3, v0, v1, v3
-; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12_5-SDAG-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_dual_add_f32 v0, v4, v2 :: v_dual_add_f32 v1, v4, v3
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v2, v4
; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-LABEL: mul_two_contractable_uses_plus_direct_use:
; GFX12_5-GISEL: ; %bb.0:
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v4, v0, v1, v2 :: v_dual_fma_f32 v3, v0, v1, v3
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
-; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12_5-GISEL-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_add_f32 v0, v4, v2 :: v_dual_add_f32 v1, v4, v3
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v2, v4
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX9-SDAG-F32DENORM-LABEL: mul_two_contractable_uses_plus_direct_use:
-; GFX9-SDAG-F32DENORM: ; %bb.0:
-; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
-; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: mul_two_contractable_uses_plus_direct_use:
-; GFX9-GISEL-F32DENORM: ; %bb.0:
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, v3
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
%mul = fmul contract float %a, %b
%fma1 = fadd contract float %mul, %c
%fma2 = fadd contract float %mul, %d
@@ -535,33 +476,30 @@ define { float, float } @mul_two_contractable_fsub_uses(float %a, float %b, floa
; Should NOT contract -- one user (direct return) is not contractable.
; Expected: v_mul shared by all paths, no fma contraction.
define { float, float, float } @mul_two_contractable_fsub_uses_plus_direct_use(float %a, float %b, float %c, float %d) {
-; GFX9-SDAG-F32FLUSH-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
-; GFX9-SDAG-F32FLUSH: ; %bb.0:
-; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, -v3
-; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v0, v4, v2
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
-; GFX9-GISEL-F32FLUSH: ; %bb.0:
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, v1, -v3
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v4, v2
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v1, v4, v3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
; GFX9_4-SDAG: ; %bb.0:
; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9_4-SDAG-NEXT: v_fma_f32 v2, v0, v1, -v2
-; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v0, v1, -v3
-; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-SDAG-NEXT: v_sub_f32_e32 v0, v4, v2
+; GFX9_4-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3
; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v4
; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -569,9 +507,8 @@ define { float, float, float } @mul_two_contractable_fsub_uses_plus_direct_use(f
; GFX9_4-GISEL: ; %bb.0:
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, -v2
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v3
-; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v0, v4, v2
+; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v1, v4, v3
; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v4
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -579,41 +516,21 @@ define { float, float, float } @mul_two_contractable_fsub_uses_plus_direct_use(f
; GFX12_5-SDAG: ; %bb.0:
; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v4, v0, v1, -v2 :: v_dual_fma_f32 v3, v0, v1, -v3
-; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12_5-SDAG-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-SDAG-NEXT: v_dual_sub_f32 v0, v4, v2 :: v_dual_sub_f32 v1, v4, v3
+; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v2, v4
; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
; GFX12_5-GISEL: ; %bb.0:
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v4, v0, v1, -v2 :: v_dual_fma_f32 v3, v0, v1, -v3
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
-; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12_5-GISEL-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v4, v0, v1
+; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12_5-GISEL-NEXT: v_dual_sub_f32 v0, v4, v2 :: v_dual_sub_f32 v1, v4, v3
+; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v2, v4
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX9-SDAG-F32DENORM-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
-; GFX9-SDAG-F32DENORM: ; %bb.0:
-; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, -v3
-; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: mul_two_contractable_fsub_uses_plus_direct_use:
-; GFX9-GISEL-F32DENORM: ; %bb.0:
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, v1, -v3
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
%mul = fmul contract float %a, %b
%fma1 = fsub contract float %mul, %c
%fma2 = fsub contract float %mul, %d
@@ -800,6 +717,11 @@ define { float, float, float } @mul_three_contractable_uses(float %a, float %b,
; ==========================================================================
; FNEG patterns
; Tests for allMulUsesCanBeContracted recognizing fneg as a transparent user.
+;
+; NOTE: The allMulUsesCanBeContracted guard does not yet recognize fneg as
+; transparent. That support is added by the next patch in the series. Until
+; then, the CHECK lines below reflect current (potentially over-conservative)
+; codegen and may not match the "Expected:" comments on individual tests.
; ==========================================================================
; Test case: fmul -> fneg -> fsub (single use chain).
@@ -1032,18 +954,18 @@ define { float, float } @mul_fneg_mixed_uses_2(float %a, float %b, float %c, flo
; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_mixed_uses_2:
; GFX9-SDAG-F32FLUSH: ; %bb.0:
; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, v0, -v1, -v2
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e64 v1, v0, -v1
+; GFX9-SDAG-F32FLUSH-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_mixed_uses_2:
-; GFX9-GISEL-F32FLUSH: ; %bb.0:
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v0, v0, -v1, -v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v1, v4, v3
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: mul_fneg_mixed_uses_2:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e64 v1, v0, -v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: mul_fneg_mixed_uses_2:
; GFX9_4-SDAG: ; %bb.0:
@@ -1056,9 +978,9 @@ define { float, float } @mul_fneg_mixed_uses_2(float %a, float %b, float %c, flo
; GFX9_4-GISEL-LABEL: mul_fneg_mixed_uses_2:
; GFX9_4-GISEL: ; %bb.0:
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, -v1, -v2
-; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9_4-GISEL-NEXT: v_mul_f32_e64 v1, v0, -v1
+; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12_5-SDAG-LABEL: mul_fneg_mixed_uses_2:
@@ -1074,9 +996,9 @@ define { float, float } @mul_fneg_mixed_uses_2(float %a, float %b, float %c, flo
; GFX12_5-GISEL: ; %bb.0:
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, -v1, -v2
+; GFX12_5-GISEL-NEXT: v_mul_f32_e64 v1, v0, -v1
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX12_5-GISEL-NEXT: v_dual_sub_f32 v0, v1, v2 :: v_dual_mul_f32 v1, v1, v3
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_mixed_uses_2:
@@ -1086,14 +1008,6 @@ define { float, float } @mul_fneg_mixed_uses_2(float %a, float %b, float %c, flo
; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, -v0, v1, -v2
; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v1, v4, v3
; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_mixed_uses_2:
-; GFX9-GISEL-F32DENORM: ; %bb.0:
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, -v1, -v2
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v1, v4, v3
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
%mul = fmul contract float %a, %b
%neg = fneg contract float %mul
%sub = fsub contract float %neg, %c ; contractable
@@ -1109,71 +1023,55 @@ define { float, float } @mul_fneg_mixed_uses_2(float %a, float %b, float %c, flo
; Should NOT contract -- one path (fneg -> fmul) is not contractable.
; Expected: v_mul shared by both paths, no fma contraction.
define { float, float } @mul_fneg_nonfsub_noncontractable(float %a, float %b, float %c, float %d) {
-; GFX9-SDAG-F32FLUSH-LABEL: mul_fneg_nonfsub_noncontractable:
-; GFX9-SDAG-F32FLUSH: ; %bb.0:
-; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v4, v0, v1
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, v0, v1, v2
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e64 v1, -v4, v3
-; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: mul_fneg_nonfsub_noncontractable:
-; GFX9-GISEL-F32FLUSH: ; %bb.0:
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v0, v0, v1, v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v1, v4, v3
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-GISEL-NEXT: v_mul_f32_e64 v1, -v1, v3
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
; GFX9_4-SDAG: ; %bb.0:
; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-SDAG-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9_4-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3
; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
; GFX9_4-GISEL: ; %bb.0:
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v1, v0, v1
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_mul_f32_e64 v1, -v1, v3
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12_5-SDAG-LABEL: mul_fneg_nonfsub_noncontractable:
; GFX12_5-SDAG: ; %bb.0:
; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-NEXT: v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, v1, v2
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v1, v0, v1
; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX12_5-SDAG-NEXT: v_dual_add_f32 v0, v1, v2 :: v_dual_mul_f32 v1, -v1, v3
; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-LABEL: mul_fneg_nonfsub_noncontractable:
; GFX12_5-GISEL: ; %bb.0:
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_dual_mul_f32 v4, v0, -v1 :: v_dual_fma_f32 v0, v0, v1, v2
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v1, v0, v1
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v1, v4, v3
+; GFX12_5-GISEL-NEXT: v_dual_add_f32 v0, v1, v2 :: v_dual_mul_f32 v1, -v1, v3
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX9-SDAG-F32DENORM-LABEL: mul_fneg_nonfsub_noncontractable:
-; GFX9-SDAG-F32DENORM: ; %bb.0:
-; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v1, v4, v3
-; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: mul_fneg_nonfsub_noncontractable:
-; GFX9-GISEL-F32DENORM: ; %bb.0:
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e64 v4, v0, -v1
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v1, v4, v3
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
%mul = fmul contract float %a, %b
%neg = fneg float %mul
%add = fadd contract float %mul, %c ; contractable
@@ -1201,7 +1099,7 @@ define { float, float } @mul_direct_and_fneg_contractable_uses(float %a, float %
; GFX9-GISEL-F32FLUSH: ; %bb.0:
; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, -v0, v1, -v3
; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1217,7 +1115,7 @@ define { float, float } @mul_direct_and_fneg_contractable_uses(float %a, float %
; GFX9_4-GISEL: ; %bb.0:
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, v2
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, -v0, v1, -v3
; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1234,7 +1132,7 @@ define { float, float } @mul_direct_and_fneg_contractable_uses(float %a, float %
; GFX12_5-GISEL: ; %bb.0:
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
@@ -1251,7 +1149,7 @@ define { float, float } @mul_direct_and_fneg_contractable_uses(float %a, float %
; GFX9-GISEL-F32DENORM: ; %bb.0:
; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, v2
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, -v0, v1, -v3
; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
%mul = fmul contract float %a, %b
@@ -1281,7 +1179,7 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b,
; GFX9-GISEL-F32FLUSH: ; %bb.0:
; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v2, v0, v1, -v2
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, -v0, v1, -v3
; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1297,7 +1195,7 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b,
; GFX9_4-GISEL: ; %bb.0:
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9_4-GISEL-NEXT: v_fma_f32 v2, v0, v1, -v2
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v1, -v0, v1, -v3
; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v2
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1314,7 +1212,7 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b,
; GFX12_5-GISEL: ; %bb.0:
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, v0, -v1, -v3
+; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v2, v0, v1, -v2 :: v_dual_fma_f32 v1, -v0, v1, -v3
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v2
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
@@ -1331,7 +1229,7 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b,
; GFX9-GISEL-F32DENORM: ; %bb.0:
; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v2, v0, v1, -v2
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v0, -v1, -v3
+; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, -v0, v1, -v3
; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
%mul = fmul contract float %a, %b
@@ -1347,6 +1245,12 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b,
; FPEXT patterns
; Tests for allMulUsesCanBeContracted with fpext(fmul) feeding into
; fadd, fsub, and fneg combinations.
+;
+; NOTE: The allMulUsesCanBeContracted guard does not yet recognize fpext
+; users of the multiply. That support is added by later patches in the
+; series. Until then, the CHECK lines below reflect current (potentially
+; over-conservative) codegen and may not match the "Expected:" comments on
+; individual tests.
; ==========================================================================
; Test case: fpext(fmul) -> {fadd, fadd} (chained adds, second uses result of first).
@@ -2777,6 +2681,12 @@ entry:
; ==========================================================================
; FMA/FMAD chain patterns
; Tests for allMulUsesCanBeContracted recognizing FMA/FMAD as contractable.
+;
+; NOTE: The allMulUsesCanBeContracted guard does not yet recognize FMA/FMAD
+; users of the multiply. That support is added by the final patch in the
+; series. Until then, the CHECK lines below reflect current (potentially
+; over-conservative) codegen and may not match the "Expected:" comments on
+; individual tests.
; ==========================================================================
; Test case: fpext(fmul) -> {fma -> fadd, fadd} (chained fma with fpext).
@@ -2907,10 +2817,10 @@ define {float, float} @fma_chain_fpext_outer_contractable(half %x, half %y, half
; GFX9-SDAG-LABEL: fma_chain_fpext_outer_contractable:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mul_f16_e32 v6, v2, v3
-; GFX9-SDAG-NEXT: v_fma_f16 v0, v0, v1, v6
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-SDAG-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX9-SDAG-NEXT: v_fma_f16 v1, v2, v3, v5
+; GFX9-SDAG-NEXT: v_add_f16_e32 v1, v2, v5
; GFX9-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2918,10 +2828,10 @@ define {float, float} @fma_chain_fpext_outer_contractable(half %x, half %y, half
; GFX9-GISEL-LABEL: fma_chain_fpext_outer_contractable:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mul_f16_e32 v6, v2, v3
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v0, v1, v6
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX9-GISEL-NEXT: v_fma_f16 v1, v2, v3, v5
+; GFX9-GISEL-NEXT: v_add_f16_e32 v1, v2, v5
; GFX9-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2973,10 +2883,10 @@ define {float, float} @fma_chain_fpext_outer_contractable(half %x, half %y, half
; GFX9_4-SDAG-F32DENORM-LABEL: fma_chain_fpext_outer_contractable:
; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry
; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v6, v2, v3
-; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_f16 v2, v2, v3, v5
+; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v3, v2, v3
+; GFX9_4-SDAG-F32DENORM-NEXT: v_add_f16_e32 v2, v3, v5
; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_f16 v0, v0, v1, v6
+; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_f16 v0, v0, v1, v3
; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
; GFX9_4-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v1, v2
; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
@@ -2984,10 +2894,10 @@ define {float, float} @fma_chain_fpext_outer_contractable(half %x, half %y, half
; GFX9_4-GISEL-F32DENORM-LABEL: fma_chain_fpext_outer_contractable:
; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry
; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v6, v2, v3
-; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_f16 v2, v2, v3, v5
+; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v3, v2, v3
+; GFX9_4-GISEL-F32DENORM-NEXT: v_add_f16_e32 v2, v3, v5
; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_f16 v0, v0, v1, v6
+; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_f16 v0, v0, v1, v3
; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
; GFX9_4-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v1, v2
; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
@@ -2996,26 +2906,26 @@ define {float, float} @fma_chain_fpext_outer_contractable(half %x, half %y, half
; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry
; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v6, v2, v3
-; GFX12_5-SDAG-F32DENORM-NEXT: v_fmac_f16_e32 v5, v2, v3
-; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12_5-SDAG-F32DENORM-NEXT: v_fmac_f16_e32 v6, v0, v1
-; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX12_5-SDAG-F32DENORM-NEXT: v_add_f16_e32 v1, v2, v5
+; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v6, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-F32DENORM-LABEL: fma_chain_fpext_outer_contractable:
; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry
; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v6, v2, v3
-; GFX12_5-GISEL-F32DENORM-NEXT: v_fmac_f16_e32 v5, v2, v3
-; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12_5-GISEL-F32DENORM-NEXT: v_fmac_f16_e32 v6, v0, v1
-; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX12_5-GISEL-F32DENORM-NEXT: v_add_f16_e32 v1, v2, v5
+; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0]
; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v6, 1.0, v4 op_sel_hi:[1,1,0]
+; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31]
entry:
%mul = fmul contract half %u, %v
@@ -3288,79 +3198,61 @@ entry:
; reassociation can't fire to eliminate the multiply.
; Expected: v_mul + v_fma + v_add + v_add (no contraction, mul shared by both paths).
define {float, float} @fma_chain_fadd_no_reassoc(float %a, float %b, float %c, float %d, float %e, float %f) #0 {
-; GFX9-SDAG-F32FLUSH-LABEL: fma_chain_fadd_no_reassoc:
-; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
-; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-SDAG-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-SDAG-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
-; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fma_chain_fadd_no_reassoc:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: fma_chain_fadd_no_reassoc:
-; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: fma_chain_fadd_no_reassoc:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: fma_chain_fadd_no_reassoc:
; GFX9_4-SDAG: ; %bb.0: ; %entry
; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9_4-SDAG-NEXT: v_fmac_f32_e32 v6, v0, v1
-; GFX9_4-SDAG-NEXT: v_add_f32_e32 v0, v6, v4
-; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-GISEL-LABEL: fma_chain_fadd_no_reassoc:
; GFX9_4-GISEL: ; %bb.0: ; %entry
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9_4-GISEL-NEXT: v_fmac_f32_e32 v6, v0, v1
-; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v6, v4
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12_5-SDAG-LABEL: fma_chain_fadd_no_reassoc:
; GFX12_5-SDAG: ; %bb.0: ; %entry
; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-SDAG-NEXT: v_dual_fmac_f32 v6, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
-; GFX12_5-SDAG-NEXT: v_add_f32_e32 v0, v6, v4
+; GFX12_5-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12_5-SDAG-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v2, v5
; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-LABEL: fma_chain_fadd_no_reassoc:
; GFX12_5-GISEL: ; %bb.0: ; %entry
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_dual_fmac_f32 v6, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
-; GFX12_5-GISEL-NEXT: v_add_f32_e32 v0, v6, v4
+; GFX12_5-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12_5-GISEL-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v2, v5
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX9-SDAG-F32DENORM-LABEL: fma_chain_fadd_no_reassoc:
-; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
-; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: fma_chain_fadd_no_reassoc:
-; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = fmul contract float %c, %d
%fma.res = call contract float @llvm.fma.f32(float %a, float %b, float %mul)
@@ -3377,85 +3269,61 @@ entry:
; Should NOT contract -- one user (direct return) is not contractable.
; Expected: v_mul shared by all paths, no fma contraction of the multiply.
define {float, float, float} @fma_direct_use_noncontractable(float %a, float %b, float %c, float %d, float %e, float %f) #0 {
-; GFX9-SDAG-F32FLUSH-LABEL: fma_direct_use_noncontractable:
-; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
-; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-SDAG-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-SDAG-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
-; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fma_direct_use_noncontractable:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: fma_direct_use_noncontractable:
-; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: fma_direct_use_noncontractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: fma_direct_use_noncontractable:
; GFX9_4-SDAG: ; %bb.0: ; %entry
; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9_4-SDAG-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9_4-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-GISEL-LABEL: fma_direct_use_noncontractable:
; GFX9_4-GISEL: ; %bb.0: ; %entry
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12_5-SDAG-LABEL: fma_direct_use_noncontractable:
; GFX12_5-SDAG: ; %bb.0: ; %entry
; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v0, v0, v1, v6 :: v_dual_fma_f32 v1, v2, v3, v5
-; GFX12_5-SDAG-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_add_f32 v0, v0, v4
+; GFX12_5-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12_5-SDAG-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v2, v5
; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-LABEL: fma_direct_use_noncontractable:
; GFX12_5-GISEL: ; %bb.0: ; %entry
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v0, v0, v1, v6 :: v_dual_fma_f32 v1, v2, v3, v5
-; GFX12_5-GISEL-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_add_f32 v0, v0, v4
+; GFX12_5-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12_5-GISEL-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v2, v5
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX9-SDAG-F32DENORM-LABEL: fma_direct_use_noncontractable:
-; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
-; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: fma_direct_use_noncontractable:
-; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = fmul contract float %c, %d
%fma.res = call contract float @llvm.fma.f32(float %a, float %b, float %mul)
@@ -3483,14 +3351,14 @@ define {float, float} @fma_chain_fsub_contractable(float %a, float %b, float %c,
; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: fma_chain_fsub_contractable:
-; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32FLUSH-NEXT: v_sub_f32_e32 v0, v4, v0
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: fma_chain_fsub_contractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v4, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: fma_chain_fsub_contractable:
; GFX9_4-SDAG: ; %bb.0: ; %entry
@@ -3503,10 +3371,10 @@ define {float, float} @fma_chain_fsub_contractable(float %a, float %b, float %c,
; GFX9_4-GISEL-LABEL: fma_chain_fsub_contractable:
; GFX9_4-GISEL: ; %bb.0: ; %entry
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9_4-GISEL-NEXT: v_fmac_f32_e32 v6, v0, v1
-; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v0, v4, v6
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v0, v4, v0
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12_5-SDAG-LABEL: fma_chain_fsub_contractable:
@@ -3522,10 +3390,10 @@ define {float, float} @fma_chain_fsub_contractable(float %a, float %b, float %c,
; GFX12_5-GISEL: ; %bb.0: ; %entry
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_dual_fmac_f32 v6, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
-; GFX12_5-GISEL-NEXT: v_sub_f32_e32 v0, v4, v6
+; GFX12_5-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12_5-GISEL-NEXT: v_dual_sub_f32 v0, v4, v0 :: v_dual_add_f32 v1, v2, v5
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-SDAG-F32DENORM-LABEL: fma_chain_fsub_contractable:
@@ -3535,15 +3403,6 @@ define {float, float} @fma_chain_fsub_contractable(float %a, float %b, float %c,
; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, -v0, v1, v4
; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: fma_chain_fsub_contractable:
-; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v4, v0
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = fmul contract reassoc nsz float %c, %d
%fma.res = call contract reassoc nsz float @llvm.fma.f32(float %a, float %b, float %mul)
@@ -3564,81 +3423,69 @@ define {float, float, float} @fma_chain_fsub_noncontractable(float %a, float %b,
; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v4, -v2, v3, v4
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v4
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v2, -v2, v3, v4
+; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2
+; GFX9-SDAG-F32FLUSH-NEXT: v_add_f32_e32 v1, v6, v5
; GFX9-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: fma_chain_fsub_noncontractable:
-; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32FLUSH-NEXT: v_sub_f32_e32 v0, v4, v0
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: fma_chain_fsub_noncontractable:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v4, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: fma_chain_fsub_noncontractable:
; GFX9_4-SDAG: ; %bb.0: ; %entry
; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9_4-SDAG-NEXT: v_fma_f32 v4, -v2, v3, v4
-; GFX9_4-SDAG-NEXT: v_fma_f32 v0, -v0, v1, v4
-; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9_4-SDAG-NEXT: v_fma_f32 v2, -v2, v3, v4
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, -v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v1, v6, v5
; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v2, v6
; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-GISEL-LABEL: fma_chain_fsub_noncontractable:
; GFX9_4-GISEL: ; %bb.0: ; %entry
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v6
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v0, v4, v0
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12_5-SDAG-LABEL: fma_chain_fsub_noncontractable:
; GFX12_5-SDAG: ; %bb.0: ; %entry
; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v6, -v2, v3, v4 :: v_dual_fma_f32 v4, v2, v3, v5
+; GFX12_5-SDAG-NEXT: v_dual_fma_f32 v4, -v2, v3, v4 :: v_dual_mul_f32 v2, v2, v3
; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12_5-SDAG-NEXT: v_dual_mul_f32 v2, v2, v3 :: v_dual_fma_f32 v0, -v0, v1, v6
-; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v1, v4
+; GFX12_5-SDAG-NEXT: v_fma_f32 v0, -v0, v1, v4
+; GFX12_5-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-LABEL: fma_chain_fsub_noncontractable:
; GFX12_5-GISEL: ; %bb.0: ; %entry
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v6, v2, v3
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_dual_fma_f32 v0, v0, v1, v6 :: v_dual_fma_f32 v1, v2, v3, v5
-; GFX12_5-GISEL-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_sub_f32 v0, v4, v0
+; GFX12_5-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12_5-GISEL-NEXT: v_dual_sub_f32 v0, v4, v0 :: v_dual_add_f32 v1, v2, v5
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-SDAG-F32DENORM-LABEL: fma_chain_fsub_noncontractable:
; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v4, -v2, v3, v4
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, -v0, v1, v4
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v2, -v2, v3, v4
+; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, -v0, v1, v2
+; GFX9-SDAG-F32DENORM-NEXT: v_add_f32_e32 v1, v6, v5
; GFX9-SDAG-F32DENORM-NEXT: v_mov_b32_e32 v2, v6
; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: fma_chain_fsub_noncontractable:
-; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v4, v0
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32DENORM-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = fmul contract reassoc nsz float %c, %d
%fma.res = call contract reassoc nsz float @llvm.fma.f32(float %a, float %b, float %mul)
@@ -3659,77 +3506,61 @@ entry:
; reassociation does not fire and the multiply remains.
; Expected: fma chain (no v_mul) on denorm-capable targets; v_mul remains on gfx9-flush.
define {float, float} @fma_chain_fadd_reassoc(float %a, float %b, float %c, float %d, float %e, float %f) #0 {
-; GFX9-SDAG-F32FLUSH-LABEL: fma_chain_fadd_reassoc:
-; GFX9-SDAG-F32FLUSH: ; %bb.0: ; %entry
-; GFX9-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-SDAG-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-SDAG-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-SDAG-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
-; GFX9-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: fma_chain_fadd_reassoc:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-F32FLUSH-LABEL: fma_chain_fadd_reassoc:
-; GFX9-GISEL-F32FLUSH: ; %bb.0: ; %entry
-; GFX9-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32FLUSH-NEXT: v_mul_f32_e32 v6, v2, v3
-; GFX9-GISEL-F32FLUSH-NEXT: v_fma_f32 v0, v0, v1, v6
-; GFX9-GISEL-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-GISEL-F32FLUSH-NEXT: v_mad_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: fma_chain_fadd_reassoc:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-SDAG-LABEL: fma_chain_fadd_reassoc:
; GFX9_4-SDAG: ; %bb.0: ; %entry
; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-SDAG-NEXT: v_fma_f32 v4, v2, v3, v4
-; GFX9_4-SDAG-NEXT: v_fmac_f32_e32 v4, v0, v1
-; GFX9_4-SDAG-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9_4-SDAG-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9_4-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-SDAG-NEXT: v_add_f32_e32 v1, v2, v5
; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9_4-GISEL-LABEL: fma_chain_fadd_reassoc:
; GFX9_4-GISEL: ; %bb.0: ; %entry
; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9_4-GISEL-NEXT: v_fma_f32 v4, v2, v3, v4
-; GFX9_4-GISEL-NEXT: v_fmac_f32_e32 v4, v0, v1
-; GFX9_4-GISEL-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9_4-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX9_4-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX9_4-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX9_4-GISEL-NEXT: v_add_f32_e32 v1, v2, v5
; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12_5-SDAG-LABEL: fma_chain_fadd_reassoc:
; GFX12_5-SDAG: ; %bb.0: ; %entry
; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-SDAG-NEXT: v_fma_f32 v4, v2, v3, v4
+; GFX12_5-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3
; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-SDAG-NEXT: v_dual_fmac_f32 v4, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
-; GFX12_5-SDAG-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-SDAG-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12_5-SDAG-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v2, v5
; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX12_5-GISEL-LABEL: fma_chain_fadd_reassoc:
; GFX12_5-GISEL: ; %bb.0: ; %entry
; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12_5-GISEL-NEXT: v_fma_f32 v4, v2, v3, v4
+; GFX12_5-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12_5-GISEL-NEXT: v_dual_fmac_f32 v4, v0, v1 :: v_dual_fma_f32 v1, v2, v3, v5
-; GFX12_5-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX12_5-GISEL-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12_5-GISEL-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v2, v5
; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX9-SDAG-F32DENORM-LABEL: fma_chain_fadd_reassoc:
-; GFX9-SDAG-F32DENORM: ; %bb.0: ; %entry
-; GFX9-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v4, v2, v3, v4
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v4
-; GFX9-SDAG-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-F32DENORM-LABEL: fma_chain_fadd_reassoc:
-; GFX9-GISEL-F32DENORM: ; %bb.0: ; %entry
-; GFX9-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v4, v2, v3, v4
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v4
-; GFX9-GISEL-F32DENORM-NEXT: v_fma_f32 v1, v2, v3, v5
-; GFX9-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = fmul contract reassoc nsz float %c, %d
%fma.res = call contract reassoc nsz float @llvm.fma.f32(float %a, float %b, float %mul)
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 8d1d08d1e7c68..137c0731aeecb 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -367,11 +367,9 @@ define i32 @test_D139469_f16(half %arg) {
; GFX9-SDAG-LABEL: test_D139469_f16:
; GFX9-SDAG: ; %bb.0: ; %bb
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x291e
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x211e
-; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, 0x291e, v0
-; GFX9-SDAG-NEXT: v_fma_f16 v0, v0, s4, v2
-; GFX9-SDAG-NEXT: v_min_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, 0x291e, v0
+; GFX9-SDAG-NEXT: v_add_f16_e32 v1, 0x211e, v0
+; GFX9-SDAG-NEXT: v_min_f16_e32 v0, v0, v1
; GFX9-SDAG-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0
; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -379,11 +377,9 @@ define i32 @test_D139469_f16(half %arg) {
; GFX9-GISEL-LABEL: test_D139469_f16:
; GFX9-GISEL: ; %bb.0: ; %bb
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x291e
-; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x211e
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, 0x291e, v0
+; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, 0x211e, v0
; GFX9-GISEL-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0
; GFX9-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -392,10 +388,9 @@ define i32 @test_D139469_f16(half %arg) {
; GFX10-SDAG-LABEL: test_D139469_f16:
; GFX10-SDAG: ; %bb.0: ; %bb
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x291e
-; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, 0x291e, v0
-; GFX10-SDAG-NEXT: v_fmaak_f16 v0, s4, v0, 0x211e
-; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, 0x291e, v0
+; GFX10-SDAG-NEXT: v_add_f16_e32 v1, 0x211e, v0
+; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v0, v1
; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -403,10 +398,9 @@ define i32 @test_D139469_f16(half %arg) {
; GFX10-GISEL-LABEL: test_D139469_f16:
; GFX10-GISEL: ; %bb.0: ; %bb
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX10-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX10-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, 0x291e, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v1, 0x211e, v0
+; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v1
; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -415,13 +409,11 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-SDAG-TRUE16-LABEL: test_D139469_f16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0x291e
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x211e
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0x291e, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x291e, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.h, 0x211e, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -429,11 +421,11 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x291e
-; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v1, 0x291e, v0
-; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, s0, v0, 0x211e
+; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v0, 0x291e, v0
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v1, 0x211e, v0
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -441,28 +433,26 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-GISEL-TRUE16-LABEL: test_D139469_f16:
; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x291e, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 0x211e, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_D139469_f16:
; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v0, 0x291e, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 0x211e, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -473,13 +463,11 @@ define i32 @test_D139469_f16(half %arg) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0x291e
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x211e
-; GFX12-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0x291e, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x291e, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v0.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.h, 0x211e, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -492,12 +480,11 @@ define i32 @test_D139469_f16(half %arg) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x291e
-; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v1, 0x291e, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, s0, v0, 0x211e
+; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v0, 0x291e, v0
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v1, v0
+; GFX12-SDAG-FAKE16-NEXT: v_add_f16_e32 v1, 0x211e, v0
+; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -510,12 +497,10 @@ define i32 @test_D139469_f16(half %arg) {
; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
-; GFX12-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
-; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
-; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
-; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x291e, v0.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 0x211e, v0.l
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h
; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -529,12 +514,10 @@ define i32 @test_D139469_f16(half %arg) {
; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX12-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT: v_mul_f16_e32 v0, 0x291e, v0
+; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 0x211e, v0
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -555,10 +538,10 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX9-SDAG: ; %bb.0: ; %bb
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x291e
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x211e
-; GFX9-SDAG-NEXT: v_pk_mul_f16 v1, v0, s4 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v0, s4, v2 op_sel_hi:[1,0,0]
-; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v1, v0
+; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x211e
+; GFX9-SDAG-NEXT: v_pk_add_f16 v1, v0, s4 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v0, v1
; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX9-SDAG-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1
; GFX9-SDAG-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, v2 src0_sel:WORD_1 src1_sel:DWORD
@@ -570,14 +553,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX9-GISEL: ; %bb.0: ; %bb
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x291e291e
-; GFX9-GISEL-NEXT: v_pk_mul_f16 v2, v0, v1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2
-; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[6:7], v2, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x211e211e
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[6:7], v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v2
; GFX9-GISEL-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0
-; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[8:9], v0, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[8:9], v0, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9]
@@ -587,11 +570,10 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX10-SDAG-LABEL: test_D139469_v2f16:
; GFX10-SDAG: ; %bb.0: ; %bb
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x211e
-; GFX10-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s4 op_sel_hi:[0,1,0]
+; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1]
; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-SDAG-NEXT: v_pk_min_f16 v1, v1, v0
+; GFX10-SDAG-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1]
+; GFX10-SDAG-NEXT: v_pk_min_f16 v1, v0, v1
; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
; GFX10-SDAG-NEXT: v_cmp_lt_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -601,14 +583,13 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX10-GISEL-LABEL: test_D139469_v2f16:
; GFX10-GISEL: ; %bb.0: ; %bb
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX10-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v0
-; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s5, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s6, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: v_pk_add_f16 v1, 0x211e211e, v0
+; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v1
+; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s6, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10-GISEL-NEXT: s_or_b32 s4, s5, s6
@@ -618,14 +599,13 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-SDAG-TRUE16-LABEL: test_D139469_v2f16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX11-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1]
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v1, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v1, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.h
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -633,15 +613,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-SDAG-FAKE16-LABEL: test_D139469_v2f16:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e
-; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX11-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1]
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -649,16 +628,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-TRUE16-LABEL: test_D139469_v2f16:
; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_add_f16 v1, 0x211e211e, v0
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v1.h
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2
@@ -669,22 +646,20 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-FAKE16-LABEL: test_D139469_v2f16:
; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
-; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_add_f16 v1, 0x211e211e, v0
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -695,16 +670,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1]
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v1, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1]
+; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v1, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.h
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
@@ -717,17 +690,15 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e
-; GFX12-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX12-SDAG-FAKE16-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1]
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v1, v0
+; GFX12-SDAG-FAKE16-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1]
+; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
@@ -740,16 +711,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX12-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l
-; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h
-; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
+; GFX12-GISEL-TRUE16-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT: v_pk_add_f16 v1, 0x211e211e, v0
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v0.h
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1.l
; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v1.h
; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -765,17 +734,15 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX12-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
-; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0
+; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT: v_pk_add_f16 v1, 0x211e211e, v0
+; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
+; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2
; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 02ce8be125afc..dbcd9cd1f5df5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -113,7 +113,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f32_e64 v1, s2, s2
-; GFX10-NEXT: v_fma_f32 v2, s2, 2.0, s3
+; GFX10-NEXT: v_mad_f32 v2, s2, 2.0, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v0, v2, s[0:1] offset:4
@@ -172,7 +172,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2|
-; GFX10-NEXT: v_fma_f32 v2, |s2|, 2.0, s3
+; GFX10-NEXT: v_mad_f32 v2, |s2|, 2.0, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v0, v2, s[0:1] offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
index cf6732d30d080..a1bbe00635ed9 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -960,11 +960,12 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1)
; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-STD-NEXT: s_waitcnt vmcnt(0)
; SI-STD-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-STD-NEXT: v_mad_f32 v4, -v2, v3, -v4
-; SI-STD-NEXT: v_mad_f32 v2, v2, v3, -v5
-; SI-STD-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64
+; SI-STD-NEXT: v_mul_f32_e32 v6, v2, v3
+; SI-STD-NEXT: v_mad_f32 v2, -v2, v3, -v4
+; SI-STD-NEXT: v_sub_f32_e32 v3, v6, v5
+; SI-STD-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-STD-NEXT: s_waitcnt vmcnt(0)
-; SI-STD-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-STD-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; SI-STD-NEXT: s_waitcnt vmcnt(0)
; SI-STD-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/PowerPC/fma-aggr-FMF.ll b/llvm/test/CodeGen/PowerPC/fma-aggr-FMF.ll
index f7ea279d06e98..323dc98ea375d 100644
--- a/llvm/test/CodeGen/PowerPC/fma-aggr-FMF.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-aggr-FMF.ll
@@ -22,10 +22,10 @@ define float @can_fma_with_fewer_uses(float %f1, float %f2, float %f3, float %f4
define float @no_fma_with_fewer_uses(float %f1, float %f2, float %f3, float %f4) {
; CHECK-LABEL: no_fma_with_fewer_uses:
; CHECK: # %bb.0:
-; CHECK-NEXT: xsmulsp 3, 3, 4
; CHECK-NEXT: xsmulsp 0, 1, 2
-; CHECK-NEXT: xsmaddasp 3, 1, 2
-; CHECK-NEXT: xsdivsp 1, 0, 3
+; CHECK-NEXT: xsmulsp 1, 3, 4
+; CHECK-NEXT: xsaddsp 1, 0, 1
+; CHECK-NEXT: xsdivsp 1, 0, 1
; CHECK-NEXT: blr
%mul1 = fmul contract float %f1, %f2
%mul2 = fmul float %f3, %f4
diff --git a/llvm/test/CodeGen/PowerPC/fma-precision.ll b/llvm/test/CodeGen/PowerPC/fma-precision.ll
index 762d2336e2932..27c2d2f9f9e48 100644
--- a/llvm/test/CodeGen/PowerPC/fma-precision.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-precision.ll
@@ -101,12 +101,12 @@ entry:
define double @fma_multi_uses1(double %a, double %b, double %c, double %d, ptr %p1, ptr %p2, ptr %p3) {
; CHECK-LABEL: fma_multi_uses1:
; CHECK: # %bb.0:
-; CHECK-NEXT: xsmuldp 1, 1, 2
-; CHECK-NEXT: xsmuldp 0, 3, 4
-; CHECK-NEXT: stfd 1, 0(7)
-; CHECK-NEXT: stfd 1, 0(8)
-; CHECK-NEXT: xsnmsubadp 1, 3, 4
-; CHECK-NEXT: stfd 0, 0(9)
+; CHECK-NEXT: xsmuldp 0, 1, 2
+; CHECK-NEXT: xsmuldp 1, 3, 4
+; CHECK-NEXT: stfd 0, 0(7)
+; CHECK-NEXT: stfd 0, 0(8)
+; CHECK-NEXT: stfd 1, 0(9)
+; CHECK-NEXT: xssubdp 1, 0, 1
; CHECK-NEXT: blr
%ab = fmul contract reassoc double %a, %b
%cd = fmul contract reassoc double %c, %d
@@ -120,13 +120,12 @@ define double @fma_multi_uses1(double %a, double %b, double %c, double %d, ptr %
define double @fma_multi_uses2(double %a, double %b, double %c, double %d, ptr %p1, ptr %p2, ptr %p3) {
; CHECK-LABEL: fma_multi_uses2:
; CHECK: # %bb.0:
-; CHECK-NEXT: xsmuldp 5, 1, 2
-; CHECK-NEXT: xsmuldp 0, 3, 4
-; CHECK-NEXT: stfd 5, 0(7)
-; CHECK-NEXT: stfd 0, 0(8)
-; CHECK-NEXT: stfd 0, 0(9)
-; CHECK-NEXT: xsmsubadp 0, 1, 2
-; CHECK-NEXT: fmr 1, 0
+; CHECK-NEXT: xsmuldp 0, 1, 2
+; CHECK-NEXT: xsmuldp 1, 3, 4
+; CHECK-NEXT: stfd 0, 0(7)
+; CHECK-NEXT: stfd 1, 0(8)
+; CHECK-NEXT: stfd 1, 0(9)
+; CHECK-NEXT: xssubdp 1, 0, 1
; CHECK-NEXT: blr
%ab = fmul contract reassoc double %a, %b
%cd = fmul contract reassoc double %c, %d
More information about the llvm-branch-commits
mailing list