[llvm] [SDAG] Heed enableAggressiveFMAFusion when folding fma(a,b,c*d)+e (PR #94209)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 11 00:17:46 PDT 2024
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/94209
>From 41c4da1224f7f48c4de8421b4f12d516fe1e056c Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 31 May 2024 17:21:25 +0100
Subject: [PATCH 1/2] [SDAG] Heed enableAggressiveFMAFusion when folding
fma(a,b,c*d)+e
enableAggressiveFMAFusion is supposed to enable folding to fma even if
the intermediate nodes have multiple uses. Implement this for the fold:
fma(a,b,c*d)+e -> fma(a,b,fma(c,d,e))
and its generalization to longer chains of fmas.
Since the intermediate nodes can have multiple uses, we can no longer
modify them in place, so build a new chain of fmas instead.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 36 +-
.../test/CodeGen/AArch64/storepairsuppress.ll | 524 +++++++++---------
.../CodeGen/AMDGPU/dagcombine-fma-crash.ll | 24 +-
.../CodeGen/AMDGPU/dagcombine-fma-fmad.ll | 32 +-
.../CodeGen/AMDGPU/fadd-fma-fmul-combine.ll | 84 +--
llvm/test/CodeGen/PowerPC/fma-precision.ll | 16 +-
6 files changed, 361 insertions(+), 355 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5148b7258257f..d87f9a9a2977c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15881,29 +15881,29 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
bool CanReassociate =
Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
if (CanReassociate) {
- SDValue FMA, E;
- if (isFusedOp(N0) && N0.hasOneUse()) {
- FMA = N0;
+ SDValue Tmp, E;
+ if (isFusedOp(N0) && (Aggressive || N0.hasOneUse())) {
+ Tmp = N0;
E = N1;
- } else if (isFusedOp(N1) && N1.hasOneUse()) {
- FMA = N1;
+ } else if (isFusedOp(N1) && (Aggressive || N1.hasOneUse())) {
+ Tmp = N1;
E = N0;
}
- SDValue TmpFMA = FMA;
- while (E && isFusedOp(TmpFMA) && TmpFMA.hasOneUse()) {
- SDValue FMul = TmpFMA->getOperand(2);
- if (matcher.match(FMul, ISD::FMUL) && FMul.hasOneUse()) {
- SDValue C = FMul.getOperand(0);
- SDValue D = FMul.getOperand(1);
- SDValue CDE = matcher.getNode(PreferredFusedOpcode, SL, VT, C, D, E);
- DAG.ReplaceAllUsesOfValueWith(FMul, CDE);
- // Replacing the inner FMul could cause the outer FMA to be simplified
- // away.
- return FMA.getOpcode() == ISD::DELETED_NODE ? SDValue(N, 0) : FMA;
+ if (Tmp) {
+ SmallVector<SDNode *> FMAs;
+ do {
+ FMAs.push_back(Tmp.getNode());
+ Tmp = Tmp->getOperand(2);
+ } while (isFusedOp(Tmp) && (Aggressive || Tmp.hasOneUse()));
+ if (matcher.match(Tmp, ISD::FMUL) && (Aggressive || Tmp.hasOneUse())) {
+ Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT, Tmp.getOperand(0),
+ Tmp.getOperand(1), E);
+ for (SDNode *FMA : reverse(FMAs))
+ Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT,
+ FMA->getOperand(0), FMA->getOperand(1), Tmp);
+ return Tmp;
}
-
- TmpFMA = TmpFMA->getOperand(2);
}
}
diff --git a/llvm/test/CodeGen/AArch64/storepairsuppress.ll b/llvm/test/CodeGen/AArch64/storepairsuppress.ll
index 0571bbc278a6f..c2dc73b54a057 100644
--- a/llvm/test/CodeGen/AArch64/storepairsuppress.ll
+++ b/llvm/test/CodeGen/AArch64/storepairsuppress.ll
@@ -39,165 +39,165 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
; SUPPRESS-NEXT: ldp s2, s3, [x8]
; SUPPRESS-NEXT: ldp s4, s5, [x8, #8]
; SUPPRESS-NEXT: fmul s6, s5, s1
-; SUPPRESS-NEXT: fmul s1, s4, s1
-; SUPPRESS-NEXT: fnmsub s4, s4, s0, s6
-; SUPPRESS-NEXT: fmadd s0, s5, s0, s1
-; SUPPRESS-NEXT: fadd s1, s4, s2
-; SUPPRESS-NEXT: fadd s5, s0, s3
-; SUPPRESS-NEXT: stp s1, s5, [x8]
-; SUPPRESS-NEXT: fsub s2, s2, s4
-; SUPPRESS-NEXT: fsub s0, s3, s0
+; SUPPRESS-NEXT: fnmsub s6, s4, s0, s6
+; SUPPRESS-NEXT: fadd s7, s6, s2
+; SUPPRESS-NEXT: fmadd s16, s4, s1, s3
+; SUPPRESS-NEXT: fmadd s16, s5, s0, s16
+; SUPPRESS-NEXT: stp s7, s16, [x8]
+; SUPPRESS-NEXT: fsub s2, s2, s6
+; SUPPRESS-NEXT: fmsub s1, s4, s1, s3
+; SUPPRESS-NEXT: fmsub s0, s5, s0, s1
; SUPPRESS-NEXT: stp s2, s0, [x8, #8]
; SUPPRESS-NEXT: ldr x9, [x0, #8]
-; SUPPRESS-NEXT: ldp s3, s4, [x9]
-; SUPPRESS-NEXT: ldp s6, s7, [x8, #16]
-; SUPPRESS-NEXT: ldp s16, s17, [x8, #24]
-; SUPPRESS-NEXT: fmul s18, s17, s4
-; SUPPRESS-NEXT: fmul s4, s16, s4
-; SUPPRESS-NEXT: fnmsub s16, s16, s3, s18
-; SUPPRESS-NEXT: fmadd s3, s17, s3, s4
-; SUPPRESS-NEXT: fadd s4, s16, s6
-; SUPPRESS-NEXT: fadd s17, s3, s7
-; SUPPRESS-NEXT: stp s4, s17, [x8, #16]
-; SUPPRESS-NEXT: fsub s6, s6, s16
-; SUPPRESS-NEXT: fsub s3, s7, s3
-; SUPPRESS-NEXT: stp s6, s3, [x8, #24]
+; SUPPRESS-NEXT: ldp s1, s3, [x9]
+; SUPPRESS-NEXT: ldp s4, s5, [x8, #16]
+; SUPPRESS-NEXT: ldp s6, s17, [x8, #24]
+; SUPPRESS-NEXT: fmul s18, s17, s3
+; SUPPRESS-NEXT: fnmsub s18, s6, s1, s18
+; SUPPRESS-NEXT: fadd s19, s18, s4
+; SUPPRESS-NEXT: fmadd s20, s6, s3, s5
+; SUPPRESS-NEXT: fmadd s20, s17, s1, s20
+; SUPPRESS-NEXT: stp s19, s20, [x8, #16]
+; SUPPRESS-NEXT: fsub s4, s4, s18
+; SUPPRESS-NEXT: fmsub s3, s6, s3, s5
+; SUPPRESS-NEXT: fmsub s1, s17, s1, s3
+; SUPPRESS-NEXT: stp s4, s1, [x8, #24]
; SUPPRESS-NEXT: ldr x9, [x0, #8]
-; SUPPRESS-NEXT: ldp s7, s16, [x9]
-; SUPPRESS-NEXT: fmul s18, s16, s17
-; SUPPRESS-NEXT: fmul s17, s7, s17
-; SUPPRESS-NEXT: fnmsub s7, s7, s4, s18
-; SUPPRESS-NEXT: fmadd s4, s16, s4, s17
-; SUPPRESS-NEXT: fadd s16, s7, s1
-; SUPPRESS-NEXT: fadd s17, s4, s5
-; SUPPRESS-NEXT: stp s16, s17, [x8]
-; SUPPRESS-NEXT: fsub s1, s1, s7
-; SUPPRESS-NEXT: fsub s4, s5, s4
-; SUPPRESS-NEXT: stp s1, s4, [x8, #16]
+; SUPPRESS-NEXT: ldp s3, s5, [x9]
+; SUPPRESS-NEXT: fmul s6, s5, s20
+; SUPPRESS-NEXT: fnmsub s6, s3, s19, s6
+; SUPPRESS-NEXT: fadd s17, s6, s7
+; SUPPRESS-NEXT: fmadd s18, s3, s20, s16
+; SUPPRESS-NEXT: fmadd s18, s5, s19, s18
+; SUPPRESS-NEXT: stp s17, s18, [x8]
+; SUPPRESS-NEXT: fsub s6, s7, s6
+; SUPPRESS-NEXT: fmsub s3, s3, s20, s16
+; SUPPRESS-NEXT: fmsub s3, s5, s19, s3
+; SUPPRESS-NEXT: stp s6, s3, [x8, #16]
; SUPPRESS-NEXT: ldr x10, [x0, #8]
; SUPPRESS-NEXT: lsl x9, x3, #4
; SUPPRESS-NEXT: add x10, x10, x9
-; SUPPRESS-NEXT: ldp s1, s4, [x10]
-; SUPPRESS-NEXT: fmul s5, s4, s3
-; SUPPRESS-NEXT: fmul s3, s1, s3
-; SUPPRESS-NEXT: fnmsub s1, s1, s6, s5
-; SUPPRESS-NEXT: fmadd s3, s4, s6, s3
-; SUPPRESS-NEXT: fadd s4, s1, s2
-; SUPPRESS-NEXT: fadd s5, s3, s0
-; SUPPRESS-NEXT: stp s4, s5, [x8, #8]
-; SUPPRESS-NEXT: fsub s1, s2, s1
-; SUPPRESS-NEXT: fsub s0, s0, s3
-; SUPPRESS-NEXT: stp s1, s0, [x8, #24]
+; SUPPRESS-NEXT: ldp s3, s5, [x10]
+; SUPPRESS-NEXT: fmul s6, s5, s1
+; SUPPRESS-NEXT: fnmsub s6, s3, s4, s6
+; SUPPRESS-NEXT: fadd s7, s6, s2
+; SUPPRESS-NEXT: fmadd s16, s3, s1, s0
+; SUPPRESS-NEXT: fmadd s16, s5, s4, s16
+; SUPPRESS-NEXT: stp s7, s16, [x8, #8]
+; SUPPRESS-NEXT: fsub s2, s2, s6
+; SUPPRESS-NEXT: fmsub s0, s3, s1, s0
+; SUPPRESS-NEXT: fmsub s0, s5, s4, s0
+; SUPPRESS-NEXT: stp s2, s0, [x8, #24]
; SUPPRESS-NEXT: ldr x10, [x0, #8]
; SUPPRESS-NEXT: ldp s0, s1, [x10]
; SUPPRESS-NEXT: ldp s2, s3, [x8, #32]
; SUPPRESS-NEXT: ldp s4, s5, [x8, #40]
; SUPPRESS-NEXT: fmul s6, s5, s1
-; SUPPRESS-NEXT: fmul s1, s4, s1
-; SUPPRESS-NEXT: fnmsub s4, s4, s0, s6
-; SUPPRESS-NEXT: fmadd s0, s5, s0, s1
-; SUPPRESS-NEXT: fadd s1, s4, s2
-; SUPPRESS-NEXT: fadd s5, s0, s3
-; SUPPRESS-NEXT: stp s1, s5, [x8, #32]
-; SUPPRESS-NEXT: fsub s2, s2, s4
-; SUPPRESS-NEXT: fsub s3, s3, s0
-; SUPPRESS-NEXT: stp s2, s3, [x8, #40]
+; SUPPRESS-NEXT: fnmsub s6, s4, s0, s6
+; SUPPRESS-NEXT: fadd s7, s6, s2
+; SUPPRESS-NEXT: fmadd s16, s4, s1, s3
+; SUPPRESS-NEXT: fmadd s16, s5, s0, s16
+; SUPPRESS-NEXT: stp s7, s16, [x8, #32]
+; SUPPRESS-NEXT: fsub s6, s2, s6
+; SUPPRESS-NEXT: fmsub s1, s4, s1, s3
+; SUPPRESS-NEXT: fmsub s1, s5, s0, s1
+; SUPPRESS-NEXT: stp s6, s1, [x8, #40]
; SUPPRESS-NEXT: ldr x10, [x0, #8]
-; SUPPRESS-NEXT: ldp s0, s4, [x10]
-; SUPPRESS-NEXT: ldp s6, s7, [x8, #48]
-; SUPPRESS-NEXT: ldp s16, s17, [x8, #56]
-; SUPPRESS-NEXT: fmul s18, s17, s4
-; SUPPRESS-NEXT: fmul s4, s16, s4
-; SUPPRESS-NEXT: fnmsub s16, s16, s0, s18
-; SUPPRESS-NEXT: fmadd s0, s17, s0, s4
-; SUPPRESS-NEXT: fadd s4, s16, s6
-; SUPPRESS-NEXT: fadd s17, s0, s7
-; SUPPRESS-NEXT: stp s4, s17, [x8, #48]
-; SUPPRESS-NEXT: fsub s6, s6, s16
-; SUPPRESS-NEXT: fsub s0, s7, s0
-; SUPPRESS-NEXT: stp s6, s0, [x8, #56]
+; SUPPRESS-NEXT: ldp s0, s2, [x10]
+; SUPPRESS-NEXT: ldp s3, s4, [x8, #48]
+; SUPPRESS-NEXT: ldp s5, s17, [x8, #56]
+; SUPPRESS-NEXT: fmul s18, s17, s2
+; SUPPRESS-NEXT: fnmsub s18, s5, s0, s18
+; SUPPRESS-NEXT: fadd s19, s18, s3
+; SUPPRESS-NEXT: fmadd s20, s5, s2, s4
+; SUPPRESS-NEXT: fmadd s20, s17, s0, s20
+; SUPPRESS-NEXT: stp s19, s20, [x8, #48]
+; SUPPRESS-NEXT: fsub s18, s3, s18
+; SUPPRESS-NEXT: fmsub s2, s5, s2, s4
+; SUPPRESS-NEXT: fmsub s4, s17, s0, s2
+; SUPPRESS-NEXT: stp s18, s4, [x8, #56]
; SUPPRESS-NEXT: ldr x10, [x0, #8]
-; SUPPRESS-NEXT: ldp s7, s16, [x10]
-; SUPPRESS-NEXT: fmul s18, s16, s17
-; SUPPRESS-NEXT: fmul s17, s7, s17
-; SUPPRESS-NEXT: fnmsub s7, s7, s4, s18
-; SUPPRESS-NEXT: fmadd s4, s16, s4, s17
-; SUPPRESS-NEXT: fadd s16, s7, s1
-; SUPPRESS-NEXT: fadd s17, s4, s5
-; SUPPRESS-NEXT: stp s16, s17, [x8, #32]
-; SUPPRESS-NEXT: fsub s7, s1, s7
-; SUPPRESS-NEXT: fsub s4, s5, s4
-; SUPPRESS-NEXT: stp s7, s4, [x8, #48]
+; SUPPRESS-NEXT: ldp s0, s3, [x10]
+; SUPPRESS-NEXT: fmul s2, s3, s20
+; SUPPRESS-NEXT: fnmsub s2, s0, s19, s2
+; SUPPRESS-NEXT: fadd s5, s2, s7
+; SUPPRESS-NEXT: fmadd s17, s0, s20, s16
+; SUPPRESS-NEXT: fmadd s17, s3, s19, s17
+; SUPPRESS-NEXT: stp s5, s17, [x8, #32]
+; SUPPRESS-NEXT: fsub s2, s7, s2
+; SUPPRESS-NEXT: fmsub s0, s0, s20, s16
+; SUPPRESS-NEXT: fmsub s3, s3, s19, s0
+; SUPPRESS-NEXT: stp s2, s3, [x8, #48]
; SUPPRESS-NEXT: ldr x10, [x0, #8]
; SUPPRESS-NEXT: add x9, x10, x9
-; SUPPRESS-NEXT: ldp s1, s5, [x9]
-; SUPPRESS-NEXT: fmul s18, s5, s0
-; SUPPRESS-NEXT: fmul s0, s1, s0
-; SUPPRESS-NEXT: fnmsub s1, s1, s6, s18
-; SUPPRESS-NEXT: fmadd s5, s5, s6, s0
-; SUPPRESS-NEXT: fadd s6, s1, s2
-; SUPPRESS-NEXT: fadd s18, s5, s3
-; SUPPRESS-NEXT: stp s6, s18, [x8, #40]
-; SUPPRESS-NEXT: fsub s0, s2, s1
-; SUPPRESS-NEXT: fsub s1, s3, s5
+; SUPPRESS-NEXT: ldp s7, s16, [x9]
+; SUPPRESS-NEXT: fmul s0, s16, s4
+; SUPPRESS-NEXT: fnmsub s0, s7, s18, s0
+; SUPPRESS-NEXT: fadd s19, s0, s6
+; SUPPRESS-NEXT: fmadd s20, s7, s4, s1
+; SUPPRESS-NEXT: fmadd s20, s16, s18, s20
+; SUPPRESS-NEXT: stp s19, s20, [x8, #40]
+; SUPPRESS-NEXT: fsub s0, s6, s0
+; SUPPRESS-NEXT: fmsub s1, s7, s4, s1
+; SUPPRESS-NEXT: fmsub s1, s16, s18, s1
; SUPPRESS-NEXT: stp s0, s1, [x8, #56]
; SUPPRESS-NEXT: ldr x9, [x0, #8]
-; SUPPRESS-NEXT: ldp s2, s3, [x9]
-; SUPPRESS-NEXT: ldp s5, s19, [x8]
-; SUPPRESS-NEXT: fmul s20, s17, s3
-; SUPPRESS-NEXT: fmul s3, s16, s3
-; SUPPRESS-NEXT: fnmsub s16, s16, s2, s20
-; SUPPRESS-NEXT: fmadd s2, s17, s2, s3
-; SUPPRESS-NEXT: fadd s3, s16, s5
-; SUPPRESS-NEXT: fadd s17, s2, s19
-; SUPPRESS-NEXT: stp s3, s17, [x8]
-; SUPPRESS-NEXT: fsub s3, s5, s16
-; SUPPRESS-NEXT: fsub s2, s19, s2
-; SUPPRESS-NEXT: stp s3, s2, [x8, #32]
+; SUPPRESS-NEXT: ldp s4, s6, [x9]
+; SUPPRESS-NEXT: ldp s7, s16, [x8]
+; SUPPRESS-NEXT: fmul s18, s17, s6
+; SUPPRESS-NEXT: fnmsub s18, s5, s4, s18
+; SUPPRESS-NEXT: fadd s21, s18, s7
+; SUPPRESS-NEXT: fmadd s22, s5, s6, s16
+; SUPPRESS-NEXT: fmadd s22, s17, s4, s22
+; SUPPRESS-NEXT: stp s21, s22, [x8]
+; SUPPRESS-NEXT: fsub s7, s7, s18
+; SUPPRESS-NEXT: fmsub s5, s5, s6, s16
+; SUPPRESS-NEXT: fmsub s4, s17, s4, s5
+; SUPPRESS-NEXT: stp s7, s4, [x8, #32]
; SUPPRESS-NEXT: ldr x9, [x0, #8]
; SUPPRESS-NEXT: add x9, x9, w3, sxtw #3
-; SUPPRESS-NEXT: ldp s2, s3, [x9]
-; SUPPRESS-NEXT: ldp s5, s16, [x8, #8]
-; SUPPRESS-NEXT: fmul s17, s18, s3
-; SUPPRESS-NEXT: fmul s3, s6, s3
-; SUPPRESS-NEXT: fnmsub s6, s6, s2, s17
-; SUPPRESS-NEXT: fmadd s2, s18, s2, s3
-; SUPPRESS-NEXT: fadd s3, s6, s5
-; SUPPRESS-NEXT: fadd s17, s2, s16
-; SUPPRESS-NEXT: stp s3, s17, [x8, #8]
-; SUPPRESS-NEXT: fsub s3, s5, s6
-; SUPPRESS-NEXT: fsub s2, s16, s2
-; SUPPRESS-NEXT: stp s3, s2, [x8, #40]
+; SUPPRESS-NEXT: ldp s4, s5, [x9]
+; SUPPRESS-NEXT: ldp s6, s7, [x8, #8]
+; SUPPRESS-NEXT: fmul s16, s20, s5
+; SUPPRESS-NEXT: fnmsub s16, s19, s4, s16
+; SUPPRESS-NEXT: fadd s17, s16, s6
+; SUPPRESS-NEXT: fmadd s18, s19, s5, s7
+; SUPPRESS-NEXT: fmadd s18, s20, s4, s18
+; SUPPRESS-NEXT: stp s17, s18, [x8, #8]
+; SUPPRESS-NEXT: fsub s6, s6, s16
+; SUPPRESS-NEXT: fmsub s5, s19, s5, s7
+; SUPPRESS-NEXT: fmsub s4, s20, s4, s5
+; SUPPRESS-NEXT: stp s6, s4, [x8, #40]
; SUPPRESS-NEXT: lsl x9, x3, #33
; SUPPRESS-NEXT: ldr x10, [x0, #8]
; SUPPRESS-NEXT: add x9, x10, x9, asr #29
-; SUPPRESS-NEXT: ldp s2, s3, [x9]
-; SUPPRESS-NEXT: ldp s5, s6, [x8, #16]
-; SUPPRESS-NEXT: fmul s16, s4, s3
-; SUPPRESS-NEXT: fmul s3, s7, s3
-; SUPPRESS-NEXT: fnmsub s7, s7, s2, s16
-; SUPPRESS-NEXT: fmadd s2, s4, s2, s3
-; SUPPRESS-NEXT: fadd s3, s7, s5
-; SUPPRESS-NEXT: fadd s4, s2, s6
-; SUPPRESS-NEXT: stp s3, s4, [x8, #16]
-; SUPPRESS-NEXT: fsub s3, s5, s7
-; SUPPRESS-NEXT: fsub s2, s6, s2
-; SUPPRESS-NEXT: stp s3, s2, [x8, #48]
+; SUPPRESS-NEXT: ldp s4, s5, [x9]
+; SUPPRESS-NEXT: ldp s6, s7, [x8, #16]
+; SUPPRESS-NEXT: fmul s16, s3, s5
+; SUPPRESS-NEXT: fnmsub s16, s2, s4, s16
+; SUPPRESS-NEXT: fadd s17, s16, s6
+; SUPPRESS-NEXT: fmadd s18, s2, s5, s7
+; SUPPRESS-NEXT: fmadd s18, s3, s4, s18
+; SUPPRESS-NEXT: stp s17, s18, [x8, #16]
+; SUPPRESS-NEXT: fsub s6, s6, s16
+; SUPPRESS-NEXT: fmsub s2, s2, s5, s7
+; SUPPRESS-NEXT: fmsub s2, s3, s4, s2
+; SUPPRESS-NEXT: stp s6, s2, [x8, #48]
; SUPPRESS-NEXT: add w9, w3, w3, lsl #1
; SUPPRESS-NEXT: ldr x10, [x0, #8]
; SUPPRESS-NEXT: add x9, x10, w9, sxtw #3
; SUPPRESS-NEXT: ldp s2, s3, [x9]
; SUPPRESS-NEXT: ldp s4, s5, [x8, #24]
; SUPPRESS-NEXT: fmul s6, s1, s3
-; SUPPRESS-NEXT: fmul s3, s0, s3
-; SUPPRESS-NEXT: fnmsub s0, s0, s2, s6
-; SUPPRESS-NEXT: fmadd s1, s1, s2, s3
-; SUPPRESS-NEXT: fadd s2, s0, s4
-; SUPPRESS-NEXT: fadd s3, s1, s5
-; SUPPRESS-NEXT: stp s2, s3, [x8, #24]
-; SUPPRESS-NEXT: fsub s0, s4, s0
-; SUPPRESS-NEXT: fsub s1, s5, s1
-; SUPPRESS-NEXT: stp s0, s1, [x8, #56]
+; SUPPRESS-NEXT: fnmsub s6, s0, s2, s6
+; SUPPRESS-NEXT: fadd s7, s6, s4
+; SUPPRESS-NEXT: fmadd s16, s0, s3, s5
+; SUPPRESS-NEXT: fmadd s16, s1, s2, s16
+; SUPPRESS-NEXT: stp s7, s16, [x8, #24]
+; SUPPRESS-NEXT: fsub s4, s4, s6
+; SUPPRESS-NEXT: fmsub s0, s0, s3, s5
+; SUPPRESS-NEXT: fmsub s0, s1, s2, s0
+; SUPPRESS-NEXT: stp s4, s0, [x8, #56]
; SUPPRESS-NEXT: ret
;
; NOSUPPRESS-LABEL: load_store_units_critical:
@@ -208,165 +208,165 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
; NOSUPPRESS-NEXT: ldp s2, s3, [x8]
; NOSUPPRESS-NEXT: ldp s4, s5, [x8, #8]
; NOSUPPRESS-NEXT: fmul s6, s5, s1
-; NOSUPPRESS-NEXT: fmul s1, s4, s1
-; NOSUPPRESS-NEXT: fnmsub s4, s4, s0, s6
-; NOSUPPRESS-NEXT: fmadd s0, s5, s0, s1
-; NOSUPPRESS-NEXT: fadd s1, s4, s2
-; NOSUPPRESS-NEXT: fadd s5, s0, s3
-; NOSUPPRESS-NEXT: stp s1, s5, [x8]
-; NOSUPPRESS-NEXT: fsub s2, s2, s4
-; NOSUPPRESS-NEXT: fsub s0, s3, s0
+; NOSUPPRESS-NEXT: fnmsub s6, s4, s0, s6
+; NOSUPPRESS-NEXT: fadd s7, s6, s2
+; NOSUPPRESS-NEXT: fmadd s16, s4, s1, s3
+; NOSUPPRESS-NEXT: fmadd s16, s5, s0, s16
+; NOSUPPRESS-NEXT: stp s7, s16, [x8]
+; NOSUPPRESS-NEXT: fsub s2, s2, s6
+; NOSUPPRESS-NEXT: fmsub s1, s4, s1, s3
+; NOSUPPRESS-NEXT: fmsub s0, s5, s0, s1
; NOSUPPRESS-NEXT: stp s2, s0, [x8, #8]
; NOSUPPRESS-NEXT: ldr x9, [x0, #8]
-; NOSUPPRESS-NEXT: ldp s3, s4, [x9]
-; NOSUPPRESS-NEXT: ldp s6, s7, [x8, #16]
-; NOSUPPRESS-NEXT: ldp s16, s17, [x8, #24]
-; NOSUPPRESS-NEXT: fmul s18, s17, s4
-; NOSUPPRESS-NEXT: fmul s4, s16, s4
-; NOSUPPRESS-NEXT: fnmsub s16, s16, s3, s18
-; NOSUPPRESS-NEXT: fmadd s3, s17, s3, s4
-; NOSUPPRESS-NEXT: fadd s4, s16, s6
-; NOSUPPRESS-NEXT: fadd s17, s3, s7
-; NOSUPPRESS-NEXT: stp s4, s17, [x8, #16]
-; NOSUPPRESS-NEXT: fsub s6, s6, s16
-; NOSUPPRESS-NEXT: fsub s3, s7, s3
-; NOSUPPRESS-NEXT: stp s6, s3, [x8, #24]
+; NOSUPPRESS-NEXT: ldp s1, s3, [x9]
+; NOSUPPRESS-NEXT: ldp s4, s5, [x8, #16]
+; NOSUPPRESS-NEXT: ldp s6, s17, [x8, #24]
+; NOSUPPRESS-NEXT: fmul s18, s17, s3
+; NOSUPPRESS-NEXT: fnmsub s18, s6, s1, s18
+; NOSUPPRESS-NEXT: fadd s19, s18, s4
+; NOSUPPRESS-NEXT: fmadd s20, s6, s3, s5
+; NOSUPPRESS-NEXT: fmadd s20, s17, s1, s20
+; NOSUPPRESS-NEXT: stp s19, s20, [x8, #16]
+; NOSUPPRESS-NEXT: fsub s4, s4, s18
+; NOSUPPRESS-NEXT: fmsub s3, s6, s3, s5
+; NOSUPPRESS-NEXT: fmsub s1, s17, s1, s3
+; NOSUPPRESS-NEXT: stp s4, s1, [x8, #24]
; NOSUPPRESS-NEXT: ldr x9, [x0, #8]
-; NOSUPPRESS-NEXT: ldp s7, s16, [x9]
-; NOSUPPRESS-NEXT: fmul s18, s16, s17
-; NOSUPPRESS-NEXT: fmul s17, s7, s17
-; NOSUPPRESS-NEXT: fnmsub s7, s7, s4, s18
-; NOSUPPRESS-NEXT: fmadd s4, s16, s4, s17
-; NOSUPPRESS-NEXT: fadd s16, s7, s1
-; NOSUPPRESS-NEXT: fadd s17, s4, s5
-; NOSUPPRESS-NEXT: stp s16, s17, [x8]
-; NOSUPPRESS-NEXT: fsub s1, s1, s7
-; NOSUPPRESS-NEXT: fsub s4, s5, s4
-; NOSUPPRESS-NEXT: stp s1, s4, [x8, #16]
+; NOSUPPRESS-NEXT: ldp s3, s5, [x9]
+; NOSUPPRESS-NEXT: fmul s6, s5, s20
+; NOSUPPRESS-NEXT: fnmsub s6, s3, s19, s6
+; NOSUPPRESS-NEXT: fadd s17, s6, s7
+; NOSUPPRESS-NEXT: fmadd s18, s3, s20, s16
+; NOSUPPRESS-NEXT: fmadd s18, s5, s19, s18
+; NOSUPPRESS-NEXT: stp s17, s18, [x8]
+; NOSUPPRESS-NEXT: fsub s6, s7, s6
+; NOSUPPRESS-NEXT: fmsub s3, s3, s20, s16
+; NOSUPPRESS-NEXT: fmsub s3, s5, s19, s3
+; NOSUPPRESS-NEXT: stp s6, s3, [x8, #16]
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
; NOSUPPRESS-NEXT: lsl x9, x3, #4
; NOSUPPRESS-NEXT: add x10, x10, x9
-; NOSUPPRESS-NEXT: ldp s1, s4, [x10]
-; NOSUPPRESS-NEXT: fmul s5, s4, s3
-; NOSUPPRESS-NEXT: fmul s3, s1, s3
-; NOSUPPRESS-NEXT: fnmsub s1, s1, s6, s5
-; NOSUPPRESS-NEXT: fmadd s3, s4, s6, s3
-; NOSUPPRESS-NEXT: fadd s4, s1, s2
-; NOSUPPRESS-NEXT: fadd s5, s3, s0
-; NOSUPPRESS-NEXT: stp s4, s5, [x8, #8]
-; NOSUPPRESS-NEXT: fsub s1, s2, s1
-; NOSUPPRESS-NEXT: fsub s0, s0, s3
-; NOSUPPRESS-NEXT: stp s1, s0, [x8, #24]
+; NOSUPPRESS-NEXT: ldp s3, s5, [x10]
+; NOSUPPRESS-NEXT: fmul s6, s5, s1
+; NOSUPPRESS-NEXT: fnmsub s6, s3, s4, s6
+; NOSUPPRESS-NEXT: fadd s7, s6, s2
+; NOSUPPRESS-NEXT: fmadd s16, s3, s1, s0
+; NOSUPPRESS-NEXT: fmadd s16, s5, s4, s16
+; NOSUPPRESS-NEXT: stp s7, s16, [x8, #8]
+; NOSUPPRESS-NEXT: fsub s2, s2, s6
+; NOSUPPRESS-NEXT: fmsub s0, s3, s1, s0
+; NOSUPPRESS-NEXT: fmsub s0, s5, s4, s0
+; NOSUPPRESS-NEXT: stp s2, s0, [x8, #24]
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
; NOSUPPRESS-NEXT: ldp s0, s1, [x10]
; NOSUPPRESS-NEXT: ldp s2, s3, [x8, #32]
; NOSUPPRESS-NEXT: ldp s4, s5, [x8, #40]
; NOSUPPRESS-NEXT: fmul s6, s5, s1
-; NOSUPPRESS-NEXT: fmul s1, s4, s1
-; NOSUPPRESS-NEXT: fnmsub s4, s4, s0, s6
-; NOSUPPRESS-NEXT: fmadd s0, s5, s0, s1
-; NOSUPPRESS-NEXT: fadd s1, s4, s2
-; NOSUPPRESS-NEXT: fadd s5, s0, s3
-; NOSUPPRESS-NEXT: stp s1, s5, [x8, #32]
-; NOSUPPRESS-NEXT: fsub s2, s2, s4
-; NOSUPPRESS-NEXT: fsub s3, s3, s0
-; NOSUPPRESS-NEXT: stp s2, s3, [x8, #40]
+; NOSUPPRESS-NEXT: fnmsub s6, s4, s0, s6
+; NOSUPPRESS-NEXT: fadd s7, s6, s2
+; NOSUPPRESS-NEXT: fmadd s16, s4, s1, s3
+; NOSUPPRESS-NEXT: fmadd s16, s5, s0, s16
+; NOSUPPRESS-NEXT: stp s7, s16, [x8, #32]
+; NOSUPPRESS-NEXT: fsub s6, s2, s6
+; NOSUPPRESS-NEXT: fmsub s1, s4, s1, s3
+; NOSUPPRESS-NEXT: fmsub s1, s5, s0, s1
+; NOSUPPRESS-NEXT: stp s6, s1, [x8, #40]
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
-; NOSUPPRESS-NEXT: ldp s0, s4, [x10]
-; NOSUPPRESS-NEXT: ldp s6, s7, [x8, #48]
-; NOSUPPRESS-NEXT: ldp s16, s17, [x8, #56]
-; NOSUPPRESS-NEXT: fmul s18, s17, s4
-; NOSUPPRESS-NEXT: fmul s4, s16, s4
-; NOSUPPRESS-NEXT: fnmsub s16, s16, s0, s18
-; NOSUPPRESS-NEXT: fmadd s0, s17, s0, s4
-; NOSUPPRESS-NEXT: fadd s4, s16, s6
-; NOSUPPRESS-NEXT: fadd s17, s0, s7
-; NOSUPPRESS-NEXT: stp s4, s17, [x8, #48]
-; NOSUPPRESS-NEXT: fsub s6, s6, s16
-; NOSUPPRESS-NEXT: fsub s0, s7, s0
-; NOSUPPRESS-NEXT: stp s6, s0, [x8, #56]
+; NOSUPPRESS-NEXT: ldp s0, s2, [x10]
+; NOSUPPRESS-NEXT: ldp s3, s4, [x8, #48]
+; NOSUPPRESS-NEXT: ldp s5, s17, [x8, #56]
+; NOSUPPRESS-NEXT: fmul s18, s17, s2
+; NOSUPPRESS-NEXT: fnmsub s18, s5, s0, s18
+; NOSUPPRESS-NEXT: fadd s19, s18, s3
+; NOSUPPRESS-NEXT: fmadd s20, s5, s2, s4
+; NOSUPPRESS-NEXT: fmadd s20, s17, s0, s20
+; NOSUPPRESS-NEXT: stp s19, s20, [x8, #48]
+; NOSUPPRESS-NEXT: fsub s18, s3, s18
+; NOSUPPRESS-NEXT: fmsub s2, s5, s2, s4
+; NOSUPPRESS-NEXT: fmsub s4, s17, s0, s2
+; NOSUPPRESS-NEXT: stp s18, s4, [x8, #56]
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
-; NOSUPPRESS-NEXT: ldp s7, s16, [x10]
-; NOSUPPRESS-NEXT: fmul s18, s16, s17
-; NOSUPPRESS-NEXT: fmul s17, s7, s17
-; NOSUPPRESS-NEXT: fnmsub s7, s7, s4, s18
-; NOSUPPRESS-NEXT: fmadd s4, s16, s4, s17
-; NOSUPPRESS-NEXT: fadd s16, s7, s1
-; NOSUPPRESS-NEXT: fadd s17, s4, s5
-; NOSUPPRESS-NEXT: stp s16, s17, [x8, #32]
-; NOSUPPRESS-NEXT: fsub s7, s1, s7
-; NOSUPPRESS-NEXT: fsub s4, s5, s4
-; NOSUPPRESS-NEXT: stp s7, s4, [x8, #48]
+; NOSUPPRESS-NEXT: ldp s0, s3, [x10]
+; NOSUPPRESS-NEXT: fmul s2, s3, s20
+; NOSUPPRESS-NEXT: fnmsub s2, s0, s19, s2
+; NOSUPPRESS-NEXT: fadd s5, s2, s7
+; NOSUPPRESS-NEXT: fmadd s17, s0, s20, s16
+; NOSUPPRESS-NEXT: fmadd s17, s3, s19, s17
+; NOSUPPRESS-NEXT: stp s5, s17, [x8, #32]
+; NOSUPPRESS-NEXT: fsub s2, s7, s2
+; NOSUPPRESS-NEXT: fmsub s0, s0, s20, s16
+; NOSUPPRESS-NEXT: fmsub s3, s3, s19, s0
+; NOSUPPRESS-NEXT: stp s2, s3, [x8, #48]
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
; NOSUPPRESS-NEXT: add x9, x10, x9
-; NOSUPPRESS-NEXT: ldp s1, s5, [x9]
-; NOSUPPRESS-NEXT: fmul s18, s5, s0
-; NOSUPPRESS-NEXT: fmul s0, s1, s0
-; NOSUPPRESS-NEXT: fnmsub s1, s1, s6, s18
-; NOSUPPRESS-NEXT: fmadd s5, s5, s6, s0
-; NOSUPPRESS-NEXT: fadd s6, s1, s2
-; NOSUPPRESS-NEXT: fadd s18, s5, s3
-; NOSUPPRESS-NEXT: stp s6, s18, [x8, #40]
-; NOSUPPRESS-NEXT: fsub s0, s2, s1
-; NOSUPPRESS-NEXT: fsub s1, s3, s5
+; NOSUPPRESS-NEXT: ldp s7, s16, [x9]
+; NOSUPPRESS-NEXT: fmul s0, s16, s4
+; NOSUPPRESS-NEXT: fnmsub s0, s7, s18, s0
+; NOSUPPRESS-NEXT: fadd s19, s0, s6
+; NOSUPPRESS-NEXT: fmadd s20, s7, s4, s1
+; NOSUPPRESS-NEXT: fmadd s20, s16, s18, s20
+; NOSUPPRESS-NEXT: stp s19, s20, [x8, #40]
+; NOSUPPRESS-NEXT: fsub s0, s6, s0
+; NOSUPPRESS-NEXT: fmsub s1, s7, s4, s1
+; NOSUPPRESS-NEXT: fmsub s1, s16, s18, s1
; NOSUPPRESS-NEXT: stp s0, s1, [x8, #56]
; NOSUPPRESS-NEXT: ldr x9, [x0, #8]
-; NOSUPPRESS-NEXT: ldp s2, s3, [x9]
-; NOSUPPRESS-NEXT: ldp s5, s19, [x8]
-; NOSUPPRESS-NEXT: fmul s20, s17, s3
-; NOSUPPRESS-NEXT: fmul s3, s16, s3
-; NOSUPPRESS-NEXT: fnmsub s16, s16, s2, s20
-; NOSUPPRESS-NEXT: fmadd s2, s17, s2, s3
-; NOSUPPRESS-NEXT: fadd s3, s16, s5
-; NOSUPPRESS-NEXT: fadd s17, s2, s19
-; NOSUPPRESS-NEXT: stp s3, s17, [x8]
-; NOSUPPRESS-NEXT: fsub s3, s5, s16
-; NOSUPPRESS-NEXT: fsub s2, s19, s2
-; NOSUPPRESS-NEXT: stp s3, s2, [x8, #32]
+; NOSUPPRESS-NEXT: ldp s4, s6, [x9]
+; NOSUPPRESS-NEXT: ldp s7, s16, [x8]
+; NOSUPPRESS-NEXT: fmul s18, s17, s6
+; NOSUPPRESS-NEXT: fnmsub s18, s5, s4, s18
+; NOSUPPRESS-NEXT: fadd s21, s18, s7
+; NOSUPPRESS-NEXT: fmadd s22, s5, s6, s16
+; NOSUPPRESS-NEXT: fmadd s22, s17, s4, s22
+; NOSUPPRESS-NEXT: stp s21, s22, [x8]
+; NOSUPPRESS-NEXT: fsub s7, s7, s18
+; NOSUPPRESS-NEXT: fmsub s5, s5, s6, s16
+; NOSUPPRESS-NEXT: fmsub s4, s17, s4, s5
+; NOSUPPRESS-NEXT: stp s7, s4, [x8, #32]
; NOSUPPRESS-NEXT: ldr x9, [x0, #8]
; NOSUPPRESS-NEXT: add x9, x9, w3, sxtw #3
-; NOSUPPRESS-NEXT: ldp s2, s3, [x9]
-; NOSUPPRESS-NEXT: ldp s5, s16, [x8, #8]
-; NOSUPPRESS-NEXT: fmul s17, s18, s3
-; NOSUPPRESS-NEXT: fmul s3, s6, s3
-; NOSUPPRESS-NEXT: fnmsub s6, s6, s2, s17
-; NOSUPPRESS-NEXT: fmadd s2, s18, s2, s3
-; NOSUPPRESS-NEXT: fadd s3, s6, s5
-; NOSUPPRESS-NEXT: fadd s17, s2, s16
-; NOSUPPRESS-NEXT: stp s3, s17, [x8, #8]
-; NOSUPPRESS-NEXT: fsub s3, s5, s6
-; NOSUPPRESS-NEXT: fsub s2, s16, s2
-; NOSUPPRESS-NEXT: stp s3, s2, [x8, #40]
+; NOSUPPRESS-NEXT: ldp s4, s5, [x9]
+; NOSUPPRESS-NEXT: ldp s6, s7, [x8, #8]
+; NOSUPPRESS-NEXT: fmul s16, s20, s5
+; NOSUPPRESS-NEXT: fnmsub s16, s19, s4, s16
+; NOSUPPRESS-NEXT: fadd s17, s16, s6
+; NOSUPPRESS-NEXT: fmadd s18, s19, s5, s7
+; NOSUPPRESS-NEXT: fmadd s18, s20, s4, s18
+; NOSUPPRESS-NEXT: stp s17, s18, [x8, #8]
+; NOSUPPRESS-NEXT: fsub s6, s6, s16
+; NOSUPPRESS-NEXT: fmsub s5, s19, s5, s7
+; NOSUPPRESS-NEXT: fmsub s4, s20, s4, s5
+; NOSUPPRESS-NEXT: stp s6, s4, [x8, #40]
; NOSUPPRESS-NEXT: lsl x9, x3, #33
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
; NOSUPPRESS-NEXT: add x9, x10, x9, asr #29
-; NOSUPPRESS-NEXT: ldp s2, s3, [x9]
-; NOSUPPRESS-NEXT: ldp s5, s6, [x8, #16]
-; NOSUPPRESS-NEXT: fmul s16, s4, s3
-; NOSUPPRESS-NEXT: fmul s3, s7, s3
-; NOSUPPRESS-NEXT: fnmsub s7, s7, s2, s16
-; NOSUPPRESS-NEXT: fmadd s2, s4, s2, s3
-; NOSUPPRESS-NEXT: fadd s3, s7, s5
-; NOSUPPRESS-NEXT: fadd s4, s2, s6
-; NOSUPPRESS-NEXT: stp s3, s4, [x8, #16]
-; NOSUPPRESS-NEXT: fsub s3, s5, s7
-; NOSUPPRESS-NEXT: fsub s2, s6, s2
-; NOSUPPRESS-NEXT: stp s3, s2, [x8, #48]
+; NOSUPPRESS-NEXT: ldp s4, s5, [x9]
+; NOSUPPRESS-NEXT: ldp s6, s7, [x8, #16]
+; NOSUPPRESS-NEXT: fmul s16, s3, s5
+; NOSUPPRESS-NEXT: fnmsub s16, s2, s4, s16
+; NOSUPPRESS-NEXT: fadd s17, s16, s6
+; NOSUPPRESS-NEXT: fmadd s18, s2, s5, s7
+; NOSUPPRESS-NEXT: fmadd s18, s3, s4, s18
+; NOSUPPRESS-NEXT: stp s17, s18, [x8, #16]
+; NOSUPPRESS-NEXT: fsub s6, s6, s16
+; NOSUPPRESS-NEXT: fmsub s2, s2, s5, s7
+; NOSUPPRESS-NEXT: fmsub s2, s3, s4, s2
+; NOSUPPRESS-NEXT: stp s6, s2, [x8, #48]
; NOSUPPRESS-NEXT: add w9, w3, w3, lsl #1
; NOSUPPRESS-NEXT: ldr x10, [x0, #8]
; NOSUPPRESS-NEXT: add x9, x10, w9, sxtw #3
; NOSUPPRESS-NEXT: ldp s2, s3, [x9]
; NOSUPPRESS-NEXT: ldp s4, s5, [x8, #24]
; NOSUPPRESS-NEXT: fmul s6, s1, s3
-; NOSUPPRESS-NEXT: fmul s3, s0, s3
-; NOSUPPRESS-NEXT: fnmsub s0, s0, s2, s6
-; NOSUPPRESS-NEXT: fmadd s1, s1, s2, s3
-; NOSUPPRESS-NEXT: fadd s2, s0, s4
-; NOSUPPRESS-NEXT: fadd s3, s1, s5
-; NOSUPPRESS-NEXT: stp s2, s3, [x8, #24]
-; NOSUPPRESS-NEXT: fsub s0, s4, s0
-; NOSUPPRESS-NEXT: fsub s1, s5, s1
-; NOSUPPRESS-NEXT: stp s0, s1, [x8, #56]
+; NOSUPPRESS-NEXT: fnmsub s6, s0, s2, s6
+; NOSUPPRESS-NEXT: fadd s7, s6, s4
+; NOSUPPRESS-NEXT: fmadd s16, s0, s3, s5
+; NOSUPPRESS-NEXT: fmadd s16, s1, s2, s16
+; NOSUPPRESS-NEXT: stp s7, s16, [x8, #24]
+; NOSUPPRESS-NEXT: fsub s4, s4, s6
+; NOSUPPRESS-NEXT: fmsub s0, s0, s3, s5
+; NOSUPPRESS-NEXT: fmsub s0, s1, s2, s0
+; NOSUPPRESS-NEXT: stp s4, s0, [x8, #56]
; NOSUPPRESS-NEXT: ret
bb:
%shl = shl i64 %arg3, 1
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
index 09a1f45557608..53952c505621f 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
@@ -22,21 +22,21 @@ define void @main(float %arg) {
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
; CHECK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FMAC_F32_e64_1]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.bb11:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_FMAC_F32_e64_1]], %bb.1
- ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_ADD_F32_e64_]], %bb.1
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[COPY1]], %bb.1
; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_2]], %bb.1
; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI2]], implicit $exec
; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
- ; CHECK-NEXT: S_CMP_LG_U32 killed [[COPY1]], killed [[S_MOV_B32_3]], implicit-def $scc
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
- ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
+ ; CHECK-NEXT: S_CMP_LG_U32 killed [[COPY2]], killed [[S_MOV_B32_3]], implicit-def $scc
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc
+ ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY3]], implicit-def dead $scc
; CHECK-NEXT: $vcc_lo = COPY [[S_AND_B32_1]]
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.4, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.3
@@ -90,13 +90,11 @@ define float @test2(float %arg, float %arg1) {
; CHECK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = nsz contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, killed [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
; CHECK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = nsz contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY1]], 0, killed [[S_MOV_B32_1]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nsz contract reassoc nofpexcept V_ADD_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_RCP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_RCP_F32_e64_1]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, [[V_RCP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_MAX_F32_e64_]], 0, killed [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]]
+ ; CHECK-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed [[V_FMAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_RCP_F32_e64_]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_MAX_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
bb:
%i = fmul contract float %arg1, 1.000000e+02
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 451f64f71282b..67331bdab1ee4 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -39,7 +39,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GFX10-NEXT: v_sub_f32_e32 v8, s0, v1
; GFX10-NEXT: v_fma_f32 v7, -s2, v6, s6
; GFX10-NEXT: v_fma_f32 v5, v6, v5, 1.0
-; GFX10-NEXT: v_mad_f32 v10, s2, v6, v2
+; GFX10-NEXT: v_fma_f32 v10, s2, v6, v2
; GFX10-NEXT: s_mov_b32 s0, 0x3c23d70a
; GFX10-NEXT: v_fmac_f32_e32 v1, v6, v8
; GFX10-NEXT: v_fmac_f32_e32 v10, v7, v6
@@ -265,8 +265,8 @@ define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float %
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_f32 v2, v2, v3, v6
-; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v5
+; GFX10-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_mac_f32_e32 v2, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -294,8 +294,8 @@ define float @fmac_sequence_innermost_fmul_swapped_operands(float %a, float %b,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_f32 v2, v2, v3, v6
-; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v5
+; GFX10-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_mac_f32_e32 v2, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -322,8 +322,8 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_sgpr(float inreg %a, float
; GFX10-LABEL: fmac_sequence_innermost_fmul_sgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mac_f32_e64 v0, s2, s3
-; GFX10-NEXT: v_fmac_f32_e64 v0, s0, s1
-; GFX10-NEXT: v_fmac_f32_e64 v0, s4, s5
+; GFX10-NEXT: v_mac_f32_e64 v0, s0, s1
+; GFX10-NEXT: v_mac_f32_e64 v0, s4, s5
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: fmac_sequence_innermost_fmul_sgpr:
@@ -346,21 +346,23 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_multiple_use(float inreg %a
; GFX10-LABEL: fmac_sequence_innermost_fmul_multiple_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mul_f32_e64 v1, s2, s3
+; GFX10-NEXT: v_mac_f32_e64 v0, s2, s3
; GFX10-NEXT: v_fmac_f32_e64 v1, s0, s1
-; GFX10-NEXT: v_fma_f32 v2, s5, s4, v1
-; GFX10-NEXT: v_fmac_f32_e32 v1, s5, v2
-; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_mac_f32_e64 v0, s0, s1
+; GFX10-NEXT: v_fmac_f32_e64 v1, s4, s5
+; GFX10-NEXT: v_mac_f32_e32 v0, s5, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: fmac_sequence_innermost_fmul_multiple_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mul_f32_e64 v1, s2, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e64 v0, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_fmac_f32_e64 v1, s0, s1
-; GFX11-NEXT: v_fma_f32 v2, s5, s4, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fmac_f32_e32 v1, s5, v2
-; GFX11-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT: v_fmac_f32_e64 v0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e64 v1, s4, s5
+; GFX11-NEXT: v_fmac_f32_e32 v0, s5, v1
; GFX11-NEXT: ; return to shader part epilog
%t0 = fmul fast float %a, %b
%t1 = fmul fast float %c, %d
diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index b36c6e707ebab..f32030e6eab88 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -172,12 +172,12 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
-; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
-; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v3, v2
-; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT: buffer_store_dword v5, off, s[0:3], 0
+; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: s_endpgm
;
@@ -195,11 +195,11 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
-; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
-; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2
+; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2
; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: s_endpgm
@@ -255,12 +255,12 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
-; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
-; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v2, v3
-; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT: buffer_store_dword v5, off, s[0:3], 0
+; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: s_endpgm
;
@@ -278,11 +278,11 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
-; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
-; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v2, v0
+; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2
; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: s_endpgm
@@ -338,12 +338,13 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
-; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
-; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v5, v0, v1
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v3, v2
-; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: s_endpgm
;
@@ -361,11 +362,12 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
-; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
-; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2
+; GCN-FASTFMA-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT: v_fma_f32 v3, v0, v1, v5
+; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2
+; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: s_endpgm
@@ -421,12 +423,13 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
-; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
-; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v5, v0, v1
+; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v2, v3
-; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: s_endpgm
;
@@ -444,11 +447,12 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
-; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
-; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v2, v0
+; GCN-FASTFMA-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT: v_fma_f32 v3, v0, v1, v5
+; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2
+; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/PowerPC/fma-precision.ll b/llvm/test/CodeGen/PowerPC/fma-precision.ll
index 762d2336e2932..bf592589e3261 100644
--- a/llvm/test/CodeGen/PowerPC/fma-precision.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-precision.ll
@@ -140,15 +140,17 @@ define double @fma_multi_uses2(double %a, double %b, double %c, double %d, ptr %
define double @fma_multi_uses3(double %a, double %b, double %c, double %d, double %f, double %g, ptr %p1, ptr %p2, ptr %p3) {
; CHECK-LABEL: fma_multi_uses3:
; CHECK: # %bb.0:
-; CHECK-NEXT: xsmuldp 0, 1, 2
-; CHECK-NEXT: xsmuldp 1, 5, 6
+; CHECK-NEXT: xsmuldp 7, 1, 2
+; CHECK-NEXT: xsmuldp 0, 5, 6
; CHECK-NEXT: ld 3, 96(1)
-; CHECK-NEXT: stfd 0, 0(9)
-; CHECK-NEXT: stfd 0, 0(10)
-; CHECK-NEXT: stfd 1, 0(3)
-; CHECK-NEXT: xsnmsubadp 1, 3, 4
+; CHECK-NEXT: stfd 7, 0(9)
+; CHECK-NEXT: stfd 7, 0(10)
+; CHECK-NEXT: stfd 0, 0(3)
; CHECK-NEXT: xsnmsubadp 0, 3, 4
-; CHECK-NEXT: xsadddp 1, 0, 1
+; CHECK-NEXT: xsnegdp 3, 3
+; CHECK-NEXT: xsmaddadp 0, 1, 2
+; CHECK-NEXT: xsmaddadp 0, 3, 4
+; CHECK-NEXT: fmr 1, 0
; CHECK-NEXT: blr
%ab = fmul contract reassoc double %a, %b
%cd = fmul contract reassoc double %c, %d
>From 3cd802cb1bbf5bace12245f59013e8658b3c8ed6 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 10 Jun 2024 17:18:21 +0100
Subject: [PATCH 2/2] Stop converting existing FMAs to FMAD
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll | 16 ++++++++--------
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d87f9a9a2977c..cf8a682327c00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15900,8 +15900,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT, Tmp.getOperand(0),
Tmp.getOperand(1), E);
for (SDNode *FMA : reverse(FMAs))
- Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT,
- FMA->getOperand(0), FMA->getOperand(1), Tmp);
+ Tmp = matcher.getNode(FMA->getOpcode(), SL, VT, FMA->getOperand(0),
+ FMA->getOperand(1), Tmp);
return Tmp;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 67331bdab1ee4..c4d7a5c29f2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -265,8 +265,8 @@ define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float %
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_f32 v2, v2, v3, v6
-; GFX10-NEXT: v_mac_f32_e32 v2, v0, v1
-; GFX10-NEXT: v_mac_f32_e32 v2, v4, v5
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -294,8 +294,8 @@ define float @fmac_sequence_innermost_fmul_swapped_operands(float %a, float %b,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_f32 v2, v2, v3, v6
-; GFX10-NEXT: v_mac_f32_e32 v2, v0, v1
-; GFX10-NEXT: v_mac_f32_e32 v2, v4, v5
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -322,8 +322,8 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_sgpr(float inreg %a, float
; GFX10-LABEL: fmac_sequence_innermost_fmul_sgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mac_f32_e64 v0, s2, s3
-; GFX10-NEXT: v_mac_f32_e64 v0, s0, s1
-; GFX10-NEXT: v_mac_f32_e64 v0, s4, s5
+; GFX10-NEXT: v_fmac_f32_e64 v0, s0, s1
+; GFX10-NEXT: v_fmac_f32_e64 v0, s4, s5
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: fmac_sequence_innermost_fmul_sgpr:
@@ -348,9 +348,9 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_multiple_use(float inreg %a
; GFX10-NEXT: v_mul_f32_e64 v1, s2, s3
; GFX10-NEXT: v_mac_f32_e64 v0, s2, s3
; GFX10-NEXT: v_fmac_f32_e64 v1, s0, s1
-; GFX10-NEXT: v_mac_f32_e64 v0, s0, s1
+; GFX10-NEXT: v_fmac_f32_e64 v0, s0, s1
; GFX10-NEXT: v_fmac_f32_e64 v1, s4, s5
-; GFX10-NEXT: v_mac_f32_e32 v0, s5, v1
+; GFX10-NEXT: v_fmac_f32_e32 v0, s5, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: fmac_sequence_innermost_fmul_multiple_use:
More information about the llvm-commits mailing list